From 7ba158183a34d71b3989512c059a01d35c4c4673 Mon Sep 17 00:00:00 2001
From: zeripath <art27@cantab.net>
Date: Wed, 17 Feb 2021 19:32:47 +0000
Subject: [PATCH] Use cat-file --batch in GetLanguageStats (#14685)

* Use cat-file --batch in GetLanguageStats

This PR moves to using a single cat-file --batch in GetLanguageStats,
significantly reducing the number of processes spawned during language stat
processing.

Signed-off-by: Andrew Thornton <art27@cantab.net>

* placate lint

Signed-off-by: Andrew Thornton <art27@cantab.net>

* Update modules/git/repo_language_stats_nogogit.go

Co-authored-by: a1012112796 <1012112796@qq.com>

Co-authored-by: Lauris BH <lauris@nix.lv>
Co-authored-by: 6543 <6543@obermui.de>
Co-authored-by: a1012112796 <1012112796@qq.com>
Co-authored-by: Lunny Xiao <xiaolunwen@gmail.com>
---
 modules/git/repo_language_stats_nogogit.go | 130 ++++++++++++++++-----
 1 file changed, 100 insertions(+), 30 deletions(-)

diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go
index 5607e4591..4c6f07f0f 100644
--- a/modules/git/repo_language_stats_nogogit.go
+++ b/modules/git/repo_language_stats_nogogit.go
@@ -7,9 +7,11 @@
 package git
 
 import (
+	"bufio"
 	"bytes"
 	"io"
-	"io/ioutil"
+	"math"
+	"strings"
 
 	"code.gitea.io/gitea/modules/analyze"
 
@@ -18,16 +20,60 @@ import (
 
 // GetLanguageStats calculates language stats for git repository at specified commit
 func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) {
-	// FIXME: We can be more efficient here...
-	//
-	// We're expecting that we will be reading a lot of blobs and the trees
-	// Thus we should use a shared `cat-file --batch` to get all of this data
-	// And keep the buffers around with resets as necessary.
-	//
-	// It's more complicated so...
-	commit, err := repo.GetCommit(commitID)
+	// We will feed the commit ID and then the blob IDs, in order, into cat-file --batch as necessary,
+	// so let's create a batch stdin and stdout.
+
+	batchStdinReader, batchStdinWriter := io.Pipe()
+	batchStdoutReader, batchStdoutWriter := io.Pipe()
+	defer func() {
+		_ = batchStdinReader.Close()
+		_ = batchStdinWriter.Close()
+		_ = batchStdoutReader.Close()
+		_ = batchStdoutWriter.Close()
+	}()
+
+	go func() {
+		stderr := strings.Builder{}
+		err := NewCommand("cat-file", "--batch").RunInDirFullPipeline(repo.Path, batchStdoutWriter, &stderr, batchStdinReader)
+		if err != nil {
+			_ = batchStdoutWriter.CloseWithError(ConcatenateError(err, (&stderr).String()))
+			_ = batchStdinReader.CloseWithError(ConcatenateError(err, (&stderr).String()))
+		} else {
+			_ = batchStdoutWriter.Close()
+			_ = batchStdinReader.Close()
+		}
+	}()
+
+	// For simplicity's sake we'll use a buffered reader
+	batchReader := bufio.NewReader(batchStdoutReader)
+
+	writeID := func(id string) error {
+		_, err := batchStdinWriter.Write([]byte(id))
+		if err != nil {
+			return err
+		}
+		_, err = batchStdinWriter.Write([]byte{'\n'})
+		return err
+	}
+
+	if err := writeID(commitID); err != nil {
+		return nil, err
+	}
+	shaBytes, typ, size, err := ReadBatchLine(batchReader)
+	if typ != "commit" {
+		log("Unable to get commit for: %s. Err: %v", commitID, err)
+		return nil, ErrNotExist{commitID, ""}
+	}
+
+	sha, err := NewIDFromString(string(shaBytes))
 	if err != nil {
-		log("Unable to get commit for: %s", commitID)
+		log("Unable to get commit for: %s. Err: %v", commitID, err)
+		return nil, ErrNotExist{commitID, ""}
+	}
+
+	commit, err := CommitFromReader(repo, sha, io.LimitReader(batchReader, size))
+	if err != nil {
+		log("Unable to get commit for: %s. Err: %v", commitID, err)
 		return nil, err
 	}
 
@@ -38,17 +84,45 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 		return nil, err
 	}
 
+	contentBuf := bytes.Buffer{}
+	var content []byte
 	sizes := make(map[string]int64)
 	for _, f := range entries {
+		contentBuf.Reset()
+		content = contentBuf.Bytes()
 		if f.Size() == 0 || enry.IsVendor(f.Name()) || enry.IsDotFile(f.Name()) ||
 			enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) {
 			continue
 		}
 
 		// If content can not be read or file is too big just do detection by filename
-		var content []byte
+
 		if f.Size() <= bigFileSize {
-			content, _ = readFile(f, fileSizeLimit)
+			if err := writeID(f.ID.String()); err != nil {
+				return nil, err
+			}
+			_, _, size, err := ReadBatchLine(batchReader)
+			if err != nil {
+				log("Error reading blob: %s Err: %v", f.ID.String(), err)
+				return nil, err
+			}
+
+			sizeToRead := size
+			discard := int64(0)
+			if size > fileSizeLimit {
+				sizeToRead = fileSizeLimit
+				discard = size - fileSizeLimit
+			}
+
+			_, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead))
+			if err != nil {
+				return nil, err
+			}
+			content = contentBuf.Bytes()
+			err = discardFull(batchReader, discard)
+			if err != nil {
+				return nil, err
+			}
 		}
 		if enry.IsGenerated(f.Name(), content) {
 			continue
@@ -86,24 +160,20 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 	return sizes, nil
 }
 
-func readFile(entry *TreeEntry, limit int64) ([]byte, error) {
-	// FIXME: We can probably be a little more efficient here... see above
-	r, err := entry.Blob().DataAsync()
-	if err != nil {
-		return nil, err
+func discardFull(rd *bufio.Reader, discard int64) error {
+	if discard > math.MaxInt32 {
+		n, err := rd.Discard(math.MaxInt32)
+		discard -= int64(n)
+		if err != nil {
+			return err
+		}
 	}
-	defer r.Close()
-
-	if limit <= 0 {
-		return ioutil.ReadAll(r)
+	for discard > 0 {
+		n, err := rd.Discard(int(discard))
+		discard -= int64(n)
+		if err != nil {
+			return err
+		}
 	}
-
-	size := entry.Size()
-	if limit > 0 && size > limit {
-		size = limit
-	}
-	buf := bytes.NewBuffer(nil)
-	buf.Grow(int(size))
-	_, err = io.Copy(buf, io.LimitReader(r, limit))
-	return buf.Bytes(), err
+	return nil
 }