From 7ba158183a34d71b3989512c059a01d35c4c4673 Mon Sep 17 00:00:00 2001 From: zeripath Date: Wed, 17 Feb 2021 19:32:47 +0000 Subject: [PATCH] Use cat-file --batch in GetLanguageStats (#14685) * Use cat-file --batch in GetLanguageStats This PR moves to using a single cat-file --batch in GetLanguageStats significantly reducing the number of processes spawned during language stat processing. Signed-off-by: Andrew Thornton * placate lint Signed-off-by: Andrew Thornton * Update modules/git/repo_language_stats_nogogit.go Co-authored-by: a1012112796 <1012112796@qq.com> Co-authored-by: Lauris BH Co-authored-by: 6543 <6543@obermui.de> Co-authored-by: a1012112796 <1012112796@qq.com> Co-authored-by: Lunny Xiao --- modules/git/repo_language_stats_nogogit.go | 130 ++++++++++++++++----- 1 file changed, 100 insertions(+), 30 deletions(-) diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go index 5607e4591..4c6f07f0f 100644 --- a/modules/git/repo_language_stats_nogogit.go +++ b/modules/git/repo_language_stats_nogogit.go @@ -7,9 +7,11 @@ package git import ( + "bufio" "bytes" "io" - "io/ioutil" + "math" + "strings" "code.gitea.io/gitea/modules/analyze" @@ -18,16 +20,60 @@ import ( // GetLanguageStats calculates language stats for git repository at specified commit func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) { - // FIXME: We can be more efficient here... - // - // We're expecting that we will be reading a lot of blobs and the trees - // Thus we should use a shared `cat-file --batch` to get all of this data - // And keep the buffers around with resets as necessary. - // - // It's more complicated so... - commit, err := repo.GetCommit(commitID) + // We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary. 
+ // so let's create a batch stdin and stdout + + batchStdinReader, batchStdinWriter := io.Pipe() + batchStdoutReader, batchStdoutWriter := io.Pipe() + defer func() { + _ = batchStdinReader.Close() + _ = batchStdinWriter.Close() + _ = batchStdoutReader.Close() + _ = batchStdoutWriter.Close() + }() + + go func() { + stderr := strings.Builder{} + err := NewCommand("cat-file", "--batch").RunInDirFullPipeline(repo.Path, batchStdoutWriter, &stderr, batchStdinReader) + if err != nil { + _ = batchStdoutWriter.CloseWithError(ConcatenateError(err, (&stderr).String())) + _ = batchStdinReader.CloseWithError(ConcatenateError(err, (&stderr).String())) + } else { + _ = batchStdoutWriter.Close() + _ = batchStdinReader.Close() + } + }() + + // For simplicity's sake we'll use a buffered reader + batchReader := bufio.NewReader(batchStdoutReader) + + writeID := func(id string) error { + _, err := batchStdinWriter.Write([]byte(id)) + if err != nil { + return err + } + _, err = batchStdinWriter.Write([]byte{'\n'}) + return err + } + + if err := writeID(commitID); err != nil { + return nil, err + } + shaBytes, typ, size, err := ReadBatchLine(batchReader) + if typ != "commit" { + log("Unable to get commit for: %s. Err: %v", commitID, err) + return nil, ErrNotExist{commitID, ""} + } + + sha, err := NewIDFromString(string(shaBytes)) if err != nil { - log("Unable to get commit for: %s", commitID) + log("Unable to get commit for: %s. Err: %v", commitID, err) + return nil, ErrNotExist{commitID, ""} + } + + commit, err := CommitFromReader(repo, sha, io.LimitReader(batchReader, size)) + if err != nil { + log("Unable to get commit for: %s. 
Err: %v", commitID, err) return nil, err } @@ -38,17 +84,45 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err return nil, err } + contentBuf := bytes.Buffer{} + var content []byte sizes := make(map[string]int64) for _, f := range entries { + contentBuf.Reset() + content = contentBuf.Bytes() if f.Size() == 0 || enry.IsVendor(f.Name()) || enry.IsDotFile(f.Name()) || enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) { continue } // If content can not be read or file is too big just do detection by filename - var content []byte + if f.Size() <= bigFileSize { - content, _ = readFile(f, fileSizeLimit) + if err := writeID(f.ID.String()); err != nil { + return nil, err + } + _, _, size, err := ReadBatchLine(batchReader) + if err != nil { + log("Error reading blob: %s Err: %v", f.ID.String(), err) + return nil, err + } + + sizeToRead := size + discard := int64(0) + if size > fileSizeLimit { + sizeToRead = fileSizeLimit + discard = size - fileSizeLimit + } + + _, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead)) + if err != nil { + return nil, err + } + content = contentBuf.Bytes() + err = discardFull(batchReader, discard) + if err != nil { + return nil, err + } } if enry.IsGenerated(f.Name(), content) { continue @@ -86,24 +160,20 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err return sizes, nil } -func readFile(entry *TreeEntry, limit int64) ([]byte, error) { - // FIXME: We can probably be a little more efficient here... 
see above - r, err := entry.Blob().DataAsync() - if err != nil { - return nil, err +func discardFull(rd *bufio.Reader, discard int64) error { + if discard > math.MaxInt32 { + n, err := rd.Discard(math.MaxInt32) + discard -= int64(n) + if err != nil { + return err + } } - defer r.Close() - - if limit <= 0 { - return ioutil.ReadAll(r) + for discard > 0 { + n, err := rd.Discard(int(discard)) + discard -= int64(n) + if err != nil { + return err + } } - - size := entry.Size() - if limit > 0 && size > limit { - size = limit - } - buf := bytes.NewBuffer(nil) - buf.Grow(int(size)) - _, err = io.Copy(buf, io.LimitReader(r, limit)) - return buf.Bytes(), err + return nil }