From 4ce32c9e93591f2449a388201c323ca193f59c07 Mon Sep 17 00:00:00 2001 From: Jimmy Praet Date: Tue, 13 Jul 2021 03:13:52 +0200 Subject: [PATCH] Detect encoding changes while parsing diff (#16330) * Detect encoding changes while parsing diff --- services/gitdiff/gitdiff.go | 50 +++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go index f8f0fd7e3..d50e41eb4 100644 --- a/services/gitdiff/gitdiff.go +++ b/services/gitdiff/gitdiff.go @@ -32,6 +32,7 @@ import ( "github.com/sergi/go-diff/diffmatchpatch" stdcharset "golang.org/x/net/html/charset" + "golang.org/x/text/encoding" "golang.org/x/text/transform" ) @@ -883,35 +884,46 @@ parsingLoop: } - // FIXME: There are numerous issues with this: + // TODO: There are numerous issues with this: // - we might want to consider detecting encoding while parsing but... // - we're likely to fail to get the correct encoding here anyway as we won't have enough information - // - and this doesn't really account for changes in encoding - var buf bytes.Buffer + var diffLineTypeBuffers = make(map[DiffLineType]*bytes.Buffer, 3) + var diffLineTypeDecoders = make(map[DiffLineType]*encoding.Decoder, 3) + diffLineTypeBuffers[DiffLinePlain] = new(bytes.Buffer) + diffLineTypeBuffers[DiffLineAdd] = new(bytes.Buffer) + diffLineTypeBuffers[DiffLineDel] = new(bytes.Buffer) for _, f := range diff.Files { - buf.Reset() + for _, buffer := range diffLineTypeBuffers { + buffer.Reset() + } for _, sec := range f.Sections { for _, l := range sec.Lines { if l.Type == DiffLineSection { continue } - buf.WriteString(l.Content[1:]) - buf.WriteString("\n") + diffLineTypeBuffers[l.Type].WriteString(l.Content[1:]) + diffLineTypeBuffers[l.Type].WriteString("\n") } } - charsetLabel, err := charset.DetectEncoding(buf.Bytes()) - if charsetLabel != "UTF-8" && err == nil { - encoding, _ := stdcharset.Lookup(charsetLabel) - if encoding != nil { - d := encoding.NewDecoder() - for _, sec := range f.Sections { - for _, l := range sec.Lines { - if l.Type == DiffLineSection { - continue - } - if c, _, err := transform.String(d, l.Content[1:]); err == nil { - l.Content = l.Content[0:1] + c - } + for lineType, buffer := range diffLineTypeBuffers { + diffLineTypeDecoders[lineType] = nil + if buffer.Len() == 0 { + continue + } + charsetLabel, err := charset.DetectEncoding(buffer.Bytes()) + if charsetLabel != "UTF-8" && err == nil { + encoding, _ := stdcharset.Lookup(charsetLabel) + if encoding != nil { + diffLineTypeDecoders[lineType] = encoding.NewDecoder() + } + } + } + for _, sec := range f.Sections { + for _, l := range sec.Lines { + decoder := diffLineTypeDecoders[l.Type] + if decoder != nil { + if c, _, err := transform.String(decoder, l.Content[1:]); err == nil { + l.Content = l.Content[0:1] + c } } }