internal/diff: avoid unnecessary allocations

Previously, a diff of non-ASCII strings would incur
three complete copies of the buffers; this changes
reduces it to one.

Also:
- add diff.Bytes function, to avoid unnecessary conversions.
- remove unused diff.Lines and LineEdits functions from API.
- remove TODO to use []bytes everywhere.
  We tried it in CL 439277 and didn't like it.
- Document that the diff is textual, even when given []byte.

Change-Id: I2da3257cc3d12c569218a2d7ce182452e8647a96
Reviewed-on: https://go-review.googlesource.com/c/tools/+/439835
Reviewed-by: Robert Findley <rfindley@google.com>
Run-TryBot: Alan Donovan <adonovan@google.com>
Auto-Submit: Alan Donovan <adonovan@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
gopls-CI: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Alan Donovan 2022-10-06 16:11:42 -04:00 committed by Gopher Robot
parent 60ddccae85
commit f2c4579afe
6 changed files with 97 additions and 139 deletions

View File

@ -157,7 +157,7 @@ func parseGoImpl(ctx context.Context, fset *token.FileSet, fh source.FileHandle,
// it is likely we got stuck in a loop somehow. Log out a diff
// of the last changes we made to aid in debugging.
if i == 9 {
edits := diff.Strings(string(src), string(newSrc))
edits := diff.Bytes(src, newSrc)
unified := diff.Unified("before", "after", string(src), edits)
event.Log(ctx, fmt.Sprintf("fixSrc loop - last diff:\n%v", unified), tag.File.Of(tok.Name()))
}

View File

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package diff computes differences between files or strings.
// Package diff computes differences between text files or strings.
package diff
import (
@ -11,13 +11,7 @@ import (
"strings"
)
// TODO(adonovan): switch to []byte throughout.
// But make clear that the operation is defined on runes, not bytes.
// Also:
// - delete LineEdits? (used only by Unified and test)
// - delete Lines (unused except by its test)
// An Edit describes the replacement of a portion of a file.
// An Edit describes the replacement of a portion of a text file.
type Edit struct {
Start, End int // byte offsets of the region to replace
New string // the replacement
@ -88,15 +82,9 @@ func (a editsSort) Less(i, j int) bool {
}
func (a editsSort) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
// LineEdits expands and merges a sequence of edits so that each
// lineEdits expands and merges a sequence of edits so that each
// resulting edit replaces one or more complete lines.
//
// It may panic or produce garbage if the edits
// are overlapping, out of bounds of src, or out of order.
// TODO(adonovan): see consistency note at Apply.
// We could hide this from the API so that we can enforce
// the precondition... but it seems like a reasonable feature.
func LineEdits(src string, edits []Edit) []Edit {
func lineEdits(src string, edits []Edit) []Edit {
sortEdits(edits) // TODO(adonovan): is this necessary? Move burden to caller?
// Do all edits begin and end at the start of a line?

View File

@ -87,36 +87,6 @@ func FuzzRoundTrip(f *testing.F) {
})
}
func TestNLinesRandom(t *testing.T) {
rand.Seed(2)
for i := 0; i < 1000; i++ {
x := randlines("abω", 4) // avg line length is 6, want a change every 3rd line or so
v := []rune(x)
for i := 0; i < len(v); i++ {
if rand.Float64() < .05 {
v[i] = 'N'
}
}
y := string(v)
// occasionally remove the trailing \n
if rand.Float64() < .1 {
x = x[:len(x)-1]
}
if rand.Float64() < .1 {
y = y[:len(y)-1]
}
a, b := strings.SplitAfter(x, "\n"), strings.SplitAfter(y, "\n")
edits := diff.Lines(a, b)
got, err := diff.Apply(x, edits)
if err != nil {
t.Fatalf("Apply failed: %v", err)
}
if got != y {
t.Fatalf("%d: got\n%q, wanted\n%q, starting with %q", i, got, y, a)
}
}
}
func TestLineEdits(t *testing.T) {
for _, tc := range difftest.TestCases {
t.Run(tc.Name, func(t *testing.T) {

View File

@ -0,0 +1,9 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package diff
// This file exports some private declarations to tests.
var LineEdits = lineEdits

View File

@ -5,111 +5,104 @@
package diff
import (
"strings"
"bytes"
"unicode/utf8"
"golang.org/x/tools/internal/diff/lcs"
)
// Strings computes the differences between two strings.
// The resulting edits respect rune boundaries.
func Strings(before, after string) []Edit {
if before == after {
return nil // common case
}
if stringIsASCII(before) && stringIsASCII(after) {
return diffASCII([]byte(before), []byte(after))
}
return diffRunes([]rune(before), []rune(after))
}
// Bytes computes the differences between two byte slices.
// The resulting edits respect rune boundaries.
func Bytes(before, after []byte) []Edit {
if bytes.Equal(before, after) {
return nil // common case
}
if bytesIsASCII(before) && bytesIsASCII(after) {
return diffASCII(before, after)
}
return diffRunes(runes(before), runes(after))
}
func diffASCII(before, after []byte) []Edit {
diffs, _ := lcs.Compute(before, after, maxDiffs/2)
// Convert from LCS diffs.
res := make([]Edit, len(diffs))
for i, d := range diffs {
res[i] = Edit{d.Start, d.End, d.Text}
}
return res
}
func diffRunes(before, after []rune) []Edit {
diffs, _ := lcs.Compute(before, after, maxDiffs/2)
// The diffs returned by the lcs package use indexes
// into whatever slice was passed in.
// Convert rune offsets to byte offsets.
res := make([]Edit, len(diffs))
lastEnd := 0
utf8Len := 0
for i, d := range diffs {
utf8Len += runesLen(before[lastEnd:d.Start]) // text between edits
start := utf8Len
utf8Len += runesLen(before[d.Start:d.End]) // text deleted by this edit
res[i] = Edit{start, utf8Len, d.Text}
lastEnd = d.End
}
return res
}
// maxDiffs is a limit on how deeply the lcs algorithm should search
// the value is just a guess
const maxDiffs = 30
// Strings computes the differences between two strings.
// (Both it and the diff in the myers package have type ComputeEdits, which
// is why the arguments are strings, not []bytes.)
// TODO(adonovan): opt: consider switching everything to []bytes, if
// that's the more common type in practice. Or provide both flavors?
func Strings(before, after string) []Edit {
if before == after {
// very frequently true
return nil
// runes is like []rune(string(bytes)) without the duplicate allocation.
func runes(bytes []byte) []rune {
n := utf8.RuneCount(bytes)
runes := make([]rune, n)
for i := 0; i < n; i++ {
r, sz := utf8.DecodeRune(bytes)
bytes = bytes[sz:]
runes[i] = r
}
// The diffs returned by the lcs package use indexes into
// whatever slice was passed in. Edits use byte offsets, so
// rune or line offsets need to be converted.
// TODO(adonovan): opt: eliminate all the unnecessary allocations.
var diffs []lcs.Diff
if !isASCII(before) || !isASCII(after) {
diffs, _ = lcs.Compute([]rune(before), []rune(after), maxDiffs/2)
diffs = runeOffsets(diffs, []rune(before))
} else {
// Common case: pure ASCII. Avoid expansion to []rune slice.
diffs, _ = lcs.Compute([]byte(before), []byte(after), maxDiffs/2)
}
return convertDiffs(diffs)
return runes
}
// Lines computes the differences between two list of lines.
// TODO(adonovan): unused except by its test. Do we actually need it?
func Lines(before, after []string) []Edit {
diffs, _ := lcs.Compute(before, after, maxDiffs/2)
diffs = lineOffsets(diffs, before)
return convertDiffs(diffs)
// the code is not coping with possible missing \ns at the ends
}
func convertDiffs(diffs []lcs.Diff) []Edit {
ans := make([]Edit, len(diffs))
for i, d := range diffs {
ans[i] = Edit{d.Start, d.End, d.Text}
// runesLen returns the length in bytes of the UTF-8 encoding of runes.
func runesLen(runes []rune) (len int) {
for _, r := range runes {
len += utf8.RuneLen(r)
}
return ans
}
// convert diffs with rune offsets into diffs with byte offsets
func runeOffsets(diffs []lcs.Diff, src []rune) []lcs.Diff {
var idx int
var tmp strings.Builder // string because []byte([]rune) is illegal
for i, d := range diffs {
tmp.WriteString(string(src[idx:d.Start]))
v := tmp.Len()
tmp.WriteString(string(src[d.Start:d.End]))
d.Start = v
idx = d.End
d.End = tmp.Len()
diffs[i] = d
}
return diffs
}
// convert diffs with line offsets into diffs with byte offsets
func lineOffsets(diffs []lcs.Diff, src []string) []lcs.Diff {
var idx int
var tmp strings.Builder // bytes/
for i, d := range diffs {
tmp.WriteString(strJoin(src[idx:d.Start]))
v := tmp.Len()
tmp.WriteString(strJoin(src[d.Start:d.End]))
d.Start = v
idx = d.End
d.End = tmp.Len()
diffs[i] = d
}
return diffs
}
// join lines. (strings.Join doesn't add a trailing separator)
func strJoin(elems []string) string {
if len(elems) == 0 {
return ""
}
n := 0
for i := 0; i < len(elems); i++ {
n += len(elems[i])
}
var b strings.Builder
b.Grow(n)
for _, s := range elems {
b.WriteString(s)
//b.WriteByte('\n')
}
return b.String()
return len
}
// isASCII reports whether s contains only ASCII.
func isASCII(s string) bool {
// TODO(adonovan): combine when x/tools allows generics.
func stringIsASCII(s string) bool {
for i := 0; i < len(s); i++ {
if s[i] >= utf8.RuneSelf {
return false
}
}
return true
}
func bytesIsASCII(s []byte) bool {
for i := 0; i < len(s); i++ {
if s[i] >= utf8.RuneSelf {
return false

View File

@ -9,8 +9,6 @@ import (
"strings"
)
// TODO(adonovan): API: hide all but func Unified.
// Unified applies the edits to content and presents a unified diff.
// The old and new labels are the names of the content and result files.
func Unified(oldLabel, newLabel string, content string, edits []Edit) string {
@ -92,7 +90,7 @@ func toUnified(fromName, toName string, content string, edits []Edit) unified {
if len(edits) == 0 {
return u
}
edits = LineEdits(content, edits) // expand to whole lines
edits = lineEdits(content, edits) // expand to whole lines
lines := splitLines(content)
var h *hunk
last := 0