internal/diff: avoid unnecessary allocations

Previously, a diff of non-ASCII strings would incur three complete copies of the buffers; this changes reduces it to one. Also: - add diff.Bytes function, to avoid unnecessary conversions. - remove unused diff.Lines and LineEdits functions from API. - remove TODO to use []bytes everywhere. We tried it in CL 439277 and didn't like it. - Document that the diff is textual, even when given []byte. Change-Id: I2da3257cc3d12c569218a2d7ce182452e8647a96 Reviewed-on: https://go-review.googlesource.com/c/tools/+/439835 Reviewed-by: Robert Findley <rfindley@google.com> Run-TryBot: Alan Donovan <adonovan@google.com> Auto-Submit: Alan Donovan <adonovan@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> gopls-CI: kokoro <noreply+kokoro@google.com>
2022-10-06 16:11:42 -04:00 · 2022-10-06 16:11:42 -04:00 · f2c4579afe
parent 60ddccae85
commit f2c4579afe
6 changed files with 97 additions and 139 deletions
--- a/gopls/internal/lsp/cache/parse.go
+++ b/gopls/internal/lsp/cache/parse.go
@ -157,7 +157,7 @@ func parseGoImpl(ctx context.Context, fset *token.FileSet, fh source.FileHandle,
 			// it is likely we got stuck in a loop somehow. Log out a diff
 			// of the last changes we made to aid in debugging.
 			if i == 9 {
-				edits := diff.Strings(string(src), string(newSrc))
+				edits := diff.Bytes(src, newSrc)
 				unified := diff.Unified("before", "after", string(src), edits)
 				event.Log(ctx, fmt.Sprintf("fixSrc loop - last diff:\n%v", unified), tag.File.Of(tok.Name()))
 			}
--- a/internal/diff/diff.go
+++ b/internal/diff/diff.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// Package diff computes differences between files or strings.
+// Package diff computes differences between text files or strings.
 package diff

 import (
@ -11,13 +11,7 @@ import (
 	"strings"
 )

-// TODO(adonovan): switch to []byte throughout.
-// But make clear that the operation is defined on runes, not bytes.
-// Also:
-// - delete LineEdits? (used only by Unified and test)
-// - delete Lines (unused except by its test)
-
-// An Edit describes the replacement of a portion of a file.
+// An Edit describes the replacement of a portion of a text file.
 type Edit struct {
 	Start, End int    // byte offsets of the region to replace
 	New        string // the replacement
@ -88,15 +82,9 @@ func (a editsSort) Less(i, j int) bool {
 }
 func (a editsSort) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

-// LineEdits expands and merges a sequence of edits so that each
+// lineEdits expands and merges a sequence of edits so that each
 // resulting edit replaces one or more complete lines.
-//
-// It may panic or produce garbage if the edits
-// are overlapping, out of bounds of src, or out of order.
-// TODO(adonovan): see consistency note at Apply.
-// We could hide this from the API so that we can enforce
-// the precondition... but it seems like a reasonable feature.
-func LineEdits(src string, edits []Edit) []Edit {
+func lineEdits(src string, edits []Edit) []Edit {
 	sortEdits(edits) // TODO(adonovan): is this necessary? Move burden to caller?

 	// Do all edits begin and end at the start of a line?
--- a/internal/diff/diff_test.go
+++ b/internal/diff/diff_test.go
@ -87,36 +87,6 @@ func FuzzRoundTrip(f *testing.F) {
 	})
 }

-func TestNLinesRandom(t *testing.T) {
-	rand.Seed(2)
-	for i := 0; i < 1000; i++ {
-		x := randlines("abω", 4) // avg line length is 6, want a change every 3rd line or so
-		v := []rune(x)
-		for i := 0; i < len(v); i++ {
-			if rand.Float64() < .05 {
-				v[i] = 'N'
-			}
-		}
-		y := string(v)
-		// occasionally remove the trailing \n
-		if rand.Float64() < .1 {
-			x = x[:len(x)-1]
-		}
-		if rand.Float64() < .1 {
-			y = y[:len(y)-1]
-		}
-		a, b := strings.SplitAfter(x, "\n"), strings.SplitAfter(y, "\n")
-		edits := diff.Lines(a, b)
-		got, err := diff.Apply(x, edits)
-		if err != nil {
-			t.Fatalf("Apply failed: %v", err)
-		}
-		if got != y {
-			t.Fatalf("%d: got\n%q, wanted\n%q, starting with %q", i, got, y, a)
-		}
-	}
-}
-
 func TestLineEdits(t *testing.T) {
 	for _, tc := range difftest.TestCases {
 		t.Run(tc.Name, func(t *testing.T) {
--- a/internal/diff/export_test.go
+++ b/internal/diff/export_test.go
@ -0,0 +1,9 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package diff
+
+// This file exports some private declarations to tests.
+
+var LineEdits = lineEdits
--- a/internal/diff/ndiff.go
+++ b/internal/diff/ndiff.go
@ -5,111 +5,104 @@
 package diff

 import (
-	"strings"
+	"bytes"
 	"unicode/utf8"

 	"golang.org/x/tools/internal/diff/lcs"
 )

+// Strings computes the differences between two strings.
+// The resulting edits respect rune boundaries.
+func Strings(before, after string) []Edit {
+	if before == after {
+		return nil // common case
+	}
+
+	if stringIsASCII(before) && stringIsASCII(after) {
+		return diffASCII([]byte(before), []byte(after))
+	}
+	return diffRunes([]rune(before), []rune(after))
+}
+
+// Bytes computes the differences between two byte slices.
+// The resulting edits respect rune boundaries.
+func Bytes(before, after []byte) []Edit {
+	if bytes.Equal(before, after) {
+		return nil // common case
+	}
+
+	if bytesIsASCII(before) && bytesIsASCII(after) {
+		return diffASCII(before, after)
+	}
+	return diffRunes(runes(before), runes(after))
+}
+
+func diffASCII(before, after []byte) []Edit {
+	diffs, _ := lcs.Compute(before, after, maxDiffs/2)
+
+	// Convert from LCS diffs.
+	res := make([]Edit, len(diffs))
+	for i, d := range diffs {
+		res[i] = Edit{d.Start, d.End, d.Text}
+	}
+	return res
+}
+
+func diffRunes(before, after []rune) []Edit {
+	diffs, _ := lcs.Compute(before, after, maxDiffs/2)
+
+	// The diffs returned by the lcs package use indexes
+	// into whatever slice was passed in.
+	// Convert rune offsets to byte offsets.
+	res := make([]Edit, len(diffs))
+	lastEnd := 0
+	utf8Len := 0
+	for i, d := range diffs {
+		utf8Len += runesLen(before[lastEnd:d.Start]) // text between edits
+		start := utf8Len
+		utf8Len += runesLen(before[d.Start:d.End]) // text deleted by this edit
+		res[i] = Edit{start, utf8Len, d.Text}
+		lastEnd = d.End
+	}
+	return res
+}
+
 // maxDiffs is a limit on how deeply the lcs algorithm should search
 // the value is just a guess
 const maxDiffs = 30

-// Strings computes the differences between two strings.
-// (Both it and the diff in the myers package have type ComputeEdits, which
-// is why the arguments are strings, not []bytes.)
-// TODO(adonovan): opt: consider switching everything to []bytes, if
-// that's the more common type in practice. Or provide both flavors?
-func Strings(before, after string) []Edit {
-	if before == after {
-		// very frequently true
-		return nil
+// runes is like []rune(string(bytes)) without the duplicate allocation.
+func runes(bytes []byte) []rune {
+	n := utf8.RuneCount(bytes)
+	runes := make([]rune, n)
+	for i := 0; i < n; i++ {
+		r, sz := utf8.DecodeRune(bytes)
+		bytes = bytes[sz:]
+		runes[i] = r
 	}
-	// The diffs returned by the lcs package use indexes into
-	// whatever slice was passed in. Edits use byte offsets, so
-	// rune or line offsets need to be converted.
-	// TODO(adonovan): opt: eliminate all the unnecessary allocations.
-	var diffs []lcs.Diff
-	if !isASCII(before) || !isASCII(after) {
-		diffs, _ = lcs.Compute([]rune(before), []rune(after), maxDiffs/2)
-		diffs = runeOffsets(diffs, []rune(before))
-	} else {
-		// Common case: pure ASCII. Avoid expansion to []rune slice.
-		diffs, _ = lcs.Compute([]byte(before), []byte(after), maxDiffs/2)
-	}
-	return convertDiffs(diffs)
+	return runes
 }

-// Lines computes the differences between two list of lines.
-// TODO(adonovan): unused except by its test. Do we actually need it?
-func Lines(before, after []string) []Edit {
-	diffs, _ := lcs.Compute(before, after, maxDiffs/2)
-	diffs = lineOffsets(diffs, before)
-	return convertDiffs(diffs)
-	// the code is not coping with possible missing \ns at the ends
-}
-
-func convertDiffs(diffs []lcs.Diff) []Edit {
-	ans := make([]Edit, len(diffs))
-	for i, d := range diffs {
-		ans[i] = Edit{d.Start, d.End, d.Text}
+// runesLen returns the length in bytes of the UTF-8 encoding of runes.
+func runesLen(runes []rune) (len int) {
+	for _, r := range runes {
+		len += utf8.RuneLen(r)
 	}
-	return ans
-}
-
-// convert diffs with rune offsets into diffs with byte offsets
-func runeOffsets(diffs []lcs.Diff, src []rune) []lcs.Diff {
-	var idx int
-	var tmp strings.Builder // string because []byte([]rune) is illegal
-	for i, d := range diffs {
-		tmp.WriteString(string(src[idx:d.Start]))
-		v := tmp.Len()
-		tmp.WriteString(string(src[d.Start:d.End]))
-		d.Start = v
-		idx = d.End
-		d.End = tmp.Len()
-		diffs[i] = d
-	}
-	return diffs
-}
-
-// convert diffs with line offsets into diffs with byte offsets
-func lineOffsets(diffs []lcs.Diff, src []string) []lcs.Diff {
-	var idx int
-	var tmp strings.Builder // bytes/
-	for i, d := range diffs {
-		tmp.WriteString(strJoin(src[idx:d.Start]))
-		v := tmp.Len()
-		tmp.WriteString(strJoin(src[d.Start:d.End]))
-		d.Start = v
-		idx = d.End
-		d.End = tmp.Len()
-		diffs[i] = d
-	}
-	return diffs
-}
-
-// join lines. (strings.Join doesn't add a trailing separator)
-func strJoin(elems []string) string {
-	if len(elems) == 0 {
-		return ""
-	}
-	n := 0
-	for i := 0; i < len(elems); i++ {
-		n += len(elems[i])
-	}
-
-	var b strings.Builder
-	b.Grow(n)
-	for _, s := range elems {
-		b.WriteString(s)
-		//b.WriteByte('\n')
-	}
-	return b.String()
+	return len
 }

 // isASCII reports whether s contains only ASCII.
-func isASCII(s string) bool {
+// TODO(adonovan): combine when x/tools allows generics.
+func stringIsASCII(s string) bool {
+	for i := 0; i < len(s); i++ {
+		if s[i] >= utf8.RuneSelf {
+			return false
+		}
+	}
+	return true
+}
+
+func bytesIsASCII(s []byte) bool {
 	for i := 0; i < len(s); i++ {
 		if s[i] >= utf8.RuneSelf {
 			return false
--- a/internal/diff/unified.go
+++ b/internal/diff/unified.go
@ -9,8 +9,6 @@ import (
 	"strings"
 )

-// TODO(adonovan): API: hide all but func Unified.
-
 // Unified applies the edits to content and presents a unified diff.
 // The old and new labels are the names of the content and result files.
 func Unified(oldLabel, newLabel string, content string, edits []Edit) string {
@ -92,7 +90,7 @@ func toUnified(fromName, toName string, content string, edits []Edit) unified {
 	if len(edits) == 0 {
 		return u
 	}
-	edits = LineEdits(content, edits) // expand to whole lines
+	edits = lineEdits(content, edits) // expand to whole lines
 	lines := splitLines(content)
 	var h *hunk
 	last := 0