mirror of https://github.com/golang/go.git
internal/diff: avoid unnecessary allocations
Previously, a diff of non-ASCII strings would incur three complete copies of the buffers; this changes reduces it to one. Also: - add diff.Bytes function, to avoid unnecessary conversions. - remove unused diff.Lines and LineEdits functions from API. - remove TODO to use []bytes everywhere. We tried it in CL 439277 and didn't like it. - Document that the diff is textual, even when given []byte. Change-Id: I2da3257cc3d12c569218a2d7ce182452e8647a96 Reviewed-on: https://go-review.googlesource.com/c/tools/+/439835 Reviewed-by: Robert Findley <rfindley@google.com> Run-TryBot: Alan Donovan <adonovan@google.com> Auto-Submit: Alan Donovan <adonovan@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> gopls-CI: kokoro <noreply+kokoro@google.com>
This commit is contained in:
parent
60ddccae85
commit
f2c4579afe
|
|
@ -157,7 +157,7 @@ func parseGoImpl(ctx context.Context, fset *token.FileSet, fh source.FileHandle,
|
|||
// it is likely we got stuck in a loop somehow. Log out a diff
|
||||
// of the last changes we made to aid in debugging.
|
||||
if i == 9 {
|
||||
edits := diff.Strings(string(src), string(newSrc))
|
||||
edits := diff.Bytes(src, newSrc)
|
||||
unified := diff.Unified("before", "after", string(src), edits)
|
||||
event.Log(ctx, fmt.Sprintf("fixSrc loop - last diff:\n%v", unified), tag.File.Of(tok.Name()))
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package diff computes differences between files or strings.
|
||||
// Package diff computes differences between text files or strings.
|
||||
package diff
|
||||
|
||||
import (
|
||||
|
|
@ -11,13 +11,7 @@ import (
|
|||
"strings"
|
||||
)
|
||||
|
||||
// TODO(adonovan): switch to []byte throughout.
|
||||
// But make clear that the operation is defined on runes, not bytes.
|
||||
// Also:
|
||||
// - delete LineEdits? (used only by Unified and test)
|
||||
// - delete Lines (unused except by its test)
|
||||
|
||||
// An Edit describes the replacement of a portion of a file.
|
||||
// An Edit describes the replacement of a portion of a text file.
|
||||
type Edit struct {
|
||||
Start, End int // byte offsets of the region to replace
|
||||
New string // the replacement
|
||||
|
|
@ -88,15 +82,9 @@ func (a editsSort) Less(i, j int) bool {
|
|||
}
|
||||
func (a editsSort) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||
|
||||
// LineEdits expands and merges a sequence of edits so that each
|
||||
// lineEdits expands and merges a sequence of edits so that each
|
||||
// resulting edit replaces one or more complete lines.
|
||||
//
|
||||
// It may panic or produce garbage if the edits
|
||||
// are overlapping, out of bounds of src, or out of order.
|
||||
// TODO(adonovan): see consistency note at Apply.
|
||||
// We could hide this from the API so that we can enforce
|
||||
// the precondition... but it seems like a reasonable feature.
|
||||
func LineEdits(src string, edits []Edit) []Edit {
|
||||
func lineEdits(src string, edits []Edit) []Edit {
|
||||
sortEdits(edits) // TODO(adonovan): is this necessary? Move burden to caller?
|
||||
|
||||
// Do all edits begin and end at the start of a line?
|
||||
|
|
|
|||
|
|
@ -87,36 +87,6 @@ func FuzzRoundTrip(f *testing.F) {
|
|||
})
|
||||
}
|
||||
|
||||
func TestNLinesRandom(t *testing.T) {
|
||||
rand.Seed(2)
|
||||
for i := 0; i < 1000; i++ {
|
||||
x := randlines("abω", 4) // avg line length is 6, want a change every 3rd line or so
|
||||
v := []rune(x)
|
||||
for i := 0; i < len(v); i++ {
|
||||
if rand.Float64() < .05 {
|
||||
v[i] = 'N'
|
||||
}
|
||||
}
|
||||
y := string(v)
|
||||
// occasionally remove the trailing \n
|
||||
if rand.Float64() < .1 {
|
||||
x = x[:len(x)-1]
|
||||
}
|
||||
if rand.Float64() < .1 {
|
||||
y = y[:len(y)-1]
|
||||
}
|
||||
a, b := strings.SplitAfter(x, "\n"), strings.SplitAfter(y, "\n")
|
||||
edits := diff.Lines(a, b)
|
||||
got, err := diff.Apply(x, edits)
|
||||
if err != nil {
|
||||
t.Fatalf("Apply failed: %v", err)
|
||||
}
|
||||
if got != y {
|
||||
t.Fatalf("%d: got\n%q, wanted\n%q, starting with %q", i, got, y, a)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestLineEdits(t *testing.T) {
|
||||
for _, tc := range difftest.TestCases {
|
||||
t.Run(tc.Name, func(t *testing.T) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,9 @@
|
|||
// Copyright 2022 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package diff
|
||||
|
||||
// This file exports some private declarations to tests.
|
||||
|
||||
var LineEdits = lineEdits
|
||||
|
|
@ -5,111 +5,104 @@
|
|||
package diff
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"bytes"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/tools/internal/diff/lcs"
|
||||
)
|
||||
|
||||
// Strings computes the differences between two strings.
|
||||
// The resulting edits respect rune boundaries.
|
||||
func Strings(before, after string) []Edit {
|
||||
if before == after {
|
||||
return nil // common case
|
||||
}
|
||||
|
||||
if stringIsASCII(before) && stringIsASCII(after) {
|
||||
return diffASCII([]byte(before), []byte(after))
|
||||
}
|
||||
return diffRunes([]rune(before), []rune(after))
|
||||
}
|
||||
|
||||
// Bytes computes the differences between two byte slices.
|
||||
// The resulting edits respect rune boundaries.
|
||||
func Bytes(before, after []byte) []Edit {
|
||||
if bytes.Equal(before, after) {
|
||||
return nil // common case
|
||||
}
|
||||
|
||||
if bytesIsASCII(before) && bytesIsASCII(after) {
|
||||
return diffASCII(before, after)
|
||||
}
|
||||
return diffRunes(runes(before), runes(after))
|
||||
}
|
||||
|
||||
func diffASCII(before, after []byte) []Edit {
|
||||
diffs, _ := lcs.Compute(before, after, maxDiffs/2)
|
||||
|
||||
// Convert from LCS diffs.
|
||||
res := make([]Edit, len(diffs))
|
||||
for i, d := range diffs {
|
||||
res[i] = Edit{d.Start, d.End, d.Text}
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func diffRunes(before, after []rune) []Edit {
|
||||
diffs, _ := lcs.Compute(before, after, maxDiffs/2)
|
||||
|
||||
// The diffs returned by the lcs package use indexes
|
||||
// into whatever slice was passed in.
|
||||
// Convert rune offsets to byte offsets.
|
||||
res := make([]Edit, len(diffs))
|
||||
lastEnd := 0
|
||||
utf8Len := 0
|
||||
for i, d := range diffs {
|
||||
utf8Len += runesLen(before[lastEnd:d.Start]) // text between edits
|
||||
start := utf8Len
|
||||
utf8Len += runesLen(before[d.Start:d.End]) // text deleted by this edit
|
||||
res[i] = Edit{start, utf8Len, d.Text}
|
||||
lastEnd = d.End
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
// maxDiffs is a limit on how deeply the lcs algorithm should search
|
||||
// the value is just a guess
|
||||
const maxDiffs = 30
|
||||
|
||||
// Strings computes the differences between two strings.
|
||||
// (Both it and the diff in the myers package have type ComputeEdits, which
|
||||
// is why the arguments are strings, not []bytes.)
|
||||
// TODO(adonovan): opt: consider switching everything to []bytes, if
|
||||
// that's the more common type in practice. Or provide both flavors?
|
||||
func Strings(before, after string) []Edit {
|
||||
if before == after {
|
||||
// very frequently true
|
||||
return nil
|
||||
// runes is like []rune(string(bytes)) without the duplicate allocation.
|
||||
func runes(bytes []byte) []rune {
|
||||
n := utf8.RuneCount(bytes)
|
||||
runes := make([]rune, n)
|
||||
for i := 0; i < n; i++ {
|
||||
r, sz := utf8.DecodeRune(bytes)
|
||||
bytes = bytes[sz:]
|
||||
runes[i] = r
|
||||
}
|
||||
// The diffs returned by the lcs package use indexes into
|
||||
// whatever slice was passed in. Edits use byte offsets, so
|
||||
// rune or line offsets need to be converted.
|
||||
// TODO(adonovan): opt: eliminate all the unnecessary allocations.
|
||||
var diffs []lcs.Diff
|
||||
if !isASCII(before) || !isASCII(after) {
|
||||
diffs, _ = lcs.Compute([]rune(before), []rune(after), maxDiffs/2)
|
||||
diffs = runeOffsets(diffs, []rune(before))
|
||||
} else {
|
||||
// Common case: pure ASCII. Avoid expansion to []rune slice.
|
||||
diffs, _ = lcs.Compute([]byte(before), []byte(after), maxDiffs/2)
|
||||
}
|
||||
return convertDiffs(diffs)
|
||||
return runes
|
||||
}
|
||||
|
||||
// Lines computes the differences between two list of lines.
|
||||
// TODO(adonovan): unused except by its test. Do we actually need it?
|
||||
func Lines(before, after []string) []Edit {
|
||||
diffs, _ := lcs.Compute(before, after, maxDiffs/2)
|
||||
diffs = lineOffsets(diffs, before)
|
||||
return convertDiffs(diffs)
|
||||
// the code is not coping with possible missing \ns at the ends
|
||||
}
|
||||
|
||||
func convertDiffs(diffs []lcs.Diff) []Edit {
|
||||
ans := make([]Edit, len(diffs))
|
||||
for i, d := range diffs {
|
||||
ans[i] = Edit{d.Start, d.End, d.Text}
|
||||
// runesLen returns the length in bytes of the UTF-8 encoding of runes.
|
||||
func runesLen(runes []rune) (len int) {
|
||||
for _, r := range runes {
|
||||
len += utf8.RuneLen(r)
|
||||
}
|
||||
return ans
|
||||
}
|
||||
|
||||
// convert diffs with rune offsets into diffs with byte offsets
|
||||
func runeOffsets(diffs []lcs.Diff, src []rune) []lcs.Diff {
|
||||
var idx int
|
||||
var tmp strings.Builder // string because []byte([]rune) is illegal
|
||||
for i, d := range diffs {
|
||||
tmp.WriteString(string(src[idx:d.Start]))
|
||||
v := tmp.Len()
|
||||
tmp.WriteString(string(src[d.Start:d.End]))
|
||||
d.Start = v
|
||||
idx = d.End
|
||||
d.End = tmp.Len()
|
||||
diffs[i] = d
|
||||
}
|
||||
return diffs
|
||||
}
|
||||
|
||||
// convert diffs with line offsets into diffs with byte offsets
|
||||
func lineOffsets(diffs []lcs.Diff, src []string) []lcs.Diff {
|
||||
var idx int
|
||||
var tmp strings.Builder // bytes/
|
||||
for i, d := range diffs {
|
||||
tmp.WriteString(strJoin(src[idx:d.Start]))
|
||||
v := tmp.Len()
|
||||
tmp.WriteString(strJoin(src[d.Start:d.End]))
|
||||
d.Start = v
|
||||
idx = d.End
|
||||
d.End = tmp.Len()
|
||||
diffs[i] = d
|
||||
}
|
||||
return diffs
|
||||
}
|
||||
|
||||
// join lines. (strings.Join doesn't add a trailing separator)
|
||||
func strJoin(elems []string) string {
|
||||
if len(elems) == 0 {
|
||||
return ""
|
||||
}
|
||||
n := 0
|
||||
for i := 0; i < len(elems); i++ {
|
||||
n += len(elems[i])
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
b.Grow(n)
|
||||
for _, s := range elems {
|
||||
b.WriteString(s)
|
||||
//b.WriteByte('\n')
|
||||
}
|
||||
return b.String()
|
||||
return len
|
||||
}
|
||||
|
||||
// isASCII reports whether s contains only ASCII.
|
||||
func isASCII(s string) bool {
|
||||
// TODO(adonovan): combine when x/tools allows generics.
|
||||
func stringIsASCII(s string) bool {
|
||||
for i := 0; i < len(s); i++ {
|
||||
if s[i] >= utf8.RuneSelf {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func bytesIsASCII(s []byte) bool {
|
||||
for i := 0; i < len(s); i++ {
|
||||
if s[i] >= utf8.RuneSelf {
|
||||
return false
|
||||
|
|
|
|||
|
|
@ -9,8 +9,6 @@ import (
|
|||
"strings"
|
||||
)
|
||||
|
||||
// TODO(adonovan): API: hide all but func Unified.
|
||||
|
||||
// Unified applies the edits to content and presents a unified diff.
|
||||
// The old and new labels are the names of the content and result files.
|
||||
func Unified(oldLabel, newLabel string, content string, edits []Edit) string {
|
||||
|
|
@ -92,7 +90,7 @@ func toUnified(fromName, toName string, content string, edits []Edit) unified {
|
|||
if len(edits) == 0 {
|
||||
return u
|
||||
}
|
||||
edits = LineEdits(content, edits) // expand to whole lines
|
||||
edits = lineEdits(content, edits) // expand to whole lines
|
||||
lines := splitLines(content)
|
||||
var h *hunk
|
||||
last := 0
|
||||
|
|
|
|||
Loading…
Reference in New Issue