internal/lsp: update the fuzzy matcher to operate on chunks

We can avoid allocating strings when performing workspace symbol search
by having the fuzzy match operate directly on chunks.

When operating on a single string, this slows down the matcher slightly
(perhaps 10%) due to copying bytes rather than accessing the string
directly. We could work around this using unsafe, but this could also be
mitigated by generics.

Benchmark ("test" in x/tools): 48ms->46ms
Benchmark ("test" in kubernetes): 868ms->857ms

Change-Id: Icf0f15aaa5cc3c875cf157a7b90db801045d9ed4
Reviewed-on: https://go-review.googlesource.com/c/tools/+/338694
Trust: Robert Findley <rfindley@google.com>
Run-TryBot: Robert Findley <rfindley@google.com>
gopls-CI: kokoro <noreply+kokoro@google.com>
Reviewed-by: Rebecca Stambler <rstambler@golang.org>
This commit is contained in:
Rob Findley 2021-07-30 15:11:53 -04:00 committed by Robert Findley
parent 0d28b7d7c5
commit 15eebf7e82
4 changed files with 47 additions and 24 deletions

View File

@ -27,23 +27,23 @@ const (
// RuneRoles detects the roles of each byte rune in an input string and stores it in the output
// slice. The rune role depends on the input type. Stops when it parsed all the runes in the string
// or when it filled the output. If output is nil, then it gets created.
func RuneRoles(str string, reuse []RuneRole) []RuneRole {
func RuneRoles(candidate []byte, reuse []RuneRole) []RuneRole {
var output []RuneRole
if cap(reuse) < len(str) {
output = make([]RuneRole, 0, len(str))
if cap(reuse) < len(candidate) {
output = make([]RuneRole, 0, len(candidate))
} else {
output = reuse[:0]
}
prev, prev2 := rtNone, rtNone
for i := 0; i < len(str); i++ {
r := rune(str[i])
for i := 0; i < len(candidate); i++ {
r := rune(candidate[i])
role := RNone
curr := rtLower
if str[i] <= unicode.MaxASCII {
curr = runeType(rt[str[i]] - '0')
if candidate[i] <= unicode.MaxASCII {
curr = runeType(rt[candidate[i]] - '0')
}
if curr == rtLower {
@ -58,7 +58,7 @@ func RuneRoles(str string, reuse []RuneRole) []RuneRole {
if prev == rtUpper {
// This and previous characters are both upper case.
if i+1 == len(str) {
if i+1 == len(candidate) {
// This is last character, previous was also uppercase -> this is UCTail
// i.e., (current char is C): aBC / BC / ABC
role = RUCTail
@ -118,11 +118,26 @@ func LastSegment(input string, roles []RuneRole) string {
return input[start+1 : end+1]
}
// ToLower transforms the input string to lower case, which is stored in the output byte slice.
// fromChunks copies string chunks into the given buffer.
func fromChunks(chunks []string, buffer []byte) []byte {
ii := 0
for _, chunk := range chunks {
for i := 0; i < len(chunk); i++ {
if ii >= cap(buffer) {
break
}
buffer[ii] = chunk[i]
ii++
}
}
return buffer[:ii]
}
// toLower transforms the input string to lower case, which is stored in the output byte slice.
// The lower casing considers only ASCII values - non ASCII values are left unmodified.
// Stops when parsed all input or when it filled the output slice. If output is nil, then it gets
// created.
func ToLower(input string, reuse []byte) []byte {
func toLower(input []byte, reuse []byte) []byte {
output := reuse
if cap(reuse) < len(input) {
output = make([]byte, len(input))
@ -130,7 +145,7 @@ func ToLower(input string, reuse []byte) []byte {
for i := 0; i < len(input); i++ {
r := rune(input[i])
if r <= unicode.MaxASCII {
if input[i] <= unicode.MaxASCII {
if 'A' <= r && r <= 'Z' {
r += 'a' - 'A'
}

View File

@ -36,7 +36,7 @@ func rolesString(roles []fuzzy.RuneRole) string {
func TestRoles(t *testing.T) {
for _, tc := range rolesTests {
gotRoles := make([]fuzzy.RuneRole, len(tc.str))
fuzzy.RuneRoles(tc.str, gotRoles)
fuzzy.RuneRoles([]byte(tc.str), gotRoles)
got := rolesString(gotRoles)
if got != tc.want {
t.Errorf("roles(%s) = %v; want %v", tc.str, got, tc.want)
@ -68,7 +68,7 @@ var wordSplitTests = []struct {
func TestWordSplit(t *testing.T) {
for _, tc := range wordSplitTests {
roles := fuzzy.RuneRoles(tc.input, nil)
roles := fuzzy.RuneRoles([]byte(tc.input), nil)
var got []string
consumer := func(i, j int) {
@ -120,7 +120,7 @@ var lastSegmentSplitTests = []struct {
func TestLastSegment(t *testing.T) {
for _, tc := range lastSegmentSplitTests {
roles := fuzzy.RuneRoles(tc.str, nil)
roles := fuzzy.RuneRoles([]byte(tc.str), nil)
got := fuzzy.LastSegment(tc.str, roles)
@ -135,7 +135,7 @@ func BenchmarkRoles(b *testing.B) {
out := make([]fuzzy.RuneRole, len(str))
for i := 0; i < b.N; i++ {
fuzzy.RuneRoles(str, out)
fuzzy.RuneRoles([]byte(str), out)
}
b.SetBytes(int64(len(str)))
}

View File

@ -51,8 +51,12 @@ type Matcher struct {
lastCandidateLen int // in bytes
lastCandidateMatched bool
// Here we save the last candidate in lower-case. This is basically a byte slice we reuse for
// performance reasons, so the slice is not reallocated for every candidate.
// Reusable buffers to avoid allocating for every candidate.
// - inputBuf stores the concatenated input chunks
// - lowerBuf stores the last candidate in lower-case
// - rolesBuf stores the calculated roles for each rune in the last
// candidate.
inputBuf [MaxInputSize]byte
lowerBuf [MaxInputSize]byte
rolesBuf [MaxInputSize]RuneRole
}
@ -72,7 +76,7 @@ func NewMatcher(pattern string) *Matcher {
m := &Matcher{
pattern: pattern,
patternLower: ToLower(pattern, nil),
patternLower: toLower([]byte(pattern), nil),
}
for i, c := range m.patternLower {
@ -88,7 +92,7 @@ func NewMatcher(pattern string) *Matcher {
m.patternShort = m.patternLower
}
m.patternRoles = RuneRoles(pattern, nil)
m.patternRoles = RuneRoles([]byte(pattern), nil)
if len(pattern) > 0 {
maxCharScore := 4
@ -102,10 +106,15 @@ func NewMatcher(pattern string) *Matcher {
// This is not designed for parallel use. Multiple candidates must be scored sequentially.
// Returns a score between 0 and 1 (0 - no match, 1 - perfect match).
func (m *Matcher) Score(candidate string) float32 {
return m.ScoreChunks([]string{candidate})
}
func (m *Matcher) ScoreChunks(chunks []string) float32 {
candidate := fromChunks(chunks, m.inputBuf[:])
if len(candidate) > MaxInputSize {
candidate = candidate[:MaxInputSize]
}
lower := ToLower(candidate, m.lowerBuf[:])
lower := toLower(candidate, m.lowerBuf[:])
m.lastCandidateLen = len(candidate)
if len(m.pattern) == 0 {
@ -174,7 +183,7 @@ func (m *Matcher) MatchedRanges() []int {
return ret
}
func (m *Matcher) match(candidate string, candidateLower []byte) bool {
func (m *Matcher) match(candidate []byte, candidateLower []byte) bool {
i, j := 0, 0
for ; i < len(candidateLower) && j < len(m.patternLower); i++ {
if candidateLower[i] == m.patternLower[j] {
@ -192,7 +201,7 @@ func (m *Matcher) match(candidate string, candidateLower []byte) bool {
return true
}
func (m *Matcher) computeScore(candidate string, candidateLower []byte) int {
func (m *Matcher) computeScore(candidate []byte, candidateLower []byte) int {
pattLen, candLen := len(m.pattern), len(candidate)
for j := 0; j <= len(m.pattern); j++ {

View File

@ -233,8 +233,7 @@ func parseQuery(q string) matcherFunc {
default:
fm := fuzzy.NewMatcher(field)
f = func(chunks []string) (int, float64) {
s := strings.Join(chunks, "")
score := float64(fm.Score(s))
score := float64(fm.ScoreChunks(chunks))
ranges := fm.MatchedRanges()
if len(ranges) > 0 {
return ranges[0], score