diff --git a/src/strings/strings.go b/src/strings/strings.go
index 2650fb057c..a01eb698c4 100644
--- a/src/strings/strings.go
+++ b/src/strings/strings.go
@@ -290,11 +290,118 @@ func SplitAfter(s, sep string) []string {
 	return genSplit(s, sep, len(sep), -1)
 }
 
+var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1} // asciiSpace[b] is 1 for the six ASCII white space bytes, 0 for every other byte value.
+
 // Fields splits the string s around each instance of one or more consecutive white space
 // characters, as defined by unicode.IsSpace, returning an array of substrings of s or an
 // empty list if s contains only white space.
 func Fields(s string) []string {
-	return FieldsFunc(s, unicode.IsSpace)
+	// First count the fields.
+	// This is an exact count if s is ASCII, otherwise it is an approximation.
+	n := 0
+	wasSpace := 1 // Start as if preceded by a space so that a leading non-space byte begins a field.
+	// setBits is used to track which bits are set in the bytes of s.
+	setBits := uint8(0)
+	for i := 0; i < len(s); i++ {
+		r := s[i]
+		setBits |= r
+		isSpace := int(asciiSpace[r])
+		n += wasSpace & ^isSpace // Adds 1 exactly when the previous byte was a space and this one is not.
+		wasSpace = isSpace
+	}
+
+	if setBits < utf8.RuneSelf { // ASCII fast path: no byte of s has its high bit set.
+		a := make([]string, n) // n is exact here, so the slots are filled by index.
+		na := 0
+		fieldStart := 0
+		i := 0
+		// Skip spaces in the front of the input.
+		for i < len(s) && asciiSpace[s[i]] != 0 {
+			i++
+		}
+		fieldStart = i
+		for i < len(s) {
+			if asciiSpace[s[i]] == 0 {
+				i++
+				continue
+			}
+			a[na] = s[fieldStart:i]
+			na++
+			i++
+			// Skip spaces in between fields.
+			for i < len(s) && asciiSpace[s[i]] != 0 {
+				i++
+			}
+			fieldStart = i
+		}
+		if fieldStart < len(s) { // Last field might end at EOF.
+			a[na] = s[fieldStart:]
+		}
+		return a
+	}
+
+	// Some runes in the input string are not ASCII.
+	// Same general approach as in the ASCII path but
+	// uses DecodeRuneInString and unicode.IsSpace if
+	// a non-ASCII rune needs to be decoded and checked
+	// if it corresponds to a space.
+	a := make([]string, 0, n) // n is only a capacity hint here; append grows a if it is too small.
+	fieldStart := 0
+	i := 0
+	// Skip spaces in the front of the input.
+	for i < len(s) {
+		if c := s[i]; c < utf8.RuneSelf {
+			if asciiSpace[c] == 0 {
+				break
+			}
+			i++
+		} else {
+			r, w := utf8.DecodeRuneInString(s[i:])
+			if !unicode.IsSpace(r) {
+				break
+			}
+			i += w
+		}
+	}
+	fieldStart = i
+	for i < len(s) {
+		if c := s[i]; c < utf8.RuneSelf {
+			if asciiSpace[c] == 0 {
+				i++
+				continue
+			}
+			a = append(a, s[fieldStart:i])
+			i++
+		} else {
+			r, w := utf8.DecodeRuneInString(s[i:])
+			if !unicode.IsSpace(r) {
+				i += w
+				continue
+			}
+			a = append(a, s[fieldStart:i])
+			i += w
+		}
+		// Skip spaces in between fields.
+		for i < len(s) {
+			if c := s[i]; c < utf8.RuneSelf {
+				if asciiSpace[c] == 0 {
+					break
+				}
+				i++
+			} else {
+				r, w := utf8.DecodeRuneInString(s[i:])
+				if !unicode.IsSpace(r) {
+					break
+				}
+				i += w
+			}
+		}
+		fieldStart = i
+	}
+	if fieldStart < len(s) { // Last field might end at EOF.
+		a = append(a, s[fieldStart:])
+	}
+	return a
 }
 
 // FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c)
diff --git a/src/strings/strings_test.go b/src/strings/strings_test.go
index 97041eb9ac..58314a6868 100644
--- a/src/strings/strings_test.go
+++ b/src/strings/strings_test.go
@@ -452,6 +452,7 @@ var fieldstests = []FieldsTest{
 	{"", []string{}},
 	{" ", []string{}},
 	{" \t ", []string{}},
+	{"\u2000", []string{}},
 	{" abc ", []string{"abc"}},
 	{"1 2 3 4", []string{"1", "2", "3", "4"}},
 	{"1 2 3 4", []string{"1", "2", "3", "4"}},
@@ -459,6 +460,9 @@ var fieldstests = []FieldsTest{
 	{"1\u20002\u20013\u20024", []string{"1", "2", "3", "4"}},
 	{"\u2000\u2001\u2002", []string{}},
 	{"\n™\t™\n", []string{"™", "™"}},
+	{"\n\u20001™2\u2000 \u2001 ™", []string{"1™2", "™"}},
+	{"\n1\uFFFD \uFFFD2\u20003\uFFFD4", []string{"1\uFFFD", "\uFFFD2", "3\uFFFD4"}},
+	{"1\xFF\u2000\xFF2\xFF \xFF", []string{"1\xFF", "\xFF2\xFF", "\xFF"}},
 	{faces, []string{faces}},
 }
 
@@ -1473,19 +1477,55 @@ var makeFieldsInput = func() string {
 	return string(x)
 }
 
-var fieldsInput = makeFieldsInput()
+var makeFieldsInputASCII = func() string {
+	x := make([]byte, 1<<20)
+	// Input is ~10% space, rest ASCII non-space.
+	for i := range x {
+		if rand.Intn(10) == 0 {
+			x[i] = ' '
+		} else {
+			x[i] = 'x'
+		}
+	}
+	return string(x)
+}
+
+var stringdata = []struct{ name, data string }{
+	{"ASCII", makeFieldsInputASCII()},
+	{"Mixed", makeFieldsInput()},
+}
 
 func BenchmarkFields(b *testing.B) {
-	b.SetBytes(int64(len(fieldsInput)))
-	for i := 0; i < b.N; i++ {
-		Fields(fieldsInput)
+	for _, sd := range stringdata {
+		b.Run(sd.name, func(b *testing.B) {
+			for j := 1 << 4; j <= 1<<20; j <<= 4 { // Prefix lengths 16, 256, 4096, 65536 and 1048576 bytes.
+				b.Run(fmt.Sprintf("%d", j), func(b *testing.B) {
+					b.ReportAllocs()
+					b.SetBytes(int64(j))
+					data := sd.data[:j]
+					for i := 0; i < b.N; i++ {
+						Fields(data)
+					}
+				})
+			}
+		})
 	}
 }
 
 func BenchmarkFieldsFunc(b *testing.B) {
-	b.SetBytes(int64(len(fieldsInput)))
-	for i := 0; i < b.N; i++ {
-		FieldsFunc(fieldsInput, unicode.IsSpace)
+	for _, sd := range stringdata {
+		b.Run(sd.name, func(b *testing.B) {
+			for j := 1 << 4; j <= 1<<20; j <<= 4 { // Prefix lengths 16, 256, 4096, 65536 and 1048576 bytes.
+				b.Run(fmt.Sprintf("%d", j), func(b *testing.B) {
+					b.ReportAllocs()
+					b.SetBytes(int64(j))
+					data := sd.data[:j]
+					for i := 0; i < b.N; i++ {
+						FieldsFunc(data, unicode.IsSpace)
+					}
+				})
+			}
+		})
 	}
 }
 