mirror of https://github.com/golang/go.git
encoding/csv: add option to reuse slices returned by Read
In many cases the records returned by Reader.Read will only be used between calls to Read and become garbage once a new record is read. In this case, instead of allocating a new slice on each call to Read, we can reuse the last allocated slice for successive calls to avoid unnecessary allocations. This change adds a new field ReuseRecord to the Reader struct to enable this reuse. ReuseRecord is false by default to avoid breaking existing code which depends on the current behaviour. I also added 4 new benchmarks, corresponding to the existing Read benchmarks, which set ReuseRecord to true. Benchstat on my local machine (old is ReuseRecord = false, new is ReuseRecord = true) name old time/op new time/op delta Read-8 2.75µs ± 2% 1.88µs ± 1% -31.52% (p=0.000 n=14+15) ReadWithFieldsPerRecord-8 2.75µs ± 0% 1.89µs ± 1% -31.43% (p=0.000 n=13+13) ReadWithoutFieldsPerRecord-8 2.77µs ± 1% 1.88µs ± 1% -32.06% (p=0.000 n=15+15) ReadLargeFields-8 55.4µs ± 1% 54.2µs ± 0% -2.07% (p=0.000 n=15+14) name old alloc/op new alloc/op delta Read-8 664B ± 0% 24B ± 0% -96.39% (p=0.000 n=15+15) ReadWithFieldsPerRecord-8 664B ± 0% 24B ± 0% -96.39% (p=0.000 n=15+15) ReadWithoutFieldsPerRecord-8 664B ± 0% 24B ± 0% -96.39% (p=0.000 n=15+15) ReadLargeFields-8 3.94kB ± 0% 2.98kB ± 0% -24.39% (p=0.000 n=15+15) name old allocs/op new allocs/op delta Read-8 18.0 ± 0% 8.0 ± 0% -55.56% (p=0.000 n=15+15) ReadWithFieldsPerRecord-8 18.0 ± 0% 8.0 ± 0% -55.56% (p=0.000 n=15+15) ReadWithoutFieldsPerRecord-8 18.0 ± 0% 8.0 ± 0% -55.56% (p=0.000 n=15+15) ReadLargeFields-8 24.0 ± 0% 12.0 ± 0% -50.00% (p=0.000 n=15+15) Fixes #19721 Change-Id: I79b14128bb9bb3465f53f40f93b1b528a9da6f58 Reviewed-on: https://go-review.googlesource.com/41730 Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
This commit is contained in:
parent
ba8ff87dbe
commit
2181653be6
|
|
@ -110,6 +110,10 @@ type Reader struct {
|
|||
// If TrimLeadingSpace is true, leading white space in a field is ignored.
|
||||
// This is done even if the field delimiter, Comma, is white space.
|
||||
TrimLeadingSpace bool
|
||||
// ReuseRecord controls whether calls to Read may return a slice sharing
|
||||
// the backing array of the previous call's returned slice for performance.
|
||||
// By default, each call to Read returns newly allocated memory owned by the caller.
|
||||
ReuseRecord bool
|
||||
|
||||
line int
|
||||
column int
|
||||
|
|
@ -122,6 +126,9 @@ type Reader struct {
|
|||
// Indexes of fields inside lineBuffer
|
||||
// The i'th field starts at offset fieldIndexes[i] in lineBuffer.
|
||||
fieldIndexes []int
|
||||
|
||||
// only used when ReuseRecord == true
|
||||
lastRecord []string
|
||||
}
|
||||
|
||||
// NewReader returns a new Reader that reads from r.
|
||||
|
|
@ -147,9 +154,43 @@ func (r *Reader) error(err error) error {
|
|||
// Except for that case, Read always returns either a non-nil
|
||||
// record or a non-nil error, but not both.
|
||||
// If there is no data left to be read, Read returns nil, io.EOF.
|
||||
// If ReuseRecord is true, the returned slice may be shared
|
||||
// between multiple calls to Read.
|
||||
func (r *Reader) Read() (record []string, err error) {
|
||||
if r.ReuseRecord {
|
||||
record, err = r.readRecord(r.lastRecord)
|
||||
r.lastRecord = record
|
||||
} else {
|
||||
record, err = r.readRecord(nil)
|
||||
}
|
||||
|
||||
return record, err
|
||||
}
|
||||
|
||||
// ReadAll reads all the remaining records from r.
|
||||
// Each record is a slice of fields.
|
||||
// A successful call returns err == nil, not err == io.EOF. Because ReadAll is
|
||||
// defined to read until EOF, it does not treat end of file as an error to be
|
||||
// reported.
|
||||
func (r *Reader) ReadAll() (records [][]string, err error) {
|
||||
for {
|
||||
record, err = r.parseRecord()
|
||||
record, err := r.readRecord(nil)
|
||||
if err == io.EOF {
|
||||
return records, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
records = append(records, record)
|
||||
}
|
||||
}
|
||||
|
||||
// readRecord reads and parses a single csv record from r.
|
||||
// Unlike parseRecord, readRecord handles FieldsPerRecord.
|
||||
// If dst has enough capacity it will be used for the returned record.
|
||||
func (r *Reader) readRecord(dst []string) (record []string, err error) {
|
||||
for {
|
||||
record, err = r.parseRecord(dst)
|
||||
if record != nil {
|
||||
break
|
||||
}
|
||||
|
|
@ -169,24 +210,6 @@ func (r *Reader) Read() (record []string, err error) {
|
|||
return record, nil
|
||||
}
|
||||
|
||||
// ReadAll reads all the remaining records from r.
|
||||
// Each record is a slice of fields.
|
||||
// A successful call returns err == nil, not err == io.EOF. Because ReadAll is
|
||||
// defined to read until EOF, it does not treat end of file as an error to be
|
||||
// reported.
|
||||
func (r *Reader) ReadAll() (records [][]string, err error) {
|
||||
for {
|
||||
record, err := r.Read()
|
||||
if err == io.EOF {
|
||||
return records, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
records = append(records, record)
|
||||
}
|
||||
}
|
||||
|
||||
// readRune reads one rune from r, folding \r\n to \n and keeping track
|
||||
// of how far into the line we have read. r.column will point to the start
|
||||
// of this rune, not the end of this rune.
|
||||
|
|
@ -223,7 +246,8 @@ func (r *Reader) skip(delim rune) error {
|
|||
}
|
||||
|
||||
// parseRecord reads and parses a single csv record from r.
|
||||
func (r *Reader) parseRecord() (fields []string, err error) {
|
||||
// If dst has enough capacity it will be used for the returned fields.
|
||||
func (r *Reader) parseRecord(dst []string) (fields []string, err error) {
|
||||
// Each record starts on a new line. We increment our line
|
||||
// number (lines start at 1, not 0) and set column to -1
|
||||
// so as we increment in readRune it points to the character we read.
|
||||
|
|
@ -275,7 +299,12 @@ func (r *Reader) parseRecord() (fields []string, err error) {
|
|||
// minimal and a tradeoff for better performance through the combined
|
||||
// allocations.
|
||||
line := r.lineBuffer.String()
|
||||
fields = make([]string, fieldCount)
|
||||
|
||||
if cap(dst) >= fieldCount {
|
||||
fields = dst[:fieldCount]
|
||||
} else {
|
||||
fields = make([]string, fieldCount)
|
||||
}
|
||||
|
||||
for i, idx := range r.fieldIndexes {
|
||||
if i == fieldCount-1 {
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ var readTests = []struct {
|
|||
LazyQuotes bool
|
||||
TrailingComma bool
|
||||
TrimLeadingSpace bool
|
||||
ReuseRecord bool
|
||||
|
||||
Error string
|
||||
Line int // Expected error line if != 0
|
||||
|
|
@ -260,6 +261,15 @@ x,,,
|
|||
{"c", "d", "e"},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "ReadAllReuseRecord",
|
||||
ReuseRecord: true,
|
||||
Input: "a,b\nc,d",
|
||||
Output: [][]string{
|
||||
{"a", "b"},
|
||||
{"c", "d"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
func TestRead(t *testing.T) {
|
||||
|
|
@ -274,6 +284,7 @@ func TestRead(t *testing.T) {
|
|||
r.LazyQuotes = tt.LazyQuotes
|
||||
r.TrailingComma = tt.TrailingComma
|
||||
r.TrimLeadingSpace = tt.TrimLeadingSpace
|
||||
r.ReuseRecord = tt.ReuseRecord
|
||||
if tt.Comma != 0 {
|
||||
r.Comma = tt.Comma
|
||||
}
|
||||
|
|
@ -369,3 +380,23 @@ xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzz
|
|||
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
|
||||
`, 3))
|
||||
}
|
||||
|
||||
func BenchmarkReadReuseRecord(b *testing.B) {
|
||||
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, benchmarkCSVData)
|
||||
}
|
||||
|
||||
func BenchmarkReadReuseRecordWithFieldsPerRecord(b *testing.B) {
|
||||
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = 4 }, benchmarkCSVData)
|
||||
}
|
||||
|
||||
func BenchmarkReadReuseRecordWithoutFieldsPerRecord(b *testing.B) {
|
||||
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = -1 }, benchmarkCSVData)
|
||||
}
|
||||
|
||||
func BenchmarkReadReuseRecordLargeFields(b *testing.B) {
|
||||
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, strings.Repeat(`xxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
|
||||
xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvv
|
||||
,,zzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
|
||||
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
|
||||
`, 3))
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue