diff --git a/src/encoding/csv/reader.go b/src/encoding/csv/reader.go index e49240fb53..3c08b9f9d1 100644 --- a/src/encoding/csv/reader.go +++ b/src/encoding/csv/reader.go @@ -58,6 +58,7 @@ import ( "fmt" "io" "unicode" + "unicode/utf8" ) // A ParseError is returned for parsing errors. @@ -115,20 +116,25 @@ type Reader struct { // By default, each call to Read returns newly allocated memory owned by the caller. ReuseRecord bool - line int - recordLine int // line where the current record started - column int - r *bufio.Reader - // lineBuffer holds the unescaped fields read by readField, one after another. + r *bufio.Reader + + // numLine is the current line being read in the CSV file. + numLine int + + // rawBuffer is a line buffer only used by the readLine method. + rawBuffer []byte + + // recordBuffer holds the unescaped fields, one after another. // The fields can be accessed by using the indexes in fieldIndexes. - // Example: for the row `a,"b","c""d",e` lineBuffer will contain `abc"de` and - // fieldIndexes will contain the indexes 0, 1, 2, 5. - lineBuffer bytes.Buffer - // Indexes of fields inside lineBuffer - // The i'th field starts at offset fieldIndexes[i] in lineBuffer. + // E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de` + // and fieldIndexes will contain the indexes [1, 2, 5, 6]. + recordBuffer []byte + + // fieldIndexes is an index of fields inside recordBuffer. + // The i'th field ends at offset fieldIndexes[i] in recordBuffer. fieldIndexes []int - // only used when ReuseRecord == true + // lastRecord is a record cache and only used when ReuseRecord == true. lastRecord []string } @@ -140,15 +146,6 @@ func NewReader(r io.Reader) *Reader { } } -// error creates a new ParseError based on err. -func (r *Reader) error(err error) error { - return &ParseError{ - Line: r.recordLine, - Column: r.column, - Err: err, - } -} - // Read reads one record (a slice of fields) from r. // If the record has an unexpected number of fields, // Read returns the record along with the error ErrFieldCount. @@ -164,7 +161,6 @@ func (r *Reader) Read() (record []string, err error) { } else { record, err = r.readRecord(nil) } - return record, err } @@ -186,237 +182,182 @@ func (r *Reader) ReadAll() (records [][]string, err error) { } } -// readRecord reads and parses a single csv record from r. -// Unlike parseRecord, readRecord handles FieldsPerRecord. -// If dst has enough capacity it will be used for the returned record. -func (r *Reader) readRecord(dst []string) (record []string, err error) { - for { - record, err = r.parseRecord(dst) - if record != nil { - break +// readLine reads the next line (with the trailing endline). +// If EOF is hit without a trailing endline, it will be omitted. +// If some bytes were read, then the error is never io.EOF. +// The result is only valid until the next call to readLine. +func (r *Reader) readLine() ([]byte, error) { + line, err := r.r.ReadSlice('\n') + if err == bufio.ErrBufferFull { + r.rawBuffer = append(r.rawBuffer[:0], line...) + for err == bufio.ErrBufferFull { + line, err = r.r.ReadSlice('\n') + r.rawBuffer = append(r.rawBuffer, line...) } - if err != nil { - return nil, err + line = r.rawBuffer + } + if len(line) > 0 && err == io.EOF { + err = nil + } + r.numLine++ + return line, err +} + +// lengthCRLF reports the number of bytes for a trailing "\r\n". +func lengthCRLF(b []byte) int { + if j := len(b) - 1; j >= 0 && b[j] == '\n' { + if j := len(b) - 2; j >= 0 && b[j] == '\r' { + return 2 } + return 1 + } + return 0 +} + +// nextRune returns the next rune in b or utf8.RuneError. +func nextRune(b []byte) rune { + r, _ := utf8.DecodeRune(b) + return r +} + +func (r *Reader) readRecord(dst []string) ([]string, error) { + // Read line (automatically skipping past empty lines and any comments). + var line, fullLine []byte + var errRead error + for errRead == nil { + line, errRead = r.readLine() + if r.Comment != 0 && nextRune(line) == r.Comment { + line = nil + continue // Skip comment lines + } + if errRead == nil && len(line) == lengthCRLF(line) { + line = nil + continue // Skip empty lines + } + fullLine = line + break + } + if errRead == io.EOF { + return nil, errRead } + // Parse each field in the record. + var err error + const quoteLen = len(`"`) + commaLen := utf8.RuneLen(r.Comma) + recLine := r.numLine // Starting line for record + r.recordBuffer = r.recordBuffer[:0] + r.fieldIndexes = r.fieldIndexes[:0] +parseField: + for { + if r.TrimLeadingSpace { + line = bytes.TrimLeftFunc(line, unicode.IsSpace) + } + if len(line) == 0 || line[0] != '"' { + // Non-quoted string field + i := bytes.IndexRune(line, r.Comma) + field := line + if i >= 0 { + field = field[:i] + } else { + field = field[:len(field)-lengthCRLF(field)] + } + // Check to make sure a quote does not appear in field. + if !r.LazyQuotes { + if j := bytes.IndexByte(field, '"'); j >= 0 { + col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])]) + err = &ParseError{Line: r.numLine, Column: col, Err: ErrBareQuote} + break parseField + } + } + r.recordBuffer = append(r.recordBuffer, field...) + r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) + if i >= 0 { + line = line[i+commaLen:] + continue parseField + } + break parseField + } else { + // Quoted string field + line = line[quoteLen:] + for { + i := bytes.IndexByte(line, '"') + if i >= 0 { + // Hit next quote. + r.recordBuffer = append(r.recordBuffer, line[:i]...) + line = line[i+quoteLen:] + switch rn := nextRune(line); { + case rn == '"': + // `""` sequence (append quote). + r.recordBuffer = append(r.recordBuffer, '"') + line = line[quoteLen:] + case rn == r.Comma: + // `",` sequence (end of field). + line = line[commaLen:] + r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) + continue parseField + case lengthCRLF(line) == len(line): + // `"\n` sequence (end of line). + r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) + break parseField + case r.LazyQuotes: + // `"` sequence (bare quote). + r.recordBuffer = append(r.recordBuffer, '"') + default: + // `"*` squence (invalid non-escaped quote). + col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen]) + err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote} + break parseField + } + } else if len(line) > 0 { + // Hit end of line (copy all data so far). + r.recordBuffer = append(r.recordBuffer, line...) + if errRead != nil { + break parseField + } + line, errRead = r.readLine() + if errRead == io.EOF { + errRead = nil + } + fullLine = line + } else { + // Abrupt end of file (EOF or error). + if !r.LazyQuotes && errRead == nil { + col := utf8.RuneCount(fullLine) + err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote} + break parseField + } + r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) + break parseField + } + } + } + } + if err == nil { + err = errRead + } + + // Create a single string and create slices out of it. + // This pins the memory of the fields together, but allocates once. + str := string(r.recordBuffer) // Convert to string once to batch allocations + dst = dst[:0] + if cap(dst) < len(r.fieldIndexes) { + dst = make([]string, len(r.fieldIndexes)) + } + dst = dst[:len(r.fieldIndexes)] + var preIdx int + for i, idx := range r.fieldIndexes { + dst[i] = str[preIdx:idx] + preIdx = idx + } + + // Check or update the expected fields per record. if r.FieldsPerRecord > 0 { - if len(record) != r.FieldsPerRecord { - r.column = 0 // report at start of record - return record, r.error(ErrFieldCount) + if len(dst) != r.FieldsPerRecord && err == nil { + err = &ParseError{Line: recLine, Err: ErrFieldCount} } } else if r.FieldsPerRecord == 0 { - r.FieldsPerRecord = len(record) + r.FieldsPerRecord = len(dst) } - return record, nil -} - -// readRune reads one rune from r, folding \r\n to \n and keeping track -// of how far into the line we have read. r.column will point to the start -// of this rune, not the end of this rune. -func (r *Reader) readRune() (rune, error) { - r1, _, err := r.r.ReadRune() - - // Handle \r\n here. We make the simplifying assumption that - // anytime \r is followed by \n that it can be folded to \n. - // We will not detect files which contain both \r\n and bare \n. - if r1 == '\r' { - r1, _, err = r.r.ReadRune() - if err == nil { - if r1 != '\n' { - r.r.UnreadRune() - r1 = '\r' - } - } - } - r.column++ - return r1, err -} - -// readRawRune works the same way as readRune, but does not fold \r\n to \n. -func (r *Reader) readRawRune() (rune, error) { - r1, _, err := r.r.ReadRune() - r.column++ - return r1, err -} - -// skip reads runes up to and including the rune delim or until error. -func (r *Reader) skip(delim rune) error { - for { - r1, err := r.readRune() - if err != nil { - return err - } - if r1 == delim { - return nil - } - } -} - -// parseRecord reads and parses a single csv record from r. -// If dst has enough capacity it will be used for the returned fields. -func (r *Reader) parseRecord(dst []string) (fields []string, err error) { - // Each record starts on a new line. We increment our line - // number (lines start at 1, not 0) and set column to -1 - // so as we increment in readRune it points to the character we read. - // We track the line where the record starts in recordLine for use in errors. - r.line++ - r.recordLine = r.line - r.column = -1 - - // Peek at the first rune. If it is an error we are done. - // If we support comments and it is the comment character - // then skip to the end of line. - - r1, _, err := r.r.ReadRune() - if err != nil { - return nil, err - } - - if r.Comment != 0 && r1 == r.Comment { - return nil, r.skip('\n') - } - r.r.UnreadRune() - - r.lineBuffer.Reset() - r.fieldIndexes = r.fieldIndexes[:0] - - // At this point we have at least one field. - for { - idx := r.lineBuffer.Len() - - haveField, delim, err := r.parseField() - if haveField { - r.fieldIndexes = append(r.fieldIndexes, idx) - } - - if delim == '\n' || err == io.EOF { - if len(r.fieldIndexes) == 0 { - return nil, err - } - break - } - - if err != nil { - return nil, err - } - } - - fieldCount := len(r.fieldIndexes) - // Using this approach (creating a single string and taking slices of it) - // means that a single reference to any of the fields will retain the whole - // string. The risk of a nontrivial space leak caused by this is considered - // minimal and a tradeoff for better performance through the combined - // allocations. - line := r.lineBuffer.String() - - if cap(dst) >= fieldCount { - fields = dst[:fieldCount] - } else { - fields = make([]string, fieldCount) - } - - for i, idx := range r.fieldIndexes { - if i == fieldCount-1 { - fields[i] = line[idx:] - } else { - fields[i] = line[idx:r.fieldIndexes[i+1]] - } - } - - return fields, nil -} - -// parseField parses the next field in the record. The read field is -// appended to r.lineBuffer. Delim is the first character not part of the field -// (r.Comma or '\n'). -func (r *Reader) parseField() (haveField bool, delim rune, err error) { - r1, err := r.readRune() - for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) { - r1, err = r.readRune() - } - - if err == io.EOF && r.column != 0 { - return true, 0, err - } - if err != nil { - return false, 0, err - } - - switch r1 { - case r.Comma: - // will check below - - case '\n': - // We are a trailing empty field or a blank line - if r.column == 0 { - return false, r1, nil - } - return true, r1, nil - - case '"': - // quoted field - Quoted: - for { - // use readRawRune instead of readRune to preserve \r\n - // in quotes fields. - r1, err = r.readRawRune() - if err != nil { - if err == io.EOF { - if r.LazyQuotes { - return true, 0, err - } - return false, 0, r.error(ErrQuote) - } - return false, 0, err - } - switch r1 { - case '"': - r1, err = r.readRune() - if err != nil || r1 == r.Comma { - break Quoted - } - if r1 == '\n' { - return true, r1, nil - } - if r1 != '"' { - if !r.LazyQuotes { - r.column-- - return false, 0, r.error(ErrQuote) - } - // accept the bare quote - r.lineBuffer.WriteRune('"') - } - case '\n': - r.line++ - r.column = -1 - } - r.lineBuffer.WriteRune(r1) - } - - default: - // unquoted field - for { - r.lineBuffer.WriteRune(r1) - r1, err = r.readRune() - if err != nil || r1 == r.Comma { - break - } - if r1 == '\n' { - return true, r1, nil - } - if !r.LazyQuotes && r1 == '"' { - return false, 0, r.error(ErrBareQuote) - } - } - } - - if err != nil { - if err == io.EOF { - return true, 0, err - } - return false, 0, err - } - - return true, r1, nil + return dst, err } diff --git a/src/encoding/csv/reader_test.go b/src/encoding/csv/reader_test.go index 3811629aad..781847cefa 100644 --- a/src/encoding/csv/reader_test.go +++ b/src/encoding/csv/reader_test.go @@ -26,9 +26,8 @@ var readTests = []struct { TrimLeadingSpace bool ReuseRecord bool - Error string - Line int // Expected error line if != 0 - Column int // Expected error column if line != 0 + Error error + Line int // Expected error line if != 0 }{ { Name: "Simple", @@ -140,7 +139,7 @@ field"`, { Name: "BadDoubleQuotes", Input: `a""b,c`, - Error: `bare " in non-quoted-field`, Line: 1, Column: 1, + Error: &ParseError{Line: 1, Column: 1, Err: ErrBareQuote}, }, { Name: "TrimQuote", @@ -151,30 +150,30 @@ field"`, { Name: "BadBareQuote", Input: `a "word","b"`, - Error: `bare " in non-quoted-field`, Line: 1, Column: 2, + Error: &ParseError{Line: 1, Column: 2, Err: ErrBareQuote}, }, { Name: "BadTrailingQuote", Input: `"a word",b"`, - Error: `bare " in non-quoted-field`, Line: 1, Column: 10, + Error: &ParseError{Line: 1, Column: 10, Err: ErrBareQuote}, }, { Name: "ExtraneousQuote", Input: `"a "word","b"`, - Error: `extraneous " in field`, Line: 1, Column: 3, + Error: &ParseError{Line: 1, Column: 3, Err: ErrQuote}, }, { Name: "BadFieldCount", UseFieldsPerRecord: true, Input: "a,b,c\nd,e", - Error: "wrong number of fields", Line: 2, + Error: &ParseError{Line: 2, Err: ErrFieldCount}, }, { Name: "BadFieldCount1", UseFieldsPerRecord: true, FieldsPerRecord: 2, Input: `a,b,c`, - Error: "wrong number of fields", Line: 1, + Error: &ParseError{Line: 1, Err: ErrFieldCount}, }, { Name: "FieldCount", @@ -271,18 +270,14 @@ x,,, }, }, { // issue 19019 - Name: "RecordLine1", - Input: "a,\"b\nc\"d,e", - Error: `extraneous " in field`, - Line: 1, - Column: 1, + Name: "RecordLine1", + Input: "a,\"b\nc\"d,e", + Error: &ParseError{Line: 2, Column: 1, Err: ErrQuote}, }, { - Name: "RecordLine2", - Input: "a,b\n\"d\n\n,e", - Error: `extraneous " in field`, - Line: 2, - Column: 2, + Name: "RecordLine2", + Input: "a,b\n\"d\n\n,e", + Error: &ParseError{Line: 5, Column: 0, Err: ErrQuote}, }, { // issue 21201 Name: "CRLFInQuotedField", @@ -291,6 +286,95 @@ x,,, {"Hello\r\nHi"}, }, }, + { // issue 19410 + Name: "BinaryBlobField", + Input: "x09\x41\xb4\x1c,aktau", + Output: [][]string{{"x09A\xb4\x1c", "aktau"}}, + }, + { + Name: "TrailingCR", + Input: "field1,field2\r", + Output: [][]string{{"field1", "field2\r"}}, + }, + { + Name: "NonASCIICommaAndComment", + TrimLeadingSpace: true, + Comma: '£', + Comment: '€', + Input: "a£b,c£ \td,e\n€ comment\n", + Output: [][]string{{"a", "b,c", "d,e"}}, + }, + { + Name: "NonASCIICommaAndCommentWithQuotes", + Comma: '€', + Comment: 'λ', + Input: "a€\" b,\"€ c\nλ comment\n", + Output: [][]string{{"a", " b,", " c"}}, + }, + { + Name: "NonASCIICommaConfusion", + Comma: 'λ', + Comment: '€', + // λ and θ start with the same byte. This test is intended to ensure the parser doesn't + // confuse such characters. + Input: "\"abθcd\"λefθgh", + Output: [][]string{{"abθcd", "efθgh"}}, + }, + { + Name: "NonASCIICommentConfusion", + Comment: 'θ', + Input: "λ\nλ\nθ\nλ\n", + Output: [][]string{{"λ"}, {"λ"}, {"λ"}}, + }, + { + Name: "QuotedFieldMultipleLF", + Input: "\"\n\n\n\n\"", + Output: [][]string{{"\n\n\n\n"}}, + }, + { + Name: "MultipleCRLF", + Input: "\r\n\r\n\r\n\r\n", + }, + { + // The implementation may read each line in several chunks if it doesn't fit entirely + // in the read buffer, so we should test the code to handle that condition. + Name: "HugeLines", + Comment: '#', + Input: strings.Repeat("#ignore\n", 10000) + strings.Repeat("@", 5000) + "," + strings.Repeat("*", 5000), + Output: [][]string{{strings.Repeat("@", 5000), strings.Repeat("*", 5000)}}, + }, + { + Name: "QuoteWithTrailingCRLF", + Input: "\"foo\"bar\"\r\n", + Error: &ParseError{Line: 1, Column: 4, Err: ErrQuote}, + }, + { + Name: "LazyQuoteWithTrailingCRLF", + Input: "\"foo\"bar\"\r\n", + LazyQuotes: true, + Output: [][]string{{`foo"bar`}}, + }, + { + Name: "DoubleQuoteWithTrailingCRLF", + Input: "\"foo\"\"bar\"\r\n", + Output: [][]string{{`foo"bar`}}, + }, + { + Name: "EvenQuotes", + Input: `""""""""`, + Output: [][]string{{`"""`}}, + }, + { + Name: "OddQuotes", + Input: `"""""""`, + Error: &ParseError{Line: 1, Column: 7, Err: ErrQuote}, + }, + { + Name: "LazyOddQuotes", + Input: `"""""""`, + LazyQuotes: true, + Output: [][]string{{`"""`}}, + }, } func TestRead(t *testing.T) { @@ -310,17 +394,10 @@ func TestRead(t *testing.T) { r.Comma = tt.Comma } out, err := r.ReadAll() - perr, _ := err.(*ParseError) - if tt.Error != "" { - if err == nil || !strings.Contains(err.Error(), tt.Error) { - t.Errorf("%s: error %v, want error %q", tt.Name, err, tt.Error) - } else if tt.Line != 0 && (tt.Line != perr.Line || tt.Column != perr.Column) { - t.Errorf("%s: error at %d:%d expected %d:%d", tt.Name, perr.Line, perr.Column, tt.Line, tt.Column) - } - } else if err != nil { - t.Errorf("%s: unexpected error %v", tt.Name, err) + if !reflect.DeepEqual(err, tt.Error) { + t.Errorf("%s: ReadAll() error:\ngot %v\nwant %v", tt.Name, err, tt.Error) } else if !reflect.DeepEqual(out, tt.Output) { - t.Errorf("%s: out=%q want %q", tt.Name, out, tt.Output) + t.Errorf("%s: ReadAll() output:\ngot %q\nwant %q", tt.Name, out, tt.Output) } } }