// go/src/cmd/compile/internal/syntax/source.go

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package syntax

import (
	"io"
	"unicode/utf8"
)

// buf [...read...|...|...unread...|s|...free...]
//         ^      ^   ^            ^
//         |      |   |            |
//        suf     r0  r            w

// source reads bytes from an io.Reader into a fixed-size buffer and keeps
// track of the read/write positions, the line count, and the bytes of a
// literal that is currently being scanned.
type source struct {
	// reader supplying the source bytes
	src io.Reader

	// source buffer
	buf   [4 << 10]byte // buffered source bytes
	offs  int           // source offset of buf
	r0    int           // previous read position in buf (restored by ungetr)
	r     int           // current read position in buf
	w     int           // write position in buf, excluding sentinel
	line0 int           // previous line (restored by ungetr)
	line  int           // current line
	err   error         // pending io error

	// literal buffer
	lit []byte // literal prefix
	suf int    // literal suffix; suf >= 0 means we are scanning a literal
}

// init prepares s for reading from src. It empties the buffer, resets the
// positions, the line count and the pending error, and discards any literal
// state while keeping the literal buffer's storage for reuse.
func (s *source) init(src io.Reader) {
	s.src = src
	s.err = nil

	// empty buffer: only the sentinel, at the write position
	s.offs = 0
	s.r0 = 0
	s.r = 0
	s.w = 0
	s.buf[s.w] = utf8.RuneSelf

	// lines are counted starting at 1
	s.line0 = 1
	s.line = 1

	// no literal is being scanned
	s.suf = -1
	s.lit = s.lit[:0]
}

// pos returns the source offset of the current read position: the source
// offset of the buffer plus the read index within the buffer.
func (s *source) pos() int {
	offset := s.offs
	offset += s.r
	return offset
}

// ungetr restores the read position and line to the values that getr
// saved (s.r0, s.line0) before it read the most recent character.
func (s *source) ungetr() {
	s.line = s.line0
	s.r = s.r0
}

// getr returns the next character (rune) from the source and advances the
// read position, calling fill to read more data when the buffer does not
// hold a complete character. It returns -1 once the input is exhausted and
// the pending error is io.EOF.
//
// Before reading, getr saves the current read position and line in s.r0 and
// s.line0 so that ungetr can undo the read.
//
// getr panics on a NUL byte, on a BOM that is not the first character, on
// invalid UTF-8 (including a multi-byte sequence cut off by the end of the
// input), and with the pending reader error if that error is not io.EOF.
func (s *source) getr() rune {
	for {
		s.r0, s.line0 = s.r, s.line

		// common case: ASCII and enough bytes
		// (The sentinel utf8.RuneSelf stored at s.buf[s.w] never passes
		// this test, so running out of data ends up in the code below.)
		if b := s.buf[s.r]; b < utf8.RuneSelf {
			s.r++
			if b == 0 {
				panic("invalid NUL character")
			}
			if b == '\n' {
				s.line++
			}
			return rune(b)
		}

		// uncommon case: not ASCII or not enough bytes
		r, w := utf8.DecodeRune(s.buf[s.r:s.w]) // optimistically assume valid rune
		if r != utf8.RuneError || w > 1 {
			s.r += w
			// BOM's are only allowed as the first character in a file
			const BOM = 0xfeff
			if r == BOM && s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1)
				panic("invalid BOM in the middle of the file")
			}
			return r
		}

		// no unread bytes and the reader already reported an error:
		// we are done
		if w == 0 && s.err != nil {
			if s.err != io.EOF {
				panic(s.err)
			}
			return -1
		}

		// w == 1 means the bytes at s.r are invalid or incomplete.
		// They are definitely invalid if enough bytes are buffered, if
		// they form a full (but invalid) encoding, or if the reader has
		// already reported an error: in that last case an incomplete
		// sequence can never be completed, and calling fill again would
		// make no progress and loop forever.
		if w == 1 && (s.r+utf8.UTFMax <= s.w || utf8.FullRune(s.buf[s.r:s.w]) || s.err != nil) {
			s.r++
			panic("invalid UTF-8 encoding")
		}

		// buffer is empty or ends in an incomplete encoding: get more data
		s.fill()
	}
}

// fill slides the buffer content toward the start of buf to make room and
// then reads more data from s.src.
//
// The slide keeps everything from one byte before s.r0 up to s.w: one extra
// byte (for a 2nd ungetr call, only for ".." character sequence), the last
// read char (for one ungetr call), and the unread bytes. If a literal is
// being scanned (s.suf >= 0), the part of it that precedes s.r0 is appended
// to s.lit before it is overwritten.
//
// An error returned by the reader is recorded in s.err. fill panics if the
// reader returns a negative count, or if 100 consecutive reads return
// neither data nor an error.
func (s *source) fill() {
	if s.r0 > 1 {
		// save literal prefix, if any
		// (We see at most one ungetr call while reading
		// a literal, so make sure s.r0 remains in buf.)
		if s.suf >= 0 {
			s.lit = append(s.lit, s.buf[s.suf:s.r0]...)
			s.suf = 1 // == s.r0 after slide below
		}
		// Move buf[s.r0-1:s.w] to the start of buf. The copy must begin
		// at s.r0-1 (not at s.r) so that the bytes ungetr may step back
		// to are really present after the slide.
		n := s.r0 - 1 // number of leading bytes dropped
		copy(s.buf[:], s.buf[n:s.w])
		s.offs += n
		s.r0 = 1 // == s.r0 - n
		s.r -= n
		s.w -= n
	}

	// read more data: try a limited number of times
	for i := 100; i > 0; i-- {
		n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
		if n < 0 {
			panic("negative read")
		}
		s.w += n
		if n > 0 || err != nil {
			s.buf[s.w] = utf8.RuneSelf // sentinel
			if err != nil {
				s.err = err
			}
			return
		}
	}

	// keep the sentinel in place even on the failure path
	s.buf[s.w] = utf8.RuneSelf
	panic("no progress")
}

// startLit begins the recording of a literal. The literal starts at s.r0,
// the position saved before the most recent read; any previously saved
// literal prefix is dropped while its storage is kept for reuse.
func (s *source) startLit() {
	s.lit = s.lit[:0]
	s.suf = s.r0
}

// stopLit ends the recording of a literal and returns its bytes: the saved
// prefix in s.lit, if any, followed by the buffered bytes from s.suf up to
// the current read position. Without a saved prefix the result is a slice
// of the source buffer itself.
func (s *source) stopLit() []byte {
	tail := s.buf[s.suf:s.r]
	s.suf = -1 // no pending literal
	if len(s.lit) == 0 {
		return tail
	}
	return append(s.lit, tail...)
}