".
// Precondition: z.err == nil.
func (z *Tokenizer) readTagAttrKey() {
z.pendingAttr[0].start = z.raw.end
@@ -452,7 +415,7 @@ func (z *Tokenizer) readTagAttrKey() {
}
}
-// readTagAttrVal sets z.pendingAttr[1] to the "v" in "
".
+// readTagAttrVal sets z.pendingAttr[1] to the "v" in "
".
func (z *Tokenizer) readTagAttrVal() {
z.pendingAttr[1].start = z.raw.end
z.pendingAttr[1].end = z.raw.end
@@ -514,69 +477,100 @@ func (z *Tokenizer) readTagAttrVal() {
}
}
-// nextText reads all text up until a start tag "
",
+ // "" and "".
+ tokenType = CommentToken
+ default:
continue
}
- c = z.readByte()
- if z.err != nil {
- break
+
+ // We have a non-text token, but we might have accumulated some text
+ // before that. If so, we return the text first, and return the non-
+ // text token on the subsequent call to Next.
+ if x := z.raw.end - len("' {
+ // ">" does not generate a token at all.
+ // Reset the tokenizer state and start again.
+ z.raw.start = z.raw.end
+ z.data.start = z.raw.end
+ z.data.end = z.raw.end
+ continue loop
+ }
+ if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
+ z.readEndTag()
+ return EndTagToken
+ }
+ z.raw.end--
+ z.readUntilCloseAngle()
+ return CommentToken
+ case CommentToken:
+ if c == '!' {
+ return z.readMarkupDeclaration()
+ }
+ z.raw.end--
+ z.readUntilCloseAngle()
+ return CommentToken
}
}
- z.data = z.raw
+ if z.raw.start < z.raw.end {
+ z.data.end = z.raw.end
+ return TextToken
+ }
+ return ErrorToken
}
// Next scans the next token and returns its type.
func (z *Tokenizer) Next() TokenType {
for {
- if z.err != nil {
- z.tt = ErrorToken
- return z.tt
- }
- z.raw.start = z.raw.end
- z.data.start = z.raw.end
- z.data.end = z.raw.end
- z.attr = z.attr[:0]
- z.nAttrReturned = 0
-
- c := z.readByte()
- if z.err != nil {
- z.tt = ErrorToken
- return z.tt
- }
- // We assume that the next token is text unless proven otherwise.
- z.tt = TextToken
- if c != '<' {
- z.nextText()
- } else {
- z.nextTag()
- if z.tt == CommentToken && !z.ReturnComments {
- continue
- }
+ z.tt = z.next()
+ // TODO: remove the ReturnComments option. A tokenizer should
+ // always return comment tags.
+ if z.tt == CommentToken && !z.ReturnComments {
+ continue
}
return z.tt
}
@@ -606,12 +600,14 @@ func (z *Tokenizer) Text() []byte {
// TagName returns the lower-cased name of a tag token (the `img` in `<img/>`)
// and whether the tag has attributes.
// The contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
- switch z.tt {
- case StartTagToken, EndTagToken, SelfClosingTagToken:
- s := z.buf[z.data.start:z.data.end]
- z.data.start = z.raw.end
- z.data.end = z.raw.end
- return lower(s), z.nAttrReturned < len(z.attr)
+ if z.data.start < z.data.end {
+ switch z.tt {
+ case StartTagToken, EndTagToken, SelfClosingTagToken:
+ s := z.buf[z.data.start:z.data.end]
+ z.data.start = z.raw.end
+ z.data.end = z.raw.end
+ return lower(s), z.nAttrReturned < len(z.attr)
+ }
}
return nil, false
}
@@ -622,7 +618,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
if z.nAttrReturned < len(z.attr) {
switch z.tt {
- case StartTagToken, EndTagToken, SelfClosingTagToken:
+ case StartTagToken, SelfClosingTagToken:
x := z.attr[z.nAttrReturned]
z.nAttrReturned++
key = z.buf[x[0].start:x[0].end]
@@ -640,7 +636,7 @@ func (z *Tokenizer) Token() Token {
switch z.tt {
case TextToken, CommentToken, DoctypeToken:
t.Data = string(z.Text())
- case StartTagToken, EndTagToken, SelfClosingTagToken:
+ case StartTagToken, SelfClosingTagToken:
var attr []Attribute
name, moreAttr := z.TagName()
for moreAttr {
@@ -650,6 +646,9 @@ func (z *Tokenizer) Token() Token {
}
t.Data = string(name)
t.Attr = attr
+ case EndTagToken:
+ name, _ := z.TagName()
+ t.Data = string(name)
}
return t
}
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go
index 09bb75be15..2bd87e9129 100644
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -57,19 +57,16 @@ var tokenTests = []tokenTest{
"",
"</",
},
- /*
- // TODO: re-enable these tests when we tokenize them correctly.
- {
- "not a tag #2",
- ">",
- "",
- },
- {
- "not a tag #3",
- "a>b",
- "a$b",
- },
- */
+ {
+ "not a tag #2",
+ ">",
+ "",
+ },
+ {
+ "not a tag #3",
+ "a>b",
+ "a$b",
+ },
{
"not a tag #4",
" >",
@@ -77,21 +74,31 @@ var tokenTests = []tokenTest{
},
{
"not a tag #5",
+ "",
+ },
+ {
+ "not a tag #6",
+ "",
+ "",
+ },
+ {
+ "not a tag #7",
"a < b",
"a < b",
},
{
- "not a tag #6",
+ "not a tag #8",
"<.>",
"<.>",
},
{
- "not a tag #7",
+ "not a tag #9",
"a<<>>c",
"a<<$$>>c",
},
{
- "not a tag #8",
+ "not a tag #10",
"if x<0 and y < 0 then x*y>0",
"if x<0 and y < 0 then x*y>0",
},
@@ -345,7 +352,7 @@ var tokenTests = []tokenTest{
func TestTokenizer(t *testing.T) {
loop:
for _, tt := range tokenTests {
- z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
+ z := NewTokenizer(strings.NewReader(tt.html))
z.ReturnComments = true
if tt.golden != "" {
for i, s := range strings.Split(tt.golden, "$") {