// Copyright 2010 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package html import ( "bytes" "os" "strings" "testing" ) type tokenTest struct { // A short description of the test case. desc string // The HTML to parse. html string // The string representations of the expected tokens, joined by '$'. golden string } var tokenTests = []tokenTest{ { "empty", "", "", }, // A single text node. The tokenizer should not break text nodes on whitespace, // nor should it normalize whitespace within a text node. { "text", "foo bar", "foo bar", }, // An entity. { "entity", "one < two", "one < two", }, // A start, self-closing and end tag. The tokenizer does not care if the start // and end tokens don't match; that is the job of the parser. { "tags", "bd", "$b$$d$", }, // Angle brackets that aren't a tag. { "not a tag #0", "<", "<", }, { "not a tag #1", "", "", }, { "not a tag #3", "ab", "a$b", }, { "not a tag #4", "", "", }, { "not a tag #5", "", }, { "not a tag #6", "", "", }, { "not a tag #7", "a < b", "a < b", }, { "not a tag #8", "<.>", "<.>", }, { "not a tag #9", "a<<>>c", "a<<$$>>c", }, { "not a tag #10", "if x<0 and y < 0 then x*y>0", "if x<0 and y < 0 then x*y>0", }, // EOF in a tag name. { "tag name eof #0", "", }, { "tag name eof #4", ``, }, // Some malformed tags that are missing a '>'. { "malformed tag #0", ``, ``, }, { "malformed tag #1", `

`, `

`, }, { "malformed tag #2", `

`, }, { "malformed tag #3", `

`, }, { "malformed tag #4", `

`, `

`, }, { "malformed tag #5", `

`, }, { "malformed tag #6", `

`, `

`, }, { "malformed tag #7", `

`, }, { "malformed tag #8", `

`, `

`, }, // DOCTYPE tests. { "Proper DOCTYPE", "", "", }, { "DOCTYPE with no space", "", "", }, { "DOCTYPE with two spaces", "", "", }, { "looks like DOCTYPE but isn't", "", "", }, { "DOCTYPE at EOF", "", }, // XML processing instructions. { "XML processing instruction", "", "", }, // Comments. { "comment0", "abcdef", "abc$$$$def", }, { "comment1", "az", "a$$z", }, { "comment2", "az", "a$$z", }, { "comment3", "az", "a$$z", }, { "comment4", "az", "a$$z", }, { "comment5", "az", "a$$z", }, { "comment6", "az", "a$$z", }, { "comment7", "a", }, { "comment8", "a", }, { "comment9", "az", "a$$z", }, // An attribute with a backslash. { "backslash", `

`, `

`, }, // Entities, tag name and attribute key lower-casing, and whitespace // normalization within a tag. { "tricky", "

te<&;xt

", `

$$te<&;xt$$

`, }, // A nonexistent entity. Tokenizing and converting back to a string should // escape the "&" to become "&". { "noSuchEntity", `
<&alsoDoesntExist;&`, `$<&alsoDoesntExist;&`, }, /* // TODO: re-enable this test when it works. This input/output matches html5lib's behavior. { "entity without semicolon", `¬it;∉`, `¬it;∉$`, }, */ { "entity with digits", "½", "½", }, // Attribute tests: // http://dev.w3.org/html5/spec/Overview.html#attributes-0 { "Empty attribute", ``, ``, }, { "Empty attribute, whitespace", ``, ``, }, { "Unquoted attribute value", ``, ``, }, { "Unquoted attribute value, spaces", ``, ``, }, { "Unquoted attribute value, trailing space", ``, ``, }, { "Single-quoted attribute value", ``, ``, }, { "Single-quoted attribute value, trailing space", ``, ``, }, { "Double-quoted attribute value", ``, ``, }, { "Attribute name characters", ``, ``, }, { "Mixed attributes", `a

z`, `a$

$z`, }, { "Attributes with a solitary single quote", `

`, `

$

`, }, } func TestTokenizer(t *testing.T) { loop: for _, tt := range tokenTests { z := NewTokenizer(strings.NewReader(tt.html)) z.ReturnComments = true if tt.golden != "" { for i, s := range strings.Split(tt.golden, "$") { if z.Next() == ErrorToken { t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error()) continue loop } actual := z.Token().String() if s != actual { t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual) continue loop } } } z.Next() if z.Error() != os.EOF { t.Errorf("%s: want EOF got %q", tt.desc, z.Token().String()) } } } type unescapeTest struct { // A short description of the test case. desc string // The HTML text. html string // The unescaped text. unescaped string } var unescapeTests = []unescapeTest{ // Handle no entities. { "copy", "A\ttext\nstring", "A\ttext\nstring", }, // Handle simple named entities. { "simple", "& > <", "& > <", }, // Handle hitting the end of the string. { "stringEnd", "& &", "& &", }, // Handle entities with two codepoints. { "multiCodepoint", "text ⋛︀ blah", "text \u22db\ufe00 blah", }, // Handle decimal numeric entities. { "decimalEntity", "Delta = Δ ", "Delta = Δ ", }, // Handle hexadecimal numeric entities. { "hexadecimalEntity", "Lambda = λ = λ ", "Lambda = λ = λ ", }, // Handle numeric early termination. { "numericEnds", "&# &#x €43 © = ©f = ©", "&# &#x €43 © = ©f = ©", }, // Handle numeric ISO-8859-1 entity replacements. { "numericReplacements", "Footnote‡", "Footnote‡", }, } func TestUnescape(t *testing.T) { for _, tt := range unescapeTests { unescaped := UnescapeString(tt.html) if unescaped != tt.unescaped { t.Errorf("TestUnescape %s: want %q, got %q", tt.desc, tt.unescaped, unescaped) } } } func TestUnescapeEscape(t *testing.T) { ss := []string{ ``, `abc def`, `a & b`, `a&b`, `a & b`, `"`, `"`, `"<&>"`, `"<&>"`, `3&5==1 && 0<1, "0<1", a+acute=á`, } for _, s := range ss { if s != UnescapeString(EscapeString(s)) { t.Errorf("s != UnescapeString(EscapeString(s)), s=%q", s) } } } func TestBufAPI(t *testing.T) { s := "0123456789" z := NewTokenizer(bytes.NewBuffer([]byte(s))) result := bytes.NewBuffer(nil) depth := 0 loop: for { tt := z.Next() switch tt { case ErrorToken: if z.Error() != os.EOF { t.Error(z.Error()) } break loop case TextToken: if depth > 0 { result.Write(z.Text()) } case StartTagToken, EndTagToken: tn, _ := z.TagName() if len(tn) == 1 && tn[0] == 'a' { if tt == StartTagToken { depth++ } else { depth-- } } } } u := "14567" v := string(result.Bytes()) if u != v { t.Errorf("TestBufAPI: want %q got %q", u, v) } }