mirror of https://github.com/golang/go.git
118 lines
2.9 KiB
Go
118 lines
2.9 KiB
Go
// Copyright 2010 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package html
|
|
|
|
import (
|
|
"bytes"
|
|
"strings"
|
|
"utf8"
|
|
)
|
|
|
|
// unescapeEntity reads an entity like "<" from b[src:] and writes the
|
|
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
|
|
// Precondition: src[0] == '&' && dst <= src.
|
|
func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
|
|
// TODO(nigeltao): Check that this entity substitution algorithm matches the spec:
|
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
|
|
// TODO(nigeltao): Handle things like "中" or "中".
|
|
|
|
// i starts at 1 because we already know that s[0] == '&'.
|
|
i, s := 1, b[src:]
|
|
for i < len(s) {
|
|
c := s[i]
|
|
i++
|
|
// Lower-cased characters are more common in entities, so we check for them first.
|
|
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
|
|
continue
|
|
}
|
|
if c != ';' {
|
|
i--
|
|
}
|
|
x := entity[string(s[1:i])]
|
|
if x != 0 {
|
|
return dst + utf8.EncodeRune(x, b[dst:]), src + i
|
|
}
|
|
break
|
|
}
|
|
dst1, src1 = dst+i, src+i
|
|
copy(b[dst:dst1], b[src:src1])
|
|
return dst1, src1
|
|
}
|
|
|
|
// unescape unescapes b's entities in-place, so that "a<b" becomes "a<b".
|
|
func unescape(b []byte) []byte {
|
|
for i, c := range b {
|
|
if c == '&' {
|
|
dst, src := unescapeEntity(b, i, i)
|
|
for src < len(b) {
|
|
c := b[src]
|
|
if c == '&' {
|
|
dst, src = unescapeEntity(b, dst, src)
|
|
} else {
|
|
b[dst] = c
|
|
dst, src = dst+1, src+1
|
|
}
|
|
}
|
|
return b[0:dst]
|
|
}
|
|
}
|
|
return b
|
|
}
|
|
|
|
// escapedChars lists the characters that escape replaces with an entity.
const escapedChars = `&'<>"`

// escape writes s to buf, replacing every occurrence of a character from
// escapedChars with its named entity: "&amp;", "&apos;", "&lt;", "&gt;" or
// "&quot;". All other bytes are written through unchanged.
func escape(buf *bytes.Buffer, s string) {
	i := strings.IndexAny(s, escapedChars)
	for i != -1 {
		// Flush the run of ordinary bytes that precedes the special character.
		buf.WriteString(s[0:i])
		var esc string
		switch s[i] {
		case '&':
			esc = "&amp;"
		case '\'':
			esc = "&apos;"
		case '<':
			esc = "&lt;"
		case '>':
			esc = "&gt;"
		case '"':
			esc = "&quot;"
		default:
			// IndexAny only reports characters from escapedChars, so reaching
			// this means the switch and escapedChars have drifted apart.
			panic("unrecognized escape character")
		}
		s = s[i+1:]
		buf.WriteString(esc)
		i = strings.IndexAny(s, escapedChars)
	}
	// Whatever remains contains no special characters.
	buf.WriteString(s)
}
|
|
|
|
// EscapeString escapes special characters like "<" to become "<". It
|
|
// escapes only five such characters: amp, apos, lt, gt and quot.
|
|
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
|
|
// always true.
|
|
func EscapeString(s string) string {
|
|
if strings.IndexAny(s, escapedChars) == -1 {
|
|
return s
|
|
}
|
|
buf := bytes.NewBuffer(nil)
|
|
escape(buf, s)
|
|
return buf.String()
|
|
}
|
|
|
|
// UnescapeString unescapes entities like "<" to become "<". It unescapes a
|
|
// larger range of entities than EscapeString escapes. For example, "á"
|
|
// unescapes to "á", as does "á" and "&xE1;".
|
|
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
|
|
// always true.
|
|
func UnescapeString(s string) string {
|
|
for _, c := range s {
|
|
if c == '&' {
|
|
return string(unescape([]byte(s)))
|
|
}
|
|
}
|
|
return s
|
|
}
|