// Copyright 2010 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package html import ( "bytes" "strings" "utf8" ) // unescapeEntity reads an entity like "<" from b[src:] and writes the // corresponding "<" to b[dst:], returning the incremented dst and src cursors. // Precondition: src[0] == '&' && dst <= src. func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { // TODO(nigeltao): Check that this entity substitution algorithm matches the spec: // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference // TODO(nigeltao): Handle things like "中" or "中". // i starts at 1 because we already know that s[0] == '&'. i, s := 1, b[src:] for i < len(s) { c := s[i] i++ // Lower-cased characters are more common in entities, so we check for them first. if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { continue } if c != ';' { i-- } x := entity[string(s[1:i])] if x != 0 { return dst + utf8.EncodeRune(x, b[dst:]), src + i } break } dst1, src1 = dst+i, src+i copy(b[dst:dst1], b[src:src1]) return dst1, src1 } // unescape unescapes b's entities in-place, so that "a<b" becomes "a"` func escape(buf *bytes.Buffer, s string) { i := strings.IndexAny(s, escapedChars) for i != -1 { buf.WriteString(s[0:i]) var esc string switch s[i] { case '&': esc = "&" case '\'': esc = "'" case '<': esc = "<" case '>': esc = ">" case '"': esc = """ default: panic("unrecognized escape character") } s = s[i+1:] buf.WriteString(esc) i = strings.IndexAny(s, escapedChars) } buf.WriteString(s) } // EscapeString escapes special characters like "<" to become "<". It // escapes only five such characters: amp, apos, lt, gt and quot. // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't // always true. func EscapeString(s string) string { if strings.IndexAny(s, escapedChars) == -1 { return s } buf := bytes.NewBuffer(nil) escape(buf, s) return buf.String() } // UnescapeString unescapes entities like "<" to become "<". It unescapes a // larger range of entities than EscapeString escapes. For example, "á" // unescapes to "รก", as does "á" and "&xE1;". // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't // always true. func UnescapeString(s string) string { for _, c := range s { if c == '&' { return string(unescape([]byte(s))) } } return s }