diff options
Diffstat (limited to 'libgo/go/html/escape.go')
-rw-r--r-- | libgo/go/html/escape.go | 121 |
1 files changed, 114 insertions, 7 deletions
diff --git a/libgo/go/html/escape.go b/libgo/go/html/escape.go index f30086f3678..2799f690876 100644 --- a/libgo/go/html/escape.go +++ b/libgo/go/html/escape.go @@ -10,16 +10,118 @@ import ( "utf8" ) +// These replacements permit compatibility with old numeric entities that +// assumed Windows-1252 encoding. +// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference +var replacementTable = [...]int{ + '\u20AC', // First entry is what 0x80 should be replaced with. + '\u0081', + '\u201A', + '\u0192', + '\u201E', + '\u2026', + '\u2020', + '\u2021', + '\u02C6', + '\u2030', + '\u0160', + '\u2039', + '\u0152', + '\u008D', + '\u017D', + '\u008F', + '\u0090', + '\u2018', + '\u2019', + '\u201C', + '\u201D', + '\u2022', + '\u2013', + '\u2014', + '\u02DC', + '\u2122', + '\u0161', + '\u203A', + '\u0153', + '\u009D', + '\u017E', + '\u0178', // Last entry is 0x9F. + // 0x00->'\uFFFD' is handled programmatically. + // 0x0D->'\u000D' is a no-op. +} + // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the // corresponding "<" to b[dst:], returning the incremented dst and src cursors. -// Precondition: src[0] == '&' && dst <= src. +// Precondition: b[src] == '&' && dst <= src. func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { - // TODO(nigeltao): Check that this entity substitution algorithm matches the spec: // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference - // TODO(nigeltao): Handle things like "&#20013;" or "&#x4e2d;". // i starts at 1 because we already know that s[0] == '&'. i, s := 1, b[src:] + + if len(s) <= 1 { + b[dst] = b[src] + return dst + 1, src + 1 + } + + if s[i] == '#' { + if len(s) <= 3 { // We need to have at least "&#.". 
+ b[dst] = b[src] + return dst + 1, src + 1 + } + i++ + c := s[i] + hex := false + if c == 'x' || c == 'X' { + hex = true + i++ + } + + x := 0 + for i < len(s) { + c = s[i] + i++ + if hex { + if '0' <= c && c <= '9' { + x = 16*x + int(c) - '0' + continue + } else if 'a' <= c && c <= 'f' { + x = 16*x + int(c) - 'a' + 10 + continue + } else if 'A' <= c && c <= 'F' { + x = 16*x + int(c) - 'A' + 10 + continue + } + } else if '0' <= c && c <= '9' { + x = 10*x + int(c) - '0' + continue + } + if c != ';' { + i-- + } + break + } + + if i <= 3 { // No characters matched. + b[dst] = b[src] + return dst + 1, src + 1 + } + + if 0x80 <= x && x <= 0x9F { + // Replace characters from Windows-1252 with UTF-8 equivalents. + x = replacementTable[x-0x80] + } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF { + // Replace invalid characters with the replacement character. + x = '\uFFFD' + } + + return dst + utf8.EncodeRune(b[dst:], x), src + i + } + + // Consume the maximum number of characters possible, with the + // consumed characters matching one of the named references. + + // TODO(nigeltao): unescape("&notit;") should be "¬it;" for i < len(s) { c := s[i] i++ @@ -30,12 +132,17 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { if c != ';' { i-- } - x := entity[string(s[1:i])] - if x != 0 { - return dst + utf8.EncodeRune(x, b[dst:]), src + i - } break } + + entityName := string(s[1:i]) + if x := entity[entityName]; x != 0 { + return dst + utf8.EncodeRune(b[dst:], x), src + i + } else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity. + dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) + return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i + } + dst1, src1 = dst+i, src+i copy(b[dst:dst1], b[src:src1]) return dst1, src1 |