diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-06-27 06:07:23 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-06-27 06:07:23 +0000 |
commit | 1bf1084f2b10c3b47fd1a588d85d21ed0eb41d0c (patch) | |
tree | 46dcd36c86e7fbc6e5df36deb463b33e9967a6f7 /Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp | |
parent | 32761a6cee1d0dee366b885b7b9c777e67885688 (diff) | |
download | WebKitGtk-tarball-master.tar.gz |
webkitgtk-2.16.5HEADwebkitgtk-2.16.5master
Diffstat (limited to 'Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp')
-rw-r--r-- | Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp | 95 |
1 files changed, 38 insertions, 57 deletions
diff --git a/Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp b/Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp index 11e14a46e..8a8468915 100644 --- a/Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp +++ b/Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp @@ -1,5 +1,6 @@ /* * Copyright (C) 2010 Google Inc. All Rights Reserved. + * Copyright (C) 2015-2016 Apple Inc. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -28,41 +29,26 @@ #include "HTMLNames.h" #include "HTMLParserIdioms.h" -#include "HTMLTokenizer.h" -#include "TextCodec.h" #include "TextEncodingRegistry.h" -using namespace WTF; - namespace WebCore { using namespace HTMLNames; HTMLMetaCharsetParser::HTMLMetaCharsetParser() - : m_tokenizer(std::make_unique<HTMLTokenizer>(HTMLParserOptions())) - , m_assumedCodec(newTextCodec(Latin1Encoding())) - , m_inHeadSection(true) - , m_doneChecking(false) -{ -} - -HTMLMetaCharsetParser::~HTMLMetaCharsetParser() + : m_codec(newTextCodec(Latin1Encoding())) { } -static const char charsetString[] = "charset"; -static const size_t charsetLength = sizeof("charset") - 1; - -String HTMLMetaCharsetParser::extractCharset(const String& value) +static StringView extractCharset(const String& value) { - size_t pos = 0; unsigned length = value.length(); - - while (pos < length) { - pos = value.find(charsetString, pos, false); + for (size_t pos = 0; pos < length; ) { + pos = value.find("charset", pos, false); if (pos == notFound) break; + static const size_t charsetLength = sizeof("charset") - 1; pos += charsetLength; // Skip whitespace. @@ -77,12 +63,10 @@ String HTMLMetaCharsetParser::extractCharset(const String& value) while (pos < length && value[pos] <= ' ') ++pos; - char quoteMark = 0; - if (pos < length && (value[pos] == '"' || value[pos] == '\'')) { - quoteMark = static_cast<char>(value[pos++]); - ASSERT(!(quoteMark & 0x80)); - } - + UChar quoteMark = 0; + if (pos < length && (value[pos] == '"' || value[pos] == '\'')) + quoteMark = value[pos++]; + if (pos == length) break; @@ -93,19 +77,17 @@ String HTMLMetaCharsetParser::extractCharset(const String& value) if (quoteMark && (end == length)) break; // Close quote not found. - return value.substring(pos, end - pos); + return StringView(value).substring(pos, end - pos); } - - return ""; + return StringView(); } -bool HTMLMetaCharsetParser::processMeta() +bool HTMLMetaCharsetParser::processMeta(HTMLToken& token) { - const HTMLToken::AttributeList& tokenAttributes = m_token.attributes(); AttributeList attributes; - for (HTMLToken::AttributeList::const_iterator iter = tokenAttributes.begin(); iter != tokenAttributes.end(); ++iter) { - String attributeName = StringImpl::create8BitIfPossible(iter->name); - String attributeValue = StringImpl::create8BitIfPossible(iter->value); + for (auto& attribute : token.attributes()) { + String attributeName = StringImpl::create8BitIfPossible(attribute.name); + String attributeValue = StringImpl::create8BitIfPossible(attribute.value); attributes.append(std::make_pair(attributeName, attributeValue)); } @@ -116,15 +98,15 @@ bool HTMLMetaCharsetParser::processMeta() TextEncoding HTMLMetaCharsetParser::encodingFromMetaAttributes(const AttributeList& attributes) { bool gotPragma = false; - Mode mode = None; - String charset; + enum { None, Charset, Pragma } mode = None; + StringView charset; - for (AttributeList::const_iterator iter = attributes.begin(); iter != attributes.end(); ++iter) { - const AtomicString& attributeName = iter->first; - const String& attributeValue = iter->second; + for (auto& attribute : attributes) { + const String& attributeName = attribute.first; + const String& attributeValue = attribute.second; if (attributeName == http_equivAttr) { - if (equalIgnoringCase(attributeValue, "content-type")) + if (equalLettersIgnoringASCIICase(attributeValue, "content-type")) gotPragma = true; } else if (charset.isEmpty()) { if (attributeName == charsetAttr) { @@ -139,13 +121,11 @@ TextEncoding HTMLMetaCharsetParser::encodingFromMetaAttributes(const AttributeLi } if (mode == Charset || (mode == Pragma && gotPragma)) - return TextEncoding(stripLeadingAndTrailingHTMLSpaces(charset)); + return TextEncoding(stripLeadingAndTrailingHTMLSpaces(charset.toStringWithoutCopying())); return TextEncoding(); } -static const int bytesToCheckUnconditionally = 1024; // That many input bytes will be checked for meta charset even if <head> section is over. - bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length) { if (m_doneChecking) @@ -156,30 +136,32 @@ bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length) // We still don't have an encoding, and are in the head. // The following tags are allowed in <head>: // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE - + // // We stop scanning when a tag that is not permitted in <head> // is seen, rather when </head> is seen, because that more closely // matches behavior in other browsers; more details in // <http://bugs.webkit.org/show_bug.cgi?id=3590>. - + // // Additionally, we ignore things that looks like tags in <title>, <script> // and <noscript>; see <http://bugs.webkit.org/show_bug.cgi?id=4560>, // <http://bugs.webkit.org/show_bug.cgi?id=12165> and // <http://bugs.webkit.org/show_bug.cgi?id=12389>. - + // // Since many sites have charset declarations after <body> or other tags // that are disallowed in <head>, we don't bail out until we've checked at // least bytesToCheckUnconditionally bytes of input. - m_input.append(SegmentedString(m_assumedCodec->decode(data, length))); + constexpr int bytesToCheckUnconditionally = 1024; + + m_input.append(m_codec->decode(data, length)); - while (m_tokenizer->nextToken(m_input, m_token)) { - bool end = m_token.type() == HTMLToken::EndTag; - if (end || m_token.type() == HTMLToken::StartTag) { - AtomicString tagName(m_token.name()); - if (!end) { - m_tokenizer->updateStateFor(tagName); - if (tagName == metaTag && processMeta()) { + while (auto token = m_tokenizer.nextToken(m_input)) { + bool isEnd = token->type() == HTMLToken::EndTag; + if (isEnd || token->type() == HTMLToken::StartTag) { + AtomicString tagName(token->name()); + if (!isEnd) { + m_tokenizer.updateStateFor(tagName); + if (tagName == metaTag && processMeta(*token)) { m_doneChecking = true; return true; } @@ -189,7 +171,8 @@ bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length) && tagName != styleTag && tagName != linkTag && tagName != metaTag && tagName != objectTag && tagName != titleTag && tagName != baseTag - && (end || tagName != htmlTag) && (end || tagName != headTag)) { + && (isEnd || tagName != htmlTag) + && (isEnd || tagName != headTag)) { m_inHeadSection = false; } } @@ -198,8 +181,6 @@ bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length) m_doneChecking = true; return true; } - - m_token.clear(); } return false; |