summaryrefslogtreecommitdiff
path: root/Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp
diff options
context:
space:
mode:
authorLorry Tar Creator <lorry-tar-importer@lorry>2017-06-27 06:07:23 +0000
committerLorry Tar Creator <lorry-tar-importer@lorry>2017-06-27 06:07:23 +0000
commit1bf1084f2b10c3b47fd1a588d85d21ed0eb41d0c (patch)
tree46dcd36c86e7fbc6e5df36deb463b33e9967a6f7 /Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp
parent32761a6cee1d0dee366b885b7b9c777e67885688 (diff)
downloadWebKitGtk-tarball-master.tar.gz
Diffstat (limited to 'Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp')
-rw-r--r--Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp95
1 files changed, 38 insertions, 57 deletions
diff --git a/Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp b/Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp
index 11e14a46e..8a8468915 100644
--- a/Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp
+++ b/Source/WebCore/html/parser/HTMLMetaCharsetParser.cpp
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2010 Google Inc. All Rights Reserved.
+ * Copyright (C) 2015-2016 Apple Inc. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -28,41 +29,26 @@
#include "HTMLNames.h"
#include "HTMLParserIdioms.h"
-#include "HTMLTokenizer.h"
-#include "TextCodec.h"
#include "TextEncodingRegistry.h"
-using namespace WTF;
-
namespace WebCore {
using namespace HTMLNames;
HTMLMetaCharsetParser::HTMLMetaCharsetParser()
- : m_tokenizer(std::make_unique<HTMLTokenizer>(HTMLParserOptions()))
- , m_assumedCodec(newTextCodec(Latin1Encoding()))
- , m_inHeadSection(true)
- , m_doneChecking(false)
-{
-}
-
-HTMLMetaCharsetParser::~HTMLMetaCharsetParser()
+ : m_codec(newTextCodec(Latin1Encoding()))
{
}
-static const char charsetString[] = "charset";
-static const size_t charsetLength = sizeof("charset") - 1;
-
-String HTMLMetaCharsetParser::extractCharset(const String& value)
+static StringView extractCharset(const String& value)
{
- size_t pos = 0;
unsigned length = value.length();
-
- while (pos < length) {
- pos = value.find(charsetString, pos, false);
+ for (size_t pos = 0; pos < length; ) {
+ pos = value.find("charset", pos, false);
if (pos == notFound)
break;
+ static const size_t charsetLength = sizeof("charset") - 1;
pos += charsetLength;
// Skip whitespace.
@@ -77,12 +63,10 @@ String HTMLMetaCharsetParser::extractCharset(const String& value)
while (pos < length && value[pos] <= ' ')
++pos;
- char quoteMark = 0;
- if (pos < length && (value[pos] == '"' || value[pos] == '\'')) {
- quoteMark = static_cast<char>(value[pos++]);
- ASSERT(!(quoteMark & 0x80));
- }
-
+ UChar quoteMark = 0;
+ if (pos < length && (value[pos] == '"' || value[pos] == '\''))
+ quoteMark = value[pos++];
+
if (pos == length)
break;
@@ -93,19 +77,17 @@ String HTMLMetaCharsetParser::extractCharset(const String& value)
if (quoteMark && (end == length))
break; // Close quote not found.
- return value.substring(pos, end - pos);
+ return StringView(value).substring(pos, end - pos);
}
-
- return "";
+ return StringView();
}
-bool HTMLMetaCharsetParser::processMeta()
+bool HTMLMetaCharsetParser::processMeta(HTMLToken& token)
{
- const HTMLToken::AttributeList& tokenAttributes = m_token.attributes();
AttributeList attributes;
- for (HTMLToken::AttributeList::const_iterator iter = tokenAttributes.begin(); iter != tokenAttributes.end(); ++iter) {
- String attributeName = StringImpl::create8BitIfPossible(iter->name);
- String attributeValue = StringImpl::create8BitIfPossible(iter->value);
+ for (auto& attribute : token.attributes()) {
+ String attributeName = StringImpl::create8BitIfPossible(attribute.name);
+ String attributeValue = StringImpl::create8BitIfPossible(attribute.value);
attributes.append(std::make_pair(attributeName, attributeValue));
}
@@ -116,15 +98,15 @@ bool HTMLMetaCharsetParser::processMeta()
TextEncoding HTMLMetaCharsetParser::encodingFromMetaAttributes(const AttributeList& attributes)
{
bool gotPragma = false;
- Mode mode = None;
- String charset;
+ enum { None, Charset, Pragma } mode = None;
+ StringView charset;
- for (AttributeList::const_iterator iter = attributes.begin(); iter != attributes.end(); ++iter) {
- const AtomicString& attributeName = iter->first;
- const String& attributeValue = iter->second;
+ for (auto& attribute : attributes) {
+ const String& attributeName = attribute.first;
+ const String& attributeValue = attribute.second;
if (attributeName == http_equivAttr) {
- if (equalIgnoringCase(attributeValue, "content-type"))
+ if (equalLettersIgnoringASCIICase(attributeValue, "content-type"))
gotPragma = true;
} else if (charset.isEmpty()) {
if (attributeName == charsetAttr) {
@@ -139,13 +121,11 @@ TextEncoding HTMLMetaCharsetParser::encodingFromMetaAttributes(const AttributeLi
}
if (mode == Charset || (mode == Pragma && gotPragma))
- return TextEncoding(stripLeadingAndTrailingHTMLSpaces(charset));
+ return TextEncoding(stripLeadingAndTrailingHTMLSpaces(charset.toStringWithoutCopying()));
return TextEncoding();
}
-static const int bytesToCheckUnconditionally = 1024; // That many input bytes will be checked for meta charset even if <head> section is over.
-
bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length)
{
if (m_doneChecking)
@@ -156,30 +136,32 @@ bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length)
// We still don't have an encoding, and are in the head.
// The following tags are allowed in <head>:
// SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
-
+ //
// We stop scanning when a tag that is not permitted in <head>
// is seen, rather when </head> is seen, because that more closely
// matches behavior in other browsers; more details in
// <http://bugs.webkit.org/show_bug.cgi?id=3590>.
-
+ //
// Additionally, we ignore things that looks like tags in <title>, <script>
// and <noscript>; see <http://bugs.webkit.org/show_bug.cgi?id=4560>,
// <http://bugs.webkit.org/show_bug.cgi?id=12165> and
// <http://bugs.webkit.org/show_bug.cgi?id=12389>.
-
+ //
// Since many sites have charset declarations after <body> or other tags
// that are disallowed in <head>, we don't bail out until we've checked at
// least bytesToCheckUnconditionally bytes of input.
- m_input.append(SegmentedString(m_assumedCodec->decode(data, length)));
+ constexpr int bytesToCheckUnconditionally = 1024;
+
+ m_input.append(m_codec->decode(data, length));
- while (m_tokenizer->nextToken(m_input, m_token)) {
- bool end = m_token.type() == HTMLToken::EndTag;
- if (end || m_token.type() == HTMLToken::StartTag) {
- AtomicString tagName(m_token.name());
- if (!end) {
- m_tokenizer->updateStateFor(tagName);
- if (tagName == metaTag && processMeta()) {
+ while (auto token = m_tokenizer.nextToken(m_input)) {
+ bool isEnd = token->type() == HTMLToken::EndTag;
+ if (isEnd || token->type() == HTMLToken::StartTag) {
+ AtomicString tagName(token->name());
+ if (!isEnd) {
+ m_tokenizer.updateStateFor(tagName);
+ if (tagName == metaTag && processMeta(*token)) {
m_doneChecking = true;
return true;
}
@@ -189,7 +171,8 @@ bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length)
&& tagName != styleTag && tagName != linkTag
&& tagName != metaTag && tagName != objectTag
&& tagName != titleTag && tagName != baseTag
- && (end || tagName != htmlTag) && (end || tagName != headTag)) {
+ && (isEnd || tagName != htmlTag)
+ && (isEnd || tagName != headTag)) {
m_inHeadSection = false;
}
}
@@ -198,8 +181,6 @@ bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length)
m_doneChecking = true;
return true;
}
-
- m_token.clear();
}
return false;