/* * Copyright (C) 2011 Google Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "platform/mhtml/MHTMLParser.h" #include "platform/MIMETypeRegistry.h" #include "platform/mhtml/MHTMLArchive.h" #include "platform/network/ParsedContentType.h" #include "platform/text/QuotedPrintable.h" #include "wtf/HashMap.h" #include "wtf/RefCounted.h" #include "wtf/text/Base64.h" #include "wtf/text/StringBuilder.h" #include "wtf/text/StringConcatenate.h" #include "wtf/text/StringHash.h" #include "wtf/text/WTFString.h" namespace WebCore { // This class is a limited MIME parser used to parse the MIME headers of MHTML files. class MIMEHeader : public RefCounted { public: enum Encoding { QuotedPrintable, Base64, EightBit, SevenBit, Binary, Unknown }; static PassRefPtr parseHeader(SharedBufferChunkReader* crLFLineReader); bool isMultipart() const { return m_contentType.startsWith("multipart/"); } String contentType() const { return m_contentType; } String charset() const { return m_charset; } Encoding contentTransferEncoding() const { return m_contentTransferEncoding; } String contentLocation() const { return m_contentLocation; } // Multi-part type and boundaries are only valid for multipart MIME headers. String multiPartType() const { return m_multipartType; } String endOfPartBoundary() const { return m_endOfPartBoundary; } String endOfDocumentBoundary() const { return m_endOfDocumentBoundary; } private: MIMEHeader(); static Encoding parseContentTransferEncoding(const String&); String m_contentType; String m_charset; Encoding m_contentTransferEncoding; String m_contentLocation; String m_multipartType; String m_endOfPartBoundary; String m_endOfDocumentBoundary; }; typedef HashMap KeyValueMap; static KeyValueMap retrieveKeyValuePairs(WebCore::SharedBufferChunkReader* buffer) { KeyValueMap keyValuePairs; String line; String key; StringBuilder value; while (!(line = buffer->nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { if (line.isEmpty()) break; // Empty line means end of key/value section. if (line[0] == '\t') { ASSERT(!key.isEmpty()); value.append(line.substring(1)); continue; } // New key/value, store the previous one if any. if (!key.isEmpty()) { if (keyValuePairs.find(key) != keyValuePairs.end()) WTF_LOG_ERROR("Key duplicate found in MIME header. Key is '%s', previous value replaced.", key.ascii().data()); keyValuePairs.add(key, value.toString().stripWhiteSpace()); key = String(); value.clear(); } size_t semiColonIndex = line.find(':'); if (semiColonIndex == kNotFound) { // This is not a key value pair, ignore. continue; } key = line.substring(0, semiColonIndex).lower().stripWhiteSpace(); value.append(line.substring(semiColonIndex + 1)); } // Store the last property if there is one. if (!key.isEmpty()) keyValuePairs.set(key, value.toString().stripWhiteSpace()); return keyValuePairs; } PassRefPtr MIMEHeader::parseHeader(SharedBufferChunkReader* buffer) { RefPtr mimeHeader = adoptRef(new MIMEHeader); KeyValueMap keyValuePairs = retrieveKeyValuePairs(buffer); KeyValueMap::iterator mimeParametersIterator = keyValuePairs.find("content-type"); if (mimeParametersIterator != keyValuePairs.end()) { ParsedContentType parsedContentType(mimeParametersIterator->value); mimeHeader->m_contentType = parsedContentType.mimeType(); if (!mimeHeader->isMultipart()) { mimeHeader->m_charset = parsedContentType.charset().stripWhiteSpace(); } else { mimeHeader->m_multipartType = parsedContentType.parameterValueForName("type"); mimeHeader->m_endOfPartBoundary = parsedContentType.parameterValueForName("boundary"); if (mimeHeader->m_endOfPartBoundary.isNull()) { WTF_LOG_ERROR("No boundary found in multipart MIME header."); return 0; } mimeHeader->m_endOfPartBoundary.insert("--", 0); mimeHeader->m_endOfDocumentBoundary = mimeHeader->m_endOfPartBoundary; mimeHeader->m_endOfDocumentBoundary.append("--"); } } mimeParametersIterator = keyValuePairs.find("content-transfer-encoding"); if (mimeParametersIterator != keyValuePairs.end()) mimeHeader->m_contentTransferEncoding = parseContentTransferEncoding(mimeParametersIterator->value); mimeParametersIterator = keyValuePairs.find("content-location"); if (mimeParametersIterator != keyValuePairs.end()) mimeHeader->m_contentLocation = mimeParametersIterator->value; return mimeHeader.release(); } MIMEHeader::Encoding MIMEHeader::parseContentTransferEncoding(const String& text) { String encoding = text.stripWhiteSpace().lower(); if (encoding == "base64") return Base64; if (encoding == "quoted-printable") return QuotedPrintable; if (encoding == "8bit") return EightBit; if (encoding == "7bit") return SevenBit; if (encoding == "binary") return Binary; WTF_LOG_ERROR("Unknown encoding '%s' found in MIME header.", text.ascii().data()); return Unknown; } MIMEHeader::MIMEHeader() : m_contentTransferEncoding(Unknown) { } static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary) { String line; while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { if (line == boundary) return true; } return false; } MHTMLParser::MHTMLParser(SharedBuffer* data) : m_lineReader(data, "\r\n") { } PassRefPtr MHTMLParser::parseArchive() { RefPtr header = MIMEHeader::parseHeader(&m_lineReader); return parseArchiveWithHeader(header.get()); } PassRefPtr MHTMLParser::parseArchiveWithHeader(MIMEHeader* header) { if (!header) { WTF_LOG_ERROR("Failed to parse MHTML part: no header."); return 0; } RefPtr archive = MHTMLArchive::create(); if (!header->isMultipart()) { // With IE a page with no resource is not multi-part. bool endOfArchiveReached = false; RefPtr resource = parseNextPart(*header, String(), String(), endOfArchiveReached); if (!resource) return 0; archive->setMainResource(resource); return archive; } // Skip the message content (it's a generic browser specific message). skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); bool endOfArchive = false; while (!endOfArchive) { RefPtr resourceHeader = MIMEHeader::parseHeader(&m_lineReader); if (!resourceHeader) { WTF_LOG_ERROR("Failed to parse MHTML, invalid MIME header."); return 0; } if (resourceHeader->contentType() == "multipart/alternative") { // Ignore IE nesting which makes little sense (IE seems to nest only some of the frames). RefPtr subframeArchive = parseArchiveWithHeader(resourceHeader.get()); if (!subframeArchive) { WTF_LOG_ERROR("Failed to parse MHTML subframe."); return 0; } bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); ASSERT_UNUSED(endOfPartReached, endOfPartReached); // The top-frame is the first frame found, regardless of the nesting level. if (subframeArchive->mainResource()) addResourceToArchive(subframeArchive->mainResource(), archive.get()); archive->addSubframeArchive(subframeArchive); continue; } RefPtr resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive); if (!resource) { WTF_LOG_ERROR("Failed to parse MHTML part."); return 0; } addResourceToArchive(resource.get(), archive.get()); } return archive.release(); } void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive) { const AtomicString& mimeType = resource->mimeType(); if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) || MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) || mimeType == "text/css") { m_resources.append(resource); return; } // The first document suitable resource is the main frame. if (!archive->mainResource()) { archive->setMainResource(resource); m_frames.append(archive); return; } RefPtr subframe = MHTMLArchive::create(); subframe->setMainResource(resource); m_frames.append(subframe); } PassRefPtr MHTMLParser::parseNextPart(const MIMEHeader& mimeHeader, const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached) { ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty()); // If no content transfer encoding is specified, default to binary encoding. MIMEHeader::Encoding contentTransferEncoding = mimeHeader.contentTransferEncoding(); if (contentTransferEncoding == MIMEHeader::Unknown) contentTransferEncoding = MIMEHeader::Binary; RefPtr content = SharedBuffer::create(); const bool checkBoundary = !endOfPartBoundary.isEmpty(); bool endOfPartReached = false; if (contentTransferEncoding == MIMEHeader::Binary) { if (!checkBoundary) { WTF_LOG_ERROR("Binary contents requires end of part"); return 0; } m_lineReader.setSeparator(endOfPartBoundary.utf8().data()); Vector part; if (!m_lineReader.nextChunk(part)) { WTF_LOG_ERROR("Binary contents requires end of part"); return 0; } content->append(part); m_lineReader.setSeparator("\r\n"); Vector nextChars; if (m_lineReader.peek(nextChars, 2) != 2) { WTF_LOG_ERROR("Invalid seperator."); return 0; } endOfPartReached = true; ASSERT(nextChars.size() == 2); endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-'); if (!endOfArchiveReached) { String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback(); if (!line.isEmpty()) { WTF_LOG_ERROR("No CRLF at end of binary section."); return 0; } } } else { String line; while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { endOfArchiveReached = (line == endOfDocumentBoundary); if (checkBoundary && (line == endOfPartBoundary || endOfArchiveReached)) { endOfPartReached = true; break; } // Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'. content->append(line.utf8().data(), line.length()); if (contentTransferEncoding == MIMEHeader::QuotedPrintable) { // The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines. content->append("\r\n", 2); } } } if (!endOfPartReached && checkBoundary) { WTF_LOG_ERROR("No bounday found for MHTML part."); return 0; } Vector data; switch (contentTransferEncoding) { case MIMEHeader::Base64: if (!base64Decode(content->data(), content->size(), data)) { WTF_LOG_ERROR("Invalid base64 content for MHTML part."); return 0; } break; case MIMEHeader::QuotedPrintable: quotedPrintableDecode(content->data(), content->size(), data); break; case MIMEHeader::EightBit: case MIMEHeader::SevenBit: case MIMEHeader::Binary: data.append(content->data(), content->size()); break; default: WTF_LOG_ERROR("Invalid encoding for MHTML part."); return 0; } RefPtr contentBuffer = SharedBuffer::adoptVector(data); // FIXME: the URL in the MIME header could be relative, we should resolve it if it is. // The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5 // IE and Firefox (UNMht) seem to generate only absolute URLs. KURL location = KURL(KURL(), mimeHeader.contentLocation()); return ArchiveResource::create(contentBuffer, location, AtomicString(mimeHeader.contentType()), AtomicString(mimeHeader.charset()), String()); } size_t MHTMLParser::frameCount() const { return m_frames.size(); } MHTMLArchive* MHTMLParser::frameAt(size_t index) const { return m_frames[index].get(); } size_t MHTMLParser::subResourceCount() const { return m_resources.size(); } ArchiveResource* MHTMLParser::subResourceAt(size_t index) const { return m_resources[index].get(); } }