/* * Copyright (C) 2009 Google Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "third_party/blink/public/web/web_frame_serializer.h" #include "base/macros.h" #include "third_party/blink/public/platform/web_string.h" #include "third_party/blink/public/platform/web_url.h" #include "third_party/blink/public/platform/web_url_response.h" #include "third_party/blink/public/web/web_document.h" #include "third_party/blink/public/web/web_document_loader.h" #include "third_party/blink/public/web/web_frame.h" #include "third_party/blink/public/web/web_frame_serializer_client.h" #include "third_party/blink/renderer/core/dom/document.h" #include "third_party/blink/renderer/core/dom/element.h" #include "third_party/blink/renderer/core/dom/shadow_root.h" #include "third_party/blink/renderer/core/exported/web_remote_frame_impl.h" #include "third_party/blink/renderer/core/frame/frame.h" #include "third_party/blink/renderer/core/frame/frame_serializer.h" #include "third_party/blink/renderer/core/frame/local_dom_window.h" #include "third_party/blink/renderer/core/frame/local_frame.h" #include "third_party/blink/renderer/core/frame/remote_frame.h" #include "third_party/blink/renderer/core/frame/web_frame_serializer_impl.h" #include "third_party/blink/renderer/core/frame/web_local_frame_impl.h" #include "third_party/blink/renderer/core/html/forms/html_input_element.h" #include "third_party/blink/renderer/core/html/html_all_collection.h" #include "third_party/blink/renderer/core/html/html_frame_element_base.h" #include "third_party/blink/renderer/core/html/html_frame_owner_element.h" #include "third_party/blink/renderer/core/html/html_head_element.h" #include "third_party/blink/renderer/core/html/html_image_element.h" #include "third_party/blink/renderer/core/html/html_link_element.h" #include "third_party/blink/renderer/core/html/html_table_element.h" #include "third_party/blink/renderer/core/html_names.h" #include "third_party/blink/renderer/core/input_type_names.h" #include "third_party/blink/renderer/core/layout/layout_box.h" #include "third_party/blink/renderer/core/loader/document_loader.h" #include "third_party/blink/renderer/core/loader/frame_loader.h" #include "third_party/blink/renderer/core/page/chrome_client.h" #include "third_party/blink/renderer/core/page/page.h" #include "third_party/blink/renderer/platform/instrumentation/histogram.h" #include "third_party/blink/renderer/platform/instrumentation/tracing/trace_event.h" #include "third_party/blink/renderer/platform/loader/fetch/resource_request.h" #include "third_party/blink/renderer/platform/loader/fetch/resource_response.h" #include "third_party/blink/renderer/platform/mhtml/mhtml_archive.h" #include "third_party/blink/renderer/platform/mhtml/mhtml_parser.h" #include "third_party/blink/renderer/platform/mhtml/serialized_resource.h" #include "third_party/blink/renderer/platform/shared_buffer.h" #include "third_party/blink/renderer/platform/weborigin/kurl.h" #include "third_party/blink/renderer/platform/wtf/assertions.h" #include "third_party/blink/renderer/platform/wtf/deque.h" #include "third_party/blink/renderer/platform/wtf/hash_map.h" #include "third_party/blink/renderer/platform/wtf/hash_set.h" #include "third_party/blink/renderer/platform/wtf/text/string_concatenate.h" #include "third_party/blink/renderer/platform/wtf/time.h" #include "third_party/blink/renderer/platform/wtf/vector.h" namespace blink { namespace { const int kPopupOverlayZIndexThreshold = 50; const char kShadowModeAttributeName[] = "shadowmode"; const char kShadowDelegatesFocusAttributeName[] = "shadowdelegatesfocus"; // Returns a Content-ID to be used for the given frame. // See rfc2557 - section 8.3 - "Use of the Content-ID header and CID URLs". // Format note - the returned string should be of the form "" // (i.e. the strings should include the angle brackets). String GetContentID(Frame* frame) { String frame_id = String(ToTraceValue(frame).data()); return ""; } class MHTMLFrameSerializerDelegate final : public FrameSerializer::Delegate { STACK_ALLOCATED(); public: MHTMLFrameSerializerDelegate( WebFrameSerializer::MHTMLPartsGenerationDelegate&, HeapHashSet>&); ~MHTMLFrameSerializerDelegate() override = default; bool ShouldIgnoreElement(const Element&) override; bool ShouldIgnoreAttribute(const Element&, const Attribute&) override; bool RewriteLink(const Element&, String& rewritten_link) override; bool ShouldSkipResourceWithURL(const KURL&) override; Vector GetCustomAttributes(const Element&) override; std::pair GetAuxiliaryDOMTree(const Element&) const override; bool ShouldCollectProblemMetric() override; private: bool ShouldIgnoreHiddenElement(const Element&); bool ShouldIgnoreMetaElement(const Element&); bool ShouldIgnorePopupOverlayElement(const Element&); void GetCustomAttributesForImageElement(const HTMLImageElement&, Vector*); WebFrameSerializer::MHTMLPartsGenerationDelegate& web_delegate_; HeapHashSet>& shadow_template_elements_; bool popup_overlays_skipped_; DISALLOW_COPY_AND_ASSIGN(MHTMLFrameSerializerDelegate); }; MHTMLFrameSerializerDelegate::MHTMLFrameSerializerDelegate( WebFrameSerializer::MHTMLPartsGenerationDelegate& web_delegate, HeapHashSet>& shadow_template_elements) : web_delegate_(web_delegate), shadow_template_elements_(shadow_template_elements), popup_overlays_skipped_(false) {} bool MHTMLFrameSerializerDelegate::ShouldIgnoreElement(const Element& element) { if (ShouldIgnoreHiddenElement(element)) return true; if (ShouldIgnoreMetaElement(element)) return true; if (web_delegate_.RemovePopupOverlay() && ShouldIgnorePopupOverlayElement(element)) { return true; } // Remove for stylesheets that do not load. if (IsHTMLLinkElement(element) && ToHTMLLinkElement(element).RelAttribute().IsStyleSheet() && !ToHTMLLinkElement(element).sheet()) { return true; } return false; } bool MHTMLFrameSerializerDelegate::ShouldIgnoreHiddenElement( const Element& element) { // If an iframe is in the head, it will be moved to the body when the page is // being loaded. But if an iframe is injected into the head later, it will // stay there and not been displayed. To prevent it from being brought to the // saved page and cause it being displayed, we should not include it. if (IsHTMLIFrameElement(element) && Traversal::FirstAncestor(element)) { return true; } // Do not include the element that is marked with hidden attribute. if (element.FastHasAttribute(html_names::kHiddenAttr)) return true; // Do not include the hidden form element. return IsHTMLInputElement(element) && ToHTMLInputElement(&element)->type() == input_type_names::kHidden; } bool MHTMLFrameSerializerDelegate::ShouldIgnoreMetaElement( const Element& element) { // Do not include meta elements that declare Content-Security-Policy // directives. They should have already been enforced when the original // document is loaded. Since only the rendered resources are encapsulated in // the saved MHTML page, there is no need to carry the directives. If they // are still kept in the MHTML, child frames that are referred to using cid: // scheme could be prevented from loading. if (!IsHTMLMetaElement(element)) return false; if (!element.FastHasAttribute(html_names::kContentAttr)) return false; const AtomicString& http_equiv = element.FastGetAttribute(html_names::kHttpEquivAttr); return http_equiv == "Content-Security-Policy"; } bool MHTMLFrameSerializerDelegate::ShouldIgnorePopupOverlayElement( const Element& element) { // The element should be visible. LayoutBox* box = element.GetLayoutBox(); if (!box) return false; // The bounding box of the element should contain center point of the // viewport. LocalDOMWindow* window = element.GetDocument().domWindow(); DCHECK(window); int center_x = window->innerWidth() / 2; int center_y = window->innerHeight() / 2; if (Page* page = element.GetDocument().GetPage()) { center_x = page->GetChromeClient().WindowToViewportScalar(center_x); center_y = page->GetChromeClient().WindowToViewportScalar(center_y); } LayoutPoint center_point(center_x, center_y); if (!box->FrameRect().Contains(center_point)) return false; // The z-index should be greater than the threshold. if (box->Style()->ZIndex() < kPopupOverlayZIndexThreshold) return false; popup_overlays_skipped_ = true; return true; } bool MHTMLFrameSerializerDelegate::ShouldIgnoreAttribute( const Element& element, const Attribute& attribute) { // TODO(fgorski): Presence of srcset attribute causes MHTML to not display // images, as only the value of src is pulled into the archive. Discarding // srcset prevents the problem. Long term we should make sure to MHTML plays // nicely with srcset. if (IsHTMLImageElement(element) && (attribute.LocalName() == html_names::kSrcsetAttr || attribute.LocalName() == html_names::kSizesAttr)) { return true; } // Do not save ping attribute since anyway the ping will be blocked from // MHTML. if (IsHTMLAnchorElement(element) && attribute.LocalName() == html_names::kPingAttr) { return true; } // The special attribute in a template element to denote the shadow DOM // should only be generated from MHTML serialization. If it is found in the // original page, it should be ignored. if (IsHTMLTemplateElement(element) && (attribute.LocalName() == kShadowModeAttributeName || attribute.LocalName() == kShadowDelegatesFocusAttributeName) && !shadow_template_elements_.Contains(&element)) { return true; } // If srcdoc attribute for frame elements will be rewritten as src attribute // containing link instead of html contents, don't ignore the attribute. // Bail out now to avoid the check in Element::isScriptingAttribute. bool is_src_doc_attribute = IsHTMLFrameElementBase(element) && attribute.GetName() == html_names::kSrcdocAttr; String new_link_for_the_element; if (is_src_doc_attribute && RewriteLink(element, new_link_for_the_element)) return false; // Drop integrity attribute for those links with subresource loaded. if (attribute.LocalName() == html_names::kIntegrityAttr && IsHTMLLinkElement(element) && ToHTMLLinkElement(element).sheet()) { return true; } // Do not include attributes that contain javascript. This is because the // script will not be executed when a MHTML page is being loaded. return element.IsScriptingAttribute(attribute); } bool MHTMLFrameSerializerDelegate::RewriteLink(const Element& element, String& rewritten_link) { auto* frame_owner = DynamicTo(element); if (!frame_owner) return false; Frame* frame = frame_owner->ContentFrame(); if (!frame) return false; WebString content_id = GetContentID(frame); KURL cid_uri = MHTMLParser::ConvertContentIDToURI(content_id); DCHECK(cid_uri.IsValid()); rewritten_link = cid_uri.GetString(); return true; } bool MHTMLFrameSerializerDelegate::ShouldSkipResourceWithURL(const KURL& url) { return web_delegate_.ShouldSkipResource(url); } Vector MHTMLFrameSerializerDelegate::GetCustomAttributes( const Element& element) { Vector attributes; if (auto* image = ToHTMLImageElementOrNull(element)) { GetCustomAttributesForImageElement(*image, &attributes); } return attributes; } bool MHTMLFrameSerializerDelegate::ShouldCollectProblemMetric() { return web_delegate_.UsePageProblemDetectors(); } void MHTMLFrameSerializerDelegate::GetCustomAttributesForImageElement( const HTMLImageElement& element, Vector* attributes) { // Currently only the value of src is pulled into the archive and the srcset // attribute is ignored (see shouldIgnoreAttribute() above). If the device // has a higher DPR, a different image from srcset could be loaded instead. // When this occurs, we should provide the rendering width and height for // element if not set. // The image should be loaded and participate the layout. ImageResourceContent* image = element.CachedImage(); if (!image || !image->HasImage() || image->ErrorOccurred() || !element.GetLayoutObject()) { return; } // The width and height attributes should not be set. if (element.FastHasAttribute(html_names::kWidthAttr) || element.FastHasAttribute(html_names::kHeightAttr)) { return; } // Check if different image is loaded. naturalWidth/naturalHeight will return // the image size adjusted with current DPR. if (((int)element.naturalWidth()) == image->GetImage()->width() && ((int)element.naturalHeight()) == image->GetImage()->height()) { return; } Attribute width_attribute(html_names::kWidthAttr, AtomicString::Number(element.LayoutBoxWidth())); attributes->push_back(width_attribute); Attribute height_attribute(html_names::kHeightAttr, AtomicString::Number(element.LayoutBoxHeight())); attributes->push_back(height_attribute); } std::pair MHTMLFrameSerializerDelegate::GetAuxiliaryDOMTree( const Element& element) const { ShadowRoot* shadow_root = element.GetShadowRoot(); if (!shadow_root) return std::pair(); String shadow_mode; switch (shadow_root->GetType()) { case ShadowRootType::kUserAgent: // No need to serialize. return std::pair(); case ShadowRootType::V0: shadow_mode = "v0"; break; case ShadowRootType::kOpen: shadow_mode = "open"; break; case ShadowRootType::kClosed: shadow_mode = "closed"; break; } // Put the shadow DOM content inside a template element. A special attribute // is set to tell the mode of the shadow DOM. Element* template_element = Element::Create(html_names::kTemplateTag, &(element.GetDocument())); template_element->setAttribute( QualifiedName(g_null_atom, kShadowModeAttributeName, g_null_atom), AtomicString(shadow_mode)); if (shadow_root->GetType() != ShadowRootType::V0 && shadow_root->delegatesFocus()) { template_element->setAttribute( QualifiedName(g_null_atom, kShadowDelegatesFocusAttributeName, g_null_atom), g_empty_atom); } shadow_template_elements_.insert(template_element); return std::pair(shadow_root, template_element); } } // namespace WebThreadSafeData WebFrameSerializer::GenerateMHTMLHeader( const WebString& boundary, WebLocalFrame* frame, MHTMLPartsGenerationDelegate* delegate) { TRACE_EVENT0("page-serialization", "WebFrameSerializer::generateMHTMLHeader"); DCHECK(frame); DCHECK(delegate); auto* web_local_frame = To(frame); Document* document = web_local_frame->GetFrame()->GetDocument(); scoped_refptr buffer = RawData::Create(); MHTMLArchive::GenerateMHTMLHeader( boundary, document->Url(), document->title(), document->SuggestedMIMEType(), base::Time::Now(), *buffer->MutableData()); return WebThreadSafeData(buffer); } WebThreadSafeData WebFrameSerializer::GenerateMHTMLParts( const WebString& boundary, WebLocalFrame* web_frame, MHTMLPartsGenerationDelegate* web_delegate) { TRACE_EVENT0("page-serialization", "WebFrameSerializer::generateMHTMLParts"); DCHECK(web_frame); DCHECK(web_delegate); // Translate arguments from public to internal blink APIs. LocalFrame* frame = To(web_frame)->GetFrame(); MHTMLArchive::EncodingPolicy encoding_policy = web_delegate->UseBinaryEncoding() ? MHTMLArchive::EncodingPolicy::kUseBinaryEncoding : MHTMLArchive::EncodingPolicy::kUseDefaultEncoding; // Serialize. TRACE_EVENT_BEGIN0("page-serialization", "WebFrameSerializer::generateMHTMLParts serializing"); Deque resources; { SCOPED_BLINK_UMA_HISTOGRAM_TIMER( "PageSerialization.MhtmlGeneration.SerializationTime.SingleFrame"); HeapHashSet> shadow_template_elements; MHTMLFrameSerializerDelegate core_delegate(*web_delegate, shadow_template_elements); FrameSerializer serializer(resources, core_delegate); serializer.SerializeFrame(*frame); } TRACE_EVENT_END1("page-serialization", "WebFrameSerializer::generateMHTMLParts serializing", "resource count", static_cast(resources.size())); // There was an error serializing the frame (e.g. of an image resource). if (resources.IsEmpty()) return WebThreadSafeData(); // Encode serialized resources as MHTML. scoped_refptr output = RawData::Create(); { SCOPED_BLINK_UMA_HISTOGRAM_TIMER( "PageSerialization.MhtmlGeneration.EncodingTime.SingleFrame"); // Frame is the 1st resource (see FrameSerializer::serializeFrame doc // comment). Frames get a Content-ID header. MHTMLArchive::GenerateMHTMLPart(boundary, GetContentID(frame), encoding_policy, resources.TakeFirst(), *output->MutableData()); while (!resources.IsEmpty()) { TRACE_EVENT0("page-serialization", "WebFrameSerializer::generateMHTMLParts encoding"); MHTMLArchive::GenerateMHTMLPart(boundary, String(), encoding_policy, resources.TakeFirst(), *output->MutableData()); } } return WebThreadSafeData(output); } bool WebFrameSerializer::Serialize( WebLocalFrame* frame, WebFrameSerializerClient* client, WebFrameSerializer::LinkRewritingDelegate* delegate, bool save_with_empty_url) { WebFrameSerializerImpl serializer_impl(frame, client, delegate, save_with_empty_url); return serializer_impl.Serialize(); } WebString WebFrameSerializer::GenerateMetaCharsetDeclaration( const WebString& charset) { // TODO(yosin) We should call |FrameSerializer::metaCharsetDeclarationOf()|. String charset_string = ""; return charset_string; } WebString WebFrameSerializer::GenerateMarkOfTheWebDeclaration( const WebURL& url) { StringBuilder builder; builder.Append("\n\n"); return builder.ToString(); } } // namespace blink