diff options
author | Allan Sandfeld Jensen <allan.jensen@theqtcompany.com> | 2016-08-01 12:59:39 +0200 |
---|---|---|
committer | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2016-08-04 12:40:43 +0000 |
commit | 28b1110370900897ab652cb420c371fab8857ad4 (patch) | |
tree | 41b32127d23b0df4f2add2a27e12dc87bddb260e /chromium/pdf | |
parent | 399c965b6064c440ddcf4015f5f8e9d131c7a0a6 (diff) | |
download | qtwebengine-chromium-28b1110370900897ab652cb420c371fab8857ad4.tar.gz |
BASELINE: Update Chromium to 53.0.2785.41
Also adds a few extra files for extensions.
Change-Id: Iccdd55d98660903331cf8b7b29188da781830af4
Reviewed-by: Michael Brüning <michael.bruning@qt.io>
Diffstat (limited to 'chromium/pdf')
-rw-r--r-- | chromium/pdf/document_loader.cc | 153 | ||||
-rw-r--r-- | chromium/pdf/out_of_process_instance.cc | 161 | ||||
-rw-r--r-- | chromium/pdf/out_of_process_instance.h | 15 | ||||
-rw-r--r-- | chromium/pdf/paint_aggregator.h | 2 | ||||
-rw-r--r-- | chromium/pdf/pdf.cc | 90 | ||||
-rw-r--r-- | chromium/pdf/pdf_engine.h | 18 | ||||
-rw-r--r-- | chromium/pdf/pdfium/fuzzers/BUILD.gn | 53 | ||||
-rw-r--r-- | chromium/pdf/pdfium/fuzzers/dicts/pdf_css.dict | 73 | ||||
-rw-r--r-- | chromium/pdf/pdfium/pdfium_engine.cc | 120 | ||||
-rw-r--r-- | chromium/pdf/pdfium/pdfium_engine.h | 26 | ||||
-rw-r--r-- | chromium/pdf/pdfium/pdfium_page.cc | 224 | ||||
-rw-r--r-- | chromium/pdf/pdfium/pdfium_page.h | 17 |
12 files changed, 566 insertions, 386 deletions
diff --git a/chromium/pdf/document_loader.cc b/chromium/pdf/document_loader.cc index 3117d08fffc..ae608eff93e 100644 --- a/chromium/pdf/document_loader.cc +++ b/chromium/pdf/document_loader.cc @@ -410,93 +410,94 @@ void DocumentLoader::ReadMore() { } void DocumentLoader::DidRead(int32_t result) { - if (result > 0) { - char* start = buffer_; - size_t length = result; - if (is_multipart_ && result > 2) { - for (int i = 2; i < result; ++i) { - if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') || - (i >= 4 && - buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' && - buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) { - uint32_t start_pos, end_pos; - if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) { - current_pos_ = start_pos; - start += i; - length -= i; - if (end_pos && end_pos > start_pos) - current_chunk_size_ = end_pos - start_pos + 1; - } - break; + if (result <= 0) { + // If |result| == PP_OK, the document was loaded, otherwise an error was + // encountered. Either way we want to stop processing the response. In the + // case where an error occurred, the renderer will detect that we're missing + // data and will display a message. + ReadComplete(); + return; + } + + char* start = buffer_; + size_t length = result; + if (is_multipart_ && result > 2) { + for (int i = 2; i < result; ++i) { + if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') || + (i >= 4 && buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' && + buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) { + uint32_t start_pos, end_pos; + if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) { + current_pos_ = start_pos; + start += i; + length -= i; + if (end_pos && end_pos > start_pos) + current_chunk_size_ = end_pos - start_pos + 1; } + break; } - - // Reset this flag so we don't look inside the buffer in future calls of - // DidRead for this response. 
Note that this code DOES NOT handle multi- - // part responses with more than one part (we don't issue them at the - // moment, so they shouldn't arrive). - is_multipart_ = false; } - if (current_chunk_size_ && - current_chunk_read_ + length > current_chunk_size_) - length = current_chunk_size_ - current_chunk_read_; - - if (length) { - if (document_size_ > 0) { - chunk_stream_.WriteData(current_pos_, start, length); - } else { - // If we did not get content-length in the response, we can't - // preallocate buffer for the entire document. Resizing array causing - // memory fragmentation issues on the large files and OOM exceptions. - // To fix this, we collect all chunks of the file to the list and - // concatenate them together after request is complete. - std::vector<unsigned char> buf(length); - memcpy(buf.data(), start, length); - chunk_buffer_.push_back(std::move(buf)); - } - current_pos_ += length; - current_chunk_read_ += length; - client_->OnNewDataAvailable(); - } + // Reset this flag so we don't look inside the buffer in future calls of + // DidRead for this response. Note that this code DOES NOT handle multi- + // part responses with more than one part (we don't issue them at the + // moment, so they shouldn't arrive). + is_multipart_ = false; + } - // Only call the renderer if we allow partial loading. - if (!partial_document_) { - ReadMore(); - return; + if (current_chunk_size_ && current_chunk_read_ + length > current_chunk_size_) + length = current_chunk_size_ - current_chunk_read_; + + if (length) { + if (document_size_ > 0) { + chunk_stream_.WriteData(current_pos_, start, length); + } else { + // If we did not get content-length in the response, we can't + // preallocate buffer for the entire document. Resizing array causing + // memory fragmentation issues on the large files and OOM exceptions. + // To fix this, we collect all chunks of the file to the list and + // concatenate them together after request is complete. 
+ std::vector<unsigned char> buf(length); + memcpy(buf.data(), start, length); + chunk_buffer_.push_back(std::move(buf)); } + current_pos_ += length; + current_chunk_read_ += length; + client_->OnNewDataAvailable(); + } - UpdateRendering(); - RemoveCompletedRanges(); + // Only call the renderer if we allow partial loading. + if (!partial_document_) { + ReadMore(); + return; + } - if (!pending_requests_.empty()) { - // If there are pending requests and the current content we're downloading - // doesn't satisfy any of these requests, cancel the current request to - // fullfill those more important requests. - bool satisfying_pending_request = - SatisfyingRequest(current_request_offset_, current_request_size_); - for (const auto& pending_request : pending_requests_) { - if (SatisfyingRequest(pending_request.first, pending_request.second)) { - satisfying_pending_request = true; - break; - } - } - // Cancel the request as it's not satisfying any request from the - // renderer, unless the current request is finished in which case we let - // it finish cleanly. - if (!satisfying_pending_request && - current_pos_ < current_request_offset_ + - current_request_extended_size_) { - loader_.Close(); + UpdateRendering(); + RemoveCompletedRanges(); + + if (!pending_requests_.empty()) { + // If there are pending requests and the current content we're downloading + // doesn't satisfy any of these requests, cancel the current request to + // fullfill those more important requests. 
+ bool satisfying_pending_request = + SatisfyingRequest(current_request_offset_, current_request_size_); + for (const auto& pending_request : pending_requests_) { + if (SatisfyingRequest(pending_request.first, pending_request.second)) { + satisfying_pending_request = true; + break; } } - - ReadMore(); - } else if (result == PP_OK || result == PP_ERROR_ABORTED) { - ReadComplete(); - } else { - NOTREACHED(); + // Cancel the request as it's not satisfying any request from the + // renderer, unless the current request is finished in which case we let + // it finish cleanly. + if (!satisfying_pending_request && + current_pos_ < + current_request_offset_ + current_request_extended_size_) { + loader_.Close(); + } } + + ReadMore(); } bool DocumentLoader::SatisfyingRequest(size_t offset, size_t size) const { diff --git a/chromium/pdf/out_of_process_instance.cc b/chromium/pdf/out_of_process_instance.cc index 869fdf24958..ae680bde0ef 100644 --- a/chromium/pdf/out_of_process_instance.cc +++ b/chromium/pdf/out_of_process_instance.cc @@ -13,8 +13,6 @@ #include <math.h> #include <list> -#include "base/json/json_reader.h" -#include "base/json/json_writer.h" #include "base/logging.h" #include "base/strings/string_number_conversions.h" #include "base/strings/string_split.h" @@ -51,11 +49,6 @@ const char kChromePrint[] = "chrome://print/"; const char kChromeExtension[] = "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai"; -// Dictionary Value key names for the document accessibility info -const char kAccessibleNumberOfPages[] = "numberOfPages"; -const char kAccessibleLoaded[] = "loaded"; -const char kAccessibleCopyable[] = "copyable"; - // Constants used in handling postMessage() messages. const char kType[] = "type"; // Viewport message arguments. (Page -> Plugin). 
@@ -106,12 +99,6 @@ const char kJSPreviewPageIndex[] = "index"; const char kJSSetScrollPositionType[] = "setScrollPosition"; const char kJSPositionX[] = "x"; const char kJSPositionY[] = "y"; -// Request accessibility JSON data (Page -> Plugin) -const char kJSGetAccessibilityJSONType[] = "getAccessibilityJSON"; -const char kJSAccessibilityPageNumber[] = "page"; -// Reply with accessibility JSON data (Plugin -> Page) -const char kJSGetAccessibilityJSONReplyType[] = "getAccessibilityJSONReply"; -const char kJSAccessibilityJSON[] = "json"; // Cancel the stream URL request (Plugin -> Page) const char kJSCancelStreamUrlType[] = "cancelStreamUrl"; // Navigate to the given URL (Plugin -> Page) @@ -153,6 +140,10 @@ const char kJSFieldFocus[] = "focused"; const int kFindResultCooldownMs = 100; +// A delay to wait between each accessibility page to keep the system +// responsive. +const int kAccessibilityPageDelayMs = 100; + const double kMinZoom = 0.01; namespace { @@ -206,10 +197,20 @@ PP_Bool GetPrintPresetOptionsFromDocument( return PP_TRUE; } +void EnableAccessibility(PP_Instance instance) { + void* object = pp::Instance::GetPerInstanceObject(instance, kPPPPdfInterface); + if (object) { + OutOfProcessInstance* obj_instance = + static_cast<OutOfProcessInstance*>(object); + return obj_instance->EnableAccessibility(); + } +} + const PPP_Pdf ppp_private = { &GetLinkAtPosition, &Transform, - &GetPrintPresetOptionsFromDocument + &GetPrintPresetOptionsFromDocument, + &EnableAccessibility, }; int ExtractPrintPreviewPageIndex(const std::string& src_url) { @@ -288,7 +289,8 @@ OutOfProcessInstance::OutOfProcessInstance(PP_Instance instance) did_call_start_loading_(false), stop_scrolling_(false), background_color_(0), - top_toolbar_height_(0) { + top_toolbar_height_(0), + accessibility_state_(ACCESSIBILITY_STATE_OFF) { loader_factory_.Initialize(this); timer_factory_.Initialize(this); form_factory_.Initialize(this); @@ -446,27 +448,6 @@ void OutOfProcessInstance::HandleMessage(const 
pp::Var& message) { dict.Get(pp::Var(kJSPreviewPageIndex)).is_int()) { ProcessPreviewPageInfo(dict.Get(pp::Var(kJSPreviewPageUrl)).AsString(), dict.Get(pp::Var(kJSPreviewPageIndex)).AsInt()); - } else if (type == kJSGetAccessibilityJSONType) { - pp::VarDictionary reply; - reply.Set(pp::Var(kType), pp::Var(kJSGetAccessibilityJSONReplyType)); - if (dict.Get(pp::Var(kJSAccessibilityPageNumber)).is_int()) { - int page = dict.Get(pp::Var(kJSAccessibilityPageNumber)).AsInt(); - reply.Set(pp::Var(kJSAccessibilityJSON), - pp::Var(engine_->GetPageAsJSON(page))); - } else { - base::DictionaryValue node; - node.SetInteger(kAccessibleNumberOfPages, engine_->GetNumberOfPages()); - node.SetBoolean(kAccessibleLoaded, - document_load_state_ != LOAD_STATE_LOADING); - bool has_permissions = - engine_->HasPermission(PDFEngine::PERMISSION_COPY) || - engine_->HasPermission(PDFEngine::PERMISSION_COPY_ACCESSIBLE); - node.SetBoolean(kAccessibleCopyable, has_permissions); - std::string json; - base::JSONWriter::Write(node, &json); - reply.Set(pp::Var(kJSAccessibilityJSON), pp::Var(json)); - } - PostMessage(reply); } else if (type == kJSStopScrollingType) { stop_scrolling_ = true; } else if (type == kJSGetSelectedTextType) { @@ -614,6 +595,109 @@ void OutOfProcessInstance::GetPrintPresetOptionsFromDocument( options->uniform_page_size = uniform_page_size; } +void OutOfProcessInstance::EnableAccessibility() { + if (accessibility_state_ == ACCESSIBILITY_STATE_LOADED) + return; + + if (accessibility_state_ == ACCESSIBILITY_STATE_OFF) + accessibility_state_ = ACCESSIBILITY_STATE_PENDING; + + if (document_load_state_ == LOAD_STATE_COMPLETE) + LoadAccessibility(); +} + +void OutOfProcessInstance::LoadAccessibility() { + accessibility_state_ = ACCESSIBILITY_STATE_LOADED; + PP_PrivateAccessibilityDocInfo doc_info; + doc_info.page_count = engine_->GetNumberOfPages(); + doc_info.text_accessible = PP_FromBool( + engine_->HasPermission(PDFEngine::PERMISSION_COPY_ACCESSIBLE)); + doc_info.text_copyable = 
PP_FromBool( + engine_->HasPermission(PDFEngine::PERMISSION_COPY)); + + pp::PDF::SetAccessibilityDocInfo(GetPluginInstance(), &doc_info); + + // If the document contents isn't accessible, don't send anything more. + if (!(engine_->HasPermission(PDFEngine::PERMISSION_COPY) || + engine_->HasPermission(PDFEngine::PERMISSION_COPY_ACCESSIBLE))) { + return; + } + + PP_PrivateAccessibilityViewportInfo viewport_info; + viewport_info.scroll.x = 0; + viewport_info.scroll.y = -top_toolbar_height_ * device_scale_; + viewport_info.offset = available_area_.point(); + viewport_info.zoom = zoom_ * device_scale_; + pp::PDF::SetAccessibilityViewportInfo(GetPluginInstance(), &viewport_info); + + // Schedule loading the first page. + pp::CompletionCallback callback = timer_factory_.NewCallback( + &OutOfProcessInstance::SendNextAccessibilityPage); + pp::Module::Get()->core()->CallOnMainThread(kAccessibilityPageDelayMs, + callback, 0); +} + +void OutOfProcessInstance::SendNextAccessibilityPage(int32_t page_index) { + int page_count = engine_->GetNumberOfPages(); + if (page_index < 0 || page_index >= page_count) + return; + + int char_count = engine_->GetCharCount(page_index); + PP_PrivateAccessibilityPageInfo page_info; + page_info.page_index = page_index; + page_info.bounds = engine_->GetPageBoundsRect(page_index); + page_info.char_count = char_count; + + std::vector<PP_PrivateAccessibilityCharInfo> chars(page_info.char_count); + for (uint32_t i = 0; i < page_info.char_count; ++i) { + chars[i].unicode_character = engine_->GetCharUnicode(page_index, i); + } + + std::vector<PP_PrivateAccessibilityTextRunInfo> text_runs; + int char_index = 0; + while (char_index < char_count) { + PP_PrivateAccessibilityTextRunInfo text_run_info; + pp::FloatRect bounds; + engine_->GetTextRunInfo(page_index, char_index, &text_run_info.len, + &text_run_info.font_size, &bounds); + DCHECK_LE(char_index + text_run_info.len, + static_cast<uint32_t>(char_count)); + text_run_info.direction = 
PP_PRIVATEDIRECTION_LTR; + text_run_info.bounds = bounds; + text_runs.push_back(text_run_info); + + // We need to provide enough information to draw a bounding box + // around any arbitrary text range, but the bounding boxes of characters + // we get from PDFium don't necessarily "line up". Walk through the + // characters in each text run and let the width of each character be + // the difference between the x coordinate of one character and the + // x coordinate of the next. The rest of the bounds of each character + // can be computed from the bounds of the text run. + pp::FloatRect char_bounds = engine_->GetCharBounds(page_index, char_index); + for (uint32_t i = 0; i < text_run_info.len - 1; i++) { + DCHECK_LT(char_index + i + 1, + static_cast<uint32_t>(char_count)); + pp::FloatRect next_char_bounds = engine_->GetCharBounds( + page_index, char_index + i + 1); + chars[char_index + i].char_width = next_char_bounds.x() - char_bounds.x(); + char_bounds = next_char_bounds; + } + chars[char_index + text_run_info.len - 1].char_width = char_bounds.width(); + + char_index += text_run_info.len; + } + + page_info.text_run_count = text_runs.size(); + pp::PDF::SetAccessibilityPageInfo(GetPluginInstance(), &page_info, + text_runs.data(), chars.data()); + + // Schedule loading the next page. 
+ pp::CompletionCallback callback = timer_factory_.NewCallback( + &OutOfProcessInstance::SendNextAccessibilityPage); + pp::Module::Get()->core()->CallOnMainThread(kAccessibilityPageDelayMs, + callback, page_index + 1); +} + pp::Var OutOfProcessInstance::GetLinkAtPosition( const pp::Point& point) { pp::Point offset_point(point); @@ -1164,6 +1248,9 @@ void OutOfProcessInstance::DocumentLoadComplete(int page_count) { pp::PDF::SetContentRestriction(this, content_restrictions); uma_.HistogramCustomCounts("PDF.PageCount", page_count, 1, 1000000, 50); + + if (accessibility_state_ == ACCESSIBILITY_STATE_PENDING) + LoadAccessibility(); } void OutOfProcessInstance::RotateClockwise() { @@ -1324,7 +1411,7 @@ void OutOfProcessInstance::OnGeometryChanged(double old_zoom, engine_->PageOffsetUpdated(available_area_.point()); engine_->PluginSizeUpdated(available_area_.size()); - if (!document_size_.GetArea()) + if (document_size_.IsEmpty()) return; paint_manager_.InvalidateRect(pp::Rect(pp::Point(), plugin_size_)); } diff --git a/chromium/pdf/out_of_process_instance.h b/chromium/pdf/out_of_process_instance.h index 0842d33bde1..fb08986078a 100644 --- a/chromium/pdf/out_of_process_instance.h +++ b/chromium/pdf/out_of_process_instance.h @@ -77,6 +77,13 @@ class OutOfProcessInstance : public pp::Instance, // pp::Private implementation. pp::Var GetLinkAtPosition(const pp::Point& point); void GetPrintPresetOptionsFromDocument(PP_PdfPrintPresetOptions_Dev* options); + void EnableAccessibility(); + + // Start loading accessibility information. + void LoadAccessibility(); + + // Send accessibility information about the given page index. + void SendNextAccessibilityPage(int32_t page_index); void FlushCallback(int32_t result); void DidOpen(int32_t result); @@ -342,6 +349,14 @@ class OutOfProcessInstance : public pp::Instance, // toolbar. int top_toolbar_height_; + // The current state of accessibility: either off, enabled but waiting + // for the document to load, or fully loaded. 
+ enum AccessibilityState { + ACCESSIBILITY_STATE_OFF, + ACCESSIBILITY_STATE_PENDING, // Enabled but waiting for doc to load. + ACCESSIBILITY_STATE_LOADED + } accessibility_state_; + DISALLOW_COPY_AND_ASSIGN(OutOfProcessInstance); }; diff --git a/chromium/pdf/paint_aggregator.h b/chromium/pdf/paint_aggregator.h index 96f61e08787..74737ab8fb6 100644 --- a/chromium/pdf/paint_aggregator.h +++ b/chromium/pdf/paint_aggregator.h @@ -90,7 +90,7 @@ class PaintAggregator { // InvalidateRect. We need to know this distinction for some operations. // // - The paint bounds union is computed on the fly so we don't have to keep - // a rectangle up-to-date as we do different operations. + // a rectangle up to date as we do different operations. class InternalPaintUpdate { public: InternalPaintUpdate(); diff --git a/chromium/pdf/pdf.cc b/chromium/pdf/pdf.cc index fa43eaeb3c7..bad2ac4e2a8 100644 --- a/chromium/pdf/pdf.cc +++ b/chromium/pdf/pdf.cc @@ -31,7 +31,7 @@ PDFModule::PDFModule() { PDFModule::~PDFModule() { if (g_sdk_initialized_via_pepper) { - chrome_pdf::ShutdownSDK(); + ShutdownSDK(); g_sdk_initialized_via_pepper = false; } } @@ -51,8 +51,8 @@ pp::Instance* PDFModule::CreateInstance(PP_Instance instance) { v8::V8::SetNativesDataBlob(&natives); v8::V8::SetSnapshotDataBlob(&snapshot); } - if (!chrome_pdf::InitializeSDK()) - return NULL; + if (!InitializeSDK()) + return nullptr; g_sdk_initialized_via_pepper = true; } @@ -63,78 +63,73 @@ pp::Instance* PDFModule::CreateInstance(PP_Instance instance) { // Implementation of Global PPP functions --------------------------------- int32_t PPP_InitializeModule(PP_Module module_id, PPB_GetInterface get_browser_interface) { - PDFModule* module = new PDFModule(); - if (!module->InternalInit(module_id, get_browser_interface)) { - delete module; + std::unique_ptr<PDFModule> module(new PDFModule); + if (!module->InternalInit(module_id, get_browser_interface)) return PP_ERROR_FAILED; - } - pp::InternalSetModuleSingleton(module); + 
pp::InternalSetModuleSingleton(module.release()); return PP_OK; } void PPP_ShutdownModule() { delete pp::Module::Get(); - pp::InternalSetModuleSingleton(NULL); + pp::InternalSetModuleSingleton(nullptr); } const void* PPP_GetInterface(const char* interface_name) { - if (!pp::Module::Get()) - return NULL; - return pp::Module::Get()->GetPluginInterface(interface_name); + auto* module = pp::Module::Get(); + return module ? module->GetPluginInterface(interface_name) : nullptr; } #if defined(OS_WIN) bool RenderPDFPageToDC(const void* pdf_buffer, - int buffer_size, - int page_number, - HDC dc, - int dpi, - int bounds_origin_x, - int bounds_origin_y, - int bounds_width, - int bounds_height, - bool fit_to_bounds, - bool stretch_to_bounds, - bool keep_aspect_ratio, - bool center_in_bounds, - bool autorotate) { + int buffer_size, + int page_number, + HDC dc, + int dpi, + int bounds_origin_x, + int bounds_origin_y, + int bounds_width, + int bounds_height, + bool fit_to_bounds, + bool stretch_to_bounds, + bool keep_aspect_ratio, + bool center_in_bounds, + bool autorotate) { if (!g_sdk_initialized_via_pepper) { - if (!chrome_pdf::InitializeSDK()) { + if (!InitializeSDK()) { return false; } } - chrome_pdf::PDFEngineExports* engine_exports = - chrome_pdf::PDFEngineExports::Get(); - chrome_pdf::PDFEngineExports::RenderingSettings settings( - dpi, dpi, pp::Rect(bounds_origin_x, bounds_origin_y, bounds_width, - bounds_height), + PDFEngineExports* engine_exports = PDFEngineExports::Get(); + PDFEngineExports::RenderingSettings settings( + dpi, dpi, + pp::Rect(bounds_origin_x, bounds_origin_y, bounds_width, bounds_height), fit_to_bounds, stretch_to_bounds, keep_aspect_ratio, center_in_bounds, autorotate); bool ret = engine_exports->RenderPDFPageToDC(pdf_buffer, buffer_size, page_number, settings, dc); - if (!g_sdk_initialized_via_pepper) { - chrome_pdf::ShutdownSDK(); - } + if (!g_sdk_initialized_via_pepper) + ShutdownSDK(); + return ret; } -#endif // OS_WIN +#endif // defined(OS_WIN) 
bool GetPDFDocInfo(const void* pdf_buffer, int buffer_size, int* page_count, double* max_page_width) { if (!g_sdk_initialized_via_pepper) { - if (!chrome_pdf::InitializeSDK()) + if (!InitializeSDK()) return false; } - chrome_pdf::PDFEngineExports* engine_exports = - chrome_pdf::PDFEngineExports::Get(); + PDFEngineExports* engine_exports = PDFEngineExports::Get(); bool ret = engine_exports->GetPDFDocInfo( pdf_buffer, buffer_size, page_count, max_page_width); - if (!g_sdk_initialized_via_pepper) { - chrome_pdf::ShutdownSDK(); - } + if (!g_sdk_initialized_via_pepper) + ShutdownSDK(); + return ret; } @@ -163,19 +158,18 @@ bool RenderPDFPageToBitmap(const void* pdf_buffer, int dpi, bool autorotate) { if (!g_sdk_initialized_via_pepper) { - if (!chrome_pdf::InitializeSDK()) + if (!InitializeSDK()) return false; } - chrome_pdf::PDFEngineExports* engine_exports = - chrome_pdf::PDFEngineExports::Get(); - chrome_pdf::PDFEngineExports::RenderingSettings settings( + PDFEngineExports* engine_exports = PDFEngineExports::Get(); + PDFEngineExports::RenderingSettings settings( dpi, dpi, pp::Rect(bitmap_width, bitmap_height), true, false, true, true, autorotate); bool ret = engine_exports->RenderPDFPageToBitmap( pdf_buffer, pdf_buffer_size, page_number, settings, bitmap_buffer); - if (!g_sdk_initialized_via_pepper) { - chrome_pdf::ShutdownSDK(); - } + if (!g_sdk_initialized_via_pepper) + ShutdownSDK(); + return ret; } diff --git a/chromium/pdf/pdf_engine.h b/chromium/pdf/pdf_engine.h index c5d33c87066..5bba8b9f860 100644 --- a/chromium/pdf/pdf_engine.h +++ b/chromium/pdf/pdf_engine.h @@ -233,6 +233,8 @@ class PDFEngine { virtual int GetMostVisiblePage() = 0; // Gets the rectangle of the page including shadow. virtual pp::Rect GetPageRect(int index) = 0; + // Gets the rectangle of the page not including the shadow. + virtual pp::Rect GetPageBoundsRect(int index) = 0; // Gets the rectangle of the page excluding any additional areas. 
virtual pp::Rect GetPageContentsRect(int index) = 0; // Returns a page's rect in screen coordinates, as well as its surrounding @@ -245,8 +247,20 @@ class PDFEngine { virtual void SetGrayscale(bool grayscale) = 0; // Callback for timer that's set with ScheduleCallback(). virtual void OnCallback(int id) = 0; - // Gets the JSON representation of the PDF file - virtual std::string GetPageAsJSON(int index) = 0; + // Get the number of characters on a given page. + virtual int GetCharCount(int page_index) = 0; + // Get the bounds in page pixels of a character on a given page. + virtual pp::FloatRect GetCharBounds(int page_index, int char_index) = 0; + // Get a given unicode character on a given page. + virtual uint32_t GetCharUnicode(int page_index, int char_index) = 0; + // Given a start char index, find the longest continuous run of text that's + // in a single direction and with the same style and font size. Return the + // length of that sequence and its font size and bounding box. + virtual void GetTextRunInfo(int page_index, + int start_char_index, + uint32_t* out_len, + double* out_font_size, + pp::FloatRect* out_bounds) = 0; // Gets the PDF document's print scaling preference. True if the document can // be scaled to fit. 
virtual bool GetPrintScaling() = 0; diff --git a/chromium/pdf/pdfium/fuzzers/BUILD.gn b/chromium/pdf/pdfium/fuzzers/BUILD.gn index fce0b8c4eac..2df2e9afded 100644 --- a/chromium/pdf/pdfium/fuzzers/BUILD.gn +++ b/chromium/pdf/pdfium/fuzzers/BUILD.gn @@ -22,7 +22,7 @@ fuzzer_test("pdfium_fuzzer") { "//v8:v8_libplatform", ] additional_configs = [ - "//third_party/pdfium:pdfium_config", + "//third_party/pdfium:pdfium_core_config", "//v8:external_startup_data", ] dict = "dicts/pdf.dict" @@ -36,6 +36,49 @@ fuzzer_test("pdf_jpx_fuzzer") { } if (pdf_enable_xfa) { + fuzzer_test("pdf_codec_bmp_fuzzer") { + sources = [] + deps = [ + "//third_party/pdfium/testing/libfuzzer:pdf_codec_bmp_fuzzer", + ] + } + + fuzzer_test("pdf_codec_gif_fuzzer") { + sources = [] + deps = [ + "//third_party/pdfium/testing/libfuzzer:pdf_codec_gif_fuzzer", + ] + } + + fuzzer_test("pdf_codec_jpeg_fuzzer") { + sources = [] + deps = [ + "//third_party/pdfium/testing/libfuzzer:pdf_codec_jpeg_fuzzer", + ] + } + + fuzzer_test("pdf_codec_png_fuzzer") { + sources = [] + deps = [ + "//third_party/pdfium/testing/libfuzzer:pdf_codec_png_fuzzer", + ] + } + + fuzzer_test("pdf_codec_tiff_fuzzer") { + sources = [] + deps = [ + "//third_party/pdfium/testing/libfuzzer:pdf_codec_tiff_fuzzer", + ] + } + + fuzzer_test("pdf_css_fuzzer") { + sources = [] + deps = [ + "//third_party/pdfium/testing/libfuzzer:pdf_css_fuzzer", + ] + dict = "dicts/pdf_css.dict" + } + fuzzer_test("pdf_fm2js_fuzzer") { sources = [] deps = [ @@ -51,4 +94,12 @@ if (pdf_enable_xfa) { ] dict = "dicts/pdf_xml.dict" } + + fuzzer_test("pdf_cfx_saxreader_fuzzer") { + sources = [] + deps = [ + "//third_party/pdfium/testing/libfuzzer:pdf_cfx_saxreader_fuzzer", + ] + dict = "dicts/pdf_xml.dict" + } } diff --git a/chromium/pdf/pdfium/fuzzers/dicts/pdf_css.dict b/chromium/pdf/pdfium/fuzzers/dicts/pdf_css.dict new file mode 100644 index 00000000000..b59b3444748 --- /dev/null +++ b/chromium/pdf/pdfium/fuzzers/dicts/pdf_css.dict @@ -0,0 +1,73 @@ +kw0=";" 
+kw1="{" +kw2="}" +kw3=":" +kw4="/" +kw5="," +kw6="+" +kw7=">" +kw8="-" +kw9="]" +kw10="[" +kw11="." +kw12="=" +kw13="*" +kw14=")" +kw15="<!--" +kw16="-->" +kw17="~=" +kw18="|=" +kw19="#" +kw20="@import" +kw21="@page" +kw22="@media" +kw23="@" +kw24="@charset" +kw25="!important" +kw26="em" +kw27="ex" +kw28="px" +kw29="cm" +kw30="mm" +kw31="in" +kw32="pt" +kw33="pc" +kw34="deg" +kw35="rad" +kw36="grad" +kw37="ms" +kw38="s" +kw39="hz" +kw40="khz" +kw41="ident" +kw42="url(" +kw43="/*" +kw44="*/" +kw45="color" +kw46="font" +kw47="font-family" +kw48="font-size" +kw49="font-stretch" +kw50="font-style" +kw51="font-weight" +kw52="margin" +kw53="margin-bottom" +kw54="margin-top" +kw55="margin-left" +kw56="margin-right" +kw57="letter-spacing" +kw58="line-height" +kw59="orphans" +kw60="page-break-after" +kw61="page-break-before" +kw62="page-break-inside" +kw63="tab-interval" +kw64="tab-stop" +kw65="text-decoration" +kw66="text-indent" +kw67="vertical-align" +kw68="widows" +kw69="kerning-mode" +kw70="xfa-font-horizontal-scale" +kw71="xfa-font-vertical-scale" +kw72="xfa-tab-stops" diff --git a/chromium/pdf/pdfium/pdfium_engine.cc b/chromium/pdf/pdfium/pdfium_engine.cc index eedf9f706b1..05f73098ec7 100644 --- a/chromium/pdf/pdfium/pdfium_engine.cc +++ b/chromium/pdf/pdfium/pdfium_engine.cc @@ -13,7 +13,6 @@ #include "base/i18n/icu_encoding_detection.h" #include "base/i18n/icu_string_conversions.h" -#include "base/json/json_writer.h" #include "base/lazy_instance.h" #include "base/logging.h" #include "base/macros.h" @@ -23,7 +22,6 @@ #include "base/strings/string_piece.h" #include "base/strings/string_util.h" #include "base/strings/utf_string_conversions.h" -#include "base/values.h" #include "gin/array_buffer.h" #include "gin/public/gin_embedders.h" #include "gin/public/isolate_holder.h" @@ -67,46 +65,48 @@ namespace chrome_pdf { namespace { -#define kPageShadowTop 3 -#define kPageShadowBottom 7 -#define kPageShadowLeft 5 -#define kPageShadowRight 5 +const int32_t kPageShadowTop = 
3; +const int32_t kPageShadowBottom = 7; +const int32_t kPageShadowLeft = 5; +const int32_t kPageShadowRight = 5; -#define kPageSeparatorThickness 4 -#define kHighlightColorR 153 -#define kHighlightColorG 193 -#define kHighlightColorB 218 +const int32_t kPageSeparatorThickness = 4; +const int32_t kHighlightColorR = 153; +const int32_t kHighlightColorG = 193; +const int32_t kHighlightColorB = 218; const uint32_t kPendingPageColor = 0xFFEEEEEE; -#define kFormHighlightColor 0xFFE4DD -#define kFormHighlightAlpha 100 +const uint32_t kFormHighlightColor = 0xFFE4DD; +const int32_t kFormHighlightAlpha = 100; -#define kMaxPasswordTries 3 +const int32_t kMaxPasswordTries = 3; // See Table 3.20 in // http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf -#define kPDFPermissionPrintLowQualityMask 1 << 2 -#define kPDFPermissionPrintHighQualityMask 1 << 11 -#define kPDFPermissionCopyMask 1 << 4 -#define kPDFPermissionCopyAccessibleMask 1 << 9 +const uint32_t kPDFPermissionPrintLowQualityMask = 1 << 2; +const uint32_t kPDFPermissionPrintHighQualityMask = 1 << 11; +const uint32_t kPDFPermissionCopyMask = 1 << 4; +const uint32_t kPDFPermissionCopyAccessibleMask = 1 << 9; -#define kLoadingTextVerticalOffset 50 +const int32_t kLoadingTextVerticalOffset = 50; // The maximum amount of time we'll spend doing a paint before we give back // control of the thread. -#define kMaxProgressivePaintTimeMs 50 +const int32_t kMaxProgressivePaintTimeMs = 300; // The maximum amount of time we'll spend doing the first paint. This is less -// than the above to keep things smooth if the user is scrolling quickly. We -// try painting a little because with accelerated compositing, we get flushes -// only every 16 ms. If we were to wait until the next flush to paint the rest -// of the pdf, we would never get to draw the pdf and would only draw the -// scrollbars. This value is picked to give enough time for gpu related code to -// do its thing and still fit within the timelimit for 60Hz. 
For the -// non-composited case, this doesn't make things worse since we're still -// painting the scrollbars > 60 Hz. -#define kMaxInitialProgressivePaintTimeMs 10 +// than the above to keep things smooth if the user is scrolling quickly. This +// is set to 250 ms to give enough time for most PDFs to render, while avoiding +// adding too much latency to the display of the final image when the user +// stops scrolling. +// Setting a higher value has minimal benefit (scrolling at less than 4 fps will +// never be a great experience) and there is some cost, since when the user +// stops scrolling the in-progress painting has to complete or timeout before +// the final painting can start. +// The scrollbar will always be responsive since it is managed by a separate +// process. +const int32_t kMaxInitialProgressivePaintTimeMs = 250; std::vector<uint32_t> GetPageNumbersFromPrintPageNumberRange( const PP_PrintPageNumberRange_Dev* page_ranges, @@ -2271,6 +2271,10 @@ pp::Rect PDFiumEngine::GetPageRect(int index) { return rc; } +pp::Rect PDFiumEngine::GetPageBoundsRect(int index) { + return pages_[index]->rect(); +} + pp::Rect PDFiumEngine::GetPageContentsRect(int index) { return GetScreenRect(pages_[index]->rect()); } @@ -2288,20 +2292,29 @@ void PDFiumEngine::OnCallback(int id) { client_->ScheduleCallback(id, timers_[id].first); } -std::string PDFiumEngine::GetPageAsJSON(int index) { - if (!(HasPermission(PERMISSION_COPY) || - HasPermission(PERMISSION_COPY_ACCESSIBLE))) { - return "{}"; - } +int PDFiumEngine::GetCharCount(int page_index) { + DCHECK(page_index >= 0 && page_index < static_cast<int>(pages_.size())); + return pages_[page_index]->GetCharCount(); +} - if (index < 0 || static_cast<size_t>(index) > pages_.size() - 1) - return "{}"; +pp::FloatRect PDFiumEngine::GetCharBounds(int page_index, int char_index) { + DCHECK(page_index >= 0 && page_index < static_cast<int>(pages_.size())); + return pages_[page_index]->GetCharBounds(char_index); +} + +uint32_t 
PDFiumEngine::GetCharUnicode(int page_index, int char_index) { + DCHECK(page_index >= 0 && page_index < static_cast<int>(pages_.size())); + return pages_[page_index]->GetCharUnicode(char_index); +} - std::unique_ptr<base::Value> node( - pages_[index]->GetAccessibleContentAsValue(current_rotation_)); - std::string page_json; - base::JSONWriter::Write(*node, &page_json); - return page_json; +void PDFiumEngine::GetTextRunInfo(int page_index, + int start_char_index, + uint32_t* out_len, + double* out_font_size, + pp::FloatRect* out_bounds) { + DCHECK(page_index >= 0 && page_index < static_cast<int>(pages_.size())); + return pages_[page_index]->GetTextRunInfo(start_char_index, out_len, + out_font_size, out_bounds); } bool PDFiumEngine::GetPrintScaling() { @@ -2404,8 +2417,8 @@ void PDFiumEngine::LoadDocument() { ScopedUnsupportedFeature scoped_unsupported_feature(this); bool needs_password = false; - if (TryLoadingDoc(false, std::string(), &needs_password)) { - ContinueLoadingDocument(false, std::string()); + if (TryLoadingDoc(std::string(), &needs_password)) { + ContinueLoadingDocument(std::string()); return; } if (needs_password) @@ -2414,8 +2427,7 @@ void PDFiumEngine::LoadDocument() { client_->DocumentLoadFailed(); } -bool PDFiumEngine::TryLoadingDoc(bool with_password, - const std::string& password, +bool PDFiumEngine::TryLoadingDoc(const std::string& password, bool* needs_password) { *needs_password = false; if (doc_) { @@ -2427,7 +2439,7 @@ bool PDFiumEngine::TryLoadingDoc(bool with_password, } const char* password_cstr = nullptr; - if (with_password) { + if (!password.empty()) { password_cstr = password.c_str(); password_tries_remaining_--; } @@ -2460,24 +2472,18 @@ void PDFiumEngine::OnGetPasswordComplete(int32_t result, const pp::Var& password) { getting_password_ = false; - bool password_given = false; std::string password_text; - if (result == PP_OK && password.is_string()) { + if (result == PP_OK && password.is_string()) password_text = password.AsString(); 
- if (!password_text.empty()) - password_given = true; - } - ContinueLoadingDocument(password_given, password_text); + ContinueLoadingDocument(password_text); } -void PDFiumEngine::ContinueLoadingDocument( - bool has_password, - const std::string& password) { +void PDFiumEngine::ContinueLoadingDocument(const std::string& password) { ScopedUnsupportedFeature scoped_unsupported_feature(this); bool needs_password = false; - bool loaded = TryLoadingDoc(has_password, password, &needs_password); - bool password_incorrect = !loaded && has_password && needs_password; + bool loaded = TryLoadingDoc(password, &needs_password); + bool password_incorrect = !loaded && needs_password && !password.empty(); if (password_incorrect && password_tries_remaining_ > 0) { GetPasswordAndLoad(); return; @@ -2660,7 +2666,7 @@ bool PDFiumEngine::CheckPageAvailable(int index, std::vector<int>* pending) { if (index < num_pages) pages_[index]->set_available(true); - if (!default_page_size_.GetArea()) + if (default_page_size_.IsEmpty()) default_page_size_ = GetPageSize(index); return true; } diff --git a/chromium/pdf/pdfium/pdfium_engine.h b/chromium/pdf/pdfium/pdfium_engine.h index dcfbe3d132b..e6392b4a482 100644 --- a/chromium/pdf/pdfium/pdfium_engine.h +++ b/chromium/pdf/pdfium/pdfium_engine.h @@ -84,12 +84,20 @@ class PDFiumEngine : public PDFEngine, int GetNamedDestinationPage(const std::string& destination) override; int GetMostVisiblePage() override; pp::Rect GetPageRect(int index) override; + pp::Rect GetPageBoundsRect(int index) override; pp::Rect GetPageContentsRect(int index) override; pp::Rect GetPageScreenRect(int page_index) const override; int GetVerticalScrollbarYPosition() override { return position_.y(); } void SetGrayscale(bool grayscale) override; void OnCallback(int id) override; - std::string GetPageAsJSON(int index) override; + int GetCharCount(int page_index) override; + pp::FloatRect GetCharBounds(int page_index, int char_index) override; + uint32_t GetCharUnicode(int 
page_index, int char_index) override; + void GetTextRunInfo(int page_index, + int start_char_index, + uint32_t* out_len, + double* out_font_size, + pp::FloatRect* out_bounds) override; bool GetPrintScaling() override; int GetCopiesToPrint() override; int GetDuplexType() override; @@ -204,13 +212,10 @@ class PDFiumEngine : public PDFEngine, void LoadDocument(); // Try loading the document. Returns true if the document is successfully - // loaded or is already loaded otherwise it will return false. If - // |with_password| is set to true, the document will be loaded with - // |password|. If the document could not be loaded and needs a password, - // |needs_password| will be set to true. - bool TryLoadingDoc(bool with_password, - const std::string& password, - bool* needs_password); + // loaded or is already loaded otherwise it will return false. If there is a + // password, then |password| is non-empty. If the document could not be loaded + // and needs a password, |needs_password| will be set to true. + bool TryLoadingDoc(const std::string& password, bool* needs_password); // Asks the user for the document password and then continue loading the // document. @@ -221,9 +226,8 @@ class PDFiumEngine : public PDFEngine, const pp::Var& password); // Continues loading the document when the password has been retrieved, or if - // there is no password. - void ContinueLoadingDocument(bool has_password, - const std::string& password); + // there is no password. If there is no password, then |password| is empty. + void ContinueLoadingDocument(const std::string& password); // Finishes loading the document. Recalculate the document size if there were // pages that were not previously available. 
diff --git a/chromium/pdf/pdfium/pdfium_page.cc b/chromium/pdf/pdfium/pdfium_page.cc index af64660a2a8..e95296986d8 100644 --- a/chromium/pdf/pdfium/pdfium_page.cc +++ b/chromium/pdf/pdfium/pdfium_page.cc @@ -9,35 +9,27 @@ #include <algorithm> #include <memory> +#include <utility> #include "base/logging.h" #include "base/strings/string_number_conversions.h" #include "base/strings/string_util.h" #include "base/strings/utf_string_conversions.h" -#include "base/values.h" #include "pdf/pdfium/pdfium_api_string_buffer_adapter.h" #include "pdf/pdfium/pdfium_engine.h" +#include "printing/units.h" // Used when doing hit detection. #define kTolerance 20.0 +using printing::ConvertUnitDouble; +using printing::kPointsPerInch; +using printing::kPixelsPerInch; + namespace { -// Dictionary Value key names for returning the accessible page content as JSON. -const char kPageWidth[] = "width"; -const char kPageHeight[] = "height"; -const char kPageTextBox[] = "textBox"; -const char kTextBoxLeft[] = "left"; -const char kTextBoxTop[] = "top"; -const char kTextBoxWidth[] = "width"; -const char kTextBoxHeight[] = "height"; -const char kTextBoxFontSize[] = "fontSize"; -const char kTextBoxNodes[] = "textNodes"; -const char kTextNodeType[] = "type"; -const char kTextNodeText[] = "text"; -const char kTextNodeTypeText[] = "text"; - -pp::Rect PageRectToGViewRect(FPDF_PAGE page, const pp::Rect& input) { +pp::FloatRect FloatPageRectToPixelRect(FPDF_PAGE page, + const pp::FloatRect& input) { int output_width = FPDF_GetPageWidth(page); int output_height = FPDF_GetPageHeight(page); @@ -45,65 +37,42 @@ pp::Rect PageRectToGViewRect(FPDF_PAGE page, const pp::Rect& input) { int min_y; int max_x; int max_y; - FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, - input.x(), input.y(), &min_x, &min_y); - FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, - input.right(), input.bottom(), &max_x, &max_y); + FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, input.x(), + 
input.y(), &min_x, &min_y); + FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, input.right(), + input.bottom(), &max_x, &max_y); if (max_x < min_x) std::swap(min_x, max_x); if (max_y < min_y) std::swap(min_y, max_y); - pp::Rect output_rect(min_x, min_y, max_x - min_x, max_y - min_y); - output_rect.Intersect(pp::Rect(0, 0, output_width, output_height)); + pp::FloatRect output_rect( + ConvertUnitDouble(min_x, kPointsPerInch, kPixelsPerInch), + ConvertUnitDouble(min_y, kPointsPerInch, kPixelsPerInch), + ConvertUnitDouble(max_x - min_x, kPointsPerInch, kPixelsPerInch), + ConvertUnitDouble(max_y - min_y, kPointsPerInch, kPixelsPerInch)); return output_rect; } -pp::Rect GetCharRectInGViewCoords(FPDF_PAGE page, FPDF_TEXTPAGE text_page, - int index) { +pp::FloatRect GetFloatCharRectInPixels(FPDF_PAGE page, + FPDF_TEXTPAGE text_page, + int index) { double left, right, bottom, top; FPDFText_GetCharBox(text_page, index, &left, &right, &bottom, &top); if (right < left) std::swap(left, right); if (bottom < top) std::swap(top, bottom); - pp::Rect page_coords(left, top, right - left, bottom - top); - return PageRectToGViewRect(page, page_coords); -} - -// This is the character PDFium inserts where a word is broken across lines. -const unsigned int kSoftHyphen = 0x02; - -// The following characters should all be recognized as Unicode newlines: -// LF: Line Feed, U+000A -// VT: Vertical Tab, U+000B -// FF: Form Feed, U+000C -// CR: Carriage Return, U+000D -// CR+LF: CR (U+000D) followed by LF (U+000A) -// NEL: Next Line, U+0085 -// LS: Line Separator, U+2028 -// PS: Paragraph Separator, U+2029. -// Source: http://en.wikipedia.org/wiki/Newline#Unicode . 
-const unsigned int kUnicodeNewlines[] = { - 0xA, 0xB, 0xC, 0xD, 0X85, 0x2028, 0x2029 -}; - -bool IsSoftHyphen(unsigned int character) { - return kSoftHyphen == character; + pp::FloatRect page_coords(left, top, right - left, bottom - top); + return FloatPageRectToPixelRect(page, page_coords); } -bool OverlapsOnYAxis(const pp::Rect &a, const pp::Rect& b) { +bool OverlapsOnYAxis(const pp::FloatRect &a, const pp::FloatRect& b) { return !(a.IsEmpty() || b.IsEmpty() || a.bottom() < b.y() || b.bottom() < a.y()); } -bool IsEol(unsigned int character) { - const unsigned int* first = kUnicodeNewlines; - const unsigned int* last = kUnicodeNewlines + arraysize(kUnicodeNewlines); - return std::find(first, last, character) != last; -} - } // namespace namespace chrome_pdf { @@ -191,112 +160,73 @@ FPDF_TEXTPAGE PDFiumPage::GetTextPage() { return text_page_; } -base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) { - base::DictionaryValue* node = new base::DictionaryValue(); - - if (!available_) - return node; - +void PDFiumPage::GetTextRunInfo(int start_char_index, + uint32_t* out_len, + double* out_font_size, + pp::FloatRect* out_bounds) { FPDF_PAGE page = GetPage(); FPDF_TEXTPAGE text_page = GetTextPage(); - - double width = FPDF_GetPageWidth(page); - double height = FPDF_GetPageHeight(page); - - node->SetDouble(kPageWidth, width); - node->SetDouble(kPageHeight, height); - std::unique_ptr<base::ListValue> text(new base::ListValue()); - int chars_count = FPDFText_CountChars(text_page); - pp::Rect line_rect; - pp::Rect word_rect; - bool seen_literal_text_in_word = false; - - // Iterate over all of the chars on the page. Explicitly run the loop - // with |i == chars_count|, which is one past the last character, and - // pretend it's a newline character in order to ensure we always flush - // the last line. 
- base::string16 line; - for (int i = 0; i <= chars_count; i++) { - unsigned int character; - pp::Rect char_rect; - - if (i < chars_count) { - character = FPDFText_GetUnicode(text_page, i); - char_rect = GetCharRectInGViewCoords(page, text_page, i); - } else { - // Make the last character a newline so the last line isn't lost. - character = '\n'; - } - - // There are spurious STX chars appearing in place - // of ligatures. Apply a heuristic to check that some vertical displacement - // is involved before assuming they are line-breaks. - bool is_intraword_linebreak = false; - if (i < chars_count - 1 && IsSoftHyphen(character)) { - // check if the next char and this char are in different lines. - pp::Rect next_char_rect = GetCharRectInGViewCoords( - page, text_page, i + 1); - + int char_index = start_char_index; + while ( + char_index < chars_count && + base::IsUnicodeWhitespace(FPDFText_GetUnicode(text_page, char_index))) { + char_index++; + } + int text_run_font_size = FPDFText_GetFontSize(text_page, char_index); + pp::FloatRect text_run_bounds = + GetFloatCharRectInPixels(page, text_page, char_index); + char_index++; + while (char_index < chars_count) { + unsigned int character = FPDFText_GetUnicode(text_page, char_index); + + if (!base::IsUnicodeWhitespace(character)) { // TODO(dmazzoni): this assumes horizontal text. // https://crbug.com/580311 - is_intraword_linebreak = !OverlapsOnYAxis(char_rect, next_char_rect); - } - if (is_intraword_linebreak || - base::IsUnicodeWhitespace(character) || - IsEol(character)) { - if (!word_rect.IsEmpty() && seen_literal_text_in_word) { - word_rect = pp::Rect(); - seen_literal_text_in_word = false; - } - } + pp::FloatRect char_rect = GetFloatCharRectInPixels( + page, text_page, char_index); + if (!char_rect.IsEmpty() && !OverlapsOnYAxis(text_run_bounds, char_rect)) + break; - if (is_intraword_linebreak || IsEol(character)) { - if (!line_rect.IsEmpty()) { - if (is_intraword_linebreak) { - // Add a 0-width hyphen. 
- line.push_back('-'); - } + int font_size = FPDFText_GetFontSize(text_page, char_index); + if (font_size != text_run_font_size) + break; - base::DictionaryValue* text_node = new base::DictionaryValue(); - text_node->SetString(kTextNodeType, kTextNodeTypeText); - text_node->SetString(kTextNodeText, line); - - base::ListValue* text_nodes = new base::ListValue(); - text_nodes->Append(text_node); - - base::DictionaryValue* line_node = new base::DictionaryValue(); - line_node->SetDouble(kTextBoxLeft, line_rect.x()); - line_node->SetDouble(kTextBoxTop, line_rect.y()); - line_node->SetDouble(kTextBoxWidth, line_rect.width()); - line_node->SetDouble(kTextBoxHeight, line_rect.height()); - line_node->SetDouble(kTextBoxFontSize, - FPDFText_GetFontSize(text_page, i)); - line_node->Set(kTextBoxNodes, text_nodes); - text->Append(line_node); - - line.clear(); - line_rect = pp::Rect(); - word_rect = pp::Rect(); - seen_literal_text_in_word = false; - } - continue; + // Heuristic: split a text run after a space longer than 3 average + // characters. + double avg_char_width = + text_run_bounds.width() / (char_index - start_char_index); + if (char_rect.x() - text_run_bounds.right() > avg_char_width * 3) + break; + + text_run_bounds = text_run_bounds.Union(char_rect); } - seen_literal_text_in_word = seen_literal_text_in_word || - !base::IsUnicodeWhitespace(character); - line.push_back(character); - if (!char_rect.IsEmpty()) { - line_rect = line_rect.Union(char_rect); + char_index++; + } - if (!base::IsUnicodeWhitespace(character)) - word_rect = word_rect.Union(char_rect); - } + // Some PDFs have missing or obviously bogus font sizes; substitute the + // height of the bounding box in those cases. 
+ if (text_run_font_size <= 1 || + text_run_font_size < text_run_bounds.height() / 2 || + text_run_font_size > text_run_bounds.height() * 2) { + text_run_font_size = text_run_bounds.height(); } - node->Set(kPageTextBox, text.release()); // Takes ownership of |text| + *out_len = char_index - start_char_index; + *out_font_size = text_run_font_size; + *out_bounds = text_run_bounds; +} + +uint32_t PDFiumPage::GetCharUnicode(int char_index) { + FPDF_TEXTPAGE text_page = GetTextPage(); + return FPDFText_GetUnicode(text_page, char_index); +} - return node; +pp::FloatRect PDFiumPage::GetCharBounds(int char_index) { + FPDF_PAGE page = GetPage(); + FPDF_TEXTPAGE text_page = GetTextPage(); + return GetFloatCharRectInPixels(page, text_page, char_index); } PDFiumPage::Area PDFiumPage::GetCharIndex(const pp::Point& point, diff --git a/chromium/pdf/pdfium/pdfium_page.h b/chromium/pdf/pdfium/pdfium_page.h index 802ecb64e55..fa94ed96a08 100644 --- a/chromium/pdf/pdfium/pdfium_page.h +++ b/chromium/pdf/pdfium/pdfium_page.h @@ -14,10 +14,6 @@ #include "third_party/pdfium/public/fpdf_formfill.h" #include "third_party/pdfium/public/fpdf_text.h" -namespace base { -class Value; -} - namespace chrome_pdf { class PDFiumEngine; @@ -43,8 +39,17 @@ class PDFiumPage { // Returns FPDF_TEXTPAGE for the page, loading and parsing it if necessary. FPDF_TEXTPAGE GetTextPage(); - // Returns a DictionaryValue version of the page. - base::Value* GetAccessibleContentAsValue(int rotation); + // Given a start char index, find the longest continuous run of text that's + // in a single direction and with the same style and font size. Return the + // length of that sequence and its font size and bounding box. + void GetTextRunInfo(int start_char_index, + uint32_t* out_len, + double* out_font_size, + pp::FloatRect* out_bounds); + // Get a unicode character from the page. + uint32_t GetCharUnicode(int char_index); + // Get the bounds of a character in page pixels. 
+ pp::FloatRect GetCharBounds(int char_index); enum Area { NONSELECTABLE_AREA, |