From 974086e8a7ca04b9bb947380bb2ac684c028edc1 Mon Sep 17 00:00:00 2001 From: thestig Date: Fri, 12 Sep 2014 12:44:36 -0700 Subject: PDF: Manipulate the selected text buffer to deal with special 0xfffe hyphens. Also format text to match Adobe Reader. BUG=307523 TEST=Manual, see bug (and duplicates for more examples) Review URL: https://codereview.chromium.org/566773005 Cr-Commit-Position: refs/heads/master@{#294638} --- pdf/pdfium/pdfium_engine.cc | 105 +++++++++++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 36 deletions(-) diff --git a/pdf/pdfium/pdfium_engine.cc b/pdf/pdfium/pdfium_engine.cc index 28fca44..6457478 100644 --- a/pdf/pdfium/pdfium_engine.cc +++ b/pdf/pdfium/pdfium_engine.cc @@ -45,6 +45,8 @@ namespace chrome_pdf { +namespace { + #define kPageShadowTop 3 #define kPageShadowBottom 7 #define kPageShadowLeft 5 @@ -458,46 +460,75 @@ void CalculateNonScaledClipBoxOffset(const pp::Rect& content_rect, int rotation, } } -// Do an in-place transformation of objects on |page|. Translate all objects on -// |page| in |source_clip_box| by (|offset_x|, |offset_y|) and scale them by -// |scale_factor|. -// -// |page| Handle to the page. Returned by FPDF_LoadPage function. -// |source_clip_box| specifies the source clip box positions, relative to -// origin at left-bottom. -// |scale_factor| specifies the scale factor that should be applied to page -// objects. -// |offset_x| and |offset_y| specifies the translation offsets for the page -// objects, relative to origin at left-bottom. - -void TransformPageObjects(FPDF_PAGE page, const ClipBox& source_clip_box, - const double scale_factor, double offset_x, - double offset_y) { - const int obj_count = FPDFPage_CountObject(page); - - // Create a new clip path. - FPDF_CLIPPATH clip_path = FPDF_CreateClipPath( - source_clip_box.left + offset_x, source_clip_box.bottom + offset_y, - source_clip_box.right + offset_x, source_clip_box.top + offset_y); - - for (int obj_idx = 0; obj_idx < obj_count; ++obj_idx) { - FPDF_PAGEOBJECT page_obj = FPDFPage_GetObject(page, obj_idx); - FPDFPageObj_Transform(page_obj, scale_factor, 0, 0, scale_factor, - offset_x, offset_y); - FPDFPageObj_TransformClipPath(page_obj, scale_factor, 0, 0, scale_factor, - offset_x, offset_y); +// This formats a string with special 0xfffe end-of-line hyphens the same way +// as Adobe Reader. When a hyphen is encountered, the next non-CR/LF whitespace +// becomes CR+LF and the hyphen is erased. If there is no whitespace between +// two hyphens, the latter hyphen is erased and ignored. +void FormatStringWithHyphens(base::string16* text) { + // First pass marks all the hyphen positions. + struct HyphenPosition { + HyphenPosition() : position(0), next_whitespace_position(0) {} + size_t position; + size_t next_whitespace_position; // 0 for none + }; + std::vector hyphen_positions; + HyphenPosition current_hyphen_position; + bool current_hyphen_position_is_valid = false; + const base::char16 kPdfiumHyphenEOL = 0xfffe; + + for (size_t i = 0; i < text->size(); ++i) { + const base::char16& current_char = (*text)[i]; + if (current_char == kPdfiumHyphenEOL) { + if (current_hyphen_position_is_valid) + hyphen_positions.push_back(current_hyphen_position); + current_hyphen_position = HyphenPosition(); + current_hyphen_position.position = i; + current_hyphen_position_is_valid = true; + } else if (IsWhitespace(current_char)) { + if (current_hyphen_position_is_valid) { + if (current_char != L'\r' && current_char != L'\n') + current_hyphen_position.next_whitespace_position = i; + hyphen_positions.push_back(current_hyphen_position); + current_hyphen_position_is_valid = false; + } + } } - FPDFPage_TransformAnnots(page, scale_factor, 0, 0, scale_factor, - offset_x, offset_y); - FPDFPage_GenerateContent(page); + if (current_hyphen_position_is_valid) + hyphen_positions.push_back(current_hyphen_position); - // Add a extra clip path to the new pdf page here. - FPDFPage_InsertClipPath(page, clip_path); - - // Destroy the clip path. - FPDF_DestroyClipPath(clip_path); + // With all the hyphen positions, do the search and replace. + while (!hyphen_positions.empty()) { + static const base::char16 kCr[] = {L'\r', L'\0'}; + const HyphenPosition& position = hyphen_positions.back(); + if (position.next_whitespace_position != 0) { + (*text)[position.next_whitespace_position] = L'\n'; + text->insert(position.next_whitespace_position, kCr); + } + text->erase(position.position, 1); + hyphen_positions.pop_back(); + } + + // Adobe Reader also get rid of trailing spaces right before a CRLF. + static const base::char16 kSpaceCrCn[] = {L' ', L'\r', L'\n', L'\0'}; + static const base::char16 kCrCn[] = {L'\r', L'\n', L'\0'}; + ReplaceSubstringsAfterOffset(text, 0, kSpaceCrCn, kCrCn); +} + +// Replace CR/LF with just LF on POSIX. +void FormatStringForOS(base::string16* text) { +#if defined(OS_POSIX) + static const base::char16 kCr[] = {L'\r', L'\0'}; + static const base::char16 kBlank[] = {L'\0'}; + base::ReplaceChars(*text, kCr, kBlank, text); +#elif defined(OS_WIN) + // Do nothing +#else + NOTIMPLEMENTED(); +#endif } +} // namespace + bool InitializeSDK(void* data) { FPDF_InitLibrary(data); @@ -1882,6 +1913,8 @@ std::string PDFiumEngine::GetSelectedText() { } } + FormatStringWithHyphens(&result); + FormatStringForOS(&result); return base::UTF16ToUTF8(result); } -- cgit v1.1