diff options
author | dmazzoni <dmazzoni@chromium.org> | 2016-02-04 15:53:06 -0800 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2016-02-04 23:54:24 +0000 |
commit | ee8c002360097305a3b058c0bcb5befdd843ab16 (patch) | |
tree | 178d2644b655b2bde56c30c9f067cae0bb7dfcbd /pdf | |
parent | e9ce0c5eedfc73b15404b5c0a5e737cce94670bf (diff) | |
download | chromium_src-ee8c002360097305a3b058c0bcb5befdd843ab16.zip chromium_src-ee8c002360097305a3b058c0bcb5befdd843ab16.tar.gz chromium_src-ee8c002360097305a3b058c0bcb5befdd843ab16.tar.bz2 |
Improve extraction of accessible text from PDF.
This improves the data that ChromeVox accesses when making PDF files
accessible.
BUG=434175
Review URL: https://codereview.chromium.org/1568723002
Cr-Commit-Position: refs/heads/master@{#373667}
Diffstat (limited to 'pdf')
-rw-r--r-- | pdf/pdfium/pdfium_page.cc | 265 | ||||
-rw-r--r-- | pdf/pdfium/pdfium_page.h | 6 |
2 files changed, 153 insertions, 118 deletions
diff --git a/pdf/pdfium/pdfium_page.cc b/pdf/pdfium/pdfium_page.cc index 95a8fba..deccb5b 100644 --- a/pdf/pdfium/pdfium_page.cc +++ b/pdf/pdfium/pdfium_page.cc @@ -7,6 +7,8 @@ #include <math.h> #include <stddef.h> +#include <algorithm> + #include "base/logging.h" #include "base/strings/string_number_conversions.h" #include "base/strings/string_util.h" @@ -32,10 +34,74 @@ const char kTextBoxFontSize[] = "fontSize"; const char kTextBoxNodes[] = "textNodes"; const char kTextNodeType[] = "type"; const char kTextNodeText[] = "text"; -const char kTextNodeURL[] = "url"; const char kTextNodeTypeText[] = "text"; -const char kTextNodeTypeURL[] = "url"; -const char kDocLinkURLPrefix[] = "#page"; + +pp::Rect PageRectToGViewRect(FPDF_PAGE page, const pp::Rect& input) { + int output_width = FPDF_GetPageWidth(page); + int output_height = FPDF_GetPageHeight(page); + + int min_x; + int min_y; + int max_x; + int max_y; + FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, + input.x(), input.y(), &min_x, &min_y); + FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, + input.right(), input.bottom(), &max_x, &max_y); + + if (max_x < min_x) + std::swap(min_x, max_x); + if (max_y < min_y) + std::swap(min_y, max_y); + + pp::Rect output_rect(min_x, min_y, max_x - min_x, max_y - min_y); + output_rect.Intersect(pp::Rect(0, 0, output_width, output_height)); + return output_rect; +} + +pp::Rect GetCharRectInGViewCoords(FPDF_PAGE page, FPDF_TEXTPAGE text_page, + int index) { + double left, right, bottom, top; + FPDFText_GetCharBox(text_page, index, &left, &right, &bottom, &top); + if (right < left) + std::swap(left, right); + if (bottom < top) + std::swap(top, bottom); + pp::Rect page_coords(left, top, right - left, bottom - top); + return PageRectToGViewRect(page, page_coords); +} + +// This is the character PDFium inserts where a word is broken across lines. +const unsigned int kSoftHyphen = 0x02; + +// The following characters should all be recognized as Unicode newlines: +// LF: Line Feed, U+000A +// VT: Vertical Tab, U+000B +// FF: Form Feed, U+000C +// CR: Carriage Return, U+000D +// CR+LF: CR (U+000D) followed by LF (U+000A) +// NEL: Next Line, U+0085 +// LS: Line Separator, U+2028 +// PS: Paragraph Separator, U+2029. +// Source: http://en.wikipedia.org/wiki/Newline#Unicode . +const unsigned int kUnicodeNewlines[] = { + 0xA, 0xB, 0xC, 0xD, 0X85, 0x2028, 0x2029 +}; + +bool IsSoftHyphen(unsigned int character) { + return kSoftHyphen == character; +} + +bool OverlapsOnYAxis(const pp::Rect &a, const pp::Rect& b) { + return !(a.IsEmpty() || b.IsEmpty() || + a.bottom() < b.y() || b.bottom() < a.y()); +} + +bool IsEol(unsigned int character) { + const unsigned int* first = kUnicodeNewlines; + const unsigned int* last = kUnicodeNewlines + arraysize(kUnicodeNewlines); + return std::find(first, last, character) != last; +} } // namespace @@ -130,130 +196,105 @@ base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) { if (!available_) return node; - double width = FPDF_GetPageWidth(GetPage()); - double height = FPDF_GetPageHeight(GetPage()); + FPDF_PAGE page = GetPage(); + FPDF_TEXTPAGE text_page = GetTextPage(); - base::ListValue* text = new base::ListValue(); - int box_count = FPDFText_CountRects(GetTextPage(), 0, GetCharCount()); - for (int i = 0; i < box_count; i++) { - double left, top, right, bottom; - FPDFText_GetRect(GetTextPage(), i, &left, &top, &right, &bottom); - text->Append( - GetTextBoxAsValue(height, left, top, right, bottom, rotation)); - } + double width = FPDF_GetPageWidth(page); + double height = FPDF_GetPageHeight(page); node->SetDouble(kPageWidth, width); node->SetDouble(kPageHeight, height); - node->Set(kPageTextBox, text); // Takes ownership of |text| - - return node; -} - -base::Value* PDFiumPage::GetTextBoxAsValue(double page_height, - double left, double top, - double right, double bottom, - int rotation) { - base::string16 text_utf16; - int char_count = - FPDFText_GetBoundedText(GetTextPage(), left, top, right, bottom, NULL, 0); - if (char_count > 0) { - unsigned short* data = reinterpret_cast<unsigned short*>( - base::WriteInto(&text_utf16, char_count + 1)); - FPDFText_GetBoundedText(GetTextPage(), - left, top, right, bottom, - data, char_count); - } - std::string text_utf8 = base::UTF16ToUTF8(text_utf16); - - FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), left, top); - Area area; - std::vector<LinkTarget> targets; - if (link) { - targets.push_back(LinkTarget()); - area = GetLinkTarget(link, &targets[0]); - } else { - pp::Rect rect( - PageToScreen(pp::Point(), 1.0, left, top, right, bottom, rotation)); - GetLinks(rect, &targets); - area = targets.empty() ? TEXT_AREA : WEBLINK_AREA; - } + scoped_ptr<base::ListValue> text(new base::ListValue()); + + int chars_count = FPDFText_CountChars(text_page); + pp::Rect line_rect; + pp::Rect word_rect; + bool seen_literal_text_in_word = false; + + // Iterate over all of the chars on the page. Explicitly run the loop + // with |i == chars_count|, which is one past the last character, and + // pretend it's a newline character in order to ensure we always flush + // the last line. + base::string16 line; + for (int i = 0; i <= chars_count; i++) { + unsigned int character; + pp::Rect char_rect; + + if (i < chars_count) { + character = FPDFText_GetUnicode(text_page, i); + char_rect = GetCharRectInGViewCoords(page, text_page, i); + } else { + // Make the last character a newline so the last line isn't lost. + character = '\n'; + } - int char_index = FPDFText_GetCharIndexAtPos(GetTextPage(), left, top, - kTolerance, kTolerance); - double font_size = FPDFText_GetFontSize(GetTextPage(), char_index); + // There are spurious STX chars appearing in place + // of ligatures. Apply a heuristic to check that some vertical displacement + // is involved before assuming they are line-breaks. + bool is_intraword_linebreak = false; + if (i < chars_count - 1 && IsSoftHyphen(character)) { + // check if the next char and this char are in different lines. + pp::Rect next_char_rect = GetCharRectInGViewCoords( + page, text_page, i + 1); + + // TODO(dmazzoni): this assumes horizontal text. + // https://crbug.com/580311 + is_intraword_linebreak = !OverlapsOnYAxis(char_rect, next_char_rect); + } + if (is_intraword_linebreak || + base::IsUnicodeWhitespace(character) || + IsEol(character)) { + if (!word_rect.IsEmpty() && seen_literal_text_in_word) { + word_rect = pp::Rect(); + seen_literal_text_in_word = false; + } + } - base::DictionaryValue* node = new base::DictionaryValue(); - node->SetDouble(kTextBoxLeft, left); - node->SetDouble(kTextBoxTop, page_height - top); - node->SetDouble(kTextBoxWidth, right - left); - node->SetDouble(kTextBoxHeight, top - bottom); - node->SetDouble(kTextBoxFontSize, font_size); - - base::ListValue* text_nodes = new base::ListValue(); - - if (area == DOCLINK_AREA) { - std::string url = kDocLinkURLPrefix + base::IntToString(targets[0].page); - text_nodes->Append(CreateURLNode(text_utf8, url)); - } else if (area == WEBLINK_AREA && link) { - text_nodes->Append(CreateURLNode(text_utf8, targets[0].url)); - } else if (area == WEBLINK_AREA && !link) { - size_t start = 0; - for (const auto& target : targets) { - // If there is an extra NULL character at end, find() will not return any - // matches. There should not be any though. - if (!target.url.empty()) - DCHECK_NE(target.url.back(), '\0'); - - // PDFium may change the case of generated links. - std::string lowerCaseURL = base::ToLowerASCII(target.url); - std::string lowerCaseText = base::ToLowerASCII(text_utf8); - size_t pos = lowerCaseText.find(lowerCaseURL, start); - size_t length = target.url.size(); - if (pos == std::string::npos) { - // Check if the link is a "mailto:" URL - if (lowerCaseURL.compare(0, 7, "mailto:") == 0) { - pos = lowerCaseText.find(lowerCaseURL.substr(7), start); - length -= 7; + if (is_intraword_linebreak || IsEol(character)) { + if (!line_rect.IsEmpty()) { + if (is_intraword_linebreak) { + // Add a 0-width hyphen. + line.push_back('-'); } - if (pos == std::string::npos) { - // No match has been found. This should never happen. - continue; - } + base::DictionaryValue* text_node = new base::DictionaryValue(); + text_node->SetString(kTextNodeType, kTextNodeTypeText); + text_node->SetString(kTextNodeText, line); + + base::ListValue* text_nodes = new base::ListValue(); + text_nodes->Append(text_node); + + base::DictionaryValue* line_node = new base::DictionaryValue(); + line_node->SetDouble(kTextBoxLeft, line_rect.x()); + line_node->SetDouble(kTextBoxTop, line_rect.y()); + line_node->SetDouble(kTextBoxWidth, line_rect.width()); + line_node->SetDouble(kTextBoxHeight, line_rect.height()); + line_node->SetDouble(kTextBoxFontSize, + FPDFText_GetFontSize(text_page, i)); + line_node->Set(kTextBoxNodes, text_nodes); + text->Append(line_node); + + line.clear(); + line_rect = pp::Rect(); + word_rect = pp::Rect(); + seen_literal_text_in_word = false; } + continue; + } + seen_literal_text_in_word = seen_literal_text_in_word || + !base::IsUnicodeWhitespace(character); + line.push_back(character); - std::string before_text = text_utf8.substr(start, pos - start); - if (!before_text.empty()) - text_nodes->Append(CreateTextNode(before_text)); - std::string link_text = text_utf8.substr(pos, length); - text_nodes->Append(CreateURLNode(link_text, target.url)); + if (!char_rect.IsEmpty()) { + line_rect = line_rect.Union(char_rect); - start = pos + length; + if (!base::IsUnicodeWhitespace(character)) + word_rect = word_rect.Union(char_rect); } - std::string before_text = text_utf8.substr(start); - if (!before_text.empty()) - text_nodes->Append(CreateTextNode(before_text)); - } else { - text_nodes->Append(CreateTextNode(text_utf8)); } - node->Set(kTextBoxNodes, text_nodes); // Takes ownership of |text_nodes|. - return node; -} - -base::Value* PDFiumPage::CreateTextNode(const std::string& text) { - base::DictionaryValue* node = new base::DictionaryValue(); - node->SetString(kTextNodeType, kTextNodeTypeText); - node->SetString(kTextNodeText, text); - return node; -} + node->Set(kPageTextBox, text.release()); // Takes ownership of |text| -base::Value* PDFiumPage::CreateURLNode(const std::string& text, - const std::string& url) { - base::DictionaryValue* node = new base::DictionaryValue(); - node->SetString(kTextNodeType, kTextNodeTypeURL); - node->SetString(kTextNodeText, text); - node->SetString(kTextNodeURL, url); return node; } diff --git a/pdf/pdfium/pdfium_page.h b/pdf/pdfium/pdfium_page.h index da30504..802ecb6 100644 --- a/pdf/pdfium/pdfium_page.h +++ b/pdf/pdfium/pdfium_page.h @@ -106,12 +106,6 @@ class PDFiumPage { Area GetLinkTarget(FPDF_LINK link, LinkTarget* target) const; // Returns target associated with a destination. Area GetDestinationTarget(FPDF_DEST destination, LinkTarget* target) const; - // Returns the text in the supplied box as a Value Node - base::Value* GetTextBoxAsValue(double page_height, double left, double top, - double right, double bottom, int rotation); - // Helper functions for JSON generation - base::Value* CreateTextNode(const std::string& text); - base::Value* CreateURLNode(const std::string& text, const std::string& url); class ScopedLoadCounter { public: |