summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorhbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2012-07-17 08:56:59 +0000
committerhbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2012-07-17 08:56:59 +0000
commit48cf2472324304e12d7138f14841f4b2d566e39b (patch)
treea58e97dfbe2a70bafd3b27b7346ca7cee8780315
parent99c4c707b2cf4e0096991c47dcaf8ac57bc52eaa (diff)
downloadchromium_src-48cf2472324304e12d7138f14841f4b2d566e39b.zip
chromium_src-48cf2472324304e12d7138f14841f4b2d566e39b.tar.gz
chromium_src-48cf2472324304e12d7138f14841f4b2d566e39b.tar.bz2
Adds a hy-phen-ator.
This change adds a project file for the hyphen library and a Hyphenator class, which encapsulates the library. (This class is not integrated into Chrome, though.) BUG=47083 TEST=HyphenatorTest.HyphenateWords Review URL: https://chromiumcodereview.appspot.com/9545017 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@146964 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--content/content_renderer.gypi3
-rw-r--r--content/content_tests.gypi1
-rw-r--r--content/renderer/hyphenator/hyphenator.cc231
-rw-r--r--content/renderer/hyphenator/hyphenator.h66
-rw-r--r--content/renderer/hyphenator/hyphenator_unittest.cc90
-rw-r--r--third_party/hyphen/README.chromium9
-rw-r--r--third_party/hyphen/google.patch148
-rw-r--r--third_party/hyphen/hyph_en_US.dic2
-rw-r--r--third_party/hyphen/hyphen.c75
-rw-r--r--third_party/hyphen/hyphen.gyp32
-rw-r--r--third_party/hyphen/hyphen.h4
11 files changed, 653 insertions, 8 deletions
diff --git a/content/content_renderer.gypi b/content/content_renderer.gypi
index 2d4a777..2e5ad41 100644
--- a/content/content_renderer.gypi
+++ b/content/content_renderer.gypi
@@ -10,6 +10,7 @@
'../ppapi/ppapi_internal.gyp:ppapi_proxy',
'../ppapi/ppapi_internal.gyp:ppapi_shared',
'../skia/skia.gyp:skia',
+ '../third_party/hyphen/hyphen.gyp:hyphen',
'../third_party/icu/icu.gyp:icuuc',
'../third_party/icu/icu.gyp:icui18n',
'../third_party/libjingle/libjingle.gyp:libjingle',
@@ -84,6 +85,8 @@
'renderer/gpu/gpu_benchmarking_extension.h',
'renderer/gpu/stream_texture_host_android.cc',
'renderer/gpu/stream_texture_host_android.h',
+ 'renderer/hyphenator/hyphenator.cc',
+ 'renderer/hyphenator/hyphenator.h',
'renderer/idle_user_detector.cc',
'renderer/idle_user_detector.h',
'renderer/input_tag_speech_dispatcher.cc',
diff --git a/content/content_tests.gypi b/content/content_tests.gypi
index 77b8f47..30f8bb5 100644
--- a/content/content_tests.gypi
+++ b/content/content_tests.gypi
@@ -323,6 +323,7 @@
'renderer/android/email_detector_unittest.cc',
'renderer/android/phone_number_detector_unittest.cc',
'renderer/gpu/input_event_filter_unittest.cc',
+ 'renderer/hyphenator/hyphenator_unittest.cc',
'renderer/media/audio_device_unittest.cc',
'renderer/media/audio_message_filter_unittest.cc',
'renderer/media/audio_renderer_mixer_manager_unittest.cc',
diff --git a/content/renderer/hyphenator/hyphenator.cc b/content/renderer/hyphenator/hyphenator.cc
new file mode 100644
index 0000000..da92f9e
--- /dev/null
+++ b/content/renderer/hyphenator/hyphenator.cc
@@ -0,0 +1,231 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "content/renderer/hyphenator/hyphenator.h"
+
+#include "base/file_util.h"
+#include "base/logging.h"
+#include "base/memory/scoped_ptr.h"
+#include "base/string_util.h"
+#include "base/utf_string_conversions.h"
+#include "third_party/hyphen/hyphen.h"
+#include "unicode/uscript.h"
+
+namespace {
+
+// A class that converts a sequence of UTF-8 characters to UTF-16 ones and holds
+// only the length of converted UTF-16 characters. This class is used for
+// creating a mapping from the position of a UTF-8 string to a position of a
+// UTF-16 string without unnecessary conversions. Even though the following
+// snippet produces the same mapping, it needs to convert the same characters
+// many times. This class incrementally counts the number of converted UTF-16
+// characters to avoid this problem.
+//
+// scoped_array<size_t> position(new size_t[text.length()]);
+// for (size_t i = 0; i < text.length(); ++i)
+// position[i] = UTF8ToUTF16(text.substr(0, i)).length();
+//
+class UTF16TextLength {
+ public:
+ UTF16TextLength();
+ ~UTF16TextLength();
+
+ // Returns the current position.
+ int utf16_length() const { return utf16_length_; }
+
+ // Appends one UTF-8 character to this converter and advances the converted
+ // position. This converter increases the position by one when it finishes
+ // reading a BMP character and increases by two when it finishes reading a
+ // non-BMP character.
+ void Append(char c);
+
+ private:
+ // The length of the converted UTF-16 text.
+ int utf16_length_;
+
+ // The buffer that stores UTF-8 characters being converted.
+ std::string utf8_text_;
+
+ DISALLOW_COPY_AND_ASSIGN(UTF16TextLength);
+};
+
+UTF16TextLength::UTF16TextLength()
+ : utf16_length_(0) {
+}
+
+UTF16TextLength::~UTF16TextLength() {
+}
+
+void UTF16TextLength::Append(char c) {
+ // Append the given character and try converting the UTF-8 characters in this
+ // buffer to Unicode codepoints. If this buffer includes a Unicode codepoint,
+ // get the number of UTF-16 characters representing this codepoint and advance
+ // the position.
+ int code = 0;
+ int index = 0;
+ utf8_text_.push_back(c);
+ U8_NEXT(utf8_text_.data(), index, static_cast<int>(utf8_text_.length()),
+ code);
+ if (code != U_SENTINEL) {
+ utf8_text_.clear();
+ utf16_length_ += U16_LENGTH(code);
+ }
+}
+
+// A class that encapsulates a hyphenation query. This class owns resources
+// temporarily needed for hyphenating one word, and deletes them when it is
+// deleted as listed in the following snippet.
+//
+// std::vector<int> hyphens;
+// Query query(UTF8ToUTF16("hyphenate"));
+// query.Hyphenate(dict, &hyphens);
+//
+class Query {
+ public:
+ explicit Query(const string16& word);
+ ~Query();
+
+ // Hyphenates a word with the specified dictionary. This function hyphenates
+ // the word provided to its constructor and returns a list of hyphenation
+ // points, positions where we can insert hyphens.
+ bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphen_offsets);
+
+ private:
+ // A word to be hyphenated.
+ std::string word_utf8_;
+
+ // Return variables from the hyphen library.
+ scoped_array<char> hyphen_vector_;
+ char** rep_;
+ int* pos_;
+ int* cut_;
+
+ DISALLOW_COPY_AND_ASSIGN(Query);
+};
+
+Query::Query(const string16& word)
+ : rep_(NULL),
+ pos_(NULL),
+ cut_(NULL) {
+ // Remove trailing punctuation characters. WebKit does not remove these
+ // characters when it hyphenates a word. These characters prevent the hyphen
+ // library from applying some rules, i.e. they prevent the library from adding
+ // hyphens.
+ DCHECK(!word.empty());
+ const char16* data = word.data();
+ int length = static_cast<int>(word.length());
+ while (length > 0) {
+ int previous = length;
+ int code = 0;
+ U16_PREV(data, 0, previous, code);
+ UErrorCode error = U_ZERO_ERROR;
+ if (uscript_getScript(code, &error) != USCRIPT_COMMON)
+ break;
+ length = previous;
+ }
+ UTF16ToUTF8(word.c_str(), length, &word_utf8_);
+ // Create a hyphen vector used by hnj_hyphen_hyphenate2(). We allocate a
+ // buffer of |word_utf8_.length()| + 5 as written in Line 112 of
+ // <http://cs.chromium.org/src/third_party/hyphen/hyphen.h>.
+ hyphen_vector_.reset(new char[word_utf8_.length() + 5]);
+}
+
+Query::~Query() {
+ if (rep_) {
+ for (size_t i = 0; i < word_utf8_.length(); ++i) {
+ if (rep_[i])
+ free(rep_[i]);
+ }
+ free(rep_);
+ }
+ if (pos_)
+ free(pos_);
+ if (cut_)
+ free(cut_);
+}
+
+bool Query::Hyphenate(HyphenDict* dictionary,
+ std::vector<int>* hyphen_offsets) {
+ DCHECK(dictionary);
+ DCHECK(hyphen_offsets);
+
+ int error_code = hnj_hyphen_hyphenate2(dictionary,
+ word_utf8_.data(),
+ static_cast<int>(word_utf8_.length()),
+ hyphen_vector_.get(),
+ NULL,
+ &rep_,
+ &pos_,
+ &cut_);
+ if (error_code)
+ return false;
+
+ // WebKit needs hyphenation points counted in UTF-16 characters. On the other
+ // hand, the hyphen library returns hyphenation points counted in UTF-8
+ // characters. We incrementally convert hyphenation points in UTF-8
+ // characters to hyphenation points in UTF-16 characters and write the
+ // converted hyphenation points to the output vector.
+ UTF16TextLength text_length;
+ hyphen_offsets->clear();
+ for (size_t i = 0; i < word_utf8_.length(); ++i) {
+ text_length.Append(word_utf8_[i]);
+ if (hyphen_vector_[i] & 1)
+ hyphen_offsets->push_back(text_length.utf16_length());
+ }
+ return !hyphen_offsets->empty();
+}
+
+} // namespace
+
+namespace content {
+
+Hyphenator::Hyphenator(base::PlatformFile file)
+ : dictionary_(NULL),
+ rule_file_(file),
+ result_(0) {
+}
+
+Hyphenator::~Hyphenator() {
+ if (dictionary_)
+ hnj_hyphen_free(dictionary_);
+}
+
+bool Hyphenator::Initialize() {
+ if (dictionary_)
+ return true;
+
+ rule_map_.reset(new file_util::MemoryMappedFile);
+ if (!rule_map_->Initialize(rule_file_))
+ return false;
+
+ dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length());
+ return !!dictionary_;
+}
+
+size_t Hyphenator::ComputeLastHyphenLocation(const string16& word,
+ size_t before_index) {
+ if (!dictionary_ || word.empty())
+ return 0;
+
+ // Call the hyphen library to get all hyphenation points, i.e. positions where
+ // we can insert hyphens. When WebKit finds a line-break, it calls this
+ // function twice or more with the same word to find the best hyphenation
+ // point. To avoid calling the hyphen library twice or more with the same
+ // word, we cache the last query.
+ if (word_ != word) {
+ word_ = word;
+ Query query(word);
+ result_ = query.Hyphenate(dictionary_, &hyphen_offsets_);
+ }
+ if (!result_)
+ return 0;
+ for (std::vector<int>::reverse_iterator it = hyphen_offsets_.rbegin();
+ it != hyphen_offsets_.rend(); ++it) {
+ if (static_cast<size_t>(*it) < before_index)
+ return *it;
+ }
+ return 0;
+}
+
+} // namespace content
diff --git a/content/renderer/hyphenator/hyphenator.h b/content/renderer/hyphenator/hyphenator.h
new file mode 100644
index 0000000..561af80
--- /dev/null
+++ b/content/renderer/hyphenator/hyphenator.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CONTENT_RENDERER_HYPHENATOR_HYPHENATOR_H_
+#define CONTENT_RENDERER_HYPHENATOR_HYPHENATOR_H_
+
+#include <vector>
+
+#include "base/memory/scoped_ptr.h"
+#include "base/platform_file.h"
+#include "base/string16.h"
+#include "content/common/content_export.h"
+
+namespace file_util {
+class MemoryMappedFile;
+}
+
+typedef struct _HyphenDict HyphenDict;
+
+namespace content {
+
+// A class that hyphenates a word. This class encapsulates the hyphen library
+// and manages resources used by the library. When this class uses a huge
+// dictionary, it takes lots of memory (~1.3MB for English). A renderer should
+// create this object only when it renders a page that needs hyphenation and
+// delete it when it moves to a page that does not need hyphenation.
+class CONTENT_EXPORT Hyphenator {
+ public:
+ explicit Hyphenator(base::PlatformFile file);
+ ~Hyphenator();
+
+ // Initializes the hyphen library and allocates resources needed for
+ // hyphenation.
+ bool Initialize();
+
+ // Returns the last hyphenation point, the position where we can insert a
+ // hyphen, before the given position. If there are not any hyphenation points,
+ // this function returns 0.
+ size_t ComputeLastHyphenLocation(const string16& word, size_t before_index);
+
+ private:
+ // The dictionary used by the hyphen library.
+ HyphenDict* dictionary_;
+
+ // The dictionary file and its memory-mapping object. (Our copy of the hyphen
+ // library uses a memory-mapped file opened by a browser so renderers can use
+ // it without opening the file.)
+ base::PlatformFile rule_file_;
+ scoped_ptr<file_util::MemoryMappedFile> rule_map_;
+
+ // A cached result. WebKit often calls ComputeLastHyphenLocation with the same
+ // word multiple times to find the best hyphenation point when it finds a line
+ // break. On the other hand, the hyphen library returns all hyphenation points
+ // for a word. This class caches the hyphenation points returned by the hyphen
+ // library to avoid calling the library multiple times.
+ string16 word_;
+ bool result_;
+ std::vector<int> hyphen_offsets_;
+
+ DISALLOW_COPY_AND_ASSIGN(Hyphenator);
+};
+
+} // namespace content
+
+#endif // CONTENT_RENDERER_HYPHENATOR_HYPHENATOR_H_
diff --git a/content/renderer/hyphenator/hyphenator_unittest.cc b/content/renderer/hyphenator/hyphenator_unittest.cc
new file mode 100644
index 0000000..84c1ce1
--- /dev/null
+++ b/content/renderer/hyphenator/hyphenator_unittest.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "content/renderer/hyphenator/hyphenator.h"
+
+#include "base/path_service.h"
+#include "base/platform_file.h"
+#include "base/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "third_party/hyphen/hyphen.h"
+
+// A unit test for our hyphenator. This class loads a sample hyphenation
+// dictionary and hyphenates words.
+class HyphenatorTest : public testing::Test {
+ public:
+ HyphenatorTest() {
+ Initialize();
+ }
+
+ bool Initialize() {
+ FilePath dictionary_path;
+ if (!PathService::Get(base::DIR_SOURCE_ROOT, &dictionary_path))
+ return false;
+ dictionary_path = dictionary_path.AppendASCII("third_party");
+ dictionary_path = dictionary_path.AppendASCII("hyphen");
+ dictionary_path = dictionary_path.AppendASCII("hyph_en_US.dic");
+ base::PlatformFile file = base::CreatePlatformFile(
+ dictionary_path, base::PLATFORM_FILE_OPEN | base::PLATFORM_FILE_READ,
+ NULL, NULL);
+ hyphenator_.reset(new content::Hyphenator(file));
+ return hyphenator_->Initialize();
+ }
+
+ // Creates a human-readable hyphenated word. This function inserts '-'
+ // characters to all places where we can insert hyphens to improve the
+ // readability of this unit test.
+ string16 Hyphenate(const string16& word) {
+ string16 hyphenated_word(word);
+ size_t position = word.length();
+ while (position > 0) {
+ size_t new_position = hyphenator_->ComputeLastHyphenLocation(word,
+ position);
+ EXPECT_LT(new_position, position);
+ if (new_position > 0)
+ hyphenated_word.insert(new_position, 1, '-');
+ position = new_position;
+ }
+ return hyphenated_word;
+ }
+
+ private:
+ scoped_ptr<content::Hyphenator> hyphenator_;
+};
+
+// Verifies that our hyphenator yields the same hyphenated words as the original
+// hyphen library does.
+TEST_F(HyphenatorTest, HyphenateWords) {
+ static const struct {
+ const char* input;
+ const char* expected;
+ } kTestCases[] = {
+ { "and", "and" },
+ { "concupiscent,", "con-cu-pis-cent," },
+ { "evidence.", "ev-i-dence." },
+ { "first", "first" },
+ { "getting", "get-ting" },
+ { "hedgehog", "hedge-hog" },
+ { "remarkable", "re-mark-able" },
+ { "straightened", "straight-ened" },
+ { "undid", "un-did" },
+ { "were", "were" },
+ { "Simply", "Sim-ply" },
+ { "Undone.", "Un-done." },
+ { "comfortably", "com-fort-ably"},
+ { "declination", "dec-li-na-tion" },
+ { "flamingo:", "flamin-go:" },
+ { "lination", "lina-tion" },
+ { "reciprocity", "rec-i-proc-i-ty" },
+ { "throughout", "through-out" },
+ { "undid", "un-did" },
+ { "undone.", "un-done." },
+ { "unnecessary", "un-nec-es-sary" },
+ };
+ for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kTestCases); ++i) {
+ string16 input = ASCIIToUTF16(kTestCases[i].input);
+ string16 expected = ASCIIToUTF16(kTestCases[i].expected);
+ EXPECT_EQ(expected, Hyphenate(input));
+ }
+}
diff --git a/third_party/hyphen/README.chromium b/third_party/hyphen/README.chromium
index 4cbb02e..7cf556b 100644
--- a/third_party/hyphen/README.chromium
+++ b/third_party/hyphen/README.chromium
@@ -1,8 +1,15 @@
Name: hyphen
URL: http://sourceforge.net/projects/hunspell/files/Hyphen/
Version: 2.6
+License File: COPYING
+Security Critical: yes
Description:
-This is a partial copy of Hyphen 2.6.
+This is a partial copy of Hyphen 2.6 with the following changes:
+* Change the input params of hnj_hyphen_load to receive the pointer to a ruleset
+ instead of a file path.
+* Change RIGHTHYPHENMIN to 2 in hyph_en_US.dic so it hyphenates rec-i-proc-i-ty
+ as expected.
+The patch is in google.patch.
See 'hyphen.tex' for additional requirements regarding that file. \ No newline at end of file
diff --git a/third_party/hyphen/google.patch b/third_party/hyphen/google.patch
new file mode 100644
index 0000000..bca4d2f
--- /dev/null
+++ b/third_party/hyphen/google.patch
@@ -0,0 +1,148 @@
+? google.patch
+Index: hyphen.c
+===================================================================
+RCS file: /cvsroot/hunspell/hyphen/hyphen.c,v
+retrieving revision 1.4
+diff -u -r1.4 hyphen.c
+--- hyphen.c 1 Dec 2010 01:30:20 -0000 1.4
++++ hyphen.c 1 Mar 2012 05:18:32 -0000
+@@ -242,12 +242,71 @@
+ }
+ #endif
+
++#ifdef HYPHEN_CHROME_CLIENT
++typedef struct {
++ const unsigned char *data;
++ size_t offset;
++ size_t size;
++} hnj_file;
++
++static hnj_file *
++hnj_fopen (const unsigned char *data, size_t size)
++{
++ hnj_file *f;
++
++ f = hnj_malloc (sizeof(hnj_file));
++ if (f == NULL)
++ return NULL;
++ f->offset = 0;
++ f->data = data;
++ f->size = size;
++ return f;
++}
++
++static void
++hnj_fclose (hnj_file *f)
++{
++ hnj_free (f);
++}
++
++static char *
++hnj_fgets (char *s, int size, hnj_file *f)
++{
++ int i;
++
++ if (f->offset >= f->size)
++ return NULL;
++ for (i = 0; i < size - 1; i++) {
++ char c;
++
++ if (f->offset >= f->size)
++ break;
++ c = f->data[f->offset++];
++ if (c == '\r' || c == '\n')
++ break;
++ s[i] = c;
++ }
++ s[i] = '\0';
++ return s;
++}
++#else
++typedef FILE hnj_file;
++#define hnj_fopen(fn, mode) fopen((fn), (mode))
++#define hnj_fclose(f) fclose(f)
++#define hnj_fgets(s, size, f) fgets((s), (size), (f))
++#endif
++
++#ifdef HYPHEN_CHROME_CLIENT
++HyphenDict *
++hnj_hyphen_load (const unsigned char *data, size_t size)
++#else
+ HyphenDict *
+ hnj_hyphen_load (const char *fn)
++#endif
+ {
+ HyphenDict *dict[2];
+ HashTab *hashtab;
+- FILE *f;
++ hnj_file *f;
+ char buf[MAX_CHARS];
+ char word[MAX_CHARS];
+ char pattern[MAX_CHARS];
+@@ -261,7 +320,11 @@
+ HashEntry *e;
+ int nextlevel = 0;
+
++#ifdef HYPHEN_CHROME_CLIENT
++ f = hnj_fopen (data, size);
++#else
+ f = fopen (fn, "r");
++#endif
+ if (f == NULL)
+ return NULL;
+
+@@ -291,7 +354,7 @@
+ /* read in character set info */
+ if (k == 0) {
+ for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
+- if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) {
++ if (hnj_fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) {
+ for (i=0;i<MAX_NAME;i++)
+ if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
+ dict[k]->cset[i] = 0;
+@@ -304,7 +367,7 @@
+ dict[k]->utf8 = dict[0]->utf8;
+ }
+
+- while (fgets (buf, sizeof(buf), f) != NULL)
++ while (hnj_fgets (buf, sizeof(buf), f) != NULL)
+ {
+ if (buf[0] != '%')
+ {
+@@ -385,7 +448,7 @@
+ if (dict[k]->utf8) {
+ int pu = -1; /* unicode character position */
+ int ps = -1; /* unicode start position (original replindex) */
+- int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
++ size_t pc = (*word == '.') ? 1: 0; /* 8-bit character position */
+ for (; pc < (strlen(word) + 1); pc++) {
+ /* beginning of an UTF-8 character (not '10' start bits) */
+ if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
+@@ -478,7 +541,7 @@
+ #endif
+ state_num = 0;
+ }
+- fclose(f);
++ hnj_fclose(f);
+ if (k == 2) dict[0]->nextlevel = dict[1];
+ return dict[0];
+ }
+Index: hyphen.h
+===================================================================
+RCS file: /cvsroot/hunspell/hyphen/hyphen.h,v
+retrieving revision 1.2
+diff -u -r1.2 hyphen.h
+--- hyphen.h 27 Nov 2010 02:20:33 -0000 1.2
++++ hyphen.h 1 Mar 2012 05:18:33 -0000
+@@ -93,7 +93,11 @@
+ int new_state;
+ };
+
++#ifdef HYPHEN_CHROME_CLIENT
++HyphenDict *hnj_hyphen_load (const unsigned char *data, size_t size);
++#else
+ HyphenDict *hnj_hyphen_load (const char *fn);
++#endif
+ void hnj_hyphen_free (HyphenDict *dict);
+
+ /* obsolete, use hnj_hyphen_hyphenate2() or *hyphenate3() functions) */
diff --git a/third_party/hyphen/hyph_en_US.dic b/third_party/hyphen/hyph_en_US.dic
index e38cbce..3baa02d 100644
--- a/third_party/hyphen/hyph_en_US.dic
+++ b/third_party/hyphen/hyph_en_US.dic
@@ -1,6 +1,6 @@
UTF-8
LEFTHYPHENMIN 2
-RIGHTHYPHENMIN 3
+RIGHTHYPHENMIN 2
COMPOUNDLEFTHYPHENMIN 2
COMPOUNDRIGHTHYPHENMIN 3
1'.
diff --git a/third_party/hyphen/hyphen.c b/third_party/hyphen/hyphen.c
index 26fbefd..6b9cb78 100644
--- a/third_party/hyphen/hyphen.c
+++ b/third_party/hyphen/hyphen.c
@@ -242,12 +242,71 @@ get_state_str (int state)
}
#endif
+#ifdef HYPHEN_CHROME_CLIENT
+typedef struct {
+ const unsigned char *data;
+ size_t offset;
+ size_t size;
+} hnj_file;
+
+static hnj_file *
+hnj_fopen (const unsigned char *data, size_t size)
+{
+ hnj_file *f;
+
+ f = hnj_malloc (sizeof(hnj_file));
+ if (f == NULL)
+ return NULL;
+ f->offset = 0;
+ f->data = data;
+ f->size = size;
+ return f;
+}
+
+static void
+hnj_fclose (hnj_file *f)
+{
+ hnj_free (f);
+}
+
+static char *
+hnj_fgets (char *s, int size, hnj_file *f)
+{
+ int i;
+
+ if (f->offset >= f->size)
+ return NULL;
+ for (i = 0; i < size - 1; i++) {
+ char c;
+
+ if (f->offset >= f->size)
+ break;
+ c = f->data[f->offset++];
+ if (c == '\r' || c == '\n')
+ break;
+ s[i] = c;
+ }
+ s[i] = '\0';
+ return s;
+}
+#else
+typedef FILE hnj_file;
+#define hnj_fopen(fn, mode) fopen((fn), (mode))
+#define hnj_fclose(f) fclose(f)
+#define hnj_fgets(s, size, f) fgets((s), (size), (f))
+#endif
+
+#ifdef HYPHEN_CHROME_CLIENT
+HyphenDict *
+hnj_hyphen_load (const unsigned char *data, size_t size)
+#else
HyphenDict *
hnj_hyphen_load (const char *fn)
+#endif
{
HyphenDict *dict[2];
HashTab *hashtab;
- FILE *f;
+ hnj_file *f;
char buf[MAX_CHARS];
char word[MAX_CHARS];
char pattern[MAX_CHARS];
@@ -261,7 +320,11 @@ hnj_hyphen_load (const char *fn)
HashEntry *e;
int nextlevel = 0;
- f = fopen (fn, "r");
+#ifdef HYPHEN_CHROME_CLIENT
+ f = hnj_fopen (data, size);
+#else
+ f = hnj_fopen (fn, "r");
+#endif
if (f == NULL)
return NULL;
@@ -289,7 +352,7 @@ for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
/* read in character set info */
if (k == 0) {
for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
- if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) {
+ if (hnj_fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) {
for (i=0;i<MAX_NAME;i++)
if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
dict[k]->cset[i] = 0;
@@ -302,7 +365,7 @@ for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
dict[k]->utf8 = dict[0]->utf8;
}
- while (fgets (buf, sizeof(buf), f) != NULL)
+ while (hnj_fgets (buf, sizeof(buf), f) != NULL)
{
if (buf[0] != '%')
{
@@ -368,7 +431,7 @@ for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
if (dict[k]->utf8) {
int pu = -1; /* unicode character position */
int ps = -1; /* unicode start position (original replindex) */
- int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
+ size_t pc = (*word == '.') ? 1: 0; /* 8-bit character position */
for (; pc < (strlen(word) + 1); pc++) {
/* beginning of an UTF-8 character (not '10' start bits) */
if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
@@ -461,7 +524,7 @@ for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
#endif
state_num = 0;
}
- fclose(f);
+ hnj_fclose(f);
if (k == 2) dict[0]->nextlevel = dict[1];
return dict[0];
}
diff --git a/third_party/hyphen/hyphen.gyp b/third_party/hyphen/hyphen.gyp
new file mode 100644
index 0000000..35becc4
--- /dev/null
+++ b/third_party/hyphen/hyphen.gyp
@@ -0,0 +1,32 @@
+# Copyright (c) 2012 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+{
+ 'targets': [
+ {
+ 'target_name': 'hyphen',
+ 'type': '<(library)',
+ 'include_dirs': [
+ '.',
+ ],
+ 'defines': [
+ 'HYPHEN_CHROME_CLIENT',
+ ],
+ 'sources': [
+ 'hnjalloc.c',
+ 'hnjalloc.h',
+ 'hyphen.h',
+ 'hyphen.c',
+ ],
+ 'direct_dependent_settings': {
+ 'defines': [
+ 'HYPHEN_CHROME_CLIENT',
+ ],
+ 'include_dirs': [
+ '.',
+ ],
+ },
+ },
+ ],
+}
diff --git a/third_party/hyphen/hyphen.h b/third_party/hyphen/hyphen.h
index 5d79308..b5517d3 100644
--- a/third_party/hyphen/hyphen.h
+++ b/third_party/hyphen/hyphen.h
@@ -90,7 +90,11 @@ struct _HyphenTrans {
int new_state;
};
+#ifdef HYPHEN_CHROME_CLIENT
+HyphenDict *hnj_hyphen_load (const unsigned char *data, size_t size);
+#else
HyphenDict *hnj_hyphen_load (const char *fn);
+#endif
void hnj_hyphen_free (HyphenDict *dict);
/* obsolete, use hnj_hyphen_hyphenate2() or *hyphenate3() functions) */