diff options
author | hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-07-17 08:56:59 +0000 |
---|---|---|
committer | hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-07-17 08:56:59 +0000 |
commit | 48cf2472324304e12d7138f14841f4b2d566e39b (patch) | |
tree | a58e97dfbe2a70bafd3b27b7346ca7cee8780315 | |
parent | 99c4c707b2cf4e0096991c47dcaf8ac57bc52eaa (diff) | |
download | chromium_src-48cf2472324304e12d7138f14841f4b2d566e39b.zip chromium_src-48cf2472324304e12d7138f14841f4b2d566e39b.tar.gz chromium_src-48cf2472324304e12d7138f14841f4b2d566e39b.tar.bz2 |
Adds a hy-phen-ator.
This change adds a project file for the hyphen library and a Hyphenator class, which encapsulates the library. (This class is not integrated into Chrome, though.)
BUG=47083
TEST=HyphenatorTest.HyphenateWords
Review URL: https://chromiumcodereview.appspot.com/9545017
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@146964 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- | content/content_renderer.gypi | 3 | ||||
-rw-r--r-- | content/content_tests.gypi | 1 | ||||
-rw-r--r-- | content/renderer/hyphenator/hyphenator.cc | 231 | ||||
-rw-r--r-- | content/renderer/hyphenator/hyphenator.h | 66 | ||||
-rw-r--r-- | content/renderer/hyphenator/hyphenator_unittest.cc | 90 | ||||
-rw-r--r-- | third_party/hyphen/README.chromium | 9 | ||||
-rw-r--r-- | third_party/hyphen/google.patch | 148 | ||||
-rw-r--r-- | third_party/hyphen/hyph_en_US.dic | 2 | ||||
-rw-r--r-- | third_party/hyphen/hyphen.c | 75 | ||||
-rw-r--r-- | third_party/hyphen/hyphen.gyp | 32 | ||||
-rw-r--r-- | third_party/hyphen/hyphen.h | 4 |
11 files changed, 653 insertions, 8 deletions
diff --git a/content/content_renderer.gypi b/content/content_renderer.gypi index 2d4a777..2e5ad41 100644 --- a/content/content_renderer.gypi +++ b/content/content_renderer.gypi @@ -10,6 +10,7 @@ '../ppapi/ppapi_internal.gyp:ppapi_proxy', '../ppapi/ppapi_internal.gyp:ppapi_shared', '../skia/skia.gyp:skia', + '../third_party/hyphen/hyphen.gyp:hyphen', '../third_party/icu/icu.gyp:icuuc', '../third_party/icu/icu.gyp:icui18n', '../third_party/libjingle/libjingle.gyp:libjingle', @@ -84,6 +85,8 @@ 'renderer/gpu/gpu_benchmarking_extension.h', 'renderer/gpu/stream_texture_host_android.cc', 'renderer/gpu/stream_texture_host_android.h', + 'renderer/hyphenator/hyphenator.cc', + 'renderer/hyphenator/hyphenator.h', 'renderer/idle_user_detector.cc', 'renderer/idle_user_detector.h', 'renderer/input_tag_speech_dispatcher.cc', diff --git a/content/content_tests.gypi b/content/content_tests.gypi index 77b8f47..30f8bb5 100644 --- a/content/content_tests.gypi +++ b/content/content_tests.gypi @@ -323,6 +323,7 @@ 'renderer/android/email_detector_unittest.cc', 'renderer/android/phone_number_detector_unittest.cc', 'renderer/gpu/input_event_filter_unittest.cc', + 'renderer/hyphenator/hyphenator_unittest.cc', 'renderer/media/audio_device_unittest.cc', 'renderer/media/audio_message_filter_unittest.cc', 'renderer/media/audio_renderer_mixer_manager_unittest.cc', diff --git a/content/renderer/hyphenator/hyphenator.cc b/content/renderer/hyphenator/hyphenator.cc new file mode 100644 index 0000000..da92f9e --- /dev/null +++ b/content/renderer/hyphenator/hyphenator.cc @@ -0,0 +1,231 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "content/renderer/hyphenator/hyphenator.h" + +#include "base/file_util.h" +#include "base/logging.h" +#include "base/memory/scoped_ptr.h" +#include "base/string_util.h" +#include "base/utf_string_conversions.h" +#include "third_party/hyphen/hyphen.h" +#include "unicode/uscript.h" + +namespace { + +// A class that converts a sequence of UTF-8 characters to UTF-16 ones and holds +// only the length of converted UTF-16 characters. This class is used for +// creating a mapping from the position of a UTF-8 string to a position of a +// UTF-16 string without unnecessary conversions. Even though the following +// snippet produces the same mapping, it needs to convert the same characters many +// times. This class incrementally counts the number of converted UTF-16 +// characters to avoid this problem. +// +// scoped_array<size_t> position(new size_t[text.length()]); +// for (size_t i = 0; i < text.length(); ++i) +// position[i] = UTF8ToUTF16(text.substr(0, i)).length(); +// +class UTF16TextLength { + public: + UTF16TextLength(); + ~UTF16TextLength(); + + // Returns the current position. + int utf16_length() const { return utf16_length_; } + + // Appends one UTF-8 character to this converter and advances the converted + // position. This converter increases the position by one when it finishes + // reading a BMP character and increases by two when it finishes reading a + // non-BMP character. + void Append(char c); + + private: + // The length of the converted UTF-16 text. + int utf16_length_; + + // The buffer that stores UTF-8 characters being converted. + std::string utf8_text_; + + DISALLOW_COPY_AND_ASSIGN(UTF16TextLength); +}; + +UTF16TextLength::UTF16TextLength() + : utf16_length_(0) { +} + +UTF16TextLength::~UTF16TextLength() { +} + +void UTF16TextLength::Append(char c) { + // Append the given character and try converting the UTF-8 characters in this + // buffer to Unicode codepoints. 
If this buffer includes a Unicode codepoint, + // get the number of UTF-16 characters representing this codepoint and advance + // the position. + int code = 0; + int index = 0; + utf8_text_.push_back(c); + U8_NEXT(utf8_text_.data(), index, static_cast<int>(utf8_text_.length()), + code); + if (code != U_SENTINEL) { + utf8_text_.clear(); + utf16_length_ += U16_LENGTH(code); + } +} + +// A class that encapsulates a hyphenation query. This class owns resources +// temporarily needed for hyphenating one word, and deletes them when it is +// deleted as listed in the following snippet. +// +// std::vector<int> hyphens; +// Query query(UTF8ToUTF16("hyphenate")); +// query.Hyphenate(dict, &hyphens); +// +class Query { + public: + explicit Query(const string16& word); + ~Query(); + + // Hyphenates a word with the specified dictionary. This function hyphenates + // the word provided to its constructor and returns a list of hyphenation + // points, positions where we can insert hyphens. + bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphen_offsets); + + private: + // A word to be hyphenated. + std::string word_utf8_; + + // Return variables from the hyphen library. + scoped_array<char> hyphen_vector_; + char** rep_; + int* pos_; + int* cut_; + + DISALLOW_COPY_AND_ASSIGN(Query); +}; + +Query::Query(const string16& word) + : rep_(NULL), + pos_(NULL), + cut_(NULL) { + // Remove trailing punctuation characters. WebKit does not remove these + // characters when it hyphenates a word. These characters prevent the hyphen + // library from applying some rules, i.e. they prevent the library from adding + // hyphens. 
+ DCHECK(!word.empty()); + const char16* data = word.data(); + int length = static_cast<int>(word.length()); + while (length > 0) { + int previous = length; + int code = 0; + U16_PREV(data, 0, previous, code); + UErrorCode error = U_ZERO_ERROR; + if (uscript_getScript(code, &error) != USCRIPT_COMMON) + break; + length = previous; + } + UTF16ToUTF8(word.c_str(), length, &word_utf8_); + // Create a hyphen vector used by hnj_hyphen_hyphenate2(). We allocate a + // buffer of |word_utf8_.length()| + 5 as written in Line 112 of + // <http://cs.chromium.org/src/third_party/hyphen/hyphen.h>. + hyphen_vector_.reset(new char[word_utf8_.length() + 5]); +} + +Query::~Query() { + if (rep_) { + for (size_t i = 0; i < word_utf8_.length(); ++i) { + if (rep_[i]) + free(rep_[i]); + } + free(rep_); + } + if (pos_) + free(pos_); + if (cut_) + free(cut_); +} + +bool Query::Hyphenate(HyphenDict* dictionary, + std::vector<int>* hyphen_offsets) { + DCHECK(dictionary); + DCHECK(hyphen_offsets); + + int error_code = hnj_hyphen_hyphenate2(dictionary, + word_utf8_.data(), + static_cast<int>(word_utf8_.length()), + hyphen_vector_.get(), + NULL, + &rep_, + &pos_, + &cut_); + if (error_code) + return false; + + // WebKit needs hyphenation points counted in UTF-16 characters. On the other + // hand, the hyphen library returns hyphenation points counted in UTF-8 + // characters. We incrementally convert hyphenation points in UTF-8 + // characters to hyphenation points in UTF-16 characters and write the + // converted hyphenation points to the output vector. 
+ UTF16TextLength text_length; + hyphen_offsets->clear(); + for (size_t i = 0; i < word_utf8_.length(); ++i) { + text_length.Append(word_utf8_[i]); + if (hyphen_vector_[i] & 1) + hyphen_offsets->push_back(text_length.utf16_length()); + } + return !hyphen_offsets->empty(); +} + +} // namespace + +namespace content { + +Hyphenator::Hyphenator(base::PlatformFile file) + : dictionary_(NULL), + rule_file_(file), + result_(0) { +} + +Hyphenator::~Hyphenator() { + if (dictionary_) + hnj_hyphen_free(dictionary_); +} + +bool Hyphenator::Initialize() { + if (dictionary_) + return true; + + rule_map_.reset(new file_util::MemoryMappedFile); + if (!rule_map_->Initialize(rule_file_)) + return false; + + dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length()); + return !!dictionary_; +} + +size_t Hyphenator::ComputeLastHyphenLocation(const string16& word, + size_t before_index) { + if (!dictionary_ || word.empty()) + return 0; + + // Call the hyphen library to get all hyphenation points, i.e. positions where + // we can insert hyphens. When WebKit finds a line-break, it calls this + // function twice or more with the same word to find the best hyphenation + // point. To avoid calling the hyphen library twice or more with the same + // word, we cache the last query. + if (word_ != word) { + word_ = word; + Query query(word); + result_ = query.Hyphenate(dictionary_, &hyphen_offsets_); + } + if (!result_) + return 0; + for (std::vector<int>::reverse_iterator it = hyphen_offsets_.rbegin(); + it != hyphen_offsets_.rend(); ++it) { + if (static_cast<size_t>(*it) < before_index) + return *it; + } + return 0; +} + +} // namespace content diff --git a/content/renderer/hyphenator/hyphenator.h b/content/renderer/hyphenator/hyphenator.h new file mode 100644 index 0000000..561af80 --- /dev/null +++ b/content/renderer/hyphenator/hyphenator.h @@ -0,0 +1,66 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CONTENT_RENDERER_HYPHENATOR_HYPHENATOR_H_ +#define CONTENT_RENDERER_HYPHENATOR_HYPHENATOR_H_ + +#include <vector> + +#include "base/memory/scoped_ptr.h" +#include "base/platform_file.h" +#include "base/string16.h" +#include "content/common/content_export.h" + +namespace file_util { +class MemoryMappedFile; +} + +typedef struct _HyphenDict HyphenDict; + +namespace content { + +// A class that hyphenates a word. This class encapsulates the hyphen library +// and manages resources used by the library. When this class uses a huge +// dictionary, it takes lots of memory (~1.3MB for English). A renderer should +// create this object only when it renders a page that needs hyphenation and +// deletes it when it moves to a page that does not need hyphenation. +class CONTENT_EXPORT Hyphenator { + public: + explicit Hyphenator(base::PlatformFile file); + ~Hyphenator(); + + // Initializes the hyphen library and allocates resources needed for + // hyphenation. + bool Initialize(); + + // Returns the last hyphenation point, the position where we can insert a + // hyphen, before the given position. If there are not any hyphenation points, + // this function returns 0. + size_t ComputeLastHyphenLocation(const string16& word, size_t before_index); + + private: + // The dictionary used by the hyphen library. + HyphenDict* dictionary_; + + // The dictionary file and its memory-mapping object. (Our copy of the hyphen + // library uses a memory-mapped file opened by a browser so renderers can use + // it without opening the file.) + base::PlatformFile rule_file_; + scoped_ptr<file_util::MemoryMappedFile> rule_map_; + + // A cached result. WebKit often calls ComputeLastHyphenLocation with the same + // word multiple times to find the best hyphenation point when it finds a line + // break. 
On the other hand, the hyphen library returns all hyphenation points + // for a word. This class caches the hyphenation points returned by the hyphen + // library to avoid calling the library multiple times. + string16 word_; + bool result_; + std::vector<int> hyphen_offsets_; + + DISALLOW_COPY_AND_ASSIGN(Hyphenator); +}; + +} // namespace content + +#endif // CONTENT_RENDERER_HYPHENATOR_HYPHENATOR_H_ diff --git a/content/renderer/hyphenator/hyphenator_unittest.cc b/content/renderer/hyphenator/hyphenator_unittest.cc new file mode 100644 index 0000000..84c1ce1 --- /dev/null +++ b/content/renderer/hyphenator/hyphenator_unittest.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "content/renderer/hyphenator/hyphenator.h" + +#include "base/path_service.h" +#include "base/platform_file.h" +#include "base/utf_string_conversions.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "third_party/hyphen/hyphen.h" + +// A unit test for our hyphenator. This class loads a sample hyphenation +// dictionary and hyphenates words. +class HyphenatorTest : public testing::Test { + public: + HyphenatorTest() { + Initialize(); + } + + bool Initialize() { + FilePath dictionary_path; + if (!PathService::Get(base::DIR_SOURCE_ROOT, &dictionary_path)) + return false; + dictionary_path = dictionary_path.AppendASCII("third_party"); + dictionary_path = dictionary_path.AppendASCII("hyphen"); + dictionary_path = dictionary_path.AppendASCII("hyph_en_US.dic"); + base::PlatformFile file = base::CreatePlatformFile( + dictionary_path, base::PLATFORM_FILE_OPEN | base::PLATFORM_FILE_READ, + NULL, NULL); + hyphenator_.reset(new content::Hyphenator(file)); + return hyphenator_->Initialize(); + } + + // Creates a human-readable hyphenated word. 
This function inserts '-' + // characters to all places where we can insert hyphens to improve the + // readability of this unit test. + string16 Hyphenate(const string16& word) { + string16 hyphenated_word(word); + size_t position = word.length(); + while (position > 0) { + size_t new_position = hyphenator_->ComputeLastHyphenLocation(word, + position); + EXPECT_LT(new_position, position); + if (new_position > 0) + hyphenated_word.insert(new_position, 1, '-'); + position = new_position; + } + return hyphenated_word; + } + + private: + scoped_ptr<content::Hyphenator> hyphenator_; +}; + +// Verifies that our hyphenator yields the same hyphenated words as the original +// hyphen library does. +TEST_F(HyphenatorTest, HyphenateWords) { + static const struct { + const char* input; + const char* expected; + } kTestCases[] = { + { "and", "and" }, + { "concupiscent,", "con-cu-pis-cent," }, + { "evidence.", "ev-i-dence." }, + { "first", "first" }, + { "getting", "get-ting" }, + { "hedgehog", "hedge-hog" }, + { "remarkable", "re-mark-able" }, + { "straightened", "straight-ened" }, + { "undid", "un-did" }, + { "were", "were" }, + { "Simply", "Sim-ply" }, + { "Undone.", "Un-done." }, + { "comfortably", "com-fort-ably"}, + { "declination", "dec-li-na-tion" }, + { "flamingo:", "flamin-go:" }, + { "lination", "lina-tion" }, + { "reciprocity", "rec-i-proc-i-ty" }, + { "throughout", "through-out" }, + { "undid", "un-did" }, + { "undone.", "un-done." 
}, + { "unnecessary", "un-nec-es-sary" }, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kTestCases); ++i) { + string16 input = ASCIIToUTF16(kTestCases[i].input); + string16 expected = ASCIIToUTF16(kTestCases[i].expected); + EXPECT_EQ(expected, Hyphenate(input)); + } +} diff --git a/third_party/hyphen/README.chromium b/third_party/hyphen/README.chromium index 4cbb02e..7cf556b 100644 --- a/third_party/hyphen/README.chromium +++ b/third_party/hyphen/README.chromium @@ -1,8 +1,15 @@ Name: hyphen URL: http://sourceforge.net/projects/hunspell/files/Hyphen/ Version: 2.6 +License File: COPYING +Security Critical: yes Description: -This is a partial copy of Hyphen 2.6. +This is a partial copy of Hyphen 2.6 with the following changes: +* Change the input params of hnj_hyphen_load to receive the pointer to a ruleset + instead of a file path. +* Change RIGHTHYPHENMIN to 2 in hyph_en_US.dic so it hyphenates rec-i-proc-i-ty + as expected. +The patch is in google.patch. See 'hyphen.tex' for additional requirements regarding that file.
\ No newline at end of file diff --git a/third_party/hyphen/google.patch b/third_party/hyphen/google.patch new file mode 100644 index 0000000..bca4d2f --- /dev/null +++ b/third_party/hyphen/google.patch @@ -0,0 +1,148 @@ +? google.patch +Index: hyphen.c +=================================================================== +RCS file: /cvsroot/hunspell/hyphen/hyphen.c,v +retrieving revision 1.4 +diff -u -r1.4 hyphen.c +--- hyphen.c 1 Dec 2010 01:30:20 -0000 1.4 ++++ hyphen.c 1 Mar 2012 05:18:32 -0000 +@@ -242,12 +242,71 @@ + } + #endif + ++#ifdef HYPHEN_CHROME_CLIENT ++typedef struct { ++ const unsigned char *data; ++ size_t offset; ++ size_t size; ++} hnj_file; ++ ++static hnj_file * ++hnj_fopen (const unsigned char *data, size_t size) ++{ ++ hnj_file *f; ++ ++ f = hnj_malloc (sizeof(hnj_file)); ++ if (f == NULL) ++ return NULL; ++ f->offset = 0; ++ f->data = data; ++ f->size = size; ++ return f; ++} ++ ++static void ++hnj_fclose (hnj_file *f) ++{ ++ hnj_free (f); ++} ++ ++static char * ++hnj_fgets (char *s, int size, hnj_file *f) ++{ ++ int i; ++ ++ if (f->offset >= f->size) ++ return NULL; ++ for (i = 0; i < size - 1; i++) { ++ char c; ++ ++ if (f->offset >= f->size) ++ break; ++ c = f->data[f->offset++]; ++ if (c == '\r' || c == '\n') ++ break; ++ s[i] = c; ++ } ++ s[i] = '\0'; ++ return s; ++} ++#else ++typedef FILE hnj_file; ++#define hnj_fopen(fn, mode) fopen((fn), (mode)) ++#define hnj_fclose(f) fclose(f) ++#define hnj_fgets(s, size, f) fgets((s), (size), (f)) ++#endif ++ ++#ifdef HYPHEN_CHROME_CLIENT ++HyphenDict * ++hnj_hyphen_load (const unsigned char *data, size_t size) ++#else + HyphenDict * + hnj_hyphen_load (const char *fn) ++#endif + { + HyphenDict *dict[2]; + HashTab *hashtab; +- FILE *f; ++ hnj_file *f; + char buf[MAX_CHARS]; + char word[MAX_CHARS]; + char pattern[MAX_CHARS]; +@@ -261,7 +320,11 @@ + HashEntry *e; + int nextlevel = 0; + ++#ifdef HYPHEN_CHROME_CLIENT ++ f = hnj_fopen (data, size); ++#else + f = fopen (fn, "r"); ++#endif + if (f == 
NULL) + return NULL; + +@@ -291,7 +354,7 @@ + /* read in character set info */ + if (k == 0) { + for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0; +- if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) { ++ if (hnj_fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) { + for (i=0;i<MAX_NAME;i++) + if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n')) + dict[k]->cset[i] = 0; +@@ -304,7 +367,7 @@ + dict[k]->utf8 = dict[0]->utf8; + } + +- while (fgets (buf, sizeof(buf), f) != NULL) ++ while (hnj_fgets (buf, sizeof(buf), f) != NULL) + { + if (buf[0] != '%') + { +@@ -385,7 +448,7 @@ + if (dict[k]->utf8) { + int pu = -1; /* unicode character position */ + int ps = -1; /* unicode start position (original replindex) */ +- int pc = (*word == '.') ? 1: 0; /* 8-bit character position */ ++ size_t pc = (*word == '.') ? 1: 0; /* 8-bit character position */ + for (; pc < (strlen(word) + 1); pc++) { + /* beginning of an UTF-8 character (not '10' start bits) */ + if ((((unsigned char) word[pc]) >> 6) != 2) pu++; +@@ -478,7 +541,7 @@ + #endif + state_num = 0; + } +- fclose(f); ++ hnj_fclose(f); + if (k == 2) dict[0]->nextlevel = dict[1]; + return dict[0]; + } +Index: hyphen.h +=================================================================== +RCS file: /cvsroot/hunspell/hyphen/hyphen.h,v +retrieving revision 1.2 +diff -u -r1.2 hyphen.h +--- hyphen.h 27 Nov 2010 02:20:33 -0000 1.2 ++++ hyphen.h 1 Mar 2012 05:18:33 -0000 +@@ -93,7 +93,11 @@ + int new_state; + }; + ++#ifdef HYPHEN_CHROME_CLIENT ++HyphenDict *hnj_hyphen_load (const unsigned char *data, size_t size); ++#else + HyphenDict *hnj_hyphen_load (const char *fn); ++#endif + void hnj_hyphen_free (HyphenDict *dict); + + /* obsolete, use hnj_hyphen_hyphenate2() or *hyphenate3() functions) */ diff --git a/third_party/hyphen/hyph_en_US.dic b/third_party/hyphen/hyph_en_US.dic index e38cbce..3baa02d 100644 --- a/third_party/hyphen/hyph_en_US.dic +++ b/third_party/hyphen/hyph_en_US.dic @@ -1,6 +1,6 @@ UTF-8 LEFTHYPHENMIN 
2 -RIGHTHYPHENMIN 3 +RIGHTHYPHENMIN 2 COMPOUNDLEFTHYPHENMIN 2 COMPOUNDRIGHTHYPHENMIN 3 1'. diff --git a/third_party/hyphen/hyphen.c b/third_party/hyphen/hyphen.c index 26fbefd..6b9cb78 100644 --- a/third_party/hyphen/hyphen.c +++ b/third_party/hyphen/hyphen.c @@ -242,12 +242,71 @@ get_state_str (int state) } #endif +#ifdef HYPHEN_CHROME_CLIENT +typedef struct { + const unsigned char *data; + size_t offset; + size_t size; +} hnj_file; + +static hnj_file * +hnj_fopen (const unsigned char *data, size_t size) +{ + hnj_file *f; + + f = hnj_malloc (sizeof(hnj_file)); + if (f == NULL) + return NULL; + f->offset = 0; + f->data = data; + f->size = size; + return f; +} + +static void +hnj_fclose (hnj_file *f) +{ + hnj_free (f); +} + +static char * +hnj_fgets (char *s, int size, hnj_file *f) +{ + int i; + + if (f->offset >= f->size) + return NULL; + for (i = 0; i < size - 1; i++) { + char c; + + if (f->offset >= f->size) + break; + c = f->data[f->offset++]; + if (c == '\r' || c == '\n') + break; + s[i] = c; + } + s[i] = '\0'; + return s; +} +#else +typedef FILE hnj_file; +#define hnj_fopen(fn, mode) fopen((fn), (mode)) +#define hnj_fclose(f) fclose(f) +#define hnj_fgets(s, size, f) fgets((s), (size), (f)) +#endif + +#ifdef HYPHEN_CHROME_CLIENT +HyphenDict * +hnj_hyphen_load (const unsigned char *data, size_t size) +#else HyphenDict * hnj_hyphen_load (const char *fn) +#endif { HyphenDict *dict[2]; HashTab *hashtab; - FILE *f; + hnj_file *f; char buf[MAX_CHARS]; char word[MAX_CHARS]; char pattern[MAX_CHARS]; @@ -261,7 +320,11 @@ hnj_hyphen_load (const char *fn) HashEntry *e; int nextlevel = 0; - f = fopen (fn, "r"); +#ifdef HYPHEN_CHROME_CLIENT + f = hnj_fopen (data, size); +#else + f = hnj_fopen (fn, "r"); +#endif if (f == NULL) return NULL; @@ -289,7 +352,7 @@ for (k = 0; k == 0 || (k == 1 && nextlevel); k++) { /* read in character set info */ if (k == 0) { for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0; - if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) { + if 
(hnj_fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) { for (i=0;i<MAX_NAME;i++) if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n')) dict[k]->cset[i] = 0; @@ -302,7 +365,7 @@ for (k = 0; k == 0 || (k == 1 && nextlevel); k++) { dict[k]->utf8 = dict[0]->utf8; } - while (fgets (buf, sizeof(buf), f) != NULL) + while (hnj_fgets (buf, sizeof(buf), f) != NULL) { if (buf[0] != '%') { @@ -368,7 +431,7 @@ for (k = 0; k == 0 || (k == 1 && nextlevel); k++) { if (dict[k]->utf8) { int pu = -1; /* unicode character position */ int ps = -1; /* unicode start position (original replindex) */ - int pc = (*word == '.') ? 1: 0; /* 8-bit character position */ + size_t pc = (*word == '.') ? 1: 0; /* 8-bit character position */ for (; pc < (strlen(word) + 1); pc++) { /* beginning of an UTF-8 character (not '10' start bits) */ if ((((unsigned char) word[pc]) >> 6) != 2) pu++; @@ -461,7 +524,7 @@ for (k = 0; k == 0 || (k == 1 && nextlevel); k++) { #endif state_num = 0; } - fclose(f); + hnj_fclose(f); if (k == 2) dict[0]->nextlevel = dict[1]; return dict[0]; } diff --git a/third_party/hyphen/hyphen.gyp b/third_party/hyphen/hyphen.gyp new file mode 100644 index 0000000..35becc4 --- /dev/null +++ b/third_party/hyphen/hyphen.gyp @@ -0,0 +1,32 @@ +# Copyright (c) 2012 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. 
+ +{ + 'targets': [ + { + 'target_name': 'hyphen', + 'type': '<(library)', + 'include_dirs': [ + '.', + ], + 'defines': [ + 'HYPHEN_CHROME_CLIENT', + ], + 'sources': [ + 'hnjalloc.c', + 'hnjalloc.h', + 'hyphen.h', + 'hyphen.c', + ], + 'direct_dependent_settings': { + 'defines': [ + 'HYPHEN_CHROME_CLIENT', + ], + 'include_dirs': [ + '.', + ], + }, + }, + ], +} diff --git a/third_party/hyphen/hyphen.h b/third_party/hyphen/hyphen.h index 5d79308..b5517d3 100644 --- a/third_party/hyphen/hyphen.h +++ b/third_party/hyphen/hyphen.h @@ -90,7 +90,11 @@ struct _HyphenTrans { int new_state; }; +#ifdef HYPHEN_CHROME_CLIENT +HyphenDict *hnj_hyphen_load (const unsigned char *data, size_t size); +#else HyphenDict *hnj_hyphen_load (const char *fn); +#endif void hnj_hyphen_free (HyphenDict *dict); /* obsolete, use hnj_hyphen_hyphenate2() or *hyphenate3() functions) */ |