summaryrefslogtreecommitdiffstats
path: root/base/i18n
diff options
context:
space:
mode:
authorKristian Monsen <kristianm@google.com>2011-05-11 20:53:37 +0100
committerKristian Monsen <kristianm@google.com>2011-05-16 13:54:48 +0100
commit21d179b334e59e9a3bfcaed4c4430bef1bc5759d (patch)
tree64e2bb6da27af6a5c93ca34f6051584aafbfcb9e /base/i18n
parent0c63f00edd6ed0482fd5cbcea937ca088baf7858 (diff)
downloadexternal_chromium-21d179b334e59e9a3bfcaed4c4430bef1bc5759d.zip
external_chromium-21d179b334e59e9a3bfcaed4c4430bef1bc5759d.tar.gz
external_chromium-21d179b334e59e9a3bfcaed4c4430bef1bc5759d.tar.bz2
Merge Chromium at 10.0.621.0: Initial merge by git.
Change-Id: I070cc91c608dfa4a968a5a54c173260765ac8097
Diffstat (limited to 'base/i18n')
-rw-r--r--base/i18n/break_iterator.cc101
-rw-r--r--base/i18n/break_iterator.h108
-rw-r--r--base/i18n/break_iterator_unittest.cc308
-rw-r--r--base/i18n/file_util_icu.cc46
-rw-r--r--base/i18n/number_formatting.cc26
-rw-r--r--base/i18n/rtl.cc52
-rw-r--r--base/i18n/rtl.h1
-rw-r--r--base/i18n/time_formatting.cc23
-rw-r--r--base/i18n/time_formatting.h14
-rw-r--r--base/i18n/word_iterator.cc70
-rw-r--r--base/i18n/word_iterator.h89
-rw-r--r--base/i18n/word_iterator_unittest.cc117
12 files changed, 628 insertions, 327 deletions
diff --git a/base/i18n/break_iterator.cc b/base/i18n/break_iterator.cc
new file mode 100644
index 0000000..e1b5e29
--- /dev/null
+++ b/base/i18n/break_iterator.cc
@@ -0,0 +1,101 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/i18n/break_iterator.h"
+
+#include "base/logging.h"
+#include "unicode/ubrk.h"
+#include "unicode/uchar.h"
+#include "unicode/ustring.h"
+
+namespace base {
+
+const size_t npos = -1;
+
+BreakIterator::BreakIterator(const string16* str, BreakType break_type)
+ : iter_(NULL),
+ string_(str),
+ break_type_(break_type),
+ prev_(npos),
+ pos_(0) {
+}
+
+BreakIterator::~BreakIterator() {
+ if (iter_)
+ ubrk_close(static_cast<UBreakIterator*>(iter_));
+}
+
+bool BreakIterator::Init() {
+ UErrorCode status = U_ZERO_ERROR;
+ UBreakIteratorType break_type;
+ switch (break_type_) {
+ case BREAK_WORD:
+ break_type = UBRK_WORD;
+ break;
+ case BREAK_SPACE:
+ case BREAK_NEWLINE:
+ break_type = UBRK_LINE;
+ break;
+ default:
+ NOTREACHED() << "invalid break_type_";
+ return false;
+ }
+ iter_ = ubrk_open(break_type, NULL,
+ string_->data(), static_cast<int32_t>(string_->size()),
+ &status);
+ if (U_FAILURE(status)) {
+ NOTREACHED() << "ubrk_open failed";
+ return false;
+ }
+ // Move the iterator to the beginning of the string.
+ ubrk_first(static_cast<UBreakIterator*>(iter_));
+ return true;
+}
+
+bool BreakIterator::Advance() {
+ int32_t pos;
+ int32_t status;
+ prev_ = pos_;
+ switch (break_type_) {
+ case BREAK_WORD:
+ case BREAK_SPACE:
+ pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
+ if (pos == UBRK_DONE) {
+ pos_ = npos;
+ return false;
+ }
+ pos_ = static_cast<size_t>(pos);
+ return true;
+ case BREAK_NEWLINE:
+ do {
+ pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
+ if (pos == UBRK_DONE) {
+ break;
+ }
+ pos_ = static_cast<size_t>(pos);
+ status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_));
+ } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT);
+ if (pos == UBRK_DONE && prev_ == pos_) {
+ pos_ = npos;
+ return false;
+ }
+ return true;
+ default:
+ NOTREACHED() << "invalid break_type_";
+ return false;
+ }
+}
+
+bool BreakIterator::IsWord() const {
+ return (break_type_ == BREAK_WORD &&
+ ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_)) !=
+ UBRK_WORD_NONE);
+}
+
+string16 BreakIterator::GetString() const {
+ DCHECK(prev_ != npos && pos_ != npos);
+ return string_->substr(prev_, pos_ - prev_);
+}
+
+} // namespace base
diff --git a/base/i18n/break_iterator.h b/base/i18n/break_iterator.h
new file mode 100644
index 0000000..9de7ac7
--- /dev/null
+++ b/base/i18n/break_iterator.h
@@ -0,0 +1,108 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_I18N_BREAK_ITERATOR_H_
+#define BASE_I18N_BREAK_ITERATOR_H_
+#pragma once
+
+#include "base/basictypes.h"
+#include "base/string16.h"
+
+// The BreakIterator class iterates through the words, word breaks, and
+// line breaks in a UTF-16 string.
+//
+// It provides several modes, BREAK_WORD, BREAK_SPACE, and BREAK_NEWLINE,
+// which modify how characters are aggregated into the returned string.
+//
+// Under BREAK_WORD mode, once a word is encountered any non-word
+// characters are not included in the returned string (e.g. in the
+// UTF-16 equivalent of the string " foo bar! ", the word breaks are at
+// the periods in ". .foo. .bar.!. .").
+//
+// Under BREAK_SPACE mode, once a word is encountered, any non-word
+// characters are included in the returned string, breaking only when a
+// space-equivalent character is encountered (e.g. in the
+// UTF-16 equivalent of the string " foo bar! ", the word breaks are at
+// the periods in ". .foo .bar! .").
+//
+// Under BREAK_NEWLINE mode, all characters are included in the returned
+// string, breaking only when a newline-equivalent character is encountered
+// (e.g. in the UTF-16 equivalent of the string "foo\nbar!\n\n", the line
+// breaks are at the periods in ".foo\n.bar!\n.\n.").
+//
+// To extract the words from a string, move a BREAK_WORD BreakIterator
+// through the string and test whether IsWord() is true. E.g.,
+// BreakIterator iter(&str, BreakIterator::BREAK_WORD);
+// if (!iter.Init()) return false;
+// while (iter.Advance()) {
+// if (iter.IsWord()) {
+// // region [iter.prev(),iter.pos()) contains a word.
+// VLOG(1) << "word: " << iter.GetString();
+// }
+// }
+
+namespace base {
+
+class BreakIterator {
+ public:
+ enum BreakType {
+ BREAK_WORD,
+ BREAK_SPACE,
+ BREAK_NEWLINE,
+ };
+
+ // Requires |str| to live as long as the BreakIterator does.
+ BreakIterator(const string16* str, BreakType break_type);
+ ~BreakIterator();
+
+ // Init() must be called before any of the iterators are valid.
+ // Returns false if ICU failed to initialize.
+ bool Init();
+
+ // Return the current break position within the string, or
+ // static_cast<size_t>(-1) (npos in break_iterator.cc) when done.
+ size_t pos() const { return pos_; }
+
+ // Return the value of pos() returned before Advance() was last called.
+ size_t prev() const { return prev_; }
+
+ // Advance to the next break. Returns false if we've run past the end of
+ // the string. (Note that the very last "break" is after the final
+ // character in the string, and when we advance to that position it's the
+ // last time Advance() returns true.)
+ bool Advance();
+
+ // Under BREAK_WORD mode, returns true if the break we just hit is the
+ // end of a word. (Otherwise, the break iterator just skipped over e.g.
+ // whitespace or punctuation.) Under BREAK_SPACE and BREAK_NEWLINE modes,
+ // this distinction doesn't apply and it always returns false.
+ bool IsWord() const;
+
+ // Return the string between prev() and pos().
+ // Advance() must have been called successfully at least once
+ // for pos() to have advanced to somewhere useful.
+ string16 GetString() const;
+
+ private:
+ // ICU iterator, avoiding ICU ubrk.h dependence.
+ // This is actually an ICU UBreakIterator* type, which turns out to be
+ // a typedef for a void* in the ICU headers. Using void* directly prevents
+ // callers from needing access to the ICU public headers directory.
+ void* iter_;
+
+ // The string we're iterating over.
+ const string16* string_;
+
+ // The breaking style (word/space/newline).
+ BreakType break_type_;
+
+ // Previous and current iterator positions.
+ size_t prev_, pos_;
+
+ DISALLOW_COPY_AND_ASSIGN(BreakIterator);
+};
+
+} // namespace base
+
+#endif // BASE_I18N_BREAK_ITERATOR_H_
diff --git a/base/i18n/break_iterator_unittest.cc b/base/i18n/break_iterator_unittest.cc
new file mode 100644
index 0000000..bf4fdc1
--- /dev/null
+++ b/base/i18n/break_iterator_unittest.cc
@@ -0,0 +1,308 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/i18n/break_iterator.h"
+
+#include "base/string_piece.h"
+#include "base/string_util.h"
+#include "base/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+TEST(BreakIteratorTest, BreakWordEmpty) {
+ string16 empty;
+ base::BreakIterator iter(&empty, base::BreakIterator::BREAK_WORD);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakWord) {
+ string16 space(UTF8ToUTF16(" "));
+ string16 str(UTF8ToUTF16(" foo bar! \npouet boom"));
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(space, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_TRUE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("foo"), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(space, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_TRUE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("bar"), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("!"), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(space, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("\n"), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_TRUE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("pouet"), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(space, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_TRUE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakWide16) {
+ // Two Greek words separated by space.
+ const string16 str(WideToUTF16(
+ L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
+ L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2"));
+ const string16 word1(str.substr(0, 10));
+ const string16 word2(str.substr(11, 5));
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_TRUE(iter.IsWord());
+ EXPECT_EQ(word1, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_TRUE(iter.IsWord());
+ EXPECT_EQ(word2, iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakWide32) {
+ // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
+ const char* very_wide_char = "\xF0\x9D\x92\x9C";
+ const string16 str(
+ UTF8ToUTF16(StringPrintf("%s a", very_wide_char)));
+ const string16 very_wide_word(str.substr(0, 2));
+
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_TRUE(iter.IsWord());
+ EXPECT_EQ(very_wide_word, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_TRUE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakSpaceEmpty) {
+ string16 empty;
+ base::BreakIterator iter(&empty, base::BreakIterator::BREAK_SPACE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakSpace) {
+ string16 str(UTF8ToUTF16(" foo bar! \npouet boom"));
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_SPACE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("foo "), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("bar! \n"), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("pouet "), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakSpaceSP) {
+ string16 str(UTF8ToUTF16(" foo bar! \npouet boom "));
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_SPACE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("foo "), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("bar! \n"), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("pouet "), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("boom "), iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakSpacekWide16) {
+ // Two Greek words.
+ const string16 str(WideToUTF16(
+ L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
+ L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2"));
+ const string16 word1(str.substr(0, 11));
+ const string16 word2(str.substr(11, 5));
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_SPACE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(word1, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(word2, iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakSpaceWide32) {
+ // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
+ const char* very_wide_char = "\xF0\x9D\x92\x9C";
+ const string16 str(
+ UTF8ToUTF16(StringPrintf("%s a", very_wide_char)));
+ const string16 very_wide_word(str.substr(0, 3));
+
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_SPACE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(very_wide_word, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakLineEmpty) {
+ string16 empty;
+ base::BreakIterator iter(&empty, base::BreakIterator::BREAK_NEWLINE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakLine) {
+ string16 nl(UTF8ToUTF16("\n"));
+ string16 str(UTF8ToUTF16("\nfoo bar!\n\npouet boom"));
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_NEWLINE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(nl, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("foo bar!\n"), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(nl, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("pouet boom"), iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakLineNL) {
+ string16 nl(UTF8ToUTF16("\n"));
+ string16 str(UTF8ToUTF16("\nfoo bar!\n\npouet boom\n"));
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_NEWLINE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(nl, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("foo bar!\n"), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(nl, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("pouet boom\n"), iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakLineWide16) {
+ // Two Greek words separated by newline.
+ const string16 str(WideToUTF16(
+ L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
+ L"\x03bf\x03c2\x000a\x0399\x03c3\x03c4\x03cc\x03c2"));
+ const string16 line1(str.substr(0, 11));
+ const string16 line2(str.substr(11, 5));
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_NEWLINE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(line1, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(line2, iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakLineWide32) {
+ // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
+ const char* very_wide_char = "\xF0\x9D\x92\x9C";
+ const string16 str(
+ UTF8ToUTF16(StringPrintf("%s\na", very_wide_char)));
+ const string16 very_wide_line(str.substr(0, 3));
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_NEWLINE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(very_wide_line, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
diff --git a/base/i18n/file_util_icu.cc b/base/i18n/file_util_icu.cc
index 0e9c2cd..34eefac 100644
--- a/base/i18n/file_util_icu.cc
+++ b/base/i18n/file_util_icu.cc
@@ -21,6 +21,10 @@ namespace {
class IllegalCharacters {
public:
+ static IllegalCharacters* GetInstance() {
+ return Singleton<IllegalCharacters>::get();
+ }
+
bool contains(UChar32 ucs4) {
return !!set->contains(ucs4);
}
@@ -76,19 +80,8 @@ IllegalCharacters::IllegalCharacters() {
class LocaleAwareComparator {
public:
- LocaleAwareComparator() {
- UErrorCode error_code = U_ZERO_ERROR;
- // Use the default collator. The default locale should have been properly
- // set by the time this constructor is called.
- collator_.reset(icu::Collator::createInstance(error_code));
- DCHECK(U_SUCCESS(error_code));
- // Make it case-sensitive.
- collator_->setStrength(icu::Collator::TERTIARY);
- // Note: We do not set UCOL_NORMALIZATION_MODE attribute. In other words, we
- // do not pay performance penalty to guarantee sort order correctness for
- // non-FCD (http://unicode.org/notes/tn5/#FCD) file names. This should be a
- // reasonable tradeoff because such file names should be rare and the sort
- // order doesn't change much anyway.
+ static LocaleAwareComparator* GetInstance() {
+ return Singleton<LocaleAwareComparator>::get();
}
// Note: A similar function is available in l10n_util.
@@ -111,6 +104,21 @@ class LocaleAwareComparator {
}
private:
+ LocaleAwareComparator() {
+ UErrorCode error_code = U_ZERO_ERROR;
+ // Use the default collator. The default locale should have been properly
+ // set by the time this constructor is called.
+ collator_.reset(icu::Collator::createInstance(error_code));
+ DCHECK(U_SUCCESS(error_code));
+ // Make it case-sensitive.
+ collator_->setStrength(icu::Collator::TERTIARY);
+ // Note: We do not set UCOL_NORMALIZATION_MODE attribute. In other words, we
+ // do not pay performance penalty to guarantee sort order correctness for
+ // non-FCD (http://unicode.org/notes/tn5/#FCD) file names. This should be a
+ // reasonable tradeoff because such file names should be rare and the sort
+ // order doesn't change much anyway.
+ }
+
scoped_ptr<icu::Collator> collator_;
Lock lock_;
friend struct DefaultSingletonTraits<LocaleAwareComparator>;
@@ -123,19 +131,19 @@ class LocaleAwareComparator {
namespace file_util {
bool IsFilenameLegal(const string16& file_name) {
- return Singleton<IllegalCharacters>()->containsNone(file_name);
+ return IllegalCharacters::GetInstance()->containsNone(file_name);
}
void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,
char replace_char) {
DCHECK(file_name);
- DCHECK(!(Singleton<IllegalCharacters>()->contains(replace_char)));
+ DCHECK(!(IllegalCharacters::GetInstance()->contains(replace_char)));
// Remove leading and trailing whitespace.
TrimWhitespace(*file_name, TRIM_ALL, file_name);
- IllegalCharacters* illegal = Singleton<IllegalCharacters>::get();
+ IllegalCharacters* illegal = IllegalCharacters::GetInstance();
int cursor = 0; // The ICU macros expect an int.
while (cursor < static_cast<int>(file_name->size())) {
int char_begin = cursor;
@@ -171,8 +179,8 @@ void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,
bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) {
#if defined(OS_WIN)
- return Singleton<LocaleAwareComparator>()->Compare(a.value().c_str(),
- b.value().c_str()) < 0;
+ return LocaleAwareComparator::GetInstance()->Compare(a.value().c_str(),
+ b.value().c_str()) < 0;
#elif defined(OS_POSIX)
// On linux, the file system encoding is not defined. We assume
@@ -181,7 +189,7 @@ bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) {
// ICU's collator can take strings in OS native encoding. But we convert the
// strings to UTF-16 ourselves to ensure conversion consistency.
// TODO(yuzo): Perhaps we should define SysNativeMBToUTF16?
- return Singleton<LocaleAwareComparator>()->Compare(
+ return LocaleAwareComparator::GetInstance()->Compare(
WideToUTF16(base::SysNativeMBToWide(a.value().c_str())),
WideToUTF16(base::SysNativeMBToWide(b.value().c_str()))) < 0;
#else
diff --git a/base/i18n/number_formatting.cc b/base/i18n/number_formatting.cc
index 7a69294..df6af14 100644
--- a/base/i18n/number_formatting.cc
+++ b/base/i18n/number_formatting.cc
@@ -6,7 +6,8 @@
#include "base/format_macros.h"
#include "base/logging.h"
-#include "base/singleton.h"
+#include "base/lazy_instance.h"
+#include "base/scoped_ptr.h"
#include "base/string_util.h"
#include "base/utf_string_conversions.h"
#include "unicode/numfmt.h"
@@ -16,25 +17,26 @@ namespace base {
namespace {
-struct NumberFormatSingletonTraits
- : public DefaultSingletonTraits<icu::NumberFormat> {
- static icu::NumberFormat* New() {
+struct NumberFormatWrapper {
+ NumberFormatWrapper() {
+ // There's no ICU call to destroy a NumberFormat object other than
+ // operator delete, so use the default Delete, which calls operator delete.
+ // This can cause problems if a different allocator is used by this file
+ // than by ICU.
UErrorCode status = U_ZERO_ERROR;
- icu::NumberFormat* formatter = icu::NumberFormat::createInstance(status);
+ number_format.reset(icu::NumberFormat::createInstance(status));
DCHECK(U_SUCCESS(status));
- return formatter;
}
- // There's no ICU call to destroy a NumberFormat object other than
- // operator delete, so use the default Delete, which calls operator delete.
- // This can cause problems if a different allocator is used by this file than
- // by ICU.
+
+ scoped_ptr<icu::NumberFormat> number_format;
};
} // namespace
+static LazyInstance<NumberFormatWrapper> g_number_format(LINKER_INITIALIZED);
+
string16 FormatNumber(int64 number) {
- icu::NumberFormat* number_format =
- Singleton<icu::NumberFormat, NumberFormatSingletonTraits>::get();
+ icu::NumberFormat* number_format = g_number_format.Get().number_format.get();
if (!number_format) {
// As a fallback, just return the raw number in a string.
diff --git a/base/i18n/rtl.cc b/base/i18n/rtl.cc
index 6a5d293..12b376d 100644
--- a/base/i18n/rtl.cc
+++ b/base/i18n/rtl.cc
@@ -163,6 +163,7 @@ TextDirection GetFirstStrongCharacterDirection(const std::wstring& text) {
}
#endif
+#if defined(OS_WIN)
bool AdjustStringForLocaleDirection(string16* text) {
if (!IsRTL() || text->empty())
return false;
@@ -177,6 +178,57 @@ bool AdjustStringForLocaleDirection(string16* text) {
return true;
}
+#else
+bool AdjustStringForLocaleDirection(string16* text) {
+ // On OS X & GTK the directionality of a label is determined by the first
+ // strongly directional character.
+ // However, we want to make sure that in an LTR-language-UI all strings are
+ // left aligned and vice versa.
+ // A problem can arise if we display a string which starts with user input.
+ // User input may be of the opposite directionality to the UI. So the whole
+ // string will be displayed in the opposite directionality, e.g. if we want to
+ // display in an LTR UI [such as US English]:
+ //
+ // EMAN_NOISNETXE is now installed.
+ //
+ // Since EXTENSION_NAME begins with a strong RTL char, the label's
+ // directionality will be set to RTL and the string will be displayed visually
+ // as:
+ //
+ // .is now installed EMAN_NOISNETXE
+ //
+ // In order to solve this issue, we prepend an LRM to the string. An LRM is a
+ // strongly directional LTR char.
+ // We also append an LRM at the end, which ensures that we're in an LTR
+ // context.
+
+ // Unlike Windows, Linux and OS X can correctly display RTL glyphs out of the
+ // box so there is no issue with displaying zero-width bidi control characters
+ // on any system. Thus no need for the !IsRTL() check here.
+ if (text->empty())
+ return false;
+
+ bool ui_direction_is_rtl = IsRTL();
+
+ bool has_rtl_chars = StringContainsStrongRTLChars(*text);
+ if (!ui_direction_is_rtl && has_rtl_chars) {
+ WrapStringWithRTLFormatting(text);
+ text->insert(0, 1, kLeftToRightMark);
+ text->push_back(kLeftToRightMark);
+ } else if (ui_direction_is_rtl && has_rtl_chars) {
+ WrapStringWithRTLFormatting(text);
+ text->insert(0, 1, kRightToLeftMark);
+ text->push_back(kRightToLeftMark);
+ } else if (ui_direction_is_rtl) {
+ WrapStringWithLTRFormatting(text);
+ text->insert(0, 1, kRightToLeftMark);
+ text->push_back(kRightToLeftMark);
+ }
+
+ return true;
+}
+
+#endif // !OS_WIN
#if defined(WCHAR_T_IS_UTF32)
bool AdjustStringForLocaleDirection(std::wstring* text) {
diff --git a/base/i18n/rtl.h b/base/i18n/rtl.h
index 82ac576..a75ed4f 100644
--- a/base/i18n/rtl.h
+++ b/base/i18n/rtl.h
@@ -84,6 +84,7 @@ TextDirection GetFirstStrongCharacterDirection(const std::wstring& text);
// string is always treated as a right-to-left string. This is done by
// inserting certain Unicode formatting marks into the returned string.
//
+// ** Notes about the Windows version of this function:
// TODO(idana) bug 6806: this function adjusts the string in question only
// if the current locale is right-to-left. The function does not take care of
// the opposite case (an RTL string displayed in an LTR context) since
diff --git a/base/i18n/time_formatting.cc b/base/i18n/time_formatting.cc
index 406145d..3fa984a 100644
--- a/base/i18n/time_formatting.cc
+++ b/base/i18n/time_formatting.cc
@@ -14,24 +14,21 @@ using base::Time;
namespace {
-std::wstring TimeFormat(const icu::DateFormat* formatter,
- const Time& time) {
+string16 TimeFormat(const icu::DateFormat* formatter,
+ const Time& time) {
DCHECK(formatter);
icu::UnicodeString date_string;
formatter->format(static_cast<UDate>(time.ToDoubleT() * 1000), date_string);
- std::wstring output;
- bool success = UTF16ToWide(date_string.getBuffer(), date_string.length(),
- &output);
- DCHECK(success);
- return output;
+ return string16(date_string.getBuffer(),
+ static_cast<size_t>(date_string.length()));
}
} // namespace
namespace base {
-std::wstring TimeFormatTimeOfDay(const Time& time) {
+string16 TimeFormatTimeOfDay(const Time& time) {
// We can omit the locale parameter because the default should match
// Chrome's application locale.
scoped_ptr<icu::DateFormat> formatter(
@@ -39,31 +36,31 @@ std::wstring TimeFormatTimeOfDay(const Time& time) {
return TimeFormat(formatter.get(), time);
}
-std::wstring TimeFormatShortDate(const Time& time) {
+string16 TimeFormatShortDate(const Time& time) {
scoped_ptr<icu::DateFormat> formatter(
icu::DateFormat::createDateInstance(icu::DateFormat::kMedium));
return TimeFormat(formatter.get(), time);
}
-std::wstring TimeFormatShortDateNumeric(const Time& time) {
+string16 TimeFormatShortDateNumeric(const Time& time) {
scoped_ptr<icu::DateFormat> formatter(
icu::DateFormat::createDateInstance(icu::DateFormat::kShort));
return TimeFormat(formatter.get(), time);
}
-std::wstring TimeFormatShortDateAndTime(const Time& time) {
+string16 TimeFormatShortDateAndTime(const Time& time) {
scoped_ptr<icu::DateFormat> formatter(
icu::DateFormat::createDateTimeInstance(icu::DateFormat::kShort));
return TimeFormat(formatter.get(), time);
}
-std::wstring TimeFormatFriendlyDateAndTime(const Time& time) {
+string16 TimeFormatFriendlyDateAndTime(const Time& time) {
scoped_ptr<icu::DateFormat> formatter(
icu::DateFormat::createDateTimeInstance(icu::DateFormat::kFull));
return TimeFormat(formatter.get(), time);
}
-std::wstring TimeFormatFriendlyDate(const Time& time) {
+string16 TimeFormatFriendlyDate(const Time& time) {
scoped_ptr<icu::DateFormat> formatter(icu::DateFormat::createDateInstance(
icu::DateFormat::kFull));
return TimeFormat(formatter.get(), time);
diff --git a/base/i18n/time_formatting.h b/base/i18n/time_formatting.h
index d78ae9b..e70ad3d 100644
--- a/base/i18n/time_formatting.h
+++ b/base/i18n/time_formatting.h
@@ -9,32 +9,32 @@
#define BASE_I18N_TIME_FORMATTING_H_
#pragma once
-#include <string>
+#include "base/string16.h"
namespace base {
class Time;
// Returns the time of day, e.g., "3:07 PM".
-std::wstring TimeFormatTimeOfDay(const Time& time);
+string16 TimeFormatTimeOfDay(const Time& time);
// Returns a shortened date, e.g. "Nov 7, 2007"
-std::wstring TimeFormatShortDate(const Time& time);
+string16 TimeFormatShortDate(const Time& time);
// Returns a numeric date such as 12/13/52.
-std::wstring TimeFormatShortDateNumeric(const Time& time);
+string16 TimeFormatShortDateNumeric(const Time& time);
// Returns a short-format date and time (the date-and-time counterpart of
// TimeFormatShortDateNumeric).
-std::wstring TimeFormatShortDateAndTime(const Time& time);
+string16 TimeFormatShortDateAndTime(const Time& time);
// Formats a time in a friendly sentence format, e.g.
// "Monday, March 6, 2008 2:44:30 PM".
-std::wstring TimeFormatFriendlyDateAndTime(const Time& time);
+string16 TimeFormatFriendlyDateAndTime(const Time& time);
// Formats a time in a friendly sentence format, e.g.
// "Monday, March 6, 2008".
-std::wstring TimeFormatFriendlyDate(const Time& time);
+string16 TimeFormatFriendlyDate(const Time& time);
} // namespace base
diff --git a/base/i18n/word_iterator.cc b/base/i18n/word_iterator.cc
deleted file mode 100644
index a9fa4af..0000000
--- a/base/i18n/word_iterator.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright (c) 2009 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "base/i18n/word_iterator.h"
-
-#include "base/logging.h"
-#include "unicode/ubrk.h"
-#include "unicode/ustring.h"
-
-const size_t npos = -1;
-
-WordIterator::WordIterator(const string16* str, BreakType break_type)
- : iter_(NULL),
- string_(str),
- break_type_(break_type),
- prev_(npos),
- pos_(0) {
-}
-
-WordIterator::~WordIterator() {
- if (iter_)
- ubrk_close(iter_);
-}
-
-bool WordIterator::Init() {
- UErrorCode status = U_ZERO_ERROR;
- UBreakIteratorType break_type;
- switch (break_type_) {
- case BREAK_WORD:
- break_type = UBRK_WORD;
- break;
- case BREAK_LINE:
- break_type = UBRK_LINE;
- break;
- default:
- NOTREACHED();
- break_type = UBRK_LINE;
- }
- iter_ = ubrk_open(break_type, NULL,
- string_->data(), static_cast<int32_t>(string_->size()),
- &status);
- if (U_FAILURE(status)) {
- NOTREACHED() << "ubrk_open failed";
- return false;
- }
- ubrk_first(iter_); // Move the iterator to the beginning of the string.
- return true;
-}
-
-bool WordIterator::Advance() {
- prev_ = pos_;
- const int32_t pos = ubrk_next(iter_);
- if (pos == UBRK_DONE) {
- pos_ = npos;
- return false;
- } else {
- pos_ = static_cast<size_t>(pos);
- return true;
- }
-}
-
-bool WordIterator::IsWord() const {
- return (ubrk_getRuleStatus(iter_) != UBRK_WORD_NONE);
-}
-
-string16 WordIterator::GetWord() const {
- DCHECK(prev_ != npos && pos_ != npos);
- return string_->substr(prev_, pos_ - prev_);
-}
diff --git a/base/i18n/word_iterator.h b/base/i18n/word_iterator.h
deleted file mode 100644
index b097bc2..0000000
--- a/base/i18n/word_iterator.h
+++ /dev/null
@@ -1,89 +0,0 @@
-// Copyright (c) 2010 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef BASE_I18N_WORD_ITERATOR_H_
-#define BASE_I18N_WORD_ITERATOR_H_
-#pragma once
-
-#include <vector>
-
-#include "unicode/ubrk.h"
-#include "unicode/uchar.h"
-
-#include "base/basictypes.h"
-#include "base/string16.h"
-
-// The WordIterator class iterates through the words and word breaks
-// in a string. (In the string " foo bar! ", the word breaks are at the
-// periods in ". .foo. .bar.!. .".)
-//
-// To extract the words from a string, move a WordIterator through the
-// string and test whether IsWord() is true. E.g.,
-// WordIterator iter(&str, WordIterator::BREAK_WORD);
-// if (!iter.Init()) return false;
-// while (iter.Advance()) {
-// if (iter.IsWord()) {
-// // region [iter.prev(),iter.pos()) contains a word.
-// VLOG(1) << "word: " << iter.GetWord();
-// }
-// }
-
-
-class WordIterator {
- public:
- enum BreakType {
- BREAK_WORD,
- BREAK_LINE
- };
-
- // Requires |str| to live as long as the WordIterator does.
- WordIterator(const string16* str, BreakType break_type);
- ~WordIterator();
-
- // Init() must be called before any of the iterators are valid.
- // Returns false if ICU failed to initialize.
- bool Init();
-
- // Return the current break position within the string,
- // or WordIterator::npos when done.
- size_t pos() const { return pos_; }
- // Return the value of pos() returned before Advance() was last called.
- size_t prev() const { return prev_; }
-
- // Advance to the next break. Returns false if we've run past the end of
- // the string. (Note that the very last "word break" is after the final
- // character in the string, and when we advance to that position it's the
- // last time Advance() returns true.)
- bool Advance();
-
- // Returns true if the break we just hit is the end of a word.
- // (Otherwise, the break iterator just skipped over e.g. whitespace
- // or punctuation.)
- bool IsWord() const;
-
- // Return the word between prev() and pos().
- // Advance() must have been called successfully at least once
- // for pos() to have advanced to somewhere useful.
- string16 GetWord() const;
-
- private:
- // ICU iterator.
- UBreakIterator* iter_;
-#if !defined(WCHAR_T_IS_UTF16)
- std::vector<UChar> chars_;
-#endif
-
- // The string we're iterating over.
- const string16* string_;
-
- // The breaking style (word/line).
- BreakType break_type_;
-
- // Previous and current iterator positions.
- size_t prev_, pos_;
-
- DISALLOW_COPY_AND_ASSIGN(WordIterator);
-};
-
-#endif // BASE_I18N_WORD_ITERATOR_H__
diff --git a/base/i18n/word_iterator_unittest.cc b/base/i18n/word_iterator_unittest.cc
deleted file mode 100644
index 92aff76..0000000
--- a/base/i18n/word_iterator_unittest.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-// Copyright (c) 2010 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "base/i18n/word_iterator.h"
-
-#include "base/string_piece.h"
-#include "base/string_util.h"
-#include "base/utf_string_conversions.h"
-#include "testing/gtest/include/gtest/gtest.h"
-
-TEST(WordIteratorTest, BreakWord) {
- string16 space(UTF8ToUTF16(" "));
-
- string16 str(UTF8ToUTF16(" foo bar! \npouet boom"));
- WordIterator iter(&str, WordIterator::BREAK_WORD);
- ASSERT_TRUE(iter.Init());
- EXPECT_TRUE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
- EXPECT_EQ(space, iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_TRUE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16("foo"), iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
- EXPECT_EQ(space, iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_TRUE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16("bar"), iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16("!"), iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
- EXPECT_EQ(space, iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16("\n"), iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_TRUE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16("pouet"), iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
- EXPECT_EQ(space, iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_TRUE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetWord());
- EXPECT_FALSE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
-}
-
-TEST(WordIteratorTest, BreakLine) {
- string16 str(UTF8ToUTF16(" foo bar! \npouet boom"));
- WordIterator iter(&str, WordIterator::BREAK_LINE);
- ASSERT_TRUE(iter.Init());
- EXPECT_TRUE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16(" "), iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16("foo "), iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_TRUE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16("bar! \n"), iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16("pouet "), iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetWord());
- EXPECT_FALSE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
-}
-
-TEST(WordIteratorTest, BreakWide16) {
- // "Παγκόσμιος Ιστός"
- const string16 str(WideToUTF16(
- L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
- L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2"));
- const string16 word1(str.substr(0, 10));
- const string16 word2(str.substr(11, 5));
- WordIterator iter(&str, WordIterator::BREAK_WORD);
- ASSERT_TRUE(iter.Init());
- EXPECT_TRUE(iter.Advance());
- EXPECT_TRUE(iter.IsWord());
- EXPECT_EQ(word1, iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16(" "), iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_TRUE(iter.IsWord());
- EXPECT_EQ(word2, iter.GetWord());
- EXPECT_FALSE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
-}
-
-TEST(WordIteratorTest, BreakWide32) {
- // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
- const char* very_wide_char = "\xF0\x9D\x92\x9C";
- const string16 str(
- UTF8ToUTF16(StringPrintf("%s a", very_wide_char)));
- const string16 very_wide_word(str.substr(0, 2));
-
- WordIterator iter(&str, WordIterator::BREAK_WORD);
- ASSERT_TRUE(iter.Init());
- EXPECT_TRUE(iter.Advance());
- EXPECT_TRUE(iter.IsWord());
- EXPECT_EQ(very_wide_word, iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16(" "), iter.GetWord());
- EXPECT_TRUE(iter.Advance());
- EXPECT_TRUE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16("a"), iter.GetWord());
- EXPECT_FALSE(iter.Advance());
- EXPECT_FALSE(iter.IsWord());
-}