diff options
| -rw-r--r-- | base/i18n/word_iterator.cc | 14 |
| -rw-r--r-- | base/i18n/word_iterator_unittest.cc | 49 |
2 files changed, 63 insertions, 0 deletions
diff --git a/base/i18n/word_iterator.cc b/base/i18n/word_iterator.cc
index 45a06b9..feb77eb 100644
--- a/base/i18n/word_iterator.cc
+++ b/base/i18n/word_iterator.cc
@@ -83,5 +83,19 @@ bool WordIterator::IsWord() const {
 
 std::wstring WordIterator::GetWord() const {
   DCHECK(prev_ != npos && pos_ != npos);
+#if defined(WCHAR_T_IS_UTF16)
   return string_.substr(prev_, pos_ - prev_);
+#else  // WCHAR_T_IS_UTF16
+  // See comment in Init(). If there are no surrogate pairs,
+  // |out_length| will be exactly |in_length|, if there are surrogate
+  // pairs it will be less than |in_length|.
+  int32_t out_length;
+  UErrorCode error = U_ZERO_ERROR;
+  const int32_t in_length = pos_ - prev_;
+  std::vector<std::wstring::value_type> out_buffer(in_length);
+  u_strToWCS(&out_buffer[0], in_length, &out_length,
+             &chars_[prev_], in_length, &error);
+  DCHECK_LE(out_length, in_length);
+  return std::wstring(&out_buffer[0], out_length);
+#endif
 }
diff --git a/base/i18n/word_iterator_unittest.cc b/base/i18n/word_iterator_unittest.cc
index 0d28370..d653e1d4 100644
--- a/base/i18n/word_iterator_unittest.cc
+++ b/base/i18n/word_iterator_unittest.cc
@@ -4,6 +4,9 @@
 
 #include "base/i18n/word_iterator.h"
 
+#include "base/string_piece.h"
+#include "base/string_util.h"
+#include "base/sys_string_conversions.h"
 #include "testing/gtest/include/gtest/gtest.h"
 
 TEST(WordIteratorTest, BreakWord) {
@@ -66,3 +69,49 @@ TEST(WordIteratorTest, BreakLine) {
   EXPECT_FALSE(iter.Advance());
   EXPECT_FALSE(iter.IsWord());
 }
+
+TEST(WordIteratorTest, BreakWide16) {
+  // "Παγκόσμιος Ιστός"
+  const std::wstring str(L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
+                         L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2");
+  const std::wstring word1(str.substr(0, 10));
+  const std::wstring word2(str.substr(11, 5));
+  WordIterator iter(str, WordIterator::BREAK_WORD);
+  ASSERT_TRUE(iter.Init());
+  EXPECT_TRUE(iter.Advance());
+  EXPECT_TRUE(iter.IsWord());
+  EXPECT_EQ(word1, iter.GetWord());
+  EXPECT_TRUE(iter.Advance());
+  EXPECT_FALSE(iter.IsWord());
+  EXPECT_EQ(L" ", iter.GetWord());
+  EXPECT_TRUE(iter.Advance());
+  EXPECT_TRUE(iter.IsWord());
+  EXPECT_EQ(word2, iter.GetWord());
+  EXPECT_FALSE(iter.Advance());
+  EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(WordIteratorTest, BreakWide32) {
+  // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
+  const char *very_wide_char = "\xF0\x9D\x92\x9C";
+  const std::wstring str(
+      base::SysUTF8ToWide(StringPrintf("%s a", very_wide_char)));
+#if defined(WCHAR_T_IS_UTF16)
+  const std::wstring very_wide_word(str.substr(0, 2));
+#elif defined(WCHAR_T_IS_UTF32)
+  const std::wstring very_wide_word(str.substr(0, 1));
+#endif
+  WordIterator iter(str, WordIterator::BREAK_WORD);
+  ASSERT_TRUE(iter.Init());
+  EXPECT_TRUE(iter.Advance());
+  EXPECT_TRUE(iter.IsWord());
+  EXPECT_EQ(very_wide_word, iter.GetWord());
+  EXPECT_TRUE(iter.Advance());
+  EXPECT_FALSE(iter.IsWord());
+  EXPECT_EQ(L" ", iter.GetWord());
+  EXPECT_TRUE(iter.Advance());
+  EXPECT_TRUE(iter.IsWord());
+  EXPECT_EQ(L"a", iter.GetWord());
+  EXPECT_FALSE(iter.Advance());
+  EXPECT_FALSE(iter.IsWord());
+}
