From 866cd68e4fd9d4195026109b2e6d285b50334218 Mon Sep 17 00:00:00 2001 From: "georgey@chromium.org" Date: Fri, 27 May 2011 01:56:59 +0000 Subject: Change UTF-8 characters in constants into escape sequences BUG=none TEST=unit-tested Review URL: http://codereview.chromium.org/7077021 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@86950 0039d316-1c4b-4281-b951-d872f2087c98 --- third_party/libphonenumber/README.chromium | 3 +- .../libphonenumber/cpp/src/phonenumberutil.cc | 44 ++-- .../libphonenumber/cpp/src/phonenumberutil_test.cc | 36 +-- third_party/libphonenumber/patches/utf8_v186.patch | 251 +++++++++++++++++++++ 4 files changed, 293 insertions(+), 41 deletions(-) create mode 100644 third_party/libphonenumber/patches/utf8_v186.patch diff --git a/third_party/libphonenumber/README.chromium b/third_party/libphonenumber/README.chromium index 9cf38ee..608a0a5 100644 --- a/third_party/libphonenumber/README.chromium +++ b/third_party/libphonenumber/README.chromium @@ -22,7 +22,8 @@ Additional files, not in the original library: Until the changes are upstreamed library is included directly, with a patch in patches/version186.patch applied. The patch adds plugability to the regular expression engine, its RE2 (default) implementation, and the unit-test for the -changes. +changes. The second patch utf8_v186.patch is applied to allow compilation on +some multi-byte locales. The folders included in our repository for now are cpp/ diff --git a/third_party/libphonenumber/cpp/src/phonenumberutil.cc b/third_party/libphonenumber/cpp/src/phonenumberutil.cc index 75ef374..b0201f2 100644 --- a/third_party/libphonenumber/cpp/src/phonenumberutil.cc +++ b/third_party/libphonenumber/cpp/src/phonenumberutil.cc @@ -72,7 +72,7 @@ scoped_ptr > all_plus_number_grouping_symbols; // The kPlusSign signifies the international prefix. 
const char kPlusSign[] = "+"; -const char kPlusChars[] = "++"; +const char kPlusChars[] = "+\xEF\xBC\x8B"; scoped_ptr plus_chars_pattern; const char kRfc3966ExtnPrefix[] = ";ext="; @@ -88,7 +88,7 @@ scoped_ptr unique_international_prefix; // Digits accepted in phone numbers. // Both Arabic-Indic and Eastern Arabic-Indic are supported. -const char kValidDigits[] = "0-90-9٠-٩۰-۹"; +const char kValidDigits[] = "0-9\xEF\xBC\x90-\xEF\xBC\x99\xD9\xA0-\xD9\xA9\xDB\xB0-\xDB\xB9"; // We accept alpha characters in phone numbers, ASCII only. We store lower-case // here only since our regular expressions are case-insensitive. const char kValidAlpha[] = "a-z"; @@ -140,7 +140,7 @@ scoped_ptr unwanted_end_char_pattern; // itself. In emacs, you can use M-x unicode-what to query information about the // unicode character. const char kValidPunctuation[] = - "-x‐-―−ー--/  ​⁠ ()()[].\\[\\]/~⁓∼~"; + "-x\xE2\x80\x90-\xE2\x80\x95\xE2\x88\x92\xE3\x83\xBC\xEF\xBC\x8D-\xEF\xBC\x8F \xC2\xA0\xE2\x80\x8B\xE2\x81\xA0\xE3\x80\x80()\xEF\xBC\x88\xEF\xBC\x89\xEF\xBC\xBB\xEF\xBC\xBD.\\[\\]/~\xE2\x81\x93\xE2\x88\xBC\xEF\xBD\x9E"; // Regular expression of viable phone numbers. This is location independent. // Checks we have at least three leading digits, and only valid punctuation, @@ -454,7 +454,7 @@ char32 ToUnicodeCodepoint(const char* unicode_char) { // defined order. 
void CreateRegularExpressions() { unique_international_prefix.reset( - reg_exp::CreateRegularExpression("[\\d]+(?:[~⁓∼~][\\d]+)?")); + reg_exp::CreateRegularExpression("[\\d]+(?:[~\xE2\x81\x93\xE2\x88\xBC\xEF\xBD\x9E][\\d]+)?")); first_group_capturing_pattern.reset( reg_exp::CreateRegularExpression("(\\$1)")); carrier_code_pattern.reset( @@ -476,16 +476,16 @@ void CreateRegularExpressions() { StrCat("[", kPlusChars, "]*(?:[", kValidPunctuation, "]*[", kValidDigits, "]){3,}[", kValidAlpha, kValidPunctuation, kValidDigits, "]*"))); // Canonical-equivalence doesn't seem to be an option with RE2, so we allow - // two options for representing the ó - the character itself, and one in the + // two options for representing the \xC3\xB3 - the character itself, and one in the // unicode decomposed form with the combining acute accent. Note that there // are currently three capturing groups for the extension itself - if this // number is changed, MaybeStripExtension needs to be updated. const string capturing_extn_digits = StrCat("([", kValidDigits, "]{1,7})"); known_extn_patterns.reset(new string( StrCat(kRfc3966ExtnPrefix, capturing_extn_digits, "|" - "[  \\t,]*(?:ext(?:ensi(?:ó?|ó))?n?|extn?|[,xx##~~]|" - "int|int|anexo)" - "[:\\..]?[  \\t,-]*", capturing_extn_digits, "#?|" + "[ \xC2\xA0\\t,]*(?:ext(?:ensi(?:o\xCC\x81?|\xC3\xB3))?n?|\xEF\xBD\x85\xEF\xBD\x98\xEF\xBD\x94\xEF\xBD\x8E?|[,x\xEF\xBD\x98#\xEF\xBC\x83~\xEF\xBD\x9E]|" + "int|\xEF\xBD\x89\xEF\xBD\x8E\xEF\xBD\x94|anexo)" + "[:\\.\xEF\xBC\x8E]?[ \xC2\xA0\\t,-]*", capturing_extn_digits, "#?|" "[- ]+([", kValidDigits, "]{1,5})#"))); extn_pattern.reset(reg_exp::CreateRegularExpression( StrCat("(?i)(?:", *known_extn_patterns, ")$").c_str())); @@ -509,35 +509,35 @@ void InitializeStaticMapsAndSets() { all_plus_number_grouping_symbols->insert( make_pair(ToUnicodeCodepoint("-"), '-')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("-"), '-')); + make_pair(ToUnicodeCodepoint("\xEF\xBC\x8D"), '-')); 
all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("‐"), '-')); + make_pair(ToUnicodeCodepoint("\xE2\x80\x90"), '-')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("‑"), '-')); + make_pair(ToUnicodeCodepoint("\xE2\x80\x91"), '-')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("‒"), '-')); + make_pair(ToUnicodeCodepoint("\xE2\x80\x92"), '-')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("–"), '-')); + make_pair(ToUnicodeCodepoint("\xE2\x80\x93"), '-')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("—"), '-')); + make_pair(ToUnicodeCodepoint("\xE2\x80\x94"), '-')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("―"), '-')); + make_pair(ToUnicodeCodepoint("\xE2\x80\x95"), '-')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("−"), '-')); + make_pair(ToUnicodeCodepoint("\xE2\x88\x92"), '-')); all_plus_number_grouping_symbols->insert( make_pair(ToUnicodeCodepoint("/"), '/')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("/"), '/')); + make_pair(ToUnicodeCodepoint("\xEF\xBC\x8F"), '/')); all_plus_number_grouping_symbols->insert( make_pair(ToUnicodeCodepoint(" "), ' ')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint(" "), ' ')); + make_pair(ToUnicodeCodepoint("\xE3\x80\x80"), ' ')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("⁠"), ' ')); + make_pair(ToUnicodeCodepoint("\xE2\x81\xA0"), ' ')); all_plus_number_grouping_symbols->insert( make_pair(ToUnicodeCodepoint("."), '.')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("."), '.')); + make_pair(ToUnicodeCodepoint("\xEF\xBC\x8E"), '.')); // Only the upper-case letters are added here - the lower-case versions are // added programmatically. 
alpha_mappings->insert(make_pair(ToUnicodeCodepoint("A"), '2')); @@ -849,7 +849,7 @@ void PhoneNumberUtil::Format(const PhoneNumber& number, // Note here that all NANPA formatting rules are contained by US, so we use // that to format NANPA numbers. The same applies to Russian Fed regions - // rules are contained by Russia. French Indian Ocean country rules are - // contained by Réunion. + // contained by R\xC3\xA9union. string region_code; GetRegionCodeForCountryCode(country_calling_code, &region_code); if (!HasValidRegionCode(region_code, country_calling_code, @@ -1015,7 +1015,7 @@ void PhoneNumberUtil::FormatOutOfCountryCallingNumber( // For regions that share a country calling code, the country calling code // need not be dialled. This also applies when dialling within a region, so // this if clause covers both these cases. - // Technically this is the case for dialling from la Réunion to other + // Technically this is the case for dialling from la R\xC3\xA9union to other // overseas departments of France (French Guiana, Martinique, Guadeloupe), // but not vice versa - so we don't cover this edge case for now and for // those cases return the version including country calling code. diff --git a/third_party/libphonenumber/cpp/src/phonenumberutil_test.cc b/third_party/libphonenumber/cpp/src/phonenumberutil_test.cc index 3ef3fa7..9eadaf9 100644 --- a/third_party/libphonenumber/cpp/src/phonenumberutil_test.cc +++ b/third_party/libphonenumber/cpp/src/phonenumberutil_test.cc @@ -1101,11 +1101,11 @@ TEST_F(PhoneNumberUtilTest, ExtractPossibleNumber) { ExtractPossibleNumber("Tel:+800-345-600", &extracted_number); EXPECT_EQ("+800-345-600", extracted_number); // Should recognise wide digits as possible start values. 
- ExtractPossibleNumber("023", &extracted_number); - EXPECT_EQ("023", extracted_number); + ExtractPossibleNumber("\xEF\xBC\x90\xEF\xBC\x92\xEF\xBC\x93", &extracted_number); + EXPECT_EQ("\xEF\xBC\x90\xEF\xBC\x92\xEF\xBC\x93", extracted_number); // Dashes are not possible start values and should be removed. - ExtractPossibleNumber("Num-123", &extracted_number); - EXPECT_EQ("123", extracted_number); + ExtractPossibleNumber("Num-\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93", &extracted_number); + EXPECT_EQ("\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93", extracted_number); // If not possible number present, return empty string. ExtractPossibleNumber("Num-....", &extracted_number); EXPECT_EQ("", extracted_number); @@ -1119,7 +1119,7 @@ TEST_F(PhoneNumberUtilTest, ExtractPossibleNumber) { ExtractPossibleNumber("(650) 253-0000.", &extracted_number); EXPECT_EQ("650) 253-0000", extracted_number); // This case has a trailing RTL char. - ExtractPossibleNumber("(650) 253-0000‏", &extracted_number); + ExtractPossibleNumber("(650) 253-0000\xE2\x80\x8F", &extracted_number); EXPECT_EQ("650) 253-0000", extracted_number); } @@ -1163,7 +1163,7 @@ TEST_F(PhoneNumberUtilTest, IsValidForRegion) { // This number is no longer valid. EXPECT_FALSE(phone_util_.IsValidNumber(bs_number)); - // La Mayotte and Réunion use 'leadingDigits' to differentiate them. + // La Mayotte and R\xC3\xA9union use 'leadingDigits' to differentiate them. PhoneNumber re_number; re_number.set_country_code(262); re_number.set_national_number(262123456ULL); @@ -1631,13 +1631,13 @@ TEST_F(PhoneNumberUtilTest, IsViablePhoneNumber) { EXPECT_TRUE(IsViablePhoneNumber("0800-4-PIZZA")); // Only one or two digits before possible punctuation followed by more digits. // The punctuation used here is the unicode character u+3000. 
- EXPECT_TRUE(IsViablePhoneNumber("1 34")); - EXPECT_FALSE(IsViablePhoneNumber("1 3+4")); + EXPECT_TRUE(IsViablePhoneNumber("1\xE3\x80\x80" "34")); + EXPECT_FALSE(IsViablePhoneNumber("1\xE3\x80\x80" "3+4")); // Unicode variants of possible starting character and other allowed // punctuation/digits. - EXPECT_TRUE(IsViablePhoneNumber("(1) 3456789")); + EXPECT_TRUE(IsViablePhoneNumber("\xEF\xBC\x88" "1\xEF\xBC\x89\xE3\x80\x80" "3456789")); // Testing a leading + is okay. - EXPECT_TRUE(IsViablePhoneNumber("+1) 3456789")); + EXPECT_TRUE(IsViablePhoneNumber("+1\xEF\xBC\x89\xE3\x80\x80" "3456789")); } TEST_F(PhoneNumberUtilTest, NormaliseRemovePunctuation) { @@ -1659,13 +1659,13 @@ TEST_F(PhoneNumberUtilTest, NormaliseReplaceAlphaCharacters) { TEST_F(PhoneNumberUtilTest, NormaliseOtherDigits) { // The first digit is a full-width 2, the last digit is an Arabic-indic digit // 5. - string input_number("25٥"); + string input_number("\xEF\xBC\x92" "5\xD9\xA5"); Normalize(&input_number); static const string kExpectedOutput("255"); EXPECT_EQ(kExpectedOutput, input_number) << "Conversion did not correctly replace non-latin digits"; // The first digit is an Eastern-Arabic 5, the latter an Eastern-Arabic 0. - string eastern_arabic_input_number("۵2۰"); + string eastern_arabic_input_number("\xDB\xB5" "2\xDB\xB0"); Normalize(&eastern_arabic_input_number); static const string kExpectedOutput2("520"); EXPECT_EQ(kExpectedOutput2, eastern_arabic_input_number) @@ -2321,21 +2321,21 @@ TEST_F(PhoneNumberUtilTest, ParseWithInternationalPrefixes) { // Using a full-width plus sign. test_number.Clear(); EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, - phone_util_.Parse("+1 (650) 333-6000", + phone_util_.Parse("\xEF\xBC\x8B" "1 (650) 333-6000", RegionCode::SG(), &test_number)); EXPECT_EQ(us_number, test_number); // The whole number, including punctuation, is here represented in full-width // form. 
test_number.Clear(); EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, - phone_util_.Parse("+1 (650) 333-6000", + phone_util_.Parse("\xEF\xBC\x8B\xEF\xBC\x91\xE3\x80\x80\xEF\xBC\x88\xEF\xBC\x96\xEF\xBC\x95\xEF\xBC\x90\xEF\xBC\x89\xE3\x80\x80\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x8D\xEF\xBC\x96\xEF\xBC\x90\xEF\xBC\x90\xEF\xBC\x90", RegionCode::SG(), &test_number)); EXPECT_EQ(us_number, test_number); // Using the U+30FC dash. test_number.Clear(); EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, - phone_util_.Parse("+1 (650) 333ー6000", + phone_util_.Parse("\xEF\xBC\x8B\xEF\xBC\x91\xE3\x80\x80\xEF\xBC\x88\xEF\xBC\x96\xEF\xBC\x95\xEF\xBC\x90\xEF\xBC\x89\xE3\x80\x80\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x93\xE3\x83\xBC\xEF\xBC\x96\xEF\xBC\x90\xEF\xBC\x90\xEF\xBC\x90", RegionCode::SG(), &test_number)); EXPECT_EQ(us_number, test_number); } @@ -2575,7 +2575,7 @@ TEST_F(PhoneNumberUtilTest, ParseNumbersWithPlusWithNoRegion) { // Test with full-width plus. result_proto.Clear(); EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, - phone_util_.Parse("+64 3 331 6005", RegionCode::ZZ(), + phone_util_.Parse("\xEF\xBC\x8B" "64 3 331 6005", RegionCode::ZZ(), &result_proto)); EXPECT_EQ(nz_number, result_proto); // Test with normal plus but leading characters that need to be stripped. @@ -2733,7 +2733,7 @@ TEST_F(PhoneNumberUtilTest, ParseExtensions) { EXPECT_EQ(us_with_extension, test_number); test_number.Clear(); EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, - phone_util_.Parse("(800) 901-3355 ,extensión 7246433", + phone_util_.Parse("(800) 901-3355 ,extensi\xC3\xB3n 7246433", RegionCode::US(), &test_number)); EXPECT_EQ(us_with_extension, test_number); @@ -2741,7 +2741,7 @@ TEST_F(PhoneNumberUtilTest, ParseExtensions) { // Repeat with the small letter o with acute accent created by combining // characters. 
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, - phone_util_.Parse("(800) 901-3355 ,extensión 7246433", + phone_util_.Parse("(800) 901-3355 ,extensio\xCC\x81n 7246433", RegionCode::US(), &test_number)); EXPECT_EQ(us_with_extension, test_number); diff --git a/third_party/libphonenumber/patches/utf8_v186.patch b/third_party/libphonenumber/patches/utf8_v186.patch new file mode 100644 index 0000000..9545bab --- /dev/null +++ b/third_party/libphonenumber/patches/utf8_v186.patch @@ -0,0 +1,251 @@ +Index: D:/src/src/third_party/libphonenumber/cpp/src/phonenumberutil_test.cc +=================================================================== +--- phonenumberutil_test.cc (revision 186) ++++ phonenumberutil_test.cc (working copy) +@@ -1101,11 +1101,11 @@ + ExtractPossibleNumber("Tel:+800-345-600", &extracted_number); + EXPECT_EQ("+800-345-600", extracted_number); + // Should recognise wide digits as possible start values. +- ExtractPossibleNumber("023", &extracted_number); +- EXPECT_EQ("023", extracted_number); ++ ExtractPossibleNumber("\xEF\xBC\x90\xEF\xBC\x92\xEF\xBC\x93", &extracted_number); ++ EXPECT_EQ("\xEF\xBC\x90\xEF\xBC\x92\xEF\xBC\x93", extracted_number); + // Dashes are not possible start values and should be removed. +- ExtractPossibleNumber("Num-123", &extracted_number); +- EXPECT_EQ("123", extracted_number); ++ ExtractPossibleNumber("Num-\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93", &extracted_number); ++ EXPECT_EQ("\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93", extracted_number); + // If not possible number present, return empty string. + ExtractPossibleNumber("Num-....", &extracted_number); + EXPECT_EQ("", extracted_number); +@@ -1119,7 +1119,7 @@ + ExtractPossibleNumber("(650) 253-0000.", &extracted_number); + EXPECT_EQ("650) 253-0000", extracted_number); + // This case has a trailing RTL char. 
+- ExtractPossibleNumber("(650) 253-0000‏", &extracted_number); ++ ExtractPossibleNumber("(650) 253-0000\xE2\x80\x8F", &extracted_number); + EXPECT_EQ("650) 253-0000", extracted_number); + } + +@@ -1163,7 +1163,7 @@ + // This number is no longer valid. + EXPECT_FALSE(phone_util_.IsValidNumber(bs_number)); + +- // La Mayotte and Réunion use 'leadingDigits' to differentiate them. ++ // La Mayotte and R\xC3\xA9union use 'leadingDigits' to differentiate them. + PhoneNumber re_number; + re_number.set_country_code(262); + re_number.set_national_number(262123456ULL); +@@ -1631,13 +1631,13 @@ + EXPECT_TRUE(IsViablePhoneNumber("0800-4-PIZZA")); + // Only one or two digits before possible punctuation followed by more digits. + // The punctuation used here is the unicode character u+3000. +- EXPECT_TRUE(IsViablePhoneNumber("1 34")); +- EXPECT_FALSE(IsViablePhoneNumber("1 3+4")); ++ EXPECT_TRUE(IsViablePhoneNumber("1\xE3\x80\x80" "34")); ++ EXPECT_FALSE(IsViablePhoneNumber("1\xE3\x80\x80" "3+4")); + // Unicode variants of possible starting character and other allowed + // punctuation/digits. +- EXPECT_TRUE(IsViablePhoneNumber("(1) 3456789")); ++ EXPECT_TRUE(IsViablePhoneNumber("\xEF\xBC\x88" "1\xEF\xBC\x89\xE3\x80\x80" "3456789")); + // Testing a leading + is okay. +- EXPECT_TRUE(IsViablePhoneNumber("+1) 3456789")); ++ EXPECT_TRUE(IsViablePhoneNumber("+1\xEF\xBC\x89\xE3\x80\x80" "3456789")); + } + + TEST_F(PhoneNumberUtilTest, NormaliseRemovePunctuation) { +@@ -1659,13 +1659,13 @@ + TEST_F(PhoneNumberUtilTest, NormaliseOtherDigits) { + // The first digit is a full-width 2, the last digit is an Arabic-indic digit + // 5. +- string input_number("25٥"); ++ string input_number("\xEF\xBC\x92" "5\xD9\xA5"); + Normalize(&input_number); + static const string kExpectedOutput("255"); + EXPECT_EQ(kExpectedOutput, input_number) + << "Conversion did not correctly replace non-latin digits"; + // The first digit is an Eastern-Arabic 5, the latter an Eastern-Arabic 0. 
+- string eastern_arabic_input_number("۵2۰"); ++ string eastern_arabic_input_number("\xDB\xB5" "2\xDB\xB0"); + Normalize(&eastern_arabic_input_number); + static const string kExpectedOutput2("520"); + EXPECT_EQ(kExpectedOutput2, eastern_arabic_input_number) +@@ -2321,21 +2321,21 @@ + // Using a full-width plus sign. + test_number.Clear(); + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, +- phone_util_.Parse("+1 (650) 333-6000", ++ phone_util_.Parse("\xEF\xBC\x8B" "1 (650) 333-6000", + RegionCode::SG(), &test_number)); + EXPECT_EQ(us_number, test_number); + // The whole number, including punctuation, is here represented in full-width + // form. + test_number.Clear(); + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, +- phone_util_.Parse("+1 (650) 333-6000", ++ phone_util_.Parse("\xEF\xBC\x8B\xEF\xBC\x91\xE3\x80\x80\xEF\xBC\x88\xEF\xBC\x96\xEF\xBC\x95\xEF\xBC\x90\xEF\xBC\x89\xE3\x80\x80\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x8D\xEF\xBC\x96\xEF\xBC\x90\xEF\xBC\x90\xEF\xBC\x90", + RegionCode::SG(), &test_number)); + EXPECT_EQ(us_number, test_number); + + // Using the U+30FC dash. + test_number.Clear(); + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, +- phone_util_.Parse("+1 (650) 333ー6000", ++ phone_util_.Parse("\xEF\xBC\x8B\xEF\xBC\x91\xE3\x80\x80\xEF\xBC\x88\xEF\xBC\x96\xEF\xBC\x95\xEF\xBC\x90\xEF\xBC\x89\xE3\x80\x80\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x93\xE3\x83\xBC\xEF\xBC\x96\xEF\xBC\x90\xEF\xBC\x90\xEF\xBC\x90", + RegionCode::SG(), &test_number)); + EXPECT_EQ(us_number, test_number); + } +@@ -2575,7 +2575,7 @@ + // Test with full-width plus. + result_proto.Clear(); + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, +- phone_util_.Parse("+64 3 331 6005", RegionCode::ZZ(), ++ phone_util_.Parse("\xEF\xBC\x8B" "64 3 331 6005", RegionCode::ZZ(), + &result_proto)); + EXPECT_EQ(nz_number, result_proto); + // Test with normal plus but leading characters that need to be stripped. 
+@@ -2733,7 +2733,7 @@ + EXPECT_EQ(us_with_extension, test_number); + test_number.Clear(); + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, +- phone_util_.Parse("(800) 901-3355 ,extensión 7246433", ++ phone_util_.Parse("(800) 901-3355 ,extensi\xC3\xB3n 7246433", + RegionCode::US(), + &test_number)); + EXPECT_EQ(us_with_extension, test_number); +@@ -2741,7 +2741,7 @@ + // Repeat with the small letter o with acute accent created by combining + // characters. + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, +- phone_util_.Parse("(800) 901-3355 ,extensión 7246433", ++ phone_util_.Parse("(800) 901-3355 ,extensio\xCC\x81n 7246433", + RegionCode::US(), + &test_number)); + EXPECT_EQ(us_with_extension, test_number); +Index: D:/src/src/third_party/libphonenumber/cpp/src/phonenumberutil.cc +=================================================================== +--- phonenumberutil.cc (revision 186) ++++ phonenumberutil.cc (working copy) +@@ -72,7 +72,7 @@ + // The kPlusSign signifies the international prefix. + const char kPlusSign[] = "+"; + +-const char kPlusChars[] = "++"; ++const char kPlusChars[] = "+\xEF\xBC\x8B"; + scoped_ptr plus_chars_pattern; + + const char kRfc3966ExtnPrefix[] = ";ext="; +@@ -88,7 +88,7 @@ + + // Digits accepted in phone numbers. + // Both Arabic-Indic and Eastern Arabic-Indic are supported. +-const char kValidDigits[] = "0-90-9٠-٩۰-۹"; ++const char kValidDigits[] = "0-9\xEF\xBC\x90-\xEF\xBC\x99\xD9\xA0-\xD9\xA9\xDB\xB0-\xDB\xB9"; + // We accept alpha characters in phone numbers, ASCII only. We store lower-case + // here only since our regular expressions are case-insensitive. + const char kValidAlpha[] = "a-z"; +@@ -140,7 +140,7 @@ + // itself. In emacs, you can use M-x unicode-what to query information about the + // unicode character. 
+ const char kValidPunctuation[] = +- "-x‐-―−ー--/  ​⁠ ()()[].\\[\\]/~⁓∼~"; ++ "-x\xE2\x80\x90-\xE2\x80\x95\xE2\x88\x92\xE3\x83\xBC\xEF\xBC\x8D-\xEF\xBC\x8F \xC2\xA0\xE2\x80\x8B\xE2\x81\xA0\xE3\x80\x80()\xEF\xBC\x88\xEF\xBC\x89\xEF\xBC\xBB\xEF\xBC\xBD.\\[\\]/~\xE2\x81\x93\xE2\x88\xBC\xEF\xBD\x9E"; + + // Regular expression of viable phone numbers. This is location independent. + // Checks we have at least three leading digits, and only valid punctuation, +@@ -454,7 +454,7 @@ + // defined order. + void CreateRegularExpressions() { + unique_international_prefix.reset( +- reg_exp::CreateRegularExpression("[\\d]+(?:[~⁓∼~][\\d]+)?")); ++ reg_exp::CreateRegularExpression("[\\d]+(?:[~\xE2\x81\x93\xE2\x88\xBC\xEF\xBD\x9E][\\d]+)?")); + first_group_capturing_pattern.reset( + reg_exp::CreateRegularExpression("(\\$1)")); + carrier_code_pattern.reset( +@@ -476,16 +476,16 @@ + StrCat("[", kPlusChars, "]*(?:[", kValidPunctuation, "]*[", kValidDigits, + "]){3,}[", kValidAlpha, kValidPunctuation, kValidDigits, "]*"))); + // Canonical-equivalence doesn't seem to be an option with RE2, so we allow +- // two options for representing the ó - the character itself, and one in the ++ // two options for representing the \xC3\xB3 - the character itself, and one in the + // unicode decomposed form with the combining acute accent. Note that there + // are currently three capturing groups for the extension itself - if this + // number is changed, MaybeStripExtension needs to be updated. 
+ const string capturing_extn_digits = StrCat("([", kValidDigits, "]{1,7})"); + known_extn_patterns.reset(new string( + StrCat(kRfc3966ExtnPrefix, capturing_extn_digits, "|" +- "[  \\t,]*(?:ext(?:ensi(?:ó?|ó))?n?|extn?|[,xx##~~]|" +- "int|int|anexo)" +- "[:\\..]?[  \\t,-]*", capturing_extn_digits, "#?|" ++ "[ \xC2\xA0\\t,]*(?:ext(?:ensi(?:o\xCC\x81?|\xC3\xB3))?n?|\xEF\xBD\x85\xEF\xBD\x98\xEF\xBD\x94\xEF\xBD\x8E?|[,x\xEF\xBD\x98#\xEF\xBC\x83~\xEF\xBD\x9E]|" ++ "int|\xEF\xBD\x89\xEF\xBD\x8E\xEF\xBD\x94|anexo)" ++ "[:\\.\xEF\xBC\x8E]?[ \xC2\xA0\\t,-]*", capturing_extn_digits, "#?|" + "[- ]+([", kValidDigits, "]{1,5})#"))); + extn_pattern.reset(reg_exp::CreateRegularExpression( + StrCat("(?i)(?:", *known_extn_patterns, ")$").c_str())); +@@ -509,35 +509,35 @@ + all_plus_number_grouping_symbols->insert( + make_pair(ToUnicodeCodepoint("-"), '-')); + all_plus_number_grouping_symbols->insert( +- make_pair(ToUnicodeCodepoint("-"), '-')); ++ make_pair(ToUnicodeCodepoint("\xEF\xBC\x8D"), '-')); + all_plus_number_grouping_symbols->insert( +- make_pair(ToUnicodeCodepoint("‐"), '-')); ++ make_pair(ToUnicodeCodepoint("\xE2\x80\x90"), '-')); + all_plus_number_grouping_symbols->insert( +- make_pair(ToUnicodeCodepoint("‑"), '-')); ++ make_pair(ToUnicodeCodepoint("\xE2\x80\x91"), '-')); + all_plus_number_grouping_symbols->insert( +- make_pair(ToUnicodeCodepoint("‒"), '-')); ++ make_pair(ToUnicodeCodepoint("\xE2\x80\x92"), '-')); + all_plus_number_grouping_symbols->insert( +- make_pair(ToUnicodeCodepoint("–"), '-')); ++ make_pair(ToUnicodeCodepoint("\xE2\x80\x93"), '-')); + all_plus_number_grouping_symbols->insert( +- make_pair(ToUnicodeCodepoint("—"), '-')); ++ make_pair(ToUnicodeCodepoint("\xE2\x80\x94"), '-')); + all_plus_number_grouping_symbols->insert( +- make_pair(ToUnicodeCodepoint("―"), '-')); ++ make_pair(ToUnicodeCodepoint("\xE2\x80\x95"), '-')); + all_plus_number_grouping_symbols->insert( +- make_pair(ToUnicodeCodepoint("−"), '-')); ++ 
make_pair(ToUnicodeCodepoint("\xE2\x88\x92"), '-')); + all_plus_number_grouping_symbols->insert( + make_pair(ToUnicodeCodepoint("/"), '/')); + all_plus_number_grouping_symbols->insert( +- make_pair(ToUnicodeCodepoint("/"), '/')); ++ make_pair(ToUnicodeCodepoint("\xEF\xBC\x8F"), '/')); + all_plus_number_grouping_symbols->insert( + make_pair(ToUnicodeCodepoint(" "), ' ')); + all_plus_number_grouping_symbols->insert( +- make_pair(ToUnicodeCodepoint(" "), ' ')); ++ make_pair(ToUnicodeCodepoint("\xE3\x80\x80"), ' ')); + all_plus_number_grouping_symbols->insert( +- make_pair(ToUnicodeCodepoint("⁠"), ' ')); ++ make_pair(ToUnicodeCodepoint("\xE2\x81\xA0"), ' ')); + all_plus_number_grouping_symbols->insert( + make_pair(ToUnicodeCodepoint("."), '.')); + all_plus_number_grouping_symbols->insert( +- make_pair(ToUnicodeCodepoint("."), '.')); ++ make_pair(ToUnicodeCodepoint("\xEF\xBC\x8E"), '.')); + // Only the upper-case letters are added here - the lower-case versions are + // added programmatically. + alpha_mappings->insert(make_pair(ToUnicodeCodepoint("A"), '2')); +@@ -849,7 +849,7 @@ + // Note here that all NANPA formatting rules are contained by US, so we use + // that to format NANPA numbers. The same applies to Russian Fed regions - + // rules are contained by Russia. French Indian Ocean country rules are +- // contained by Réunion. ++ // contained by R\xC3\xA9union. + string region_code; + GetRegionCodeForCountryCode(country_calling_code, ®ion_code); + if (!HasValidRegionCode(region_code, country_calling_code, +@@ -1015,7 +1015,7 @@ + // For regions that share a country calling code, the country calling code + // need not be dialled. This also applies when dialling within a region, so + // this if clause covers both these cases. 
+- // Technically this is the case for dialling from la Réunion to other ++ // Technically this is the case for dialling from la R\xC3\xA9union to other + // overseas departments of France (French Guiana, Martinique, Guadeloupe), + // but not vice versa - so we don't cover this edge case for now and for + // those cases return the version including country calling code. -- cgit v1.1