From 866cd68e4fd9d4195026109b2e6d285b50334218 Mon Sep 17 00:00:00 2001
From: "georgey@chromium.org"
 <georgey@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>
Date: Fri, 27 May 2011 01:56:59 +0000
Subject: Change UTF-8 charcters in constants into escape sequences BUG=none
 TEST=unit-tested Review URL: http://codereview.chromium.org/7077021

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@86950 0039d316-1c4b-4281-b951-d872f2087c98
---
 third_party/libphonenumber/README.chromium         |   3 +-
 .../libphonenumber/cpp/src/phonenumberutil.cc      |  44 ++--
 .../libphonenumber/cpp/src/phonenumberutil_test.cc |  36 +--
 third_party/libphonenumber/patches/utf8_v186.patch | 251 +++++++++++++++++++++
 4 files changed, 293 insertions(+), 41 deletions(-)
 create mode 100644 third_party/libphonenumber/patches/utf8_v186.patch
diff --git a/third_party/libphonenumber/README.chromium b/third_party/libphonenumber/README.chromium
index 9cf38ee..608a0a5 100644
--- a/third_party/libphonenumber/README.chromium
+++ b/third_party/libphonenumber/README.chromium
@@ -22,7 +22,8 @@ Additional files, not in the original library:
 Until the changes are upstreamed library is included directly, with a patch
 in patches/version186.patch applied. The patch adds plugability to the regular
 expression engine, its RE2 (default) implementation, and the unit-test for the
-changes.
+changes. The second patch utf8_v186.patch is applied to allow compilation on
+some multi-byte locales.
 
 The folders included in our repository for now are
   cpp/
diff --git a/third_party/libphonenumber/cpp/src/phonenumberutil.cc b/third_party/libphonenumber/cpp/src/phonenumberutil.cc
index 75ef374..b0201f2 100644
--- a/third_party/libphonenumber/cpp/src/phonenumberutil.cc
+++ b/third_party/libphonenumber/cpp/src/phonenumberutil.cc
@@ -72,7 +72,7 @@ scoped_ptr<map<char32, char> > all_plus_number_grouping_symbols;
 // The kPlusSign signifies the international prefix.
 const char kPlusSign[] = "+";
 
-const char kPlusChars[] = "+＋";
+const char kPlusChars[] = "+\xEF\xBC\x8B";
 scoped_ptr<const reg_exp::RegularExpression> plus_chars_pattern;
 
 const char kRfc3966ExtnPrefix[] = ";ext=";
@@ -88,7 +88,7 @@ scoped_ptr<const reg_exp::RegularExpression> unique_international_prefix;
 
 // Digits accepted in phone numbers.
 // Both Arabic-Indic and Eastern Arabic-Indic are supported.
-const char kValidDigits[] = "0-9０-９٠-٩۰-۹";
+const char kValidDigits[] = "0-9\xEF\xBC\x90-\xEF\xBC\x99\xD9\xA0-\xD9\xA9\xDB\xB0-\xDB\xB9";
 // We accept alpha characters in phone numbers, ASCII only. We store lower-case
 // here only since our regular expressions are case-insensitive.
 const char kValidAlpha[] = "a-z";
@@ -140,7 +140,7 @@ scoped_ptr<const reg_exp::RegularExpression> unwanted_end_char_pattern;
 // itself. In emacs, you can use M-x unicode-what to query information about the
 // unicode character.
 const char kValidPunctuation[] =
-    "-x‐-―−ー－-／  ​⁠　()（）［］.\\[\\]/~⁓∼～";
+    "-x\xE2\x80\x90-\xE2\x80\x95\xE2\x88\x92\xE3\x83\xBC\xEF\xBC\x8D-\xEF\xBC\x8F \xC2\xA0\xE2\x80\x8B\xE2\x81\xA0\xE3\x80\x80()\xEF\xBC\x88\xEF\xBC\x89\xEF\xBC\xBB\xEF\xBC\xBD.\\[\\]/~\xE2\x81\x93\xE2\x88\xBC\xEF\xBD\x9E";
 
 // Regular expression of viable phone numbers. This is location independent.
 // Checks we have at least three leading digits, and only valid punctuation,
@@ -454,7 +454,7 @@ char32 ToUnicodeCodepoint(const char* unicode_char) {
 // defined order.
 void CreateRegularExpressions() {
   unique_international_prefix.reset(
-      reg_exp::CreateRegularExpression("[\\d]+(?:[~⁓∼～][\\d]+)?"));
+      reg_exp::CreateRegularExpression("[\\d]+(?:[~\xE2\x81\x93\xE2\x88\xBC\xEF\xBD\x9E][\\d]+)?"));
   first_group_capturing_pattern.reset(
       reg_exp::CreateRegularExpression("(\\$1)"));
   carrier_code_pattern.reset(
@@ -476,16 +476,16 @@ void CreateRegularExpressions() {
       StrCat("[", kPlusChars, "]*(?:[", kValidPunctuation, "]*[", kValidDigits,
              "]){3,}[", kValidAlpha, kValidPunctuation, kValidDigits, "]*")));
   // Canonical-equivalence doesn't seem to be an option with RE2, so we allow
-  // two options for representing the ó - the character itself, and one in the
+  // two options for representing the \xC3\xB3 - the character itself, and one in the
   // unicode decomposed form with the combining acute accent. Note that there
   // are currently three capturing groups for the extension itself - if this
   // number is changed, MaybeStripExtension needs to be updated.
   const string capturing_extn_digits = StrCat("([", kValidDigits, "]{1,7})");
   known_extn_patterns.reset(new string(
       StrCat(kRfc3966ExtnPrefix, capturing_extn_digits, "|"
-             "[  \\t,]*(?:ext(?:ensi(?:ó?|ó))?n?|ｅｘｔｎ?|[,xｘ#＃~～]|"
-             "int|ｉｎｔ|anexo)"
-             "[:\\.．]?[  \\t,-]*", capturing_extn_digits, "#?|"
+             "[ \xC2\xA0\\t,]*(?:ext(?:ensi(?:o\xCC\x81?|\xC3\xB3))?n?|\xEF\xBD\x85\xEF\xBD\x98\xEF\xBD\x94\xEF\xBD\x8E?|[,x\xEF\xBD\x98#\xEF\xBC\x83~\xEF\xBD\x9E]|"
+             "int|\xEF\xBD\x89\xEF\xBD\x8E\xEF\xBD\x94|anexo)"
+             "[:\\.\xEF\xBC\x8E]?[ \xC2\xA0\\t,-]*", capturing_extn_digits, "#?|"
              "[- ]+([", kValidDigits, "]{1,5})#")));
   extn_pattern.reset(reg_exp::CreateRegularExpression(
       StrCat("(?i)(?:", *known_extn_patterns, ")$").c_str()));
@@ -509,35 +509,35 @@ void InitializeStaticMapsAndSets() {
   all_plus_number_grouping_symbols->insert(
       make_pair(ToUnicodeCodepoint("-"), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("－"), '-'));
+      make_pair(ToUnicodeCodepoint("\xEF\xBC\x8D"), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("‐"), '-'));
+      make_pair(ToUnicodeCodepoint("\xE2\x80\x90"), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("‑"), '-'));
+      make_pair(ToUnicodeCodepoint("\xE2\x80\x91"), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("‒"), '-'));
+      make_pair(ToUnicodeCodepoint("\xE2\x80\x92"), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("–"), '-'));
+      make_pair(ToUnicodeCodepoint("\xE2\x80\x93"), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("—"), '-'));
+      make_pair(ToUnicodeCodepoint("\xE2\x80\x94"), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("―"), '-'));
+      make_pair(ToUnicodeCodepoint("\xE2\x80\x95"), '-'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("−"), '-'));
+      make_pair(ToUnicodeCodepoint("\xE2\x88\x92"), '-'));
   all_plus_number_grouping_symbols->insert(
       make_pair(ToUnicodeCodepoint("/"), '/'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("／"), '/'));
+      make_pair(ToUnicodeCodepoint("\xEF\xBC\x8F"), '/'));
   all_plus_number_grouping_symbols->insert(
       make_pair(ToUnicodeCodepoint(" "), ' '));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("　"), ' '));
+      make_pair(ToUnicodeCodepoint("\xE3\x80\x80"), ' '));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("⁠"), ' '));
+      make_pair(ToUnicodeCodepoint("\xE2\x81\xA0"), ' '));
   all_plus_number_grouping_symbols->insert(
       make_pair(ToUnicodeCodepoint("."), '.'));
   all_plus_number_grouping_symbols->insert(
-      make_pair(ToUnicodeCodepoint("．"), '.'));
+      make_pair(ToUnicodeCodepoint("\xEF\xBC\x8E"), '.'));
   // Only the upper-case letters are added here - the lower-case versions are
   // added programmatically.
   alpha_mappings->insert(make_pair(ToUnicodeCodepoint("A"), '2'));
@@ -849,7 +849,7 @@ void PhoneNumberUtil::Format(const PhoneNumber& number,
   // Note here that all NANPA formatting rules are contained by US, so we use
   // that to format NANPA numbers. The same applies to Russian Fed regions -
   // rules are contained by Russia. French Indian Ocean country rules are
-  // contained by Réunion.
+  // contained by R\xC3\xA9union.
   string region_code;
   GetRegionCodeForCountryCode(country_calling_code, &region_code);
   if (!HasValidRegionCode(region_code, country_calling_code,
@@ -1015,7 +1015,7 @@ void PhoneNumberUtil::FormatOutOfCountryCallingNumber(
     // For regions that share a country calling code, the country calling code
     // need not be dialled. This also applies when dialling within a region, so
     // this if clause covers both these cases.
-    // Technically this is the case for dialling from la Réunion to other
+    // Technically this is the case for dialling from la R\xC3\xA9union to other
     // overseas departments of France (French Guiana, Martinique, Guadeloupe),
     // but not vice versa - so we don't cover this edge case for now and for
     // those cases return the version including country calling code.
diff --git a/third_party/libphonenumber/cpp/src/phonenumberutil_test.cc b/third_party/libphonenumber/cpp/src/phonenumberutil_test.cc
index 3ef3fa7..9eadaf9 100644
--- a/third_party/libphonenumber/cpp/src/phonenumberutil_test.cc
+++ b/third_party/libphonenumber/cpp/src/phonenumberutil_test.cc
@@ -1101,11 +1101,11 @@ TEST_F(PhoneNumberUtilTest, ExtractPossibleNumber) {
   ExtractPossibleNumber("Tel:+800-345-600", &extracted_number);
   EXPECT_EQ("+800-345-600", extracted_number);
   // Should recognise wide digits as possible start values.
-  ExtractPossibleNumber("０２３", &extracted_number);
-  EXPECT_EQ("０２３", extracted_number);
+  ExtractPossibleNumber("\xEF\xBC\x90\xEF\xBC\x92\xEF\xBC\x93", &extracted_number);
+  EXPECT_EQ("\xEF\xBC\x90\xEF\xBC\x92\xEF\xBC\x93", extracted_number);
   // Dashes are not possible start values and should be removed.
-  ExtractPossibleNumber("Num-１２３", &extracted_number);
-  EXPECT_EQ("１２３", extracted_number);
+  ExtractPossibleNumber("Num-\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93", &extracted_number);
+  EXPECT_EQ("\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93", extracted_number);
   // If not possible number present, return empty string.
   ExtractPossibleNumber("Num-....", &extracted_number);
   EXPECT_EQ("", extracted_number);
@@ -1119,7 +1119,7 @@ TEST_F(PhoneNumberUtilTest, ExtractPossibleNumber) {
   ExtractPossibleNumber("(650) 253-0000.", &extracted_number);
   EXPECT_EQ("650) 253-0000", extracted_number);
   // This case has a trailing RTL char.
-  ExtractPossibleNumber("(650) 253-0000‏", &extracted_number);
+  ExtractPossibleNumber("(650) 253-0000\xE2\x80\x8F", &extracted_number);
   EXPECT_EQ("650) 253-0000", extracted_number);
 }
 
@@ -1163,7 +1163,7 @@ TEST_F(PhoneNumberUtilTest, IsValidForRegion) {
   // This number is no longer valid.
   EXPECT_FALSE(phone_util_.IsValidNumber(bs_number));
 
-  // La Mayotte and Réunion use 'leadingDigits' to differentiate them.
+  // La Mayotte and R\xC3\xA9union use 'leadingDigits' to differentiate them.
   PhoneNumber re_number;
   re_number.set_country_code(262);
   re_number.set_national_number(262123456ULL);
@@ -1631,13 +1631,13 @@ TEST_F(PhoneNumberUtilTest, IsViablePhoneNumber) {
   EXPECT_TRUE(IsViablePhoneNumber("0800-4-PIZZA"));
   // Only one or two digits before possible punctuation followed by more digits.
   // The punctuation used here is the unicode character u+3000.
-  EXPECT_TRUE(IsViablePhoneNumber("1　34"));
-  EXPECT_FALSE(IsViablePhoneNumber("1　3+4"));
+  EXPECT_TRUE(IsViablePhoneNumber("1\xE3\x80\x80" "34"));
+  EXPECT_FALSE(IsViablePhoneNumber("1\xE3\x80\x80" "3+4"));
   // Unicode variants of possible starting character and other allowed
   // punctuation/digits.
-  EXPECT_TRUE(IsViablePhoneNumber("（1）　3456789"));
+  EXPECT_TRUE(IsViablePhoneNumber("\xEF\xBC\x88" "1\xEF\xBC\x89\xE3\x80\x80" "3456789"));
   // Testing a leading + is okay.
-  EXPECT_TRUE(IsViablePhoneNumber("+1）　3456789"));
+  EXPECT_TRUE(IsViablePhoneNumber("+1\xEF\xBC\x89\xE3\x80\x80" "3456789"));
 }
 
 TEST_F(PhoneNumberUtilTest, NormaliseRemovePunctuation) {
@@ -1659,13 +1659,13 @@ TEST_F(PhoneNumberUtilTest, NormaliseReplaceAlphaCharacters) {
 TEST_F(PhoneNumberUtilTest, NormaliseOtherDigits) {
   // The first digit is a full-width 2, the last digit is an Arabic-indic digit
   // 5.
-  string input_number("２5٥");
+  string input_number("\xEF\xBC\x92" "5\xD9\xA5");
   Normalize(&input_number);
   static const string kExpectedOutput("255");
   EXPECT_EQ(kExpectedOutput, input_number)
       << "Conversion did not correctly replace non-latin digits";
   // The first digit is an Eastern-Arabic 5, the latter an Eastern-Arabic 0.
-  string eastern_arabic_input_number("۵2۰");
+  string eastern_arabic_input_number("\xDB\xB5" "2\xDB\xB0");
   Normalize(&eastern_arabic_input_number);
   static const string kExpectedOutput2("520");
   EXPECT_EQ(kExpectedOutput2, eastern_arabic_input_number)
@@ -2321,21 +2321,21 @@ TEST_F(PhoneNumberUtilTest, ParseWithInternationalPrefixes) {
   // Using a full-width plus sign.
   test_number.Clear();
   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
-            phone_util_.Parse("＋1 (650) 333-6000",
+            phone_util_.Parse("\xEF\xBC\x8B" "1 (650) 333-6000",
                               RegionCode::SG(), &test_number));
   EXPECT_EQ(us_number, test_number);
   // The whole number, including punctuation, is here represented in full-width
   // form.
   test_number.Clear();
   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
-            phone_util_.Parse("＋１　（６５０）　３３３－６０００",
+            phone_util_.Parse("\xEF\xBC\x8B\xEF\xBC\x91\xE3\x80\x80\xEF\xBC\x88\xEF\xBC\x96\xEF\xBC\x95\xEF\xBC\x90\xEF\xBC\x89\xE3\x80\x80\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x8D\xEF\xBC\x96\xEF\xBC\x90\xEF\xBC\x90\xEF\xBC\x90",
                               RegionCode::SG(), &test_number));
   EXPECT_EQ(us_number, test_number);
 
   // Using the U+30FC dash.
   test_number.Clear();
   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
-            phone_util_.Parse("＋１　（６５０）　３３３ー６０００",
+            phone_util_.Parse("\xEF\xBC\x8B\xEF\xBC\x91\xE3\x80\x80\xEF\xBC\x88\xEF\xBC\x96\xEF\xBC\x95\xEF\xBC\x90\xEF\xBC\x89\xE3\x80\x80\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x93\xE3\x83\xBC\xEF\xBC\x96\xEF\xBC\x90\xEF\xBC\x90\xEF\xBC\x90",
                               RegionCode::SG(), &test_number));
   EXPECT_EQ(us_number, test_number);
 }
@@ -2575,7 +2575,7 @@ TEST_F(PhoneNumberUtilTest, ParseNumbersWithPlusWithNoRegion) {
   // Test with full-width plus.
   result_proto.Clear();
   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
-            phone_util_.Parse("＋64 3 331 6005", RegionCode::ZZ(),
+            phone_util_.Parse("\xEF\xBC\x8B" "64 3 331 6005", RegionCode::ZZ(),
                               &result_proto));
   EXPECT_EQ(nz_number, result_proto);
   // Test with normal plus but leading characters that need to be stripped.
@@ -2733,7 +2733,7 @@ TEST_F(PhoneNumberUtilTest, ParseExtensions) {
   EXPECT_EQ(us_with_extension, test_number);
   test_number.Clear();
   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
-            phone_util_.Parse("(800) 901-3355 ,extensión 7246433",
+            phone_util_.Parse("(800) 901-3355 ,extensi\xC3\xB3n 7246433",
                               RegionCode::US(),
                               &test_number));
   EXPECT_EQ(us_with_extension, test_number);
@@ -2741,7 +2741,7 @@ TEST_F(PhoneNumberUtilTest, ParseExtensions) {
   // Repeat with the small letter o with acute accent created by combining
   // characters.
   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
-            phone_util_.Parse("(800) 901-3355 ,extensión 7246433",
+            phone_util_.Parse("(800) 901-3355 ,extensio\xCC\x81n 7246433",
                               RegionCode::US(),
                               &test_number));
   EXPECT_EQ(us_with_extension, test_number);
diff --git a/third_party/libphonenumber/patches/utf8_v186.patch b/third_party/libphonenumber/patches/utf8_v186.patch
new file mode 100644
index 0000000..9545bab
--- /dev/null
+++ b/third_party/libphonenumber/patches/utf8_v186.patch
@@ -0,0 +1,251 @@
+Index: D:/src/src/third_party/libphonenumber/cpp/src/phonenumberutil_test.cc
+===================================================================
+--- phonenumberutil_test.cc	(revision 186)
++++ phonenumberutil_test.cc	(working copy)
+@@ -1101,11 +1101,11 @@
+   ExtractPossibleNumber("Tel:+800-345-600", &extracted_number);
+   EXPECT_EQ("+800-345-600", extracted_number);
+   // Should recognise wide digits as possible start values.
+-  ExtractPossibleNumber("０２３", &extracted_number);
+-  EXPECT_EQ("０２３", extracted_number);
++  ExtractPossibleNumber("\xEF\xBC\x90\xEF\xBC\x92\xEF\xBC\x93", &extracted_number);
++  EXPECT_EQ("\xEF\xBC\x90\xEF\xBC\x92\xEF\xBC\x93", extracted_number);
+   // Dashes are not possible start values and should be removed.
+-  ExtractPossibleNumber("Num-１２３", &extracted_number);
+-  EXPECT_EQ("１２３", extracted_number);
++  ExtractPossibleNumber("Num-\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93", &extracted_number);
++  EXPECT_EQ("\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93", extracted_number);
+   // If not possible number present, return empty string.
+   ExtractPossibleNumber("Num-....", &extracted_number);
+   EXPECT_EQ("", extracted_number);
+@@ -1119,7 +1119,7 @@
+   ExtractPossibleNumber("(650) 253-0000.", &extracted_number);
+   EXPECT_EQ("650) 253-0000", extracted_number);
+   // This case has a trailing RTL char.
+-  ExtractPossibleNumber("(650) 253-0000‏", &extracted_number);
++  ExtractPossibleNumber("(650) 253-0000\xE2\x80\x8F", &extracted_number);
+   EXPECT_EQ("650) 253-0000", extracted_number);
+ }
+ 
+@@ -1163,7 +1163,7 @@
+   // This number is no longer valid.
+   EXPECT_FALSE(phone_util_.IsValidNumber(bs_number));
+ 
+-  // La Mayotte and Réunion use 'leadingDigits' to differentiate them.
++  // La Mayotte and R\xC3\xA9union use 'leadingDigits' to differentiate them.
+   PhoneNumber re_number;
+   re_number.set_country_code(262);
+   re_number.set_national_number(262123456ULL);
+@@ -1631,13 +1631,13 @@
+   EXPECT_TRUE(IsViablePhoneNumber("0800-4-PIZZA"));
+   // Only one or two digits before possible punctuation followed by more digits.
+   // The punctuation used here is the unicode character u+3000.
+-  EXPECT_TRUE(IsViablePhoneNumber("1　34"));
+-  EXPECT_FALSE(IsViablePhoneNumber("1　3+4"));
++  EXPECT_TRUE(IsViablePhoneNumber("1\xE3\x80\x80" "34"));
++  EXPECT_FALSE(IsViablePhoneNumber("1\xE3\x80\x80" "3+4"));
+   // Unicode variants of possible starting character and other allowed
+   // punctuation/digits.
+-  EXPECT_TRUE(IsViablePhoneNumber("（1）　3456789"));
++  EXPECT_TRUE(IsViablePhoneNumber("\xEF\xBC\x88" "1\xEF\xBC\x89\xE3\x80\x80" "3456789"));
+   // Testing a leading + is okay.
+-  EXPECT_TRUE(IsViablePhoneNumber("+1）　3456789"));
++  EXPECT_TRUE(IsViablePhoneNumber("+1\xEF\xBC\x89\xE3\x80\x80" "3456789"));
+ }
+ 
+ TEST_F(PhoneNumberUtilTest, NormaliseRemovePunctuation) {
+@@ -1659,13 +1659,13 @@
+ TEST_F(PhoneNumberUtilTest, NormaliseOtherDigits) {
+   // The first digit is a full-width 2, the last digit is an Arabic-indic digit
+   // 5.
+-  string input_number("２5٥");
++  string input_number("\xEF\xBC\x92" "5\xD9\xA5");
+   Normalize(&input_number);
+   static const string kExpectedOutput("255");
+   EXPECT_EQ(kExpectedOutput, input_number)
+       << "Conversion did not correctly replace non-latin digits";
+   // The first digit is an Eastern-Arabic 5, the latter an Eastern-Arabic 0.
+-  string eastern_arabic_input_number("۵2۰");
++  string eastern_arabic_input_number("\xDB\xB5" "2\xDB\xB0");
+   Normalize(&eastern_arabic_input_number);
+   static const string kExpectedOutput2("520");
+   EXPECT_EQ(kExpectedOutput2, eastern_arabic_input_number)
+@@ -2321,21 +2321,21 @@
+   // Using a full-width plus sign.
+   test_number.Clear();
+   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
+-            phone_util_.Parse("＋1 (650) 333-6000",
++            phone_util_.Parse("\xEF\xBC\x8B" "1 (650) 333-6000",
+                               RegionCode::SG(), &test_number));
+   EXPECT_EQ(us_number, test_number);
+   // The whole number, including punctuation, is here represented in full-width
+   // form.
+   test_number.Clear();
+   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
+-            phone_util_.Parse("＋１　（６５０）　３３３－６０００",
++            phone_util_.Parse("\xEF\xBC\x8B\xEF\xBC\x91\xE3\x80\x80\xEF\xBC\x88\xEF\xBC\x96\xEF\xBC\x95\xEF\xBC\x90\xEF\xBC\x89\xE3\x80\x80\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x8D\xEF\xBC\x96\xEF\xBC\x90\xEF\xBC\x90\xEF\xBC\x90",
+                               RegionCode::SG(), &test_number));
+   EXPECT_EQ(us_number, test_number);
+ 
+   // Using the U+30FC dash.
+   test_number.Clear();
+   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
+-            phone_util_.Parse("＋１　（６５０）　３３３ー６０００",
++            phone_util_.Parse("\xEF\xBC\x8B\xEF\xBC\x91\xE3\x80\x80\xEF\xBC\x88\xEF\xBC\x96\xEF\xBC\x95\xEF\xBC\x90\xEF\xBC\x89\xE3\x80\x80\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x93\xE3\x83\xBC\xEF\xBC\x96\xEF\xBC\x90\xEF\xBC\x90\xEF\xBC\x90",
+                               RegionCode::SG(), &test_number));
+   EXPECT_EQ(us_number, test_number);
+ }
+@@ -2575,7 +2575,7 @@
+   // Test with full-width plus.
+   result_proto.Clear();
+   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
+-            phone_util_.Parse("＋64 3 331 6005", RegionCode::ZZ(),
++            phone_util_.Parse("\xEF\xBC\x8B" "64 3 331 6005", RegionCode::ZZ(),
+                               &result_proto));
+   EXPECT_EQ(nz_number, result_proto);
+   // Test with normal plus but leading characters that need to be stripped.
+@@ -2733,7 +2733,7 @@
+   EXPECT_EQ(us_with_extension, test_number);
+   test_number.Clear();
+   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
+-            phone_util_.Parse("(800) 901-3355 ,extensión 7246433",
++            phone_util_.Parse("(800) 901-3355 ,extensi\xC3\xB3n 7246433",
+                               RegionCode::US(),
+                               &test_number));
+   EXPECT_EQ(us_with_extension, test_number);
+@@ -2741,7 +2741,7 @@
+   // Repeat with the small letter o with acute accent created by combining
+   // characters.
+   EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
+-            phone_util_.Parse("(800) 901-3355 ,extensión 7246433",
++            phone_util_.Parse("(800) 901-3355 ,extensio\xCC\x81n 7246433",
+                               RegionCode::US(),
+                               &test_number));
+   EXPECT_EQ(us_with_extension, test_number);
+Index: D:/src/src/third_party/libphonenumber/cpp/src/phonenumberutil.cc
+===================================================================
+--- phonenumberutil.cc	(revision 186)
++++ phonenumberutil.cc	(working copy)
+@@ -72,7 +72,7 @@
+ // The kPlusSign signifies the international prefix.
+ const char kPlusSign[] = "+";
+ 
+-const char kPlusChars[] = "+＋";
++const char kPlusChars[] = "+\xEF\xBC\x8B";
+ scoped_ptr<const reg_exp::RegularExpression> plus_chars_pattern;
+ 
+ const char kRfc3966ExtnPrefix[] = ";ext=";
+@@ -88,7 +88,7 @@
+ 
+ // Digits accepted in phone numbers.
+ // Both Arabic-Indic and Eastern Arabic-Indic are supported.
+-const char kValidDigits[] = "0-9０-９٠-٩۰-۹";
++const char kValidDigits[] = "0-9\xEF\xBC\x90-\xEF\xBC\x99\xD9\xA0-\xD9\xA9\xDB\xB0-\xDB\xB9";
+ // We accept alpha characters in phone numbers, ASCII only. We store lower-case
+ // here only since our regular expressions are case-insensitive.
+ const char kValidAlpha[] = "a-z";
+@@ -140,7 +140,7 @@
+ // itself. In emacs, you can use M-x unicode-what to query information about the
+ // unicode character.
+ const char kValidPunctuation[] =
+-    "-x‐-―−ー－-／  ​⁠　()（）［］.\\[\\]/~⁓∼～";
++    "-x\xE2\x80\x90-\xE2\x80\x95\xE2\x88\x92\xE3\x83\xBC\xEF\xBC\x8D-\xEF\xBC\x8F \xC2\xA0\xE2\x80\x8B\xE2\x81\xA0\xE3\x80\x80()\xEF\xBC\x88\xEF\xBC\x89\xEF\xBC\xBB\xEF\xBC\xBD.\\[\\]/~\xE2\x81\x93\xE2\x88\xBC\xEF\xBD\x9E";
+ 
+ // Regular expression of viable phone numbers. This is location independent.
+ // Checks we have at least three leading digits, and only valid punctuation,
+@@ -454,7 +454,7 @@
+ // defined order.
+ void CreateRegularExpressions() {
+   unique_international_prefix.reset(
+-      reg_exp::CreateRegularExpression("[\\d]+(?:[~⁓∼～][\\d]+)?"));
++      reg_exp::CreateRegularExpression("[\\d]+(?:[~\xE2\x81\x93\xE2\x88\xBC\xEF\xBD\x9E][\\d]+)?"));
+   first_group_capturing_pattern.reset(
+       reg_exp::CreateRegularExpression("(\\$1)"));
+   carrier_code_pattern.reset(
+@@ -476,16 +476,16 @@
+       StrCat("[", kPlusChars, "]*(?:[", kValidPunctuation, "]*[", kValidDigits,
+              "]){3,}[", kValidAlpha, kValidPunctuation, kValidDigits, "]*")));
+   // Canonical-equivalence doesn't seem to be an option with RE2, so we allow
+-  // two options for representing the ó - the character itself, and one in the
++  // two options for representing the \xC3\xB3 - the character itself, and one in the
+   // unicode decomposed form with the combining acute accent. Note that there
+   // are currently three capturing groups for the extension itself - if this
+   // number is changed, MaybeStripExtension needs to be updated.
+   const string capturing_extn_digits = StrCat("([", kValidDigits, "]{1,7})");
+   known_extn_patterns.reset(new string(
+       StrCat(kRfc3966ExtnPrefix, capturing_extn_digits, "|"
+-             "[  \\t,]*(?:ext(?:ensi(?:ó?|ó))?n?|ｅｘｔｎ?|[,xｘ#＃~～]|"
+-             "int|ｉｎｔ|anexo)"
+-             "[:\\.．]?[  \\t,-]*", capturing_extn_digits, "#?|"
++             "[ \xC2\xA0\\t,]*(?:ext(?:ensi(?:o\xCC\x81?|\xC3\xB3))?n?|\xEF\xBD\x85\xEF\xBD\x98\xEF\xBD\x94\xEF\xBD\x8E?|[,x\xEF\xBD\x98#\xEF\xBC\x83~\xEF\xBD\x9E]|"
++             "int|\xEF\xBD\x89\xEF\xBD\x8E\xEF\xBD\x94|anexo)"
++             "[:\\.\xEF\xBC\x8E]?[ \xC2\xA0\\t,-]*", capturing_extn_digits, "#?|"
+              "[- ]+([", kValidDigits, "]{1,5})#")));
+   extn_pattern.reset(reg_exp::CreateRegularExpression(
+       StrCat("(?i)(?:", *known_extn_patterns, ")$").c_str()));
+@@ -509,35 +509,35 @@
+   all_plus_number_grouping_symbols->insert(
+       make_pair(ToUnicodeCodepoint("-"), '-'));
+   all_plus_number_grouping_symbols->insert(
+-      make_pair(ToUnicodeCodepoint("－"), '-'));
++      make_pair(ToUnicodeCodepoint("\xEF\xBC\x8D"), '-'));
+   all_plus_number_grouping_symbols->insert(
+-      make_pair(ToUnicodeCodepoint("‐"), '-'));
++      make_pair(ToUnicodeCodepoint("\xE2\x80\x90"), '-'));
+   all_plus_number_grouping_symbols->insert(
+-      make_pair(ToUnicodeCodepoint("‑"), '-'));
++      make_pair(ToUnicodeCodepoint("\xE2\x80\x91"), '-'));
+   all_plus_number_grouping_symbols->insert(
+-      make_pair(ToUnicodeCodepoint("‒"), '-'));
++      make_pair(ToUnicodeCodepoint("\xE2\x80\x92"), '-'));
+   all_plus_number_grouping_symbols->insert(
+-      make_pair(ToUnicodeCodepoint("–"), '-'));
++      make_pair(ToUnicodeCodepoint("\xE2\x80\x93"), '-'));
+   all_plus_number_grouping_symbols->insert(
+-      make_pair(ToUnicodeCodepoint("—"), '-'));
++      make_pair(ToUnicodeCodepoint("\xE2\x80\x94"), '-'));
+   all_plus_number_grouping_symbols->insert(
+-      make_pair(ToUnicodeCodepoint("―"), '-'));
++      make_pair(ToUnicodeCodepoint("\xE2\x80\x95"), '-'));
+   all_plus_number_grouping_symbols->insert(
+-      make_pair(ToUnicodeCodepoint("−"), '-'));
++      make_pair(ToUnicodeCodepoint("\xE2\x88\x92"), '-'));
+   all_plus_number_grouping_symbols->insert(
+       make_pair(ToUnicodeCodepoint("/"), '/'));
+   all_plus_number_grouping_symbols->insert(
+-      make_pair(ToUnicodeCodepoint("／"), '/'));
++      make_pair(ToUnicodeCodepoint("\xEF\xBC\x8F"), '/'));
+   all_plus_number_grouping_symbols->insert(
+       make_pair(ToUnicodeCodepoint(" "), ' '));
+   all_plus_number_grouping_symbols->insert(
+-      make_pair(ToUnicodeCodepoint("　"), ' '));
++      make_pair(ToUnicodeCodepoint("\xE3\x80\x80"), ' '));
+   all_plus_number_grouping_symbols->insert(
+-      make_pair(ToUnicodeCodepoint("⁠"), ' '));
++      make_pair(ToUnicodeCodepoint("\xE2\x81\xA0"), ' '));
+   all_plus_number_grouping_symbols->insert(
+       make_pair(ToUnicodeCodepoint("."), '.'));
+   all_plus_number_grouping_symbols->insert(
+-      make_pair(ToUnicodeCodepoint("．"), '.'));
++      make_pair(ToUnicodeCodepoint("\xEF\xBC\x8E"), '.'));
+   // Only the upper-case letters are added here - the lower-case versions are
+   // added programmatically.
+   alpha_mappings->insert(make_pair(ToUnicodeCodepoint("A"), '2'));
+@@ -849,7 +849,7 @@
+   // Note here that all NANPA formatting rules are contained by US, so we use
+   // that to format NANPA numbers. The same applies to Russian Fed regions -
+   // rules are contained by Russia. French Indian Ocean country rules are
+-  // contained by Réunion.
++  // contained by R\xC3\xA9union.
+   string region_code;
+   GetRegionCodeForCountryCode(country_calling_code, &region_code);
+   if (!HasValidRegionCode(region_code, country_calling_code,
+@@ -1015,7 +1015,7 @@
+     // For regions that share a country calling code, the country calling code
+     // need not be dialled. This also applies when dialling within a region, so
+     // this if clause covers both these cases.
+-    // Technically this is the case for dialling from la Réunion to other
++    // Technically this is the case for dialling from la R\xC3\xA9union to other
+     // overseas departments of France (French Guiana, Martinique, Guadeloupe),
+     // but not vice versa - so we don't cover this edge case for now and for
+     // those cases return the version including country calling code.
-- 
cgit v1.1