6 files changed, 52 insertions, 32 deletions
diff --git a/googleurl/src/gurl.h b/googleurl/src/gurl.h
index 29fea81..ba97191 100644
--- a/googleurl/src/gurl.h
+++ b/googleurl/src/gurl.h
@@ -30,7 +30,7 @@
 #ifndef GOOGLEURL_SRC_GURL_H__
 #define GOOGLEURL_SRC_GURL_H__
 
-#include <iostream>
+#include <iosfwd>
 #include <string>
 
 #include "base/string16.h"
diff --git a/googleurl/src/url_canon_etc.cc b/googleurl/src/url_canon_etc.cc
index aea181a..318c906 100644
--- a/googleurl/src/url_canon_etc.cc
+++ b/googleurl/src/url_canon_etc.cc
@@ -213,9 +213,6 @@ bool DoUserInfo(const CHAR* username_spec,
 inline void WritePortInt(char* output, int output_len, int port) {
   _itoa_s(port, output, output_len, 10);
 }
-inline void WritePortInt(char16* output, int output_len, int port) {
-  _itow_s(port, output, output_len, 10);
-}
 
 // This function will prepend the colon if there will be a port.
 template<typename CHAR, typename UCHAR>
@@ -290,12 +287,11 @@ void DoCanonicalizeRef(const CHAR* spec,
     } else {
       // Non-ASCII characters are appended unescaped, but only when they are
       // valid. Invalid Unicode characters are replaced with the "invalid
-      // character" as IE seems to.
+      // character" as IE seems to (on failure, ReadUTFChar stores the Unicode
+      // replacement character in code_point for us).
       unsigned code_point;
-      if (!ReadUTFChar(spec, &i, end, &code_point))
-        AppendUTF8Value(kUnicodeReplacementCharacter, output);
-      else
-        AppendUTF8Value(code_point, output);
+      ReadUTFChar(spec, &i, end, &code_point);
+      AppendUTF8Value(code_point, output);
     }
   }
 
diff --git a/googleurl/src/url_canon_internal.h b/googleurl/src/url_canon_internal.h
index 4b1e45a..6305647 100644
--- a/googleurl/src/url_canon_internal.h
+++ b/googleurl/src/url_canon_internal.h
@@ -37,6 +37,7 @@
 
 #include <stdlib.h>
 
+#include "base/logging.h"
 #include "googleurl/src/url_canon.h"
 
 namespace url_canon {
@@ -173,6 +174,9 @@ bool ReadUTFChar(const char* str, int* begin, int length,
 // Generic To-UTF-8 converter. This will call the given append method for each
 // character that should be appended, with the given output method. Wrappers
 // are provided below for escaped and non-escaped versions of this.
+//
+// The caller must have already verified that char_value is a valid Unicode
+// code point; anything above 0x10FFFF hits NOTREACHED().
 template<class Output, void Appender(unsigned char, Output*)>
 inline void DoAppendUTF8(unsigned char_value, Output* output) {
   if (char_value <= 0x7f) {
@@ -191,7 +195,7 @@ inline void DoAppendUTF8(unsigned char_value, Output* output) {
              output);
     Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
              output);
-  } else if (char_value <= 0x1fffff) {
+  } else if (char_value <= 0x10FFFF) {  // Max unicode code point.
     // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)),
              output);
@@ -201,20 +205,9 @@ inline void DoAppendUTF8(unsigned char_value, Output* output) {
              output);
     Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
              output);
-  } else if (char_value <= 0x10FFFF) {  // Max unicode code point.
-    // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-    Appender(static_cast<unsigned char>(0xf8 | (char_value >> 24)),
-             output);
-    Appender(static_cast<unsigned char>(0x80 | ((char_value >> 18) & 0x3f)),
-             output);
-    Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)),
-             output);
-    Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)),
-             output);
-    Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
-             output);
   } else {
-    // Invalid UTF-8 character (>20 bits)
+    // Invalid Unicode code point (greater than 0x10FFFF).
+    NOTREACHED();
   }
 }
 
diff --git a/googleurl/src/url_canon_unittest.cc b/googleurl/src/url_canon_unittest.cc
index a3e43e2..1a3cd53 100644
--- a/googleurl/src/url_canon_unittest.cc
+++ b/googleurl/src/url_canon_unittest.cc
@@ -144,6 +144,44 @@ void SetupReplComp(
 
 }  // namespace
 
+TEST(URLCanonTest, DoAppendUTF8) {
+  struct UTF8Case {
+    unsigned input;
+    const char* output;
+  } utf_cases[] = {
+    // Valid code points.
+    {0x24, "\x24"},
+    {0xA2, "\xC2\xA2"},
+    {0x20AC, "\xE2\x82\xAC"},
+    {0x24B62, "\xF0\xA4\xAD\xA2"},
+    {0x10FFFF, "\xF4\x8F\xBF\xBF"},
+  };
+  std::string out_str;
+  for (size_t i = 0; i < ARRAYSIZE(utf_cases); i++) {
+    out_str.clear();
+    url_canon::StdStringCanonOutput output(&out_str);
+    url_canon::AppendUTF8Value(utf_cases[i].input, &output);
+    output.Complete();
+    EXPECT_EQ(utf_cases[i].output, out_str);
+  }
+}
+
+// TODO(mattm): Can't run this in debug mode for now, since the DCHECK will
+// cause the Chromium stacktrace dialog to appear and hang the test.
+// See http://crbug.com/49580.
+#ifdef NDEBUG
+TEST(URLCanonTest, DoAppendUTF8Invalid) {
+  std::string out_str;
+  url_canon::StdStringCanonOutput output(&out_str);
+  // Invalid code point (too large).
+  ASSERT_DEBUG_DEATH({
+    url_canon::AppendUTF8Value(0x110000, &output);
+    output.Complete();
+    EXPECT_EQ("", out_str);
+  }, "");
+}
+#endif
+
 TEST(URLCanonTest, UTF) {
   // Low-level test that we handle reading, canonicalization, and writing
   // UTF-8/UTF-16 strings properly.
diff --git a/googleurl/src/url_parse.cc b/googleurl/src/url_parse.cc
index a08c4da..fa31210 100644
--- a/googleurl/src/url_parse.cc
+++ b/googleurl/src/url_parse.cc
@@ -324,7 +324,7 @@ void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) {
   if (DoExtractScheme(spec, spec_len, &parsed->scheme)) {
     after_scheme = parsed->scheme.end() + 1;  // Skip past the colon.
   } else {
-    // Say there's no scheme when there is a colon. We could also say that
+    // Say there's no scheme when there is no colon. We could also say that
     // everything is the scheme. Both would produce an invalid URL, but this way
     // seems less wrong in more cases.
     parsed->scheme.reset();
@@ -645,7 +645,7 @@ bool ExtractScheme(const char16* url, int url_len, Component* scheme) {
 // This handles everything that may be an authority terminator, including
 // backslash. For special backslash handling see DoParseAfterScheme.
 bool IsAuthorityTerminator(char16 ch) {
-  return IsURLSlash(ch) || ch == '?' || ch == '#' || ch == ';';
+  return IsURLSlash(ch) || ch == '?' || ch == '#';
 }
 
 void ExtractFileName(const char* url,
diff --git a/googleurl/src/url_test_utils.h b/googleurl/src/url_test_utils.h
index 5294202..6278e3f 100644
--- a/googleurl/src/url_test_utils.h
+++ b/googleurl/src/url_test_utils.h
@@ -75,11 +75,4 @@ inline std::string ConvertUTF16ToUTF8(const string16& src) {
 
 }  // namespace url_test_utils
 
-// This operator allows EXPECT_EQ(astring16, anotherstring16); to work.
-inline std::ostream& operator<<(std::ostream& os,
-                                const string16& str) {
-  // Convert to UTF-8 and print the string
-  return os << url_test_utils::ConvertUTF16ToUTF8(str);
-}
-
 #endif  // GOOGLEURL_SRC_URL_TEST_UTILS_H__