From 80456899565a20e8138450c43a7e09d40d2dd2b5 Mon Sep 17 00:00:00 2001
From: "asanka@chromium.org"
 <asanka@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>
Date: Sat, 15 Dec 2012 20:07:31 +0000
Subject: Move DecodeFilenameValue and DecodeExt value into
 http_content_disposition.

BUG=none


Review URL: https://chromiumcodereview.appspot.com/11471041

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@173307 0039d316-1c4b-4281-b951-d872f2087c98
---
 net/http/http_content_disposition.cc | 326 +++++++++++++++++++++++++++++++++++
 1 file changed, 326 insertions(+)

(limited to 'net/http')
diff --git a/net/http/http_content_disposition.cc b/net/http/http_content_disposition.cc
index 52d9f4f..0726e93 100644
--- a/net/http/http_content_disposition.cc
+++ b/net/http/http_content_disposition.cc
@@ -4,10 +4,336 @@
 
 #include "net/http/http_content_disposition.h"
 
+#include "base/base64.h"
+#include "base/i18n/icu_string_conversions.h"
 #include "base/logging.h"
 #include "base/string_util.h"
+#include "base/sys_string_conversions.h"
+#include "base/utf_string_conversions.h"
 #include "net/base/net_util.h"
 #include "net/http/http_util.h"
+#include "unicode/ucnv.h"
+
+namespace {
+
+enum RFC2047EncodingType {
+  Q_ENCODING,  // "Q" encoding; decoded by DecodeQEncoding().
+  B_ENCODING   // "B" encoding; decoded by base::Base64Decode().
+};
+
+// Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
+// quoted-printable. Returns true and sets |output| only if the input is valid.
+bool DecodeQEncoding(const std::string& input, std::string* output) {
+  std::string temp;
+  temp.reserve(input.size());
+  for (std::string::const_iterator it = input.begin(); it != input.end();
+       ++it) {
+    if (*it == '_') {  // An underscore stands for a space.
+      temp.push_back(' ');
+    } else if (*it == '=') {  // "=XY": one byte written as two hex digits.
+      if ((input.end() - it < 3) ||
+          !IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
+          !IsHexDigit(static_cast<unsigned char>(*(it + 2))))
+        return false;
+      unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +
+                         HexDigitToInt(*(it + 2));
+      temp.push_back(static_cast<char>(ch));
+      ++it;  // Advance onto the second hex digit; the loop's
+      ++it;  // own ++it then moves past the whole escape.
+    } else if (0x20 < *it && *it < 0x7F && *it != '?') {
+      // In a Q-encoded word, only printable ASCII characters
+      // represent themselves. Space and '?' are rejected by the condition
+      // above; '=' and '_' were already handled by the earlier branches.
+      DCHECK_NE('=', *it);
+      DCHECK_NE('?', *it);
+      DCHECK_NE('_', *it);
+      temp.push_back(*it);
+    } else {
+      return false;
+    }
+  }
+  output->swap(temp);
+  return true;
+}
+
+// Decodes |part| as an RFC 2047 section 4 "B" or "Q" encoded string (selected
+// by |enc_type|) and converts the result from |charset| to UTF-8 in |output|.
+bool DecodeBQEncoding(const std::string& part,
+                      RFC2047EncodingType enc_type,
+                      const std::string& charset,
+                      std::string* output) {
+  std::string decoded;
+  if (!((enc_type == B_ENCODING) ?
+        base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded)))
+    return false;
+
+  if (decoded.empty()) {  // Nothing to convert.
+    output->clear();
+    return true;
+  }
+
+  UErrorCode err = U_ZERO_ERROR;
+  UConverter* converter(ucnv_open(charset.c_str(), &err));
+  if (U_FAILURE(err))  // No converter could be opened for |charset|.
+    return false;
+
+  // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.
+  // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes
+  // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a
+  // trailing '\0'.
+  size_t output_length = decoded.length() * 3 + 1;
+  char* buf = WriteInto(output, output_length);
+  output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length,
+                                     decoded.data(), decoded.length(), &err);
+  ucnv_close(converter);
+  if (U_FAILURE(err))  // The charset conversion itself failed.
+    return false;
+  output->resize(output_length);  // Trim to the length ICU reported.
+  return true;
+}
+
+bool DecodeWord(const std::string& encoded_word,
+                const std::string& referrer_charset,
+                bool* is_rfc2047,
+                std::string* output) {
+  *is_rfc2047 = false;
+  output->clear();
+  if (encoded_word.empty())
+    return true;
+
+  if (!IsStringASCII(encoded_word)) {
+    // Try UTF-8, referrer_charset and the native OS default charset in turn.
+    if (IsStringUTF8(encoded_word)) {
+      *output = encoded_word;
+    } else {
+      string16 utf16_output;
+      if (!referrer_charset.empty() &&
+          base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),
+                                base::OnStringConversionError::FAIL,
+                                &utf16_output)) {
+        *output = UTF16ToUTF8(utf16_output);
+      } else {
+        *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
+      }
+    }
+
+    return true;
+  }
+
+  // RFC 2047 : one of the encoding methods supported by Firefox and relatively
+  // widely used by web servers.
+  // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
+  // We don't care about the length restriction (72 bytes) because
+  // many web servers generate encoded words longer than the limit.
+  std::string tmp;
+  *is_rfc2047 = true;
+  int part_index = 0;  // Position of the next '?'-separated part.
+  std::string charset;
+  StringTokenizer t(encoded_word, "?");
+  RFC2047EncodingType enc_type = Q_ENCODING;
+  while (*is_rfc2047 && t.GetNext()) {
+    std::string part = t.token();
+    switch (part_index) {
+      case 0:  // Leading "=".
+        if (part != "=") {
+          *is_rfc2047 = false;
+          break;
+        }
+        ++part_index;
+        break;
+      case 1:  // Charset name.
+        // Do we need charset validity check here?
+        charset = part;
+        ++part_index;
+        break;
+      case 2:  // Encoding type: a single 'B' or 'Q', in either case.
+        if (part.size() > 1 ||
+            part.find_first_of("bBqQ") == std::string::npos) {
+          *is_rfc2047 = false;
+          break;
+        }
+        if (part[0] == 'b' || part[0] == 'B') {
+          enc_type = B_ENCODING;
+        }
+        ++part_index;
+        break;
+      case 3:  // Encoded text.
+        *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp);
+        if (!*is_rfc2047) {
+          // Last minute failure. Invalid B/Q encoding. Rather than
+          // passing it through, return now.
+          return false;
+        }
+        ++part_index;
+        break;
+      case 4:  // Trailing "=".
+        if (part != "=") {
+          // Another last minute failure !
+          // Likely to be a case of two encoded-words in a row or
+          // an encoded word followed by a non-encoded word. We can be
+          // generous, but it does not help much in terms of compatibility,
+          // I believe. Return immediately.
+          *is_rfc2047 = false;
+          return false;
+        }
+        ++part_index;
+        break;
+      default:  // More parts than an encoded-word has.
+        *is_rfc2047 = false;
+        return false;
+    }
+  }
+
+  if (*is_rfc2047) {
+    if (*(encoded_word.end() - 1) == '=') {
+      output->swap(tmp);
+      return true;
+    }
+    // encoded_word ending prematurely with '?' or extra '?'
+    *is_rfc2047 = false;
+    return false;
+  }
+
+  // We're not handling 'especial' characters quoted with '\', but
+  // it should be Ok because we're not an email client but a
+  // web browser.
+
+  // What IE6/7 does: %-escaped UTF-8.
+  tmp = net::UnescapeURLComponent(encoded_word, net::UnescapeRule::SPACES);
+  if (IsStringUTF8(tmp)) {
+    output->swap(tmp);
+    return true;
+    // We can try either the OS default charset or 'origin charset' here,
+    // As far as I can tell, IE does not support it. However, I've seen
+    // web servers emit %-escaped string in a legacy encoding (usually
+    // origin charset).
+    // TODO(jungshik) : Test IE further and consider adding a fallback here.
+  }
+  return false;
+}
+
+// Decodes the value of a 'filename' or 'name' parameter given as |input|. The
+// value is supposed to be of the form:
+//
+//   value                   = token | quoted-string
+//
+// However we currently also allow RFC 2047 encoding and non-ASCII
+// strings. Non-ASCII strings are interpreted based on |referrer_charset|.
+bool DecodeFilenameValue(const std::string& input,
+                         const std::string& referrer_charset,
+                         std::string* output) {
+  std::string tmp;
+  // Tokenize with whitespace characters.
+  StringTokenizer t(input, " \t\n\r");
+  t.set_options(StringTokenizer::RETURN_DELIMS);
+  bool is_previous_token_rfc2047 = true;  // So leading whitespace is dropped.
+  while (t.GetNext()) {
+    if (t.token_is_delim()) {
+      // If the previous non-delimiter token is not RFC2047-encoded,
+      // put in a space in its place. Otherwise, skip over it.
+      if (!is_previous_token_rfc2047) {
+        tmp.push_back(' ');
+      }
+      continue;
+    }
+    // We don't support a single multibyte character split into
+    // adjacent encoded words. Some broken mail clients emit headers
+    // with that problem, but most web servers usually encode a filename
+    // in a single encoded-word. Firefox/Thunderbird do not support
+    // it, either.
+    std::string decoded;
+    if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
+                    &decoded))
+      return false;  // One undecodable word fails the whole value.
+    tmp.append(decoded);
+  }
+  output->swap(tmp);  // |output| is only touched on success.
+  return true;
+}
+
+// Parses the charset and value-chars out of an ext-value string.
+//
+//  ext-value     = charset  "'" [ language ] "'" value-chars
+bool ParseExtValueComponents(const std::string& input,
+                             std::string* charset,
+                             std::string* value_chars) {
+  StringTokenizer t(input, "'");
+  t.set_options(StringTokenizer::RETURN_DELIMS);
+  std::string temp_charset;
+  std::string temp_value;
+  int numDelimsSeen = 0;  // Count of delimiters seen so far.
+  while (t.GetNext()) {
+    if (t.token_is_delim()) {
+      ++numDelimsSeen;
+      continue;
+    } else {
+      switch (numDelimsSeen) {
+        case 0:  // Before the first delimiter: the charset.
+          temp_charset = t.token();
+          break;
+        case 1:  // Between the two delimiters: the language.
+          // Language is ignored.
+          break;
+        case 2:  // After the second delimiter: the value-chars.
+          temp_value = t.token();
+          break;
+        default:  // Text after a third delimiter is rejected.
+          return false;
+      }
+    }
+  }
+  if (numDelimsSeen != 2)  // Exactly two delimiters are required.
+    return false;
+  if (temp_charset.empty() || temp_value.empty())  // Both are mandatory.
+    return false;
+  charset->swap(temp_charset);
+  value_chars->swap(temp_value);
+  return true;
+}
+
+// Decodes |param_value| as an RFC 5987 ext-value into |decoded|. Grammar, from
+// http://tools.ietf.org/html/rfc5987#section-3.2:
+//  ext-value     = charset  "'" [ language ] "'" value-chars
+//
+//  charset       = "UTF-8" / "ISO-8859-1" / mime-charset
+//
+//  mime-charset  = 1*mime-charsetc
+//  mime-charsetc = ALPHA / DIGIT
+//                 / "!" / "#" / "$" / "%" / "&"
+//                 / "+" / "-" / "^" / "_" / "`"
+//                 / "{" / "}" / "~"
+//
+//  language      = <Language-Tag, defined in [RFC5646], Section 2.1>
+//
+//  value-chars   = *( pct-encoded / attr-char )
+//
+//  pct-encoded   = "%" HEXDIG HEXDIG
+//
+//  attr-char     = ALPHA / DIGIT
+//                 / "!" / "#" / "$" / "&" / "+" / "-" / "."
+//                 / "^" / "_" / "`" / "|" / "~"
+bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
+  if (param_value.find('"') != std::string::npos)
+    return false;
+
+  std::string charset;
+  std::string value;
+  if (!ParseExtValueComponents(param_value, &charset, &value))
+    return false;
+
+  // RFC 5987 value should be ASCII-only; otherwise report an empty result.
+  if (!IsStringASCII(value)) {
+    decoded->clear();
+    return true;
+  }
+
+  std::string unescaped = net::UnescapeURLComponent(
+      value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
+
+  return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);
+}
+
+} // namespace
 
 namespace net {
 
-- 
cgit v1.1