net/base/data_url.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// NOTE: based loosely on mozilla's nsDataChannel.cpp

#include <algorithm>

#include "net/base/data_url.h"

#include "base/base64.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "net/base/escape.h"
#include "net/base/mime_util.h"
#include "net/http/http_util.h"
#include "url/gurl.h"

namespace net {

// static
bool DataURL::Parse(const GURL& url, std::string* mime_type,
                    std::string* charset, std::string* data) {
  if (!url.is_valid())
    return false;

  DCHECK(mime_type->empty());
  DCHECK(charset->empty());
  std::string::const_iterator begin = url.spec().begin();
  std::string::const_iterator end = url.spec().end();

  std::string::const_iterator after_colon = std::find(begin, end, ':');
  if (after_colon == end)
    return false;
  ++after_colon;

  std::string::const_iterator comma = std::find(after_colon, end, ',');
  if (comma == end)
    return false;

  std::vector<std::string> meta_data =
      base::SplitString(base::StringPiece(after_colon, comma), ";",
                        base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);

  std::vector<std::string>::iterator iter = meta_data.begin();
  if (iter != meta_data.end()) {
    mime_type->swap(*iter);
    *mime_type = base::ToLowerASCII(*mime_type);
    ++iter;
  }

  static const char kBase64Tag[] = "base64";
  static const char kCharsetTag[] = "charset=";
  const size_t kCharsetTagLength = arraysize(kCharsetTag) - 1;

  bool base64_encoded = false;
  for (; iter != meta_data.end(); ++iter) {
    if (!base64_encoded && *iter == kBase64Tag) {
      base64_encoded = true;
    } else if (charset->empty() &&
               iter->compare(0, kCharsetTagLength, kCharsetTag) == 0) {
      charset->assign(iter->substr(kCharsetTagLength));
      // The grammar for charset is not specially defined in RFC2045 and
      // RFC2397. It just needs to be a token.
      if (!HttpUtil::IsToken(*charset))
        return false;
    }
  }

  if (mime_type->empty()) {
    // Fallback to the default if nothing specified in the mediatype part as
    // specified in RFC2045. As specified in RFC2397, we use |charset| even if
    // |mime_type| is empty.
    mime_type->assign("text/plain");
  } else if (!ParseMimeTypeWithoutParameter(*mime_type, NULL, NULL)) {
    // Fallback to the default as recommended in RFC2045 when the mediatype
    // value is invalid. For this case, we don't respect |charset| but force it
    // set to "US-ASCII".
    mime_type->assign("text/plain");
    charset->assign("US-ASCII");
  }
  if (charset->empty())
    charset->assign("US-ASCII");

  // The caller may not be interested in receiving the data.
  if (!data)
    return true;

  // Preserve spaces if dealing with text or xml input, same as mozilla:
  //   https://bugzilla.mozilla.org/show_bug.cgi?id=138052
  // but strip them otherwise:
  //   https://bugzilla.mozilla.org/show_bug.cgi?id=37200
  // (Spaces in a data URL should be escaped, which is handled below, so any
  // spaces now are wrong. People expect to be able to enter them in the URL
  // bar for text, and it can't hurt, so we allow it.)
  std::string temp_data = std::string(comma + 1, end);

  // For base64, we may have url-escaped whitespace which is not part
  // of the data, and should be stripped. Otherwise, the escaped whitespace
  // could be part of the payload, so don't strip it.
  if (base64_encoded) {
    temp_data = UnescapeURLComponent(temp_data,
        UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS |
        UnescapeRule::SPOOFING_AND_CONTROL_CHARS);
  }

  // Strip whitespace.
  if (base64_encoded || !(mime_type->compare(0, 5, "text/") == 0 ||
                          mime_type->find("xml") != std::string::npos)) {
    temp_data.erase(std::remove_if(temp_data.begin(), temp_data.end(),
                                   base::IsAsciiWhitespace<wchar_t>),
                    temp_data.end());
  }

  if (!base64_encoded) {
    temp_data = UnescapeURLComponent(temp_data,
        UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS |
        UnescapeRule::SPOOFING_AND_CONTROL_CHARS);
  }

  if (base64_encoded) {
    size_t length = temp_data.length();
    size_t padding_needed = 4 - (length % 4);
    // If the input wasn't padded, then we pad it as necessary until we have a
    // length that is a multiple of 4 as required by our decoder. We don't
    // correct if the input was incorrectly padded. If |padding_needed| == 3,
    // then the input isn't well formed and decoding will fail with or without
    // padding.
    if ((padding_needed == 1 || padding_needed == 2) &&
        temp_data[length - 1] != '=') {
      temp_data.resize(length + padding_needed, '=');
    }
    return base::Base64Decode(temp_data, data);
  }

  temp_data.swap(*data);
  return true;
}

}  // namespace net