chromium/net/base/data_url.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221

// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// NOTE: based loosely on mozilla's nsDataChannel.cpp

#include <algorithm>

#include "net/base/data_url.h"

#include "base/base64.h"
#include "base/containers/cxx20_erase.h"
#include "base/feature_list.h"
#include "base/features.h"
#include "base/strings/escape.h"
#include "base/strings/string_piece.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "net/base/mime_util.h"
#include "net/http/http_response_headers.h"
#include "net/http/http_util.h"
#include "url/gurl.h"

namespace net {
namespace {

// A data URL is ready for decode if it:
//   - Doesn't need any extra padding.
//   - Does not have any escaped characters.
//   - Does not have any whitespace.
bool IsDataURLReadyForDecode(base::StringPiece body) {
  return (body.length() % 4) == 0 && base::ranges::find_if(body, [](char c) {
                                       return c == '%' ||
                                              base::IsAsciiWhitespace(c);
                                     }) == std::end(body);
}

}  // namespace

bool DataURL::Parse(const GURL& url,
                    std::string* mime_type,
                    std::string* charset,
                    std::string* data) {
  if (!url.is_valid() || !url.has_scheme())
    return false;

  DCHECK(mime_type->empty());
  DCHECK(charset->empty());
  DCHECK(!data || data->empty());

  base::StringPiece content;
  std::string content_string;
  if (base::FeatureList::IsEnabled(base::features::kOptimizeDataUrls)) {
    // Avoid copying the URL content which can be expensive for large URLs.
    content = url.GetContentPiece();
  } else {
    content_string = url.GetContent();
    content = content_string;
  }

  base::StringPiece::const_iterator begin = content.begin();
  base::StringPiece::const_iterator end = content.end();

  base::StringPiece::const_iterator comma = std::find(begin, end, ',');

  if (comma == end)
    return false;

  std::vector<base::StringPiece> meta_data =
      base::SplitStringPiece(base::MakeStringPiece(begin, comma), ";",
                             base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);

  // These are moved to |mime_type| and |charset| on success.
  std::string mime_type_value;
  std::string charset_value;
  auto iter = meta_data.cbegin();
  if (iter != meta_data.cend()) {
    mime_type_value = base::ToLowerASCII(*iter);
    ++iter;
  }

  static constexpr base::StringPiece kBase64Tag("base64");
  static constexpr base::StringPiece kCharsetTag("charset=");

  bool base64_encoded = false;
  for (; iter != meta_data.cend(); ++iter) {
    if (!base64_encoded &&
        base::EqualsCaseInsensitiveASCII(*iter, kBase64Tag)) {
      base64_encoded = true;
    } else if (charset_value.empty() &&
               base::StartsWith(*iter, kCharsetTag,
                                base::CompareCase::INSENSITIVE_ASCII)) {
      charset_value = std::string(iter->substr(kCharsetTag.size()));
      // The grammar for charset is not specially defined in RFC2045 and
      // RFC2397. It just needs to be a token.
      if (!HttpUtil::IsToken(charset_value))
        return false;
    }
  }

  if (mime_type_value.empty()) {
    // Fallback to the default if nothing specified in the mediatype part as
    // specified in RFC2045. As specified in RFC2397, we use |charset| even if
    // |mime_type| is empty.
    mime_type_value = "text/plain";
    if (charset_value.empty())
      charset_value = "US-ASCII";
  } else if (!ParseMimeTypeWithoutParameter(mime_type_value, nullptr,
                                            nullptr)) {
    // Fallback to the default as recommended in RFC2045 when the mediatype
    // value is invalid. For this case, we don't respect |charset| but force it
    // set to "US-ASCII".
    mime_type_value = "text/plain";
    charset_value = "US-ASCII";
  }

  // The caller may not be interested in receiving the data.
  if (data) {
    // Preserve spaces if dealing with text or xml input, same as mozilla:
    //   https://bugzilla.mozilla.org/show_bug.cgi?id=138052
    // but strip them otherwise:
    //   https://bugzilla.mozilla.org/show_bug.cgi?id=37200
    // (Spaces in a data URL should be escaped, which is handled below, so any
    // spaces now are wrong. People expect to be able to enter them in the URL
    // bar for text, and it can't hurt, so we allow it.)
    //
    // TODO(mmenke): Is removing all spaces reasonable? GURL removes trailing
    // spaces itself, anyways. Should we just trim leading spaces instead?
    // Allowing random intermediary spaces seems unnecessary.

    auto raw_body = base::MakeStringPiece(comma + 1, end);

    // For base64, we may have url-escaped whitespace which is not part
    // of the data, and should be stripped. Otherwise, the escaped whitespace
    // could be part of the payload, so don't strip it.
    if (base64_encoded) {
      // If the data URL is well formed, we can decode it immediately.
      if (base::FeatureList::IsEnabled(base::features::kOptimizeDataUrls) &&
          IsDataURLReadyForDecode(raw_body)) {
        if (!base::Base64Decode(raw_body, data))
          return false;
      } else {
        std::string unescaped_body = base::UnescapeBinaryURLComponent(raw_body);

        // Strip spaces, which aren't allowed in Base64 encoding.
        base::EraseIf(unescaped_body, base::IsAsciiWhitespace<char>);

        size_t length = unescaped_body.length();
        size_t padding_needed = 4 - (length % 4);
        // If the input wasn't padded, then we pad it as necessary until we have
        // a length that is a multiple of 4 as required by our decoder. We don't
        // correct if the input was incorrectly padded. If |padding_needed| ==
        // 3, then the input isn't well formed and decoding will fail with or
        // without padding.
        if ((padding_needed == 1 || padding_needed == 2) &&
            unescaped_body[length - 1] != '=') {
          unescaped_body.resize(length + padding_needed, '=');
        }
        if (!base::Base64Decode(unescaped_body, data))
          return false;
      }
    } else {
      // Strip whitespace for non-text MIME types.
      std::string temp;
      if (!(mime_type_value.compare(0, 5, "text/") == 0 ||
            mime_type_value.find("xml") != std::string::npos)) {
        temp = std::string(raw_body);
        base::EraseIf(temp, base::IsAsciiWhitespace<char>);
        raw_body = temp;
      }

      *data = base::UnescapeBinaryURLComponent(raw_body);
    }
  }

  *mime_type = std::move(mime_type_value);
  *charset = std::move(charset_value);
  return true;
}

Error DataURL::BuildResponse(const GURL& url,
                             base::StringPiece method,
                             std::string* mime_type,
                             std::string* charset,
                             std::string* data,
                             scoped_refptr<HttpResponseHeaders>* headers) {
  DCHECK(data);
  DCHECK(!*headers);

  if (!DataURL::Parse(url, mime_type, charset, data))
    return ERR_INVALID_URL;

  // |mime_type| set by DataURL::Parse() is guaranteed to be in
  //     token "/" token
  // form. |charset| can be an empty string.
  DCHECK(!mime_type->empty());

  // "charset" in the Content-Type header is specified explicitly to follow
  // the "token" ABNF in the HTTP spec. When the DataURL::Parse() call is
  // successful, it's guaranteed that the string in |charset| follows the
  // "token" ABNF.
  std::string content_type = *mime_type;
  if (!charset->empty())
    content_type.append(";charset=" + *charset);
  // The terminal double CRLF isn't needed by TryToCreate().
  *headers = HttpResponseHeaders::TryToCreate(
      "HTTP/1.1 200 OK\r\n"
      "Content-Type:" +
      content_type);
  // Above line should always succeed - TryToCreate() only fails when there are
  // nulls in the string, and DataURL::Parse() can't return nulls in anything
  // but the |data| argument.
  DCHECK(*headers);

  if (base::EqualsCaseInsensitiveASCII(method, "HEAD"))
    data->clear();

  return OK;
}

}  // namespace net