// Copyright 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "net/base/url_util.h" #include "build/build_config.h" #if defined(OS_POSIX) #include #elif defined(OS_WIN) #include #endif #include "base/logging.h" #include "base/strings/string_util.h" #include "base/strings/stringprintf.h" #include "net/base/escape.h" #include "net/base/ip_address.h" #include "net/base/registry_controlled_domains/registry_controlled_domain.h" #include "url/gurl.h" #include "url/url_canon.h" #include "url/url_canon_ip.h" namespace net { namespace { bool IsHostCharAlphanumeric(char c) { // We can just check lowercase because uppercase characters have already been // normalized. return ((c >= 'a') && (c <= 'z')) || ((c >= '0') && (c <= '9')); } bool IsNormalizedLocalhostTLD(const std::string& host) { return base::EndsWith(host, ".localhost", base::CompareCase::SENSITIVE); } } // namespace GURL AppendQueryParameter(const GURL& url, const std::string& name, const std::string& value) { std::string query(url.query()); if (!query.empty()) query += "&"; query += (EscapeQueryParamValue(name, true) + "=" + EscapeQueryParamValue(value, true)); GURL::Replacements replacements; replacements.SetQueryStr(query); return url.ReplaceComponents(replacements); } GURL AppendOrReplaceQueryParameter(const GURL& url, const std::string& name, const std::string& value) { bool replaced = false; std::string param_name = EscapeQueryParamValue(name, true); std::string param_value = EscapeQueryParamValue(value, true); const std::string input = url.query(); url::Component cursor(0, input.size()); std::string output; url::Component key_range, value_range; while (url::ExtractQueryKeyValue(input.data(), &cursor, &key_range, &value_range)) { const base::StringPiece key( input.data() + key_range.begin, key_range.len); std::string key_value_pair; // Check |replaced| as only the first pair should be replaced. if (!replaced && key == param_name) { replaced = true; key_value_pair = (param_name + "=" + param_value); } else { key_value_pair.assign(input.data(), key_range.begin, value_range.end() - key_range.begin); } if (!output.empty()) output += "&"; output += key_value_pair; } if (!replaced) { if (!output.empty()) output += "&"; output += (param_name + "=" + param_value); } GURL::Replacements replacements; replacements.SetQueryStr(output); return url.ReplaceComponents(replacements); } QueryIterator::QueryIterator(const GURL& url) : url_(url), at_end_(!url.is_valid()) { if (!at_end_) { query_ = url.parsed_for_possibly_invalid_spec().query; Advance(); } } QueryIterator::~QueryIterator() { } std::string QueryIterator::GetKey() const { DCHECK(!at_end_); if (key_.is_nonempty()) return url_.spec().substr(key_.begin, key_.len); return std::string(); } std::string QueryIterator::GetValue() const { DCHECK(!at_end_); if (value_.is_nonempty()) return url_.spec().substr(value_.begin, value_.len); return std::string(); } const std::string& QueryIterator::GetUnescapedValue() { DCHECK(!at_end_); if (value_.is_nonempty() && unescaped_value_.empty()) { unescaped_value_ = UnescapeURLComponent( GetValue(), UnescapeRule::SPACES | UnescapeRule::PATH_SEPARATORS | UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS | UnescapeRule::REPLACE_PLUS_WITH_SPACE); } return unescaped_value_; } bool QueryIterator::IsAtEnd() const { return at_end_; } void QueryIterator::Advance() { DCHECK (!at_end_); key_.reset(); value_.reset(); unescaped_value_.clear(); at_end_ = !url::ExtractQueryKeyValue(url_.spec().c_str(), &query_, &key_, &value_); } bool GetValueForKeyInQuery(const GURL& url, const std::string& search_key, std::string* out_value) { for (QueryIterator it(url); !it.IsAtEnd(); it.Advance()) { if (it.GetKey() == search_key) { *out_value = it.GetUnescapedValue(); return true; } } return false; } bool ParseHostAndPort(std::string::const_iterator host_and_port_begin, std::string::const_iterator host_and_port_end, std::string* host, int* port) { if (host_and_port_begin >= host_and_port_end) return false; // When using url, we use char*. const char* auth_begin = &(*host_and_port_begin); int auth_len = host_and_port_end - host_and_port_begin; url::Component auth_component(0, auth_len); url::Component username_component; url::Component password_component; url::Component hostname_component; url::Component port_component; url::ParseAuthority(auth_begin, auth_component, &username_component, &password_component, &hostname_component, &port_component); // There shouldn't be a username/password. if (username_component.is_valid() || password_component.is_valid()) return false; if (!hostname_component.is_nonempty()) return false; // Failed parsing. int parsed_port_number = -1; if (port_component.is_nonempty()) { parsed_port_number = url::ParsePort(auth_begin, port_component); // If parsing failed, port_number will be either PORT_INVALID or // PORT_UNSPECIFIED, both of which are negative. if (parsed_port_number < 0) return false; // Failed parsing the port number. } if (port_component.len == 0) return false; // Reject inputs like "foo:" unsigned char tmp_ipv6_addr[16]; // If the hostname starts with a bracket, it is either an IPv6 literal or // invalid. If it is an IPv6 literal then strip the brackets. if (hostname_component.len > 0 && auth_begin[hostname_component.begin] == '[') { if (auth_begin[hostname_component.end() - 1] == ']' && url::IPv6AddressToNumber( auth_begin, hostname_component, tmp_ipv6_addr)) { // Strip the brackets. hostname_component.begin++; hostname_component.len -= 2; } else { return false; } } // Pass results back to caller. host->assign(auth_begin + hostname_component.begin, hostname_component.len); *port = parsed_port_number; return true; // Success. } bool ParseHostAndPort(const std::string& host_and_port, std::string* host, int* port) { return ParseHostAndPort( host_and_port.begin(), host_and_port.end(), host, port); } std::string GetHostAndPort(const GURL& url) { // For IPv6 literals, GURL::host() already includes the brackets so it is // safe to just append a colon. return base::StringPrintf("%s:%d", url.host().c_str(), url.EffectiveIntPort()); } std::string GetHostAndOptionalPort(const GURL& url) { // For IPv6 literals, GURL::host() already includes the brackets // so it is safe to just append a colon. if (url.has_port()) return base::StringPrintf("%s:%s", url.host().c_str(), url.port().c_str()); return url.host(); } std::string TrimEndingDot(base::StringPiece host) { base::StringPiece host_trimmed = host; size_t len = host_trimmed.length(); if (len > 1 && host_trimmed[len - 1] == '.') { host_trimmed.remove_suffix(1); } return host_trimmed.as_string(); } std::string GetHostOrSpecFromURL(const GURL& url) { return url.has_host() ? TrimEndingDot(url.host_piece()) : url.spec(); } std::string CanonicalizeHost(base::StringPiece host, url::CanonHostInfo* host_info) { // Try to canonicalize the host. const url::Component raw_host_component(0, static_cast(host.length())); std::string canon_host; url::StdStringCanonOutput canon_host_output(&canon_host); url::CanonicalizeHostVerbose(host.data(), raw_host_component, &canon_host_output, host_info); if (host_info->out_host.is_nonempty() && host_info->family != url::CanonHostInfo::BROKEN) { // Success! Assert that there's no extra garbage. canon_host_output.Complete(); DCHECK_EQ(host_info->out_host.len, static_cast(canon_host.length())); } else { // Empty host, or canonicalization failed. We'll return empty. canon_host.clear(); } return canon_host; } bool IsCanonicalizedHostCompliant(const std::string& host) { if (host.empty()) return false; bool in_component = false; bool most_recent_component_started_alphanumeric = false; for (std::string::const_iterator i(host.begin()); i != host.end(); ++i) { const char c = *i; if (!in_component) { most_recent_component_started_alphanumeric = IsHostCharAlphanumeric(c); if (!most_recent_component_started_alphanumeric && (c != '-') && (c != '_')) { return false; } in_component = true; } else if (c == '.') { in_component = false; } else if (!IsHostCharAlphanumeric(c) && (c != '-') && (c != '_')) { return false; } } return most_recent_component_started_alphanumeric; } bool IsHostnameNonUnique(const std::string& hostname) { // CanonicalizeHost requires surrounding brackets to parse an IPv6 address. const std::string host_or_ip = hostname.find(':') != std::string::npos ? "[" + hostname + "]" : hostname; url::CanonHostInfo host_info; std::string canonical_name = CanonicalizeHost(host_or_ip, &host_info); // If canonicalization fails, then the input is truly malformed. However, // to avoid mis-reporting bad inputs as "non-unique", treat them as unique. if (canonical_name.empty()) return false; // If |hostname| is an IP address, check to see if it's in an IANA-reserved // range. if (host_info.IsIPAddress()) { IPAddress host_addr; if (!host_addr.AssignFromIPLiteral(hostname.substr( host_info.out_host.begin, host_info.out_host.len))) { return false; } switch (host_info.family) { case url::CanonHostInfo::IPV4: case url::CanonHostInfo::IPV6: return host_addr.IsReserved(); case url::CanonHostInfo::NEUTRAL: case url::CanonHostInfo::BROKEN: return false; } } // Check for a registry controlled portion of |hostname|, ignoring private // registries, as they already chain to ICANN-administered registries, // and explicitly ignoring unknown registries. // // Note: This means that as new gTLDs are introduced on the Internet, they // will be treated as non-unique until the registry controlled domain list // is updated. However, because gTLDs are expected to provide significant // advance notice to deprecate older versions of this code, this an // acceptable tradeoff. return 0 == registry_controlled_domains::GetRegistryLength( canonical_name, registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES, registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); } bool IsLocalhost(base::StringPiece host) { if (IsLocalHostname(host, nullptr)) return true; IPAddress ip_address; if (ip_address.AssignFromIPLiteral(host)) { size_t size = ip_address.size(); switch (size) { case IPAddress::kIPv4AddressSize: { const uint8_t prefix[] = {127}; return IPAddressStartsWith(ip_address, prefix); } case IPAddress::kIPv6AddressSize: return ip_address == IPAddress::IPv6Localhost(); default: NOTREACHED(); } } return false; } GURL SimplifyUrlForRequest(const GURL& url) { DCHECK(url.is_valid()); GURL::Replacements replacements; replacements.ClearUsername(); replacements.ClearPassword(); replacements.ClearRef(); return url.ReplaceComponents(replacements); } void GetIdentityFromURL(const GURL& url, base::string16* username, base::string16* password) { UnescapeRule::Type flags = UnescapeRule::SPACES | UnescapeRule::PATH_SEPARATORS | UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS; *username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags); *password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags); } bool HasGoogleHost(const GURL& url) { static const char* kGoogleHostSuffixes[] = { ".google.com", ".youtube.com", ".gmail.com", ".doubleclick.net", ".gstatic.com", ".googlevideo.com", ".googleusercontent.com", ".googlesyndication.com", ".google-analytics.com", ".googleadservices.com", ".googleapis.com", ".ytimg.com", }; base::StringPiece host = url.host_piece(); for (const char* suffix : kGoogleHostSuffixes) { // Here it's possible to get away with faster case-sensitive comparisons // because the list above is all lowercase, and a GURL's host name will // always be canonicalized to lowercase as well. if (base::EndsWith(host, suffix, base::CompareCase::SENSITIVE)) return true; } return false; } bool IsLocalHostname(base::StringPiece host, bool* is_local6) { std::string normalized_host = base::ToLowerASCII(host); // Remove any trailing '.'. if (!normalized_host.empty() && *normalized_host.rbegin() == '.') normalized_host.resize(normalized_host.size() - 1); if (normalized_host == "localhost6" || normalized_host == "localhost6.localdomain6") { if (is_local6) *is_local6 = true; return true; } if (is_local6) *is_local6 = false; return normalized_host == "localhost" || normalized_host == "localhost.localdomain" || IsNormalizedLocalhostTLD(normalized_host); } } // namespace net