summaryrefslogtreecommitdiff
path: root/chromium/components/url_formatter/url_formatter.h
blob: 338d5aeb29cd1b5e1835cf323495fba777ed2a55 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// url_formatter contains routines for formatting URLs in a way that can be
// safely and securely displayed to users. For example, it is responsible
// for determining when to convert an IDN A-Label (e.g. "xn--[something]")
// into the IDN U-Label.
//
// Note that this formatting is only intended for display purposes; it would
// be insecure and insufficient to make comparisons solely on formatted URLs
// (that is, it should not be used for normalizing URLs for comparison for
// security decisions).

#ifndef COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_
#define COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_

#include <stddef.h>
#include <stdint.h>

#include <string>
#include <vector>

#include "base/strings/string16.h"
#include "base/strings/string_piece.h"
#include "base/strings/utf_offset_string_conversions.h"
#include "net/base/escape.h"

class GURL;

namespace url {
struct Parsed;
}

namespace url_formatter {

// Flag types consumed by FormatUrl() to control how certain parts of the URL
// are handled.  FormatUrlType is the type of the individual kFormatUrl* flag
// constants; FormatUrlTypes is the type of FormatUrl()'s |format_types|
// parameter, which is documented as a bitmask of those flags.
using FormatUrlType = uint32_t;
using FormatUrlTypes = uint32_t;

// Nothing is omitted.
extern const FormatUrlType kFormatUrlOmitNothing;

// If set, any username and password are removed.
extern const FormatUrlType kFormatUrlOmitUsernamePassword;

// If the scheme is 'http://', it's removed.
extern const FormatUrlType kFormatUrlOmitHTTP;

// Omits the path if it is just a slash and there is no query or ref.  This is
// meaningful for non-file "standard" URLs.
extern const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname;

// If the scheme is 'https://', it's removed. Not in kFormatUrlOmitDefaults.
extern const FormatUrlType kFormatUrlOmitHTTPS;

// Replaces the path, query, and ref with an ellipsis. Experimental and not in
// kFormatUrlOmitDefaults.
extern const FormatUrlType kFormatUrlExperimentalElideAfterHost;

// Omits some trivially informative subdomains such as "www" or "m". Not in
// kFormatUrlOmitDefaults.
extern const FormatUrlType kFormatUrlOmitTrivialSubdomains;

// Convenience for omitting all unnecessary types. Does not include HTTPS
// scheme removal, or experimental flags.
extern const FormatUrlType kFormatUrlOmitDefaults;

// Creates a string representation of |url|. The IDN host name is turned to
// Unicode if the Unicode representation is deemed safe. |format_types| is a
// bitmask of FormatUrlType flags; see the kFormatUrl* constants above for
// details. |unescape_rules| defines how to clean the URL for human
// readability. You will generally want
// |UnescapeRule::SPACES| for display to the user if you can handle spaces, or
// |UnescapeRule::NORMAL| if not. If the path part and the query part seem to
// be encoded in %-encoded UTF-8, decodes %-encoding and UTF-8.
//
// The last three parameters may be NULL.
//
// |new_parsed| will be set to the parsing parameters of the resultant URL.
//
// |prefix_end| will be the length before the hostname of the resultant URL.
//
// |offset[s]_for_adjustment| specifies one or more offsets into the original
// URL, representing insertion or selection points between characters: if the
// input is "http://foo.com/", offset 0 is before the entire URL, offset 7 is
// between the scheme and the host, and offset 15 is after the end of the URL.
// Valid input offsets range from 0 to the length of the input URL string.  On
// exit, each offset will have been modified to reflect any changes made to the
// output string.  For example, if |url| is "http://a:b@c.com/",
// |format_types| includes kFormatUrlOmitUsernamePassword, and an offset is 12
// (pointing between 'c' and '.'), then on return the output string will be
// "http://c.com/" and the offset will be 8.  If an offset cannot be
// successfully adjusted (e.g. because it points into the middle of a component
// that was entirely removed or into the middle of an encoding sequence), it
// will be set to base::string16::npos.
// For consistency, if an input offset points between the scheme and the
// username/password, and both are removed, on output this offset will be 0
// rather than npos; this means that offsets at the starts and ends of removed
// components are always transformed the same way regardless of what other
// components are adjacent.
base::string16 FormatUrl(const GURL& url,
                         FormatUrlTypes format_types,
                         net::UnescapeRule::Type unescape_rules,
                         url::Parsed* new_parsed,
                         size_t* prefix_end,
                         size_t* offset_for_adjustment);

// Variant of FormatUrl() that takes a vector of offsets,
// |offsets_for_adjustment|, instead of a single |offset_for_adjustment|.  See
// the comment above FormatUrl() for the meaning of the parameters and for how
// offsets are adjusted.
base::string16 FormatUrlWithOffsets(
    const GURL& url,
    FormatUrlTypes format_types,
    net::UnescapeRule::Type unescape_rules,
    url::Parsed* new_parsed,
    size_t* prefix_end,
    std::vector<size_t>* offsets_for_adjustment);

// This function is like those above except it takes |adjustments| rather
// than |offset[s]_for_adjustment|.  |adjustments| will be set to reflect all
// the transformations that happened to |url| to convert it into the returned
// value.
base::string16 FormatUrlWithAdjustments(
    const GURL& url,
    FormatUrlTypes format_types,
    net::UnescapeRule::Type unescape_rules,
    url::Parsed* new_parsed,
    size_t* prefix_end,
    base::OffsetAdjuster::Adjustments* adjustments);

// Convenience overload of FormatUrl() for the typical "URL to display to the
// user" case.  It forwards to the six-argument FormatUrl() with
// format_types = kFormatUrlOmitDefaults and unescape_rules = SPACES, and
// passes null for all three optional output parameters.  Be cautious about
// using the result for URLs which will be parsed or sent to other
// applications.
inline base::string16 FormatUrl(const GURL& url) {
  return FormatUrl(url,
                   /*format_types=*/kFormatUrlOmitDefaults,
                   /*unescape_rules=*/net::UnescapeRule::SPACES,
                   /*new_parsed=*/nullptr,
                   /*prefix_end=*/nullptr,
                   /*offset_for_adjustment=*/nullptr);
}

// Returns whether FormatUrl() would strip a trailing slash from |url|, given a
// format flag including kFormatUrlOmitTrailingSlashOnBareHostname.
bool CanStripTrailingSlash(const GURL& url);

// Formats the host in |url| and appends it to |output|.
void AppendFormattedHost(const GURL& url, base::string16* output);

// Converts the given host name to Unicode characters. This can be called for
// any host name; if the input is not IDN or is invalid in some way, we'll just
// return the ASCII source so it is still usable.
//
// The input should be the canonicalized ASCII host name from GURL. This
// function does NOT accept UTF-8!
base::string16 IDNToUnicode(base::StringPiece host);

// If |text| starts with "www." it is removed, otherwise |text| is returned
// unmodified.
base::string16 StripWWW(const base::string16& text);

// Runs |url|'s host through StripWWW().  |url| must be valid.
base::string16 StripWWWFromHost(const GURL& url);

}  // namespace url_formatter

#endif  // COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_