summaryrefslogtreecommitdiff
path: root/googleurl/url_parse_file.cpp
blob: 02b8028493a667feb27f9430d539fc3c93b5fc2d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
// Copyright 2007, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "base/logging.h"
#include "url_file.h"
#include "url_parse.h"
#include "url_parse_internal.h"

// Interesting IE file:isms...
//
//  INPUT                      OUTPUT
//  =========================  ==============================
//  file:/foo/bar              file:///foo/bar
//      The result here seems totally invalid!?!? This isn't UNC.
//
//  file:/
//  file:// or any other number of slashes
//      IE6 doesn't do anything at all if you click on this link. No error:
//      nothing. IE6's history system seems to always color this link, so I'm
//      guessing that it maps internally to the empty URL.
//
//  C:\                        file:///C:/
//      When on a file: URL source page, this link will work. When over HTTP,
//      the file: URL will appear in the status bar but the link will not work
//      (security restriction for all file URLs).
//
//  file:foo/                  file:foo/     (invalid?!?!?)
//  file:/foo/                 file:///foo/  (invalid?!?!?)
//  file://foo/                file://foo/   (UNC to server "foo")
//  file:///foo/               file:///foo/  (invalid, seems to be a file)
//  file:////foo/              file://foo/   (UNC to server "foo")
//      Any more than four slashes is also treated as UNC.
//
//  file:C:/                   file://C:/
//  file:/C:/                  file://C:/
//      The number of slashes after "file:" doesn't matter if the thing
//      following it looks like an absolute drive path. Also, slashes and
//      backslashes are equally valid here.

namespace url_parse {

namespace {

// A subcomponent of DoParseFileURL. The input should be a UNC-style name: the
// text starting at |after_slashes| (the index of the first character after
// the slashes that follow the scheme) up to the next slash is the host, and
// everything from that slash onwards is the path.
//
//   spec           The full URL text being parsed.
//   after_slashes  Index of the first character after the leading slashes.
//                  Expected to be <= |spec_len|.
//   spec_len       Index one past the last character of |spec| to consider.
//   parsed         Receives the host and path, plus the query and ref when
//                  there is a path to parse. When there is no path, the query
//                  and ref are left as the caller set them. All other
//                  components are left untouched (DoParseFileURL handles
//                  those for us).
template<typename CHAR>
void DoParseUNC(const CHAR* spec,
                int after_slashes,
                int spec_len,
                Parsed* parsed) {
  // FindNextSlash gives back |spec_len| when there is no further slash, as in
  // "file://foo". In that case there is a host but no path.
  const int next_slash = FindNextSlash(spec, after_slashes, spec_len);
  const bool has_path = next_slash < spec_len;

#ifdef WIN32
  // See if we have something that looks like a path following the first
  // component. As in "file://localhost/c:/", we get "c:/" out. We want to
  // treat this as having no host but the path given. Works on Windows only.
  if (has_path &&
      DoesBeginWindowsDriveSpec(spec, next_slash + 1, spec_len)) {
    parsed->host.reset();
    ParsePathInternal(spec, MakeRange(next_slash, spec_len),
                      &parsed->path, &parsed->query, &parsed->ref);
    return;
  }
#endif

  // Everything up until the slash we found (or the end of the input when
  // there is none) is the host name, which will end up being the UNC host.
  // For example "file://foo/bar.txt" will get a server name of "foo" and a
  // path of "/bar.txt". Later, on Windows, this should be treated as the
  // filename "\\foo\bar.txt" in proper UNC notation. An empty host is reset
  // rather than stored as a zero-length range.
  // NOTE(review): assumes MakeRange(begin, end) is the same as
  // Component(begin, end - begin) -- confirm against url_parse_internal.h.
  if (next_slash != after_slashes)
    parsed->host = MakeRange(after_slashes, next_slash);
  else
    parsed->host.reset();

  // The path, when present, starts at the slash itself.
  if (has_path) {
    ParsePathInternal(spec, MakeRange(next_slash, spec_len),
                      &parsed->path, &parsed->query, &parsed->ref);
  } else {
    parsed->path.reset();
  }
}

// Handles the local-file case for DoParseFileURL: the URL has no host, and
// the text from index |path_begin| up to |spec_len| is split into the path,
// query, and ref. Only those four components of |parsed| are written; the
// remaining ones are the caller's responsibility.
template<typename CHAR>
void DoParseLocalFile(const CHAR* spec,
                      int path_begin,
                      int spec_len,
                      Parsed* parsed) {
  // Hand the whole remainder of the spec to the shared path parser.
  ParsePathInternal(spec,
                    MakeRange(path_begin, spec_len),
                    &parsed->path,
                    &parsed->query,
                    &parsed->ref);

  // A local file never carries a host.
  parsed->host.reset();
}

// Backend for the external ParseFileURL overloads; operates on either char
// type. The input may or may not carry a scheme: besides "file:..." URLs we
// accept bare paths such as "/foo/bar", and on Windows also "c:\foo" and
// "\\server\share".
//
//   spec      The text to parse. Leading and trailing spaces and control
//             characters are ignored.
//   spec_len  Number of characters in |spec|; must not be negative.
//   parsed    Receives the result. The username, password, and port are
//             always reset. The scheme, host, path, query, and ref are filled
//             in when present and reset otherwise.
template<typename CHAR>
void DoParseFileURL(const CHAR* spec, int spec_len, Parsed* parsed) {
  DCHECK(spec_len >= 0);

  // Get the parts we never use for file URLs out of the way.
  parsed->username.reset();
  parsed->password.reset();
  parsed->port.reset();

  // Many of the code paths don't set these, so it's convenient to just clear
  // them. We'll write them in those cases we need them.
  parsed->query.reset();
  parsed->ref.reset();

  // Strip leading & trailing spaces and control characters.
  int begin = 0;
  TrimURL(spec, &begin, &spec_len);

  // Find the scheme. Start by counting the slashes at the very beginning of
  // the input: anything that starts with a slash is a path, not a scheme.
  int num_slashes = CountConsecutiveSlashes(spec, begin, spec_len);
  int after_slashes = begin + num_slashes;
  int after_scheme;
#ifdef WIN32
  // We want to handle cases like UNC but also "/c:/foo". This is when there
  // is no scheme, so we can allow pages to do links like "c:/foo/bar" or
  // "//foo/bar". This is also called by the relative URL resolver when it
  // determines there is an absolute URL, which may give us input like
  // "/c:/foo".
  if (DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len)) {
    // Windows path, don't try to extract the scheme (for example, "c:\foo").
    parsed->scheme.reset();
    after_scheme = after_slashes;
  } else if (DoesBeginUNCPath(spec, begin, spec_len, false)) {
    // Windows UNC path: don't try to extract the scheme, but keep the slashes.
    parsed->scheme.reset();
    after_scheme = begin;
  } else
#endif
  {
    // Only look for a scheme when the input does not begin with a slash. A
    // scheme can never start with a slash, but a file name may well contain a
    // colon: "/foo.c:5" is a file, while "foo.c:5" has the scheme "foo.c".
    // NOTE(review): ExtractScheme is defined elsewhere; presumably it accepts
    // everything before the first colon as the scheme, which is what made
    // "/foo.c:5" come out wrong -- confirm. Skipping it for slash-prefixed
    // input is correct either way.
    if (!num_slashes &&
        ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
      // Offset the results since we gave ExtractScheme a substring.
      parsed->scheme.begin += begin;
      after_scheme = parsed->scheme.end() + 1;
    } else {
      // No scheme found, remember that.
      parsed->scheme.reset();
      after_scheme = begin;
    }
  }

  // Handle empty specs, ones that contain only whitespace or control chars,
  // and ones that are just the scheme (for example "file:").
  if (after_scheme == spec_len) {
    parsed->host.reset();
    parsed->path.reset();
    return;
  }

  // Count the slashes again, this time the ones following the scheme (these
  // are the same slashes as above when there was no scheme).
  num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);

  after_slashes = after_scheme + num_slashes;
#ifdef WIN32
  // Check whether the input is a drive again. We checked above for windows
  // drive specs, but that's only at the very beginning to see if we have a
  // scheme at all. This test will be duplicated in that case, but will
  // additionally handle all cases with a real scheme such as "file:///C:/".
  if (!DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len) &&
      num_slashes != 3) {
    // Anything not beginning with a drive spec ("c:\") on Windows is treated
    // as UNC, with the exception of three slashes which always means a file.
    // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails.
    DoParseUNC(spec, after_slashes, spec_len, parsed);
    return;
  }
#else
  // file: URL with exactly 2 slashes is considered to have a host component.
  if (num_slashes == 2) {
    DoParseUNC(spec, after_slashes, spec_len, parsed);
    return;
  }
#endif  // WIN32

  // Easy and common case, the full path immediately follows the scheme
  // (modulo slashes), as in "file:///foo/bar". Just treat everything from
  // there to the end as the path; DoParseLocalFile resets the host. We
  // include the last slash as part of the path if there is one.
  DoParseLocalFile(spec,
      num_slashes > 0 ? after_scheme + num_slashes - 1 : after_scheme,
      spec_len, parsed);
}

}  // namespace

// Public entry point for |char| input. Parses the |url_len| characters at
// |url| as a file URL and writes the resulting components into |parsed|. All
// of the work is done by the DoParseFileURL template above.
void ParseFileURL(const char* url, int url_len, Parsed* parsed) {
  DoParseFileURL(url, url_len, parsed);
}

// Public entry point for |char16| input. Parses the |url_len| characters at
// |url| as a file URL and writes the resulting components into |parsed|. All
// of the work is done by the DoParseFileURL template above.
void ParseFileURL(const char16* url, int url_len, Parsed* parsed) {
  DoParseFileURL(url, url_len, parsed);
}

}  // namespace url_parse