-

author: Andreas Baumann <abaumann@yahoo.com> 2012-08-04 14:03:06 +0200
committer: Andreas Baumann <abaumann@yahoo.com> 2012-08-04 14:03:06 +0200
commit: 0c92e873518ce6a92caeba0be81a0d81d16c6ed8 (patch)
tree: ca0033ad7c96ff9e7e1d037b09dca12a2e90809b /googleurl
parent: 9473c0bb8d1a69a042e1fd745fb2f76ea0b8ac27 (diff)
download: crawler-0c92e873518ce6a92caeba0be81a0d81d16c6ed8.tar.gz
crawler-0c92e873518ce6a92caeba0be81a0d81d16c6ed8.tar.bz2
17 files changed, 6478 insertions, 0 deletions
diff --git a/googleurl/gurl.cpp b/googleurl/gurl.cpp
new file mode 100644
index 0000000..4c90408
--- /dev/null
+++ b/googleurl/gurl.cpp
@@ -0,0 +1,529 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifdef WIN32
+#include <windows.h>
+#else
+#include <pthread.h>
+#endif
+
+#include <algorithm>
+#include <ostream>
+
+#include "gurl.h"
+
+#include "base/logging.h"
+#include "url_canon_stdstring.h"
+#include "url_util.h"
+
+namespace {
+
+// Shared initialization helper for both narrow and wide input strings.
+// Canonicalizes |input_spec| into |*canonical| and stores the component
+// layout of the canonical text in |*parsed|. Returns whatever
+// url_util::Canonicalize reports.
+template<typename STR>
+bool InitCanonical(const STR& input_spec,
+                   std::string* canonical,
+                   url_parse::Parsed* parsed) {
+  // Pre-size the destination to the input length plus a little slack, so a
+  // handful of escaped characters does not force a reallocation.
+  const size_t kEscapeSlack = 32;
+  canonical->reserve(input_spec.size() + kEscapeSlack);
+
+  url_canon::StdStringCanonOutput canon_output(canonical);
+  const int input_len = static_cast<int>(input_spec.length());
+  const bool ok = url_util::Canonicalize(input_spec.data(), input_len, NULL,
+                                         &canon_output, parsed);
+
+  // The string may only be used after the output wrapper is completed.
+  canon_output.Complete();
+  return ok;
+}
+
+// Lazily allocated objects handed out by EmptyStringForGURL() and
+// GURL::EmptyGURL(). They are kept as raw pointers (initially NULL) so that
+// no static object is constructed or destroyed at startup/shutdown.
+static std::string* empty_string = NULL;
+static GURL* empty_gurl = NULL;
+
+#ifdef WIN32
+
+// Hands out a reference to a shared empty string, for callers that must
+// return a reference when there is no underlying string. The string is
+// allocated on first use to avoid static construction/destruction.
+const std::string& EmptyStringForGURL() {
+  if (empty_string)
+    return *empty_string;
+
+  // Statics are not threadsafe, so several threads may get here at once.
+  // Each builds a candidate and only one gets published.
+  std::string* candidate = new std::string;
+  PVOID previous = InterlockedCompareExchangePointer(
+      reinterpret_cast<PVOID*>(&empty_string), candidate, NULL);
+  if (previous != NULL) {
+    // A non-NULL previous value means no exchange happened: another thread
+    // already published its string, so ours is discarded.
+    delete candidate;
+  }
+  return *empty_string;
+}
+
+#else
+
+// Control variables passed to pthread_once() by EmptyStringForGURL() and
+// GURL::EmptyGURL() respectively.
+static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT;
+static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT;
+
+// pthread_once() callback: allocates the shared empty string.
+void EmptyStringForGURLOnce() {
+  empty_string = new std::string;
+}
+
+// Returns a reference to a shared empty string, for callers that must return
+// a reference when there is no underlying string.
+const std::string& EmptyStringForGURL() {
+  // Avoid static object construction/destruction on startup/shutdown: the
+  // string is allocated by EmptyStringForGURLOnce via pthread_once().
+  pthread_once(&empty_string_once, EmptyStringForGURLOnce);
+  return *empty_string;
+}
+
+#endif  // WIN32
+
+} // namespace
+
+// Default constructor: produces an invalid URL with no inner URL.
+GURL::GURL() : is_valid_(false), inner_url_(NULL) {
+}
+
+// Copy constructor. The inner URL, when |other| has one, is deep-copied so
+// that each GURL owns its own instance.
+GURL::GURL(const GURL& other)
+    : spec_(other.spec_),
+      is_valid_(other.is_valid_),
+      parsed_(other.parsed_),
+      inner_url_(other.inner_url_ ? new GURL(*other.inner_url_) : NULL) {
+  // Valid filesystem urls should always have an inner_url_.
+  DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_);
+}
+
+// Builds a URL by canonicalizing the narrow string |url_string|.
+GURL::GURL(const std::string& url_string) : inner_url_(NULL) {
+  is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
+
+  // Only valid filesystem URLs carry an inner URL.
+  if (!is_valid_ || !SchemeIsFileSystem())
+    return;
+
+  inner_url_ =
+      new GURL(spec_.data(), parsed_.Length(), *parsed_.inner_parsed(), true);
+}
+
+// Builds a URL by canonicalizing the wide string |url_string|.
+GURL::GURL(const string16& url_string) : inner_url_(NULL) {
+  is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
+
+  // Only valid filesystem URLs carry an inner URL.
+  if (!is_valid_ || !SchemeIsFileSystem())
+    return;
+
+  inner_url_ =
+      new GURL(spec_.data(), parsed_.Length(), *parsed_.inner_parsed(), true);
+}
+
+// Constructs a URL from text the caller asserts is already canonical.
+//   canonical_spec / canonical_spec_len: the canonical text, copied into
+//       spec_.
+//   parsed: component layout of that text, copied into parsed_.
+//   _is_valid: validity flag, stored as-is.
+// No canonicalization is performed here; debug builds re-canonicalize the
+// spec and DCHECK that the result matches what was passed in.
+GURL::GURL(const char* canonical_spec, size_t canonical_spec_len,
+           const url_parse::Parsed& parsed, bool _is_valid)
+    : spec_(canonical_spec, canonical_spec_len),
+      is_valid_(_is_valid),
+      parsed_(parsed),
+      inner_url_(NULL) {
+  // Valid filesystem URLs own a GURL describing their inner URL.
+  if (is_valid_ && SchemeIsFileSystem()) {
+    inner_url_ =
+        new GURL(spec_.data(), parsed_.Length(), *parsed_.inner_parsed(), true);
+  }
+
+#ifndef NDEBUG
+  // For testing purposes, check that the parsed canonical URL is identical to
+  // what we would have produced. Skip the check for invalid URLs: they have
+  // no meaning and we can't always canonicalize them reproducibly.
+  if (is_valid_) {
+    url_parse::Component _scheme;
+    if (!url_util::FindAndCompareScheme(canonical_spec, canonical_spec_len,
+                                        "filesystem", &_scheme) ||
+        _scheme.begin == parsed.scheme.begin) {
+      // We can't do this check on the inner_url of a filesystem URL, as
+      // canonical_spec actually points to the start of the outer URL, so we'd
+      // end up with infinite recursion in this constructor.
+      GURL test_url(spec_);
+
+      DCHECK(test_url.is_valid_ == is_valid_);
+      DCHECK(test_url.spec_ == spec_);
+
+      DCHECK(test_url.parsed_.scheme == parsed_.scheme);
+      DCHECK(test_url.parsed_.username == parsed_.username);
+      DCHECK(test_url.parsed_.password == parsed_.password);
+      DCHECK(test_url.parsed_.host == parsed_.host);
+      DCHECK(test_url.parsed_.port == parsed_.port);
+      DCHECK(test_url.parsed_.path == parsed_.path);
+      DCHECK(test_url.parsed_.query == parsed_.query);
+      DCHECK(test_url.parsed_.ref == parsed_.ref);
+    }
+  }
+#endif
+}
+
+// Destructor: releases the owned inner URL (deleting NULL is a no-op).
+GURL::~GURL() {
+  delete inner_url_;
+}
+
+// Copy-assigns |other| into this URL, deep-copying its inner URL if it has
+// one. Returns *this. Safe for self-assignment and for assigning from this
+// URL's own inner URL.
+GURL& GURL::operator=(const GURL& other) {
+  // Self-assignment must leave the object untouched. Without this guard the
+  // inner URL would be deleted before it could be copied.
+  if (this == &other)
+    return *this;
+
+  // Clone the inner URL before modifying any state: |other| may itself be
+  // the inner URL that is about to be deleted.
+  GURL* inner_copy = other.inner_url_ ? new GURL(*other.inner_url_) : NULL;
+
+  spec_ = other.spec_;
+  is_valid_ = other.is_valid_;
+  parsed_ = other.parsed_;
+
+  delete inner_url_;
+  inner_url_ = inner_copy;
+
+  // Valid filesystem urls should always have an inner_url_.
+  DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_);
+  return *this;
+}
+
+// Returns the canonical spec. Asking for the spec of an invalid, non-empty
+// URL is a programming error: it trips a DCHECK and yields the shared empty
+// string instead.
+const std::string& GURL::spec() const {
+  if (!is_valid_ && !spec_.empty()) {
+    DCHECK(false) << "Trying to get the spec of an invalid URL!";
+    return EmptyStringForGURL();
+  }
+  return spec_;
+}
+
+// Resolves the narrow string |relative| against this URL. Forwards to
+// ResolveWithCharsetConverter with no charset converter.
+GURL GURL::Resolve(const std::string& relative) const {
+  return ResolveWithCharsetConverter(relative, NULL);
+}
+// Wide-string overload of the above; also passes no charset converter.
+GURL GURL::Resolve(const string16& relative) const {
+  return ResolveWithCharsetConverter(relative, NULL);
+}
+
+// Resolves the narrow string |relative| against this URL, passing
+// |charset_converter| (may be NULL) through to url_util::ResolveRelative.
+// Returns an empty, invalid GURL when this URL is invalid or resolution
+// fails.
+// Note: code duplicated below (it's inconvenient to use a template here).
+GURL GURL::ResolveWithCharsetConverter(
+    const std::string& relative,
+    url_canon::CharsetConverter* charset_converter) const {
+  // Not allowed for invalid URLs.
+  if (!is_valid_)
+    return GURL();
+
+  GURL result;
+
+  // Reserve enough room in the output for the input, plus some extra so that
+  // we have room if we have to escape a few things without reallocating.
+  result.spec_.reserve(spec_.size() + 32);
+  url_canon::StdStringCanonOutput output(&result.spec_);
+
+  if (!url_util::ResolveRelative(
+          spec_.data(), static_cast<int>(spec_.length()), parsed_,
+          relative.data(), static_cast<int>(relative.length()),
+          charset_converter, &output, &result.parsed_)) {
+    // Error resolving, return an empty URL.
+    return GURL();
+  }
+
+  output.Complete();
+  result.is_valid_ = true;
+  if (result.SchemeIsFileSystem()) {
+    // The inner URL must be built from the *result's* spec: the length and
+    // the inner parsed offsets describe result.spec_, not this URL's spec_,
+    // which may be shorter or hold different text.
+    result.inner_url_ = new GURL(result.spec_.data(), result.parsed_.Length(),
+                                 *result.parsed_.inner_parsed(), true);
+  }
+  return result;
+}
+
+// Resolves the wide string |relative| against this URL, passing
+// |charset_converter| (may be NULL) through to url_util::ResolveRelative.
+// Returns an empty, invalid GURL when this URL is invalid or resolution
+// fails.
+// Note: code duplicated above (it's inconvenient to use a template here).
+GURL GURL::ResolveWithCharsetConverter(
+    const string16& relative,
+    url_canon::CharsetConverter* charset_converter) const {
+  // Not allowed for invalid URLs.
+  if (!is_valid_)
+    return GURL();
+
+  GURL result;
+
+  // Reserve enough room in the output for the input, plus some extra so that
+  // we have room if we have to escape a few things without reallocating.
+  result.spec_.reserve(spec_.size() + 32);
+  url_canon::StdStringCanonOutput output(&result.spec_);
+
+  if (!url_util::ResolveRelative(
+          spec_.data(), static_cast<int>(spec_.length()), parsed_,
+          relative.data(), static_cast<int>(relative.length()),
+          charset_converter, &output, &result.parsed_)) {
+    // Error resolving, return an empty URL.
+    return GURL();
+  }
+
+  output.Complete();
+  result.is_valid_ = true;
+  if (result.SchemeIsFileSystem()) {
+    // The inner URL must be built from the *result's* spec: the length and
+    // the inner parsed offsets describe result.spec_, not this URL's spec_,
+    // which may be shorter or hold different text.
+    result.inner_url_ = new GURL(result.spec_.data(), result.parsed_.Length(),
+                                 *result.parsed_.inner_parsed(), true);
+  }
+  return result;
+}
+
+// Returns a copy of this URL with the components named in |replacements|
+// (narrow-character version) replaced. The result's validity is whatever
+// url_util::ReplaceComponents reports; an invalid source URL yields an
+// empty, invalid GURL.
+// Note: code duplicated below (it's inconvenient to use a template here).
+GURL GURL::ReplaceComponents(
+    const url_canon::Replacements<char>& replacements) const {
+  // Not allowed for invalid URLs.
+  if (!is_valid_)
+    return GURL();
+
+  GURL result;
+
+  // Reserve enough room in the output for the input, plus some extra so that
+  // we have room if we have to escape a few things without reallocating.
+  result.spec_.reserve(spec_.size() + 32);
+  url_canon::StdStringCanonOutput output(&result.spec_);
+
+  result.is_valid_ = url_util::ReplaceComponents(
+      spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
+      NULL, &output, &result.parsed_);
+
+  output.Complete();
+  if (result.is_valid_ && result.SchemeIsFileSystem()) {
+    // The inner URL must be built from the *result's* spec: the length and
+    // the inner parsed offsets describe result.spec_, not this URL's spec_,
+    // which may be shorter or hold different text.
+    result.inner_url_ = new GURL(result.spec_.data(), result.parsed_.Length(),
+                                 *result.parsed_.inner_parsed(), true);
+  }
+  return result;
+}
+
+// Returns a copy of this URL with the components named in |replacements|
+// (wide-character version) replaced. The result's validity is whatever
+// url_util::ReplaceComponents reports; an invalid source URL yields an
+// empty, invalid GURL.
+// Note: code duplicated above (it's inconvenient to use a template here).
+GURL GURL::ReplaceComponents(
+    const url_canon::Replacements<char16>& replacements) const {
+  // Not allowed for invalid URLs.
+  if (!is_valid_)
+    return GURL();
+
+  GURL result;
+
+  // Reserve enough room in the output for the input, plus some extra so that
+  // we have room if we have to escape a few things without reallocating.
+  result.spec_.reserve(spec_.size() + 32);
+  url_canon::StdStringCanonOutput output(&result.spec_);
+
+  result.is_valid_ = url_util::ReplaceComponents(
+      spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
+      NULL, &output, &result.parsed_);
+
+  output.Complete();
+  if (result.is_valid_ && result.SchemeIsFileSystem()) {
+    // The inner URL must be built from the *result's* spec: the length and
+    // the inner parsed offsets describe result.spec_, not this URL's spec_,
+    // which may be shorter or hold different text.
+    result.inner_url_ = new GURL(result.spec_.data(), result.parsed_.Length(),
+                                 *result.parsed_.inner_parsed(), true);
+  }
+  return result;
+}
+
+// Returns this URL with username, password, path, query and ref cleared.
+// Filesystem URLs delegate to their inner URL. Invalid or nonstandard URLs
+// have no meaningful origin and yield the empty URL.
+GURL GURL::GetOrigin() const {
+  if (!is_valid_)
+    return GURL();
+  if (!IsStandard())
+    return GURL();
+
+  if (SchemeIsFileSystem())
+    return inner_url_->GetOrigin();
+
+  url_canon::Replacements<char> origin_only;
+  origin_only.ClearUsername();
+  origin_only.ClearPassword();
+  origin_only.ClearPath();
+  origin_only.ClearQuery();
+  origin_only.ClearRef();
+  return ReplaceComponents(origin_only);
+}
+
+// Returns a copy of this URL whose path is reduced to "/" and whose query
+// and ref are dropped. Invalid or nonstandard URLs yield the empty URL; a
+// URL with a zero-length path is returned as a plain copy.
+GURL GURL::GetWithEmptyPath() const {
+  if (!is_valid_ || !IsStandard())
+    return GURL();
+
+  // This could be optimized: the URL is already canonical and the path being
+  // written is canonical too, so no re-parsing is needed.
+  GURL trimmed(*this);
+  if (parsed_.path.len != 0) {
+    // Nothing after the path survives.
+    trimmed.parsed_.query.reset();
+    trimmed.parsed_.ref.reset();
+
+    // The path has at least one character, so overwrite the first one with a
+    // slash and cut the spec right after it.
+    const int path_start = trimmed.parsed_.path.begin;
+    trimmed.spec_[path_start] = '/';
+    trimmed.parsed_.path.len = 1;
+    trimmed.spec_.resize(path_start + 1);
+  }
+  return trimmed;
+}
+
+// Returns the result of url_util::IsStandard for this URL's scheme component.
+bool GURL::IsStandard() const {
+  return url_util::IsStandard(spec_.data(), parsed_.scheme);
+}
+
+// Compares this URL's scheme against |lower_ascii_scheme| using
+// url_util::LowerCaseEqualsASCII. A URL without a scheme only matches a NULL
+// |lower_ascii_scheme|.
+bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
+  if (parsed_.scheme.len > 0) {
+    const char* scheme_first = spec_.data() + parsed_.scheme.begin;
+    const char* scheme_last = spec_.data() + parsed_.scheme.end();
+    return url_util::LowerCaseEqualsASCII(scheme_first, scheme_last,
+                                          lower_ascii_scheme);
+  }
+  return lower_ascii_scheme == NULL;
+}
+
+// Returns the port parsed from the spec, or url_parse::PORT_UNSPECIFIED when
+// the port component is not non-empty.
+int GURL::IntPort() const {
+  if (!parsed_.port.is_nonempty())
+    return url_parse::PORT_UNSPECIFIED;
+  return url_parse::ParsePort(spec_.data(), parsed_.port);
+}
+
+// Like IntPort(), except that a standard URL with no explicit port reports
+// the default port for its scheme.
+int GURL::EffectiveIntPort() const {
+  const int explicit_port = IntPort();
+  if (explicit_port != url_parse::PORT_UNSPECIFIED || !IsStandard())
+    return explicit_port;
+
+  const char* scheme_text = spec_.data() + parsed_.scheme.begin;
+  return url_canon::DefaultPortForScheme(scheme_text, parsed_.scheme.len);
+}
+
+// Returns the text of the file-name component that
+// url_parse::ExtractFileName locates within this URL's path.
+std::string GURL::ExtractFileName() const {
+  url_parse::Component file_component;
+  url_parse::ExtractFileName(spec_.data(), parsed_.path, &file_component);
+  return ComponentString(file_component);
+}
+
+// Returns the path plus the query (if any), with the ref removed.
+std::string GURL::PathForRequest() const {
+  DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty";
+
+  const int path_begin = parsed_.path.begin;
+  int request_len;
+  if (parsed_.ref.len >= 0) {
+    // A ref is present. It starts just after the '#', so stop one character
+    // earlier to drop the separator as well.
+    request_len = parsed_.ref.begin - path_begin - 1;
+  } else if (parsed_.query.is_valid()) {
+    // No ref: measure up to the end of the query instead of relying on the
+    // spec's terminator. For an inner_url the spec continues on into the
+    // outer url's path/query/ref.
+    request_len = parsed_.query.end() - path_begin;
+  } else {
+    request_len = parsed_.path.len;
+  }
+  return std::string(spec_, path_begin, request_len);
+}
+
+// Returns the host text; when it is wrapped in square brackets (the IPv6
+// literal form) the brackets are stripped.
+std::string GURL::HostNoBrackets() const {
+  url_parse::Component host(parsed_.host);
+  const bool bracketed = host.len >= 2 &&
+                         spec_[host.begin] == '[' &&
+                         spec_[host.end() - 1] == ']';
+  if (bracketed) {
+    host.begin += 1;
+    host.len -= 2;
+  }
+  return ComponentString(host);
+}
+
+// Reports whether url_canon::CanonicalizeIPAddress classifies the host as an
+// IP address. Invalid or empty URLs never qualify.
+bool GURL::HostIsIPAddress() const {
+  if (!is_valid_)
+    return false;
+  if (spec_.empty())
+    return false;
+
+  // Only the classification is wanted; the canonical text is thrown away.
+  url_canon::RawCanonOutputT<char, 128> scratch;
+  url_canon::CanonHostInfo info;
+  url_canon::CanonicalizeIPAddress(spec_.c_str(), parsed_.host,
+                                   &scratch, &info);
+  return info.IsIPAddress();
+}
+
+#ifdef WIN32
+
+// Returns a reference to a shared default-constructed GURL, allocated on
+// first use to avoid static object construction/destruction.
+const GURL& GURL::EmptyGURL() {
+  if (empty_gurl)
+    return *empty_gurl;
+
+  // Several threads may get here at once. Each builds a candidate and only
+  // one gets published.
+  GURL* candidate = new GURL;
+  PVOID previous = InterlockedCompareExchangePointer(
+      reinterpret_cast<PVOID*>(&empty_gurl), candidate, NULL);
+  if (previous != NULL) {
+    // A non-NULL previous value means no exchange happened: another thread
+    // already published its GURL, so ours is discarded.
+    delete candidate;
+  }
+  return *empty_gurl;
+}
+
+#else
+
+// pthread_once() callback: allocates the shared empty GURL.
+void EmptyGURLOnce() {
+  empty_gurl = new GURL;
+}
+
+// Returns a reference to a shared default-constructed GURL.
+const GURL& GURL::EmptyGURL() {
+  // Avoid static object construction/destruction on startup/shutdown: the
+  // object is allocated by EmptyGURLOnce via pthread_once().
+  pthread_once(&empty_gurl_once, EmptyGURLOnce);
+  return *empty_gurl;
+}
+
+#endif  // WIN32
+
+// Returns true when this URL's host is |lower_ascii_domain| or ends with it
+// on a label boundary.
+//   lower_ascii_domain: domain text to compare against; |domain_len|
+//       characters are read from it.
+//   domain_len: number of characters in |lower_ascii_domain|.
+// Invalid URLs, an empty domain and URLs without a host return false.
+bool GURL::DomainIs(const char* lower_ascii_domain,
+                    int domain_len) const {
+  // Return false if this URL is not valid or domain is empty.
+  if (!is_valid_ || !domain_len)
+    return false;
+
+  // FileSystem URLs have empty parsed_.host, so check this first.
+  if (SchemeIsFileSystem() && inner_url_)
+    return inner_url_->DomainIs(lower_ascii_domain, domain_len);
+
+  if (!parsed_.host.is_nonempty())
+    return false;
+
+  // Check whether the host name ends with a dot. If so, treat it the same as
+  // the dot-less form, unless the domain being compared also ends with a
+  // dot.
+  const char* last_pos = spec_.data() + parsed_.host.end() - 1;
+  int host_len = parsed_.host.len;
+  if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
+    last_pos--;
+    host_len--;
+  }
+
+  // Return false if host's length is less than domain's length.
+  if (host_len < domain_len)
+    return false;
+
+  // Compare the trailing |domain_len| characters of the host with the
+  // domain.
+  const char* start_pos = spec_.data() + parsed_.host.begin +
+                          host_len - domain_len;
+
+  if (!url_util::LowerCaseEqualsASCII(start_pos,
+                                      last_pos + 1,
+                                      lower_ascii_domain,
+                                      lower_ascii_domain + domain_len))
+    return false;
+
+  // Make sure the match starts on a label boundary: unless the domain itself
+  // begins with a dot, a longer host must have a dot right before the
+  // matched range. For example www.google.com has domain "google.com" but
+  // www.iamnotgoogle.com does not.
+  if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
+      '.' != *(start_pos - 1))
+    return false;
+
+  return true;
+}
+
+// Exchanges the complete state of this URL with |*other|, member by member.
+void GURL::Swap(GURL* other) {
+  std::swap(inner_url_, other->inner_url_);
+  std::swap(parsed_, other->parsed_);
+  std::swap(is_valid_, other->is_valid_);
+  spec_.swap(other->spec_);
+}
+
+// Streams the URL's spec, even when the URL is invalid.
+std::ostream& operator<<(std::ostream& out, const GURL& url) {
+  out << url.possibly_invalid_spec();
+  return out;
+}
diff --git a/googleurl/url_canon_etc.cpp b/googleurl/url_canon_etc.cpp
new file mode 100644
index 0000000..d3f4596
--- /dev/null
+++ b/googleurl/url_canon_etc.cpp
@@ -0,0 +1,392 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Canonicalizers for random bits that aren't big enough for their own files.
+
+#include <string.h>
+
+#include "url_canon.h"
+#include "url_canon_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Tells whether |ch| is one of the characters stripped from the middle of a
+// URL: carriage return, line feed or tab.
+inline bool IsRemovableURLWhitespace(int ch) {
+  switch (ch) {
+    case '\r':
+    case '\n':
+    case '\t':
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Backend for RemoveURLWhitespace (see declaration in url_canon.h).
+// Returns |input| itself (with |*output_len| == |input_len|) when it holds
+// no removable whitespace. Otherwise the filtered characters are appended to
+// |buffer| and the buffer's data and length are returned.
+// It sucks that we have to do this, since this takes about 13% of the total
+// URL canonicalization time.
+template<typename CHAR>
+const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
+                                  CanonOutputT<CHAR>* buffer,
+                                  int* output_len) {
+  // Fast scan for the first removable character. Finding none is the 99%
+  // case, so it must be cheap; the slow path's speed does not matter.
+  int first_removable = 0;
+  while (first_removable < input_len &&
+         !IsRemovableURLWhitespace(input[first_removable])) {
+    ++first_removable;
+  }
+
+  if (first_removable >= input_len) {
+    // Nothing to strip: the input doubles as the output.
+    *output_len = input_len;
+    return input;
+  }
+
+  // Copy everything except the removable characters into the buffer.
+  for (int pos = 0; pos < input_len; ++pos) {
+    const CHAR current = input[pos];
+    if (IsRemovableURLWhitespace(current))
+      continue;
+    buffer->push_back(current);
+  }
+  *output_len = buffer->length();
+  return buffer->data();
+}
+
+// Contains the canonical version of each possible input letter in the scheme
+// (basically, lower-cased). The corresponding entry will be 0 if the letter
+// is not allowed in a scheme. The table is indexed by ASCII code and covers
+// 0x00-0x7f only; callers must range-check before indexing.
+const char kSchemeCanonical[0x80] = {
+// 00-1f: all are invalid
+     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
+     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  '+',  0,  '-', '.',  0,
+//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
+//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
+     0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0,   0 ,  0,   0 ,  0,
+//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
+     0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  (7f)
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0 ,  0 ,  0 ,  0 ,  0 };
+
+// Tells whether |c| may start a scheme, i.e. is an ASCII letter. A table
+// lookup (say, a high bit per valid character) would also work, but this
+// runs only once per URL and keeping it separate leaves the lookup table
+// easier to read.
+inline bool IsSchemeFirstChar(unsigned char c) {
+  if (c >= 'a' && c <= 'z')
+    return true;
+  return c >= 'A' && c <= 'Z';
+}
+
+// Writes the canonical form of the |scheme| component of |spec| to |output|,
+// followed by a colon, and records where the scheme landed in |*out_scheme|
+// (the colon is not part of the recorded range).
+// Returns false when the scheme contained a character that is not allowed;
+// the output is still written in that case. An unspecified or empty scheme
+// produces just the colon and returns true.
+template<typename CHAR, typename UCHAR>
+bool DoScheme(const CHAR* spec,
+              const url_parse::Component& scheme,
+              CanonOutput* output,
+              url_parse::Component* out_scheme) {
+  if (scheme.len <= 0) {
+    // Scheme is unspecified or empty, convert to empty by appending a colon.
+    *out_scheme = url_parse::Component(output->length(), 0);
+    output->push_back(':');
+    return true;
+  }
+
+  // The output scheme starts from the current position.
+  out_scheme->begin = output->length();
+
+  // Danger: it's important that this code does not strip any characters: it
+  // only emits the canonical version (be it valid or escaped) of each of
+  // the input characters. Stripping would put it out of sync with
+  // url_util::FindAndCompareScheme, which could cause some security checks on
+  // schemes to be incorrect.
+  bool success = true;
+  int end = scheme.end();
+  for (int i = scheme.begin; i < end; i++) {
+    UCHAR ch = static_cast<UCHAR>(spec[i]);
+    // Stays 0 when |ch| has no canonical scheme form (non-ASCII, a
+    // disallowed character, or a non-letter in first position).
+    char replacement = 0;
+    if (ch < 0x80) {
+      if (i == scheme.begin) {
+        // Need to do a special check for the first letter of the scheme.
+        if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
+          replacement = kSchemeCanonical[ch];
+      } else {
+        replacement = kSchemeCanonical[ch];
+      }
+    }
+
+    if (replacement) {
+      output->push_back(replacement);
+    } else if (ch == '%') {
+      // Canonicalizing the scheme multiple times should lead to the same
+      // result. Since invalid characters will be escaped, we need to preserve
+      // the percent to avoid multiple escaping. The scheme will be invalid.
+      success = false;
+      output->push_back('%');
+    } else {
+      // Invalid character, store it but mark this scheme as invalid.
+      success = false;
+
+      // This will escape the output and also handle encoding issues.
+      // Ignore the return value since we already failed.
+      // |i| is passed by address, so the helper can move the loop index.
+      AppendUTF8EscapedChar(spec, &i, end, output);
+    }
+  }
+
+  // The output scheme ends at the current position, before appending the
+  // colon.
+  out_scheme->len = output->length() - out_scheme->begin;
+  output->push_back(':');
+  return success;
+}
+
+// Writes the canonical "username[:password]@" prefix to |output| and records
+// the component ranges in |*out_username| / |*out_password|. Always returns
+// true.
+// |username| and |password| index into |username_spec| and |password_spec|
+// respectively. These are usually one and the same string (a single source
+// is being canonicalized) but may differ when components are replaced.
+template<typename CHAR, typename UCHAR>
+bool DoUserInfo(const CHAR* username_spec,
+                const url_parse::Component& username,
+                const CHAR* password_spec,
+                const url_parse::Component& password,
+                CanonOutput* output,
+                url_parse::Component* out_username,
+                url_parse::Component* out_password) {
+  const bool has_username = username.len > 0;
+  const bool has_password = password.len > 0;
+
+  if (!has_username && !has_password) {
+    // Common case: no user info at all. Empty usernames/passwords are
+    // stripped, so nothing is written.
+    *out_username = url_parse::Component();
+    *out_password = url_parse::Component();
+    return true;
+  }
+
+  // Username, with characters not valid there escaped.
+  out_username->begin = output->length();
+  if (has_username) {
+    AppendStringOfType(&username_spec[username.begin], username.len,
+                       CHAR_USERINFO, output);
+  }
+  out_username->len = output->length() - out_username->begin;
+
+  // Password: the separator is only written when there is a password. An
+  // empty but specified password is stripped.
+  if (!has_password) {
+    *out_password = url_parse::Component();
+  } else {
+    output->push_back(':');
+    out_password->begin = output->length();
+    AppendStringOfType(&password_spec[password.begin], password.len,
+                       CHAR_USERINFO, output);
+    out_password->len = output->length() - out_password->begin;
+  }
+
+  output->push_back('@');
+  return true;
+}
+
+// Helper for converting a port integer to a string: writes |port| in base 10
+// into |output|, a buffer of |output_len| chars, via _itoa_s.
+// NOTE(review): _itoa_s is not a standard function; presumably a project
+// header supplies it on non-Windows builds - confirm.
+inline void WritePortInt(char* output, int output_len, int port) {
+  _itoa_s(port, output, output_len, 10);
+}
+
+// Canonicalizes the |port| component of |spec| into |output|, writing the
+// leading colon itself whenever a port is emitted, and records the written
+// range (without the colon) in |*out_port|.
+// A missing port, or one equal to |default_port_for_scheme|, writes nothing.
+// Returns false only for an invalid port, whose original text is copied to
+// the output.
+template<typename CHAR, typename UCHAR>
+bool DoPort(const CHAR* spec,
+            const url_parse::Component& port,
+            int default_port_for_scheme,
+            CanonOutput* output,
+            url_parse::Component* out_port) {
+  const int parsed_port = url_parse::ParsePort(spec, port);
+
+  const bool omit_port = parsed_port == url_parse::PORT_UNSPECIFIED ||
+                         parsed_port == default_port_for_scheme;
+  if (omit_port) {
+    *out_port = url_parse::Component();  // Leave port empty.
+    return true;
+  }
+
+  // Every remaining case writes the separator first.
+  output->push_back(':');
+  out_port->begin = output->length();
+
+  const bool port_ok = parsed_port != url_parse::PORT_INVALID;
+  if (port_ok) {
+    // Turn the number back into text. A port has at most 5 digits, and
+    // parsing has already made sure the value is in range.
+    const int kDigitsSize = 6;
+    char digits[kDigitsSize];
+    WritePortInt(digits, kDigitsSize, parsed_port);
+    for (int pos = 0; pos < kDigitsSize && digits[pos]; ++pos)
+      output->push_back(digits[pos]);
+  } else {
+    // Invalid port: copy the input text so the user can see what the error
+    // was. Returning false marks the URL as invalid.
+    AppendInvalidNarrowString(spec, port.begin, port.end(), output);
+  }
+
+  out_port->len = output->length() - out_port->begin;
+  return port_ok;
+}
+
+// Writes the canonical form of the |ref| component of |spec| to |output|,
+// preceded by '#', and records the written range (without the '#') in
+// |*out_ref|. A ref that is not present (negative length) writes nothing.
+template<typename CHAR, typename UCHAR>
+void DoCanonicalizeRef(const CHAR* spec,
+                       const url_parse::Component& ref,
+                       CanonOutput* output,
+                       url_parse::Component* out_ref) {
+  if (ref.len < 0) {
+    // Common case of no ref.
+    *out_ref = url_parse::Component();
+    return;
+  }
+
+  // Append the ref separator. Note that we need to do this even when the ref
+  // is empty but present.
+  output->push_back('#');
+  out_ref->begin = output->length();
+
+  // Now iterate through all the characters, converting to UTF-8 and validating.
+  int end = ref.end();
+  for (int i = ref.begin; i < end; i++) {
+    if (spec[i] == 0) {
+      // IE just strips NULLs, so we do too.
+      continue;
+    } else if (static_cast<UCHAR>(spec[i]) < 0x20) {
+      // Unlike what IE seems to do, we escape control characters. This will
+      // probably make the reference fragment unusable on a web page, but
+      // people shouldn't be using control characters in their anchor names.
+      AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
+    } else if (static_cast<UCHAR>(spec[i]) < 0x80) {
+      // Normal ASCII characters are just appended.
+      output->push_back(static_cast<char>(spec[i]));
+    } else {
+      // Non-ASCII characters are appended unescaped, but only when they are
+      // valid. Invalid Unicode characters are replaced with the "invalid
+      // character" as IE seems to (ReadUTFChar puts the unicode replacement
+      // character in the output on failure for us).
+      // |i| is passed by address, so ReadUTFChar can move the loop index.
+      unsigned code_point;
+      ReadUTFChar(spec, &i, end, &code_point);
+      AppendUTF8Value(code_point, output);
+    }
+  }
+
+  out_ref->len = output->length() - out_ref->begin;
+}
+
+}  // namespace
+
+// Narrow-character entry point; forwards to DoRemoveURLWhitespace, which
+// returns either |input| itself or the contents of |buffer| and stores the
+// resulting length in |*output_len|.
+const char* RemoveURLWhitespace(const char* input, int input_len,
+                                CanonOutputT<char>* buffer,
+                                int* output_len) {
+  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
+}
+
+// Wide-character entry point; same contract as the narrow overload.
+const char16* RemoveURLWhitespace(const char16* input, int input_len,
+                                  CanonOutputT<char16>* buffer,
+                                  int* output_len) {
+  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
+}
+
+// Maps |ch| to its canonical scheme character, or 0 when |ch| is not allowed
+// in a scheme. Schemes do not support non-ASCII, so anything at or above
+// 0x80 yields 0.
+char CanonicalSchemeChar(char16 ch) {
+  if (ch < 0x80)
+    return kSchemeCanonical[ch];
+  return 0;
+}
+
+// Narrow-character entry point for scheme canonicalization; forwards to
+// DoScheme and returns its result.
+bool CanonicalizeScheme(const char* spec,
+                        const url_parse::Component& scheme,
+                        CanonOutput* output,
+                        url_parse::Component* out_scheme) {
+  return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
+}
+
+// Wide-character entry point for scheme canonicalization; forwards to
+// DoScheme and returns its result.
+bool CanonicalizeScheme(const char16* spec,
+                        const url_parse::Component& scheme,
+                        CanonOutput* output,
+                        url_parse::Component* out_scheme) {
+  return DoScheme<char16, char16>(spec, scheme, output, out_scheme);
+}
+
+// Narrow-character entry point for username/password canonicalization;
+// forwards to DoUserInfo and returns its result.
+bool CanonicalizeUserInfo(const char* username_source,
+                          const url_parse::Component& username,
+                          const char* password_source,
+                          const url_parse::Component& password,
+                          CanonOutput* output,
+                          url_parse::Component* out_username,
+                          url_parse::Component* out_password) {
+  return DoUserInfo<char, unsigned char>(
+      username_source, username, password_source, password,
+      output, out_username, out_password);
+}
+
+// Wide-character entry point for username/password canonicalization;
+// forwards to DoUserInfo and returns its result.
+bool CanonicalizeUserInfo(const char16* username_source,
+                          const url_parse::Component& username,
+                          const char16* password_source,
+                          const url_parse::Component& password,
+                          CanonOutput* output,
+                          url_parse::Component* out_username,
+                          url_parse::Component* out_password) {
+  return DoUserInfo<char16, char16>(
+      username_source, username, password_source, password,
+      output, out_username, out_password);
+}
+
+// Narrow-character entry point for port canonicalization; forwards to DoPort
+// and returns its result.
+bool CanonicalizePort(const char* spec,
+                      const url_parse::Component& port,
+                      int default_port_for_scheme,
+                      CanonOutput* output,
+                      url_parse::Component* out_port) {
+  return DoPort<char, unsigned char>(spec, port,
+                                     default_port_for_scheme,
+                                     output, out_port);
+}
+
+// Wide-character entry point for port canonicalization; forwards to DoPort
+// and returns its result.
+bool CanonicalizePort(const char16* spec,
+                      const url_parse::Component& port,
+                      int default_port_for_scheme,
+                      CanonOutput* output,
+                      url_parse::Component* out_port) {
+  return DoPort<char16, char16>(spec, port, default_port_for_scheme,
+                                      output, out_port);
+}
+
+// Narrow-character entry point for ref (fragment) canonicalization; forwards
+// to DoCanonicalizeRef.
+void CanonicalizeRef(const char* spec,
+                     const url_parse::Component& ref,
+                     CanonOutput* output,
+                     url_parse::Component* out_ref) {
+  DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
+}
+
+// Wide-character entry point for ref (fragment) canonicalization; forwards
+// to DoCanonicalizeRef.
+void CanonicalizeRef(const char16* spec,
+                     const url_parse::Component& ref,
+                     CanonOutput* output,
+                     url_parse::Component* out_ref) {
+  DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref);
+}
+
+}  // namespace url_canon
diff --git a/googleurl/url_canon_filesystemurl.cpp b/googleurl/url_canon_filesystemurl.cpp
new file mode 100644
index 0000000..47a4666
--- /dev/null
+++ b/googleurl/url_canon_filesystemurl.cpp
@@ -0,0 +1,160 @@
+// Copyright 2012, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Functions for canonicalizing "filesystem:file:" URLs.
+
+#include "url_canon.h"
+#include "url_canon_internal.h"
+#include "url_file.h"
+#include "url_parse_internal.h"
+#include "url_util.h"
+#include "url_util_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Canonicalizes a "filesystem:" URL. The outer components are read through
+// |source|, since they can have replacements; the inner URL cannot, so it is
+// always read from |spec|. Returns false when the inner URL is missing or of
+// an unsupported kind, or when a required canonicalization step fails.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeFileSystemURL(const CHAR* spec,
+                                 const URLComponentSource<CHAR>& source,
+                                 const url_parse::Parsed& parsed,
+                                 CharsetConverter* charset_converter,
+                                 CanonOutput* output,
+                                 url_parse::Parsed* new_parsed) {
+  // Only {scheme, path, query, ref} are meaningful here; blank out the rest.
+  const url_parse::Component unused = url_parse::Component();
+  new_parsed->username = unused;
+  new_parsed->password = unused;
+  new_parsed->host = unused;
+  new_parsed->port = unused;
+
+  // The scheme is fixed, so emit it directly instead of going through the
+  // general scheme canonicalizer. The component excludes the trailing colon.
+  new_parsed->scheme.begin = output->length();
+  output->Append("filesystem:", 11);
+  new_parsed->scheme.len = 10;
+
+  // Everything below needs an inner URL with a valid scheme.
+  const url_parse::Parsed* inner = parsed.inner_parsed();
+  if (!inner || !inner->scheme.is_valid())
+    return false;
+
+  url_parse::Parsed canon_inner;
+  bool ok = true;
+  if (url_util::CompareSchemeComponent(spec, inner->scheme,
+                                       url_util::kFileScheme)) {
+    // Inner file: URL -- write the fixed prefix, then canonicalize its path.
+    canon_inner.scheme.begin = output->length();
+    output->Append("file://", 7);
+    canon_inner.scheme.len = 4;
+    ok &= CanonicalizePath(spec, inner->path, output, &canon_inner.path);
+  } else if (url_util::IsStandard(spec, inner->scheme)) {
+    ok = url_canon::CanonicalizeStandardURL(spec, inner->Length(), *inner,
+                                            charset_converter, output,
+                                            &canon_inner);
+  } else {
+    // TODO(ericu): The URL is wrong, but should we try to output more of what
+    // we were given?  Echoing back filesystem:mailto etc. doesn't seem all that
+    // useful.
+    return false;
+  }
+
+  // A valid filesystem type needs more than just the leading slash.
+  ok &= inner->path.len > 1;
+  ok &= CanonicalizePath(source.path, parsed.path, output, &new_parsed->path);
+
+  // Query/ref failures are ignored: the URL can probably still be loaded.
+  CanonicalizeQuery(source.query, parsed.query, charset_converter,
+                    output, &new_parsed->query);
+  CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
+
+  if (ok)
+    new_parsed->set_inner_parsed(canon_inner);
+
+  return ok;
+}
+
+}  // namespace
+
+// 8-bit entry point; the component source is built from |spec| alone.
+bool CanonicalizeFileSystemURL(const char* spec, int spec_len,
+                               const url_parse::Parsed& parsed,
+                               CharsetConverter* charset_converter,
+                               CanonOutput* output,
+                               url_parse::Parsed* new_parsed) {
+  (void)spec_len;  // Intentionally unused.
+  const URLComponentSource<char> source(spec);
+  return DoCanonicalizeFileSystemURL<char, unsigned char>(
+      spec, source, parsed, charset_converter, output, new_parsed);
+}
+
+// UTF-16 entry point; the component source is built from |spec| alone.
+bool CanonicalizeFileSystemURL(const char16* spec, int spec_len,
+                               const url_parse::Parsed& parsed,
+                               CharsetConverter* charset_converter,
+                               CanonOutput* output,
+                               url_parse::Parsed* new_parsed) {
+  (void)spec_len;  // Intentionally unused.
+  const URLComponentSource<char16> source(spec);
+  return DoCanonicalizeFileSystemURL<char16, char16>(
+      spec, source, parsed, charset_converter, output, new_parsed);
+}
+
+// Canonicalizes |base| after merging 8-bit |replacements| into its parse.
+bool ReplaceFileSystemURL(const char* base,
+                          const url_parse::Parsed& base_parsed,
+                          const Replacements<char>& replacements,
+                          CharsetConverter* charset_converter,
+                          CanonOutput* output, url_parse::Parsed* new_parsed) {
+  url_parse::Parsed merged(base_parsed);
+  URLComponentSource<char> src(base);
+  SetupOverrideComponents(base, replacements, &src, &merged);
+  return DoCanonicalizeFileSystemURL<char, unsigned char>(
+      base, src, merged, charset_converter, output, new_parsed);
+}
+
+// Canonicalizes |base| after merging UTF-16 |replacements| into its parse.
+bool ReplaceFileSystemURL(const char* base,
+                          const url_parse::Parsed& base_parsed,
+                          const Replacements<char16>& replacements,
+                          CharsetConverter* charset_converter,
+                          CanonOutput* output, url_parse::Parsed* new_parsed) {
+  RawCanonOutput<1024> utf8_buf;
+  url_parse::Parsed merged(base_parsed);
+  URLComponentSource<char> src(base);
+  SetupUTF16OverrideComponents(base, replacements, &utf8_buf, &src, &merged);
+  return DoCanonicalizeFileSystemURL<char, unsigned char>(
+      base, src, merged, charset_converter, output, new_parsed);
+}
+
+}  // namespace url_canon
diff --git a/googleurl/url_canon_fileurl.cpp b/googleurl/url_canon_fileurl.cpp
new file mode 100644
index 0000000..15fedb9
--- /dev/null
+++ b/googleurl/url_canon_fileurl.cpp
@@ -0,0 +1,217 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Functions for canonicalizing "file:" URLs.
+
+#include "url_canon.h"
+#include "url_canon_internal.h"
+#include "url_file.h"
+#include "url_parse_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+#ifdef WIN32
+
+// Copies a canonicalized Windows drive spec (letter plus colon) from |spec|
+// to |output| when one starts at |begin|, after any leading slashes. The
+// drive letter is upper-cased and the separator is always written as ':'.
+// Returns the input index just past the drive spec, or |begin| unchanged --
+// with nothing written -- when no drive spec is present.
+template<typename CHAR>
+int FileDoDriveSpec(const CHAR* spec, int begin, int end,
+                    CanonOutput* output) {
+  // Possible inputs include /foo/bar, c:/foo/bar and /c:/foo (slashes may
+  // also be backslashes), so step over any leading run of slashes first.
+  const int drive_pos =
+      begin + url_parse::CountConsecutiveSlashes(spec, begin, end);
+
+  if (!url_parse::DoesBeginWindowsDriveSpec(spec, drive_pos, end))
+    return begin;  // Nothing consumed.
+
+  // The drive spec begins the path, so emit the slash that terminates the
+  // authority section (typically the third slash).
+  output->push_back('/');
+
+  // DoesBeginWindowsDriveSpec has already verified that there is a valid
+  // drive letter here and that a colon or pipe follows it, so it is safe to
+  // read the letter and to assume the spec is exactly two characters long.
+  const CHAR drive = spec[drive_pos];
+
+  // Drive letters are normalized to upper case.
+  const bool is_lower = drive >= 'a' && drive <= 'z';
+  output->push_back(static_cast<char>(is_lower ? drive - 'a' + 'A' : drive));
+
+  // Whatever followed the letter (colon or pipe) becomes a colon.
+  output->push_back(':');
+  return drive_pos + 2;
+}
+
+#endif  // WIN32
+
+// Canonicalizes the path of a file: URL into |output| and records the span
+// that was written in |*out_path|. Returns false if the generic path
+// canonicalizer reports a failure.
+template<typename CHAR, typename UCHAR>
+bool DoFileCanonicalizePath(const CHAR* spec, const url_parse::Component& path,
+                            CanonOutput* output,
+                            url_parse::Component* out_path) {
+  out_path->begin = output->length();
+
+  // On Windows a leading drive spec ("c:") is copied and normalized first;
+  // elsewhere the whole path is handled by the generic code below.
+#ifdef WIN32
+  const int rest_begin = FileDoDriveSpec(spec, path.begin, path.end(), output);
+#else
+  const int rest_begin = path.begin;
+#endif
+
+  bool ok = true;
+  if (rest_begin >= path.end()) {
+    // No input path, canonicalize to a slash.
+    output->push_back('/');
+  } else {
+    // Run the regular path canonicalizer over what remains. Its own output
+    // component is thrown away because the full span is computed below.
+    url_parse::Component discarded;
+    ok = CanonicalizePath(spec,
+                          url_parse::MakeRange(rest_begin, path.end()),
+                          output, &discarded);
+  }
+
+  out_path->len = output->length() - out_path->begin;
+  return ok;
+}
+
+// Shared implementation for canonicalizing a "file:" URL. Returns true only
+// if both the host and the path canonicalized successfully.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source,
+                           const url_parse::Parsed& parsed,
+                           CharsetConverter* query_converter,
+                           CanonOutput* output, url_parse::Parsed* new_parsed) {
+  // Things we don't set in file: URLs.
+  const url_parse::Component unset = url_parse::Component();
+  new_parsed->username = unset;
+  new_parsed->password = unset;
+  new_parsed->port = unset;
+
+  // The scheme is fixed, so write it out directly rather than running the
+  // general scheme canonicalizer. The component covers only "file".
+  new_parsed->scheme.begin = output->length();
+  output->Append("file://", 7);
+  new_parsed->scheme.len = 4;
+
+  // Host: empty for many file URLs, present for UNC.
+  // TODO(brettw) This doesn't do any checking for host name validity. We
+  // should probably handle validity checking of UNC hosts differently than
+  // for regular IP hosts.
+  const bool host_ok =
+      CanonicalizeHost(source.host, parsed.host, output, &new_parsed->host);
+  const bool path_ok = DoFileCanonicalizePath<CHAR, UCHAR>(
+      source.path, parsed.path, output, &new_parsed->path);
+
+  // Query and ref failures are ignored; the URL can probably still be loaded.
+  CanonicalizeQuery(source.query, parsed.query, query_converter,
+                    output, &new_parsed->query);
+  CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
+  return host_ok && path_ok;
+}
+
+} // namespace
+
+// 8-bit entry point; the component source is built from |spec| alone.
+bool CanonicalizeFileURL(const char* spec, int spec_len,
+                         const url_parse::Parsed& parsed,
+                         CharsetConverter* query_converter,
+                         CanonOutput* output,
+                         url_parse::Parsed* new_parsed) {
+  (void)spec_len;  // Intentionally unused.
+  const URLComponentSource<char> source(spec);
+  return DoCanonicalizeFileURL<char, unsigned char>(
+      source, parsed, query_converter, output, new_parsed);
+}
+
+// UTF-16 entry point; the component source is built from |spec| alone.
+bool CanonicalizeFileURL(const char16* spec, int spec_len,
+                         const url_parse::Parsed& parsed,
+                         CharsetConverter* query_converter,
+                         CanonOutput* output,
+                         url_parse::Parsed* new_parsed) {
+  (void)spec_len;  // Intentionally unused.
+  const URLComponentSource<char16> source(spec);
+  return DoCanonicalizeFileURL<char16, char16>(
+      source, parsed, query_converter, output, new_parsed);
+}
+
+// 8-bit overload of the file path canonicalizer; it only picks the template
+// arguments and returns whatever DoFileCanonicalizePath reports.
+bool FileCanonicalizePath(const char* spec, const url_parse::Component& path,
+                          CanonOutput* output, url_parse::Component* out_path) {
+  return DoFileCanonicalizePath<char, unsigned char>(
+      spec, path, output, out_path);
+}
+
+// UTF-16 overload of the file path canonicalizer; it only picks the template
+// arguments and returns whatever DoFileCanonicalizePath reports.
+bool FileCanonicalizePath(const char16* spec, const url_parse::Component& path,
+                          CanonOutput* output, url_parse::Component* out_path) {
+  return DoFileCanonicalizePath<char16, char16>(
+      spec, path, output, out_path);
+}
+
+// Canonicalizes |base| after merging 8-bit |replacements| into its parse.
+bool ReplaceFileURL(const char* base,
+                    const url_parse::Parsed& base_parsed,
+                    const Replacements<char>& replacements,
+                    CharsetConverter* query_converter,
+                    CanonOutput* output, url_parse::Parsed* new_parsed) {
+  url_parse::Parsed merged(base_parsed);
+  URLComponentSource<char> src(base);
+  SetupOverrideComponents(base, replacements, &src, &merged);
+  return DoCanonicalizeFileURL<char, unsigned char>(
+      src, merged, query_converter, output, new_parsed);
+}
+
+// Canonicalizes |base| after merging UTF-16 |replacements| into its parse.
+bool ReplaceFileURL(const char* base,
+                    const url_parse::Parsed& base_parsed,
+                    const Replacements<char16>& replacements,
+                    CharsetConverter* query_converter,
+                    CanonOutput* output, url_parse::Parsed* new_parsed) {
+  RawCanonOutput<1024> utf8_buf;
+  url_parse::Parsed merged(base_parsed);
+  URLComponentSource<char> src(base);
+  SetupUTF16OverrideComponents(base, replacements, &utf8_buf, &src, &merged);
+  return DoCanonicalizeFileURL<char, unsigned char>(
+      src, merged, query_converter, output, new_parsed);
+}
+
+}  // namespace url_canon
diff --git a/googleurl/url_canon_host.cpp b/googleurl/url_canon_host.cpp
new file mode 100644
index 0000000..9656799
--- /dev/null
+++ b/googleurl/url_canon_host.cpp
@@ -0,0 +1,401 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "base/logging.h"
+#include "url_canon.h"
+#include "url_canon_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// For reference, here's what IE supports:
+// Key: 0 (disallowed: failure if present in the input)
+//      + (allowed either escaped or unescaped, and unmodified)
+//      U (allowed escaped or unescaped but always unescaped if present in
+//         escaped form)
+//      E (allowed escaped or unescaped but always escaped if present in
+//         unescaped form)
+//      % (only allowed escaped in the input, will be unmodified).
+//      I left blank alpha numeric characters.
+//
+//    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+//    -----------------------------------------------
+// 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
+// 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
+// 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
+// 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
+// 4   %
+// 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
+// 6   E                                               <-- That's  `
+// 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
+//
+// NOTE: I didn't actually test all the control characters. Some may be
+// disallowed in the input, but they are all accepted escaped except for 0.
+// I also didn't test if characters affecting HTML parsing are allowed
+// unescaped, eg. (") or (#), which would indicate the beginning of the path.
+// Surprisingly, space is accepted in the input and always escaped.
+
+// Canonical form of every 7-bit character we accept in a host, indexed by
+// character code. 0 means the character is disallowed; kEsc means it is
+// allowed but must be written percent-escaped. We are a little more
+// restrictive than IE, but less restrictive than Firefox.
+//
+// Note that '%' itself is disallowed. It is still accepted as part of an
+// escape sequence, but this rejects "%25". IE allows it, but doing so would
+// make validity depend on how many times the canonicalizer is run: an invalid
+// sequence like "%zz" is output as "%25zz" and fails, yet that output would
+// succeed on a second pass. We prefer to always report the same validity.
+const unsigned char kEsc = 0xff;
+const unsigned char kHostCharLookup[0x80] = {
+// 00-1f: all are invalid
+     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
+   kEsc,kEsc,kEsc,kEsc,kEsc,  0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.',  0,
+//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':',  0 ,kEsc,kEsc,kEsc,  0 ,
+//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
+   kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[',  0 , ']',  0 , '_',
+//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
+   kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~   DEL
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc,  0 ,  0 };
+
+// Fixed-size output buffers (8-bit and 16-bit) used for temporary host text;
+// both take kTempHostBufferLen as their size argument.
+const int kTempHostBufferLen = 1024;
+typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
+typedef RawCanonOutputT<char16, kTempHostBufferLen> StackBufferW;
+
+// Inspects the |host| range of |spec| and reports two facts about it:
+// |*has_non_ascii| is set when any character is outside the 7-bit range, and
+// |*has_escaped| is set when a percent sign is present.
+template<typename CHAR, typename UCHAR>
+void ScanHostname(const CHAR* spec, const url_parse::Component& host,
+                  bool* has_non_ascii, bool* has_escaped) {
+  bool saw_high = false;
+  bool saw_percent = false;
+  const int stop = host.end();
+  for (int pos = host.begin; pos < stop; ++pos) {
+    saw_high |= (static_cast<UCHAR>(spec[pos]) >= 0x80);
+    saw_percent |= (spec[pos] == '%');
+  }
+  *has_non_ascii = saw_high;
+  *has_escaped = saw_percent;
+}
+
+// Canonicalizes a host name that is entirely 8-bit characters (even though
+// the type holding them may be 16 bits). Escaped characters will be unescaped.
+// Non-7-bit characters (for example, UTF-8) will be passed unchanged.
+//
+// The |*has_non_ascii| flag will be true if there are non-7-bit characters in
+// the output.
+//
+// This function is used in two situations:
+//
+//  * When the caller knows there is no non-ASCII or percent escaped
+//    characters. This is what DoHost does. The result will be a completely
+//    canonicalized host since we know nothing weird can happen (escaped
+//    characters could be unescaped to non-7-bit, so they have to be treated
+//    with suspicion at this point). It does not use the |has_non_ascii| flag.
+//
+//  * When the caller has an 8-bit string that may need unescaping.
+//    DoComplexHost calls us in this situation to do unescaping and validation.
+//    After this, it may do other IDN operations depending on the value of the
+//    |*has_non_ascii| flag.
+//
+// The return value indicates if the output is a potentially valid host name.
+template<typename INCHAR, typename OUTCHAR>
+bool DoSimpleHost(const INCHAR* host,
+                  int host_len,
+                  CanonOutputT<OUTCHAR>* output,
+                  bool* has_non_ascii) {
+  *has_non_ascii = false;
+
+  bool success = true;
+  for (int i = 0; i < host_len; ++i) {
+    unsigned int source = host[i];
+    if (source == '%') {
+      // Unescape first, if possible, into a real byte. Decoding through a
+      // cast pointer to |source| would only be correct on little-endian.
+      unsigned char unescaped = 0;
+      if (!DecodeEscaped(host, &i, host_len, &unescaped)) {
+        // Invalid escape; nothing can make this host valid. Append an
+        // escaped percent so the URL looks reasonable and mark as failed.
+        AppendEscapedChar('%', output);
+        success = false;
+        continue;
+      }
+      source = unescaped;
+    }
+
+    if (source < 0x80) {
+      // We have ASCII input, we can use our lookup table.
+      unsigned char replacement = kHostCharLookup[source];
+      if (!replacement) {
+        // Invalid character, add it as percent-escaped and mark as failed.
+        AppendEscapedChar(source, output);
+        success = false;
+      } else if (replacement == kEsc) {
+        // This character is valid but should be escaped.
+        AppendEscapedChar(source, output);
+      } else {
+        // Common case, the given character is valid in a hostname, the lookup
+        // table tells us the canonical representation of that character (lower
+        // cased).
+        output->push_back(replacement);
+      }
+    } else {
+      // It's a non-ascii char. Just push it to the output.
+      // In case where we have char16 input, and char output it's safe to
+      // cast char16->char only if input string was converted to ASCII.
+      output->push_back(static_cast<OUTCHAR>(source));
+      *has_non_ascii = true;
+    }
+  }
+
+  return success;
+}
+
+// Canonicalizes a host that requires IDN conversion, writing the result to
+// |output|. Returns true on success; on failure some representation of the
+// input is still written and false is returned.
+bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) {
+  // The host has to be escaped before the IDN step, because punycode strings
+  // cannot be escaped once they exist. The result of this pass is not checked.
+  bool has_non_ascii;
+  RawCanonOutputW<kTempHostBufferLen> escaped;
+  DoSimpleHost(src, src_len, &escaped, &has_non_ascii);
+
+  // Run the IDN conversion on the escaped text.
+  StackBufferW ascii;
+  if (IDNToASCII(escaped.data(), escaped.length(), &ascii)) {
+    // Check the ASCII output like a normal host. This also handles
+    // unescaping: although everything was unescaped before this function was
+    // called, if somebody does %00 as fullwidth, ICU will convert it to ASCII.
+    const bool success =
+        DoSimpleHost(ascii.data(), ascii.length(), output, &has_non_ascii);
+    DCHECK(!has_non_ascii);
+    return success;
+  }
+
+  // Some error, give up. This will write some reasonable looking
+  // representation of the string to the output.
+  AppendInvalidNarrowString(src, 0, src_len, output);
+  return false;
+}
+
+// 8-bit convert host to its ASCII version: this converts the UTF-8 input to
+// UTF-16 for the IDN step. The has_escaped flag should be set if the input
+// string requires unescaping. Returns false if canonicalization fails.
+bool DoComplexHost(const char* host, int host_len,
+                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
+  // Save the current position in the output. We may write stuff and rewind it
+  // below, so we need to know where to rewind to.
+  int begin_length = output->length();
+
+  // Points to the UTF-8 data we want to convert. This will either be the
+  // input or the unescaped version written to |*output| if necessary.
+  const char* utf8_source;
+  int utf8_source_len;
+  if (has_escaped) {
+    // Unescape before converting to UTF-16 for IDN. We write this into the
+    // output because it most likely does not require IDNization, and we can
+    // save another huge stack buffer. It will be replaced below if it requires
+    // IDN. This will also update our non-ASCII flag so we know whether the
+    // unescaped input requires IDN.
+    if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
+      // Error with some escape sequence. We'll call the current output
+      // complete. DoSimpleHost will have written some "reasonable" output.
+      return false;
+    }
+
+    // Unescaping may have left us with ASCII input, in which case the
+    // unescaped version we wrote to output is complete.
+    if (!has_non_ascii) {
+      return true;
+    }
+
+    // Save a pointer to the data that was just converted (it may follow other
+    // data that was already in the output buffer).
+    utf8_source = &output->data()[begin_length];
+    utf8_source_len = output->length() - begin_length;
+  } else {
+    // We don't need to unescape, use input for IDNization later. (We know the
+    // input has non-ASCII, or the simple version would have been called
+    // instead of us.)
+    utf8_source = host;
+    utf8_source_len = host_len;
+  }
+
+  // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
+  // Above, we may have used the output to write the unescaped values to, so
+  // we have to rewind it to where we started after we convert it to UTF-16.
+  StackBufferW utf16;
+  if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
+    // |utf8_source| may alias |output| here, so copy it before rewinding.
+    StackBuffer utf8;
+    for (int i = 0; i < utf8_source_len; i++)
+      utf8.push_back(utf8_source[i]);
+    output->set_length(begin_length);
+    AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
+    return false;
+  }
+  output->set_length(begin_length);
+
+  // DoIDNHost calls DoSimpleHost for the normal ASCII canonicalization; IP
+  // address detection is done afterwards by DoHost.
+  return DoIDNHost(utf16.data(), utf16.length(), output);
+}
+
+// UTF-16 convert host to its ASCII version. Input without escapes is already
+// in the form the IDN backend wants and is passed straight through; escaped
+// input takes a detour through UTF-8 so it can be unescaped first. The
+// has_escaped flag should be set if the input string requires unescaping.
+bool DoComplexHost(const char16* host, int host_len,
+                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
+  if (!has_escaped) {
+    // No unescaping necessary, we can safely pass the input to ICU. This
+    // function will only get called if we either have escaped or non-ascii
+    // input, so it's safe to just use ICU now. Even if the input is ASCII,
+    // this function will do the right thing (just slower than we could).
+    return DoIDNHost(host, host_len, output);
+  }
+
+  // Yikes, we have escaped characters with wide input. The escaped
+  // characters should be interpreted as UTF-8. To solve this problem,
+  // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
+  //
+  // We don't bother to optimize the conversion in the ASCII case (which
+  // *could* just be a copy) and use the UTF-8 path, because it should be
+  // very rare that host names have escaped characters, and it is relatively
+  // fast to do the conversion anyway.
+  StackBuffer utf8;
+  if (ConvertUTF16ToUTF8(host, host_len, &utf8)) {
+    // Once we convert to UTF-8, we can use the 8-bit version of the complex
+    // host handling code above.
+    return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
+                         has_escaped, output);
+  }
+  AppendInvalidNarrowString(host, 0, host_len, output);
+  return false;
+}
+
+// Canonicalizes the |host| range of |spec| into |output| and describes the
+// result in |*host_info|: the host family and the range that was written.
+template<typename CHAR, typename UCHAR>
+void DoHost(const CHAR* spec, const url_parse::Component& host,
+            CanonOutput* output, CanonHostInfo* host_info) {
+  if (host.len <= 0) {
+    // Empty hosts don't need anything.
+    host_info->family = CanonHostInfo::NEUTRAL;
+    host_info->out_host = url_parse::Component();
+    return;
+  }
+
+  bool has_non_ascii, has_escaped;
+  ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
+
+  // Remember where the host starts in the output so that it can be rewound.
+  const int output_begin = output->length();
+  const CHAR* const host_chars = &spec[host.begin];
+  bool success;
+  if (has_non_ascii || has_escaped) {
+    // Needs unescaping and/or IDN handling.
+    success = DoComplexHost(host_chars, host.len,
+                            has_non_ascii, has_escaped, output);
+  } else {
+    success = DoSimpleHost(host_chars, host.len, output, &has_non_ascii);
+    DCHECK(!has_non_ascii);
+  }
+
+  if (success) {
+    // After all the other canonicalization, check if we ended up with an IP
+    // address.  IP addresses are small, so writing into this temporary buffer
+    // should not cause an allocation.
+    RawCanonOutput<64> canon_ip;
+    CanonicalizeIPAddress(output->data(),
+                          url_parse::MakeRange(output_begin, output->length()),
+                          &canon_ip, host_info);
+
+    // If we got an IPv4/IPv6 address, copy the canonical form back to the
+    // real buffer.  Otherwise, it's a hostname or broken IP, in which case
+    // we just leave it in place.
+    if (host_info->IsIPAddress()) {
+      output->set_length(output_begin);
+      output->Append(canon_ip.data(), canon_ip.length());
+    }
+  } else {
+    // Canonicalization failed.  Set BROKEN to notify the caller.
+    host_info->family = CanonHostInfo::BROKEN;
+  }
+
+  host_info->out_host = url_parse::MakeRange(output_begin, output->length());
+}
+
+}  // namespace
+
+// 8-bit host canonicalizer. Stores the output range in |*out_host| and
+// returns false only when the host was classified as BROKEN.
+bool CanonicalizeHost(const char* spec, const url_parse::Component& host,
+                      CanonOutput* output, url_parse::Component* out_host) {
+  CanonHostInfo info;
+  DoHost<char, unsigned char>(spec, host, output, &info);
+  *out_host = info.out_host;
+  return info.family != CanonHostInfo::BROKEN;
+}
+
+// UTF-16 host canonicalizer. Stores the output range in |*out_host| and
+// returns false only when the host was classified as BROKEN.
+bool CanonicalizeHost(const char16* spec, const url_parse::Component& host,
+                      CanonOutput* output, url_parse::Component* out_host) {
+  CanonHostInfo info;
+  DoHost<char16, char16>(spec, host, output, &info);
+  *out_host = info.out_host;
+  return info.family != CanonHostInfo::BROKEN;
+}
+
+// 8-bit host canonicalizer that reports its details through |*host_info|.
+void CanonicalizeHostVerbose(const char* spec,
+                             const url_parse::Component& host,
+                             CanonOutput* output, CanonHostInfo* host_info) {
+  DoHost<char, unsigned char>(spec, host, output, host_info);
+}
+
+// UTF-16 host canonicalizer that reports its details through |*host_info|.
+void CanonicalizeHostVerbose(const char16* spec,
+                             const url_parse::Component& host,
+                             CanonOutput* output, CanonHostInfo* host_info) {
+  DoHost<char16, char16>(spec, host, output, host_info);
+}
+
+}  // namespace url_canon
diff --git a/googleurl/url_canon_icu.cpp b/googleurl/url_canon_icu.cpp
new file mode 100644
index 0000000..59eb96a
--- /dev/null
+++ b/googleurl/url_canon_icu.cpp
@@ -0,0 +1,213 @@
+// Copyright 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ICU integration functions.
+
+#include <stdlib.h>
+#include <string.h>
+#include <unicode/ucnv.h>
+#include <unicode/ucnv_cb.h>
+#include <unicode/uidna.h>
+
+#include "url_canon_icu.h"
+#include "url_canon_internal.h"  // for _itoa_s
+
+#include "base/logging.h"
+
+namespace url_canon {
+
+namespace {
+
+// ICU "from Unicode" callback invoked for a code point that can not be
+// represented in the destination charset. It writes the numeric character
+// reference for that code point ("&#1234;") with the non-digit characters
+// percent-escaped, i.e. "%26%231234%3B". Why? This is what Netscape did back
+// in the olden days.
+//
+// Only the UCNV_UNASSIGNED reason is handled; for every other reason nothing
+// is written and |*err| is left as it was.
+void appendURLEscapedChar(const void* context,
+                          UConverterFromUnicodeArgs* from_args,
+                          const UChar* code_units,
+                          int32_t length,
+                          UChar32 code_point,
+                          UConverterCallbackReason reason,
+                          UErrorCode* err) {
+  (void)context;
+  (void)code_units;
+  (void)length;
+  if (reason != UCNV_UNASSIGNED)
+    return;
+
+  // We supply replacement output ourselves, so clear the pending error.
+  *err = U_ZERO_ERROR;
+
+  // "&#" percent-escaped.
+  static const int kLeadLen = 6;
+  static const char kLead[kLeadLen + 1] = "%26%23";
+  ucnv_cbFromUWriteBytes(from_args, kLead, kLeadLen, 0, err);
+
+  // The code point itself, in decimal.
+  DCHECK(code_point < 0x110000);
+  char digits[8];  // Max Unicode code point is 7 digits.
+  _itoa_s(code_point, digits, 10);
+  const int digits_len = static_cast<int>(strlen(digits));
+  ucnv_cbFromUWriteBytes(from_args, digits, digits_len, 0, err);
+
+  // ";" percent-escaped.
+  static const int kTailLen = 3;
+  static const char kTail[kTailLen + 1] = "%3B";
+  ucnv_cbFromUWriteBytes(from_args, kTail, kTailLen, 0, err);
+}
+
+// A class for scoping the installation of the invalid character callback.
+// The constructor installs appendURLEscapedChar on the converter and saves
+// the callback/context that were previously set; the destructor puts those
+// saved values back.
+class AppendHandlerInstaller {
+ public:
+  // The owner of this object must ensure that the converter is alive for the
+  // duration of this object's lifetime.
+  //
+  // Marked explicit so that a bare UConverter* is never silently converted
+  // into a (callback-swapping) guard object.
+  explicit AppendHandlerInstaller(UConverter* converter)
+      : converter_(converter) {
+    UErrorCode err = U_ZERO_ERROR;
+    ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0,
+                          &old_callback_, &old_context_, &err);
+  }
+
+  // Restores the callback and context that were active before construction.
+  ~AppendHandlerInstaller() {
+    UErrorCode err = U_ZERO_ERROR;
+    ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err);
+  }
+
+ private:
+  // Not copyable: a copy would restore the saved callback a second time when
+  // it is destroyed. Declared but intentionally never defined.
+  AppendHandlerInstaller(const AppendHandlerInstaller&);
+  AppendHandlerInstaller& operator=(const AppendHandlerInstaller&);
+
+  UConverter* converter_;
+
+  // What the converter had installed before we replaced it.
+  UConverterFromUCallback old_callback_;
+  const void* old_context_;
+};
+
+}  // namespace
+
+// Stores the given ICU converter pointer for later use by ConvertFromUTF16.
+// Nothing else is done with |converter| here.
+ICUCharsetConverter::ICUCharsetConverter(UConverter* converter)
+    : converter_(converter) {
+}
+
+// Empty on purpose: the stored converter pointer is not closed or released
+// here.
+ICUCharsetConverter::~ICUCharsetConverter() {
+}
+
+// Converts |input_len| UTF-16 code units from |input| into the converter's
+// charset and appends the bytes to |output|, growing |output| when ICU
+// reports that the destination is too small.
+void ICUCharsetConverter::ConvertFromUTF16(const char16* input,
+                                           int input_len,
+                                           CanonOutput* output) {
+  // Scoped guard: swaps in our escaping callback for characters the
+  // destination charset can not represent, and swaps the old one back on
+  // exit.
+  AppendHandlerInstaller handler(converter_);
+
+  // Converted bytes go after whatever |output| already contains.
+  const int start = output->length();
+  int room = output->capacity() - start;
+  output->set_length(output->length());
+
+  for (;;) {
+    UErrorCode status = U_ZERO_ERROR;
+    char* write_ptr = &output->data()[start];
+    const int needed = ucnv_fromUChars(converter_, write_ptr, room,
+                                       input, input_len, &status);
+    if (status == U_BUFFER_OVERFLOW_ERROR) {
+      // Output didn't fit: grow to the size ICU asked for and try again.
+      room = needed;
+      output->Resize(start + room);
+      continue;
+    }
+
+    // Any other status ends the conversion; record what ICU reported.
+    output->set_length(start + needed);
+    return;
+  }
+}
+
+// Converts the Unicode input representing a hostname to ASCII using IDN rules.
+// The output must be ASCII, but is represented as wide characters.
+//
+// On success, the output will be filled with the ASCII host name and it will
+// return true. Unlike most other canonicalization functions, this assumes that
+// the output is empty. The beginning of the host will be at offset 0, and
+// the length of the output will be set to the length of the new host name.
+//
+// On error, this will return false. The output in this case is undefined.
+bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output) {
+  DCHECK(output->length() == 0);  // Output buffer is assumed empty.
+  while (true) {
+    // Use ALLOW_UNASSIGNED to be more tolerant of hostnames that violate
+    // the spec (which do exist). This does not present any risk and is a
+    // little more future proof.
+    UErrorCode err = U_ZERO_ERROR;
+    int num_converted = uidna_IDNToASCII(src, src_len, output->data(),
+                                         output->capacity(),
+                                         UIDNA_ALLOW_UNASSIGNED, NULL, &err);
+    // U_SUCCESS also accepts ICU warning codes. In particular, when the
+    // result exactly fills the buffer ICU reports
+    // U_STRING_NOT_TERMINATED_WARNING; the conversion is complete in that
+    // case and we track the length ourselves, so no terminator is needed.
+    if (U_SUCCESS(err)) {
+      output->set_length(num_converted);
+      return true;
+    }
+    if (err != U_BUFFER_OVERFLOW_ERROR)
+      return false;  // Unknown error, give up.
+
+    // Not enough room in our buffer, expand. On overflow ICU returns the
+    // length it needs, so grow to at least that much; this also guarantees
+    // progress when the current capacity is 0 (doubling 0 would loop
+    // forever).
+    int new_capacity = output->capacity() * 2;
+    if (new_capacity < num_converted)
+      new_capacity = num_converted;
+    output->Resize(new_capacity);
+  }
+}
+
+// Decodes one UTF-8 sequence from |str| starting at index |*begin| (input
+// length |length|) into |*code_point_out|. On return |*begin| indexes the
+// last byte consumed. Returns true for a valid Unicode character; otherwise
+// stores kUnicodeReplacementCharacter and returns false.
+bool ReadUTFChar(const char* str, int* begin, int length,
+                 unsigned* code_point_out) {
+  // Signed temporary: avoids a warning when U8_NEXT writes -1 to it.
+  int decoded;
+  U8_NEXT(str, *begin, length, decoded);
+  *code_point_out = static_cast<unsigned>(decoded);
+
+  // U8_NEXT advanced the index to the next char; step back so it points at
+  // the last char consumed.
+  --(*begin);
+
+  // Reject anything that is not a valid Unicode character.
+  if (!U_IS_UNICODE_CHAR(decoded)) {
+    *code_point_out = kUnicodeReplacementCharacter;
+    return false;
+  }
+  return true;
+}
+
+// Decodes one UTF-16 character from |str| starting at index |*begin| (input
+// length |length|) into |*code_point|. When a surrogate pair is consumed,
+// |*begin| is advanced to the trail unit. Returns true for a valid Unicode
+// character; otherwise stores kUnicodeReplacementCharacter and returns false.
+bool ReadUTFChar(const char16* str, int* begin, int length,
+                 unsigned* code_point) {
+  if (!U16_IS_SURROGATE(str[*begin])) {
+    // Not a surrogate, just one 16-bit word.
+    *code_point = str[*begin];
+  } else if (U16_IS_SURROGATE_LEAD(str[*begin]) && *begin + 1 < length &&
+             U16_IS_TRAIL(str[*begin + 1])) {
+    // Lead surrogate followed, within bounds, by a trail surrogate.
+    *code_point = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]);
+    (*begin)++;
+  } else {
+    // Lone trail, lead at the end of input, or lead without a trail.
+    *code_point = kUnicodeReplacementCharacter;
+    return false;
+  }
+
+  if (!U_IS_UNICODE_CHAR(*code_point)) {
+    // Decoded fine, but not a valid code point.
+    *code_point = kUnicodeReplacementCharacter;
+    return false;
+  }
+  return true;
+}
+
+}  // namespace url_canon
diff --git a/googleurl/url_canon_internal.cpp b/googleurl/url_canon_internal.cpp
new file mode 100644
index 0000000..5f3d70f
--- /dev/null
+++ b/googleurl/url_canon_internal.cpp
@@ -0,0 +1,429 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <cstdio>
+#include <errno.h>
+#include <stdlib.h>
+#include <string>
+
+#include "url_canon_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Appends |length| characters of |source| to |output|. 7-bit characters are
+// copied as-is when IsCharOfType() accepts them for |type| and escaped
+// otherwise; anything >= 0x80 is decoded with ReadUTFChar and appended via
+// AppendUTF8EscapedValue.
+template<typename CHAR, typename UCHAR>
+void DoAppendStringOfType(const CHAR* source, int length,
+                          SharedCharTypes type,
+                          CanonOutput* output) {
+  int pos = 0;
+  while (pos < length) {
+    if (static_cast<UCHAR>(source[pos]) < 0x80) {
+      // Plain 7-bit character: copy it, escaping if the type disallows it.
+      const unsigned char ascii = static_cast<unsigned char>(source[pos]);
+      if (IsCharOfType(ascii, type))
+        output->push_back(ascii);
+      else
+        AppendEscapedChar(ascii, output);
+    } else {
+      // ReadUTFChar fills the code point with kUnicodeReplacementCharacter
+      // when the input is invalid, which is what we want. It also moves
+      // |pos| to the last unit it consumed.
+      unsigned code_point;
+      ReadUTFChar(source, &pos, length, &code_point);
+      AppendUTF8EscapedValue(code_point, output);
+    }
+    pos++;
+  }
+}
+
+// Appends spec[begin, end) to |output| for the error-handling path. This
+// function assumes the input values are all contained in 8-bit, although it
+// allows any type. Nothing is returned; all results go to |output|.
+template<typename CHAR, typename UCHAR>
+void DoAppendInvalidNarrowString(const CHAR* spec, int begin, int end,
+                                 CanonOutput* output) {
+  for (int pos = begin; pos < end; pos++) {
+    const UCHAR ch = static_cast<UCHAR>(spec[pos]);
+    if (ch >= 0x80) {
+      // Non-ASCII: let the UTF-8/16 helper decode and append it. It also
+      // advances |pos| and is expected to cope with invalid sequences.
+      AppendUTF8EscapedChar(spec, &pos, end, output);
+      continue;
+    }
+
+    // This function is for error handling, so we escape all control
+    // characters and spaces, but not anything else since we lack
+    // context to do something more specific.
+    const bool needs_escape = (ch <= ' ' || ch == 0x7f);
+    if (needs_escape)
+      AppendEscapedChar(static_cast<unsigned char>(ch), output);
+    else
+      output->push_back(static_cast<char>(ch));
+  }
+}
+
+// Overrides one component, see the url_canon::Replacements structure for
+// what the various combinations of source pointer and component mean.
+// A null |override_source| means "no override": |*dest| and
+// |*dest_component| are left untouched.
+void DoOverrideComponent(const char* override_source,
+                         const url_parse::Component& override_component,
+                         const char** dest,
+                         url_parse::Component* dest_component) {
+  if (!override_source)
+    return;
+
+  *dest = override_source;
+  *dest_component = override_component;
+}
+
+// Similar to DoOverrideComponent except that it takes a UTF-16 input and does
+// not actually set the output character pointer.
+//
+// The input is converted to UTF-8 at the end of the given buffer as a temporary
+// holding place. The component identifying the portion of the buffer used in
+// the |utf8_buffer| will be specified in |*dest_component|.
+//
+// This will not actually set any |dest| pointer like DoOverrideComponent
+// does because all of the pointers will point into the |utf8_buffer|, which
+// may get resized while we're overriding a subsequent component. Instead, the
+// caller should use the beginning of the |utf8_buffer| as the string pointer
+// for all components once all overrides have been prepared.
+//
+// Returns the result of the UTF-16 to UTF-8 conversion, or true when no
+// conversion was needed.
+bool PrepareUTF16OverrideComponent(
+    const char16* override_source,
+    const url_parse::Component& override_component,
+    CanonOutput* utf8_buffer,
+    url_parse::Component* dest_component) {
+  // No override requested for this component.
+  if (!override_source)
+    return true;
+
+  if (!override_component.is_valid()) {
+    // Non-"valid" component (means delete), so we need to preserve that.
+    *dest_component = url_parse::Component();
+    return true;
+  }
+
+  // Convert to UTF-8, appending to the buffer; the component covers exactly
+  // the bytes this conversion added.
+  dest_component->begin = utf8_buffer->length();
+  const bool converted =
+      ConvertUTF16ToUTF8(&override_source[override_component.begin],
+                         override_component.len, utf8_buffer);
+  dest_component->len = utf8_buffer->length() - dest_component->begin;
+  return converted;
+}
+
+}  // namespace
+
+// See the header file for this array's declaration.
+//
+// One entry per 8-bit character value (0x00 - 0xff). Each entry is the
+// bitwise OR of the CHAR_* flags that apply to that character; an entry of 0
+// means the character carries none of the flags. All control characters,
+// the space, 0x7f and every value >= 0x80 are 0.
+// NOTE(review): presumably this is the table IsCharOfType() consults -
+// confirm against url_canon_internal.h.
+const unsigned char kSharedCharTypeTable[0x100] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00 - 0x0f
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10 - 0x1f
+    0,                           // 0x20  ' ' (escape spaces in queries)
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x21  !
+    0,                           // 0x22  "
+    0,                           // 0x23  #  (invalid in query since it marks the ref)
+    CHAR_QUERY | CHAR_USERINFO,  // 0x24  $
+    CHAR_QUERY | CHAR_USERINFO,  // 0x25  %
+    CHAR_QUERY | CHAR_USERINFO,  // 0x26  &
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x27  '
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x28  (
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x29  )
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x2a  *
+    CHAR_QUERY | CHAR_USERINFO,  // 0x2b  +
+    CHAR_QUERY | CHAR_USERINFO,  // 0x2c  ,
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x2d  -
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT,  // 0x2e  .
+    CHAR_QUERY,                  // 0x2f  /
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x30  0
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x31  1
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x32  2
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x33  3
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x34  4
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x35  5
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x36  6
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x37  7
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_COMPONENT,             // 0x38  8
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_COMPONENT,             // 0x39  9
+    CHAR_QUERY,  // 0x3a  :
+    CHAR_QUERY,  // 0x3b  ;
+    0,           // 0x3c  <  (Try to prevent certain types of XSS.)
+    CHAR_QUERY,  // 0x3d  =
+    0,           // 0x3e  >  (Try to prevent certain types of XSS.)
+    CHAR_QUERY,  // 0x3f  ?
+    CHAR_QUERY,  // 0x40  @
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x41  A
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x42  B
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x43  C
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x44  D
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x45  E
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x46  F
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x47  G
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x48  H
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x49  I
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x4a  J
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x4b  K
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x4c  L
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x4d  M
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x4e  N
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x4f  O
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x50  P
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x51  Q
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x52  R
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x53  S
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x54  T
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x55  U
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x56  V
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x57  W
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x58  X
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x59  Y
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x5a  Z
+    CHAR_QUERY,  // 0x5b  [
+    CHAR_QUERY,  // 0x5c  '\'
+    CHAR_QUERY,  // 0x5d  ]
+    CHAR_QUERY,  // 0x5e  ^
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x5f  _
+    CHAR_QUERY,  // 0x60  `
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x61  a
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x62  b
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x63  c
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x64  d
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x65  e
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x66  f
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x67  g
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x68  h
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x69  i
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x6a  j
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x6b  k
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x6c  l
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x6d  m
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x6e  n
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x6f  o
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x70  p
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x71  q
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x72  r
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x73  s
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x74  t
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x75  u
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x76  v
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x77  w
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT,  // 0x78  x
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x79  y
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x7a  z
+    CHAR_QUERY,  // 0x7b  {
+    CHAR_QUERY,  // 0x7c  |
+    CHAR_QUERY,  // 0x7d  }
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x7e  ~
+    0,           // 0x7f
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80 - 0x8f
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90 - 0x9f
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xa0 - 0xaf
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xb0 - 0xbf
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xc0 - 0xcf
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xd0 - 0xdf
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xe0 - 0xef
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xf0 - 0xff
+};
+
+// Maps a 4-bit value (0 - 15) to its hexadecimal digit character. Digits
+// above 9 are upper case ('A' - 'F').
+const char kHexCharLookup[0x10] = {
+    '0', '1', '2', '3', '4', '5', '6', '7',
+    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
+};
+
+// One entry per 32-character band of the 8-bit range (see the range noted on
+// each entry). For the three bands that contain hex digits, the entry is the
+// amount to subtract from a digit character in that band to get its numeric
+// value (e.g. 'B' - ('A' - 10) == 11); the other bands are 0.
+// NOTE(review): presumably indexed by (character / 0x20) by a helper in the
+// header, which must first check that the character is a hex digit - confirm.
+const char kCharToHexLookup[8] = {
+    0,         // 0x00 - 0x1f
+    '0',       // 0x20 - 0x3f: digits 0 - 9 are 0x30 - 0x39
+    'A' - 10,  // 0x40 - 0x5f: letters A - F are 0x41 - 0x46
+    'a' - 10,  // 0x60 - 0x7f: letters a - f are 0x61 - 0x66
+    0,         // 0x80 - 0x9F
+    0,         // 0xA0 - 0xBF
+    0,         // 0xC0 - 0xDF
+    0,         // 0xE0 - 0xFF
+};
+
+// U+FFFD. The ReadUTFChar overloads store this as the code point whenever
+// they reject their input.
+const char16 kUnicodeReplacementCharacter = 0xfffd;
+
+// 8-bit entry point for DoAppendStringOfType: appends |length| characters of
+// |source| to |output|, escaping 7-bit characters that are not of |type| and
+// writing everything >= 0x80 in escaped UTF-8 form.
+void AppendStringOfType(const char* source, int length,
+                        SharedCharTypes type,
+                        CanonOutput* output) {
+  DoAppendStringOfType<char, unsigned char>(source, length, type, output);
+}
+
+// UTF-16 entry point for DoAppendStringOfType: appends |length| characters of
+// |source| to |output|, escaping 7-bit characters that are not of |type| and
+// writing everything >= 0x80 in escaped UTF-8 form.
+void AppendStringOfType(const char16* source, int length,
+                        SharedCharTypes type,
+                        CanonOutput* output) {
+  DoAppendStringOfType<char16, char16>(source, length, type, output);
+}
+
+// 8-bit entry point for DoAppendInvalidNarrowString: appends spec[begin, end)
+// to |output|, escaping control characters, spaces and 0x7f and routing
+// values >= 0x80 through AppendUTF8EscapedChar.
+void AppendInvalidNarrowString(const char* spec, int begin, int end,
+                               CanonOutput* output) {
+  DoAppendInvalidNarrowString<char, unsigned char>(spec, begin, end, output);
+}
+
+// UTF-16 entry point for DoAppendInvalidNarrowString: appends
+// spec[begin, end) to |output|, escaping control characters, spaces and 0x7f
+// and routing values >= 0x80 through AppendUTF8EscapedChar.
+void AppendInvalidNarrowString(const char16* spec, int begin, int end,
+                               CanonOutput* output) {
+  DoAppendInvalidNarrowString<char16, char16>(spec, begin, end, output);
+}
+
+// Appends the UTF-8 form of the |input_len| UTF-16 code units in |input| to
+// |output|. Every character is appended (ReadUTFChar substitutes the
+// replacement character for bad input); the return value is false if any
+// character was rejected by ReadUTFChar.
+bool ConvertUTF16ToUTF8(const char16* input, int input_len,
+                        CanonOutput* output) {
+  bool all_valid = true;
+  // ReadUTFChar moves |pos| to the last unit it consumed, so the loop
+  // increment steps to the start of the next character.
+  for (int pos = 0; pos < input_len; ++pos) {
+    unsigned code_point;
+    if (!ReadUTFChar(input, &pos, input_len, &code_point))
+      all_valid = false;
+    AppendUTF8Value(code_point, output);
+  }
+  return all_valid;
+}
+
+// Appends the UTF-16 form of the |input_len| UTF-8 bytes in |input| to
+// |output|. Every character is appended (ReadUTFChar substitutes the
+// replacement character for bad input); the return value is false if any
+// character was rejected by ReadUTFChar.
+bool ConvertUTF8ToUTF16(const char* input, int input_len,
+                        CanonOutputT<char16>* output) {
+  bool all_valid = true;
+  // ReadUTFChar moves |pos| to the last byte it consumed, so the loop
+  // increment steps to the start of the next character.
+  for (int pos = 0; pos < input_len; ++pos) {
+    unsigned code_point;
+    if (!ReadUTFChar(input, &pos, input_len, &code_point))
+      all_valid = false;
+    AppendUTF16Value(code_point, output);
+  }
+  return all_valid;
+}
+
+// Applies the 8-bit replacements in |repl| on top of |source| / |parsed|:
+// every component that has a non-null replacement string gets its source
+// pointer and parsed range overwritten. |base| is unused.
+void SetupOverrideComponents(const char* base,
+                             const Replacements<char>& repl,
+                             URLComponentSource<char>* source,
+                             url_parse::Parsed* parsed) {
+  (void)base;
+
+  // String pointers and component ranges supplied by the replacements.
+  const URLComponentSource<char>& new_strings = repl.sources();
+  const url_parse::Parsed& new_ranges = repl.components();
+
+  DoOverrideComponent(new_strings.scheme, new_ranges.scheme,
+                      &source->scheme, &parsed->scheme);
+  DoOverrideComponent(new_strings.username, new_ranges.username,
+                      &source->username, &parsed->username);
+  DoOverrideComponent(new_strings.password, new_ranges.password,
+                      &source->password, &parsed->password);
+
+  // Our host should be empty if not present, so after the override turn a
+  // "not present" length (-1) into an empty one.
+  DoOverrideComponent(new_strings.host, new_ranges.host,
+                      &source->host, &parsed->host);
+  if (parsed->host.len == -1) {
+    parsed->host.len = 0;
+  }
+
+  DoOverrideComponent(new_strings.port, new_ranges.port,
+                      &source->port, &parsed->port);
+  DoOverrideComponent(new_strings.path, new_ranges.path,
+                      &source->path, &parsed->path);
+  DoOverrideComponent(new_strings.query, new_ranges.query,
+                      &source->query, &parsed->query);
+  DoOverrideComponent(new_strings.ref, new_ranges.ref,
+                      &source->ref, &parsed->ref);
+}
+
+// UTF-16 counterpart of SetupOverrideComponents. Each replacement string is
+// converted to UTF-8 and appended to |utf8_buffer|; |parsed| receives the
+// ranges inside that buffer and |source| is pointed at the buffer's data for
+// every replaced component. Returns false if any conversion reported invalid
+// input. |base| is unused.
+bool SetupUTF16OverrideComponents(const char* base,
+                                  const Replacements<char16>& repl,
+                                  CanonOutput* utf8_buffer,
+                                  URLComponentSource<char>* source,
+                                  url_parse::Parsed* parsed) {
+  (void)base;
+
+  bool all_converted = true;
+
+  // String pointers and component ranges supplied by the replacements.
+  const URLComponentSource<char16>& new_strings = repl.sources();
+  const url_parse::Parsed& new_ranges = repl.components();
+
+  // Phase 1: convert each replacement, in component order, into the shared
+  // UTF-8 buffer. Every component is processed even after a failure.
+  if (!PrepareUTF16OverrideComponent(new_strings.scheme, new_ranges.scheme,
+                                     utf8_buffer, &parsed->scheme))
+    all_converted = false;
+  if (!PrepareUTF16OverrideComponent(new_strings.username,
+                                     new_ranges.username,
+                                     utf8_buffer, &parsed->username))
+    all_converted = false;
+  if (!PrepareUTF16OverrideComponent(new_strings.password,
+                                     new_ranges.password,
+                                     utf8_buffer, &parsed->password))
+    all_converted = false;
+  if (!PrepareUTF16OverrideComponent(new_strings.host, new_ranges.host,
+                                     utf8_buffer, &parsed->host))
+    all_converted = false;
+  if (!PrepareUTF16OverrideComponent(new_strings.port, new_ranges.port,
+                                     utf8_buffer, &parsed->port))
+    all_converted = false;
+  if (!PrepareUTF16OverrideComponent(new_strings.path, new_ranges.path,
+                                     utf8_buffer, &parsed->path))
+    all_converted = false;
+  if (!PrepareUTF16OverrideComponent(new_strings.query, new_ranges.query,
+                                     utf8_buffer, &parsed->query))
+    all_converted = false;
+  if (!PrepareUTF16OverrideComponent(new_strings.ref, new_ranges.ref,
+                                     utf8_buffer, &parsed->ref))
+    all_converted = false;
+
+  // Phase 2: PrepareUTF16OverrideComponent will not have set the data pointer
+  // since the buffer could be resized, invalidating the pointers. Now that
+  // the buffer is finalized, point every replaced component at it.
+  if (new_strings.scheme)
+    source->scheme = utf8_buffer->data();
+  if (new_strings.username)
+    source->username = utf8_buffer->data();
+  if (new_strings.password)
+    source->password = utf8_buffer->data();
+  if (new_strings.host)
+    source->host = utf8_buffer->data();
+  if (new_strings.port)
+    source->port = utf8_buffer->data();
+  if (new_strings.path)
+    source->path = utf8_buffer->data();
+  if (new_strings.query)
+    source->query = utf8_buffer->data();
+  if (new_strings.ref)
+    source->ref = utf8_buffer->data();
+
+  return all_converted;
+}
+
+#ifndef WIN32
+
+// Replacement for the Windows-only _itoa_s on other platforms. Formats
+// |value| into |buffer| (room for |size_in_chars| chars including the
+// terminating null) in base 10 or 16. Returns 0 on success, or EINVAL when
+// the radix is unsupported, formatting failed, or the text did not fit.
+int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix) {
+  int written;
+  switch (radix) {
+    case 10:
+      written = snprintf(buffer, size_in_chars, "%d", value);
+      break;
+    case 16:
+      // "%x" expects an unsigned argument; convert explicitly so negative
+      // values are formatted as their two's complement bit pattern instead
+      // of relying on a mismatched argument type.
+      written = snprintf(buffer, size_in_chars, "%x",
+                         static_cast<unsigned>(value));
+      break;
+    default:
+      return EINVAL;
+  }
+
+  if (written < 0 || static_cast<size_t>(written) >= size_in_chars) {
+    // snprintf reported an error, or the output was truncated.
+    return EINVAL;
+  }
+  return 0;
+}
+
+// Replacement for the Windows-only _itow_s on other platforms. Writes the
+// decimal text of |value| to |buffer| as 16-bit characters, null terminated.
+// Only radix 10 is supported. Returns 0 on success and EINVAL for any other
+// radix or when the text plus terminator does not fit in |size_in_chars|.
+int _itow_s(int value, char16* buffer, size_t size_in_chars, int radix) {
+  if (radix != 10)
+    return EINVAL;
+
+  // Format into a narrow scratch buffer first. No more than 12 characters
+  // will be required for a 32-bit integer, plus one for the terminating null.
+  char narrow[13];
+  const int digits = snprintf(narrow, sizeof(narrow), "%d", value);
+
+  // A negative |digits| becomes a huge size_t, so this single comparison
+  // rejects both truncation and formatting errors.
+  if (static_cast<size_t>(digits) >= size_in_chars)
+    return EINVAL;
+
+  // Widen each character into the caller's buffer.
+  int pos = 0;
+  while (pos < digits) {
+    buffer[pos] = static_cast<char16>(narrow[pos]);
+    ++pos;
+  }
+  buffer[digits] = '\0';
+  return 0;
+}
+
+#endif  // !WIN32
+
+}  // namespace url_canon
diff --git a/googleurl/url_canon_ip.cpp b/googleurl/url_canon_ip.cpp
new file mode 100644
index 0000000..a149242
--- /dev/null
+++ b/googleurl/url_canon_ip.cpp
@@ -0,0 +1,748 @@
+// Copyright 2009, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "url_canon_ip.h"
+
+#include <stdlib.h>
+
+#include "base/basictypes.h"
+#include "base/logging.h"
+#include "url_canon_internal.h"
+
+#ifndef _WIN32
+#include <inttypes.h>
+#endif
+
+namespace url_canon {
+
+namespace {
+
+// Converts one of the character types that represent a numerical base to the
+// corresponding base. Returns 0 for types that do not denote a base.
+int BaseForType(SharedCharTypes type) {
+  switch (type) {
+    case CHAR_HEX:
+      return 16;
+    case CHAR_DEC:
+      return 10;
+    case CHAR_OCT:
+      return 8;
+    case CHAR_QUERY:
+    case CHAR_USERINFO:
+    case CHAR_IPV4:
+    case CHAR_COMPONENT:
+    default:
+      return 0;
+  }
+}
+
+template<typename CHAR, typename UCHAR>
+bool DoFindIPv4Components(const CHAR* spec,
+                          const url_parse::Component& host,
+                          url_parse::Component components[4]) {
+  if (!host.is_nonempty())
+    return false;
+
+  int cur_component = 0;  // Index of the component we're working on.
+  int cur_component_begin = host.begin;  // Start of the current component.
+  int end = host.end();
+  for (int i = host.begin; /* exits via break or return */; i++) {
+    if (i >= end || spec[i] == '.') {
+      // Found the end of the current component.
+      int component_len = i - cur_component_begin;
+      components[cur_component] =
+          url_parse::Component(cur_component_begin, component_len);
+
+      // The next component starts after the dot.
+      cur_component_begin = i + 1;
+      cur_component++;
+
+      // Don't allow empty components (two dots in a row), except we may
+      // allow an empty component at the end (this would indicate that the
+      // input ends in a dot). We also want to error if the component is
+      // empty and it's the only component (cur_component == 1).
+      if (component_len == 0 && (i < end || cur_component == 1))
+        return false;
+
+      if (i >= end)
+        break;  // End of the input.
+
+      if (cur_component == 4) {
+        // Anything else after the 4th component is an error unless it is a
+        // dot that would otherwise be treated as the end of input.
+        if (spec[i] == '.' && i + 1 == end)
+          break;
+        return false;
+      }
+    } else if (static_cast<UCHAR>(spec[i]) >= 0x80 ||
+               !IsIPv4Char(static_cast<unsigned char>(spec[i]))) {
+      // Non-ASCII, or otherwise not a valid character for an IPv4 address.
+      return false;
+    }
+  }
+
+  // Fill in any unused components.
+  while (cur_component < 4)
+    components[cur_component++] = url_parse::Component();
+  return true;
+}
+
+// Converts an IPv4 component to a 32-bit number, while checking for overflow.
+//
+// Possible return values:
+// - IPV4    - The number was valid, and did not overflow.
+// - BROKEN  - The input was numeric, but too large for a 32-bit field.
+// - NEUTRAL - Input was not numeric.
+//
+// The input is assumed to be ASCII. FindIPv4Components should have stripped
+// out any input that is greater than 7 bits. The components are assumed
+// to be non-empty.
+template<typename CHAR>
+CanonHostInfo::Family IPv4ComponentToNumber(
+    const CHAR* spec,
+    const url_parse::Component& component,
+    uint32* number) {
+  // Figure out the base
+  SharedCharTypes base;
+  int base_prefix_len = 0;  // Size of the prefix for this base.
+  if (spec[component.begin] == '0') {
+    // Either hex ("0x"/"0X") or octal, or a standalone zero.
+    if (component.len == 1) {
+      base = CHAR_DEC;
+    } else if (spec[component.begin + 1] == 'X' ||
+               spec[component.begin + 1] == 'x') {
+      base = CHAR_HEX;
+      base_prefix_len = 2;
+    } else {
+      base = CHAR_OCT;
+      base_prefix_len = 1;
+    }
+  } else {
+    base = CHAR_DEC;
+  }
+
+  // Extend the prefix to consume all leading zeros.
+  while (base_prefix_len < component.len &&
+         spec[component.begin + base_prefix_len] == '0')
+    base_prefix_len++;
+
+  // Put the component, minus any base prefix, into a NULL-terminated buffer so
+  // we can call the standard library.  Because leading zeros have already been
+  // discarded, filling the entire buffer is guaranteed to trigger the 32-bit
+  // overflow check.
+  const int kMaxComponentLen = 16;
+  char buf[kMaxComponentLen + 1];  // digits + '\0'
+  int dest_i = 0;
+  for (int i = component.begin + base_prefix_len; i < component.end(); i++) {
+    // We know the input is 7-bit, so convert to narrow (if this is the wide
+    // version of the template) by casting.
+    char input = static_cast<char>(spec[i]);
+
+    // Validate that this character is OK for the given base.
+    if (!IsCharOfType(input, base))
+      return CanonHostInfo::NEUTRAL;
+
+    // Fill the buffer, if there's space remaining.  This check allows us to
+    // verify that all characters are numeric, even those that don't fit.
+    if (dest_i < kMaxComponentLen)
+      buf[dest_i++] = input;
+  }
+
+  buf[dest_i] = '\0';
+
+  // Use the 64-bit strtoi so we get a big number (no hex, decimal, or octal
+  // number can overflow a 64-bit number in <= 16 characters).
+#ifdef WIN32
+  uint64 num = _strtoui64(buf, NULL, BaseForType(base));
+#else 
+  uint64_t num = strtoull(buf, NULL, BaseForType(base));
+#endif
+
+  // Check for 32-bit overflow.
+  if (num > kuint32max)
+    return CanonHostInfo::BROKEN;
+
+  // No overflow.  Success!
+  *number = static_cast<uint32>(num);
+  return CanonHostInfo::IPV4;
+}
+
+// Writes the given address (with each byte representing one dotted part of
+// an IPv4 address) to the output in decimal, and updates |*out_host| to
+// identify the added portion.
+void AppendIPv4Address(const unsigned char address[4],
+                       CanonOutput* output,
+                       url_parse::Component* out_host) {
+  out_host->begin = output->length();
+  for (int i = 0; i < 4; i++) {
+    char str[16];
+    _itoa_s(address[i], str, 10);
+
+    for (int ch = 0; str[ch] != 0; ch++)
+      output->push_back(str[ch]);
+
+    if (i != 3)
+      output->push_back('.');
+  }
+  out_host->len = output->length() - out_host->begin;
+}
+
+// See declaration of IPv4AddressToNumber for documentation.
+template<typename CHAR>
+CanonHostInfo::Family DoIPv4AddressToNumber(const CHAR* spec,
+                                            const url_parse::Component& host,
+                                            unsigned char address[4],
+                                            int* num_ipv4_components) {
+  // The identified components. Not all may exist.
+  url_parse::Component components[4];
+  if (!FindIPv4Components(spec, host, components))
+    return CanonHostInfo::NEUTRAL;
+
+  // Convert existing components to numbers. Values up to
+  // |existing_components| will be valid.
+  uint32 component_values[4];
+  int existing_components = 0;
+
+  // Set to true if one or more components are BROKEN.  BROKEN is only
+  // returned if all components are IPV4 or BROKEN, so, for example,
+  // 12345678912345.de returns NEUTRAL rather than broken.
+  bool broken = false;
+  for (int i = 0; i < 4; i++) {
+    if (components[i].len <= 0)
+      continue;
+    CanonHostInfo::Family family = IPv4ComponentToNumber(
+        spec, components[i], &component_values[existing_components]);
+
+    if (family == CanonHostInfo::BROKEN) {
+      broken = true;
+    } else if (family != CanonHostInfo::IPV4) {
+      // Stop if we hit a non-BROKEN invalid non-empty component.
+      return family;
+    }
+
+    existing_components++;
+  }
+
+  if (broken)
+    return CanonHostInfo::BROKEN;
+
+  // Use that sequence of numbers to fill out the 4-component IP address.
+
+  // First, process all components but the last, while making sure each fits
+  // within an 8-bit field.
+  for (int i = 0; i < existing_components - 1; i++) {
+    if (component_values[i] > kuint8max)
+      return CanonHostInfo::BROKEN;
+    address[i] = static_cast<unsigned char>(component_values[i]);
+  }
+
+  // Next, consume the last component to fill in the remaining bytes.
+  uint32 last_value = component_values[existing_components - 1];
+  for (int i = 3; i >= existing_components - 1; i--) {
+    address[i] = static_cast<unsigned char>(last_value);
+    last_value >>= 8;
+  }
+
+  // If the last component has residual bits, report overflow.
+  if (last_value != 0)
+    return CanonHostInfo::BROKEN;
+
+  // Tell the caller how many components we saw (only written on success).
+  *num_ipv4_components = existing_components;
+
+  // Success!
+  return CanonHostInfo::IPV4;
+}
+
+// Return true if we've made a final IPV4/BROKEN decision, false if the result
+// is NEUTRAL, and we could use a second opinion.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeIPv4Address(const CHAR* spec,
+                               const url_parse::Component& host,
+                               CanonOutput* output,
+                               CanonHostInfo* host_info) {
+  host_info->family = IPv4AddressToNumber(
+      spec, host, host_info->address, &host_info->num_ipv4_components);
+
+  switch (host_info->family) {
+    case CanonHostInfo::IPV4:
+      // Definitely an IPv4 address.
+      AppendIPv4Address(host_info->address, output, &host_info->out_host);
+      return true;
+    case CanonHostInfo::BROKEN:
+      // Definitely broken; nothing is appended to |output| here.
+      return true;
+    case CanonHostInfo::NEUTRAL:
+    case CanonHostInfo::IPV6:
+    default:
+      // Could be IPv6 or a hostname.
+      return false;
+  }
+}
+
+// Helper class that describes the main components of an IPv6 input string.
+// See the following examples to understand how it breaks up an input string:
+//
+// [Example 1]: input = "[::aa:bb]"
+//  ==> num_hex_components = 2
+//  ==> hex_components[0] = Component(3,2) "aa"
+//  ==> hex_components[1] = Component(6,2) "bb"
+//  ==> index_of_contraction = 0
+//  ==> ipv4_component = Component(0, -1)
+//
+// [Example 2]: input = "[1:2::3:4:5]"
+//  ==> num_hex_components = 5
+//  ==> hex_components[0] = Component(1,1) "1"
+//  ==> hex_components[1] = Component(3,1) "2"
+//  ==> hex_components[2] = Component(6,1) "3"
+//  ==> hex_components[3] = Component(8,1) "4"
+//  ==> hex_components[4] = Component(10,1) "5"
+//  ==> index_of_contraction = 2
+//  ==> ipv4_component = Component(0, -1)
+//
+// [Example 3]: input = "[::ffff:192.168.0.1]"
+//  ==> num_hex_components = 1
+//  ==> hex_components[0] = Component(3,4) "ffff"
+//  ==> index_of_contraction = 0
+//  ==> ipv4_component = Component(8, 11) "192.168.0.1"
+//
+// [Example 4]: input = "[1::]"
+//  ==> num_hex_components = 1
+//  ==> hex_components[0] = Component(1,1) "1"
+//  ==> index_of_contraction = 1
+//  ==> ipv4_component = Component(0, -1)
+//
+// [Example 5]: input = "[::192.168.0.1]"
+//  ==> num_hex_components = 0
+//  ==> index_of_contraction = 0
+//  ==> ipv4_component = Component(3, 11) "192.168.0.1"
+//
+struct IPv6Parsed {
+  // Reset the parse information (no components, no contraction, no IPv4).
+  void reset() {
+    num_hex_components = 0;
+    index_of_contraction = -1;
+    ipv4_component.reset();
+  }
+
+  // There can be up to 8 hex components (colon separated) in the literal.
+  url_parse::Component hex_components[8];
+
+  // The count of hex components present. Ranges from [0,8].
+  int num_hex_components;
+
+  // The index of the hex component that the "::" contraction precedes, or
+  // -1 if there is no contraction.
+  int index_of_contraction;
+
+  // The range of characters which are an IPv4 literal.
+  url_parse::Component ipv4_component;
+};
+
+// Parse the IPv6 input string. If parsing succeeded returns true and fills
+// |parsed| with the information. If parsing failed (because the input is
+// invalid) returns false.
+template<typename CHAR, typename UCHAR>
+bool DoParseIPv6(const CHAR* spec,
+                 const url_parse::Component& host,
+                 IPv6Parsed* parsed) {
+  // Reset the info.
+  parsed->reset();
+
+  if (!host.is_nonempty())
+    return false;
+
+  // The index for start and end of address range (no brackets).
+  int begin = host.begin;
+  int end = host.end();
+
+  int cur_component_begin = begin;  // Start of the current component.
+
+  // Scan through the input, searching for hex components, "::" contractions,
+  // and IPv4 components.
+  for (int i = begin; /* i <= end */; i++) {
+    bool is_colon = spec[i] == ':';  // Note: also evaluated when i == end.
+    bool is_contraction = is_colon && i < end - 1 && spec[i + 1] == ':';
+
+    // We reached the end of the current component if we encounter a colon
+    // (separator between hex components, or start of a contraction), or end of
+    // input.
+    if (is_colon || i == end) {
+      int component_len = i - cur_component_begin;
+
+      // A component should not have more than 4 hex digits.
+      if (component_len > 4)
+        return false;
+
+      // Don't allow empty components.
+      if (component_len == 0) {
+        // The exception is when contractions appear at beginning of the
+        // input or at the end of the input.
+        if (!((is_contraction && i == begin) || (i == end &&
+            parsed->index_of_contraction == parsed->num_hex_components)))
+          return false;
+      }
+
+      // Add the hex component we just found to running list.
+      if (component_len > 0) {
+        // Can't have more than 8 components!
+        if (parsed->num_hex_components >= 8)
+          return false;
+
+        parsed->hex_components[parsed->num_hex_components++] =
+            url_parse::Component(cur_component_begin, component_len);
+      }
+    }
+
+    if (i == end)
+      break;  // Reached the end of the input, DONE.
+
+    // We found a "::" contraction.
+    if (is_contraction) {
+      // There can be at most one contraction in the literal.
+      if (parsed->index_of_contraction != -1)
+        return false;
+      parsed->index_of_contraction = parsed->num_hex_components;
+      ++i;  // Consume the colon we peeked.
+    }
+
+    if (is_colon) {
+      // Colons are separators between components, keep track of where the
+      // current component started (after this colon).
+      cur_component_begin = i + 1;
+    } else {
+      if (static_cast<UCHAR>(spec[i]) >= 0x80)
+        return false;  // Not ASCII.
+
+      if (!IsHexChar(static_cast<unsigned char>(spec[i]))) {
+        // Regular components are hex numbers. It is also possible for
+        // a component to be an IPv4 address in dotted form.
+        if (IsIPv4Char(static_cast<unsigned char>(spec[i]))) {
+          // Since IPv4 address can only appear at the end, assume the rest
+          // of the string is an IPv4 address. (We will parse this separately
+          // later).
+          parsed->ipv4_component = url_parse::Component(
+              cur_component_begin, end - cur_component_begin);
+          break;
+        } else {
+          // The character was neither a hex digit, nor an IPv4 character.
+          return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+// Verifies the parsed IPv6 information, checking that the various components
+// add up to the right number of bits (hex components are 16 bits, while
+// embedded IPv4 formats are 32 bits, and contractions are placeholders for
+// 16 or more bits). Returns true if sizes match up, false otherwise. On
+// success writes the length of the contraction (if any) to
+// |out_num_bytes_of_contraction|.
+bool CheckIPv6ComponentsSize(const IPv6Parsed& parsed,
+                             int* out_num_bytes_of_contraction) {
+  // Each group of four hex digits contributes 16 bits.
+  int num_bytes_without_contraction = parsed.num_hex_components * 2;
+
+  // If an IPv4 address was embedded at the end, it contributes 32 bits.
+  if (parsed.ipv4_component.is_valid())
+    num_bytes_without_contraction += 4;
+
+  // If there was a "::" contraction, its size is going to be:
+  // MAX([16bits], [128bits] - num_bytes_without_contraction).
+  int num_bytes_of_contraction = 0;
+  if (parsed.index_of_contraction != -1) {
+    num_bytes_of_contraction = 16 - num_bytes_without_contraction;
+    if (num_bytes_of_contraction < 2)
+      num_bytes_of_contraction = 2;
+  }
+
+  // Check that the numbers add up.
+  if (num_bytes_without_contraction + num_bytes_of_contraction != 16)
+    return false;
+
+  *out_num_bytes_of_contraction = num_bytes_of_contraction;
+  return true;
+}
+
+// Converts a hex component into a number. This cannot fail since the caller
+// has already verified that each character in the string was a hex digit, and
+// that there were no more than 4 characters.
+template<typename CHAR>
+uint16 IPv6HexComponentToNumber(const CHAR* spec,
+                                const url_parse::Component& component) {
+  DCHECK(component.len <= 4);
+
+  // Copy the hex string into a C-string.
+  char buf[5];
+  for (int i = 0; i < component.len; ++i)
+    buf[i] = static_cast<char>(spec[component.begin + i]);
+  buf[component.len] = '\0';
+
+  // Convert it to a number (overflow is not possible, since with 4 hex
+  // characters we can at most have a 16 bit number).
+  return static_cast<uint16>(_strtoui64(buf, NULL, 16));
+}
+
+// Converts an IPv6 address to a 128-bit number (network byte order), returning
+// true on success. False means that the input was not a valid IPv6 address.
+template<typename CHAR, typename UCHAR>
+bool DoIPv6AddressToNumber(const CHAR* spec,
+                           const url_parse::Component& host,
+                           unsigned char address[16]) {
+  // Make sure the component is bounded by '[' and ']'.
+  int end = host.end();
+  if (!host.is_nonempty() || spec[host.begin] != '[' || spec[end - 1] != ']')
+    return false;
+
+  // Exclude the square brackets.
+  url_parse::Component ipv6_comp(host.begin + 1, host.len - 2);
+
+  // Parse the IPv6 address -- identify where all the colon separated hex
+  // components are, the "::" contraction, and the embedded IPv4 address.
+  IPv6Parsed ipv6_parsed;
+  if (!DoParseIPv6<CHAR, UCHAR>(spec, ipv6_comp, &ipv6_parsed))
+    return false;
+
+  // Do some basic size checks to make sure that the address doesn't
+  // specify more than 128 bits or fewer than 128 bits. This also resolves
+  // how many zero bytes the "::" contraction represents.
+  int num_bytes_of_contraction;
+  if (!CheckIPv6ComponentsSize(ipv6_parsed, &num_bytes_of_contraction))
+    return false;
+
+  int cur_index_in_address = 0;
+
+  // Loop through each hex components, and contraction in order.
+  for (int i = 0; i <= ipv6_parsed.num_hex_components; ++i) {
+    // Append the contraction if it appears before this component.
+    if (i == ipv6_parsed.index_of_contraction) {
+      for (int j = 0; j < num_bytes_of_contraction; ++j)
+        address[cur_index_in_address++] = 0;
+    }
+    // Append the hex component's value.
+    if (i != ipv6_parsed.num_hex_components) {
+      // Get the 16-bit value for this hex component.
+      uint16 number = IPv6HexComponentToNumber<CHAR>(
+          spec, ipv6_parsed.hex_components[i]);
+      // Append to |address|, in network byte order.
+      address[cur_index_in_address++] = (number & 0xFF00) >> 8;
+      address[cur_index_in_address++] = (number & 0x00FF);
+    }
+  }
+
+  // If there was an IPv4 section, convert it into a 32-bit number and append
+  // it to |address|.
+  if (ipv6_parsed.ipv4_component.is_valid()) {
+    // Append the 32-bit number to |address|.
+    int ignored_num_ipv4_components;
+    if (CanonHostInfo::IPV4 !=
+        IPv4AddressToNumber(spec,
+                            ipv6_parsed.ipv4_component,
+                            &address[cur_index_in_address],
+                            &ignored_num_ipv4_components))
+      return false;
+  }
+
+  return true;
+}
+
+// Searches for the longest sequence of zeros in |address|, and writes the
+// range into |contraction_range|. The run must be longer than 16 bits (two or
+// more zero groups), and if there is a tie the first is chosen.
+void ChooseIPv6ContractionRange(const unsigned char address[16],
+                                url_parse::Component* contraction_range) {
+  // The longest run of zeros in |address| seen so far.
+  url_parse::Component max_range;
+
+  // The current run of zeros in |address| being iterated over.
+  url_parse::Component cur_range;
+
+  for (int i = 0; i < 16; i += 2) {
+    // Test for 16 bits worth of zero.
+    bool is_zero = (address[i] == 0 && address[i + 1] == 0);
+
+    if (is_zero) {
+      // Add the zero to the current range (or start a new one).
+      if (!cur_range.is_valid())
+        cur_range = url_parse::Component(i, 0);
+      cur_range.len += 2;
+    }
+
+    if (!is_zero || i == 14) {
+      // Just completed a run of zeros. If the run is greater than 16 bits,
+      // it is a candidate for the contraction.
+      if (cur_range.len > 2 && cur_range.len > max_range.len) {
+        max_range = cur_range;
+      }
+      cur_range.reset();
+    }
+  }
+  *contraction_range = max_range;
+}
+
+// Return true if we've made a final IPV6/BROKEN decision, false if the result
+// is NEUTRAL, and we could use a second opinion.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeIPv6Address(const CHAR* spec,
+                               const url_parse::Component& host,
+                               CanonOutput* output,
+                               CanonHostInfo* host_info) {
+  // Turn the IP address into a 128 bit number.
+  if (!IPv6AddressToNumber(spec, host, host_info->address)) {
+    // If it's not an IPv6 address, scan for characters that should *only*
+    // exist in an IPv6 address.
+    for (int i = host.begin; i < host.end(); i++) {
+      switch (spec[i]) {
+        case '[':
+        case ']':
+        case ':':
+          host_info->family = CanonHostInfo::BROKEN;
+          return true;
+      }
+    }
+
+    // No IPv6-only characters found.  Could still be IPv4 or a hostname.
+    host_info->family = CanonHostInfo::NEUTRAL;
+    return false;
+  }
+
+  host_info->out_host.begin = output->length();
+  output->push_back('[');
+
+  // We will now output the address according to the rules in:
+  // http://tools.ietf.org/html/draft-kawamura-ipv6-text-representation-01#section-4
+
+  // Start by finding where to place the "::" contraction (if any).
+  url_parse::Component contraction_range;
+  ChooseIPv6ContractionRange(host_info->address, &contraction_range);
+
+  for (int i = 0; i <= 14;) {
+    // We check 2 bytes at a time, from bytes (0, 1) to (14, 15), inclusive.
+    DCHECK(i % 2 == 0);
+    if (i == contraction_range.begin && contraction_range.len > 0) {
+      // Jump over the contraction.
+      if (i == 0)
+        output->push_back(':');
+      output->push_back(':');
+      i = contraction_range.end();
+    } else {
+      // Consume the next 16 bits from |host_info->address|.
+      int x = host_info->address[i] << 8 | host_info->address[i + 1];
+
+      i += 2;
+
+      // Stringify the 16 bit number (at most requires 4 hex digits).
+      char str[5];
+      _itoa_s(x, str, 16);
+      for (int ch = 0; str[ch] != 0; ++ch)
+        output->push_back(str[ch]);
+
+      // Put a colon after each number, except the last.
+      if (i < 16)
+        output->push_back(':');
+    }
+  }
+
+  output->push_back(']');
+  host_info->out_host.len = output->length() - host_info->out_host.begin;
+
+  host_info->family = CanonHostInfo::IPV6;
+  return true;
+}
+
+}  // namespace
+
+bool FindIPv4Components(const char* spec,
+                        const url_parse::Component& host,
+                        url_parse::Component components[4]) {
+  return DoFindIPv4Components<char, unsigned char>(spec, host, components);
+}
+
+bool FindIPv4Components(const char16* spec,
+                        const url_parse::Component& host,
+                        url_parse::Component components[4]) {
+  return DoFindIPv4Components<char16, char16>(spec, host, components);
+}
+
+void CanonicalizeIPAddress(const char* spec,
+                           const url_parse::Component& host,
+                           CanonOutput* output,
+                           CanonHostInfo* host_info) {
+  if (DoCanonicalizeIPv4Address<char, unsigned char>(
+          spec, host, output, host_info))
+    return;
+  if (DoCanonicalizeIPv6Address<char, unsigned char>(
+          spec, host, output, host_info))
+    return;
+}
+
+void CanonicalizeIPAddress(const char16* spec,
+                           const url_parse::Component& host,
+                           CanonOutput* output,
+                           CanonHostInfo* host_info) {
+  if (DoCanonicalizeIPv4Address<char16, char16>(
+          spec, host, output, host_info))
+    return;
+  if (DoCanonicalizeIPv6Address<char16, char16>(
+          spec, host, output, host_info))
+    return;
+}
+
+CanonHostInfo::Family IPv4AddressToNumber(const char* spec,
+                                          const url_parse::Component& host,
+                                          unsigned char address[4],
+                                          int* num_ipv4_components) {
+  return DoIPv4AddressToNumber<char>(spec, host, address, num_ipv4_components);
+}
+
+CanonHostInfo::Family IPv4AddressToNumber(const char16* spec,
+                                          const url_parse::Component& host,
+                                          unsigned char address[4],
+                                          int* num_ipv4_components) {
+  return DoIPv4AddressToNumber<char16>(
+      spec, host, address, num_ipv4_components);
+}
+
+bool IPv6AddressToNumber(const char* spec,
+                         const url_parse::Component& host,
+                         unsigned char address[16]) {
+  return DoIPv6AddressToNumber<char, unsigned char>(spec, host, address);
+}
+
+bool IPv6AddressToNumber(const char16* spec,
+                         const url_parse::Component& host,
+                         unsigned char address[16]) {
+  return DoIPv6AddressToNumber<char16, char16>(spec, host, address);
+}
+
+
+}  // namespace url_canon
diff --git a/googleurl/url_canon_mailtourl.cpp b/googleurl/url_canon_mailtourl.cpp
new file mode 100644
index 0000000..a216f67
--- /dev/null
+++ b/googleurl/url_canon_mailtourl.cpp
@@ -0,0 +1,139 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Functions for canonicalizing "mailto:" URLs.
+
+#include "url_canon.h"
+#include "url_canon_internal.h"
+#include "url_file.h"
+#include "url_parse_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeMailtoURL(const URLComponentSource<CHAR>& source,
+                             const url_parse::Parsed& parsed,
+                             CanonOutput* output,
+                             url_parse::Parsed* new_parsed) {
+
+  // mailto: only uses {scheme, path, query} -- clear the rest.
+  new_parsed->username = url_parse::Component();
+  new_parsed->password = url_parse::Component();
+  new_parsed->host = url_parse::Component();
+  new_parsed->port = url_parse::Component();
+  new_parsed->ref = url_parse::Component();
+
+  // Scheme (known, so we don't bother running it through the more
+  // complicated scheme canonicalizer).
+  new_parsed->scheme.begin = output->length();
+  output->Append("mailto:", 7);
+  new_parsed->scheme.len = 6;  // Length excludes the trailing ':'.
+
+  bool success = true;
+
+  // Path
+  if (parsed.path.is_valid()) {
+    new_parsed->path.begin = output->length();
+
+    // Copy the path using path URL's more lax escaping rules.
+    // We convert to UTF-8 and escape non-ASCII and control characters
+    // (below 0x20), but leave all other ASCII characters alone.
+    int end = parsed.path.end();
+    for (int i = parsed.path.begin; i < end; ++i) {
+      UCHAR uch = static_cast<UCHAR>(source.path[i]);
+      if (uch < 0x20 || uch >= 0x80)
+        success &= AppendUTF8EscapedChar(source.path, &i, end, output);
+      else
+        output->push_back(static_cast<char>(uch));
+    }
+
+    new_parsed->path.len = output->length() - new_parsed->path.begin;
+  } else {
+    // No path at all
+    new_parsed->path.reset();
+  }
+
+  // Query -- always use the default utf8 charset converter.
+  CanonicalizeQuery(source.query, parsed.query, NULL,
+                    output, &new_parsed->query);
+
+  return success;
+}
+
+} // namespace
+
+bool CanonicalizeMailtoURL(const char* spec,
+                          int spec_len,
+                          const url_parse::Parsed& parsed,
+                          CanonOutput* output,
+                          url_parse::Parsed* new_parsed) {
+  (void)spec_len;
+  return DoCanonicalizeMailtoURL<char, unsigned char>(
+      URLComponentSource<char>(spec), parsed, output, new_parsed);
+}
+
+bool CanonicalizeMailtoURL(const char16* spec,
+                           int spec_len,
+                           const url_parse::Parsed& parsed,
+                           CanonOutput* output,
+                           url_parse::Parsed* new_parsed) {
+  (void)spec_len;
+  return DoCanonicalizeMailtoURL<char16, char16>(
+      URLComponentSource<char16>(spec), parsed, output, new_parsed);
+}
+
+bool ReplaceMailtoURL(const char* base,
+                      const url_parse::Parsed& base_parsed,
+                      const Replacements<char>& replacements,
+                      CanonOutput* output,
+                      url_parse::Parsed* new_parsed) {
+  URLComponentSource<char> source(base);
+  url_parse::Parsed parsed(base_parsed);
+  SetupOverrideComponents(base, replacements, &source, &parsed);
+  return DoCanonicalizeMailtoURL<char, unsigned char>(
+      source, parsed, output, new_parsed);
+}
+
+bool ReplaceMailtoURL(const char* base,
+                      const url_parse::Parsed& base_parsed,
+                      const Replacements<char16>& replacements,
+                      CanonOutput* output,
+                      url_parse::Parsed* new_parsed) {
+  RawCanonOutput<1024> utf8;
+  URLComponentSource<char> source(base);
+  url_parse::Parsed parsed(base_parsed);
+  SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
+  return DoCanonicalizeMailtoURL<char, unsigned char>(
+      source, parsed, output, new_parsed);
+}
+
+}  // namespace url_canon
diff --git a/googleurl/url_canon_path.cpp b/googleurl/url_canon_path.cpp
new file mode 100644
index 0000000..5bce022
--- /dev/null
+++ b/googleurl/url_canon_path.cpp
@@ -0,0 +1,378 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Canonicalization functions for the paths of URLs.
+
+#include "base/logging.h"
+#include "url_canon.h"
+#include "url_canon_internal.h"
+#include "url_parse_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Per-character classification bits stored in kPathCharLookup and consulted
+// by DoPartialPath.
+enum CharacterFlags {
+  // Pass through unchanged, whether escaped or unescaped. This doesn't
+  // actually set anything so you can't OR it to check, it's just to make the
+  // table below more clear when neither ESCAPE nor UNESCAPE is set.
+  PASS = 0,
+
+  // This character requires special handling in DoPartialPath. Doing this test
+  // first allows us to filter out the common cases of regular characters that
+  // can be directly copied.
+  SPECIAL = 1,
+
+  // This character must be escaped in the canonical output. Note that all
+  // escaped chars also have the "special" bit set so that the code that looks
+  // for this is triggered. Not valid with PASS or UNESCAPE.
+  ESCAPE_BIT = 2,
+  ESCAPE = ESCAPE_BIT | SPECIAL,
+
+  // This character must be unescaped in canonical output. Not valid with
+  // ESCAPE or PASS. We DON'T set the SPECIAL flag since if we encounter these
+  // characters unescaped, they should just be copied.
+  UNESCAPE = 4,
+
+  // This character is disallowed in URLs. Note that the "special" bit is also
+  // set to trigger handling.
+  INVALID_BIT = 8,
+  INVALID = INVALID_BIT | SPECIAL
+};
+
+// This table contains one of the above flag values, indexed by the 8-bit
+// character value. Note some flags are more than one bit because they also
+// turn on the "special" flag. Special is the only flag that may be combined
+// with others.
+//
+// This table is designed to match exactly what IE does with the characters.
+//
+// Dot is even more special, and the escaped version is handled specially by
+// IsDot. Therefore, we don't need the "escape" flag, and even the "unescape"
+// bit is never handled (we just need the "special" bit).
+const unsigned char kPathCharLookup[0x100] = {
+//   NULL     control chars...
+     INVALID, ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,
+//   control chars...
+     ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,
+//   ' '      !        "        #        $        %        &        '        (        )        *        +        ,        -        .        /
+     ESCAPE,  PASS,    ESCAPE,  ESCAPE,  PASS,    ESCAPE,  PASS,    PASS,    PASS,    PASS,    PASS,    PASS,    PASS,    UNESCAPE,SPECIAL, PASS,
+//   0        1        2        3        4        5        6        7        8        9        :        ;        <        =        >        ?
+     UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS,    PASS,    ESCAPE,  PASS,    ESCAPE,  ESCAPE,
+//   @        A        B        C        D        E        F        G        H        I        J        K        L        M        N        O
+     PASS,    UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,
+//   P        Q        R        S        T        U        V        W        X        Y        Z        [        \        ]        ^        _
+     UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS,    ESCAPE,  PASS,    ESCAPE,  UNESCAPE,
+//   `        a        b        c        d        e        f        g        h        i        j        k        l        m        n        o
+     ESCAPE,  UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,
+//   p        q        r        s        t        u        v        w        x        y        z        {        |        }        ~        <DEL>
+     UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,ESCAPE,  ESCAPE,  ESCAPE,  UNESCAPE,ESCAPE,
+//   ...all the high-bit characters are escaped
+     ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,
+     ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,
+     ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,
+     ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,
+     ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,
+     ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,
+     ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,
+     ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE,  ESCAPE};
+
+// Result of ClassifyAfterDot: what a dot found at the start of a path
+// component means.
+enum DotDisposition {
+  // The given dot is just part of a filename and is not special.
+  NOT_A_DIRECTORY,
+
+  // The given dot is the current directory.
+  DIRECTORY_CUR,
+
+  // The given dot is the first of a double dot that should take us up one.
+  DIRECTORY_UP
+};
+
+// Decides what a dot that the path resolver just found means, by looking at
+// the input that follows it. |after_dot| is the index of the first character
+// past that dot and |end| is one past the last valid index of |spec|; the dot
+// being the final character of the input is handled.
+//
+// |*consumed_len| receives the number of input characters after the first dot
+// that belong to the construct that was recognized (0 when nothing else is
+// consumed). The length of the first dot itself is the caller's business.
+//
+// Example: for the input "../foo" with |after_dot| = 1 and |end| = 6 the
+// result is DIRECTORY_UP and |*consumed_len| = 2, covering the "./" that
+// follows the first dot.
+template<typename CHAR>
+DotDisposition ClassifyAfterDot(const CHAR* spec, int after_dot,
+                                int end, int* consumed_len) {
+  // "." at the very end of the input: current directory, nothing more to eat.
+  if (after_dot == end) {
+    *consumed_len = 0;
+    return DIRECTORY_CUR;
+  }
+
+  // "./" (or ".\"): current directory, also eat the separator.
+  if (url_parse::IsURLSlash(spec[after_dot])) {
+    *consumed_len = 1;
+    return DIRECTORY_CUR;
+  }
+
+  // A second dot (IsDot reports its length in input characters) must be
+  // followed by the end of input or a separator to count as "..".
+  const int next_dot_len = IsDot(spec, after_dot, end);
+  if (next_dot_len != 0) {
+    const int tail = after_dot + next_dot_len;
+    const bool at_end = (tail == end);
+    if (at_end || url_parse::IsURLSlash(spec[tail])) {
+      // Eat the second dot, plus the separator when there is one.
+      *consumed_len = at_end ? next_dot_len : next_dot_len + 1;
+      return DIRECTORY_UP;
+    }
+  }
+
+  // Anything else means the dot is an ordinary part of a name.
+  *consumed_len = 0;
+  return NOT_A_DIRECTORY;
+}
+
+// Rewinds the output to the previous slash. It is assumed that the output
+// ends with a slash and this doesn't count (we call this when we are
+// appending directory paths, so the previous path component has an ending
+// slash).
+//
+// This will stop at the first slash (assumed to be at position
+// |path_begin_in_output|) and not go any higher than that. Some web pages
+// do ".." too many times, so we need to handle that brokenness.
+//
+// It searches for a literal slash rather than including a backslash as well
+// because it is run only on the canonical output.
+//
+// |path_begin_in_output| is expected to be a non-negative index into
+// |output|. When the preconditions hold, the output is guaranteed to end in a
+// slash when this function completes. When they do not (empty output, or an
+// output that does not reach |path_begin_in_output|), the output is left
+// untouched instead of being indexed out of range.
+void BackUpToPreviousSlash(int path_begin_in_output,
+                           CanonOutput* output) {
+  DCHECK(output->length() > 0);
+
+  int i = output->length() - 1;
+
+  // Hard guard for builds where the DCHECKs do not fire (they are typically
+  // compiled out of release builds): there is nothing in front of the path
+  // start that we are allowed to look at or remove.
+  if (i < 0 || i < path_begin_in_output)
+    return;
+
+  DCHECK(output->at(i) == '/');
+  if (i == path_begin_in_output)
+    return;  // We're at the first slash, nothing to do.
+
+  // Now back up (skipping the trailing slash) until we find another slash.
+  // The lower bound is tested before the character is read so the index is
+  // known to be in range at every access.
+  i--;
+  while (i > path_begin_in_output && output->at(i) != '/')
+    i--;
+
+  // Now shrink the output to just include that last slash we found.
+  output->set_length(i + 1);
+}
+
+// Appends the given path to the output. It assumes that if the input path
+// starts with a slash, it should be copied to the output. If no path has
+// already been appended to the output (the case when not resolving
+// relative URLs), the path should begin with a slash.
+//
+// If there are already path components (this mode is used when appending
+// relative paths for resolving), it assumes that the output already has
+// a trailing slash and that if the input begins with a slash, it should be
+// copied to the output.
+//
+// We do not collapse multiple slashes in a row to a single slash. It seems
+// no web browsers do this, and we don't want incompatibilities, even though
+// it would be correct for most systems.
+//
+// Parameters:
+//   spec                  Input buffer; the characters in
+//                         [path.begin, path.end()) are processed.
+//   path                  Component describing the path range within |spec|.
+//   path_begin_in_output  Index in |output| where the canonical path starts;
+//                         ".." handling never backs up past it.
+//   output                Receives the canonicalized characters.
+//
+// Returns false if an invalid character or escape was encountered (the
+// output is still written in that case), true otherwise.
+template<typename CHAR, typename UCHAR>
+bool DoPartialPath(const CHAR* spec,
+                   const url_parse::Component& path,
+                   int path_begin_in_output,
+                   CanonOutput* output) {
+  int end = path.end();
+
+  bool success = true;
+  for (int i = path.begin; i < end; i++) {
+    UCHAR uch = static_cast<UCHAR>(spec[i]);
+    if (sizeof(CHAR) > sizeof(char) && uch >= 0x80) {
+      // We only need to test wide input for having non-ASCII characters. For
+      // narrow input, we'll always just use the lookup table. We don't try to
+      // do anything tricky with decoding/validating UTF-8. This function will
+      // read one or two UTF-16 characters and append the output as UTF-8. This
+      // call will be removed in 8-bit mode.
+      success &= AppendUTF8EscapedChar(spec, &i, end, output);
+    } else {
+      // Normal ASCII character or 8-bit input, use the lookup table.
+      unsigned char out_ch = static_cast<unsigned char>(uch);
+      unsigned char flags = kPathCharLookup[out_ch];
+      if (flags & SPECIAL) {
+        // Needs special handling of some sort.
+        int dotlen;
+        if ((dotlen = IsDot(spec, i, end)) > 0) {
+          // See if this dot was preceded by a slash in the output. We
+          // expect that when canonicalizing paths, they will always
+          // start with a slash and not a dot (asserted by the DCHECK below);
+          // the length test in the condition still guards the read of the
+          // last output character if that expectation is violated.
+          //
+          // Note that we check this in the case of dots so we don't have to
+          // special case slashes. Since slashes are much more common than
+          // dots, this actually increases performance measurably (though
+          // slightly).
+          DCHECK(output->length() > path_begin_in_output);
+          if (output->length() > path_begin_in_output &&
+              output->at(output->length() - 1) == '/') {
+            // Slash followed by a dot, check to see if this means a relative
+            // directory ("." or "..").
+            int consumed_len;
+            switch (ClassifyAfterDot<CHAR>(spec, i + dotlen, end,
+                                           &consumed_len)) {
+              case NOT_A_DIRECTORY:
+                // Copy the dot to the output, it means nothing special.
+                // The "- 1" accounts for the loop's own i++.
+                output->push_back('.');
+                i += dotlen - 1;
+                break;
+              case DIRECTORY_CUR:  // Current directory, just skip the input.
+                i += dotlen + consumed_len - 1;
+                break;
+              case DIRECTORY_UP:
+                // Drop the previous path component from the output, then
+                // skip the ".." (and any slash after it) in the input.
+                BackUpToPreviousSlash(path_begin_in_output, output);
+                i += dotlen + consumed_len - 1;
+                break;
+            }
+          } else {
+            // This dot is not preceded by a slash, it is just part of some
+            // file name.
+            output->push_back('.');
+            i += dotlen - 1;
+          }
+
+        } else if (out_ch == '\\') {
+          // Convert backslashes to forward slashes
+          output->push_back('/');
+
+        } else if (out_ch == '%') {
+          // Handle escape sequences.
+          unsigned char unescaped_value;
+          if (DecodeEscaped(spec, &i, end, &unescaped_value)) {
+            // Valid escape sequence, see if we keep, reject, or unescape it.
+            char unescaped_flags = kPathCharLookup[unescaped_value];
+
+            if (unescaped_flags & UNESCAPE) {
+              // This escaped value shouldn't be escaped, copy it.
+              output->push_back(unescaped_value);
+            } else if (unescaped_flags & INVALID_BIT) {
+              // Invalid escaped character, copy it and remember the error.
+              output->push_back('%');
+              output->push_back(static_cast<char>(spec[i - 1]));
+              output->push_back(static_cast<char>(spec[i]));
+              success = false;
+            } else {
+              // Valid escaped character but we should keep it escaped. We
+              // don't want to change the case of any hex letters in case
+              // the server is sensitive to that, so we just copy the two
+              // characters without checking (DecodeEscaped will have advanced
+              // to the last character of the pair).
+              output->push_back('%');
+              output->push_back(static_cast<char>(spec[i - 1]));
+              output->push_back(static_cast<char>(spec[i]));
+            }
+          } else {
+            // Invalid escape sequence. IE7 rejects any URLs with such
+            // sequences, while Firefox, IE6, and Safari all pass it through
+            // unchanged. We are more permissive unlike IE7. I don't think this
+            // can cause significant problems, if it does, we should change
+            // to be more like IE7.
+            output->push_back('%');
+          }
+
+        } else if (flags & INVALID_BIT) {
+          // For NULLs, etc. fail. The character is still written (escaped).
+          AppendEscapedChar(out_ch, output);
+          success = false;
+
+        } else if (flags & ESCAPE_BIT) {
+          // This character should be escaped.
+          AppendEscapedChar(out_ch, output);
+        }
+      } else {
+        // Nothing special about this character, just append it.
+        output->push_back(out_ch);
+      }
+    }
+  }
+  return success;
+}
+
+// Canonicalizes a complete path component: appends the canonical form of
+// |path| (a range within |spec|) to |output| and records where it landed in
+// |out_path|. The written path always starts with a slash. Returns the
+// success value of DoPartialPath, or true when there was nothing to convert.
+template<typename CHAR, typename UCHAR>
+bool DoPath(const CHAR* spec,
+            const url_parse::Component& path,
+            CanonOutput* output,
+            url_parse::Component* out_path) {
+  out_path->begin = output->length();
+
+  bool ok = true;
+  if (path.len <= 0) {
+    // Nothing in the input: the canonical path is a lone slash.
+    output->push_back('/');
+  } else {
+    // A freshly parsed URL already starts its path with a slash, but the
+    // replacement and relative-resolution cases of file URLs may not, so
+    // supply the leading slash ourselves when it is missing.
+    const bool has_leading_slash = url_parse::IsURLSlash(spec[path.begin]);
+    if (!has_leading_slash)
+      output->push_back('/');
+
+    ok = DoPartialPath<CHAR, UCHAR>(spec, path, out_path->begin, output);
+  }
+
+  out_path->len = output->length() - out_path->begin;
+  return ok;
+}
+
+}  // namespace
+
+// 8-bit entry point: canonicalizes the |path| component of |spec| into
+// |output| and reports its location through |out_path|.
+bool CanonicalizePath(const char* spec,
+                      const url_parse::Component& path,
+                      CanonOutput* output,
+                      url_parse::Component* out_path) {
+  const bool ok = DoPath<char, unsigned char>(spec, path, output, out_path);
+  return ok;
+}
+
+// UTF-16 entry point: canonicalizes the |path| component of |spec| into
+// |output| and reports its location through |out_path|.
+bool CanonicalizePath(const char16* spec,
+                      const url_parse::Component& path,
+                      CanonOutput* output,
+                      url_parse::Component* out_path) {
+  const bool ok = DoPath<char16, char16>(spec, path, output, out_path);
+  return ok;
+}
+
+// 8-bit entry point: appends the canonical form of |path| to an |output|
+// whose path begins at index |path_begin_in_output|.
+bool CanonicalizePartialPath(const char* spec,
+                             const url_parse::Component& path,
+                             int path_begin_in_output,
+                             CanonOutput* output) {
+  const bool ok = DoPartialPath<char, unsigned char>(
+      spec, path, path_begin_in_output, output);
+  return ok;
+}
+
+// UTF-16 entry point: appends the canonical form of |path| to an |output|
+// whose path begins at index |path_begin_in_output|.
+bool CanonicalizePartialPath(const char16* spec,
+                             const url_parse::Component& path,
+                             int path_begin_in_output,
+                             CanonOutput* output) {
+  const bool ok = DoPartialPath<char16, char16>(
+      spec, path, path_begin_in_output, output);
+  return ok;
+}
+
+}  // namespace url_canon
diff --git a/googleurl/url_canon_pathurl.cpp b/googleurl/url_canon_pathurl.cpp
new file mode 100644
index 0000000..74b5721
--- /dev/null
+++ b/googleurl/url_canon_pathurl.cpp
@@ -0,0 +1,130 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Functions for canonicalizing "path" URLs. Not to be confused with the path
+// of a URL, these are URLs that have no authority section, only a path. For
+// example, "javascript:" and "data:".
+
+#include "url_canon.h"
+#include "url_canon_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Canonicalizes a "path URL" (scheme plus opaque path, e.g. javascript: or
+// data:). The scheme is canonicalized normally; the path is copied with lax
+// rules; every other component of |new_parsed| is reset. Returns false if the
+// scheme or any escaped path character failed to canonicalize.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
+                           const url_parse::Parsed& parsed,
+                           CanonOutput* output,
+                           url_parse::Parsed* new_parsed) {
+  // Scheme first; this also appends the colon.
+  bool ok = CanonicalizeScheme(source.scheme, parsed.scheme,
+                               output, &new_parsed->scheme);
+
+  // Path URLs are assumed to carry no authority section.
+  new_parsed->username.reset();
+  new_parsed->password.reset();
+  new_parsed->host.reset();
+  new_parsed->port.reset();
+
+  if (!parsed.path.is_valid()) {
+    // No path at all.
+    new_parsed->path.reset();
+  } else {
+    // Path URLs use relaxed escaping (think javascript:): characters in the
+    // range [0x20, 0x80) are copied verbatim to keep JavaScript readable,
+    // everything else is converted to UTF-8 and escaped.
+    new_parsed->path.begin = output->length();
+    const int path_end = parsed.path.end();
+    for (int pos = parsed.path.begin; pos < path_end; pos++) {
+      const UCHAR ch = static_cast<UCHAR>(source.path[pos]);
+      const bool copy_verbatim = (ch >= 0x20 && ch < 0x80);
+      if (copy_verbatim) {
+        output->push_back(static_cast<char>(ch));
+      } else {
+        // May advance |pos| past additional input characters.
+        ok &= AppendUTF8EscapedChar(source.path, &pos, path_end, output);
+      }
+    }
+    new_parsed->path.len = output->length() - new_parsed->path.begin;
+  }
+
+  // Query and ref are assumed absent as well.
+  new_parsed->query.reset();
+  new_parsed->ref.reset();
+
+  return ok;
+}
+
+}  // namespace
+
+// 8-bit entry point for path-URL canonicalization. |spec_len| is ignored by
+// this wrapper.
+bool CanonicalizePathURL(const char* spec,
+                         int spec_len,
+                         const url_parse::Parsed& parsed,
+                         CanonOutput* output,
+                         url_parse::Parsed* new_parsed) {
+  (void)spec_len;  // Unused.
+  const URLComponentSource<char> whole_spec(spec);
+  return DoCanonicalizePathURL<char, unsigned char>(whole_spec, parsed, output,
+                                                    new_parsed);
+}
+
+// UTF-16 entry point for path-URL canonicalization. |spec_len| is ignored by
+// this wrapper.
+bool CanonicalizePathURL(const char16* spec,
+                         int spec_len,
+                         const url_parse::Parsed& parsed,
+                         CanonOutput* output,
+                         url_parse::Parsed* new_parsed) {
+  (void)spec_len;  // Unused.
+  const URLComponentSource<char16> whole_spec(spec);
+  return DoCanonicalizePathURL<char16, char16>(whole_spec, parsed, output,
+                                               new_parsed);
+}
+
+// Applies 8-bit |replacements| on top of the already-parsed |base| path URL
+// and canonicalizes the combined result into |output| / |new_parsed|.
+bool ReplacePathURL(const char* base,
+                    const url_parse::Parsed& base_parsed,
+                    const Replacements<char>& replacements,
+                    CanonOutput* output,
+                    url_parse::Parsed* new_parsed) {
+  // Work on private copies so the caller's parse result is left untouched.
+  url_parse::Parsed merged_parsed(base_parsed);
+  URLComponentSource<char> merged_source(base);
+  SetupOverrideComponents(base, replacements, &merged_source, &merged_parsed);
+
+  return DoCanonicalizePathURL<char, unsigned char>(
+      merged_source, merged_parsed, output, new_parsed);
+}
+
+// Applies UTF-16 |replacements| on top of the already-parsed 8-bit |base|
+// path URL and canonicalizes the combined result.
+bool ReplacePathURL(const char* base,
+                    const url_parse::Parsed& base_parsed,
+                    const Replacements<char16>& replacements,
+                    CanonOutput* output,
+                    url_parse::Parsed* new_parsed) {
+  // Scratch buffer handed to SetupUTF16OverrideComponents. It is declared
+  // first so it stays alive for the whole canonicalization call below
+  // (presumably the override components point into it -- confirm against
+  // SetupUTF16OverrideComponents).
+  RawCanonOutput<1024> utf8_scratch;
+
+  url_parse::Parsed merged_parsed(base_parsed);
+  URLComponentSource<char> merged_source(base);
+  SetupUTF16OverrideComponents(base, replacements, &utf8_scratch,
+                               &merged_source, &merged_parsed);
+
+  return DoCanonicalizePathURL<char, unsigned char>(
+      merged_source, merged_parsed, output, new_parsed);
+}
+
+}  // namespace url_canon
diff --git a/googleurl/url_canon_query.cpp b/googleurl/url_canon_query.cpp
new file mode 100644
index 0000000..d52d7c7
--- /dev/null
+++ b/googleurl/url_canon_query.cpp
@@ -0,0 +1,189 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "url_canon.h"
+#include "url_canon_internal.h"
+
+// Query canonicalization in IE
+// ----------------------------
+// IE is very permissive for query parameters specified in links on the page
+// (in contrast to links that it constructs itself based on form data). It does
+// not unescape any character. It does not reject any escape sequence (be they
+// invalid like "%2y" or freaky like %00).
+//
+// IE only escapes spaces and nothing else. Embedded NULLs, tabs (0x09),
+// LF (0x0a), and CR (0x0d) are removed (this probably happens at an earlier
+// layer since they are removed from all portions of the URL). All other
+// characters are passed unmodified. Invalid UTF-16 sequences are preserved as
+// well, with each character in the input being converted to UTF-8. It is the
+// server's job to make sense of this invalid query.
+//
+// Invalid multibyte sequences (for example, invalid UTF-8 on a UTF-8 page)
+// are converted to the invalid character and sent as unescaped UTF-8 (0xef,
+// 0xbf, 0xbd). This may not be canonicalization, the parser may generate these
+// strings before the URL handler ever sees them.
+//
+// Our query canonicalization
+// --------------------------
+// We escape all non-ASCII characters and control characters, like Firefox.
+// This is more conformant to the URL spec, and there do not seem to be many
+// problems relating to Firefox's behavior.
+//
+// Like IE, we will never unescape (although the application may want to try
+// unescaping to present the user with a more understandable URL). We will
+// replace all invalid sequences (including invalid UTF-16 sequences, which IE
+// doesn't) with the "invalid character," and we will escape it.
+
+namespace url_canon {
+
+namespace {
+
+// Returns true when every character of |spec| that lies inside the |query|
+// component, i.e. at indices [query.begin, query.end()), has a value below
+// 0x80. An empty range yields true.
+template<typename CHAR, typename UCHAR>
+bool IsAllASCII(const CHAR* spec, const url_parse::Component& query) {
+  const int stop = query.end();
+  int pos = query.begin;
+  // Advance over the leading run of 7-bit characters.
+  while (pos < stop && static_cast<UCHAR>(spec[pos]) < 0x80)
+    pos++;
+  // We only reach the end if no high-bit character stopped the scan.
+  return pos >= stop;
+}
+
+// Appends |length| characters of |source| to |output|, escaping every one
+// that IsQueryChar rejects. Accepts 8- or 16-bit character types, but each
+// character is treated as a single byte, so 16-bit input is expected to hold
+// only 7-bit values. Multi-byte sequences are not validated; each byte is
+// handled on its own.
+template<typename CHAR>
+void AppendRaw8BitQueryString(const CHAR* source, int length,
+                              CanonOutput* output) {
+  for (int pos = 0; pos < length; pos++) {
+    const unsigned char byte = static_cast<unsigned char>(source[pos]);
+    if (IsQueryChar(byte)) {
+      // Allowed as-is in a query.
+      output->push_back(static_cast<char>(source[pos]));
+    } else {
+      AppendEscapedChar(byte, output);
+    }
+  }
+}
+
+// Feeds the |query| range of the UTF-8 |spec| through |converter|. The
+// converter consumes UTF-16, so the input is widened into a temporary buffer
+// first. |converter| must be non-NULL.
+void RunConverter(const char* spec,
+                  const url_parse::Component& query,
+                  CharsetConverter* converter,
+                  CanonOutput* output) {
+  // Misencoded input is replaced by the invalid character during widening,
+  // which is the behavior we want, so no error is checked for here.
+  const char* utf8_begin = spec + query.begin;
+  RawCanonOutputW<1024> widened;
+  ConvertUTF8ToUTF16(utf8_begin, query.len, &widened);
+
+  converter->ConvertFromUTF16(widened.data(), widened.length(), output);
+}
+
+// Feeds the |query| range of the UTF-16 |spec| through |converter|. The input
+// is already in the form the converter wants; this overload exists so the
+// same calling code works for both UTF-8 and UTF-16 input.
+void RunConverter(const char16* spec,
+                  const url_parse::Component& query,
+                  CharsetConverter* converter,
+                  CanonOutput* output) {
+  const char16* utf16_begin = spec + query.begin;
+  converter->ConvertFromUTF16(utf16_begin, query.len, output);
+}
+
+// Appends the |query| range of |spec| to |output| in query encoding, escaping
+// as needed. Pure-ASCII input is appended directly; otherwise the input goes
+// through |converter| when one is supplied, or through the built-in UTF-8
+// conversion when |converter| is NULL.
+template<typename CHAR, typename UCHAR>
+void DoConvertToQueryEncoding(const CHAR* spec,
+                              const url_parse::Component& query,
+                              CharsetConverter* converter,
+                              CanonOutput* output) {
+  // Fast path: nothing needs a character set conversion.
+  if (IsAllASCII<CHAR, UCHAR>(spec, query)) {
+    AppendRaw8BitQueryString(&spec[query.begin], query.len, output);
+    return;
+  }
+
+  // Non-ASCII input without a converter: do our own UTF-8 conversion.
+  if (!converter) {
+    AppendStringOfType(&spec[query.begin], query.len, CHAR_QUERY, output);
+    return;
+  }
+
+  // Non-ASCII input with a converter: produce an 8-bit string in the target
+  // encoding, then append it with the necessary escaping.
+  RawCanonOutput<1024> converted;
+  RunConverter(spec, query, converter, &converted);
+  AppendRaw8BitQueryString(converted.data(), converted.length(), output);
+}
+
+// Canonicalizes the query component. When |query| has a negative length,
+// nothing is written and |*out_query| is set to a default-constructed
+// component. Otherwise a '?' is appended, followed by the encoded query, and
+// |*out_query| describes the encoded text (excluding the '?').
+template<typename CHAR, typename UCHAR>
+void DoCanonicalizeQuery(const CHAR* spec,
+                         const url_parse::Component& query,
+                         CharsetConverter* converter,
+                         CanonOutput* output,
+                         url_parse::Component* out_query) {
+  if (query.len >= 0) {
+    output->push_back('?');
+
+    // The recorded component starts just past the separator.
+    out_query->begin = output->length();
+    DoConvertToQueryEncoding<CHAR, UCHAR>(spec, query, converter, output);
+    out_query->len = output->length() - out_query->begin;
+  } else {
+    *out_query = url_parse::Component();
+  }
+}
+
+}  // namespace
+
+// 8-bit entry point for query canonicalization; forwards to the shared
+// template implementation.
+void CanonicalizeQuery(const char* spec,
+                       const url_parse::Component& query,
+                       CharsetConverter* converter,
+                       CanonOutput* output,
+                       url_parse::Component* out_query) {
+  DoCanonicalizeQuery<char, unsigned char>(
+      spec, query, converter, output, out_query);
+}
+
+// UTF-16 entry point for query canonicalization; forwards to the shared
+// template implementation.
+void CanonicalizeQuery(const char16* spec,
+                       const url_parse::Component& query,
+                       CharsetConverter* converter,
+                       CanonOutput* output,
+                       url_parse::Component* out_query) {
+  DoCanonicalizeQuery<char16, char16>(
+      spec, query, converter, output, out_query);
+}
+
+// Appends the |query| range of the UTF-16 |input| to |output| in query
+// encoding. Unlike CanonicalizeQuery, no '?' is written and no output
+// component is recorded.
+void ConvertUTF16ToQueryEncoding(const char16* input,
+                                 const url_parse::Component& query,
+                                 CharsetConverter* converter,
+                                 CanonOutput* output) {
+  DoConvertToQueryEncoding<char16, char16>(
+      input, query, converter, output);
+}
+
+}  // namespace url_canon
diff --git a/googleurl/url_canon_relative.cpp b/googleurl/url_canon_relative.cpp
new file mode 100644
index 0000000..b654db0
--- /dev/null
+++ b/googleurl/url_canon_relative.cpp
@@ -0,0 +1,580 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Canonicalizer functions for working with and resolving relative URLs.
+
+#include "base/logging.h"
+#include "url_canon.h"
+#include "url_canon_internal.h"
+#include "url_file.h"
+#include "url_parse_internal.h"
+#include "url_util_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Returns true when the scheme of |cmp| matches the scheme of |base| once
+// each character of |cmp| has been passed through CanonicalSchemeChar.
+//
+// Firefox does a case-sensitive compare (which is probably wrong--Mozilla bug
+// 379034), whereas IE is case-insensitive.
+//
+// We choose to be more permissive like IE. We don't need to worry about
+// unescaping or anything here: neither IE nor Firefox allows this. We also
+// don't have to worry about invalid scheme characters since we are comparing
+// against the canonical scheme of the base.
+//
+// The base URL should always be canonical, and therefore ASCII.
+template<typename CHAR>
+bool AreSchemesEqual(const char* base,
+                     const url_parse::Component& base_scheme,
+                     const CHAR* cmp,
+                     const url_parse::Component& cmp_scheme) {
+  const int scheme_len = base_scheme.len;
+  if (scheme_len != cmp_scheme.len)
+    return false;
+
+  // Only |cmp| is canonicalized; the base is assumed to be canonical already
+  // and is compared as-is.
+  const char* base_chars = base + base_scheme.begin;
+  const CHAR* cmp_chars = cmp + cmp_scheme.begin;
+  for (int offset = 0; offset < scheme_len; offset++) {
+    if (CanonicalSchemeChar(cmp_chars[offset]) != base_chars[offset])
+      return false;
+  }
+  return true;
+}
+
+#ifdef WIN32
+
+// Here, we also allow Windows paths to be represented as "/C:/" so we can be
+// consistent about URL paths beginning with slashes. This function is like
+// DoesBeginWindowsDriveSpec except that it also requires a slash at the
+// beginning.
+//
+// Returns false when |start_offset| is at or past |spec_len|.
+template<typename CHAR>
+bool DoesBeginSlashWindowsDriveSpec(const CHAR* spec, int start_offset,
+                                    int spec_len) {
+  // Nothing to look at.
+  if (start_offset >= spec_len)
+    return false;
+
+  // The leading character must be a URL slash...
+  if (!url_parse::IsURLSlash(spec[start_offset]))
+    return false;
+
+  // ...and what follows it must be a drive spec.
+  return url_parse::DoesBeginWindowsDriveSpec(spec, start_offset + 1,
+                                              spec_len);
+}
+
+#endif  // WIN32
+
+// See IsRelativeURL in the header file for usage.
+//
+// Decides whether |url| (|url_len| characters) should be resolved relative to
+// |base|. On a true return, |*is_relative| holds the verdict and, when the
+// input is relative, |*relative_component| is the range of |url| that should
+// be resolved against the base. Returns false when the input can only be
+// interpreted as relative but |is_base_hierarchical| is false, i.e. the base
+// cannot have relative URLs resolved against it.
+template<typename CHAR>
+bool DoIsRelativeURL(const char* base,
+                     const url_parse::Parsed& base_parsed,
+                     const CHAR* url,
+                     int url_len,
+                     bool is_base_hierarchical,
+                     bool* is_relative,
+                     url_parse::Component* relative_component) {
+  *is_relative = false;  // So we can default later to not relative.
+
+  // Trim whitespace and construct a new range for the substring.
+  int begin = 0;
+  url_parse::TrimURL(url, &begin, &url_len);
+  if (begin >= url_len) {
+    // Empty URLs are relative, but do nothing.
+    *relative_component = url_parse::Component(begin, 0);
+    *is_relative = true;
+    return true;
+  }
+
+#ifdef WIN32
+  // We special case paths like "C:\foo" so they can link directly to the
+  // file on Windows (IE compatibility). The security domain stuff should
+  // prevent a link like this from actually being followed if it's on a
+  // web page.
+  //
+  // We treat "C:/foo" as an absolute URL. We can go ahead and treat "/c:/"
+  // as relative, as this will just replace the path when the base scheme
+  // is a file and the answer will still be correct.
+  //
+  // We require strict backslashes when detecting UNC since two forward
+  // slashes should be treated as a relative URL with a hostname.
+  if (url_parse::DoesBeginWindowsDriveSpec(url, begin, url_len) ||
+      url_parse::DoesBeginUNCPath(url, begin, url_len, true))
+    return true;
+#endif  // WIN32
+
+  // See if we've got a scheme, if not, we know this is a relative URL.
+  // BUT: Just because we have a scheme, doesn't make it absolute.
+  // "http:foo.html" is a relative URL with path "foo.html". If the scheme is
+  // empty, we treat it as relative (":foo") like IE does.
+  url_parse::Component scheme;
+  if (!url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0) {
+    // Don't allow relative URLs if the base scheme doesn't support it.
+    if (!is_base_hierarchical)
+      return false;
+
+    *relative_component = url_parse::MakeRange(begin, url_len);
+    *is_relative = true;
+    return true;
+  }
+
+  // If the scheme isn't valid, then it's relative.
+  int scheme_end = scheme.end();
+  for (int i = scheme.begin; i < scheme_end; i++) {
+    if (!CanonicalSchemeChar(url[i])) {
+      // Same rule as the no-scheme case above: a relative URL cannot be
+      // resolved against a base whose scheme is not hierarchical, so report
+      // failure rather than claiming the input is relative.
+      if (!is_base_hierarchical)
+        return false;
+
+      *relative_component = url_parse::MakeRange(begin, url_len);
+      *is_relative = true;
+      return true;
+    }
+  }
+
+  // If the scheme is not the same, then we can't count it as relative.
+  if (!AreSchemesEqual(base, base_parsed.scheme, url, scheme))
+    return true;
+
+  // When the scheme that they both share is not hierarchical, treat the
+  // incoming scheme as absolute (this way with the base of "data:foo",
+  // "data:bar" will be reported as absolute).
+  if (!is_base_hierarchical)
+    return true;
+
+  int colon_offset = scheme.end();
+
+  // If it's a filesystem URL, the only valid way to make it relative is not to
+  // supply a scheme.  There's no equivalent to e.g. http:index.html.
+  if (url_util::CompareSchemeComponent(url, scheme, "filesystem"))
+    return true;
+
+  // ExtractScheme guarantees that the colon immediately follows what it
+  // considers to be the scheme. CountConsecutiveSlashes will handle the
+  // case where the begin offset is the end of the input.
+  int num_slashes = url_parse::CountConsecutiveSlashes(url, colon_offset + 1,
+                                                       url_len);
+
+  if (num_slashes == 0 || num_slashes == 1) {
+    // No slashes means it's a relative path like "http:foo.html". One slash
+    // is an absolute path. "http:/home/foo.html"
+    *is_relative = true;
+    *relative_component = url_parse::MakeRange(colon_offset + 1, url_len);
+    return true;
+  }
+
+  // Two or more slashes after the scheme we treat as absolute.
+  return true;
+}
+
+// Appends to |output| the characters of |spec| from |begin| up to and
+// including the last slash found in the range [begin, end). If the range
+// holds no slash, nothing is appended.
+//
+// The input is assumed to be canonical, so we search only for exact slashes
+// and not backslashes as well. We also know that it's ASCII.
+void CopyToLastSlash(const char* spec,
+                     int begin,
+                     int end,
+                     CanonOutput* output) {
+  // Scan backwards from the end of the range for the final slash.
+  int last_slash = -1;
+  int pos = end;
+  while (pos > begin) {
+    pos--;
+    if (spec[pos] == '/') {
+      last_slash = pos;
+      break;
+    }
+  }
+
+  // No slash in the range: leave the output untouched.
+  if (last_slash < 0)
+    return;
+
+  // Emit everything from the start of the range through the slash itself.
+  const char* const stop = spec + last_slash;
+  for (const char* cur = spec + begin; cur <= stop; cur++)
+    output->push_back(*cur);
+}
+
+// Appends one component of |source| to |output| verbatim and reports its new
+// location in |output_component|. This is used when resolving relative URLs
+// and a given component is unchanged. Since the source should already be
+// canonical, nothing special is done to the characters, and the input is
+// ASCII.
+//
+// A component with a negative length is treated as absent: nothing is
+// appended and |output_component| is set to a default-constructed Component.
+void CopyOneComponent(const char* source,
+                      const url_parse::Component& source_component,
+                      CanonOutput* output,
+                      url_parse::Component* output_component) {
+  const bool is_present = source_component.len >= 0;
+  if (!is_present) {
+    *output_component = url_parse::Component();
+    return;
+  }
+
+  // The copy starts wherever the output currently ends.
+  output_component->begin = output->length();
+
+  const int stop = source_component.end();
+  int index = source_component.begin;
+  while (index < stop) {
+    output->push_back(source[index]);
+    index++;
+  }
+
+  output_component->len = output->length() - output_component->begin;
+}
+
+#ifdef WIN32
+
+// Called on Windows when the base URL is a file URL, this will copy the "C:"
+// to the output, if there is a drive letter and if that drive letter is not
+// being overridden by the relative URL. Otherwise, do nothing.
+//
+// It will return the index of the beginning of the next character in the
+// base to be processed: if there is a "C:", the slash after it, or if
+// there is no drive letter, the slash at the beginning of the path, or
+// the end of the base. This can be used as the starting offset for further
+// path processing.
+template<typename CHAR>
+int CopyBaseDriveSpecIfNecessary(const char* base_url,
+                                 int base_path_begin,
+                                 int base_path_end,
+                                 const CHAR* relative_url,
+                                 int path_start,
+                                 int relative_url_len,
+                                 CanonOutput* output) {
+  // An empty base path has no drive spec to copy.
+  if (base_path_begin >= base_path_end)
+    return base_path_begin;
+
+  // Two reasons to leave the output alone: the relative URL brings its own
+  // drive spec (e.g. "C:/foo"), which replaces the one in the base, or the
+  // base path does not begin with a slash followed by a drive spec.
+  const bool relative_has_drive = url_parse::DoesBeginWindowsDriveSpec(
+      relative_url, path_start, relative_url_len);
+  if (relative_has_drive ||
+      !DoesBeginSlashWindowsDriveSpec(base_url, base_path_begin,
+                                      base_path_end)) {
+    return base_path_begin;
+  }
+
+  // Copy the slash and the two-character drive spec to the output. It will
+  // now look like "file:///C:" so the rest can be treated like a standard
+  // path.
+  const char* drive_spec = base_url + base_path_begin + 1;
+  output->push_back('/');
+  output->push_back(drive_spec[0]);
+  output->push_back(drive_spec[1]);
+  return base_path_begin + 3;
+}
+
+#endif  // WIN32
+
+// A subroutine of DoResolveRelativeURL, this resolves the URL knowing that
+// the input is a relative path or less (query or ref).
+//
+// Everything in |base_url| before the base path is copied to |output|
+// verbatim. The path, query and ref are then taken from the relative URL
+// where it supplies them and from the base otherwise, with one rule: once a
+// component comes from the relative URL, every later component does too.
+// Only the path, query and ref of |out_parsed| are written here.
+//
+// Returns false only if path canonicalization reports failure; query and ref
+// canonicalization never affect the result.
+template<typename CHAR>
+bool DoResolveRelativePath(const char* base_url,
+                           const url_parse::Parsed& base_parsed,
+                           bool base_is_file,
+                           const CHAR* relative_url,
+                           const url_parse::Component& relative_component,
+                           CharsetConverter* query_converter,
+                           CanonOutput* output,
+                           url_parse::Parsed* out_parsed) {
+  // |base_is_file| is only read inside the WIN32 block below; this keeps
+  // other builds free of unused-parameter warnings.
+  (void)base_is_file;
+  bool success = true;
+
+  // We know the authority section didn't change, copy it to the output. We
+  // also know we have a path so can copy up to there.
+  url_parse::Component path, query, ref;
+  url_parse::ParsePathInternal(relative_url,
+                               relative_component,
+                               &path,
+                               &query,
+                               &ref);
+  // Canonical URLs always have a path, so we can use that offset.
+  output->Append(base_url, base_parsed.path.begin);
+
+  if (path.len > 0) {
+    // The path is replaced or modified.
+    int true_path_begin = output->length();
+
+    // For file: URLs on Windows, we don't want to treat the drive letter and
+    // colon as part of the path for relative file resolution when the
+    // incoming URL does not provide a drive spec. We save the true path
+    // beginning so we can fix it up after we are done.
+    int base_path_begin = base_parsed.path.begin;
+#ifdef WIN32
+    if (base_is_file) {
+      base_path_begin = CopyBaseDriveSpecIfNecessary(
+          base_url, base_parsed.path.begin, base_parsed.path.end(),
+          relative_url, relative_component.begin, relative_component.end(),
+          output);
+      // Now the output looks like either "file://" or "file:///C:"
+      // and we can start appending the rest of the path. |base_path_begin|
+      // points to the character in the base that comes next.
+    }
+#endif  // WIN32
+
+    if (url_parse::IsURLSlash(relative_url[path.begin])) {
+      // Easy case: the path is an absolute path on the server, so we can
+      // just replace everything from the path on with the new versions.
+      // Since the input should be canonical hierarchical URL, we should
+      // always have a path.
+      success &= CanonicalizePath(relative_url, path,
+                                  output, &out_parsed->path);
+    } else {
+      // Relative path, replace the query, and reference. We take the
+      // original path with the file part stripped, and append the new path.
+      // The canonicalizer will take care of resolving ".." and "."
+      int path_begin = output->length();
+      CopyToLastSlash(base_url, base_path_begin, base_parsed.path.end(),
+                      output);
+      success &= CanonicalizePartialPath(relative_url, path, path_begin,
+                                         output);
+      out_parsed->path = url_parse::MakeRange(path_begin, output->length());
+    }
+
+    // Finish with the query and reference part, both taken from the relative
+    // URL (these can't fail).
+    CanonicalizeQuery(relative_url, query, query_converter,
+                      output, &out_parsed->query);
+    CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
+
+    // Fix the path beginning to add back the "C:" we may have written above.
+    out_parsed->path = url_parse::MakeRange(true_path_begin,
+                                            out_parsed->path.end());
+    return success;
+  }
+
+  // If we get here, the path is unchanged: copy to output.
+  CopyOneComponent(base_url, base_parsed.path, output, &out_parsed->path);
+
+  if (query.is_valid()) {
+    // Just the query specified, replace the query and reference (ignore
+    // failures for refs)
+    CanonicalizeQuery(relative_url, query, query_converter,
+                      output, &out_parsed->query);
+    CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
+    return success;
+  }
+
+  // If we get here, the query is unchanged: copy to output. Note that the
+  // range of the query parameter doesn't include the question mark, so we
+  // have to add it manually if there is a component.
+  if (base_parsed.query.is_valid())
+    output->push_back('?');
+  CopyOneComponent(base_url, base_parsed.query, output, &out_parsed->query);
+
+  if (ref.is_valid()) {
+    // Just the reference specified: replace it (ignoring failures).
+    CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
+    return success;
+  }
+
+  // We should always have something to do in this function, the caller checks
+  // that some component is being replaced.
+  DCHECK(false) << "Not reached";
+  return success;
+}
+
+// Resolves a relative URL that contains a host. Typically, these will
+// be of the form "//www.google.com/foo/bar?baz#ref" and the only thing which
+// should be kept from the original URL is the scheme.
+//
+// The relative part is parsed as "everything after a scheme", and each
+// resulting component is installed as a replacement on the base URL. The
+// return value is that of ReplaceStandardURL.
+template<typename CHAR>
+bool DoResolveRelativeHost(const char* base_url,
+                           const url_parse::Parsed& base_parsed,
+                           const CHAR* relative_url,
+                           const url_parse::Component& relative_component,
+                           CharsetConverter* query_converter,
+                           CanonOutput* output,
+                           url_parse::Parsed* out_parsed) {
+  // Parse the relative URL, just like we would for anything following a
+  // scheme.
+  //
+  // The parse runs over the whole |relative_url| buffer, bounded by the
+  // absolute begin/end offsets of |relative_component|. The parsed
+  // components must be offsets into |relative_url| because that is the
+  // buffer handed to the replacements below. Passing a pointer already
+  // advanced by |relative_component.begin| together with that same begin
+  // offset would apply the offset twice whenever the component does not
+  // start at index 0 (for example, when leading whitespace was trimmed).
+  url_parse::Parsed relative_parsed;  // Everything but the scheme is valid.
+  url_parse::ParseAfterScheme(relative_url, relative_component.end(),
+                              relative_component.begin, &relative_parsed);
+
+  // Now we can just use the replacement function to replace all the necessary
+  // parts of the old URL with the new one.
+  Replacements<CHAR> replacements;
+  replacements.SetUsername(relative_url, relative_parsed.username);
+  replacements.SetPassword(relative_url, relative_parsed.password);
+  replacements.SetHost(relative_url, relative_parsed.host);
+  replacements.SetPort(relative_url, relative_parsed.port);
+  replacements.SetPath(relative_url, relative_parsed.path);
+  replacements.SetQuery(relative_url, relative_parsed.query);
+  replacements.SetRef(relative_url, relative_parsed.ref);
+
+  return ReplaceStandardURL(base_url, base_parsed, replacements,
+                            query_converter, output, out_parsed);
+}
+
+// Resolves a relative URL that happens to be an absolute file path.  Examples
+// include: "//hostname/path", "/c:/foo", and "//hostname/c:/foo".
+//
+// The base URL plays no part here: the relative component is parsed and
+// canonicalized as a file URL in its own right. Returns the result of
+// CanonicalizeFileURL.
+template<typename CHAR>
+bool DoResolveAbsoluteFile(const CHAR* relative_url,
+                           const url_parse::Component& relative_component,
+                           CharsetConverter* query_converter,
+                           CanonOutput* output,
+                           url_parse::Parsed* out_parsed) {
+  // Both calls below work on the substring starting at the relative
+  // component, so name it once.
+  const CHAR* file_spec = &relative_url[relative_component.begin];
+  const int file_spec_len = relative_component.len;
+
+  // Parse the file URL. The file URL parsing function uses the same logic
+  // as we do for determining if the file is absolute, in which case it will
+  // not bother to look for a scheme.
+  url_parse::Parsed relative_parsed;
+  url_parse::ParseFileURL(file_spec, file_spec_len, &relative_parsed);
+
+  return CanonicalizeFileURL(file_spec, file_spec_len, relative_parsed,
+                             query_converter, output, out_parsed);
+}
+
+// Resolves |relative_component| of |relative_url| against |base_url|, writing
+// the resulting URL to |output| and its component layout to |out_parsed|.
+//
+// Dispatch order: a base without a path is an error (the base is copied to
+// the output and false is returned); an empty relative component yields the
+// base minus its ref; absolute file paths go to DoResolveAbsoluteFile; two or
+// more leading slashes go to DoResolveRelativeHost; everything else goes to
+// DoResolveRelativePath.
+//
+// TODO(brettw) treat two slashes as root like Mozilla for FTP?
+template<typename CHAR>
+bool DoResolveRelativeURL(const char* base_url,
+                          const url_parse::Parsed& base_parsed,
+                          bool base_is_file,
+                          const CHAR* relative_url,
+                          const url_parse::Component& relative_component,
+                          CharsetConverter* query_converter,
+                          CanonOutput* output,
+                          url_parse::Parsed* out_parsed) {
+  // Starting point for our output parsed. We'll fix what we change.
+  *out_parsed = base_parsed;
+
+  // Sanity check: the input should have a host or we'll break badly below.
+  // We can only resolve relative URLs with base URLs that have hosts and
+  // paths (even the default path of "/" is OK).
+  //
+  // We allow hosts with no length so we can handle file URLs, for example.
+  if (base_parsed.path.len <= 0) {
+    // On error, return the input (resolving a relative URL on a non-relative
+    // base = the base).
+    int base_len = base_parsed.Length();
+    for (int i = 0; i < base_len; i++)
+      output->push_back(base_url[i]);
+    return false;
+  }
+
+  if (relative_component.len <= 0) {
+    // Empty relative URL, leave unchanged, only removing the ref component.
+    int base_len = base_parsed.Length();
+    // Drop the ref text plus one more character for its separator. When the
+    // base has no ref, this presumably subtracts zero (an absent component
+    // appears to have len -1) -- TODO confirm against url_parse::Component.
+    base_len -= base_parsed.ref.len + 1;
+    out_parsed->ref.reset();
+    output->Append(base_url, base_len);
+    return true;
+  }
+
+  int num_slashes = url_parse::CountConsecutiveSlashes(
+      relative_url, relative_component.begin, relative_component.end());
+
+#ifdef WIN32
+  // On Windows, two slashes for a file path (regardless of which direction
+  // they are) means that it's UNC. Two backslashes on any base scheme mean
+  // that it's an absolute UNC path (we use the base_is_file flag to control
+  // how strict the UNC finder is).
+  //
+  // We also allow Windows absolute drive specs on any scheme (for example
+  // "c:\foo") like IE does. There must be no preceding slashes in this
+  // case (we reject anything like "/c:/foo") because that should be treated
+  // as a path. For file URLs, we allow any number of slashes since that would
+  // be setting the path.
+  //
+  // This assumes the absolute path resolver handles absolute URLs like this
+  // properly. url_util::DoCanonicalize does this.
+  int after_slashes = relative_component.begin + num_slashes;
+  if (url_parse::DoesBeginUNCPath(relative_url, relative_component.begin,
+                                  relative_component.end(), !base_is_file) ||
+      ((num_slashes == 0 || base_is_file) &&
+       url_parse::DoesBeginWindowsDriveSpec(relative_url, after_slashes,
+                                            relative_component.end()))) {
+    return DoResolveAbsoluteFile(relative_url, relative_component,
+                                 query_converter, output, out_parsed);
+  }
+#else
+  // Other platforms need explicit handling for file: URLs with multiple
+  // slashes because the generic scheme parsing always extracts a host, but a
+  // file: URL only has a host if it has exactly 2 slashes. This also
+  // handles the special case where the URL is only slashes, since that
+  // doesn't have a host part either.
+  if (base_is_file &&
+      (num_slashes > 2 || num_slashes == relative_component.len)) {
+    return DoResolveAbsoluteFile(relative_url, relative_component,
+                                 query_converter, output, out_parsed);
+  }
+#endif
+
+  // Any other double-slashes mean that this is relative to the scheme.
+  if (num_slashes >= 2) {
+    return DoResolveRelativeHost(base_url, base_parsed,
+                                 relative_url, relative_component,
+                                 query_converter, output, out_parsed);
+  }
+
+  // When we get here, we know that the relative URL is on the same host.
+  return DoResolveRelativePath(base_url, base_parsed, base_is_file,
+                               relative_url, relative_component,
+                               query_converter, output, out_parsed);
+}
+
+}  // namespace
+
+// 8-bit overload; see IsRelativeURL in the header file for usage. Forwards
+// every argument unchanged to the shared template implementation.
+bool IsRelativeURL(const char* base,
+                   const url_parse::Parsed& base_parsed,
+                   const char* fragment,
+                   int fragment_len,
+                   bool is_base_hierarchical,
+                   bool* is_relative,
+                   url_parse::Component* relative_component) {
+  const bool ok = DoIsRelativeURL<char>(base, base_parsed,
+                                        fragment, fragment_len,
+                                        is_base_hierarchical,
+                                        is_relative, relative_component);
+  return ok;
+}
+
+// 16-bit overload; see IsRelativeURL in the header file for usage. The base
+// is still 8-bit; only the candidate |fragment| is UTF-16. Forwards every
+// argument unchanged to the shared template implementation.
+bool IsRelativeURL(const char* base,
+                   const url_parse::Parsed& base_parsed,
+                   const char16* fragment,
+                   int fragment_len,
+                   bool is_base_hierarchical,
+                   bool* is_relative,
+                   url_parse::Component* relative_component) {
+  const bool ok = DoIsRelativeURL<char16>(base, base_parsed,
+                                          fragment, fragment_len,
+                                          is_base_hierarchical,
+                                          is_relative, relative_component);
+  return ok;
+}
+
+// 8-bit overload of relative URL resolution. Forwards every argument
+// unchanged to the shared template implementation and returns its result.
+bool ResolveRelativeURL(const char* base_url,
+                        const url_parse::Parsed& base_parsed,
+                        bool base_is_file,
+                        const char* relative_url,
+                        const url_parse::Component& relative_component,
+                        CharsetConverter* query_converter,
+                        CanonOutput* output,
+                        url_parse::Parsed* out_parsed) {
+  const bool resolved = DoResolveRelativeURL<char>(
+      base_url, base_parsed, base_is_file,
+      relative_url, relative_component,
+      query_converter, output, out_parsed);
+  return resolved;
+}
+
+// 16-bit overload of relative URL resolution. The base is still 8-bit; only
+// |relative_url| is UTF-16. Forwards every argument unchanged to the shared
+// template implementation and returns its result.
+bool ResolveRelativeURL(const char* base_url,
+                        const url_parse::Parsed& base_parsed,
+                        bool base_is_file,
+                        const char16* relative_url,
+                        const url_parse::Component& relative_component,
+                        CharsetConverter* query_converter,
+                        CanonOutput* output,
+                        url_parse::Parsed* out_parsed) {
+  const bool resolved = DoResolveRelativeURL<char16>(
+      base_url, base_parsed, base_is_file,
+      relative_url, relative_component,
+      query_converter, output, out_parsed);
+  return resolved;
+}
+
+}  // namespace url_canon
diff --git a/googleurl/url_canon_stdurl.cpp b/googleurl/url_canon_stdurl.cpp
new file mode 100644
index 0000000..caa8d02
--- /dev/null
+++ b/googleurl/url_canon_stdurl.cpp
@@ -0,0 +1,213 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Functions to canonicalize "standard" URLs, which are ones that have an
+// authority section including a host name.
+
+#include "url_canon.h"
+#include "url_canon_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Canonicalizes a "standard" URL (one with an authority section) component by
+// component, appending to |output| in URL order: scheme, authority, path,
+// query, ref. |source| supplies the string for each component and |parsed|
+// its range within that string; the layout of the result is written to
+// |new_parsed|.
+//
+// Returns false if the scheme, user info, host, port or path canonicalizers
+// report failure, if the host is empty, or if there is no authority at all.
+// Query and ref canonicalization never affect the return value. Output is
+// still produced on failure.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeStandardURL(const URLComponentSource<CHAR>& source,
+                               const url_parse::Parsed& parsed,
+                               CharsetConverter* query_converter,
+                               CanonOutput* output,
+                               url_parse::Parsed* new_parsed) {
+  // Scheme: this will append the colon.
+  bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
+                                    output, &new_parsed->scheme);
+
+  // Authority (username, password, host, port)
+  bool have_authority;
+  if (parsed.username.is_valid() || parsed.password.is_valid() ||
+      parsed.host.is_nonempty() || parsed.port.is_valid()) {
+    have_authority = true;
+
+    // Only write the authority separators when we have a scheme.
+    if (parsed.scheme.is_valid()) {
+      output->push_back('/');
+      output->push_back('/');
+    }
+
+    // User info: the canonicalizer will handle the : and @.
+    success &= CanonicalizeUserInfo(source.username, parsed.username,
+                                    source.password, parsed.password,
+                                    output,
+                                    &new_parsed->username,
+                                    &new_parsed->password);
+
+    success &= CanonicalizeHost(source.host, parsed.host,
+                                output, &new_parsed->host);
+
+    // Host must not be empty for standard URLs.
+    if (!parsed.host.is_nonempty())
+      success = false;
+
+    // Port: the port canonicalizer will handle the colon. The default port is
+    // looked up from the scheme text already written to |output|, not from
+    // the scheme in the input.
+    int default_port = DefaultPortForScheme(
+        &output->data()[new_parsed->scheme.begin], new_parsed->scheme.len);
+    success &= CanonicalizePort(source.port, parsed.port, default_port,
+                                output, &new_parsed->port);
+  } else {
+    // No authority, clear the components.
+    have_authority = false;
+    new_parsed->host.reset();
+    new_parsed->username.reset();
+    new_parsed->password.reset();
+    new_parsed->port.reset();
+    success = false;  // Standard URLs must have an authority.
+  }
+
+  // Path
+  if (parsed.path.is_valid()) {
+    success &= CanonicalizePath(source.path, parsed.path,
+                                output, &new_parsed->path);
+  } else if (have_authority ||
+             parsed.query.is_valid() || parsed.ref.is_valid()) {
+    // When we have an empty path, make up a path when we have an authority
+    // or something following the path. The only time we allow an empty
+    // output path is when there is nothing else.
+    new_parsed->path = url_parse::Component(output->length(), 1);
+    output->push_back('/');
+  } else {
+    // No path at all
+    new_parsed->path.reset();
+  }
+
+  // Query: the return value is not consulted, so this cannot fail the URL.
+  CanonicalizeQuery(source.query, parsed.query, query_converter,
+                    output, &new_parsed->query);
+
+  // Ref: ignore failure for this, since the page can probably still be loaded.
+  CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
+
+  return success;
+}
+
+}  // namespace
+
+
+// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
+// if the scheme is unknown.
+//
+// |scheme| need not be NUL-terminated; only |scheme_len| characters are
+// examined. A scheme matches only if its length equals the known name's
+// length and the characters compare equal, so the comparison is
+// case-sensitive.
+int DefaultPortForScheme(const char* scheme, int scheme_len) {
+  struct KnownScheme {
+    const char* name;
+    int name_len;
+    int port;
+  };
+  static const KnownScheme kKnownSchemes[] = {
+    { "http",   4, 80 },
+    { "https",  5, 443 },
+    { "ftp",    3, 21 },
+    { "wss",    3, 443 },
+    { "gopher", 6, 70 },
+    { "ws",     2, 80 },
+  };
+  const int kNumKnownSchemes =
+      static_cast<int>(sizeof(kKnownSchemes) / sizeof(kKnownSchemes[0]));
+
+  for (int i = 0; i < kNumKnownSchemes; i++) {
+    const KnownScheme& known = kKnownSchemes[i];
+    // The length check comes first so strncmp never reads past the literal.
+    if (scheme_len == known.name_len &&
+        !strncmp(scheme, known.name, scheme_len))
+      return known.port;
+  }
+  return url_parse::PORT_UNSPECIFIED;
+}
+
+// 8-bit entry point for canonicalizing a standard URL. Every component is
+// read from |spec| using the ranges in |parsed|. |spec_len| is accepted for
+// interface compatibility but is not used.
+bool CanonicalizeStandardURL(const char* spec,
+                             int spec_len,
+                             const url_parse::Parsed& parsed,
+                             CharsetConverter* query_converter,
+                             CanonOutput* output,
+                             url_parse::Parsed* new_parsed) {
+  (void)spec_len;
+
+  // All components come from the same input string.
+  const URLComponentSource<char> source(spec);
+  return DoCanonicalizeStandardURL<char, unsigned char>(
+      source, parsed, query_converter, output, new_parsed);
+}
+
+// 16-bit entry point for canonicalizing a standard URL. Every component is
+// read from |spec| using the ranges in |parsed|. |spec_len| is accepted for
+// interface compatibility but is not used.
+bool CanonicalizeStandardURL(const char16* spec,
+                             int spec_len,
+                             const url_parse::Parsed& parsed,
+                             CharsetConverter* query_converter,
+                             CanonOutput* output,
+                             url_parse::Parsed* new_parsed) {
+  (void)spec_len;
+
+  // All components come from the same input string.
+  const URLComponentSource<char16> source(spec);
+  return DoCanonicalizeStandardURL<char16, char16>(
+      source, parsed, query_converter, output, new_parsed);
+}
+
+// Applies |replacements| to the standard URL |base| and canonicalizes the
+// result from scratch into |output|, reporting its layout in |new_parsed|.
+//
+// It might be nice in the future to optimize this so unchanged components don't
+// need to be recanonicalized. This is especially true since the common case for
+// ReplaceComponents is removing things we don't want, like reference fragments
+// and usernames. These cases can become more efficient if we can assume the
+// rest of the URL is OK with these removed (or only the modified parts
+// recanonicalized). This would be much more complex to implement, however.
+//
+// You would also need to update DoReplaceComponents in url_util.cc which
+// relies on this re-checking everything (see the comment there for why).
+bool ReplaceStandardURL(const char* base,
+                        const url_parse::Parsed& base_parsed,
+                        const Replacements<char>& replacements,
+                        CharsetConverter* query_converter,
+                        CanonOutput* output,
+                        url_parse::Parsed* new_parsed) {
+  // Start from the base for every component, then let the replacements
+  // override the per-component sources and ranges.
+  URLComponentSource<char> source(base);
+  url_parse::Parsed parsed = base_parsed;
+  SetupOverrideComponents(base, replacements, &source, &parsed);
+
+  const bool canonical_ok = DoCanonicalizeStandardURL<char, unsigned char>(
+      source, parsed, query_converter, output, new_parsed);
+  return canonical_ok;
+}
+
+// For 16-bit replacements, we turn all the replacements into UTF-8 so the
+// regular codepath can be used. |utf8| is the scratch buffer handed to
+// SetupUTF16OverrideComponents for that conversion, so it must stay alive
+// until canonicalization has finished.
+bool ReplaceStandardURL(const char* base,
+                        const url_parse::Parsed& base_parsed,
+                        const Replacements<char16>& replacements,
+                        CharsetConverter* query_converter,
+                        CanonOutput* output,
+                        url_parse::Parsed* new_parsed) {
+  RawCanonOutput<1024> utf8;
+
+  // Start from the base for every component, then let the replacements
+  // override the per-component sources and ranges.
+  URLComponentSource<char> source(base);
+  url_parse::Parsed parsed = base_parsed;
+  SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
+
+  const bool canonical_ok = DoCanonicalizeStandardURL<char, unsigned char>(
+      source, parsed, query_converter, output, new_parsed);
+  return canonical_ok;
+}
+
+}  // namespace url_canon
diff --git a/googleurl/url_parse.cpp b/googleurl/url_parse.cpp
new file mode 100644
index 0000000..5f66d94
--- /dev/null
+++ b/googleurl/url_parse.cpp
@@ -0,0 +1,923 @@
+/* Based on nsURLParsers.cc from Mozilla
+ * -------------------------------------
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is mozilla.org code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 1998
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *   Darin Fisher (original author)
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include "url_parse.h"
+
+#include <stdlib.h>
+
+#include "base/logging.h"
+#include "url_parse_internal.h"
+#include "url_util.h"
+#include "url_util_internal.h"
+
+namespace url_parse {
+
+namespace {
+
+// A port may only be written with the ASCII decimal digits '0' through '9';
+// returns true when |ch| is one of them.
+inline bool IsPortDigit(char16 ch) {
+  return '0' <= ch && ch <= '9';
+}
+
+// Scans |spec| forward from |start_offset| and returns the index of the first
+// character for which IsAuthorityTerminator() is true. When there is no such
+// character before |spec_len|, |spec_len| itself is returned.
+template<typename CHAR>
+int FindNextAuthorityTerminator(const CHAR* spec,
+                                int start_offset,
+                                int spec_len) {
+  int pos = start_offset;
+  while (pos < spec_len && !IsAuthorityTerminator(spec[pos]))
+    ++pos;
+
+  // Leaving the loop with pos < spec_len means a terminator sits at |pos|.
+  // Otherwise report "not found" as exactly |spec_len|, even if the scan
+  // started beyond it.
+  return pos < spec_len ? pos : spec_len;
+}
+
+// Splits the user-info range |user| of |spec| at its first ':'. The part
+// before the colon becomes |*username| and the part after it |*password|.
+// Without a colon the whole range is the username and |*password| is set to a
+// default-constructed Component.
+template<typename CHAR>
+void ParseUserInfo(const CHAR* spec,
+                   const Component& user,
+                   Component* username,
+                   Component* password) {
+  // Offset of the first ':' relative to user.begin, or -1 when there is none.
+  int first_colon = -1;
+  for (int i = 0; i < user.len; i++) {
+    if (spec[user.begin + i] == ':') {
+      first_colon = i;
+      break;
+    }
+  }
+
+  if (first_colon < 0) {
+    // No separator: everything is the username.
+    *username = user;
+    *password = Component();
+    return;
+  }
+
+  // <username>:<password>
+  *username = Component(user.begin, first_colon);
+  *password = MakeRange(user.begin + first_colon + 1,
+                        user.begin + user.len);
+}
+
+// Splits the server-info range |serverinfo| of |spec| into |*hostname| and
+// |*port_num|. A ':' is only treated as the port separator when it comes
+// after the last ']' in the range; if the range starts with '[' and contains
+// no ']' at all, no colon is treated as a port separator.
+template<typename CHAR>
+void ParseServerInfo(const CHAR* spec,
+                     const Component& serverinfo,
+                     Component* hostname,
+                     Component* port_num) {
+  if (serverinfo.len == 0) {
+    // Nothing to split: neither a host name nor a port.
+    hostname->reset();
+    port_num->reset();
+    return;
+  }
+
+  const int info_end = serverinfo.end();
+
+  // A leading '[' makes us assume the whole host is an IPv6 literal until a
+  // ']' says otherwise; without it we assume none of the host is. The IPv6
+  // canonicalization code requires both brackets, but being able to locate
+  // an incomplete address can still be useful.
+  int last_bracket = -1;
+  if (spec[serverinfo.begin] == '[')
+    last_bracket = info_end;
+  int last_colon = -1;
+
+  // Remember the position of the last ']' and of the last ':'.
+  for (int pos = serverinfo.begin; pos < info_end; pos++) {
+    const CHAR ch = spec[pos];
+    if (ch == ']')
+      last_bracket = pos;
+    else if (ch == ':')
+      last_colon = pos;
+  }
+
+  if (last_colon <= last_bracket) {
+    // No port: <hostname>
+    *hostname = serverinfo;
+    port_num->reset();
+    return;
+  }
+
+  // Found a port: <hostname>:<port>. An empty host name is reported as
+  // reset rather than as a zero-length range.
+  *hostname = MakeRange(serverinfo.begin, last_colon);
+  if (hostname->len == 0)
+    hostname->reset();
+  *port_num = MakeRange(last_colon + 1, info_end);
+}
+
+// Given an already-identified authority section |auth| of |spec|, breaks it
+// into its constituent component ranges: |*username|, |*password|,
+// |*hostname| and |*port_num|. Only ranges are produced here; this function
+// does not convert the port to an integer.
+template<typename CHAR>
+void DoParseAuthority(const CHAR* spec,
+                      const Component& auth,
+                      Component* username,
+                      Component* password,
+                      Component* hostname,
+                      Component* port_num) {
+  DCHECK(auth.is_valid()) << "We should always get an authority";
+  if (auth.len == 0) {
+    username->reset();
+    password->reset();
+    hostname->reset();
+    port_num->reset();
+    return;
+  }
+
+  // Walk backwards from the last character looking for '@', the separator
+  // between the user info and the server info.
+  const int auth_end = auth.begin + auth.len;
+  int at_pos = auth_end - 1;
+  for (; at_pos > auth.begin; at_pos--) {
+    if (spec[at_pos] == '@')
+      break;
+  }
+
+  // The loop above stops at auth.begin without examining that character, so
+  // the test below also covers an '@' in the very first position.
+  if (spec[at_pos] != '@') {
+    // No user info, everything is server info.
+    username->reset();
+    password->reset();
+    ParseServerInfo(spec, auth, hostname, port_num);
+    return;
+  }
+
+  // <user-info>@<server-info>
+  ParseUserInfo(spec, Component(auth.begin, at_pos - auth.begin),
+                username, password);
+  ParseServerInfo(spec, MakeRange(at_pos + 1, auth_end),
+                  hostname, port_num);
+}
+
+// Splits the full path range |path| of |spec| into |*filepath|, |*query| and
+// |*ref|:
+//   path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<ref>
+// Components that are not present are reset.
+template<typename CHAR>
+void ParsePath(const CHAR* spec,
+               const Component& path,
+               Component* filepath,
+               Component* query,
+               Component* ref) {
+  // A length of -1 means there is no path at all.
+  if (path.len == -1) {
+    filepath->reset();
+    query->reset();
+    ref->reset();
+    return;
+  }
+  DCHECK(path.len > 0) << "We should never have 0 length paths";
+
+  const int path_end = path.begin + path.len;
+
+  // Locate the first '#', and the first '?' that occurs before it. Nothing
+  // after the first '#' can be a separator, so the scan stops there.
+  int question_pos = -1;  // Index of the '?'
+  int hash_pos = -1;      // Index of the '#'
+  for (int pos = path.begin; pos < path_end; pos++) {
+    if (spec[pos] == '#') {
+      hash_pos = pos;
+      break;
+    }
+    if (spec[pos] == '?' && question_pos < 0)
+      question_pos = pos;
+  }
+
+  // The code below works from the end back to the beginning.
+  //
+  // Ref fragment: from the '#' to the end of the path. Whatever precedes it
+  // ends at the '#', or at the end of the path when there is no ref.
+  const int before_ref = hash_pos >= 0 ? hash_pos : path_end;
+  if (hash_pos >= 0)
+    *ref = MakeRange(hash_pos + 1, path_end);
+  else
+    ref->reset();
+
+  // Query: everything from the '?' to the next boundary (the ref fragment or
+  // the end of the path).
+  int file_end = before_ref;
+  if (question_pos >= 0) {
+    file_end = question_pos;
+    *query = MakeRange(question_pos + 1, before_ref);
+  } else {
+    query->reset();
+  }
+
+  // File path: an empty file path is reported as no file path.
+  if (file_end == path.begin)
+    filepath->reset();
+  else
+    *filepath = MakeRange(path.begin, file_end);
+}
+
+// Finds the scheme of |url|. Leading characters for which ShouldTrimFromURL()
+// is true are skipped; everything from there up to (not including) the first
+// ':' is stored in |*scheme| and true is returned. Returns false, leaving
+// |*scheme| untouched, when nothing is left after skipping or when there is
+// no ':'.
+template<typename CHAR>
+bool DoExtractScheme(const CHAR* url,
+                     int url_len,
+                     Component* scheme) {
+  // Skip leading whitespace and control characters.
+  int scheme_begin = 0;
+  for (; scheme_begin < url_len; scheme_begin++) {
+    if (!ShouldTrimFromURL(url[scheme_begin]))
+      break;
+  }
+  if (scheme_begin == url_len)
+    return false;  // Input is empty or all whitespace.
+
+  // Advance to the first colon, if there is one.
+  int colon = scheme_begin;
+  while (colon < url_len && url[colon] != ':')
+    colon++;
+  if (colon >= url_len)
+    return false;  // No colon found: no scheme
+
+  *scheme = MakeRange(scheme_begin, colon);
+  return true;
+}
+
+// Fills in every member of |parsed| except the scheme.
+//
+// |spec| is the full spec being parsed and |spec_len| its length.
+// |after_scheme| is the index of the character immediately following the
+//   scheme (after the colon); parsing starts there.
+//
+// Compatibility data points, listing the "host", "path" that get extracted:
+// Input                IE6             Firefox                Us
+// -----                --------------  --------------         --------------
+// http://foo.com/      "foo.com", "/"  "foo.com", "/"         "foo.com", "/"
+// http:foo.com/        "foo.com", "/"  "foo.com", "/"         "foo.com", "/"
+// http:/foo.com/       fail(*)         "foo.com", "/"         "foo.com", "/"
+// http:\foo.com/       fail(*)         "\foo.com", "/"(fail)  "foo.com", "/"
+// http:////foo.com/    "foo.com", "/"  "foo.com", "/"         "foo.com", "/"
+//
+// (*) Interestingly, although IE fails to load these URLs, its history
+// canonicalizer handles them, meaning if you've been to the corresponding
+// "http://foo.com/" link, it will be colored.
+template <typename CHAR>
+void DoParseAfterScheme(const CHAR* spec,
+                        int spec_len,
+                        int after_scheme,
+                        Parsed* parsed) {
+  // However many slashes follow the scheme, they are all skipped and what
+  // comes next is treated as the authority.
+  const int slash_count =
+      CountConsecutiveSlashes(spec, after_scheme, spec_len);
+  const int auth_begin = after_scheme + slash_count;
+
+  // The authority (username, password, host and port) runs up to the next
+  // authority terminator, or to the end of the spec.
+  const int auth_end =
+      FindNextAuthorityTerminator(spec, auth_begin, spec_len);
+  const Component authority(auth_begin, auth_end - auth_begin);
+
+  // The full path (path, query and reference) is everything from the
+  // terminator to the end. It stays default-constructed when the authority
+  // already reaches the end of the spec.
+  Component full_path;
+  if (auth_end != spec_len)
+    full_path = Component(auth_end, spec_len - auth_end);
+
+  // Now parse those two sub-parts.
+  DoParseAuthority(spec, authority, &parsed->username, &parsed->password,
+                   &parsed->host, &parsed->port);
+  ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref);
+}
+
+// Main parsing function for standard URLs, i.e. URLs that have a scheme,
+// host, path, etc. Fills in |parsed| with ranges into |spec|.
+template<typename CHAR>
+void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+  DCHECK(spec_len >= 0);
+
+  // Strip leading & trailing spaces and control characters.
+  int trimmed_begin = 0;
+  TrimURL(spec, &trimmed_begin, &spec_len);
+
+  // Without a colon we say there is no scheme and parse everything from the
+  // trimmed beginning. We could also say that everything is the scheme; both
+  // would produce an invalid URL, but this way seems less wrong in more
+  // cases.
+  int after_scheme = trimmed_begin;
+  if (DoExtractScheme(spec, spec_len, &parsed->scheme))
+    after_scheme = parsed->scheme.end() + 1;  // Skip past the colon.
+  else
+    parsed->scheme.reset();
+
+  DoParseAfterScheme(spec, spec_len, after_scheme, parsed);
+}
+
+// Parses a URL of the shape <outer-scheme>:<inner-url>, where the inner URL
+// is itself a file: URL or a standard URL. All ranges are offsets into
+// |spec|. On a complete parse:
+//   - parsed->scheme is the outer scheme;
+//   - parsed->inner_parsed() holds the inner URL's scheme and authority plus
+//     the first segment of its path (leading slash up to, but excluding, the
+//     second slash);
+//   - parsed->path is the remainder of the inner path, and parsed->query and
+//     parsed->ref are taken over from the inner URL.
+// On malformed input the function returns early and the components not yet
+// filled in stay reset.
+template<typename CHAR>
+void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+  DCHECK(spec_len >= 0);
+
+  // Get the unused parts of the URL out of the way.
+  parsed->username.reset();
+  parsed->password.reset();
+  parsed->host.reset();
+  parsed->port.reset();
+  parsed->path.reset();   // May use this; reset for convenience.
+  parsed->ref.reset();    // May use this; reset for convenience.
+  parsed->query.reset();  // May use this; reset for convenience.
+  parsed->clear_inner_parsed();  // May use this; reset for convenience.
+
+  // Strip leading & trailing spaces and control characters.
+  int begin = 0;
+  TrimURL(spec, &begin, &spec_len);
+
+  // Handle empty specs or ones that contain only whitespace or control chars.
+  if (begin == spec_len) {
+    parsed->scheme.reset();
+    return;
+  }
+
+  int inner_start = -1;
+
+  // Extract the scheme.  We also handle the case where there is no scheme.
+  if (DoExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
+    // Offset the results since we gave ExtractScheme a substring.
+    parsed->scheme.begin += begin;
+
+    // Nothing follows the outer scheme's colon: there is no inner URL.
+    if (parsed->scheme.end() == spec_len - 1)
+      return;
+
+    inner_start = parsed->scheme.end() + 1;
+  } else {
+    // No scheme found; that's not valid for filesystem URLs.
+    parsed->scheme.reset();
+    return;
+  }
+
+  url_parse::Component inner_scheme;
+  const CHAR* inner_spec = &spec[inner_start];
+  int inner_spec_len = spec_len - inner_start;
+
+  if (DoExtractScheme(inner_spec, inner_spec_len, &inner_scheme)) {
+    // Offset the results since we gave ExtractScheme a substring.
+    inner_scheme.begin += inner_start;
+
+    // Nothing follows the inner scheme's colon either.
+    if (inner_scheme.end() == spec_len - 1)
+      return;
+  } else {
+    // No scheme found; that's not valid for filesystem URLs.
+    // The best we can do is return "filesystem://".
+    return;
+  }
+
+  Parsed inner_parsed;
+
+  // |inner_scheme| has already been offset, so it is compared against the
+  // full |spec|; the inner parsers, however, get the inner substring.
+  if (url_util::CompareSchemeComponent(
+      spec, inner_scheme, url_util::kFileScheme)) {
+    // File URLs are special.
+    ParseFileURL(inner_spec, inner_spec_len, &inner_parsed);
+  } else if (url_util::CompareSchemeComponent(spec, inner_scheme,
+      url_util::kFileSystemScheme)) {
+    // Filesystem URLs don't nest.
+    return;
+  } else if (url_util::IsStandard(spec, inner_scheme)) {
+    // All "normal" URLs.
+    DoParseStandardURL(inner_spec, inner_spec_len, &inner_parsed);
+  } else {
+    return;
+  }
+
+  // All members of inner_parsed need to be offset by inner_start.
+  // If we had any scheme that supported nesting more than one level deep,
+  // we'd have to recurse into the inner_parsed's inner_parsed when
+  // adjusting by inner_start.
+  inner_parsed.scheme.begin += inner_start;
+  inner_parsed.username.begin += inner_start;
+  inner_parsed.password.begin += inner_start;
+  inner_parsed.host.begin += inner_start;
+  inner_parsed.port.begin += inner_start;
+  inner_parsed.query.begin += inner_start;
+  inner_parsed.ref.begin += inner_start;
+  inner_parsed.path.begin += inner_start;
+
+  // Query and ref move from inner_parsed to parsed.
+  parsed->query = inner_parsed.query;
+  inner_parsed.query.reset();
+  parsed->ref = inner_parsed.ref;
+  inner_parsed.ref.reset();
+
+  parsed->set_inner_parsed(inner_parsed);
+  if (!inner_parsed.scheme.is_valid() || !inner_parsed.path.is_valid() ||
+      inner_parsed.inner_parsed()) {
+    return;
+  }
+
+  // The path in inner_parsed should start with a slash, then have a filesystem
+  // type followed by a slash.  From the first slash up to but excluding the
+  // second should be what it keeps; the rest goes to parsed.  If the path ends
+  // before the second slash, it's still pretty clear what the user meant, so
+  // we'll let that through.
+  if (!IsURLSlash(spec[inner_parsed.path.begin])) {
+    return;
+  }
+
+  // Look for the second slash inside the inner path only. Scanning on to the
+  // end of the spec would run into the query or ref when the path has no
+  // second slash (e.g. "...//host/type?a/b"), which would make
+  // new_inner_path_length exceed the path length and leave parsed->path with
+  // a negative length.
+  const int inner_path_limit = inner_parsed.path.end();
+  int inner_path_end = inner_parsed.path.begin + 1;  // skip the leading slash
+  while (inner_path_end < inner_path_limit &&
+      !IsURLSlash(spec[inner_path_end]))
+    ++inner_path_end;
+  parsed->path.begin = inner_path_end;
+  int new_inner_path_length = inner_path_end - inner_parsed.path.begin;
+  parsed->path.len = inner_parsed.path.len - new_inner_path_length;
+  parsed->inner_parsed()->path.len = new_inner_path_length;
+}
+
+// Parses a "path URL": merely a scheme followed by a path. Examples include
+// "about:foo" and "javascript:alert('bar');". Only |parsed->scheme| and
+// |parsed->path| can end up set; every other component is reset.
+template<typename CHAR>
+void DoParsePathURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+  // The non-path and non-scheme parts are never used for these URLs.
+  parsed->username.reset();
+  parsed->password.reset();
+  parsed->host.reset();
+  parsed->port.reset();
+  parsed->query.reset();
+  parsed->ref.reset();
+
+  // Strip leading & trailing spaces and control characters.
+  int trimmed_begin = 0;
+  TrimURL(spec, &trimmed_begin, &spec_len);
+
+  // Empty specs, or ones holding only whitespace or control characters.
+  if (trimmed_begin == spec_len) {
+    parsed->scheme.reset();
+    parsed->path.reset();
+    return;
+  }
+
+  if (!ExtractScheme(&spec[trimmed_begin], spec_len - trimmed_begin,
+                     &parsed->scheme)) {
+    // No scheme found, just path.
+    parsed->scheme.reset();
+    parsed->path = MakeRange(trimmed_begin, spec_len);
+    return;
+  }
+
+  // ExtractScheme was given a substring, so shift its result back.
+  parsed->scheme.begin += trimmed_begin;
+
+  // The path is everything after the colon. For compatibility with the
+  // standard URL parser, "no path" is reported as reset (-1) rather than as
+  // a zero-length range (we normally wouldn't care so much for these
+  // non-standard URLs).
+  const int after_colon = parsed->scheme.end() + 1;
+  if (after_colon == spec_len)
+    parsed->path.reset();
+  else
+    parsed->path = MakeRange(after_colon, spec_len);
+}
+
+// Parses a mailto-style URL: a scheme followed by a path and an optional
+// "?query". Only |parsed->scheme|, |parsed->path| and |parsed->query| can end
+// up set; every other component is reset.
+template<typename CHAR>
+void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+  DCHECK(spec_len >= 0);
+
+  // The non-path and non-scheme parts are never used for these URLs.
+  parsed->username.reset();
+  parsed->password.reset();
+  parsed->host.reset();
+  parsed->port.reset();
+  parsed->ref.reset();
+  parsed->query.reset();  // May use this; reset for convenience.
+
+  // Strip leading & trailing spaces and control characters.
+  int trimmed_begin = 0;
+  TrimURL(spec, &trimmed_begin, &spec_len);
+
+  // Empty specs, or ones holding only whitespace or control characters.
+  if (trimmed_begin == spec_len) {
+    parsed->scheme.reset();
+    parsed->path.reset();
+    return;
+  }
+
+  // Without a scheme the whole trimmed input is path + query. With one, the
+  // path starts right after the colon; if nothing follows the colon the
+  // range [path_begin, path_end) is simply empty.
+  int path_begin = trimmed_begin;
+  int path_end = spec_len;
+  if (ExtractScheme(&spec[trimmed_begin], spec_len - trimmed_begin,
+                    &parsed->scheme)) {
+    // ExtractScheme was given a substring, so shift its result back.
+    parsed->scheme.begin += trimmed_begin;
+    path_begin = parsed->scheme.end() + 1;
+  } else {
+    parsed->scheme.reset();
+  }
+
+  // Split [path_begin, path_end) into a path + query at the first '?'.
+  int question = path_begin;
+  while (question < path_end && spec[question] != '?')
+    ++question;
+  if (question < path_end) {
+    parsed->query = MakeRange(question + 1, path_end);
+    path_end = question;
+  }
+
+  // For compatibility with the standard URL parser, treat no path as
+  // -1, rather than having a length of 0
+  if (path_begin == path_end)
+    parsed->path.reset();
+  else
+    parsed->path = MakeRange(path_begin, path_end);
+}
+
+// Converts the port text in range |component| of |spec| to an integer. The
+// input is not NULL-terminated, so the digits are validated and accumulated
+// by hand. Returns PORT_UNSPECIFIED when the component is not non-empty,
+// PORT_INVALID when it has a non-digit, more than five significant digits or
+// a value above 65535, and the port number otherwise.
+template<typename CHAR>
+int DoParsePort(const CHAR* spec, const Component& component) {
+  // Easy success case when there is no port.
+  if (!component.is_nonempty())
+    return PORT_UNSPECIFIED;
+
+  const int kMaxDigits = 5;
+  const int port_end = component.end();
+
+  // Skip over any leading 0s.
+  int first_significant = component.begin;
+  while (first_significant < port_end && spec[first_significant] == '0')
+    first_significant++;
+  if (first_significant == port_end)
+    return 0;  // All digits were 0.
+
+  // Too many digits can never be a valid port. Checking this up front also
+  // keeps the accumulation below well inside the range of int.
+  if (port_end - first_significant > kMaxDigits)
+    return PORT_INVALID;
+
+  int port = 0;
+  for (int pos = first_significant; pos < port_end; pos++) {
+    const CHAR ch = spec[pos];
+    if (!IsPortDigit(ch))
+      return PORT_INVALID;  // Invalid port digit, fail.
+    port = port * 10 + static_cast<int>(ch - '0');
+  }
+
+  return port > 65535 ? PORT_INVALID : port;  // Reject out-of-range values.
+}
+
+// Stores in |*file_name| the file-name part of |path| (a range of |spec|):
+// the text after the last slash and before the last ';' parameter. The
+// result is reset when the path is not non-empty.
+template<typename CHAR>
+void DoExtractFileName(const CHAR* spec,
+                       const Component& path,
+                       Component* file_name) {
+  // Handle empty paths: they have no file names.
+  if (!path.is_nonempty()) {
+    file_name->reset();
+    return;
+  }
+
+  // A parameter is a normally unused URL field delimited by a semicolon. It
+  // is parsed as part of the path, but must not count towards the file name,
+  // so the name ends at the last ';'. The path should start with a slash,
+  // which is why the first character is never examined here.
+  const int path_end = path.end();
+  int name_end = path_end;
+  int scan = path_end - 1;
+  while (scan > path.begin && spec[scan] != ';')
+    scan--;
+  if (scan > path.begin)
+    name_end = scan;
+
+  // Move backwards from the end of the name until the character in front is
+  // a slash. Reaching the start of the path without one means the input was
+  // degenerate (paths generally start with a slash); everything is then
+  // called the file name.
+  int name_begin = name_end;
+  while (name_begin > path.begin && !IsURLSlash(spec[name_begin - 1]))
+    name_begin--;
+
+  *file_name = MakeRange(name_begin, name_end);
+}
+
+// Extracts the first "key=value" pair from the range |*query| of |spec|.
+// On success |*key| and |*value| receive the two ranges (either may be
+// empty), |*query| is advanced past the pair and its trailing '&', and true
+// is returned. Returns false, changing nothing, when |*query| is not
+// non-empty.
+template<typename CHAR>
+bool DoExtractQueryKeyValue(const CHAR* spec,
+                            Component* query,
+                            Component* key,
+                            Component* value) {
+  if (!query->is_nonempty())
+    return false;
+
+  const int query_end = query->end();
+  int pos = query->begin;
+
+  // The key starts at the beginning of the input and runs up to the first
+  // '&' or '='.
+  const int key_begin = pos;
+  for (; pos < query_end; pos++) {
+    if (spec[pos] == '&' || spec[pos] == '=')
+      break;
+  }
+  *key = Component(key_begin, pos - key_begin);
+
+  // Skip the separator after the key (if any).
+  if (pos < query_end && spec[pos] == '=')
+    pos++;
+
+  // The value runs up to the next '&'.
+  const int value_begin = pos;
+  for (; pos < query_end; pos++) {
+    if (spec[pos] == '&')
+      break;
+  }
+  *value = Component(value_begin, pos - value_begin);
+
+  // Finally skip the next separator if any
+  if (pos < query_end && spec[pos] == '&')
+    pos++;
+
+  // What is left over becomes the new query.
+  *query = url_parse::MakeRange(pos, query_end);
+  return true;
+}
+
+}  // namespace
+
+// Default constructor: starts out with no inner Parsed attached
+// (inner_parsed_ is NULL).
+Parsed::Parsed() : inner_parsed_(NULL) {
+}
+
+// Copy constructor: copies every component range. inner_parsed_ starts out
+// NULL and, when |other| has an inner Parsed, it is installed through
+// set_inner_parsed() rather than by copying the pointer.
+Parsed::Parsed(const Parsed& other)
+    : scheme(other.scheme),
+      username(other.username),
+      password(other.password),
+      host(other.host),
+      port(other.port),
+      path(other.path),
+      query(other.query),
+      ref(other.ref),
+      inner_parsed_(NULL) {
+  if (other.inner_parsed_ != NULL)
+    set_inner_parsed(*other.inner_parsed_);
+}
+
+// Copy assignment: copies every component range and mirrors the presence or
+// absence of |other|'s inner Parsed. Self-assignment is a no-op.
+Parsed& Parsed::operator=(const Parsed& other) {
+  if (this == &other)
+    return *this;
+
+  scheme = other.scheme;
+  username = other.username;
+  password = other.password;
+  host = other.host;
+  port = other.port;
+  path = other.path;
+  query = other.query;
+  ref = other.ref;
+
+  if (other.inner_parsed_ == NULL)
+    clear_inner_parsed();
+  else
+    set_inner_parsed(*other.inner_parsed_);
+  return *this;
+}
+
+// Destructor: deletes the inner Parsed pointed to by inner_parsed_, if any
+// (deleting a NULL pointer is a no-op).
+Parsed::~Parsed() {
+  delete inner_parsed_;
+}
+
+// Returns the end offset of the ref when there is one; otherwise returns
+// CountCharactersBefore(REF, false).
+int Parsed::Length() const {
+  return ref.is_valid() ? ref.end() : CountCharactersBefore(REF, false);
+}
+
+// Returns the offset at which the component |type| begins, or at which it
+// would begin if it is absent. For PORT, QUERY and REF, |include_delimiter|
+// selects whether the returned offset is that of the one-character delimiter
+// in front of the component (true) or of the component itself (false).
+int Parsed::CountCharactersBefore(ComponentType type,
+                                  bool include_delimiter) const {
+  if (type == SCHEME)
+    return scheme.begin;
+
+  // There will be some characters after the scheme like "://" and we don't
+  // know how many, so walk forward through the components until one at or
+  // after |type| is present. |cur| tracks the end of the last one passed.
+  int cur = scheme.is_valid() ? scheme.end() + 1 : 0;  // +1: the ':'.
+
+  if (username.is_valid()) {
+    if (type <= USERNAME)
+      return username.begin;
+    cur = username.end() + 1;  // Advance over the '@' or ':' at the end.
+  }
+
+  if (password.is_valid()) {
+    if (type <= PASSWORD)
+      return password.begin;
+    cur = password.end() + 1;  // Advance over the '@' at the end.
+  }
+
+  if (host.is_valid()) {
+    if (type <= HOST)
+      return host.begin;
+    cur = host.end();
+  }
+
+  if (port.is_valid()) {
+    if (type <= PORT) {
+      // Only a request for the port itself, without its delimiter, lands on
+      // port.begin; everything else backs up over the delimiter.
+      const bool exclude_delimiter = (type == PORT && !include_delimiter);
+      return exclude_delimiter ? port.begin : port.begin - 1;
+    }
+    cur = port.end();
+  }
+
+  if (path.is_valid()) {
+    if (type <= PATH)
+      return path.begin;
+    cur = path.end();
+  }
+
+  if (query.is_valid()) {
+    if (type <= QUERY) {
+      // Same delimiter rule as for the port.
+      const bool exclude_delimiter = (type == QUERY && !include_delimiter);
+      return exclude_delimiter ? query.begin : query.begin - 1;
+    }
+    cur = query.end();
+  }
+
+  if (!ref.is_valid())
+    return cur;
+
+  // When there is a ref and we get here, the component we wanted was either
+  // the ref itself or something before it that was not found, so the
+  // beginning of the ref is always the right place.
+  if (type == REF && !include_delimiter)
+    return ref.begin;  // Don't want delimiter counted.
+  return ref.begin - 1;  // Back over delimiter.
+}
+
+// char overload: forwards to the DoExtractScheme template and returns its
+// result.
+bool ExtractScheme(const char* url, int url_len, Component* scheme) {
+  return DoExtractScheme(url, url_len, scheme);
+}
+
+// char16 overload: forwards to the DoExtractScheme template and returns its
+// result.
+bool ExtractScheme(const char16* url, int url_len, Component* scheme) {
+  return DoExtractScheme(url, url_len, scheme);
+}
+
+// Returns true for every character that may end an authority section: '?',
+// '#', or anything IsURLSlash() accepts (per the note that accompanied this
+// code, that includes backslash).
+bool IsAuthorityTerminator(char16 ch) {
+  if (ch == '?' || ch == '#')
+    return true;
+  return IsURLSlash(ch);
+}
+
+// char overload: forwards to the DoExtractFileName template, which writes
+// its result to |*file_name|.
+void ExtractFileName(const char* url,
+                     const Component& path,
+                     Component* file_name) {
+  DoExtractFileName(url, path, file_name);
+}
+
+// char16 overload: forwards to the DoExtractFileName template, which writes
+// its result to |*file_name|.
+void ExtractFileName(const char16* url,
+                     const Component& path,
+                     Component* file_name) {
+  DoExtractFileName(url, path, file_name);
+}
+
+// char overload: forwards to the DoExtractQueryKeyValue template and returns
+// its result. Note that |query| is an in/out parameter of that template.
+bool ExtractQueryKeyValue(const char* url,
+                          Component* query,
+                          Component* key,
+                          Component* value) {
+  return DoExtractQueryKeyValue(url, query, key, value);
+}
+
+// char16 overload: forwards to the DoExtractQueryKeyValue template and
+// returns its result. Note that |query| is an in/out parameter of that
+// template.
+bool ExtractQueryKeyValue(const char16* url,
+                          Component* query,
+                          Component* key,
+                          Component* value) {
+  return DoExtractQueryKeyValue(url, query, key, value);
+}
+
+// char overload: forwards to the DoParseAuthority template, which fills in
+// the four output components.
+void ParseAuthority(const char* spec,
+                    const Component& auth,
+                    Component* username,
+                    Component* password,
+                    Component* hostname,
+                    Component* port_num) {
+  DoParseAuthority(spec, auth, username, password, hostname, port_num);
+}
+
+// char16 overload: forwards to the DoParseAuthority template, which fills in
+// the four output components.
+void ParseAuthority(const char16* spec,
+                    const Component& auth,
+                    Component* username,
+                    Component* password,
+                    Component* hostname,
+                    Component* port_num) {
+  DoParseAuthority(spec, auth, username, password, hostname, port_num);
+}
+
+// char overload: forwards to the DoParsePort template and returns its
+// result (a port number, PORT_UNSPECIFIED or PORT_INVALID).
+int ParsePort(const char* url, const Component& port) {
+  return DoParsePort(url, port);
+}
+
+// char16 overload: forwards to the DoParsePort template and returns its
+// result (a port number, PORT_UNSPECIFIED or PORT_INVALID).
+int ParsePort(const char16* url, const Component& port) {
+  return DoParsePort(url, port);
+}
+
+// char overload: forwards to the DoParseStandardURL template, which fills in
+// |*parsed|.
+void ParseStandardURL(const char* url, int url_len, Parsed* parsed) {
+  DoParseStandardURL(url, url_len, parsed);
+}
+
+// char16 overload: forwards to the DoParseStandardURL template, which fills
+// in |*parsed|.
+void ParseStandardURL(const char16* url, int url_len, Parsed* parsed) {
+  DoParseStandardURL(url, url_len, parsed);
+}
+
+// char overload: forwards to the DoParsePathURL template, which fills in
+// |*parsed|.
+void ParsePathURL(const char* url, int url_len, Parsed* parsed) {
+  DoParsePathURL(url, url_len, parsed);
+}
+
+// char16 overload: forwards to the DoParsePathURL template, which fills in
+// |*parsed|.
+void ParsePathURL(const char16* url, int url_len, Parsed* parsed) {
+  DoParsePathURL(url, url_len, parsed);
+}
+
+// char overload: forwards to the DoParseFileSystemURL template, which fills
+// in |*parsed|.
+void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) {
+  DoParseFileSystemURL(url, url_len, parsed);
+}
+
+// char16 overload: forwards to the DoParseFileSystemURL template, which
+// fills in |*parsed|.
+void ParseFileSystemURL(const char16* url, int url_len, Parsed* parsed) {
+  DoParseFileSystemURL(url, url_len, parsed);
+}
+
+// char overload: forwards to the DoParseMailtoURL template, which fills in
+// |*parsed|.
+void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) {
+  DoParseMailtoURL(url, url_len, parsed);
+}
+
+// char16 overload: forwards to the DoParseMailtoURL template, which fills in
+// |*parsed|.
+void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed) {
+  DoParseMailtoURL(url, url_len, parsed);
+}
+
+// char overload: gives code outside this file's anonymous namespace access
+// to the ParsePath template by forwarding all arguments to it.
+void ParsePathInternal(const char* spec,
+                       const Component& path,
+                       Component* filepath,
+                       Component* query,
+                       Component* ref) {
+  ParsePath(spec, path, filepath, query, ref);
+}
+
+// char16 overload: gives code outside this file's anonymous namespace access
+// to the ParsePath template by forwarding all arguments to it.
+void ParsePathInternal(const char16* spec,
+                       const Component& path,
+                       Component* filepath,
+                       Component* query,
+                       Component* ref) {
+  ParsePath(spec, path, filepath, query, ref);
+}
+
+// char overload: forwards to the DoParseAfterScheme template, which fills in
+// every member of |*parsed| except the scheme.
+void ParseAfterScheme(const char* spec,
+                      int spec_len,
+                      int after_scheme,
+                      Parsed* parsed) {
+  DoParseAfterScheme(spec, spec_len, after_scheme, parsed);
+}
+
+// char16 overload: forwards to the DoParseAfterScheme template, which fills
+// in every member of |*parsed| except the scheme.
+void ParseAfterScheme(const char16* spec,
+                      int spec_len,
+                      int after_scheme,
+                      Parsed* parsed) {
+  DoParseAfterScheme(spec, spec_len, after_scheme, parsed);
+}
+
+}  // namespace url_parse
diff --git a/googleurl/url_parse_file.cpp b/googleurl/url_parse_file.cpp
new file mode 100644
index 0000000..02b8028
--- /dev/null
+++ b/googleurl/url_parse_file.cpp
@@ -0,0 +1,243 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "base/logging.h"
+#include "url_file.h"
+#include "url_parse.h"
+#include "url_parse_internal.h"
+
+// Interesting IE file:isms...
+//
+//  INPUT                      OUTPUT
+//  =========================  ==============================
+//  file:/foo/bar              file:///foo/bar
+//      The result here seems totally invalid!?!? This isn't UNC.
+//
+//  file:/
+//  file:// or any other number of slashes
+//      IE6 doesn't do anything at all if you click on this link. No error:
+//      nothing. IE6's history system seems to always color this link, so I'm
+//      guessing that it maps internally to the empty URL.
+//
+//  C:\                        file:///C:/
+//      When on a file: URL source page, this link will work. When over HTTP,
+//      the file: URL will appear in the status bar but the link will not work
+//      (security restriction for all file URLs).
+//
+//  file:foo/                  file:foo/     (invalid?!?!?)
+//  file:/foo/                 file:///foo/  (invalid?!?!?)
+//  file://foo/                file://foo/   (UNC to server "foo")
+//  file:///foo/               file:///foo/  (invalid, seems to be a file)
+//  file:////foo/              file://foo/   (UNC to server "foo")
+//      Any more than four slashes is also treated as UNC.
+//
+//  file:C:/                   file://C:/
+//  file:/C:/                  file://C:/
+//      The number of slashes after "file:" doesn't matter if the thing
+//      following it looks like an absolute drive path. Also, slashes and
+//      backslashes are equally valid here.
+
+namespace url_parse {
+
+namespace {
+
+// A subcomponent of DoParseFileURL. The input of this function should be a UNC
+// path name, with the index of the first character after the slashes following
+// the scheme given in |after_slashes|. This will initialize the host, path,
+// query, and ref, and leave the other output components untouched
+// (DoParseFileURL handles these for us).
+//
+//   spec          - the full URL text.
+//   after_slashes - index of the first character of the would-be host.
+//   spec_len      - length of |spec|.
+//   parsed        - output; only host/path/query/ref are written here.
+template<typename CHAR>
+void DoParseUNC(const CHAR* spec,
+                int after_slashes,
+                int spec_len,
+                Parsed* parsed) {
+  int next_slash = FindNextSlash(spec, after_slashes, spec_len);
+  if (next_slash == spec_len) {
+    // No additional slash found, as in "file://foo", treat the text as the
+    // host with no path (this will end up being UNC to server "foo").
+    int host_len = spec_len - after_slashes;
+    if (host_len)
+      parsed->host = Component(after_slashes, host_len);
+    else
+      parsed->host.reset();
+    parsed->path.reset();
+    return;
+  }
+
+#ifdef WIN32
+  // See if we have something that looks like a path following the first
+  // component. As in "file://localhost/c:/", we get "c:/" out. We want to
+  // treat this as a having no host but the path given. Works on Windows only.
+  if (DoesBeginWindowsDriveSpec(spec, next_slash + 1, spec_len)) {
+    parsed->host.reset();
+    ParsePathInternal(spec, MakeRange(next_slash, spec_len),
+                      &parsed->path, &parsed->query, &parsed->ref);
+    return;
+  }
+#endif
+
+  // Otherwise, everything up until that first slash we found is the host name,
+  // which will end up being the UNC host. For example "file://foo/bar.txt"
+  // will get a server name of "foo" and a path of "/bar.txt". Later, on
+  // Windows, this should be treated as the filename "\\foo\bar.txt" in proper
+  // UNC notation.
+  int host_len = next_slash - after_slashes;
+  if (host_len)
+    parsed->host = MakeRange(after_slashes, next_slash);
+  else
+    parsed->host.reset();
+
+  // The "no slash found" case (next_slash == spec_len) returned above, so a
+  // slash exists at |next_slash| and the path always starts there. This relies
+  // on FindNextSlash never returning an index beyond |spec_len|.
+  ParsePathInternal(spec, MakeRange(next_slash, spec_len),
+                    &parsed->path, &parsed->query, &parsed->ref);
+}
+
+// Helper for DoParseFileURL used when the input names a local file.
+// |path_begin| is the index in |spec| at which the path starts. The host is
+// cleared here, and the path, query, and ref outputs are handed to
+// ParsePathInternal; no other component of |parsed| is written.
+template<typename CHAR>
+void DoParseLocalFile(const CHAR* spec,
+                      int path_begin,
+                      int spec_len,
+                      Parsed* parsed) {
+  // Local files never carry a host.
+  parsed->host.reset();
+
+  // Everything from |path_begin| to the end of the spec is path material.
+  const Component full_path = MakeRange(path_begin, spec_len);
+  ParsePathInternal(spec, full_path,
+                    &parsed->path, &parsed->query, &parsed->ref);
+}
+
+// Backend for the external ParseFileURL functions; operates on either char
+// type. |spec| is the complete input: any scheme is located here (via
+// ExtractScheme), so the text need not start after "file:". What follows the
+// scheme is usually a slash, but needn't be; we allow paths like "file:c:\foo".
+//
+// Username, password, and port are always cleared. Query and ref are cleared
+// up front and are only rewritten on the code paths that parse a path. The
+// scheme is either extracted or reset, and host/path are set by this function
+// or by the DoParseUNC / DoParseLocalFile helpers.
+template<typename CHAR>
+void DoParseFileURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+  DCHECK(spec_len >= 0);
+
+  // Get the parts we never use for file URLs out of the way.
+  parsed->username.reset();
+  parsed->password.reset();
+  parsed->port.reset();
+
+  // Many of the code paths don't set these, so it's convenient to just clear
+  // them. We'll write them in those cases we need them.
+  parsed->query.reset();
+  parsed->ref.reset();
+
+  // Strip leading & trailing spaces and control characters. TrimURL adjusts
+  // |begin| and |spec_len| in place, so both are "trimmed" values from here on.
+  int begin = 0;
+  TrimURL(spec, &begin, &spec_len);
+
+  // Find the scheme.
+  int num_slashes;
+  int after_scheme;
+  int after_slashes;
+#ifdef WIN32
+  // See how many slashes there are. We want to handle cases like UNC but also
+  // "/c:/foo". This is when there is no scheme, so we can allow pages to do
+  // links like "c:/foo/bar" or "//foo/bar". This is also called by the
+  // relative URL resolver when it determines there is an absolute URL, which
+  // may give us input like "/c:/foo".
+  num_slashes = CountConsecutiveSlashes(spec, begin, spec_len);
+  after_slashes = begin + num_slashes;
+  if (DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len)) {
+    // Windows path, don't try to extract the scheme (for example, "c:\foo").
+    parsed->scheme.reset();
+    after_scheme = after_slashes;
+  } else if (DoesBeginUNCPath(spec, begin, spec_len, false)) {
+    // Windows UNC path: don't try to extract the scheme, but keep the slashes.
+    parsed->scheme.reset();
+    after_scheme = begin;
+  } else
+#endif
+  // NOTE: on WIN32 the braced block below is the body of the trailing "else"
+  // above; on other platforms it is a plain, unconditional block.
+  {
+    if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
+      // Offset the results since we gave ExtractScheme a substring.
+      parsed->scheme.begin += begin;
+      // Skip the character following the scheme (the colon).
+      after_scheme = parsed->scheme.end() + 1;
+    } else {
+      // No scheme found, remember that.
+      parsed->scheme.reset();
+      after_scheme = begin;
+    }
+  }
+
+  // Handle empty specs, ones that contain only whitespace or control chars,
+  // or that are just the scheme (for example "file:").
+  if (after_scheme == spec_len) {
+    parsed->host.reset();
+    parsed->path.reset();
+    return;
+  }
+
+  // Recount the slashes, this time starting right after the scheme.
+  num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
+
+  after_slashes = after_scheme + num_slashes;
+#ifdef WIN32
+  // Check whether the input is a drive again. We checked above for windows
+  // drive specs, but that's only at the very beginning to see if we have a
+  // scheme at all. This test will be duplicated in that case, but will
+  // additionally handle all cases with a real scheme such as "file:///C:/".
+  if (!DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len) &&
+      num_slashes != 3) {
+    // Anything not beginning with a drive spec ("c:\") on Windows is treated
+    // as UNC, with the exception of three slashes which always means a file.
+    // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails.
+    DoParseUNC(spec, after_slashes, spec_len, parsed);
+    return;
+  }
+#else
+  // A file: URL with exactly 2 slashes is parsed by DoParseUNC, which takes
+  // the text up to the next slash as the host (possibly empty).
+  if (num_slashes == 2) {
+    DoParseUNC(spec, after_slashes, spec_len, parsed);
+    return;
+  }
+#endif  // WIN32
+
+  // Easy and common case, the full path immediately follows the scheme
+  // (modulo slashes), as in "file://c:/foo". Just treat everything from
+  // there to the end as the path. Empty hosts have 0 length instead of -1.
+  // We include the last slash as part of the path if there is one.
+  DoParseLocalFile(spec,
+      num_slashes > 0 ? after_scheme + num_slashes - 1 : after_scheme,
+      spec_len, parsed);
+}
+
+}  // namespace
+
+// char overload: forwards its arguments unchanged to the DoParseFileURL
+// template defined above in this file.
+void ParseFileURL(const char* url, int url_len, Parsed* parsed) {
+  DoParseFileURL(url, url_len, parsed);
+}
+
+// char16 overload: forwards its arguments unchanged to the DoParseFileURL
+// template defined above in this file.
+void ParseFileURL(const char16* url, int url_len, Parsed* parsed) {
+  DoParseFileURL(url, url_len, parsed);
+}
+
+}  // namespace url_parse
diff --git a/googleurl/url_util.cpp b/googleurl/url_util.cpp
new file mode 100644
index 0000000..e8c9e60
--- /dev/null
+++ b/googleurl/url_util.cpp
@@ -0,0 +1,594 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <string.h>
+#include <vector>
+
+#include "url_util.h"
+
+#include "base/logging.h"
+#include "url_canon_internal.h"
+#include "url_file.h"
+#include "url_util_internal.h"
+
+namespace url_util {
+
+// Lower-case scheme names that this file compares parsed schemes against when
+// choosing which parser/canonicalizer to use.
+const char kFileScheme[] = "file";
+const char kFileSystemScheme[] = "filesystem";
+const char kMailtoScheme[] = "mailto";
+
+namespace {
+
+// Locale-independent lower-casing: only 'A'..'Z' are mapped to 'a'..'z'; every
+// other value is returned unchanged. The standard library's tolower is locale
+// sensitive, which is why it is not used here.
+template <class Char> inline Char ToLowerASCII(Char c) {
+  if (c < 'A' || c > 'Z')
+    return c;
+  return static_cast<Char>(c + ('a' - 'A'));
+}
+
+// Backend for LowerCaseEqualsASCII. Compares the range [a_begin, a_end),
+// lower-cased with ToLowerASCII, against the NUL-terminated string |b|, which
+// is compared as given. Returns true only when both have the same length and
+// every character matches.
+template<typename Iter>
+inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) {
+  Iter cursor = a_begin;
+  while (cursor != a_end) {
+    // Running out of |b| first, or hitting a mismatch, means "not equal".
+    if (*b == 0 || ToLowerASCII(*cursor) != *b)
+      return false;
+    ++cursor;
+    ++b;
+  }
+  // The range is exhausted; equal only if |b| was fully consumed as well.
+  return !*b;
+}
+
+// Built-in schemes that are treated as "standard". InitStandardSchemes copies
+// these entries into the runtime list. Both the characters and the pointers
+// themselves are const so the table cannot be modified by accident.
+const int kNumStandardURLSchemes = 8;
+const char* const kStandardURLSchemes[kNumStandardURLSchemes] = {
+  "http",
+  "https",
+  kFileScheme,  // Yes, file urls can have a hostname!
+  "ftp",
+  "gopher",
+  "ws",  // WebSocket.
+  "wss",  // WebSocket secure.
+  kFileSystemScheme,
+};
+
+// List of the currently installed standard schemes. This list is lazily
+// initialized by InitStandardSchemes. It is only freed if Shutdown() is
+// called; otherwise it is leaked at exit to prevent any destructors from
+// being called that will slow us down or cause problems.
+std::vector<const char*>* standard_schemes = NULL;
+
+// Set to true by LockStandardSchemes; AddStandardScheme DCHECKs that it is
+// still false. See the LockStandardSchemes declaration in the header.
+bool standard_schemes_locked = false;
+
+// Creates the standard_schemes list and seeds it with the built-in entries of
+// kStandardURLSchemes. Does nothing when the list already exists.
+void InitStandardSchemes() {
+  if (standard_schemes != NULL)
+    return;
+
+  standard_schemes = new std::vector<const char*>;
+  standard_schemes->insert(standard_schemes->end(),
+                           kStandardURLSchemes,
+                           kStandardURLSchemes + kNumStandardURLSchemes);
+}
+
+// Compares the |component| range of |spec| against the lower-case string
+// |compare_to|, via LowerCaseEqualsASCII. When |component| is not a non-empty
+// range, the result is true only if |compare_to| is the empty string.
+template<typename CHAR>
+inline bool DoCompareSchemeComponent(const CHAR* spec,
+                                     const url_parse::Component& component,
+                                     const char* compare_to) {
+  if (component.is_nonempty()) {
+    const CHAR* const range_begin = spec + component.begin;
+    const CHAR* const range_end = spec + component.end();
+    return LowerCaseEqualsASCII(range_begin, range_end, compare_to);
+  }
+
+  // Empty component: it only matches an empty scheme.
+  return *compare_to == '\0';
+}
+
+// Reports whether the scheme occupying the |scheme| range of |spec| matches
+// one of the entries in the standard_schemes list. The list is created on
+// first use.
+template<typename CHAR>
+bool DoIsStandard(const CHAR* spec, const url_parse::Component& scheme) {
+  // Empty or invalid schemes are never standard.
+  if (!scheme.is_nonempty())
+    return false;
+
+  InitStandardSchemes();
+
+  const CHAR* const scheme_begin = &spec[scheme.begin];
+  const CHAR* const scheme_end = &spec[scheme.end()];
+  typedef std::vector<const char*>::const_iterator SchemeIter;
+  for (SchemeIter candidate = standard_schemes->begin();
+       candidate != standard_schemes->end(); ++candidate) {
+    if (LowerCaseEqualsASCII(scheme_begin, scheme_end, *candidate))
+      return true;
+  }
+  return false;
+}
+
+// Extracts the scheme from |str| (after whitespace removal) and compares it
+// against |compare|. If |found_scheme| is non-NULL it receives the extracted
+// scheme range, or a default-constructed Component when no scheme was found.
+// Returns false when there is no scheme or when it does not match.
+template<typename CHAR>
+bool DoFindAndCompareScheme(const CHAR* str,
+                            int str_len,
+                            const char* compare,
+                            url_parse::Component* found_scheme) {
+  // Strip whitespace before looking for the scheme, mirroring what
+  // DoCanonicalize does with its input.
+  url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
+  int spec_len;
+  const CHAR* spec = RemoveURLWhitespace(str, str_len,
+                                         &whitespace_buffer, &spec_len);
+
+  url_parse::Component scheme_range;
+  const bool has_scheme =
+      url_parse::ExtractScheme(spec, spec_len, &scheme_range);
+  if (!has_scheme)
+    scheme_range = url_parse::Component();  // Report "no scheme" to the caller.
+
+  if (found_scheme)
+    *found_scheme = scheme_range;
+
+  if (!has_scheme)
+    return false;
+  return DoCompareSchemeComponent(spec, scheme_range, compare);
+}
+
+// Canonicalizes |in_spec| into |output| and records the resulting component
+// layout in |output_parsed|. The parser/canonicalizer pair is chosen by
+// scheme, in this order: file, filesystem, registered standard schemes,
+// mailto, and finally generic path-URL handling. Returns false when no scheme
+// can be extracted; otherwise returns whatever the chosen canonicalizer
+// returns.
+template<typename CHAR>
+bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
+                    url_canon::CharsetConverter* charset_converter,
+                    url_canon::CanonOutput* output,
+                    url_parse::Parsed* output_parsed) {
+  // Remove any whitespace from the middle of the input, possibly copying it
+  // into |whitespace_buffer|.
+  url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
+  int spec_len;
+  const CHAR* spec = RemoveURLWhitespace(in_spec, in_spec_len,
+                                         &whitespace_buffer, &spec_len);
+
+  // Parsed form of the (not yet canonical) input.
+  url_parse::Parsed parsed_input;
+#ifdef WIN32
+  // On Windows, input that looks like an absolute Windows path is fixed up
+  // magically into a file URL. This is done for IE compatibility: "c:/foo"
+  // becomes a file URL instead of a URL with the protocol "c". The same holds
+  // for UNC input ("\\foo\bar.txt"). The original comment notes that similar
+  // logic exists in url_canon_relative.cc.
+  //
+  // Mac & Unix get no such treatment (the equivalent would be "/foo/bar",
+  // which has no meaning as an absolute path name): browsers on those
+  // platforms don't generally do this, so there is no compatibility reason
+  // for doing so.
+  if (url_parse::DoesBeginUNCPath(spec, 0, spec_len, false) ||
+      url_parse::DoesBeginWindowsDriveSpec(spec, 0, spec_len)) {
+    url_parse::ParseFileURL(spec, spec_len, &parsed_input);
+    return url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,
+                                          charset_converter,
+                                          output, output_parsed);
+  }
+#endif
+
+  url_parse::Component scheme;
+  if (!url_parse::ExtractScheme(spec, spec_len, &scheme)) {
+    // No scheme at all: emit the input as an invalid string and give up.
+    AppendInvalidNarrowString(spec, 0, spec_len, output);
+    return false;
+  }
+
+  // File URLs are special.
+  if (DoCompareSchemeComponent(spec, scheme, kFileScheme)) {
+    url_parse::ParseFileURL(spec, spec_len, &parsed_input);
+    return url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,
+                                          charset_converter, output,
+                                          output_parsed);
+  }
+
+  // Filesystem URLs are special.
+  if (DoCompareSchemeComponent(spec, scheme, kFileSystemScheme)) {
+    url_parse::ParseFileSystemURL(spec, spec_len, &parsed_input);
+    return url_canon::CanonicalizeFileSystemURL(spec, spec_len,
+                                                parsed_input,
+                                                charset_converter,
+                                                output, output_parsed);
+  }
+
+  // All "normal" URLs.
+  if (DoIsStandard(spec, scheme)) {
+    url_parse::ParseStandardURL(spec, spec_len, &parsed_input);
+    return url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input,
+                                              charset_converter,
+                                              output, output_parsed);
+  }
+
+  // Mailto is treated like a standard URL with only a scheme, path, and query.
+  if (DoCompareSchemeComponent(spec, scheme, kMailtoScheme)) {
+    url_parse::ParseMailtoURL(spec, spec_len, &parsed_input);
+    return url_canon::CanonicalizeMailtoURL(spec, spec_len, parsed_input,
+                                            output, output_parsed);
+  }
+
+  // "Weird" URLs like data: and javascript:
+  url_parse::ParsePathURL(spec, spec_len, &parsed_input);
+  return url_canon::CanonicalizePathURL(spec, spec_len, parsed_input,
+                                        output, output_parsed);
+}
+
+// Resolves |in_relative| against the base URL |base_spec| / |base_parsed|,
+// writing the result to |output| and |output_parsed|. Input that
+// url_canon::IsRelativeURL classifies as not relative is canonicalized on its
+// own via DoCanonicalize. Returns false if IsRelativeURL reports an error;
+// otherwise returns the result of the resolver or canonicalizer that ran.
+// |base_spec_len| is not used.
+template<typename CHAR>
+bool DoResolveRelative(const char* base_spec,
+                       int base_spec_len,
+                       const url_parse::Parsed& base_parsed,
+                       const CHAR* in_relative,
+                       int in_relative_length,
+                       url_canon::CharsetConverter* charset_converter,
+                       url_canon::CanonOutput* output,
+                       url_parse::Parsed* output_parsed) {
+  (void)base_spec_len;
+
+  // Strip whitespace from the middle of the relative URL; the cleaned text may
+  // end up in |whitespace_buffer|.
+  url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
+  int relative_length;
+  const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length,
+                                             &whitespace_buffer,
+                                             &relative_length);
+
+  // Decide whether the base URL's scheme counts as "standard".
+  bool standard_base_scheme = false;
+  if (base_parsed.scheme.is_nonempty())
+    standard_base_scheme = DoIsStandard(base_spec, base_parsed.scheme);
+
+  bool is_relative;
+  url_parse::Component relative_component;
+  const bool classified = url_canon::IsRelativeURL(base_spec, base_parsed,
+                                                   relative, relative_length,
+                                                   standard_base_scheme,
+                                                   &is_relative,
+                                                   &relative_component);
+  if (!classified)
+    return false;  // Error resolving.
+
+  if (!is_relative) {
+    // Not relative, canonicalize the input.
+    return DoCanonicalize(relative, relative_length, charset_converter,
+                          output, output_parsed);
+  }
+
+  // Relative, resolve and canonicalize.
+  bool file_base_scheme = false;
+  if (base_parsed.scheme.is_nonempty()) {
+    file_base_scheme =
+        DoCompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme);
+  }
+  return url_canon::ResolveRelativeURL(base_spec, base_parsed,
+                                       file_base_scheme, relative,
+                                       relative_component, charset_converter,
+                                       output, output_parsed);
+}
+
+// Applies |replacements| to the URL |spec| / |parsed| and writes the result to
+// |output| and |out_parsed|.
+//
+// A scheme replacement is handled by string substitution followed by a full
+// re-canonicalization, after which this function recurses once with the scheme
+// replacement removed. All other replacements are dispatched to the replacer
+// that matches the scheme found in |spec| (file, filesystem, standard, mailto,
+// or path). Returns the result of the replacer that was ultimately invoked.
+template<typename CHAR>
+bool DoReplaceComponents(const char* spec,
+                         int spec_len,
+                         const url_parse::Parsed& parsed,
+                         const url_canon::Replacements<CHAR>& replacements,
+                         url_canon::CharsetConverter* charset_converter,
+                         url_canon::CanonOutput* output,
+                         url_parse::Parsed* out_parsed) {
+  // If the scheme is overridden, just do a simple string substitution and
+  // reparse the whole thing. There are lots of edge cases that we really don't
+  // want to deal with. Like what happens if I replace "http://e:8080/foo"
+  // with a file. Does it become "file:///E:/8080/foo" where the port number
+  // becomes part of the path? Parsing that string as a file URL says "yes"
+  // but almost no sane rule for dealing with the components individually would
+  // come up with that.
+  //
+  // Why allow these crazy cases at all? Programmatically, there is almost no
+  // case for replacing the scheme. The most common case for hitting this is
+  // in JS when building up a URL using the location object. In this case, the
+  // JS code expects the string substitution behavior:
+  //   http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3
+  if (replacements.IsSchemeOverridden()) {
+    // Canonicalize the new scheme so it is 8-bit and can be concatenated with
+    // the existing spec.
+    url_canon::RawCanonOutput<128> scheme_replaced;
+    url_parse::Component scheme_replaced_parsed;
+    url_canon::CanonicalizeScheme(
+        replacements.sources().scheme,
+        replacements.components().scheme,
+        &scheme_replaced, &scheme_replaced_parsed);
+
+    // We can assume that the input is canonicalized, which means it always has
+    // a colon after the scheme (or where the scheme would be).
+    int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1
+                                                    : 1;
+    // Append everything after the old colon to the newly canonicalized scheme.
+    if (spec_len - spec_after_colon > 0) {
+      scheme_replaced.Append(&spec[spec_after_colon],
+                             spec_len - spec_after_colon);
+    }
+
+    // We now need to completely re-parse the resulting string since its meaning
+    // may have changed with the different scheme. The return value is
+    // deliberately ignored; see the warning below.
+    url_canon::RawCanonOutput<128> recanonicalized;
+    url_parse::Parsed recanonicalized_parsed;
+    DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(),
+                   charset_converter,
+                   &recanonicalized, &recanonicalized_parsed);
+
+    // Recurse using the version with the scheme already replaced. This will now
+    // use the replacement rules for the new scheme.
+    //
+    // Warning: this code assumes that ReplaceComponents will re-check all
+    // components for validity. This is because we can't fail if DoCanonicalize
+    // failed above since theoretically the thing making it fail could be
+    // getting replaced here. If ReplaceComponents didn't re-check everything,
+    // we wouldn't know if something *not* getting replaced is a problem.
+    // If the scheme-specific replacers are made more intelligent so they don't
+    // re-check everything, we should instead recanonicalize the whole thing
+    // after this call to check validity (this assumes replacing the scheme is
+    // much much less common than other types of replacements, like clearing the
+    // ref).
+    url_canon::Replacements<CHAR> replacements_no_scheme = replacements;
+    replacements_no_scheme.SetScheme(NULL, url_parse::Component());
+    return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(),
+                               recanonicalized_parsed, replacements_no_scheme,
+                               charset_converter, output, out_parsed);
+  }
+
+  // If we get here, then we know the scheme doesn't need to be replaced, so can
+  // just key off the scheme in the spec to know how to do the replacements.
+  if (DoCompareSchemeComponent(spec, parsed.scheme, kFileScheme)) {
+    return url_canon::ReplaceFileURL(spec, parsed, replacements,
+                                     charset_converter, output, out_parsed);
+  }
+  if (DoCompareSchemeComponent(spec, parsed.scheme, kFileSystemScheme)) {
+    return url_canon::ReplaceFileSystemURL(spec, parsed, replacements,
+                                           charset_converter, output,
+                                           out_parsed);
+  }
+  if (DoIsStandard(spec, parsed.scheme)) {
+    return url_canon::ReplaceStandardURL(spec, parsed, replacements,
+                                         charset_converter, output, out_parsed);
+  }
+  if (DoCompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) {
+     return url_canon::ReplaceMailtoURL(spec, parsed, replacements,
+                                        output, out_parsed);
+  }
+
+  // Default is a path URL.
+  return url_canon::ReplacePathURL(spec, parsed, replacements,
+                                   output, out_parsed);
+}
+
+}  // namespace
+
+// Eagerly creates the standard scheme list (it is otherwise created lazily on
+// first use by InitStandardSchemes).
+void Initialize() {
+  InitStandardSchemes();
+}
+
+// Frees the standard scheme list, if any, and resets the pointer so the list
+// can be lazily re-created later. Only the vector itself is deleted; the
+// strings it points to are not.
+void Shutdown() {
+  // Deleting a null pointer is a no-op, so no explicit check is needed.
+  delete standard_schemes;
+  standard_schemes = NULL;
+}
+
+// Registers |new_scheme| as an additional "standard" scheme. The string is
+// copied, so the caller keeps ownership of |new_scheme|. A NULL or empty
+// |new_scheme| is ignored.
+void AddStandardScheme(const char* new_scheme) {
+  // If this assert triggers, it means you've called AddStandardScheme after
+  // LockStandardSchemes have been called (see the header file for
+  // LockStandardSchemes for more).
+  //
+  // This normally means you're trying to set up a new standard scheme too late
+  // in your application's init process. Locate where your app does this
+  // initialization and calls LockStandardScheme, and add your new standard
+  // scheme there.
+  DCHECK(!standard_schemes_locked) <<
+      "Trying to add a standard scheme after the list has been locked.";
+
+  // Guard against a NULL argument: strlen() below would dereference it.
+  if (!new_scheme)
+    return;
+
+  size_t scheme_len = strlen(new_scheme);
+  if (scheme_len == 0)
+    return;
+
+  // Duplicate the scheme into a new buffer and add it to the list of standard
+  // schemes. This pointer will be leaked on shutdown.
+  char* dup_scheme = new char[scheme_len + 1];
+  memcpy(dup_scheme, new_scheme, scheme_len + 1);  // +1 copies the NUL too.
+
+  InitStandardSchemes();
+  standard_schemes->push_back(dup_scheme);
+}
+
+// Marks the standard scheme list as locked. After this, AddStandardScheme
+// trips its DCHECK. Nothing in this file clears the flag again.
+void LockStandardSchemes() {
+  standard_schemes_locked = true;
+}
+
+// char overload: returns DoIsStandard's verdict on whether the |scheme| range
+// of |spec| is one of the registered standard schemes.
+bool IsStandard(const char* spec, const url_parse::Component& scheme) {
+  return DoIsStandard(spec, scheme);
+}
+
+// char16 overload: returns DoIsStandard's verdict on whether the |scheme|
+// range of |spec| is one of the registered standard schemes.
+bool IsStandard(const char16* spec, const url_parse::Component& scheme) {
+  return DoIsStandard(spec, scheme);
+}
+
+// char overload: forwards to DoFindAndCompareScheme, which extracts the scheme
+// of |str|, optionally reports it through |found_scheme| (may be NULL), and
+// compares it against |compare|.
+bool FindAndCompareScheme(const char* str,
+                          int str_len,
+                          const char* compare,
+                          url_parse::Component* found_scheme) {
+  return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
+}
+
+// char16 overload: forwards to DoFindAndCompareScheme, which extracts the
+// scheme of |str|, optionally reports it through |found_scheme| (may be NULL),
+// and compares it against |compare|.
+bool FindAndCompareScheme(const char16* str,
+                          int str_len,
+                          const char* compare,
+                          url_parse::Component* found_scheme) {
+  return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
+}
+
+// char overload: forwards to DoCanonicalize, which canonicalizes |spec| into
+// |output| / |output_parsed|, and returns its result.
+bool Canonicalize(const char* spec,
+                  int spec_len,
+                  url_canon::CharsetConverter* charset_converter,
+                  url_canon::CanonOutput* output,
+                  url_parse::Parsed* output_parsed) {
+  return DoCanonicalize(spec, spec_len, charset_converter,
+                        output, output_parsed);
+}
+
+// char16 overload: forwards to DoCanonicalize, which canonicalizes |spec| into
+// |output| / |output_parsed|, and returns its result.
+bool Canonicalize(const char16* spec,
+                  int spec_len,
+                  url_canon::CharsetConverter* charset_converter,
+                  url_canon::CanonOutput* output,
+                  url_parse::Parsed* output_parsed) {
+  return DoCanonicalize(spec, spec_len, charset_converter,
+                        output, output_parsed);
+}
+
+// Overload taking a char |relative| string: forwards to DoResolveRelative,
+// which resolves |relative| against the base URL and writes the result to
+// |output| / |output_parsed|. Returns DoResolveRelative's result.
+bool ResolveRelative(const char* base_spec,
+                     int base_spec_len,
+                     const url_parse::Parsed& base_parsed,
+                     const char* relative,
+                     int relative_length,
+                     url_canon::CharsetConverter* charset_converter,
+                     url_canon::CanonOutput* output,
+                     url_parse::Parsed* output_parsed) {
+  return DoResolveRelative(base_spec, base_spec_len, base_parsed,
+                           relative, relative_length,
+                           charset_converter, output, output_parsed);
+}
+
+// Overload taking a char16 |relative| string (the base is still char):
+// forwards to DoResolveRelative, which resolves |relative| against the base
+// URL and writes the result to |output| / |output_parsed|. Returns
+// DoResolveRelative's result.
+bool ResolveRelative(const char* base_spec,
+                     int base_spec_len,
+                     const url_parse::Parsed& base_parsed,
+                     const char16* relative,
+                     int relative_length,
+                     url_canon::CharsetConverter* charset_converter,
+                     url_canon::CanonOutput* output,
+                     url_parse::Parsed* output_parsed) {
+  return DoResolveRelative(base_spec, base_spec_len, base_parsed,
+                           relative, relative_length,
+                           charset_converter, output, output_parsed);
+}
+
+// Overload for char replacements: forwards to DoReplaceComponents, which
+// applies |replacements| to |spec| / |parsed| and writes the result to
+// |output| / |out_parsed|. Returns DoReplaceComponents' result.
+bool ReplaceComponents(const char* spec,
+                       int spec_len,
+                       const url_parse::Parsed& parsed,
+                       const url_canon::Replacements<char>& replacements,
+                       url_canon::CharsetConverter* charset_converter,
+                       url_canon::CanonOutput* output,
+                       url_parse::Parsed* out_parsed) {
+  return DoReplaceComponents(spec, spec_len, parsed, replacements,
+                             charset_converter, output, out_parsed);
+}
+
+// Overload for char16 replacements (the URL being modified is still char):
+// forwards to DoReplaceComponents, which applies |replacements| to |spec| /
+// |parsed| and writes the result to |output| / |out_parsed|. Returns
+// DoReplaceComponents' result.
+bool ReplaceComponents(const char* spec,
+                       int spec_len,
+                       const url_parse::Parsed& parsed,
+                       const url_canon::Replacements<char16>& replacements,
+                       url_canon::CharsetConverter* charset_converter,
+                       url_canon::CanonOutput* output,
+                       url_parse::Parsed* out_parsed) {
+  return DoReplaceComponents(spec, spec_len, parsed, replacements,
+                             charset_converter, output, out_parsed);
+}
+
+// Front-ends for LowerCaseEqualsASCII.
+// 8-bit overload taking the |a| range as a begin/end pointer pair and |b|
+// as a single pointer. Forwards to DoLowerCaseEqualsASCII and returns its
+// result unchanged.
+bool LowerCaseEqualsASCII(const char* a_begin,
+                          const char* a_end,
+                          const char* b) {
+  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
+}
+
+// Compares the range [a_begin, a_end) against [b_begin, b_end). Each
+// character of |a| is passed through ToLowerASCII before the comparison;
+// characters of |b| are compared as-is. Returns true only when both ranges
+// are exhausted at the same time without any mismatch.
+bool LowerCaseEqualsASCII(const char* a_begin,
+                          const char* a_end,
+                          const char* b_begin,
+                          const char* b_end) {
+  for (; a_begin != a_end && b_begin != b_end; ++a_begin, ++b_begin) {
+    // A mismatch while both ranges still have characters left means the
+    // strings differ.
+    if (ToLowerASCII(*a_begin) != *b_begin)
+      return false;
+  }
+  // At least one range ran out; they are equal only if both did.
+  return a_begin == a_end && b_begin == b_end;
+}
+
+// char16 overload taking the |a| range as a begin/end pointer pair and |b|
+// as a single 8-bit pointer. Forwards to DoLowerCaseEqualsASCII and returns
+// its result unchanged.
+bool LowerCaseEqualsASCII(const char16* a_begin,
+                          const char16* a_end,
+                          const char* b) {
+  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
+}
+
+// Decodes the percent-escapes in the first |length| bytes of |input| and
+// appends the result to |output| as 16-bit characters.
+//
+// The work is done in two passes:
+//   1. Escape sequences are decoded into a temporary 8-bit buffer. A '%'
+//      that DecodeEscaped rejects is copied through literally.
+//   2. The 8-bit buffer is widened: bytes below 0x80 are appended directly,
+//      sequences accepted by ReadUTFChar are appended via AppendUTF16Value,
+//      and bytes of rejected sequences are appended one by one as-is.
+void DecodeURLEscapeSequences(const char* input, int length,
+                              url_canon::CanonOutputW* output) {
+  // Pass 1: undo the percent-escaping.
+  url_canon::RawCanonOutputT<char> decoded_bytes;
+  for (int pos = 0; pos < length; pos++) {
+    if (input[pos] != '%') {
+      // Regular non-escaped 8-bit character.
+      decoded_bytes.push_back(input[pos]);
+      continue;
+    }
+
+    unsigned char byte;
+    if (url_canon::DecodeEscaped(input, &pos, length, &byte)) {
+      decoded_bytes.push_back(byte);
+    } else {
+      // Invalid escape sequence, copy the percent literal.
+      decoded_bytes.push_back('%');
+    }
+  }
+
+  // Pass 2: convert that 8-bit to UTF-16. It's not clear IE does this at all
+  // to JavaScript URLs, but Firefox and Safari do.
+  for (int pos = 0; pos < decoded_bytes.length(); pos++) {
+    const unsigned char lead =
+        static_cast<unsigned char>(decoded_bytes.at(pos));
+    if (lead < 0x80) {
+      // Plain ASCII, just append directly.
+      output->push_back(lead);
+      continue;
+    }
+
+    // After the call, last_consumed will point to the last character of the
+    // decoded character.
+    int last_consumed = pos;
+    unsigned code_point;
+    if (url_canon::ReadUTFChar(decoded_bytes.data(), &last_consumed,
+                               decoded_bytes.length(), &code_point)) {
+      // Valid UTF-8 character, convert to UTF-16.
+      url_canon::AppendUTF16Value(code_point, output);
+      pos = last_consumed;
+      continue;
+    }
+
+    // If there are any sequences that are not valid UTF-8, we keep the
+    // invalid bytes and promote each one to UTF-16. We copy all characters
+    // from the current position to the end of the identified sequence,
+    // leaving pos on the last byte copied so the outer loop steps past it.
+    for (;;) {
+      output->push_back(static_cast<unsigned char>(decoded_bytes.at(pos)));
+      if (pos >= last_consumed)
+        break;
+      pos++;
+    }
+  }
+}
+
+// Appends the first |length| bytes of |input| to |output|. Bytes for which
+// url_canon::IsComponentChar returns true are appended unchanged; every
+// other byte is written through AppendEscapedChar.
+void EncodeURIComponent(const char* input, int length,
+                        url_canon::CanonOutput* output) {
+  for (int pos = 0; pos < length; ++pos) {
+    // Work on the unsigned value so bytes >= 0x80 are not sign-extended.
+    const unsigned char byte = static_cast<unsigned char>(input[pos]);
+    if (!url_canon::IsComponentChar(byte)) {
+      AppendEscapedChar(byte, output);
+      continue;
+    }
+    output->push_back(byte);
+  }
+}
+
+// 8-bit overload: |spec| is a char buffer. Forwards |spec|, |component| and
+// |compare_to| unchanged to DoCompareSchemeComponent and returns its result.
+bool CompareSchemeComponent(const char* spec,
+                            const url_parse::Component& component,
+                            const char* compare_to) {
+  return DoCompareSchemeComponent(spec, component, compare_to);
+}
+
+// char16 overload: |spec| is a char16 buffer while |compare_to| stays 8-bit.
+// Forwards |spec|, |component| and |compare_to| unchanged to
+// DoCompareSchemeComponent and returns its result.
+bool CompareSchemeComponent(const char16* spec,
+                            const url_parse::Component& component,
+                            const char* compare_to) {
+  return DoCompareSchemeComponent(spec, component, compare_to);
+}
+
+}  // namespace url_util
author	Andreas Baumann <abaumann@yahoo.com>	2012-08-04 14:03:06 +0200
committer	Andreas Baumann <abaumann@yahoo.com>	2012-08-04 14:03:06 +0200
commit	0c92e873518ce6a92caeba0be81a0d81d16c6ed8 (patch)
tree	ca0033ad7c96ff9e7e1d037b09dca12a2e90809b /googleurl
parent	9473c0bb8d1a69a042e1fd745fb2f76ea0b8ac27 (diff)
download	crawler-0c92e873518ce6a92caeba0be81a0d81d16c6ed8.tar.gz crawler-0c92e873518ce6a92caeba0be81a0d81d16c6ed8.tar.bz2