added google url library

author: Andreas Baumann <abaumann@yahoo.com> 2012-08-04 14:01:19 +0200
committer: Andreas Baumann <abaumann@yahoo.com> 2012-08-04 14:01:19 +0200
commit: 9473c0bb8d1a69a042e1fd745fb2f76ea0b8ac27 (patch)
tree: f88532f9adc9d15514f484cdf65e21c78d72e480 /googleurl
parent: 4029e28c299049e19972556eeb22cf6d15147eab (diff)
download: crawler-9473c0bb8d1a69a042e1fd745fb2f76ea0b8ac27.tar.gz
crawler-9473c0bb8d1a69a042e1fd745fb2f76ea0b8ac27.tar.bz2
24 files changed, 5099 insertions, 0 deletions
diff --git a/googleurl/GNUmakefile b/googleurl/GNUmakefile
new file mode 100644
index 0000000..0971a4a
--- /dev/null
+++ b/googleurl/GNUmakefile
@@ -0,0 +1,50 @@
+TOPDIR = ..
+
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_CFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_DIRS = -I.
+
+INCLUDE_LIBS = \
+
+CPP_OBJS = \
+	url_canon_etc.o \
+	url_canon_filesystemurl.o \
+	url_canon_fileurl.o \
+	url_canon_host.o \
+	url_canon_icu.o \
+	url_canon_internal.o \
+	url_canon_ip.o \
+	url_canon_mailtourl.o \
+	url_canon_path.o \
+	url_canon_pathurl.o \
+	url_canon_query.o \
+	url_canon_relative.o \
+	url_canon_stdurl.o \
+	url_parse.o \
+	url_parse_file.o \
+	url_util.o \
+	gurl.o
+
+STATIC_LIB = libgoogleurl.a
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+# The local_* hooks below name actions, not files, so declare them phony.
+.PHONY: local_all local_clean local_distclean local_install local_uninstall local_test
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
diff --git a/googleurl/LICENSE.txt b/googleurl/LICENSE.txt
new file mode 100644
index 0000000..ac40837
--- /dev/null
+++ b/googleurl/LICENSE.txt
@@ -0,0 +1,65 @@
+Copyright 2007, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-------------------------------------------------------------------------------
+
+The file url_parse.cc is based on nsURLParsers.cc from Mozilla. This file is
+licensed separately as follows:
+
+The contents of this file are subject to the Mozilla Public License Version
+1.1 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+http://www.mozilla.org/MPL/
+
+Software distributed under the License is distributed on an "AS IS" basis,
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+for the specific language governing rights and limitations under the
+License.
+
+The Original Code is mozilla.org code.
+
+The Initial Developer of the Original Code is
+Netscape Communications Corporation.
+Portions created by the Initial Developer are Copyright (C) 1998
+the Initial Developer. All Rights Reserved.
+
+Contributor(s):
+  Darin Fisher (original author)
+
+Alternatively, the contents of this file may be used under the terms of
+either the GNU General Public License Version 2 or later (the "GPL"), or
+the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+in which case the provisions of the GPL or the LGPL are applicable instead
+of those above. If you wish to allow use of your version of this file only
+under the terms of either the GPL or the LGPL, and not to allow others to
+use your version of this file under the terms of the MPL, indicate your
+decision by deleting the provisions above and replace them with the notice
+and other provisions required by the GPL or the LGPL. If you do not delete
+the provisions above, a recipient may use your version of this file under
+the terms of any one of the MPL, the GPL or the LGPL.
diff --git a/googleurl/README.txt b/googleurl/README.txt
new file mode 100644
index 0000000..d5f79a3
--- /dev/null
+++ b/googleurl/README.txt
@@ -0,0 +1,185 @@
+                       ==============================
+                       The Google URL Parsing Library
+                       ==============================
+
+This is the Google URL Parsing Library which parses and canonicalizes URLs.
+Please see the LICENSE.txt file for licensing information.
+
+Features
+========
+
+   * Easily embeddable: This library was written with a variety of client and
+     server programs in mind, so unlike most implementations of URL parsing
+     and canonicalization, it can be easily embedded.
+
+   * Fast: hundreds of thousands of typical URLs can be parsed and
+     canonicalized per second on a modern CPU. It is much faster than, for
+     example, calling WinInet's corresponding functions.
+
+   * Compatible: When possible, this library has strived for IE7 compatibility
+     for both general web compatibility, and so IE addons or other applications
+     that communicate with or embed IE will work properly.
+
+     It supports Unix-style file URLs, as well as the more complex rules for
+     Windows file URLs. Note that total compatibility is not possible (for
+     example, IE6 and IE7 disagree about how to parse certain IP addresses),
+     and that this is more strict about certain illegal, rarely used, and
+     potentially dangerous constructs such as escaped control characters in
+     host names that IE will allow. It is typically a little less strict than
+     Firefox.
+
+
+Example
+=======
+
+An example implementation of a URL object that uses this library is provided
+in src/gurl.*. This implementation uses the "application integration" layer
+discussed below to interface with the low-level parsing and canonicalization
+functions.
+
+
+Building
+========
+
+The canonicalization files require ICU for some UTF-8 and UTF-16 conversion
+macros. If your project does not use ICU, it should be straightforward to
+factor out the macros and functions used in ICU; there are only a few well-
+isolated things that are used.
+
+TODO(brettw) ADD INSTRUCTIONS FOR GETTING ICU HERE!
+
+logging.h and logging.cc are Windows-only because the corresponding Unix
+logging system has many dependencies. This library uses few of the logging
+macros, and a dummy header can easily be written that defines the
+appropriate things for Unix.
+
+
+Definitions
+===========
+
+"Standard URL": A URL with an "authority", which is a hostname and optionally
+   a port, username, and password. Most URLs are standard such as HTTP and FTP.
+
+"File URL": A URL that references a file on disk. There are special rules for
+   this type of URL. Note that it may have a hostname! "localhost" is allowed,
+   for example "file://localhost/foo" is the same as "file:///foo".
+
+"FileSystem URL": A URL referring to a file reached via the FileSystem API
+   described at http://www.w3.org/TR/file-system-api/.  These are nested URLs,
+   with compound schemes of e.g. "filesystem:file:" or "filesystem:https:".
+   Parsed FileSystem URLs will have a nested inner_parsed() object containing
+   information about the inner URL.
+
+"Path URL": This is everything else. There is no standard on how to treat these
+   URLs, or even what they are called. This library decomposes them into a
+   scheme and a path. The path is everything following the scheme. This type of
+   URL includes "javascript", "data", and even "mailto" (although "mailto"
+   might look like a standard scheme in some respects, it is not).
+
+Design
+======
+
+The library is divided into four layers. They are listed here from the lowest
+to the highest; you can use any portion of the library as long as you embed the
+layers below it.
+
+1. Parsing
+----------
+At the lowest level is the parsing code. The files encompassing this are
+url_parse.* and the main include file is src/url_parse.h. This code will, given
+an input string, parse it into the most likely form of a URL.
+
+Parsing cannot fail and does no validation. The exception is the port number,
+which it currently validates, but this is a bug. Given crazy input, the parser
+will do its best to find the various URL components according to its rules (see
+url_parse_unittest.cc for some examples).
+
+To use this, an application will typically use ExtractScheme to determine the
+type of a given input URL, and then call one of the initialization functions:
+"ParseStandardURL", "ParsePathURL", or "ParseFileURL". This will result in
+a "Parsed" structure which identifies the substrings of each identified
+component.
+
+2. Canonicalization
+-------------------
+At the next highest level is canonicalization. The files encompassing this are
+url_canon.* and the main include file is src/url_canon.h. This code will
+validate an already-parsed URL, and will convert it to a canonical form. For
+example, this will convert host names to lowercase, convert IP addresses
+into dotted-decimal notation, handle encoding issues, etc.
+
+This layer will always do its best to produce a reasonable output string, but
+it may return that the string is invalid. For example, if there are invalid
+characters in the host name, it will escape them or replace them with the
+Unicode "invalid character" character, but will fail. This way, the program can
+display error messages to the user with the output, log it, etc.  and the
+string will have some meaning.
+
+Canonicalized output is written to a CanonOutput object which is a simple
+wrapper around an expanding buffer. An implementation called RawCanonOutput is
+provided that writes to a raw buffer with a fixed amount statically allocated
+(for performance). Applications using STL can use StdStringCanonOutput defined
+in url_canon_stdstring.h which writes into a std::string.
+
+A normal application would call one of the four high-level functions
+"CanonicalizeStandardURL", "CanonicalizeFileURL", "CanonicalizeFileSystemURL",
+and "CanonicalizePathURL" depending on the type of URL in question. Lower-level
+functions are also provided which will canonicalize individual parts of a URL
+(for example, "CanonicalizeHost").
+
+Part of this layer is the integration with the host system for IDN and encoding
+conversion. An implementation that provides integration with the ICU
+(http://www-306.ibm.com/software/globalization/icu/index.jsp) is provided in
+src/url_canon_icu.cc. The embedder may wish to replace this file with
+implementations of the functions for their own IDN library if they do not use
+ICU.
+
+3. Application integration
+--------------------------
+The canonicalization and parsing layers do not know anything about the URI
+schemes supported by your application. The parsing and canonicalization
+functions are very low-level, and you must call the correct function to do the
+work (for example, "CanonicalizeFileURL").
+
+The application integration in url_util.* provides wrappers around the
+low-level parsing and canonicalization to call the correct versions for
+different identified schemes.  Embedders will want to modify this file if
+necessary to suit the needs of their application.
+
+4. URL object
+-------------
+The highest level is the "URL" object that a C++ application would use
+to encapsulate a URL. Embedders will typically want to provide their own URL
+object that meets the requirements of their system. A reasonably complete
+example implementation is provided in src/gurl.*. You may wish to use this
+object, extend or modify it, or write your own.
+
+Whitespace
+----------
+Sometimes, you may want to remove linefeeds and tabs from the content of a URL.
+Some web pages, for example, expect that a URL spanning two lines should be
+treated as one with the newline removed. Depending on the source of the URLs
+you are canonicalizing, these newlines may or may not be trimmed off.
+
+If you want this behavior, call RemoveURLWhitespace before parsing. This will
+remove CR, LF and TAB from the input. Note that it preserves spaces. On typical
+URLs, this function produces a 10-15% speed reduction, so it is optional and
+not done automatically. The example GURL object and the url_util wrapper do
+this for you.
+
+Tests
+=====
+
+There are a number of *_unittest.cc and *_perftest.cc files. These files are
+not currently compilable as they rely on a not-included unit testing framework.
+Tests are declared like this:
+  TEST(TestCaseName, TestName) {
+    ASSERT_TRUE(a);
+    EXPECT_EQ(a, b);
+  }
+If you would like to compile them, it should be straightforward to define
+the TEST macro (which would declare a function by combining the two arguments)
+and the other macros whose behavior should be self-explanatory (EXPECT is like
+an ASSERT, but does not stop the test, if you are doing this, you probably
+don't care about this difference). Then you would define a .cc file that
+calls all of these functions.
diff --git a/googleurl/base/README.txt b/googleurl/base/README.txt
new file mode 100644
index 0000000..311faa0
--- /dev/null
+++ b/googleurl/base/README.txt
@@ -0,0 +1,2 @@
+These files contain some shared code. You can define your own assertion macros
+to eliminate the dependency on logging.h.
diff --git a/googleurl/base/basictypes.h b/googleurl/base/basictypes.h
new file mode 100644
index 0000000..b0c404d
--- /dev/null
+++ b/googleurl/base/basictypes.h
@@ -0,0 +1,88 @@
+// Copyright 2001 - 2003 Google Inc. All Rights Reserved
+
+#ifndef BASE_BASICTYPES_H__
+#define BASE_BASICTYPES_H__
+
+typedef unsigned char  uint8;
+typedef unsigned short uint16;
+typedef unsigned int   uint32;
+
+const uint8  kuint8max  = (( uint8) 0xFF);
+const uint32 kuint32max = ((uint32) 0xFFFFFFFF);
+
+// The arraysize(arr) macro returns the # of elements in an array arr.
+// The expression is a compile-time constant, and therefore can be
+// used in defining new arrays, for example.  If you use arraysize on
+// a pointer by mistake, you will get a compile-time error.
+//
+// One caveat is that arraysize() doesn't accept any array of an
+// anonymous type or a type defined inside a function.  In these rare
+// cases, you have to use the unsafe ARRAYSIZE_UNSAFE() macro below.  This is
+// due to a limitation in C++'s template system.  The limitation might
+// eventually be removed, but it hasn't happened yet.
+
+// This template function declaration is used in defining arraysize.
+// Note that the function doesn't need an implementation, as we only
+// use its type.
+template <typename T, size_t N>
+char (&ArraySizeHelper(T (&array)[N]))[N];
+
+// That gcc wants both of these prototypes seems mysterious. VC, for
+// its part, can't decide which to use (another mystery). Matching of
+// template overloads: the final frontier.
+#ifndef _MSC_VER
+template <typename T, size_t N>
+char (&ArraySizeHelper(const T (&array)[N]))[N];
+#endif
+
+#define arraysize(array) (sizeof(ArraySizeHelper(array)))
+
+// ARRAYSIZE_UNSAFE performs essentially the same calculation as arraysize,
+// but can be used on anonymous types or types defined inside
+// functions.  It's less safe than arraysize as it accepts some
+// (although not all) pointers.  Therefore, you should use arraysize
+// whenever possible.
+//
+// The expression ARRAYSIZE_UNSAFE(a) is a compile-time constant of type
+// size_t.
+//
+// ARRAYSIZE_UNSAFE catches a few type errors.  If you see a compiler error
+//
+//   "warning: division by zero in ..."
+//
+// when using ARRAYSIZE_UNSAFE, you are (wrongfully) giving it a pointer.
+// You should only use ARRAYSIZE_UNSAFE on statically allocated arrays.
+//
+// The following comments are on the implementation details, and can
+// be ignored by the users.
+//
+// ARRAYSIZE_UNSAFE(arr) works by inspecting sizeof(arr) (the # of bytes in
+// the array) and sizeof(*(arr)) (the # of bytes in one array
+// element).  If the former is divisible by the latter, perhaps arr is
+// indeed an array, in which case the division result is the # of
+// elements in the array.  Otherwise, arr cannot possibly be an array,
+// and we generate a compiler error to prevent the code from
+// compiling.
+//
+// Since the size of bool is implementation-defined, we need to cast
+// !(sizeof(a) % sizeof(*(a))) to size_t in order to ensure the final
+// result has type size_t.
+//
+// This macro is not perfect as it wrongfully accepts certain
+// pointers, namely where the pointer size is divisible by the pointee
+// size.  Since all our code has to go through a 32-bit compiler,
+// where a pointer is 4 bytes, this means all pointers to a type whose
+// size is 3 or greater than 4 will be (righteously) rejected.
+//
+// Starting with Visual C++ 2005, WinNT.h includes ARRAYSIZE.
+#define ARRAYSIZE_UNSAFE(a) \
+  ((sizeof(a) / sizeof(*(a))) / \
+   static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
+
+// A macro to disallow the evil copy constructor and operator= functions
+// This should be used in the private: declarations for a class
+#define DISALLOW_EVIL_CONSTRUCTORS(TypeName)    \
+  TypeName(const TypeName&);                    \
+  void operator=(const TypeName&)
+
+#endif  // BASE_BASICTYPES_H__
diff --git a/googleurl/base/logging.cc b/googleurl/base/logging.cc
new file mode 100644
index 0000000..ab03150
--- /dev/null
+++ b/googleurl/base/logging.cc
@@ -0,0 +1,380 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <ctime>
+#include <iomanip>
+#include <cstring>
+#include <windows.h>
+#include <tchar.h>
+#include <algorithm>
+#include "base/logging.h"
+
+namespace logging {
+
+const char* const log_severity_names[LOG_NUM_SEVERITIES] = {
+  "INFO", "WARNING", "ERROR", "FATAL" };
+
+int min_log_level = 0;
+LogLockingState lock_log_file = LOCK_LOG_FILE;
+LoggingDestination logging_destination = LOG_ONLY_TO_FILE;
+
+const int kMaxFilteredLogLevel = LOG_WARNING;
+char* log_filter_prefix = NULL;
+
+// which log file to use? This is initialized by InitLogging or
+// will be lazily initialized to the default value when it is
+// first needed.
+TCHAR log_file_name[MAX_PATH] = { 0 };
+
+// this file is lazily opened and the handle may be NULL
+HANDLE log_file = NULL;
+
+// what should be prepended to each message?
+bool log_process_id = false;
+bool log_thread_id = false;
+bool log_timestamp = true;
+bool log_tickcount = false;
+
+// An assert handler override specified by the client to be called instead of
+// the debug message dialog.
+LogAssertHandlerFunction log_assert_handler = NULL;
+
+// The critical section is used if log file locking is false. It helps us
+// avoid problems with multiple threads writing to the log file at the same
+// time.
+bool initialized_critical_section = false;
+CRITICAL_SECTION log_critical_section;
+
+// When we don't use a critical section, we are using a global mutex. We
+// need to do this because LockFileEx is not thread safe
+HANDLE log_mutex = NULL;
+
+// Called by logging functions to ensure that log_file is open and can be
+// used for writing. Returns false if the file could not be opened; log_file
+// is NULL in that case.
+bool InitializeLogFileHandle() {
+  if (log_file != NULL)
+    return true;  // already open
+
+  if (log_file_name[0] == 0) {
+    // InitLogging was never given a file name: default to "debug.log" in
+    // the directory that holds the running executable.
+    GetModuleFileName(NULL, log_file_name, MAX_PATH);
+    TCHAR* dir_end = _tcsrchr(log_file_name, '\\');
+    if (dir_end != NULL)
+      dir_end[1] = 0;  // cut just after the last backslash
+    _tcscat_s(log_file_name, _T("debug.log"));
+  }
+
+  const DWORD kShare = FILE_SHARE_READ | FILE_SHARE_WRITE;
+  log_file = CreateFile(log_file_name, GENERIC_WRITE, kShare, NULL,
+                        OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
+  if (log_file == NULL || log_file == INVALID_HANDLE_VALUE) {
+    // Fall back to a log file in the current working directory.
+    log_file = CreateFile(_T(".\\debug.log"), GENERIC_WRITE, kShare, NULL,
+                          OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
+    if (log_file == NULL || log_file == INVALID_HANDLE_VALUE) {
+      log_file = NULL;  // normalize the failure value
+      return false;
+    }
+  }
+  // Position at the end so new messages are appended.
+  SetFilePointer(log_file, 0, 0, FILE_END);
+  return true;
+}
+
+void InitLogMutex() {
+  if (log_mutex)
+    return;  // the mutex already exists
+  // Backslashes are not allowed inside a mutex name, so the log file path
+  // is converted to use forward slashes before it becomes part of the name.
+  std::wstring mutex_name(log_file_name);
+  std::replace(mutex_name.begin(), mutex_name.end(), '\\', '/');
+  mutex_name.insert(0, L"Global\\");
+  log_mutex = ::CreateMutex(NULL, FALSE, mutex_name.c_str());
+}
+
+void InitLogging(const TCHAR* new_log_file, LoggingDestination logging_dest,
+                 LogLockingState lock_log, OldFileDeletionState delete_old) {
+  // A handle may already be open, either from an earlier InitLogging call
+  // or because a log statement lazily opened the default file. Drop it so
+  // the options given here take effect.
+  if (log_file != NULL) {
+    CloseHandle(log_file);
+    log_file = NULL;
+  }
+
+  logging_destination = logging_dest;
+  lock_log_file = lock_log;
+
+  // Nothing file-related matters when only the system debug log is used.
+  if (logging_dest == LOG_ONLY_TO_SYSTEM_DEBUG_LOG)
+    return;
+
+  _tcscpy_s(log_file_name, MAX_PATH, new_log_file);
+  if (delete_old == DELETE_OLD_LOG_FILE)
+    DeleteFile(log_file_name);
+
+  if (lock_log == LOCK_LOG_FILE) {
+    InitLogMutex();
+  } else if (!initialized_critical_section) {
+    InitializeCriticalSection(&log_critical_section);
+    initialized_critical_section = true;
+  }
+
+  InitializeLogFileHandle();
+}
+
+// Messages whose severity is below |level| are dropped by ~LogMessage.
+void SetMinLogLevel(int level) { min_log_level = level; }
+
+// Stores a private copy of |filter| as the new prefix; NULL clears it.
+void SetLogFilterPrefix(char* filter)  {
+  delete[] log_filter_prefix;  // deleting a NULL pointer is a no-op
+  log_filter_prefix = NULL;
+  if (!filter)
+    return;
+
+  const size_t size = strlen(filter) + 1;  // room for the terminator
+  log_filter_prefix = new char[size];
+  strcpy_s(log_filter_prefix, size, filter);
+}
+
+// Selects which optional fields LogMessage::Init puts in the header.
+void SetLogItems(bool enable_process_id, bool enable_thread_id,
+                 bool enable_timestamp, bool enable_tickcount) {
+  log_tickcount = enable_tickcount;
+  log_timestamp = enable_timestamp;
+  log_thread_id = enable_thread_id;
+  log_process_id = enable_process_id;
+}
+
+// Registers the function called on fatal errors instead of the dialog.
+void SetLogAssertHandler(LogAssertHandlerFunction handler) {
+  log_assert_handler = handler;
+}
+
+// Displays a message box to the user with the error message in it. For
+// Windows programs, it's possible that the message loop is messed up on
+// a fatal error, and creating a MessageBox will cause that message loop
+// to be run. Instead, we try to spawn another process that displays its
+// command line. We look for "Debug Message.exe" in the same directory as
+// the application. If it exists, we use it, otherwise, we use a regular
+// message box.
+void DisplayDebugMessage(const std::string& str) {
+  if (str.empty())
+    return;
+
+  // look for the debug dialog program next to our application
+  wchar_t prog_name[MAX_PATH];
+  GetModuleFileNameW(NULL, prog_name, MAX_PATH);
+  wchar_t* backslash = wcsrchr(prog_name, '\\');
+  if (backslash)
+    backslash[1] = 0;
+  wcscat_s(prog_name, MAX_PATH, L"debug_message.exe");
+
+  // stupid CreateProcess requires a non-const command line and may modify it.
+  // We also want to use the wide string
+  int charcount = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, NULL, 0);
+  if (!charcount)
+    return;
+  scoped_array<wchar_t> cmdline(new wchar_t[charcount]);
+  if (!MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, cmdline.get(), charcount))
+    return;
+
+  STARTUPINFO startup_info;
+  memset(&startup_info, 0, sizeof(startup_info));
+  startup_info.cb = sizeof(startup_info);
+
+  PROCESS_INFORMATION process_info;
+  if (CreateProcessW(prog_name, cmdline.get(), NULL, NULL, false, 0, NULL,
+                     NULL, &startup_info, &process_info)) {
+    WaitForSingleObject(process_info.hProcess, INFINITE);
+    CloseHandle(process_info.hThread);
+    CloseHandle(process_info.hProcess);
+  } else {
+    // debug process broken, let's just do a message box
+    MessageBoxW(NULL, cmdline.get(), L"Fatal error", MB_OK | MB_ICONHAND);
+  }
+}
+
+// Note: the |ctr| argument is not used by this overload.
+LogMessage::LogMessage(const char* file, int line, LogSeverity severity,
+                       int ctr) : severity_(severity) {
+  Init(file, line);
+}
+
+LogMessage::LogMessage(const char* file, int line, const CheckOpString& result)
+    : severity_(LOG_FATAL) {
+  Init(file, line);
+  stream_ << "Check failed: " << (*result.str_);  // text after the header
+}
+
+// Overload without a severity argument: logs at LOG_INFO.
+LogMessage::LogMessage(const char* file, int line) : severity_(LOG_INFO) {
+  Init(file, line);
+}
+
+LogMessage::LogMessage(const char* file, int line, LogSeverity severity)
+    : severity_(severity) {
+  Init(file, line);
+}
+
+// Writes the header "[<optional fields>SEVERITY:file(line)] " to stream_.
+void LogMessage::Init(const char* file, int line) {
+  // Strip any directory part (backslash separated) from the file name.
+  const char* base_name = strrchr(file, '\\');
+  base_name = base_name ? base_name + 1 : file;
+
+  stream_ << '[';
+  if (log_process_id)
+    stream_ << GetCurrentProcessId() << ':';
+  if (log_thread_id)
+    stream_ << GetCurrentThreadId() << ':';
+  if (log_timestamp) {
+    // Local time, formatted as MMDD/HHMMSS with zero padding.
+    time_t now = time(NULL);
+    struct tm local;
+    localtime_s(&local, &now);
+    stream_ << std::setfill('0');
+    stream_ << std::setw(2) << 1 + local.tm_mon;
+    stream_ << std::setw(2) << local.tm_mday << '/';
+    stream_ << std::setw(2) << local.tm_hour;
+    stream_ << std::setw(2) << local.tm_min;
+    stream_ << std::setw(2) << local.tm_sec << ':';
+  }
+  if (log_tickcount)
+    stream_ << GetTickCount() << ':';
+  stream_ << log_severity_names[severity_] << ":" << base_name;
+  stream_ << "(" << line << ")] ";
+
+  // Remember where the caller's text begins; the prefix filter needs it.
+  message_start_ = stream_.pcount();
+}
+
+LogMessage::~LogMessage() {
+  if (severity_ < min_log_level)
+    return;
+  // str() freezes the stream buffer; the cleanup label below unfreezes it.
+  std::string str_newline(stream_.str(), stream_.pcount());
+  str_newline.append("\r\n");
+
+  if (log_filter_prefix && severity_ <= kMaxFilteredLogLevel &&
+      str_newline.compare(message_start_, strlen(log_filter_prefix),
+                          log_filter_prefix) != 0) {
+    goto cleanup;
+  }
+
+  if (logging_destination != LOG_ONLY_TO_FILE)
+    OutputDebugStringA(str_newline.c_str());
+
+  // write to the log file, serialized by the mutex or the critical section
+  if (logging_destination != LOG_ONLY_TO_SYSTEM_DEBUG_LOG &&
+      InitializeLogFileHandle()) {
+    // we can have multiple threads and/or processes, so try to prevent them from
+    // clobbering each other's writes
+    if (lock_log_file == LOCK_LOG_FILE) {
+      // Ensure that the mutex is initialized in case the client app did not
+      // call InitLogging. This lazy init is not thread safe (see else branch).
+      InitLogMutex();
+
+      DWORD r = ::WaitForSingleObject(log_mutex, INFINITE);
+      DCHECK(r != WAIT_ABANDONED);
+    } else {
+      // use the critical section
+      if (!initialized_critical_section) {
+        // The client app did not call InitLogging, and so the critical section
+        // has not been created. We do this on demand, but if two threads try to
+        // do this at the same time, there will be a race condition to create
+        // the critical section. This is why InitLogging should be called from
+        // the main thread at the beginning of execution.
+        InitializeCriticalSection(&log_critical_section);
+        initialized_critical_section = true;
+      }
+      EnterCriticalSection(&log_critical_section);
+    }
+
+    SetFilePointer(log_file, 0, 0, SEEK_END);
+    DWORD num_written;
+    WriteFile(log_file, (void*)str_newline.c_str(), (DWORD)str_newline.length(), &num_written, NULL);
+
+    if (lock_log_file == LOCK_LOG_FILE) {
+      ReleaseMutex(log_mutex);
+    } else {
+      LeaveCriticalSection(&log_critical_section);
+    }
+  }
+
+  if (severity_ == LOG_FATAL) {
+    // display a message or break into the debugger on a fatal error
+    if (::IsDebuggerPresent()) {
+      DebugBreak();
+    } else {
+      if (log_assert_handler) {
+        log_assert_handler(std::string(stream_.str(), stream_.pcount()));
+      } else {
+        // don't use the string with the newline, get a fresh version to send to
+        // the debug message process
+        DisplayDebugMessage(std::string(stream_.str(), stream_.pcount()));
+        TerminateProcess(GetCurrentProcess(), 1);
+      }
+    }
+  }
+
+cleanup:
+  // Calling stream_.str() freezes the stream buffer.  A frozen buffer will
+  // not be freed during strstreambuf destruction.
+  stream_.freeze(false);
+}
+
+void CloseLogFile() {
+  // Nothing to do when no log file handle is currently open.
+  if (log_file) {
+    CloseHandle(log_file);
+    log_file = NULL;
+  }
+}
+
+} // namespace logging
+
+std::ostream& operator<<(std::ostream& out, const wchar_t* wstr) {
+  if (!wstr || !wstr[0])
+    return out;  // NULL or empty string: nothing to write
+
+  // Measure first; with a length of -1 the count includes the NUL.
+  int charcount = WideCharToMultiByte(CP_UTF8, 0, wstr, -1,
+                                      NULL, 0, NULL, NULL);
+  if (charcount == 0)
+    return out;
+  // Convert; if that fails buf holds no valid string, so stream nothing.
+  scoped_array<char> buf(new char[charcount]);
+  if (!WideCharToMultiByte(CP_UTF8, 0, wstr, -1, buf.get(), charcount, NULL, NULL))
+    return out;
+  return out << buf.get();
+}
diff --git a/googleurl/base/logging.h b/googleurl/base/logging.h
new file mode 100644
index 0000000..0a69613
--- /dev/null
+++ b/googleurl/base/logging.h
@@ -0,0 +1,489 @@
+// Copyright 2006 Google Inc. All Rights Reserved.
+// Author: brettw (Brett Wilson)
+
+#ifndef BASE_LOGGING_H__
+#define BASE_LOGGING_H__
+
+#include <string>
+#include <cstring>
+#include <sstream>
+#ifdef _WIN32
+#include <tchar.h>
+#endif
+
+#include "base/basictypes.h"
+#include "base/scoped_ptr.h"
+
+// Optional message capabilities
+// -----------------------------
+// Assertion failed messages and fatal errors are displayed in a dialog box
+// before the application exits. However, running this UI creates a message
+// loop, which causes application messages to be processed and potentially
+// dispatched to existing application windows. Since the application is in a
+// bad state when this assertion dialog is displayed, these messages may not
+// get processed and hang the dialog, or the application might go crazy.
+//
+// Therefore, it can be beneficial to display the error dialog in a separate
+// process from the main application. When the logging system needs to display
+// a fatal error dialog box, it will look for a program called
+// "DebugMessage.exe" in the same directory as the application executable. It
+// will run this application with the message as the command line, and will
+// not include the name of the application as is traditional for easier
+// parsing.
+//
+// The code for DebugMessage.exe is only one line. In WinMain, do:
+//   MessageBox(NULL, GetCommandLineW(), L"Fatal Error", 0);
+//
+// If DebugMessage.exe is not found, the logging code will use a normal
+// MessageBox, potentially causing the problems discussed above.
+
+
+// Instructions
+// ------------
+//
+// Make a bunch of macros for logging.  The way to log things is to stream
+// things to LOG(<a particular severity level>).  E.g.,
+//
+//   LOG(INFO) << "Found " << num_cookies << " cookies";
+//
+// You can also do conditional logging:
+//
+//   LOG_IF(INFO, num_cookies > 10) << "Got lots of cookies";
+//
+// The above logs the message only when num_cookies is greater than 10; when
+// the condition is false the streamed arguments are not evaluated.
+//
+// There are also "debug mode" logging macros like the ones above:
+//
+//   DLOG(INFO) << "Found cookies";
+//
+//   DLOG_IF(INFO, num_cookies > 10) << "Got lots of cookies";
+//
+// All "debug mode" logging is compiled away to nothing for non-debug mode
+// compiles.  LOG_IF and development flags also work well together
+// because the code can be compiled away sometimes.
+//
+// We also have
+//
+//   LOG_ASSERT(assertion);
+//   DLOG_ASSERT(assertion);
+//
+// which is syntactic sugar for {,D}LOG_IF(FATAL, assert fails) << assertion;
+//
+// We also override the standard 'assert' to use 'DLOG_ASSERT'.
+//
+// The supported severity levels for macros that allow you to specify one
+// are (in increasing order of severity) INFO, WARNING, ERROR, and FATAL.
+//
+// There is also the special severity of DFATAL, which logs FATAL in
+// debug mode, ERROR in normal mode.
+//
+// Very important: logging a message at the FATAL severity level causes
+// the program to terminate (after the message is logged).
+
+namespace logging {
+
+// Where to record logging output? A flat file and/or system debug log via
+// OutputDebugString. Defaults to LOG_ONLY_TO_FILE.
+enum LoggingDestination { LOG_ONLY_TO_FILE,
+                          LOG_ONLY_TO_SYSTEM_DEBUG_LOG,
+                          LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG };
+
+// Indicates that the log file should be locked when being written to.
+// Often, there is no locking, which is fine for a single threaded program.
+// If logging is being done from multiple threads or there can be more than
+// one process doing the logging, the file should be locked during writes to
+// make each log output atomic. Other writers will block.
+//
+// All processes writing to the log file must have their locking set for it to
+// work properly. Defaults to DONT_LOCK_LOG_FILE.
+enum LogLockingState { LOCK_LOG_FILE, DONT_LOCK_LOG_FILE };
+
+// On startup, should we delete or append to an existing log file (if any)?
+// Defaults to APPEND_TO_OLD_LOG_FILE.
+enum OldFileDeletionState { DELETE_OLD_LOG_FILE, APPEND_TO_OLD_LOG_FILE };
+
+// Sets the log file name and other global logging state. Calling this function
+// is recommended, and is normally done at the beginning of application init.
+// If you don't call it, all the flags will be initialized to their default
+// values, and there is a race condition that may leak a critical section
+// object if two threads try to do the first log at the same time.
+// See the definition of the enums above for descriptions and default values.
+//
+// The default log file is initialized to "debug.log" in the application
+// directory. You probably don't want this, especially since the program
+// directory may not be writable on an enduser's system.
+#ifdef _WIN32
+void InitLogging(const TCHAR* log_file, LoggingDestination logging_dest,
+                 LogLockingState lock_log, OldFileDeletionState delete_old);
+#else
+void InitLogging(const char* log_file, LoggingDestination logging_dest,
+                 LogLockingState lock_log, OldFileDeletionState delete_old);
+#endif
+
+// Sets the log level. Anything at or above this level will be written to the
+// log file/displayed to the user (if applicable). Anything below this level
+// will be silently ignored. The log level defaults to 0 (everything is logged)
+// if this function is not called.
+void SetMinLogLevel(int level);
+
+// Sets the log filter prefix.  Any log message below LOG_ERROR severity that
+// doesn't start with this prefix will be silently ignored.  The filter defaults
+// to NULL (everything is logged) if this function is not called.  Messages
+// with severity of LOG_ERROR or higher will not be filtered.
+void SetLogFilterPrefix(char* filter);
+
+// Sets the common items you want to be prepended to each log message.
+// process and thread IDs default to off, the timestamp defaults to on.
+// If this function is not called, logging defaults to writing the timestamp
+// only.
+void SetLogItems(bool enable_process_id, bool enable_thread_id,
+                 bool enable_timestamp, bool enable_tickcount);
+
+// Sets the Log Assert Handler that will be used to notify of check failures.
+// The default handler shows a dialog box, however clients can use this
+// function to override with their own handling (e.g. a silent one for Unit
+// Tests)
+typedef void (*LogAssertHandlerFunction)(const std::string& str);
+void SetLogAssertHandler(LogAssertHandlerFunction handler);
+
+typedef int LogSeverity;
+const LogSeverity LOG_INFO = 0;
+const LogSeverity LOG_WARNING = 1;
+const LogSeverity LOG_ERROR = 2;
+const LogSeverity LOG_FATAL = 3;
+const LogSeverity LOG_NUM_SEVERITIES = 4;
+
+// LOG_DFATAL_LEVEL is LOG_FATAL in debug mode, ERROR in normal mode
+#ifdef NDEBUG
+const LogSeverity LOG_DFATAL_LEVEL = LOG_ERROR;
+#else
+const LogSeverity LOG_DFATAL_LEVEL = LOG_FATAL;
+#endif
+
+// A few definitions of macros that don't generate much code. These are used
+// by LOG() and LOG_IF, etc. Since these are used all over our code, it's
+// better to have compact code for these operations.
+#define COMPACT_GOOGLE_LOG_INFO \
+  logging::LogMessage(__FILE__, __LINE__)
+#define COMPACT_GOOGLE_LOG_WARNING \
+  logging::LogMessage(__FILE__, __LINE__, logging::LOG_WARNING)
+#define COMPACT_GOOGLE_LOG_ERROR \
+  logging::LogMessage(__FILE__, __LINE__, logging::LOG_ERROR)
+#define COMPACT_GOOGLE_LOG_FATAL \
+  logging::LogMessage(__FILE__, __LINE__, logging::LOG_FATAL)
+#define COMPACT_GOOGLE_LOG_DFATAL \
+  logging::LogMessage(__FILE__, __LINE__, logging::LOG_DFATAL_LEVEL)
+
+// wingdi.h defines ERROR to be 0. When we call LOG(ERROR), it gets
+// substituted with 0, and it expands to COMPACT_GOOGLE_LOG_0. To allow us
+// to keep using this syntax, we define this macro to do the same thing
+// as COMPACT_GOOGLE_LOG_ERROR, and also define ERROR the same way that
+// the Windows SDK does for consistency.
+#define ERROR 0
+#define COMPACT_GOOGLE_LOG_0 \
+  logging::LogMessage(__FILE__, __LINE__, logging::LOG_ERROR)
+
+// We use the preprocessor's merging operator, "##", so that, e.g.,
+// LOG(INFO) becomes the token COMPACT_GOOGLE_LOG_INFO.  There's some funny
+// subtle difference between ostream member streaming functions (e.g.,
+// ostream::operator<<(int) and ostream non-member streaming functions
+// (e.g., ::operator<<(ostream&, string&): it turns out that it's
+// impossible to stream something like a string directly to an unnamed
+// ostream. We employ a neat hack by calling the stream() member
+// function of LogMessage which seems to avoid the problem.
+
+#define LOG(severity) COMPACT_GOOGLE_LOG_ ## severity.stream()
+#define SYSLOG(severity) LOG(severity)
+
+#define LOG_IF(severity, condition) \
+  !(condition) ? (void) 0 : logging::LogMessageVoidify() & LOG(severity)
+#define SYSLOG_IF(severity, condition) LOG_IF(severity, condition)
+
+#define LOG_ASSERT(condition)  \
+  LOG_IF(FATAL, !(condition)) << "Assert failed: " #condition ". "
+#define SYSLOG_ASSERT(condition) \
+  SYSLOG_IF(FATAL, !(condition)) << "Assert failed: " #condition ". "
+
+// A container for a string pointer which can be evaluated to a bool -
+// true iff the pointer is non-NULL, i.e. a Check...Impl helper built a
+// failure message because the checked comparison did not hold.
+struct CheckOpString {
+  // Not explicit: the DCHECK_OP / DCHECK_STROP macros copy-initialize a
+  // CheckOpString directly from the std::string* a Check...Impl helper returns.
+  CheckOpString(std::string* str) : str_(str) { }
+  // No destructor: if str_ is non-NULL, we're about to LOG(FATAL),
+  // so there's no point in cleaning up str_.
+  operator bool() const { return str_ != NULL; }
+  // Failure message, or NULL when the check passed.  Public because the
+  // check macros read it directly.
+  std::string* str_;
+};
+
+// Formats the failure text for a binary check: the stringified expression
+// followed by both operand values, e.g. "a == b (1 vs. 2)".  The result is
+// allocated with new.  This lives apart from the inline "Impl" function
+// templates because it is not performance critical and so can be out of line.
+template<class t1, class t2>
+std::string* MakeCheckOpString(const t1& v1, const t2& v2, const char* names) {
+  std::ostringstream message;
+  message << names;
+  message << " (" << v1;
+  message << " vs. " << v2 << ")";
+  return new std::string(message.str());
+}
+
+std::string* MakeCheckOpStringIntInt(int v1, int v2, const char* names);
+
+// Forwards to the out-of-line MakeCheckOpStringIntInt declared above.
+// NOTE(review): "template<int, int>" declares a separate function template
+// with two unnamed non-type int parameters; it is not an explicit
+// specialization of MakeCheckOpString<t1, t2>.  Those parameters cannot be
+// deduced from a call, so MakeCheckOpString(v1, v2, names) presumably always
+// resolves to the generic template -- confirm whether a
+// "template<> ... MakeCheckOpString<int, int>" specialization was intended,
+// and check how MakeCheckOpStringIntInt is implemented before changing this.
+template<int, int>
+std::string* MakeCheckOpString(const int& v1, const int& v2, const char* names) {
+  return MakeCheckOpStringIntInt(v1, v2, names);
+}
+
+// Plus some debug-logging macros that get compiled to nothing for production
+//
+// DEBUG_MODE is for uses like
+//   if (DEBUG_MODE) foo.CheckThatFoo();
+// instead of
+//   #ifndef NDEBUG
+//     foo.CheckThatFoo();
+//   #endif
+
+#ifndef NDEBUG
+
+#define DLOG(severity) LOG(severity)
+#define DLOG_IF(severity, condition) LOG_IF(severity, condition)
+#define DLOG_ASSERT(condition) LOG_ASSERT(condition)
+
+// debug-only checking.  not executed in NDEBUG mode.
+enum { DEBUG_MODE = 1 };
+#define DCHECK(condition) \
+  LOG_IF(FATAL, !(condition)) << "Check failed: " #condition ". "
+
+// Helper functions for DCHECK_OP macro.
+// The (int, int) specialization works around the issue that the compiler
+// will not instantiate the template version of the function on values of
+// unnamed enum type - see comment below.
+#define DEFINE_DCHECK_OP_IMPL(name, op) \
+  template <class t1, class t2> \
+  inline std::string* Check##name##Impl(const t1& v1, const t2& v2, \
+                                        const char* names) { \
+    if (v1 op v2) return NULL; \
+    else return MakeCheckOpString(v1, v2, names); \
+  } \
+  inline std::string* Check##name##Impl(int v1, int v2, const char* names) { \
+    if (v1 op v2) return NULL; \
+    else return MakeCheckOpString(v1, v2, names); \
+  }
+DEFINE_DCHECK_OP_IMPL(EQ, ==)
+DEFINE_DCHECK_OP_IMPL(NE, !=)
+DEFINE_DCHECK_OP_IMPL(LE, <=)
+DEFINE_DCHECK_OP_IMPL(LT, < )
+DEFINE_DCHECK_OP_IMPL(GE, >=)
+DEFINE_DCHECK_OP_IMPL(GT, > )
+#undef DEFINE_DCHECK_OP_IMPL
+
+// Helper macro for binary operators.
+// Don't use this macro directly in your code, use DCHECK_EQ et al below.
+#define DCHECK_OP(name, op, val1, val2)  \
+  while (logging::CheckOpString _result = \
+         logging::Check##name##Impl((val1), (val2), #val1 " " #op " " #val2)) \
+    logging::LogMessage(__FILE__, __LINE__, _result).stream()
+
+// Equality/Inequality checks - compare two values, and log a LOG_FATAL message
+// including the two values when the result is not as expected.  The values
+// must have operator<<(ostream, ...) defined.
+//
+// You may append to the error message like so:
+//   DCHECK_NE(1, 2) << ": The world must be ending!";
+//
+// We are very careful to ensure that each argument is evaluated exactly
+// once, and that anything which is legal to pass as a function argument is
+// legal here.  In particular, the arguments may be temporary expressions
+// which will end up being destroyed at the end of the apparent statement,
+// for example:
+//   DCHECK_EQ(string("abc")[1], 'b');
+//
+// WARNING: These don't compile correctly if one of the arguments is a pointer
+// and the other is NULL. To work around this, simply static_cast NULL to the
+// type of the desired pointer.
+
+#define DCHECK_EQ(val1, val2) DCHECK_OP(EQ, ==, val1, val2)
+#define DCHECK_NE(val1, val2) DCHECK_OP(NE, !=, val1, val2)
+#define DCHECK_LE(val1, val2) DCHECK_OP(LE, <=, val1, val2)
+#define DCHECK_LT(val1, val2) DCHECK_OP(LT, < , val1, val2)
+#define DCHECK_GE(val1, val2) DCHECK_OP(GE, >=, val1, val2)
+#define DCHECK_GT(val1, val2) DCHECK_OP(GT, > , val1, val2)
+
+// Helper functions for string comparisons.
+// To avoid bloat, the definitions are in logging.cc.
+#define DECLARE_DCHECK_STROP_IMPL(func, expected) \
+  std::string* Check##func##expected##Impl(const char* s1, \
+                                           const char* s2, \
+                                           const char* names);
+DECLARE_DCHECK_STROP_IMPL(strcmp, true)
+DECLARE_DCHECK_STROP_IMPL(strcmp, false)
+DECLARE_DCHECK_STROP_IMPL(_stricmp, true)
+DECLARE_DCHECK_STROP_IMPL(_stricmp, false)
+#undef DECLARE_DCHECK_STROP_IMPL
+
+// Helper macro for string comparisons.
+// Don't use this macro directly in your code, use DCHECK_STREQ et al below.
+//
+// Runs the out-of-line Check<func><expected>Impl helper on (s1, s2); when it
+// returns a non-NULL message the check failed and that message is logged at
+// FATAL severity.  CheckOpString is spelled with its logging:: qualifier,
+// exactly as in DCHECK_OP, so the macro also expands correctly in code that
+// is outside namespace logging.
+#define DCHECK_STROP(func, op, expected, s1, s2) \
+  while (logging::CheckOpString _result = \
+      logging::Check##func##expected##Impl((s1), (s2), \
+                                           #s1 " " #op " " #s2)) \
+    LOG(FATAL) << *_result.str_
+
+// String (char*) equality/inequality checks.
+// CASE versions are case-insensitive.
+//
+// Note that "s1" and "s2" may be temporary strings which are destroyed
+// by the compiler at the end of the current "full expression"
+// (e.g. DCHECK_STREQ(Foo().c_str(), Bar().c_str())).
+
+#define DCHECK_STREQ(s1, s2) DCHECK_STROP(strcmp, ==, true, s1, s2)
+#define DCHECK_STRNE(s1, s2) DCHECK_STROP(strcmp, !=, false, s1, s2)
+#define DCHECK_STRCASEEQ(s1, s2) DCHECK_STROP(_stricmp, ==, true, s1, s2)
+#define DCHECK_STRCASENE(s1, s2) DCHECK_STROP(_stricmp, !=, false, s1, s2)
+
+#define DCHECK_INDEX(I,A) DCHECK(I < (sizeof(A)/sizeof(A[0])))
+#define DCHECK_BOUND(B,A) DCHECK(B <= (sizeof(A)/sizeof(A[0])))
+
+#else  // NDEBUG
+
+#define DLOG(severity) \
+  true ? (void) 0 : logging::LogMessageVoidify() & LOG(severity)
+
+#define DLOG_IF(severity, condition) \
+  true ? (void) 0 : logging::LogMessageVoidify() & LOG(severity)
+
+#define DLOG_ASSERT(condition) \
+  true ? (void) 0 : LOG_ASSERT(condition)
+
+enum { DEBUG_MODE = 0 };
+
+// This macro can be followed by a sequence of stream parameters in
+// non-debug mode. The DCHECK and friends macros use this so that
+// the expanded expression DCHECK(foo) << "asdf" is still syntactically
+// valid, even though the expression will get optimized away.
+#define NDEBUG_EAT_STREAM_PARAMETERS \
+  logging::LogMessage(__FILE__, __LINE__).stream()
+
+// Non-debug versions of the DCHECK family.  Each expands to a loop that never
+// runs, followed by a stream expression, so that "DCHECK(x) << ..." still
+// parses while neither the condition nor the streamed arguments are
+// evaluated.  Every check macro defined in the debug branch has a
+// counterpart here (including DCHECK_INDEX and DCHECK_BOUND), so code using
+// them builds with NDEBUG as well.
+#define DCHECK(condition) \
+  while (false) NDEBUG_EAT_STREAM_PARAMETERS
+
+#define DCHECK_EQ(val1, val2) \
+  while (false) NDEBUG_EAT_STREAM_PARAMETERS
+
+#define DCHECK_NE(val1, val2) \
+  while (false) NDEBUG_EAT_STREAM_PARAMETERS
+
+#define DCHECK_LE(val1, val2) \
+  while (false) NDEBUG_EAT_STREAM_PARAMETERS
+
+#define DCHECK_LT(val1, val2) \
+  while (false) NDEBUG_EAT_STREAM_PARAMETERS
+
+#define DCHECK_GE(val1, val2) \
+  while (false) NDEBUG_EAT_STREAM_PARAMETERS
+
+#define DCHECK_GT(val1, val2) \
+  while (false) NDEBUG_EAT_STREAM_PARAMETERS
+
+#define DCHECK_STREQ(str1, str2) \
+  while (false) NDEBUG_EAT_STREAM_PARAMETERS
+
+#define DCHECK_STRCASEEQ(str1, str2) \
+  while (false) NDEBUG_EAT_STREAM_PARAMETERS
+
+#define DCHECK_STRNE(str1, str2) \
+  while (false) NDEBUG_EAT_STREAM_PARAMETERS
+
+#define DCHECK_STRCASENE(str1, str2) \
+  while (false) NDEBUG_EAT_STREAM_PARAMETERS
+
+#define DCHECK_INDEX(I,A) \
+  while (false) NDEBUG_EAT_STREAM_PARAMETERS
+
+#define DCHECK_BOUND(B,A) \
+  while (false) NDEBUG_EAT_STREAM_PARAMETERS
+
+#endif  // NDEBUG
+
+#define NOTREACHED() DCHECK(false)
+
+// Redefine the standard assert to use our nice log files
+#undef assert
+#define assert(x) DLOG_ASSERT(x)
+
+// This class more or less represents a particular log message.  You
+// create an instance of LogMessage and then stream stuff to it.
+// When you finish streaming to it, ~LogMessage is called and the
+// full message gets streamed to the appropriate destination.
+//
+// You shouldn't actually use LogMessage's constructor to log things,
+// though.  You should use the LOG() macro (and variants thereof)
+// above.
+class LogMessage {
+ public:
+  // General form; the constructors below imply some of these arguments.
+  // NOTE(review): the meaning of |ctr| is not visible in this header --
+  // confirm against the definition in logging.cc.
+  LogMessage(const char* file, int line, LogSeverity severity, int ctr);
+
+  // Two special constructors that generate reduced amounts of code at
+  // LOG call sites for common cases.
+  //
+  // Used for LOG(INFO): Implied are:
+  // severity = LOG_INFO, ctr = 0
+  //
+  // Using this constructor instead of the more complex constructor above
+  // saves a couple of bytes per call site.
+  LogMessage(const char* file, int line);
+
+  // Used for LOG(severity) where severity != INFO.  Implied
+  // are: ctr = 0
+  //
+  // Using this constructor instead of the more complex constructor above
+  // saves a couple of bytes per call site.
+  LogMessage(const char* file, int line, LogSeverity severity);
+
+  // A special constructor used for check failures.
+  // Implied severity = LOG_FATAL
+  LogMessage(const char* file, int line, const CheckOpString& result);
+
+  // Emits the accumulated message; a LOG_FATAL message additionally ends the
+  // program (see the "Very important" note near the top of this file).
+  ~LogMessage();
+
+  // The stream that the LOG() family of macros appends message text to.
+  std::ostream& stream() { return stream_; }
+
+ private:
+  // Presumably the setup shared by all constructors -- the definition is not
+  // visible in this header; confirm in logging.cc.
+  void Init(const char* file, int line);
+
+  LogSeverity severity_;
+  // NOTE(review): the destructor in logging.cc calls stream_.pcount() and
+  // stream_.freeze(false), which are std::ostrstream members that
+  // std::ostringstream does not provide -- confirm the two files agree on
+  // the stream type for every platform that is built.
+  std::ostringstream stream_;
+  int message_start_;  // offset of the start of the message (past prefix info).
+
+  DISALLOW_EVIL_CONSTRUCTORS(LogMessage);
+};
+
+// Function-style entry point to the log facility, for callers whose
+// severity is only known at run time rather than at compile time.
+inline void LogAtLevel(int const log_level, std::string const &msg) {
+  LogMessage entry(__FILE__, __LINE__, log_level);
+  entry.stream() << msg;
+}
+
+// Helper that swallows a stream expression and yields void, so that both
+// arms of the ?: in the conditional logging macros have type void.  Doing it
+// this way avoids compiler warnings like "value computed is not used" and
+// "statement has no effect".
+class LogMessageVoidify {
+ public:
+  LogMessageVoidify() {}
+
+  // operator& is used because its precedence is lower than << but higher
+  // than ?:, so the whole streamed expression becomes its operand.
+  void operator&(std::ostream& /* ignored */) {}
+};
+
+// Closes the log file explicitly if open.
+// NOTE: Since the log file is opened as necessary by the action of logging
+//       statements, there's no guarantee that it will stay closed
+//       after this call.
+void CloseLogFile();
+
+} // namespace logging
+
+// These functions are provided as a convenience for logging, which is where we
+// use streams (it is against Google style to use streams in other places). It
+// is designed to allow you to emit non-ASCII Unicode strings to the log file,
+// which is normally ASCII. It is relatively slow, so try not to use it for
+// common cases. Non-ASCII characters will be converted to UTF-8 by these operators.
+std::ostream& operator<<(std::ostream& out, const wchar_t* wstr);
+// std::wstring flavour: forwards the string's NUL-terminated buffer to the
+// const wchar_t* overload declared just above.
+inline std::ostream& operator<<(std::ostream& out, const std::wstring& wstr) {
+  const wchar_t* const raw = wstr.c_str();
+  return out << raw;
+}
+
+#endif  // BASE_LOGGING_H__
diff --git a/googleurl/base/scoped_ptr.h b/googleurl/base/scoped_ptr.h
new file mode 100644
index 0000000..de0b388
--- /dev/null
+++ b/googleurl/base/scoped_ptr.h
@@ -0,0 +1,322 @@
+#ifndef BASE_SCOPED_PTR_H
+#define BASE_SCOPED_PTR_H
+
+//  (C) Copyright Greg Colvin and Beman Dawes 1998, 1999.
+//  Copyright (c) 2001, 2002 Peter Dimov
+//
+//  Permission to copy, use, modify, sell and distribute this software
+//  is granted provided this copyright notice appears in all copies.
+//  This software is provided "as is" without express or implied
+//  warranty, and with no claim as to its suitability for any purpose.
+//
+//  See http://www.boost.org/libs/smart_ptr/scoped_ptr.htm for documentation.
+//
+
+//  scoped_ptr mimics a built-in pointer except that it guarantees deletion
+//  of the object pointed to, either on destruction of the scoped_ptr or via
+//  an explicit reset(). scoped_ptr is a simple solution for simple needs;
+//  use shared_ptr or std::auto_ptr if your needs are more complex.
+
+//  *** NOTE ***
+//  If your scoped_ptr is a class member of class FOO pointing to a
+//  forward declared type BAR (as shown below), then you MUST use a non-inlined
+//  version of the destructor.  The destructor of a scoped_ptr (called from
+//  FOO's destructor) must have a complete definition of BAR in order to
+//  destroy it.  Example:
+//
+//  -- foo.h --
+//  class BAR;
+//
+//  class FOO {
+//   public:
+//    FOO();
+//    ~FOO();  // Required for sources that instantiate class FOO to compile!
+//
+//   private:
+//    scoped_ptr<BAR> bar_;
+//  };
+//
+//  -- foo.cc --
+//  #include "foo.h"
+//  FOO::~FOO() {} // Empty, but must be non-inlined to FOO's class definition.
+
+#include <cstddef>            // for std::ptrdiff_t
+#include <assert.h>           // for assert
+#include <stdlib.h>           // for free() decl
+
+template <typename T>
+class scoped_ptr {
+ public:
+
+  typedef T element_type;
+
+  // Takes ownership of |p|, which may be null.
+  explicit scoped_ptr(T* p = 0): ptr(p) {}
+
+  // Deletes the owned object.  The typedef forces T to be a complete type
+  // at this point.
+  ~scoped_ptr() {
+    typedef char type_must_be_complete[sizeof(T)];
+    delete ptr;
+  }
+
+  // Replaces the owned object with |p|, deleting the previous one.
+  // Resetting to the pointer already held leaves everything untouched.
+  void reset(T* p = 0) {
+    typedef char type_must_be_complete[sizeof(T)];
+
+    if (ptr == p)
+      return;
+    delete ptr;
+    ptr = p;
+  }
+
+  // Dereferencing asserts that an object is held.
+  T& operator*() const {
+    assert(ptr != 0);
+    return *ptr;
+  }
+
+  T* operator->() const  {
+    assert(ptr != 0);
+    return ptr;
+  }
+
+  // Identity comparisons against a raw pointer.
+  bool operator==(T* p) const {
+    return p == ptr;
+  }
+
+  bool operator!=(T* p) const {
+    return !(p == ptr);
+  }
+
+  // Returns the held pointer; ownership stays with this object.
+  T* get() const  {
+    return ptr;
+  }
+
+  // Exchanges the held pointers of the two objects.
+  void swap(scoped_ptr & b) {
+    T* const mine = ptr;
+    ptr = b.ptr;
+    b.ptr = mine;
+  }
+
+  // Gives up ownership: returns the held pointer and leaves this object
+  // holding null.
+  T* release() {
+    T* const released = ptr;
+    ptr = 0;
+    return released;
+  }
+
+ private:
+
+  T* ptr;
+
+  // Copying is disabled: these are private and have no definition here.
+  scoped_ptr(scoped_ptr const &);
+  scoped_ptr & operator=(scoped_ptr const &);
+
+  // no reason to use these: each scoped_ptr should have its own object
+  template <typename U> bool operator==(scoped_ptr<U> const& p) const;
+  template <typename U> bool operator!=(scoped_ptr<U> const& p) const;
+};
+
+// Free-function swap: exchanges the pointers held by |a| and |b|.
+template<typename T> inline
+void swap(scoped_ptr<T>& a, scoped_ptr<T>& b) {
+  b.swap(a);
+}
+
+// Comparisons with the raw pointer on the left-hand side.
+template<typename T> inline
+bool operator==(T* p, const scoped_ptr<T>& b) {
+  return b.get() == p;
+}
+
+template<typename T> inline
+bool operator!=(T* p, const scoped_ptr<T>& b) {
+  return !(b.get() == p);
+}
+
+//  scoped_array extends scoped_ptr to arrays. Deletion of the array pointed to
+//  is guaranteed, either on destruction of the scoped_array or via an explicit
+//  reset(). Use shared_array or std::vector if your needs are more complex.
+
+template<typename T>
+class scoped_array {
+ public:
+
+  typedef T element_type;
+
+  // Takes ownership of the array |p|, which may be null.
+  explicit scoped_array(T* p = 0) : ptr(p) {}
+
+  // Deletes the owned array with delete[].  The typedef forces T to be a
+  // complete type at this point.
+  ~scoped_array() {
+    typedef char type_must_be_complete[sizeof(T)];
+    delete[] ptr;
+  }
+
+  // Replaces the owned array with |p|, deleting the previous one.
+  // Resetting to the pointer already held leaves everything untouched.
+  void reset(T* p = 0) {
+    typedef char type_must_be_complete[sizeof(T)];
+
+    if (ptr == p)
+      return;
+    delete [] ptr;
+    ptr = p;
+  }
+
+  // Element access; asserts that an array is held and that the index is not
+  // negative.  No upper bound is checked.
+  T& operator[](std::ptrdiff_t i) const {
+    assert(ptr != 0);
+    assert(i >= 0);
+    return *(ptr + i);
+  }
+
+  // Identity comparisons against a raw pointer.
+  bool operator==(T* p) const {
+    return p == ptr;
+  }
+
+  bool operator!=(T* p) const {
+    return !(p == ptr);
+  }
+
+  // Returns the held pointer; ownership stays with this object.
+  T* get() const {
+    return ptr;
+  }
+
+  // Exchanges the held pointers of the two objects.
+  void swap(scoped_array & b) {
+    T* const mine = ptr;
+    ptr = b.ptr;
+    b.ptr = mine;
+  }
+
+  // Gives up ownership: returns the held pointer and leaves this object
+  // holding null.
+  T* release() {
+    T* const released = ptr;
+    ptr = 0;
+    return released;
+  }
+
+ private:
+
+  T* ptr;
+
+  // Copying is disabled: these are private and have no definition here.
+  scoped_array(scoped_array const &);
+  scoped_array & operator=(scoped_array const &);
+
+  // no reason to use these: each scoped_array should have its own object
+  template <typename U> bool operator==(scoped_array<U> const& p) const;
+  template <typename U> bool operator!=(scoped_array<U> const& p) const;
+};
+
+// Free-function swap: exchanges the arrays held by |a| and |b|.
+template<class T> inline
+void swap(::scoped_array<T>& a, ::scoped_array<T>& b) {
+  b.swap(a);
+}
+
+// Comparisons with the raw pointer on the left-hand side.
+template<typename T> inline
+bool operator==(T* p, const ::scoped_array<T>& b) {
+  return b.get() == p;
+}
+
+template<typename T> inline
+bool operator!=(T* p, const ::scoped_array<T>& b) {
+  return !(b.get() == p);
+}
+
+
+// Functor adapter around the C library function free(), so that it can be
+// passed as the FreeProc template argument of scoped_ptr_malloc below.
+class ScopedPtrMallocFree {
+ public:
+  // Releases |x| with free().
+  inline void operator()(void* x) const {
+    ::free(x);
+  }
+};
+
+// scoped_ptr_malloc<> is similar to scoped_ptr<>, but it accepts a
+// second template argument, the functor used to free the object.
+
+template<typename T, typename FreeProc = ScopedPtrMallocFree>
+class scoped_ptr_malloc {
+ public:
+
+  typedef T element_type;
+
+  // Takes ownership of |p|, which may be null.
+  explicit scoped_ptr_malloc(T* p = 0): ptr(p) {}
+
+  // Hands the held pointer to the FreeProc functor.  The functor is invoked
+  // even when the held pointer is null.
+  ~scoped_ptr_malloc() {
+    typedef char type_must_be_complete[sizeof(T)];
+    free_((void*) ptr);
+  }
+
+  // Replaces the held pointer with |p|, passing the previous one to the
+  // FreeProc functor.  Resetting to the pointer already held leaves
+  // everything untouched.
+  void reset(T* p = 0) {
+    typedef char type_must_be_complete[sizeof(T)];
+
+    if (ptr == p)
+      return;
+    free_((void*) ptr);
+    ptr = p;
+  }
+
+  // Dereferencing asserts that an object is held.
+  T& operator*() const {
+    assert(ptr != 0);
+    return *ptr;
+  }
+
+  T* operator->() const {
+    assert(ptr != 0);
+    return ptr;
+  }
+
+  // Identity comparisons against a raw pointer.
+  bool operator==(T* p) const {
+    return p == ptr;
+  }
+
+  bool operator!=(T* p) const {
+    return !(p == ptr);
+  }
+
+  // Returns the held pointer; ownership stays with this object.
+  T* get() const {
+    return ptr;
+  }
+
+  // Exchanges the held pointers of the two objects.
+  void swap(scoped_ptr_malloc & b) {
+    T* const mine = ptr;
+    ptr = b.ptr;
+    b.ptr = mine;
+  }
+
+  // Gives up ownership: returns the held pointer and leaves this object
+  // holding null.
+  T* release() {
+    T* const released = ptr;
+    ptr = 0;
+    return released;
+  }
+
+ private:
+
+  T* ptr;
+
+  // Copying is disabled: these are private and have no definition here.
+  scoped_ptr_malloc(scoped_ptr_malloc const &);
+  scoped_ptr_malloc & operator=(scoped_ptr_malloc const &);
+
+  // no reason to use these: each scoped_ptr_malloc should have its own object
+  template <typename U, typename GP>
+  bool operator==(scoped_ptr_malloc<U, GP> const& p) const;
+  template <typename U, typename GP>
+  bool operator!=(scoped_ptr_malloc<U, GP> const& p) const;
+
+  // One shared functor instance per <T, FreeProc> instantiation.
+  static FreeProc const free_;
+};
+
+// Definition of the shared functor instance declared in the class.
+template<typename T, typename FP>
+FP const scoped_ptr_malloc<T,FP>::free_ = FP();
+
+// Free-function swap: exchanges the pointers held by |a| and |b|.
+template<typename T, typename FP> inline
+void swap(scoped_ptr_malloc<T,FP>& a, scoped_ptr_malloc<T,FP>& b) {
+  b.swap(a);
+}
+
+// Comparisons with the raw pointer on the left-hand side.
+template<typename T, typename FP> inline
+bool operator==(T* p, const scoped_ptr_malloc<T,FP>& b) {
+  return b.get() == p;
+}
+
+template<typename T, typename FP> inline
+bool operator!=(T* p, const scoped_ptr_malloc<T,FP>& b) {
+  return !(b.get() == p);
+}
+
+#endif  // #ifndef BASE_SCOPED_PTR_H
diff --git a/googleurl/base/string16.cc b/googleurl/base/string16.cc
new file mode 100644
index 0000000..fc25809
--- /dev/null
+++ b/googleurl/base/string16.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "base/string16.h"
+
+#ifdef WIN32
+
+#error This file should not be used on 2-byte wchar_t systems
+// If this winds up being needed on 2-byte wchar_t systems, either the
+// definitions below can be used, or the host system's wide character
+// functions like wmemcmp can be wrapped.
+
+#else  // !WIN32
+
+namespace base {
+
+// Compares the first |n| char16 units of |s1| and |s2|.  Returns 0 when they
+// are all equal, otherwise -1 or 1 according to the first differing unit.
+int c16memcmp(const char16* s1, const char16* s2, size_t n) {
+  // memcmp is not usable here because it would change the semantics.
+  for (size_t i = 0; i < n; ++i) {
+    const char16 left = s1[i];
+    const char16 right = s2[i];
+    if (left == right)
+      continue;
+    // Subtracting the two values is not an option: char16 is unsigned.
+    return (left < right) ? -1 : 1;
+  }
+  return 0;
+}
+
+// Returns the number of char16 units that precede the terminating zero unit.
+size_t c16len(const char16* s) {
+  size_t length = 0;
+  while (s[length])
+    ++length;
+  return length;
+}
+
+// Returns a pointer to the first of the |n| units starting at |s| that
+// equals |c|, or null when none of them does.
+const char16* c16memchr(const char16* s, char16 c, size_t n) {
+  for (size_t i = 0; i < n; ++i) {
+    if (s[i] == c)
+      return s + i;
+  }
+  return 0;
+}
+
+// memmove() counted in char16 units rather than bytes; returns |s1|.
+char16* c16memmove(char16* s1, const char16* s2, size_t n) {
+  const size_t byte_count = n * sizeof(char16);
+  void* const moved = memmove(s1, s2, byte_count);
+  return static_cast<char16*>(moved);
+}
+
+// memcpy() counted in char16 units rather than bytes; returns |s1|.
+char16* c16memcpy(char16* s1, const char16* s2, size_t n) {
+  const size_t byte_count = n * sizeof(char16);
+  void* const copied = memcpy(s1, s2, byte_count);
+  return static_cast<char16*>(copied);
+}
+
+// Stores |c| into each of the |n| units starting at |s|; returns |s|.
+char16* c16memset(char16* s, char16 c, size_t n) {
+  for (size_t i = 0; i < n; ++i)
+    s[i] = c;
+  return s;
+}
+
+}  // namespace base
+
+template class std::basic_string<char16, base::string16_char_traits>;
+
+#endif  // WIN32
diff --git a/googleurl/base/string16.h b/googleurl/base/string16.h
new file mode 100644
index 0000000..ed77165
--- /dev/null
+++ b/googleurl/base/string16.h
@@ -0,0 +1,193 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef BASE_STRING16_H_
+#define BASE_STRING16_H_
+
+// WHAT:
+// A version of std::basic_string that provides 2-byte characters even when
+// wchar_t is not implemented as a 2-byte type. You can access this class as
+// string16. We also define char16, which string16 is based upon.
+//
+// WHY:
+// On Windows, wchar_t is 2 bytes, and it can conveniently handle UTF-16/UCS-2
+// data. Plenty of existing code operates on strings encoded as UTF-16.
+//
+// On many other platforms, sizeof(wchar_t) is 4 bytes by default. We can make
+// it 2 bytes by using the GCC flag -fshort-wchar. But then std::wstring fails
+// at run time, because it calls some functions (like wcslen) that come from
+// the system's native C library -- which was built with a 4-byte wchar_t!
+// It's wasteful to use 4-byte wchar_t strings to carry UTF-16 data, and it's
+// entirely improper on those systems where the encoding of wchar_t is defined
+// as UTF-32.
+//
+// Here, we define string16, which is similar to std::wstring but replaces all
+// libc functions with custom, 2-byte-char compatible routines. It is capable
+// of carrying UTF-16-encoded data.
+
+#include <string>
+#include <cstdio>
+
+#include "base/basictypes.h"
+
+#ifdef WIN32
+
+typedef wchar_t char16;
+typedef std::wstring string16;
+
+#else  // !WIN32
+
+typedef uint16 char16;
+
+namespace base {
+
+// char16 counterparts of the wide character functions of similar names ("w"
+// or "wcs" instead of "c16"); string16_char_traits is built on top of them.
+int c16memcmp(const char16* s1, const char16* s2, size_t n);
+size_t c16len(const char16* s);
+const char16* c16memchr(const char16* s, char16 c, size_t n);
+char16* c16memmove(char16* s1, const char16* s2, size_t n);
+char16* c16memcpy(char16* s1, const char16* s2, size_t n);
+char16* c16memset(char16* s, char16 c, size_t n);
+
+// Character traits that let std::basic_string operate on char16 units.
+struct string16_char_traits {
+  typedef char16 char_type;
+  typedef int int_type;
+
+  typedef std::streamoff off_type;
+  typedef mbstate_t state_type;
+  typedef std::fpos<state_type> pos_type;
+
+  static void assign(char_type& c1, const char_type& c2) {
+    c1 = c2;
+  }
+
+  static bool eq(const char_type& c1, const char_type& c2) {
+    return c1 == c2;
+  }
+  static bool lt(const char_type& c1, const char_type& c2) {
+    return c1 < c2;
+  }
+
+  static int compare(const char_type* s1, const char_type* s2, size_t n) {
+    return c16memcmp(s1, s2, n);
+  }
+
+  static size_t length(const char_type* s) {
+    return c16len(s);
+  }
+
+  static const char_type* find(const char_type* s, size_t n,
+                               const char_type& a) {
+    return c16memchr(s, a, n);
+  }
+  // |n| is a size_t like in copy() below; a signed int would narrow counts.
+  static char_type* move(char_type* s1, const char_type* s2, size_t n) {
+    return c16memmove(s1, s2, n);
+  }
+
+  static char_type* copy(char_type* s1, const char_type* s2, size_t n) {
+    return c16memcpy(s1, s2, n);
+  }
+
+  static char_type* assign(char_type* s, size_t n, char_type a) {
+    return c16memset(s, a, n);
+  }
+
+  static int_type not_eof(const int_type& c) {
+    return eq_int_type(c, eof()) ? 0 : c;
+  }
+
+  static char_type to_char_type(const int_type& c) {
+    return char_type(c);
+  }
+
+  static int_type to_int_type(const char_type& c) {
+    return int_type(c);
+  }
+
+  static bool eq_int_type(const int_type& c1, const int_type& c2) {
+    return c1 == c2;
+  }
+
+  static int_type eof() {
+    return static_cast<int_type>(EOF);
+  }
+};
+
+}  // namespace base
+
+// The string class will be explicitly instantiated only once, in string16.cc.
+//
+// std::basic_string<> in GNU libstdc++ contains a static data member,
+// _S_empty_rep_storage, to represent empty strings.  When an operation such
+// as assignment or destruction is performed on a string, causing its existing
+// data member to be invalidated, it must not be freed if this static data
+// member is being used.  Otherwise, it counts as an attempt to free static
+// (and not allocated) data, which is a memory error.
+//
+// Generally, due to C++ template magic, _S_empty_rep_storage will be marked
+// as a coalesced symbol, meaning that the linker will combine multiple
+// instances into a single one when generating output.
+//
+// If a string class is used by multiple shared libraries, a problem occurs.
+// Each library will get its own copy of _S_empty_rep_storage.  When strings
+// are passed across a library boundary for alteration or destruction, memory
+// errors will result.  GNU libstdc++ contains a configuration option,
+// --enable-fully-dynamic-string (_GLIBCXX_FULLY_DYNAMIC_STRING), which
+// disables the static data member optimization, but it's a good optimization
+// and non-STL code is generally at the mercy of the system's STL
+// configuration.  Fully-dynamic strings are not the default for GNU
+// libstdc++ itself or for the libstdc++ installations on the systems we care
+// about, such as Mac OS X and relevant flavors of Linux.
+//
+// See also http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24196 .
+//
+// To avoid problems, string classes need to be explicitly instantiated only
+// once, in exactly one library.  All other string users see it via an "extern"
+// declaration.  This is precisely how GNU libstdc++ handles
+// std::basic_string<char> (string) and std::basic_string<wchar_t> (wstring).
+//
+// This also works around a Mac OS X linker bug in ld64-85.2.1 (Xcode 3.1.2),
+// in which the linker does not fully coalesce symbols when dead code
+// stripping is enabled.  This bug causes the memory errors described above
+// to occur even when a std::basic_string<> does not cross shared library
+// boundaries, such as in statically-linked executables.
+//
+// TODO(mark): File this bug with Apple and update this note with a bug number.
+
+extern template class std::basic_string<char16, base::string16_char_traits>;
+// Declaration only ("extern"): the one instantiation lives in string16.cc.
+typedef std::basic_string<char16, base::string16_char_traits> string16;
+// Stream insertion for string16; only declared in this header.
+std::ostream& operator<<(std::ostream& out, const string16& str);
+
+#endif  // !WIN32
+
+#endif  // BASE_STRING16_H_
diff --git a/googleurl/gurl.h b/googleurl/gurl.h
new file mode 100644
index 0000000..c6b3712
--- /dev/null
+++ b/googleurl/gurl.h
@@ -0,0 +1,392 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLEURL_SRC_GURL_H__
+#define GOOGLEURL_SRC_GURL_H__
+
+#include <iosfwd>
+#include <string>
+
+#include "base/string16.h"
+#include "url_canon.h"
+#include "url_canon_stdstring.h"
+#include "url_common.h"
+#include "url_parse.h"
+
+class GURL {
+ public:
+  typedef url_canon::StdStringReplacements<std::string> Replacements;
+  typedef url_canon::StdStringReplacements<string16> ReplacementsW;
+
+  // Creates an empty, invalid URL.
+  GURL_API GURL();
+
+  // Copy construction is relatively inexpensive, with most of the time going
+  // to reallocating the string. It does not re-parse.
+  GURL_API GURL(const GURL& other);
+
+  // The narrow version requires the input be UTF-8. Invalid UTF-8 input will
+  // result in an invalid URL.
+  //
+  // The wide version should also take an encoding parameter so we know how to
+  // encode the query parameters. It is probably sufficient for the narrow
+  // version to assume the query parameter encoding should be the same as the
+  // input encoding.
+  GURL_API explicit GURL(const std::string& url_string
+                         /*, output_param_encoding*/);
+  GURL_API explicit GURL(const string16& url_string
+                         /*, output_param_encoding*/);
+
+  // Constructor for URLs that have already been parsed and canonicalized. This
+  // is used for conversions from KURL, for example. The caller must supply all
+  // information associated with the URL, which must be correct and consistent.
+  GURL_API GURL(const char* canonical_spec, size_t canonical_spec_len,
+                const url_parse::Parsed& parsed, bool is_valid);
+
+  GURL_API ~GURL();
+
+  GURL_API GURL& operator=(const GURL& other);
+
+  // Returns true when this object represents a valid parsed URL. When not
+  // valid, other functions will still succeed, but you will not get canonical
+  // data out in the format you may be expecting. Instead, we keep something
+  // "reasonable looking" so that the user can see how it's busted if
+  // displayed to them.
+  bool is_valid() const {
+    return is_valid_;
+  }
+
+  // Returns true if the URL is zero-length. Note that empty URLs are also
+  // invalid, and is_valid() will return false for them. This is provided
+  // because some users may want to treat the empty case differently.
+  bool is_empty() const {
+    return spec_.empty();
+  }
+
+  // Returns the raw spec, i.e., the full text of the URL, in canonical UTF-8,
+  // if the URL is valid. If the URL is not valid, this will assert and return
+  // the empty string (for safety in release builds, to keep them from being
+  // misused which might be a security problem).
+  //
+  // The URL will be ASCII except the reference fragment, which may be UTF-8.
+  // It is guaranteed to be valid UTF-8.
+  //
+  // The exception is for is_empty() URLs (which are !is_valid()): for those
+  // this will return the empty string without asserting.
+  //
+  // Use possibly_invalid_spec() below to get the unusable spec of an invalid
+  // URL. This separation is designed to prevent security problems that could
+  // result from the mistaken use of an invalid URL.
+  GURL_API const std::string& spec() const;
+
+  // Returns the potentially invalid spec for the URL. This spec MUST NOT be
+  // modified or sent over the network. It is designed to be displayed in error
+  // messages to the user, as the appearance of the spec may explain the error.
+  // If the spec is valid, the valid spec will be returned.
+  //
+  // The returned string is guaranteed to be valid UTF-8.
+  const std::string& possibly_invalid_spec() const {
+    return spec_;
+  }
+
+  // Getter for the raw parsed structure. This allows callers to locate parts
+  // of the URL within the spec themselves. Most callers should consider using
+  // the individual component getters below.
+  //
+  // The returned parsed structure will reference into the raw spec, which may
+  // or may not be valid. If you are using this to index into the spec, BE
+  // SURE YOU ARE USING possibly_invalid_spec() to get the spec, and that you
+  // don't do anything "important" with invalid specs.
+  const url_parse::Parsed& parsed_for_possibly_invalid_spec() const {
+    return parsed_;
+  }
+
+  // Equality compares the stored spec strings, whether valid or not.
+  bool operator==(const GURL& other) const {
+    return spec_ == other.spec_;
+  }
+  bool operator!=(const GURL& other) const {
+    return spec_ != other.spec_;
+  }
+
+  // Allows GURL to be used as a key in STL (for example, a std::set or
+  // std::map); the ordering is that of the stored spec strings.
+  bool operator<(const GURL& other) const {
+    return spec_ < other.spec_;
+  }
+
+  // Resolves a URL that's possibly relative to this object's URL, and returns
+  // it. Absolute URLs are also handled according to the rules of URLs on web
+  // pages.
+  //
+  // It may be impossible to resolve the URLs properly. If the input is not
+  // "standard" (SchemeIsStandard() == false) and the input looks relative, we
+  // can't resolve it. In these cases, the result will be an empty, invalid
+  // GURL.
+  //
+  // The result may also be a nonempty, invalid URL if the input has some kind
+  // of encoding error. In these cases, we will try to construct a "good" URL
+  // that may have meaning to the user, but it will be marked invalid.
+  //
+  // It is an error to resolve a URL relative to an invalid URL. The result
+  // will be the empty URL.
+  GURL_API GURL Resolve(const std::string& relative) const;
+  GURL_API GURL Resolve(const string16& relative) const;
+
+  // Like Resolve() above but takes a character set encoder which will be used
+  // for any query text specified in the input. The charset converter parameter
+  // may be NULL, in which case it will be treated as UTF-8.
+  //
+  // TODO(brettw): These should be replaced with versions that take something
+  // more friendly than a raw CharsetConverter (maybe like an ICU character set
+  // name).
+  GURL_API GURL ResolveWithCharsetConverter(
+      const std::string& relative,
+      url_canon::CharsetConverter* charset_converter) const;
+  GURL_API GURL ResolveWithCharsetConverter(
+      const string16& relative,
+      url_canon::CharsetConverter* charset_converter) const;
+
+  // Creates a new GURL by replacing the current URL's components with the
+  // supplied versions. See the Replacements class in url_canon.h for more.
+  //
+  // These are not particularly quick, so avoid doing mutations when possible.
+  // Prefer the 8-bit version when possible.
+  //
+  // It is an error to replace components of an invalid URL. The result will
+  // be the empty URL.
+  //
+  // Note that we use the more general url_canon::Replacements type to give
+  // callers extra flexibility rather than our override.
+  GURL_API GURL ReplaceComponents(
+      const url_canon::Replacements<char>& replacements) const;
+  GURL_API GURL ReplaceComponents(
+      const url_canon::Replacements<char16>& replacements) const;
+
+  // A helper function that is equivalent to replacing the path with a slash
+  // and clearing out everything after that. We sometimes need to know just the
+  // scheme and the authority. If this URL is not a standard URL (it doesn't
+  // have the regular authority and path sections), then the result will be
+  // an empty, invalid GURL. Note that this *does* work for file: URLs, which
+  // some callers may want to filter out before calling this.
+  //
+  // It is an error to get an empty path on an invalid URL. The result
+  // will be the empty URL.
+  GURL_API GURL GetWithEmptyPath() const;
+
+  // A helper function to return a GURL containing just the scheme, host,
+  // and port from a URL. Equivalent to clearing any username and password,
+  // replacing the path with a slash, and clearing everything after that. If
+  // this URL is not a standard URL, then the result will be an empty,
+  // invalid GURL. If the URL has neither username nor password, this
+  // degenerates to GetWithEmptyPath().
+  //
+  // It is an error to get the origin of an invalid URL. The result
+  // will be the empty URL.
+  GURL_API GURL GetOrigin() const;
+
+  // Returns true if the scheme for the current URL is a known "standard"
+  // scheme. Standard schemes have an authority and a path section. This
+  // includes file: and filesystem:, which some callers may want to filter out
+  // explicitly by calling SchemeIsFile[System].
+  GURL_API bool IsStandard() const;
+
+  // Returns true if the given parameter (should be lower-case ASCII to match
+  // the canonicalized scheme) is the scheme for this URL. This call is more
+  // efficient than getting the scheme and comparing it because no copies or
+  // object constructions are done.
+  GURL_API bool SchemeIs(const char* lower_ascii_scheme) const;
+
+  // We often need to know if this is a file URL. File URLs are "standard", but
+  // are often treated separately by some programs.
+  bool SchemeIsFile() const {
+    return SchemeIs("file");
+  }
+
+  // FileSystem URLs need to be treated differently in some cases.
+  bool SchemeIsFileSystem() const {
+    return SchemeIs("filesystem");
+  }
+
+  // True for https, or for a filesystem: URL whose inner URL is secure.
+  bool SchemeIsSecure() const {
+    return SchemeIs("https") ||
+        (SchemeIsFileSystem() && inner_url() && inner_url()->SchemeIsSecure());
+  }
+
+  // Returns true if the hostname is an IP address. Note: this function isn't
+  // as cheap as a simple getter because it re-parses the hostname to verify.
+  // This currently identifies only IPv4 addresses (bug 822685).
+  GURL_API bool HostIsIPAddress() const;
+
+  // Getters for various components of the URL. The returned string will be
+  // empty if the component is empty or is not present.
+  std::string scheme() const {  // Not including the colon. See also SchemeIs.
+    return ComponentString(parsed_.scheme);
+  }
+  std::string username() const {
+    return ComponentString(parsed_.username);
+  }
+  std::string password() const {
+    return ComponentString(parsed_.password);
+  }
+  // Note that this may be a hostname, an IPv4 address, or an IPv6 literal
+  // surrounded by square brackets, like "[2001:db8::1]".  To exclude these
+  // brackets, use HostNoBrackets() below.
+  std::string host() const {
+    return ComponentString(parsed_.host);
+  }
+  std::string port() const {  // Empty if absent. See also IntPort.
+    return ComponentString(parsed_.port);
+  }
+  std::string path() const {  // Including first slash following host
+    return ComponentString(parsed_.path);
+  }
+  std::string query() const {  // Stuff following '?'
+    return ComponentString(parsed_.query);
+  }
+  std::string ref() const {  // Stuff following '#'
+    return ComponentString(parsed_.ref);
+  }
+
+  // Existence querying. These functions will return true if the corresponding
+  // URL component exists in this URL. Note that existence is different than
+  // being nonempty. http://www.google.com/? has a query that just happens to
+  // be empty, and has_query() will return true.
+  bool has_scheme() const {
+    return parsed_.scheme.len >= 0;
+  }
+  bool has_username() const {
+    return parsed_.username.len >= 0;
+  }
+  bool has_password() const {
+    return parsed_.password.len >= 0;
+  }
+  bool has_host() const {
+    // Note that hosts are special, absence of host means length 0.
+    return parsed_.host.len > 0;
+  }
+  bool has_port() const {
+    return parsed_.port.len >= 0;
+  }
+  bool has_path() const {
+    // Note that "http://www.google.com/" has a path, the path is "/". This can
+    // return false only for invalid or nonstandard URLs.
+    return parsed_.path.len >= 0;
+  }
+  bool has_query() const {
+    return parsed_.query.len >= 0;
+  }
+  bool has_ref() const {
+    return parsed_.ref.len >= 0;
+  }
+
+  // Returns a parsed version of the port. Can also be any of the special
+  // values defined in Parsed for ExtractPort.
+  GURL_API int IntPort() const;
+
+  // Returns the port number of the url, or the default port number.
+  // If the scheme has no concept of port (or unknown default) returns
+  // PORT_UNSPECIFIED.
+  GURL_API int EffectiveIntPort() const;
+
+  // Extracts the filename portion of the path and returns it. The filename
+  // is everything after the last slash in the path. This may be empty.
+  GURL_API std::string ExtractFileName() const;
+
+  // Returns the path that should be sent to the server. This is the path,
+  // parameter, and query portions of the URL. It is guaranteed to be ASCII.
+  GURL_API std::string PathForRequest() const;
+
+  // Returns the host, excluding the square brackets surrounding IPv6 address
+  // literals.  This can be useful for passing to getaddrinfo().
+  GURL_API std::string HostNoBrackets() const;
+
+  // Returns true if this URL's host matches or is in the same domain as
+  // the given input string. For example if this URL was "www.google.com",
+  // this would match "com", "google.com", and "www.google.com"
+  // (input domain should be lower-case ASCII to match the canonicalized
+  // host). This call is more efficient than getting the host and checking
+  // whether the host has the specific domain or not because no copies or
+  // object constructions are done.
+  //
+  // This overload takes domain_len, so lower_ascii_domain is not required to
+  // be NUL-terminated.
+  GURL_API bool DomainIs(const char* lower_ascii_domain, int domain_len) const;
+
+  // This overload takes only lower_ascii_domain, which must therefore be
+  // NUL-terminated (its length is obtained with strlen()).
+  bool DomainIs(const char* lower_ascii_domain) const {
+    return DomainIs(lower_ascii_domain,
+                    static_cast<int>(strlen(lower_ascii_domain)));
+  }
+
+  // Swaps the contents of this GURL object with the argument without doing
+  // any memory allocations.
+  GURL_API void Swap(GURL* other);
+
+  // Returns a reference to a singleton empty GURL. This object is for callers
+  // who return references but don't have anything to return in some cases.
+  // This function may be called from any thread.
+  GURL_API static const GURL& EmptyGURL();
+
+  // Returns the inner URL of a nested URL [currently only non-null for
+  // filesystem: URLs].
+  const GURL* inner_url() const {
+    return inner_url_;
+  }
+
+ private:
+  // Returns the substring of the input identified by the given component, or
+  // an empty string when the component's length is zero or negative.
+  std::string ComponentString(const url_parse::Component& comp) const {
+    if (comp.len <= 0)
+      return std::string();
+    return std::string(spec_, comp.begin, comp.len);
+  }
+
+  // The actual text of the URL, in canonical ASCII form.
+  std::string spec_;
+
+  // Set when the given URL is valid. Otherwise, we may still have a spec and
+  // components, but they may not identify valid resources (for example, an
+  // invalid port number, invalid characters in the scheme, etc.).
+  bool is_valid_;
+
+  // Identified components of the canonical spec.
+  url_parse::Parsed parsed_;
+
+  // Used for nested schemes [currently only filesystem:].
+  GURL* inner_url_;
+
+  // TODO bug 684583: Add encoding for query params.
+};
+
+// Stream operator so GURL can be used in assertion statements.
+GURL_API std::ostream& operator<<(std::ostream& out, const GURL& url);
+
+#endif  // GOOGLEURL_SRC_GURL_H__
diff --git a/googleurl/url_canon.h b/googleurl/url_canon.h
new file mode 100644
index 0000000..a3009fe
--- /dev/null
+++ b/googleurl/url_canon.h
@@ -0,0 +1,912 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#ifndef GOOGLEURL_SRC_URL_CANON_H__
+#define GOOGLEURL_SRC_URL_CANON_H__
+
+#include <string.h>
+#include <stdlib.h>
+
+#include "base/string16.h"
+#include "url_common.h"
+#include "url_parse.h"
+
+namespace url_canon {
+
+// Canonicalizer output -------------------------------------------------------
+
+// Base class for the canonicalizer output, this maintains a buffer and
+// supports simple resizing and append operations on it.
+//
+// It is VERY IMPORTANT that no virtual function calls be made on the common
+// code path. We only have two virtual function calls, the destructor and a
+// resize function that is called when the existing buffer is not big enough.
+// The derived class is then in charge of setting up our buffer which we will
+// manage.
+template<typename T>
+class CanonOutputT {
+ public:
+  CanonOutputT() : buffer_(NULL), buffer_len_(0), cur_len_(0) {
+  }
+  virtual ~CanonOutputT() {
+  }
+
+  // Implemented to resize the buffer. This function should update the buffer
+  // pointer to point to the new buffer, and any old data up to |cur_len_| in
+  // the buffer must be copied over.
+  //
+  // The new size |sz| must be larger than buffer_len_.
+  virtual void Resize(int sz) = 0;
+
+  // Returns the character at the given position as a T, so that elements of
+  // wide buffers are not narrowed to char. The offset must be in range.
+  inline T at(int offset) const {
+    return buffer_[offset];
+  }
+
+  // Sets the character at the given position. The given position MUST be less
+  // than the length().
+  inline void set(int offset, int ch) {
+    buffer_[offset] = ch;
+  }
+
+  // Returns the number of characters currently in the buffer.
+  inline int length() const {
+    return cur_len_;
+  }
+
+  // Returns the current capacity of the buffer. The length() is the number of
+  // characters that have been declared to be written, but the capacity() is
+  // the number that can be written without reallocation. If the caller must
+  // write many characters at once, it can make sure there is enough capacity,
+  // write the data, then use set_length() to declare the new length().
+  int capacity() const {
+    return buffer_len_;
+  }
+
+  // Called by the user of this class to get the output. The output will NOT
+  // be NULL-terminated; call length() to get the number of characters that
+  // have been written.
+  const T* data() const {
+    return buffer_;
+  }
+  T* data() {
+    return buffer_;
+  }
+
+  // Shortens the URL to the new length. Used for "backing up" when processing
+  // relative paths. This can also be used if an external function writes a lot
+  // of data to the buffer (when using the "Raw" version below) beyond the end,
+  // to declare the new length.
+  //
+  // This MUST NOT be used to expand the size of the buffer beyond capacity().
+  void set_length(int new_len) {
+    cur_len_ = new_len;
+  }
+
+  // This is the most performance critical function, since it is called for
+  // every character.
+  void push_back(T ch) {
+    // In VC2005, putting this common case first speeds up execution
+    // dramatically because this branch is predicted as taken.
+    if (cur_len_ < buffer_len_) {
+      buffer_[cur_len_] = ch;
+      cur_len_++;
+      return;
+    }
+
+    // Grow the buffer to hold at least one more item. Hopefully we won't have
+    // to do this very often.
+    if (!Grow(1))
+      return;
+
+    // Actually do the insertion.
+    buffer_[cur_len_] = ch;
+    cur_len_++;
+  }
+
+  // Appends the given string to the output.
+  void Append(const T* str, int str_len) {
+    if (cur_len_ + str_len > buffer_len_) {
+      if (!Grow(cur_len_ + str_len - buffer_len_))
+        return;
+    }
+    for (int i = 0; i < str_len; i++)
+      buffer_[cur_len_ + i] = str[i];
+    cur_len_ += str_len;
+  }
+
+ protected:
+  // Grows the given buffer so that it can fit at least |min_additional|
+  // characters. Returns true if the buffer could be resized, false on OOM.
+  bool Grow(int min_additional) {
+    static const int kMinBufferLen = 16;
+    int new_len = (buffer_len_ == 0) ? kMinBufferLen : buffer_len_;
+    do {
+      if (new_len >= (1 << 30))  // Prevent overflow below.
+        return false;
+      new_len *= 2;
+    } while (new_len < buffer_len_ + min_additional);
+    Resize(new_len);
+    return true;
+  }
+
+  T* buffer_;
+  int buffer_len_;
+
+  // Used characters in the buffer.
+  int cur_len_;
+};
+
+// Simple implementation of the CanonOutput using new[]. This class
+// also supports a static buffer so if it is allocated on the stack, most
+// URLs can be canonicalized with no heap allocations.
+template<typename T, int fixed_capacity = 1024>
+class RawCanonOutputT : public CanonOutputT<T> {
+ public:
+  RawCanonOutputT() : CanonOutputT<T>() {
+    this->buffer_len_ = fixed_capacity;
+    this->buffer_ = fixed_buffer_;
+  }
+  virtual ~RawCanonOutputT() {
+    if (this->buffer_ != fixed_buffer_)
+      delete[] this->buffer_;
+  }
+  // Reallocates to |sz| elements on the heap, keeping what was written.
+  virtual void Resize(int sz) {
+    const int keep = (sz > this->cur_len_) ? this->cur_len_ : sz;
+    T* const grown = new T[sz];
+    memcpy(grown, this->buffer_, sizeof(T) * keep);
+    if (this->buffer_ != fixed_buffer_)
+      delete[] this->buffer_;
+    this->buffer_len_ = sz;
+    this->buffer_ = grown;
+  }
+
+ protected:
+  T fixed_buffer_[fixed_capacity];
+};
+
+// Normally, all canonicalization output is in narrow characters; the wide
+// (char16) variants exist for internal use. RawCanonOutput[W] fix the
+// character type of RawCanonOutputT and leave the fixed capacity open.
+typedef CanonOutputT<char> CanonOutput;
+typedef CanonOutputT<char16> CanonOutputW;
+
+template<int fixed_capacity>
+class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {};
+template<int fixed_capacity>
+class RawCanonOutputW : public RawCanonOutputT<char16, fixed_capacity> {};
+
+// Character set converter ----------------------------------------------------
+//
+// Converts query strings into a custom encoding. The embedder can supply an
+// implementation of this class to interface with their own character set
+// conversion libraries.
+//
+// This is an abstract interface: ConvertFromUTF16() is pure virtual and must
+// be provided by the implementation.
+//
+// Embedders will want to see the unit test for the ICU version.
+
+class CharsetConverter {
+ public:
+  CharsetConverter() {}
+  // Virtual so that implementations can be destroyed through a
+  // CharsetConverter pointer.
+  virtual ~CharsetConverter() {}
+
+  // Converts the given input string from UTF-16 to whatever output format the
+  // converter supports. This is used only for the query encoding conversion,
+  // which does not fail. Instead, the converter should insert "invalid
+  // character" characters in the output for invalid sequences, and do the
+  // best it can.
+  //
+  // If the input contains a character not representable in the output
+  // character set, the converter should append the HTML entity sequence in
+  // decimal, (such as "&#20320;") with escaping of the ampersand, number
+  // sign, and semicolon (in the previous example it would be
+  // "%26%2320320%3B"). This rule is based on what IE does in this situation.
+  //
+  // |input| is the UTF-16 text and |input_len| its length (presumably counted
+  // in char16 units -- confirm against an implementation). |output| receives
+  // the converted text.
+  virtual void ConvertFromUTF16(const char16* input,
+                                int input_len,
+                                CanonOutput* output) = 0;
+};
+
+// Whitespace -----------------------------------------------------------------
+
+// Searches for whitespace that should be removed from the middle of URLs, and
+// removes it. Removed whitespace are tabs and newlines, but NOT spaces. Spaces
+// are preserved, which is what most browsers do. A pointer to the output will
+// be returned, and the length of that output will be in |output_len|.
+//
+// This should be called before parsing if whitespace removal is desired (which
+// it normally is when you are canonicalizing).
+//
+// If no whitespace is removed, this function will not use the buffer and will
+// return a pointer to the input, to avoid the extra copy. If modification is
+// required, the given |buffer| will be used and the returned pointer will
+// point to the beginning of the buffer.
+//
+// Therefore, callers should not use the buffer, since it may actually be empty;
+// use the returned pointer and |*output_len| instead.
+GURL_API const char* RemoveURLWhitespace(const char* input, int input_len,
+                                         CanonOutputT<char>* buffer,
+                                         int* output_len);
+GURL_API const char16* RemoveURLWhitespace(const char16* input, int input_len,
+                                           CanonOutputT<char16>* buffer,
+                                           int* output_len);
+
+// IDN ------------------------------------------------------------------------
+
+// Converts the Unicode input representing a hostname to ASCII using IDN rules.
+// The output must fall in the ASCII range, but will be encoded in UTF-16.
+//
+// On success, the output will be filled with the ASCII host name and it will
+// return true. Unlike most other canonicalization functions, this assumes that
+// the output is empty. The beginning of the host will be at offset 0, and
+// the length of the output will be set to the length of the new host name.
+//
+// On error, returns false. The output in this case is undefined.
+GURL_API bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output);
+
+// Piece-by-piece canonicalizers ----------------------------------------------
+//
+// These individual canonicalizers append the canonicalized versions of the
+// corresponding URL component to the given output. The spec and the
+// previously-identified range of that component are the input. The range of
+// the canonicalized component will be written to the output component.
+//
+// These functions all append to the output so they can be chained. Make sure
+// the output is empty when you start.
+//
+// These functions return boolean values indicating success. On failure, they
+// will attempt to write something reasonable to the output so that, if
+// displayed to the user, they will recognise it as something that's messed up.
+// Nothing more should ever be done with these invalid URLs, however.
+
+// Scheme: Appends the scheme and colon to the URL. The output component will
+// indicate the range of characters up to but not including the colon.
+//
+// Canonical URLs always have a scheme. If the scheme is not present in the
+// input, this will just write the colon to indicate an empty scheme. Does not
+// append slashes which will be needed before any authority components for most
+// URLs.
+//
+// The 8-bit version requires UTF-8 encoding.
+GURL_API bool CanonicalizeScheme(const char* spec,
+                                 const url_parse::Component& scheme,
+                                 CanonOutput* output,
+                                 url_parse::Component* out_scheme);
+GURL_API bool CanonicalizeScheme(const char16* spec,
+                                 const url_parse::Component& scheme,
+                                 CanonOutput* output,
+                                 url_parse::Component* out_scheme);
+
+// User info: username/password. If present, this will add the delimiters so
+// the output will be "<username>:<password>@" or "<username>@". Empty
+// username/password pairs, or empty passwords, will get converted to
+// nonexistent in the canonical version.
+//
+// The components for the username and password refer to ranges in the
+// respective source strings. Usually, these will be the same string, which
+// is legal as long as the two components don't overlap.
+//
+// The 8-bit version requires UTF-8 encoding.
+GURL_API bool CanonicalizeUserInfo(const char* username_source,
+                                   const url_parse::Component& username,
+                                   const char* password_source,
+                                   const url_parse::Component& password,
+                                   CanonOutput* output,
+                                   url_parse::Component* out_username,
+                                   url_parse::Component* out_password);
+GURL_API bool CanonicalizeUserInfo(const char16* username_source,
+                                   const url_parse::Component& username,
+                                   const char16* password_source,
+                                   const url_parse::Component& password,
+                                   CanonOutput* output,
+                                   url_parse::Component* out_username,
+                                   url_parse::Component* out_password);
+
+
+// Detailed state reported by the IP/Host canonicalizers. More fields may be
+// added as callers come to need them.
+struct CanonHostInfo {
+  CanonHostInfo()
+      : family(NEUTRAL),
+        num_ipv4_components(0),
+        out_host() {
+  }
+
+  // Tells whether |family| denotes a successfully canonicalized IP address.
+  bool IsIPAddress() const {
+    switch (family) {
+      case IPV4:
+      case IPV6:
+        return true;
+      default:
+        return false;
+    }
+  }
+
+  // How the canonicalizer classified the input.
+  enum Family {
+    NEUTRAL,   // - Does not look like an IP address.  From the IP
+               //   canonicalizer's point of view it should be handled as a
+               //   hostname.
+    BROKEN,    // - Nearly an IP address, but not canonicalized.  Examples are
+               //   an IPv4 address where truncation occurred, or input
+               //   containing the special characters :[] that did not parse
+               //   as an IPv6 address.  Never attempt to connect to this
+               //   address, because it might actually succeed!
+    IPV4,      // - Canonicalized successfully as an IPv4 address.
+    IPV6      // - Canonicalized successfully as an IPv6 address.
+  };
+  Family family;
+
+  // When |family| is IPV4: the count of nonempty dot-separated components in
+  // the input text, from 1 to 4.  For any other |family| the value is
+  // undefined.
+  int num_ipv4_components;
+
+  // Where the host sits inside the canonicalized output.
+  // CanonicalizeIPAddress() fills this in only when |family| is IPV4 or IPV6;
+  // CanonicalizeHostVerbose() fills it in every time.
+  url_parse::Component out_host;
+
+  // The parsed IP address (if any), stored in network order in the first
+  // AddressLength() bytes. When IsIPAddress() is false, AddressLength() is
+  // zero and the content of |address| is undefined.
+  unsigned char address[16];
+
+  // Number of bytes of |address| that are meaningful for the IP version held
+  // in |family|: 4 for IPV4, 16 for IPV6, and 0 otherwise.
+  int AddressLength() const {
+    switch (family) {
+      case IPV4:
+        return 4;
+      case IPV6:
+        return 16;
+      default:
+        return 0;
+    }
+  }
+};
+
+
+// Host.
+//
+// The 8-bit version requires UTF-8 encoding.  Use this version when you only
+// need to know whether canonicalization succeeded.
+GURL_API bool CanonicalizeHost(const char* spec,
+                               const url_parse::Component& host,
+                               CanonOutput* output,
+                               url_parse::Component* out_host);
+GURL_API bool CanonicalizeHost(const char16* spec,
+                               const url_parse::Component& host,
+                               CanonOutput* output,
+                               url_parse::Component* out_host);
+
+// Extended version of CanonicalizeHost, which returns additional information.
+// Use this when you need to know whether the hostname was an IP address.
+// A successful return is indicated by host_info->family != BROKEN.  See the
+// definition of CanonHostInfo above for details.
+GURL_API void CanonicalizeHostVerbose(const char* spec,
+                                      const url_parse::Component& host,
+                                      CanonOutput* output,
+                                      CanonHostInfo* host_info);
+GURL_API void CanonicalizeHostVerbose(const char16* spec,
+                                      const url_parse::Component& host,
+                                      CanonOutput* output,
+                                      CanonHostInfo* host_info);
+
+
+// IP addresses.
+//
+// Tries to interpret the given host name as an IPv4 or IPv6 address. If it is
+// an IP address, it will canonicalize it as such, appending it to |output|.
+// Additional status information is returned via the |*host_info| parameter.
+// See the definition of CanonHostInfo above for details.
+//
+// This is called AUTOMATICALLY from the host canonicalizer, which ensures that
+// the input is unescaped and name-prepped, etc. It should not normally be
+// necessary or wise to call this directly.
+GURL_API void CanonicalizeIPAddress(const char* spec,
+                                    const url_parse::Component& host,
+                                    CanonOutput* output,
+                                    CanonHostInfo* host_info);
+GURL_API void CanonicalizeIPAddress(const char16* spec,
+                                    const url_parse::Component& host,
+                                    CanonOutput* output,
+                                    CanonHostInfo* host_info);
+
+// Port: this function will add the colon for the port if a port is present.
+// The caller can pass url_parse::PORT_UNSPECIFIED as the
+// default_port_for_scheme argument if there is no default port.
+//
+// The 8-bit version requires UTF-8 encoding.
+GURL_API bool CanonicalizePort(const char* spec,
+                               const url_parse::Component& port,
+                               int default_port_for_scheme,
+                               CanonOutput* output,
+                               url_parse::Component* out_port);
+GURL_API bool CanonicalizePort(const char16* spec,
+                               const url_parse::Component& port,
+                               int default_port_for_scheme,
+                               CanonOutput* output,
+                               url_parse::Component* out_port);
+
+// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
+// if the scheme is unknown.
+GURL_API int DefaultPortForScheme(const char* scheme, int scheme_len);
+
+// Path. If the input does not begin in a slash (including if the input is
+// empty), we'll prepend a slash to the path to make it canonical.
+//
+// The 8-bit version assumes UTF-8 encoding, but does not verify the validity
+// of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid
+// characters, etc.). Normally, URLs will come in as UTF-16, so this isn't
+// an issue. Somebody giving us an 8-bit path is responsible for generating
+// the path that the server expects (we'll escape high-bit characters), so
+// if something is invalid, it's their problem.
+GURL_API bool CanonicalizePath(const char* spec,
+                               const url_parse::Component& path,
+                               CanonOutput* output,
+                               url_parse::Component* out_path);
+GURL_API bool CanonicalizePath(const char16* spec,
+                               const url_parse::Component& path,
+                               CanonOutput* output,
+                               url_parse::Component* out_path);
+
+// Canonicalizes the input as a file path. This is like CanonicalizePath except
+// that it also handles Windows drive specs. For example, the path can begin
+// with "c|\" and it will get properly canonicalized to "C:/".
+// The string will be appended to |*output| and |*out_path| will be updated.
+//
+// The 8-bit version requires UTF-8 encoding.
+GURL_API bool FileCanonicalizePath(const char* spec,
+                                   const url_parse::Component& path,
+                                   CanonOutput* output,
+                                   url_parse::Component* out_path);
+GURL_API bool FileCanonicalizePath(const char16* spec,
+                                   const url_parse::Component& path,
+                                   CanonOutput* output,
+                                   url_parse::Component* out_path);
+
+// Query: Prepends the ? if needed.
+//
+// The 8-bit version requires the input to be UTF-8 encoded. Incorrectly
+// encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode
+// "invalid character." This function cannot fail; we always just try to do
+// our best for crazy input here since web pages can set it themselves.
+//
+// This will convert the given input into the output encoding that the given
+// character set converter object provides. The converter will only be called
+// if necessary; for ASCII input, no conversions are necessary.
+//
+// The converter can be NULL. In this case, the output encoding will be UTF-8.
+GURL_API void CanonicalizeQuery(const char* spec,
+                                const url_parse::Component& query,
+                                CharsetConverter* converter,
+                                CanonOutput* output,
+                                url_parse::Component* out_query);
+GURL_API void CanonicalizeQuery(const char16* spec,
+                                const url_parse::Component& query,
+                                CharsetConverter* converter,
+                                CanonOutput* output,
+                                url_parse::Component* out_query);
+
+// Ref: Prepends the # if needed. The output will be UTF-8 (this is the only
+// canonicalizer that does not produce ASCII output). The output is
+// guaranteed to be valid UTF-8.
+//
+// This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use
+// the "Unicode replacement character" for the confusing bits and copy the rest.
+GURL_API void CanonicalizeRef(const char* spec,
+                              const url_parse::Component& path,
+                              CanonOutput* output,
+                              url_parse::Component* out_path);
+GURL_API void CanonicalizeRef(const char16* spec,
+                              const url_parse::Component& path,
+                              CanonOutput* output,
+                              url_parse::Component* out_path);
+
+// Full canonicalizer ---------------------------------------------------------
+//
+// These functions replace any string contents, rather than append as above.
+// See the above piece-by-piece functions for information specific to
+// canonicalizing individual components.
+//
+// The output will be ASCII except the reference fragment, which may be UTF-8.
+//
+// The 8-bit versions require UTF-8 encoding.
+
+// Use for standard URLs with authorities and paths.
+GURL_API bool CanonicalizeStandardURL(const char* spec,
+                                      int spec_len,
+                                      const url_parse::Parsed& parsed,
+                                      CharsetConverter* query_converter,
+                                      CanonOutput* output,
+                                      url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeStandardURL(const char16* spec,
+                                      int spec_len,
+                                      const url_parse::Parsed& parsed,
+                                      CharsetConverter* query_converter,
+                                      CanonOutput* output,
+                                      url_parse::Parsed* new_parsed);
+
+// Use for file URLs.
+GURL_API bool CanonicalizeFileURL(const char* spec,
+                                  int spec_len,
+                                  const url_parse::Parsed& parsed,
+                                  CharsetConverter* query_converter,
+                                  CanonOutput* output,
+                                  url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeFileURL(const char16* spec,
+                                  int spec_len,
+                                  const url_parse::Parsed& parsed,
+                                  CharsetConverter* query_converter,
+                                  CanonOutput* output,
+                                  url_parse::Parsed* new_parsed);
+
+// Use for filesystem URLs.
+GURL_API bool CanonicalizeFileSystemURL(const char* spec,
+                                        int spec_len,
+                                        const url_parse::Parsed& parsed,
+                                        CharsetConverter* query_converter,
+                                        CanonOutput* output,
+                                        url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeFileSystemURL(const char16* spec,
+                                        int spec_len,
+                                        const url_parse::Parsed& parsed,
+                                        CharsetConverter* query_converter,
+                                        CanonOutput* output,
+                                        url_parse::Parsed* new_parsed);
+
+// Use for path URLs such as javascript. This does not modify the path in any
+// way, for example, by escaping it.
+GURL_API bool CanonicalizePathURL(const char* spec,
+                                  int spec_len,
+                                  const url_parse::Parsed& parsed,
+                                  CanonOutput* output,
+                                  url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizePathURL(const char16* spec,
+                                  int spec_len,
+                                  const url_parse::Parsed& parsed,
+                                  CanonOutput* output,
+                                  url_parse::Parsed* new_parsed);
+
+// Use for mailto URLs. This "canonicalizes" the url into a path and query
+// component. It does not attempt to merge "to" fields. It uses UTF-8 for
+// the query encoding if there is a query. This is because a mailto URL is
+// really intended for an external mail program, and the encoding of a page,
+// etc. which would influence a query encoding normally are irrelevant.
+GURL_API bool CanonicalizeMailtoURL(const char* spec,
+                                    int spec_len,
+                                    const url_parse::Parsed& parsed,
+                                    CanonOutput* output,
+                                    url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeMailtoURL(const char16* spec,
+                                    int spec_len,
+                                    const url_parse::Parsed& parsed,
+                                    CanonOutput* output,
+                                    url_parse::Parsed* new_parsed);
+
+// Part replacer --------------------------------------------------------------
+
+// Internal structure used for storing separate strings for each component.
+// The basic canonicalization functions use this structure internally so that
+// component replacement (different strings for different components) can be
+// treated on the same code path as regular canonicalization (the same string
+// for each component).
+//
+// A url_parse::Parsed structure usually goes along with this. Those
+// components identify offsets within these strings, so that they can all be
+// in the same string, or spread arbitrarily across different ones.
+//
+// This structure does not own any data. It is the caller's responsibility to
+// ensure that the data the pointers point to stays in scope and is not
+// modified.
+template<typename CHAR>
+struct URLComponentSource {
+  // Constructor normally used by callers wishing to replace components. This
+  // will make them all NULL, which is no replacement. The caller would then
+  // override the components they want to replace.
+  URLComponentSource()
+      : scheme(NULL),
+        username(NULL),
+        password(NULL),
+        host(NULL),
+        port(NULL),
+        path(NULL),
+        query(NULL),
+        ref(NULL) {
+  }
+
+  // Constructor normally used internally to initialize all the components to
+  // point to the same spec.
+  explicit URLComponentSource(const CHAR* default_value)
+      : scheme(default_value),
+        username(default_value),
+        password(default_value),
+        host(default_value),
+        port(default_value),
+        path(default_value),
+        query(default_value),
+        ref(default_value) {
+  }
+
+  // One source string pointer per URL component. The pointers are not owned
+  // by this structure; NULL means no source was supplied for that component.
+  const CHAR* scheme;
+  const CHAR* username;
+  const CHAR* password;
+  const CHAR* host;
+  const CHAR* port;
+  const CHAR* path;
+  const CHAR* query;
+  const CHAR* ref;
+};
+
+// Describes a set of modifications to apply to a URL. Every component starts
+// out unchanged and can individually be replaced (Set*) or deleted (Clear*).
+//
+// The strings handed to the Set* functions ARE NOT COPIED: THE CALLER MUST
+// KEEP THEM IN SCOPE for as long as this object exists!
+//
+// Prefer the 8-bit replacement version if possible since it is more efficient.
+template<typename CHAR>
+class Replacements {
+ public:
+  Replacements() {}
+
+  // Scheme. There is deliberately no ClearScheme(): removing the scheme does
+  // not make any sense.
+  void SetScheme(const CHAR* s, const url_parse::Component& comp) {
+    components_.scheme = comp;
+    sources_.scheme = s;
+  }
+  bool IsSchemeOverridden() const {
+    return HasSource(sources_.scheme);
+  }
+
+  // Username
+  void SetUsername(const CHAR* s, const url_parse::Component& comp) {
+    components_.username = comp;
+    sources_.username = s;
+  }
+  void ClearUsername() {
+    components_.username = url_parse::Component();
+    sources_.username = Placeholder();
+  }
+  bool IsUsernameOverridden() const {
+    return HasSource(sources_.username);
+  }
+
+  // Password
+  void SetPassword(const CHAR* s, const url_parse::Component& comp) {
+    components_.password = comp;
+    sources_.password = s;
+  }
+  void ClearPassword() {
+    components_.password = url_parse::Component();
+    sources_.password = Placeholder();
+  }
+  bool IsPasswordOverridden() const {
+    return HasSource(sources_.password);
+  }
+
+  // Host
+  void SetHost(const CHAR* s, const url_parse::Component& comp) {
+    components_.host = comp;
+    sources_.host = s;
+  }
+  void ClearHost() {
+    components_.host = url_parse::Component();
+    sources_.host = Placeholder();
+  }
+  bool IsHostOverridden() const {
+    return HasSource(sources_.host);
+  }
+
+  // Port
+  void SetPort(const CHAR* s, const url_parse::Component& comp) {
+    components_.port = comp;
+    sources_.port = s;
+  }
+  void ClearPort() {
+    components_.port = url_parse::Component();
+    sources_.port = Placeholder();
+  }
+  bool IsPortOverridden() const {
+    return HasSource(sources_.port);
+  }
+
+  // Path
+  void SetPath(const CHAR* s, const url_parse::Component& comp) {
+    components_.path = comp;
+    sources_.path = s;
+  }
+  void ClearPath() {
+    components_.path = url_parse::Component();
+    sources_.path = Placeholder();
+  }
+  bool IsPathOverridden() const {
+    return HasSource(sources_.path);
+  }
+
+  // Query
+  void SetQuery(const CHAR* s, const url_parse::Component& comp) {
+    components_.query = comp;
+    sources_.query = s;
+  }
+  void ClearQuery() {
+    components_.query = url_parse::Component();
+    sources_.query = Placeholder();
+  }
+  bool IsQueryOverridden() const {
+    return HasSource(sources_.query);
+  }
+
+  // Ref
+  void SetRef(const CHAR* s, const url_parse::Component& comp) {
+    components_.ref = comp;
+    sources_.ref = s;
+  }
+  void ClearRef() {
+    components_.ref = url_parse::Component();
+    sources_.ref = Placeholder();
+  }
+  bool IsRefOverridden() const {
+    return HasSource(sources_.ref);
+  }
+
+  // Read-only access to the internal data. The encoding of the three possible
+  // states is described next to the member variables below.
+  const URLComponentSource<CHAR>& sources() const { return sources_; }
+  const url_parse::Parsed& components() const { return components_; }
+
+ private:
+  // A component counts as overridden (replaced or deleted) exactly when its
+  // source pointer is non-NULL.
+  static bool HasSource(const CHAR* source) {
+    return source != NULL;
+  }
+
+  // Returns a pointer to a static empty string. It is stored as the source of
+  // a component that should be deleted (see below).
+  static const CHAR* Placeholder() {
+    static const CHAR kEmptyString = 0;
+    return &kEmptyString;
+  }
+
+  // Three states are supported:
+  //
+  // Action                 | Source                Component
+  // -----------------------+--------------------------------------------------
+  // Don't change component | NULL                  (unused)
+  // Replace component      | (replacement string)  (replacement component)
+  // Delete component       | (non-NULL)            (invalid component: (0,-1))
+  //
+  // For a component that should be deleted, the source is a pointer to the
+  // empty string.
+  URLComponentSource<CHAR> sources_;
+  url_parse::Parsed components_;
+};
+
+// The base must be an 8-bit canonical URL.
+GURL_API bool ReplaceStandardURL(const char* base,
+                                 const url_parse::Parsed& base_parsed,
+                                 const Replacements<char>& replacements,
+                                 CharsetConverter* query_converter,
+                                 CanonOutput* output,
+                                 url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceStandardURL(const char* base,
+                                 const url_parse::Parsed& base_parsed,
+                                 const Replacements<char16>& replacements,
+                                 CharsetConverter* query_converter,
+                                 CanonOutput* output,
+                                 url_parse::Parsed* new_parsed);
+
+// Filesystem URLs can only have the path, query, or ref replaced.
+// All other components will be ignored.
+GURL_API bool ReplaceFileSystemURL(const char* base,
+                                   const url_parse::Parsed& base_parsed,
+                                   const Replacements<char>& replacements,
+                                   CharsetConverter* query_converter,
+                                   CanonOutput* output,
+                                   url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceFileSystemURL(const char* base,
+                                   const url_parse::Parsed& base_parsed,
+                                   const Replacements<char16>& replacements,
+                                   CharsetConverter* query_converter,
+                                   CanonOutput* output,
+                                   url_parse::Parsed* new_parsed);
+
+// Replacing some parts of a file URL is not permitted. Everything except
+// the host, path, query, and ref will be ignored.
+GURL_API bool ReplaceFileURL(const char* base,
+                             const url_parse::Parsed& base_parsed,
+                             const Replacements<char>& replacements,
+                             CharsetConverter* query_converter,
+                             CanonOutput* output,
+                             url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceFileURL(const char* base,
+                             const url_parse::Parsed& base_parsed,
+                             const Replacements<char16>& replacements,
+                             CharsetConverter* query_converter,
+                             CanonOutput* output,
+                             url_parse::Parsed* new_parsed);
+
+// Path URLs can only have the scheme and path replaced. All other components
+// will be ignored.
+GURL_API bool ReplacePathURL(const char* base,
+                             const url_parse::Parsed& base_parsed,
+                             const Replacements<char>& replacements,
+                             CanonOutput* output,
+                             url_parse::Parsed* new_parsed);
+GURL_API bool ReplacePathURL(const char* base,
+                             const url_parse::Parsed& base_parsed,
+                             const Replacements<char16>& replacements,
+                             CanonOutput* output,
+                             url_parse::Parsed* new_parsed);
+
+// Mailto URLs can only have the scheme, path, and query replaced.
+// All other components will be ignored.
+GURL_API bool ReplaceMailtoURL(const char* base,
+                               const url_parse::Parsed& base_parsed,
+                               const Replacements<char>& replacements,
+                               CanonOutput* output,
+                               url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceMailtoURL(const char* base,
+                               const url_parse::Parsed& base_parsed,
+                               const Replacements<char16>& replacements,
+                               CanonOutput* output,
+                               url_parse::Parsed* new_parsed);
+
+// Relative URL ---------------------------------------------------------------
+
+// Given an input URL or URL fragment |fragment|, determines if it is a
+// relative or absolute URL and places the result into |*is_relative|. If it is
+// relative, the relevant portion of the URL will be placed into
+// |*relative_component| (there may have been trimmed whitespace, for example).
+// This value is passed to ResolveRelativeURL. If the input is not relative,
+// this value is UNDEFINED (it may be changed by the function).
+//
+// Returns true on success (we successfully determined the URL is relative or
+// not). Failure means that the combination of URLs doesn't make any sense.
+//
+// The base URL should always be canonical, therefore is ASCII.
+GURL_API bool IsRelativeURL(const char* base,
+                            const url_parse::Parsed& base_parsed,
+                            const char* fragment,
+                            int fragment_len,
+                            bool is_base_hierarchical,
+                            bool* is_relative,
+                            url_parse::Component* relative_component);
+GURL_API bool IsRelativeURL(const char* base,
+                            const url_parse::Parsed& base_parsed,
+                            const char16* fragment,
+                            int fragment_len,
+                            bool is_base_hierarchical,
+                            bool* is_relative,
+                            url_parse::Component* relative_component);
+
+// Given a canonical parsed source URL, a URL fragment known to be relative,
+// and the identified relevant portion of the relative URL (computed by
+// IsRelativeURL), this produces a new parsed canonical URL in |output| and
+// |out_parsed|.
+//
+// It also requires a flag indicating whether the base URL is a file: URL
+// which triggers additional logic.
+//
+// The base URL should be canonical and have a host (may be empty for file
+// URLs) and a path. If it doesn't have these, we can't resolve relative
+// URLs off of it and will return the base as the output with an error flag.
+// Because it is canonical, it should also be ASCII.
+//
+// The query charset converter follows the same rules as CanonicalizeQuery.
+//
+// Returns true on success. On failure, the output will be "something
+// reasonable" that will be consistent and valid, just probably not what
+// was intended by the web page author or caller.
+GURL_API bool ResolveRelativeURL(const char* base_url,
+                                 const url_parse::Parsed& base_parsed,
+                                 bool base_is_file,
+                                 const char* relative_url,
+                                 const url_parse::Component& relative_component,
+                                 CharsetConverter* query_converter,
+                                 CanonOutput* output,
+                                 url_parse::Parsed* out_parsed);
+GURL_API bool ResolveRelativeURL(const char* base_url,
+                                 const url_parse::Parsed& base_parsed,
+                                 bool base_is_file,
+                                 const char16* relative_url,
+                                 const url_parse::Component& relative_component,
+                                 CharsetConverter* query_converter,
+                                 CanonOutput* output,
+                                 url_parse::Parsed* out_parsed);
+
+}  // namespace url_canon
+
+#endif  // GOOGLEURL_SRC_URL_CANON_H__
diff --git a/googleurl/url_canon_icu.h b/googleurl/url_canon_icu.h
new file mode 100644
index 0000000..736e1e9
--- /dev/null
+++ b/googleurl/url_canon_icu.h
@@ -0,0 +1,63 @@
+// Copyright 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ICU integration functions.
+
+#ifndef GOOGLEURL_SRC_URL_CANON_ICU_H__
+#define GOOGLEURL_SRC_URL_CANON_ICU_H__
+
+#include "url_canon.h"
+
+typedef struct UConverter UConverter;
+
+namespace url_canon {
+
+// An implementation of CharsetConverter that callers can use to interface
+// the canonicalizer with ICU's conversion routines.
+class ICUCharsetConverter : public CharsetConverter {
+ public:
+  // Constructs a converter using an already-existing ICU character set
+  // converter. This converter is NOT owned by this object; the lifetime must
+  // be managed by the creator such that it is alive as long as this is.
+  GURL_API ICUCharsetConverter(UConverter* converter);
+
+  GURL_API virtual ~ICUCharsetConverter();
+
+  GURL_API virtual void ConvertFromUTF16(const char16* input,
+                                         int input_len,
+                                         CanonOutput* output);
+
+ private:
+  // The ICU converter, not owned by this class.
+  UConverter* converter_;
+};
+
+}  // namespace url_canon
+
+#endif  // GOOGLEURL_SRC_URL_CANON_ICU_H__
diff --git a/googleurl/url_canon_internal.h b/googleurl/url_canon_internal.h
new file mode 100644
index 0000000..ac5774f
--- /dev/null
+++ b/googleurl/url_canon_internal.h
@@ -0,0 +1,462 @@
+// Copyright 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file is intended to be included in another C++ file where the character
+// types are defined. This allows us to write mostly generic code, but not have
+// template bloat because everything is inlined when anybody calls any of our
+// functions.
+
+#ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_H__
+#define GOOGLEURL_SRC_URL_CANON_INTERNAL_H__
+
+#include <stdlib.h>
+
+#include "base/logging.h"
+#include "url_canon.h"
+
+namespace url_canon {
+
+// Character type handling -----------------------------------------------------
+
+// Bit flags that identify different character types. Each 8-bit character's
+// entry in the kSharedCharTypeTable is a combination of these flags.
+enum SharedCharTypes {
+  // Characters that do not require escaping in queries. Characters that do
+  // not have this flag will be escaped; see url_canon_query.cc
+  CHAR_QUERY = 1,
+
+  // Valid in the username/password field.
+  CHAR_USERINFO = 2,
+
+  // Valid in an IPv4 address (digits plus dot and 'x' for hex).
+  CHAR_IPV4 = 4,
+
+  // Valid in an ASCII-representation of a hex digit (as in %-escaped).
+  CHAR_HEX = 8,
+
+  // Valid in an ASCII-representation of a decimal digit.
+  CHAR_DEC = 16,
+
+  // Valid in an ASCII-representation of an octal digit.
+  CHAR_OCT = 32,
+
+  // Characters that do not require escaping in encodeURIComponent.  Characters
+  // that do not have this flag will be escaped; see url_util.cc.
+  CHAR_COMPONENT = 64
+};
+
+// This table contains the flags in SharedCharTypes for each 8-bit character.
+// Some canonicalization functions have their own specialized lookup table.
+// For those with simple requirements, we have collected the flags in one
+// place so there are fewer lookup tables to load into the CPU cache.
+//
+// Using an unsigned char type has a small but measurable performance benefit
+// over using a 32-bit number.
+extern const unsigned char kSharedCharTypeTable[0x100];
+
+// More readable wrappers around the character type lookup table.
+inline bool IsCharOfType(unsigned char c, SharedCharTypes type) {
+  return !!(kSharedCharTypeTable[c] & type);
+}
+inline bool IsQueryChar(unsigned char c) {
+  return IsCharOfType(c, CHAR_QUERY);
+}
+inline bool IsIPv4Char(unsigned char c) {
+  return IsCharOfType(c, CHAR_IPV4);
+}
+inline bool IsHexChar(unsigned char c) {
+  return IsCharOfType(c, CHAR_HEX);
+}
+inline bool IsComponentChar(unsigned char c) {
+  return IsCharOfType(c, CHAR_COMPONENT);
+}
+
+// Appends the given string to the output, escaping characters that do not
+// match the given |type| in SharedCharTypes.
+void AppendStringOfType(const char* source, int length,
+                        SharedCharTypes type,
+                        CanonOutput* output);
+void AppendStringOfType(const char16* source, int length,
+                        SharedCharTypes type,
+                        CanonOutput* output);
+
+// Maps the hex numerical values 0x0 to 0xf to the corresponding ASCII digit
+// that will be used to represent it.
+GURL_API extern const char kHexCharLookup[0x10];
+
+// This lookup table allows fast conversion between ASCII hex letters and their
+// corresponding numerical value. The 8-bit range is divided up into 8
+// regions of 0x20 characters each. Each of the three character types (numbers,
+// uppercase, lowercase) falls into different regions of this range. The table
+// contains the amount to subtract from characters in that range to get at
+// the corresponding numerical value.
+//
+// See HexCharToValue for the lookup.
+extern const char kCharToHexLookup[8];
+
+// Assumes the input is a valid hex digit! Call IsHexChar before using this.
+inline unsigned char HexCharToValue(unsigned char c) {
+  return c - kCharToHexLookup[c / 0x20];
+}
+
+// Indicates if the character at |offset| is a dot or an escaped dot ("%2e" or
+// "%2E"), returning the number of characters taken by it: 1 for a literal dot,
+// 3 for an escaped dot, 0 if it is not a dot. spec[offset] is read unchecked.
+template<typename CHAR>
+inline int IsDot(const CHAR* spec, int offset, int end) {
+  if (spec[offset] == '.') {
+    return 1;
+  } else if (spec[offset] == '%' && offset + 3 <= end &&
+             spec[offset + 1] == '2' &&
+             (spec[offset + 2] == 'e' || spec[offset + 2] == 'E')) {
+    // Found "%2e" (either case of the hex digit).
+    return 3;
+  }
+  return 0;
+}
+
+// Returns the canonicalized version of the input character according to scheme
+// rules. This is implemented alongside the scheme canonicalizer, and is
+// required for relative URL resolving to test for scheme equality.
+//
+// Returns 0 if the input character is not a valid scheme character.
+char CanonicalSchemeChar(char16 ch);
+
+// Write a single character, escaped, to the output. This always escapes: it
+// does no checking that the character requires escaping.
+// Escaping only makes sense for 8-bit chars (only the low 8 bits of |ch| are
+// emitted), so the code works for all input parameter types (8/16 bit).
+template<typename UINCHAR, typename OUTCHAR>
+inline void AppendEscapedChar(UINCHAR ch,
+                              CanonOutputT<OUTCHAR>* output) {
+  output->push_back('%');
+  output->push_back(kHexCharLookup[(ch >> 4) & 0xf]);
+  output->push_back(kHexCharLookup[ch & 0xf]);
+}
+
+// The character we'll substitute for undecodable or invalid characters.
+extern const char16 kUnicodeReplacementCharacter;
+
+// UTF-8 functions ------------------------------------------------------------
+
+// Reads one character in UTF-8 starting at |*begin| in |str| and places
+// the decoded value into |*code_point_out|. If the character is valid, we
+// will return true. If invalid, we'll return false and put the
+// kUnicodeReplacementCharacter into |*code_point_out|.
+//
+// |*begin| will be updated to point to the last character consumed so it
+// can be incremented in a loop and will be ready for the next character.
+// (for a single-byte ASCII character, it will not be changed).
+//
+// Implementation is in url_canon_icu.cc.
+GURL_API bool ReadUTFChar(const char* str, int* begin, int length,
+                          unsigned* code_point_out);
+
+// Generic To-UTF-8 converter. This will call the given append method for each
+// byte that should be appended, with the given output object. Wrappers
+// are provided below for escaped and non-escaped versions of this.
+//
+// The char_value must have already been checked that it's a valid Unicode
+// character.
+template<class Output, void Appender(unsigned char, Output*)>
+inline void DoAppendUTF8(unsigned char_value, Output* output) {
+  if (char_value <= 0x7f) {
+    Appender(static_cast<unsigned char>(char_value), output);
+  } else if (char_value <= 0x7ff) {
+    // 110xxxxx 10xxxxxx
+    Appender(static_cast<unsigned char>(0xC0 | (char_value >> 6)),
+             output);
+    Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
+             output);
+  } else if (char_value <= 0xffff) {
+    // 1110xxxx 10xxxxxx 10xxxxxx
+    Appender(static_cast<unsigned char>(0xe0 | (char_value >> 12)),
+             output);
+    Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)),
+             output);
+    Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
+             output);
+  } else if (char_value <= 0x10FFFF) {  // Max unicode code point.
+    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)),
+             output);
+    Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)),
+             output);
+    Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)),
+             output);
+    Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
+             output);
+  } else {
+    // Invalid code point (beyond the Unicode maximum of 0x10FFFF).
+    NOTREACHED();
+  }
+}
+
+// Helper used by AppendUTF8Value below. We use an unsigned parameter so there
+// are no funny sign problems with the input, but then have to convert it to
+// a regular char for appending.
+inline void AppendCharToOutput(unsigned char ch, CanonOutput* output) {
+  output->push_back(static_cast<char>(ch));
+}
+
+// Writes the given character to the output as UTF-8. This does NO checking
+// of the validity of the unicode characters; the caller should ensure that
+// the value it is appending is valid to append.
+inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) {
+  DoAppendUTF8<CanonOutput, AppendCharToOutput>(char_value, output);
+}
+
+// Writes the given character to the output as UTF-8, escaping ALL
+// characters (even when they are ASCII). This does NO checking of the
+// validity of the unicode characters; the caller should ensure that the value
+// it is appending is valid to append.
+inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) {
+  DoAppendUTF8<CanonOutput, AppendEscapedChar>(char_value, output);
+}
+
+// UTF-16 functions -----------------------------------------------------------
+
+// Reads one character in UTF-16 starting at |*begin| in |str| and places
+// the decoded value into |*code_point|. If the character is valid, we will
+// return true. If invalid, we'll return false and put the
+// kUnicodeReplacementCharacter into |*code_point|.
+//
+// |*begin| will be updated to point to the last character consumed so it
+// can be incremented in a loop and will be ready for the next character.
+// (for a single-16-bit-word character, it will not be changed).
+//
+// Implementation is in url_canon_icu.cc.
+GURL_API bool ReadUTFChar(const char16* str, int* begin, int length,
+                          unsigned* code_point);
+
+// Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method.
+inline void AppendUTF16Value(unsigned code_point,
+                             CanonOutputT<char16>* output) {
+  if (code_point > 0xffff) {
+    output->push_back(static_cast<char16>((code_point >> 10) + 0xd7c0));
+    output->push_back(static_cast<char16>((code_point & 0x3ff) | 0xdc00));
+  } else {
+    output->push_back(static_cast<char16>(code_point));
+  }
+}
+
+// Escaping functions ---------------------------------------------------------
+
+// Writes the given character to the output as UTF-8, escaped. Call this
+// function only when the input is wide. Returns true on success. Failure
+// means there was some problem with the encoding, we'll still try to
+// update the |*begin| pointer and add a placeholder character to the
+// output so processing can continue.
+//
+// We will append the character starting at str[*begin], with the buffer |str|
+// being |length| long. |*begin| will be updated to point to the last character
+// consumed (we may consume more than one for UTF-16) so that if called in
+// a loop, incrementing the index will move to the next character.
+//
+// Every single output character will be escaped. This means that if you
+// give it an ASCII character as input, it will be escaped. Some code uses
+// this when it knows that a character is invalid according to its rules
+// for validity. If you don't want escaping for ASCII characters, you will
+// have to filter them out prior to calling this function.
+//
+// Assumes that str[*begin] is within range in the array, but does not assume
+// that any following characters are.
+inline bool AppendUTF8EscapedChar(const char16* str, int* begin, int length,
+                                  CanonOutput* output) {
+  // UTF-16 input. ReadUTFChar will handle invalid characters for us and give
+  // us the kUnicodeReplacementCharacter, so we don't have to do special
+  // checking after failure, just pass through the failure to the caller.
+  unsigned char_value;
+  bool success = ReadUTFChar(str, begin, length, &char_value);
+  AppendUTF8EscapedValue(char_value, output);
+  return success;
+}
+
+// Handles UTF-8 input. See the wide version above for usage.
+inline bool AppendUTF8EscapedChar(const char* str, int* begin, int length,
+                                  CanonOutput* output) {
+  // ReadUTFChar will handle invalid characters for us and give us the
+  // kUnicodeReplacementCharacter, so we don't have to do special checking
+  // after failure, just pass through the failure to the caller.
+  unsigned ch;
+  bool success = ReadUTFChar(str, begin, length, &ch);
+  AppendUTF8EscapedValue(ch, output);
+  return success;
+}
+
+// DecodeEscaped (defined below its Is8BitChar helpers): given a '%' character
+// at |*begin| in the string |spec|, this will decode the escaped value and
+// put it into |*unescaped_value| on success (returns true). On failure, this
+// will return false, and will not write into |*unescaped_value|.
+//
+// |*begin| will be updated to point to the last character of the escape
+// sequence so that when called with the index of a for loop, the next time
+// through it will point to the next character to be considered. On failure,
+// |*begin| will be unchanged.
+inline bool Is8BitChar(char c) {
+  (void)c;
+  return true;  // this case is specialized to avoid a warning
+}
+inline bool Is8BitChar(char16 c) {
+  return c <= 255;
+}
+
+template<typename CHAR>
+inline bool DecodeEscaped(const CHAR* spec, int* begin, int end,
+                          unsigned char* unescaped_value) {
+  if (*begin + 3 > end ||
+      !Is8BitChar(spec[*begin + 1]) || !Is8BitChar(spec[*begin + 2])) {
+    // Invalid escape sequence because there's not enough room, or the
+    // digits do not fit in 8 bits.
+    return false;
+  }
+
+  unsigned char first = static_cast<unsigned char>(spec[*begin + 1]);
+  unsigned char second = static_cast<unsigned char>(spec[*begin + 2]);
+  if (!IsHexChar(first) || !IsHexChar(second)) {
+    // Invalid hex digits, fail.
+    return false;
+  }
+
+  // Valid escape sequence.
+  *unescaped_value = (HexCharToValue(first) << 4) + HexCharToValue(second);
+  *begin += 2;
+  return true;
+}
+
+// Appends the given substring to the output, escaping "some" characters that
+// it feels may not be safe. It assumes the input values are all contained in
+// 8-bit although it allows any type.
+//
+// This is used in error cases to append invalid output so that it looks
+// approximately correct. Non-error cases should not call this function since
+// the escaping rules are not guaranteed!
+void AppendInvalidNarrowString(const char* spec, int begin, int end,
+                               CanonOutput* output);
+void AppendInvalidNarrowString(const char16* spec, int begin, int end,
+                               CanonOutput* output);
+
+// Misc canonicalization helpers ----------------------------------------------
+
+// Converts between UTF-8 and UTF-16, returning true on successful conversion.
+// The output will be appended to the given canonicalizer output (so make sure
+// it's empty if you want to replace).
+//
+// On invalid input, this will still write as much output as possible,
+// replacing the invalid characters with the "invalid character". It will
+// return false in the failure case, and the caller should not continue as
+// normal.
+GURL_API bool ConvertUTF16ToUTF8(const char16* input, int input_len,
+                                 CanonOutput* output);
+GURL_API bool ConvertUTF8ToUTF16(const char* input, int input_len,
+                                 CanonOutputT<char16>* output);
+
+// Converts from UTF-16 to 8-bit using the character set converter. If the
+// converter is NULL, this will use UTF-8.
+void ConvertUTF16ToQueryEncoding(const char16* input,
+                                 const url_parse::Component& query,
+                                 CharsetConverter* converter,
+                                 CanonOutput* output);
+
+// Applies the replacements to the given component source. The component source
+// should be pre-initialized to the "old" base. That is, all pointers will
+// point to the spec of the old URL, and all of the Parsed components will
+// be indices into that string.
+//
+// The pointers and components in the |source| for all non-NULL strings in the
+// |repl| (replacements) will be updated to reference those strings.
+// Canonicalizing with the new |source| and |parsed| can then combine URL
+// components from many different strings.
+void SetupOverrideComponents(const char* base,
+                             const Replacements<char>& repl,
+                             URLComponentSource<char>* source,
+                             url_parse::Parsed* parsed);
+
+// Like the above 8-bit version, except that it additionally converts the
+// UTF-16 input to UTF-8 before doing the overrides.
+//
+// The given utf8_buffer is used to store the converted components. They will
+// be appended one after another, with the parsed structure identifying the
+// appropriate substrings. This buffer is a parameter because the source has
+// no storage, so the buffer must have the same lifetime as the source
+// parameter owned by the caller.
+//
+// THE CALLER MUST NOT ADD TO THE |utf8_buffer| AFTER THIS CALL. Members of
+// |source| will point into this buffer, which could be invalidated if
+// additional data is added and the CanonOutput resizes its buffer.
+//
+// Returns true on success. False means that the input was not valid UTF-16,
+// although we will have still done the override with "invalid characters" in
+// place of errors.
+bool SetupUTF16OverrideComponents(const char* base,
+                                  const Replacements<char16>& repl,
+                                  CanonOutput* utf8_buffer,
+                                  URLComponentSource<char>* source,
+                                  url_parse::Parsed* parsed);
+
+// Implemented in url_canon_path.cc, these are required by the relative URL
+// resolver as well, so we declare them here.
+bool CanonicalizePartialPath(const char* spec,
+                             const url_parse::Component& path,
+                             int path_begin_in_output,
+                             CanonOutput* output);
+bool CanonicalizePartialPath(const char16* spec,
+                             const url_parse::Component& path,
+                             int path_begin_in_output,
+                             CanonOutput* output);
+
+#ifndef WIN32
+
+// Implementations of Windows' int-to-string conversions
+GURL_API int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix);
+GURL_API int _itow_s(int value, char16* buffer, size_t size_in_chars,
+                     int radix);
+
+// Secure template overloads for these functions
+template<size_t N>
+inline int _itoa_s(int value, char (&buffer)[N], int radix) {
+  return _itoa_s(value, buffer, N, radix);
+}
+
+template<size_t N>
+inline int _itow_s(int value, char16 (&buffer)[N], int radix) {
+  return _itow_s(value, buffer, N, radix);
+}
+
+// _strtoui64 and strtoull behave the same
+inline unsigned long long _strtoui64(const char* nptr,
+                                     char** endptr, int base) {
+  return strtoull(nptr, endptr, base);
+}
+
+#endif  // WIN32
+
+}  // namespace url_canon
+
+#endif  // GOOGLEURL_SRC_URL_CANON_INTERNAL_H__
diff --git a/googleurl/url_canon_internal_file.h b/googleurl/url_canon_internal_file.h
new file mode 100644
index 0000000..c37c65e
--- /dev/null
+++ b/googleurl/url_canon_internal_file.h
@@ -0,0 +1,157 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// As with url_canon_internal.h, this file is intended to be included in
+// another C++ file where the template types are defined. This allows the
+// programmer to use these functions for their own string
+// types, without bloating the code by having inline templates used in
+// every call site.
+//
+// *** This file must be included after url_canon_internal as we depend on some
+// functions in it. ***
+
+#ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__
+#define GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__
+
+#include "url_file.h"
+#include "url_parse_internal.h"
+
+using namespace url_canon;
+
+// Given a pointer into the spec, this copies and canonicalizes the drive
+// letter to the output as "X:/" (uppercase letter, colon, slash), if one is
+// found. If there is not a drive spec, it won't do anything. Returns the index
+// of the next character in the input spec (after the colon/pipe when a drive
+// spec is found, the begin offset if one is not).
+template<typename CHAR>
+static int FileDoDriveSpec(const CHAR* spec, int begin, int end,
+                           CanonOutput* output) {
+  // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo,
+  // (with backslashes instead of slashes as well).
+  int num_slashes = CountConsecutiveSlashes(spec, begin, end);
+  int after_slashes = begin + num_slashes;
+
+  if (!DoesBeginWindowsDriveSpec(spec, after_slashes, end))
+    return begin;  // Haven't consumed any characters
+
+  // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid
+  // and that it is followed by a colon/pipe.
+
+  // Normalize Windows drive letters to uppercase (explicit cast: int -> char).
+  if (spec[after_slashes] >= 'a' && spec[after_slashes] <= 'z')
+    output->push_back(static_cast<char>(spec[after_slashes] - 'a' + 'A'));
+  else
+    output->push_back(static_cast<char>(spec[after_slashes]));
+
+  // Normalize the character following it to a colon rather than pipe.
+  output->push_back(':');
+  output->push_back('/');
+  return after_slashes + 2;
+}
+
+// FileDoDriveSpec will have already added the first slash, so we need to
+// write everything following the slashes using the path canonicalizer.
+template<typename CHAR, typename UCHAR>
+static void FileDoPath(const CHAR* spec, int begin, int end,
+                       CanonOutput* output) {
+  // Normalize the number of slashes after the drive letter. The path
+  // canonicalizer expects the input to begin in a slash already so
+  // doesn't check. We want to handle the no-slashes case as well.
+  int num_slashes = CountConsecutiveSlashes(spec, begin, end);
+  int after_slashes = begin + num_slashes;
+
+  // Now use the regular path canonicalizer to canonicalize the rest of the
+  // path. We supply it with the path following the slashes. It won't prepend
+  // a slash because it assumes any nonempty path already starts with one.
+  // We explicitly filter out calls with no path here to prevent that case.
+  ParsedURL::Component sub_path(after_slashes, end - after_slashes);
+  if (sub_path.len > 0) {
+    // Give it a fake output component to write into. DoCanonicalizeFileURL
+    // will compute the full path component.
+    ParsedURL::Component fake_output_path;
+    URLCanonInternal<CHAR, UCHAR>::DoPath(
+        spec, sub_path, output, &fake_output_path);
+  }
+}
+
+template<typename CHAR, typename UCHAR>
+static bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source,
+                                  const ParsedURL& parsed,
+                                  CanonOutput* output,
+                                  ParsedURL* new_parsed) {
+  // Things we don't set in file: URLs.
+  new_parsed->username = ParsedURL::Component(0, -1);
+  new_parsed->password = ParsedURL::Component(0, -1);
+  new_parsed->port = ParsedURL::Component(0, -1);
+
+  // Scheme (known, so we don't bother running it through the more
+  // complicated scheme canonicalizer).
+  new_parsed->scheme.begin = output->length();
+  output->push_back('f');
+  output->push_back('i');
+  output->push_back('l');
+  output->push_back('e');
+  new_parsed->scheme.len = output->length() - new_parsed->scheme.begin;
+  output->push_back(':');
+
+  // Write the separator for the host.
+  output->push_back('/');
+  output->push_back('/');
+
+  // Append the host. For many file URLs, this will be empty. For UNC, this
+  // will be present.
+  // TODO(brettw) This doesn't do any checking for host name validity. We
+  // should probably handle validity checking of UNC hosts differently than
+  // for regular IP hosts.
+  bool success = URLCanonInternal<CHAR, UCHAR>::DoHost(
+      source.host, parsed.host, output, &new_parsed->host);
+
+  // Write a separator for the start of the path. We'll ignore any slashes
+  // already at the beginning of the path.
+  new_parsed->path.begin = output->length();
+  output->push_back('/');
+
+  // Copies and normalizes the "c:" at the beginning, if present.
+  int after_drive = FileDoDriveSpec(source.path, parsed.path.begin,
+                                    parsed.path.end(), output);
+
+  // Copies the rest of the path
+  FileDoPath<CHAR, UCHAR>(source.path, after_drive, parsed.path.end(), output);
+  new_parsed->path.len = output->length() - new_parsed->path.begin;
+
+  // Things following the path we can use the standard canonicalizers for.
+  success &= URLCanonInternal<CHAR, UCHAR>::DoQuery(
+      source.query, parsed.query, output, &new_parsed->query);
+  success &= URLCanonInternal<CHAR, UCHAR>::DoRef(
+      source.ref, parsed.ref, output, &new_parsed->ref);
+
+  return success;
+}
+
+#endif  // GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__
diff --git a/googleurl/url_canon_ip.h b/googleurl/url_canon_ip.h
new file mode 100644
index 0000000..41da690
--- /dev/null
+++ b/googleurl/url_canon_ip.h
@@ -0,0 +1,101 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLEURL_SRC_URL_CANON_IP_H__
+#define GOOGLEURL_SRC_URL_CANON_IP_H__
+
+#include "base/string16.h"
+#include "url_canon.h"
+#include "url_common.h"
+#include "url_parse.h"
+
+namespace url_canon {
+
+// Searches the host name for the portions of the IPv4 address. On success,
+// each component will be placed into |components| and it will return true.
+// It will return false if the host can not be separated as an IPv4 address
+// or if there are any non-7-bit characters or other characters that can not
+// be in an IP address. (This is important so we fail as early as possible for
+// common non-IP hostnames.)
+//
+// Not all components may exist. If there are only 3 components, for example,
+// the last one will have a length of -1 or 0 to indicate it does not exist.
+//
+// Note that many platforms' inet_addr will ignore everything after a space
+// in certain circumstances if the stuff before the space looks like an IP
+// address. IE6 is included in this. We do NOT handle this case. In many cases,
+// the browser's canonicalization will get run before this which converts
+// spaces to %20 (in the case of IE7) or rejects them (in the case of
+// Mozilla), so this code path never gets hit. Our host canonicalization will
+// notice these spaces and escape them, which will make IP address finding
+// fail. This seems like better behavior than stripping after a space.
+GURL_API bool FindIPv4Components(const char* spec,
+                                 const url_parse::Component& host,
+                                 url_parse::Component components[4]);
+GURL_API bool FindIPv4Components(const char16* spec,
+                                 const url_parse::Component& host,
+                                 url_parse::Component components[4]);
+
+// Converts an IPv4 address to a 32-bit number (network byte order).
+//
+// Possible return values:
+//   IPV4    - IPv4 address was successfully parsed.
+//   BROKEN  - Input was formatted like an IPv4 address, but overflow occurred
+//             during parsing.
+//   NEUTRAL - Input couldn't possibly be interpreted as an IPv4 address.
+//             It might be an IPv6 address, or a hostname.
+//
+// On success, |num_ipv4_components| will be populated with the number of
+// components in the IPv4 address.
+GURL_API CanonHostInfo::Family IPv4AddressToNumber(
+    const char* spec,
+    const url_parse::Component& host,
+    unsigned char address[4],
+    int* num_ipv4_components);
+GURL_API CanonHostInfo::Family IPv4AddressToNumber(
+    const char16* spec,
+    const url_parse::Component& host,
+    unsigned char address[4],
+    int* num_ipv4_components);
+
+// Converts an IPv6 address to a 128-bit number (network byte order), returning
+// true on success. False means that the input was not a valid IPv6 address.
+//
+// NOTE that |host| is expected to be surrounded by square brackets.
+// i.e. "[::1]" rather than "::1".
+GURL_API bool IPv6AddressToNumber(const char* spec,
+                                  const url_parse::Component& host,
+                                  unsigned char address[16]);
+GURL_API bool IPv6AddressToNumber(const char16* spec,
+                                  const url_parse::Component& host,
+                                  unsigned char address[16]);
+
+}  // namespace url_canon
+
+#endif  // GOOGLEURL_SRC_URL_CANON_IP_H__
diff --git a/googleurl/url_canon_stdstring.h b/googleurl/url_canon_stdstring.h
new file mode 100644
index 0000000..d766e05
--- /dev/null
+++ b/googleurl/url_canon_stdstring.h
@@ -0,0 +1,134 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This header file defines a canonicalizer output method class for STL
+// strings. Because the canonicalizer tries not to be dependent on the STL,
+// we have segregated it here.
+
+#ifndef GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
+#define GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
+
+#include <string>
+#include "url_canon.h"
+
+namespace url_canon {
+
+// CanonOutput implementation that writes into a std::string supplied to the
+// constructor. The string is not owned by this object, so the caller must
+// keep it alive for as long as this object exists.
+//
+// Whatever the string already holds is preserved and new output is appended
+// after it. Callers should reserve() about as much space as they expect to
+// be written; the string is resized when needed, but that is slow.
+//
+// While output is being written the string is usually longer than the data
+// written so far: the constructor grows it to its full capacity() and
+// Resize() sets it to the requested size. This matters because resizing is
+// slow and because |buffer_| points straight at the string's own storage.
+//
+// Therefore, the user should call Complete() before using the string that
+// this class wrote into. Complete() resizes the string to |cur_len_|, which
+// drops the unused space at the end.
+class StdStringCanonOutput : public CanonOutput {
+ public:
+  StdStringCanonOutput(std::string* str) : CanonOutput(), str_(str) {
+    cur_len_ = static_cast<int>(str_->size());  // Keep the existing data.
+    // Make the string's whole capacity available as the write buffer.
+    str_->resize(str_->capacity());
+    buffer_len_ = static_cast<int>(str_->size());
+    buffer_ = !str_->empty() ? &(*str_)[0] : NULL;
+  }
+  virtual ~StdStringCanonOutput() {
+    // The string belongs to the caller, so there is nothing to free.
+  }
+
+  // Call once writing is finished and before the string is read.
+  void Complete() {
+    str_->resize(cur_len_);
+    buffer_len_ = cur_len_;
+  }
+
+  virtual void Resize(int sz) {
+    str_->resize(sz);
+    buffer_len_ = sz;
+    buffer_ = !str_->empty() ? &(*str_)[0] : NULL;
+  }
+
+ protected:
+  std::string* str_;
+};
+
+// A variant of the Replacements class whose setters accept standard strings
+// rather than a character pointer plus a url_parse::Component.
+//
+// The strings passed as arguments are not copied and must remain valid until
+// this class goes out of scope.
+template<typename STR>
+class StdStringReplacements :
+    public url_canon::Replacements<typename STR::value_type> {
+ public:
+  // Each setter below hands the string's data() and a component covering the
+  // whole string to the base-class setter of the matching name.
+  void SetSchemeStr(const STR& s) {
+    this->SetScheme(s.data(), Whole(s));
+  }
+  void SetUsernameStr(const STR& s) {
+    this->SetUsername(s.data(), Whole(s));
+  }
+  void SetPasswordStr(const STR& s) {
+    this->SetPassword(s.data(), Whole(s));
+  }
+  void SetHostStr(const STR& s) {
+    this->SetHost(s.data(), Whole(s));
+  }
+  void SetPortStr(const STR& s) {
+    this->SetPort(s.data(), Whole(s));
+  }
+  void SetPathStr(const STR& s) {
+    this->SetPath(s.data(), Whole(s));
+  }
+  void SetQueryStr(const STR& s) {
+    this->SetQuery(s.data(), Whole(s));
+  }
+  void SetRefStr(const STR& s) {
+    this->SetRef(s.data(), Whole(s));
+  }
+
+ private:
+  // Returns the component that starts at 0 and spans s.length() characters.
+  static url_parse::Component Whole(const STR& s) {
+    return url_parse::Component(0, static_cast<int>(s.length()));
+  }
+};
+
+}  // namespace url_canon
+
+#endif  // GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
+
diff --git a/googleurl/url_common.h b/googleurl/url_common.h
new file mode 100644
index 0000000..ac045a8
--- /dev/null
+++ b/googleurl/url_common.h
@@ -0,0 +1,54 @@
+// Copyright 2010, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLEURL_SRC_URL_COMMON_H__
+#define GOOGLEURL_SRC_URL_COMMON_H__
+
+// GURL_IMPLEMENTATION defaults to 0 when the build does not define it.
+#ifndef GURL_IMPLEMENTATION
+#define GURL_IMPLEMENTATION 0
+#endif
+
+// GURL_API is the decoration placed on the library's public declarations.
+#if !defined(GURL_DLL)
+// Not a DLL.
+#define GURL_API
+#elif !defined(WIN32)
+// Non-Windows DLLs.
+#define GURL_API __attribute__((visibility("default")))
+#elif GURL_IMPLEMENTATION
+// Windows DLL with GURL_IMPLEMENTATION nonzero: export the symbols.
+#define GURL_API __declspec(dllexport)
+#else
+// Windows DLL with GURL_IMPLEMENTATION zero: import the symbols.
+#define GURL_API __declspec(dllimport)
+#endif
+
+#endif  // GOOGLEURL_SRC_URL_COMMON_H__
+
diff --git a/googleurl/url_file.h b/googleurl/url_file.h
new file mode 100644
index 0000000..cb9c89f
--- /dev/null
+++ b/googleurl/url_file.h
@@ -0,0 +1,108 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Provides shared functions used by the internals of the parser and
+// canonicalizer for file URLs. Do not use outside of these modules.
+
+#ifndef GOOGLEURL_SRC_URL_FILE_H__
+#define GOOGLEURL_SRC_URL_FILE_H__
+
+#include "url_parse_internal.h"
+
+namespace url_parse {
+
+#ifdef WIN32
+
+// Either ':' or '|' may follow the drive letter, so "c:" and "c|" both work.
+inline bool IsWindowsDriveSeparator(char16 ch) {
+  return ch == '|' || ch == ':';
+}
+inline bool IsWindowsDriveLetter(char16 ch) {  // ASCII A-Z or a-z only.
+  return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z');
+}
+
+#endif  // WIN32
+
+// Returns the index of the first slash (per IsURLSlash) found at or after
+// |begin_index|. If there is none before |spec_len|, returns spec_len, or
+// begin_index itself when begin_index is already at or past spec_len.
+template<typename CHAR>
+inline int FindNextSlash(const CHAR* spec, int begin_index, int spec_len) {
+  int pos = begin_index;
+  for (; pos < spec_len && !IsURLSlash(spec[pos]); ++pos) {}
+  return pos;
+}
+
+#ifdef WIN32
+
+// Reports whether a Windows drive spec, for example "c:", begins at
+// |start_offset| in |spec|. A start_offset equal to or larger than
+// |spec_len| is handled here (the answer is false) to simplify callers.
+//
+// A true result guarantees that spec[start_offset] is a drive letter and
+// spec[start_offset + 1] is a drive separator, as judged by
+// IsWindowsDriveLetter and IsWindowsDriveSeparator.
+template<typename CHAR>
+inline bool DoesBeginWindowsDriveSpec(const CHAR* spec, int start_offset,
+                                      int spec_len) {
+  // Two characters are needed: the letter and the separator after it.
+  const bool has_room = (spec_len - start_offset) >= 2;
+
+  // && short-circuits, so |spec| is only read when there is room, and the
+  // separator is only examined once the letter has been accepted.
+  return has_room && IsWindowsDriveLetter(spec[start_offset]) &&
+         IsWindowsDriveSeparator(spec[start_offset + 1]);
+}
+
+// Reports whether a UNC path prefix (two consecutive slashes, for example
+// "\\") begins at |start_offset| in |text|. A start_offset equal to or larger
+// than |len| is handled here (the answer is false) to simplify callers.
+//
+// With |strict_slashes| set, only two backslashes are accepted, as is
+// standard for Windows. Otherwise any two characters accepted by IsURLSlash
+// qualify, which is what a lot of URL handling uses.
+template<typename CHAR>
+inline bool DoesBeginUNCPath(const CHAR* text,
+                             int start_offset,
+                             int len,
+                             bool strict_slashes) {
+  if (len - start_offset < 2)
+    return false;  // Fewer than two characters are left.
+
+  // |pair| addresses the two characters under test.
+  const CHAR* pair = text + start_offset;
+  return strict_slashes ? (pair[0] == '\\' && pair[1] == '\\')
+                        : (IsURLSlash(pair[0]) && IsURLSlash(pair[1]));
+}
+
+#endif  // WIN32
+
+}  // namespace url_parse
+
+#endif  // GOOGLEURL_SRC_URL_FILE_H__
diff --git a/googleurl/url_parse.h b/googleurl/url_parse.h
new file mode 100644
index 0000000..1eb6fcb
--- /dev/null
+++ b/googleurl/url_parse.h
@@ -0,0 +1,373 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLEURL_SRC_URL_PARSE_H__
+#define GOOGLEURL_SRC_URL_PARSE_H__
+
+#include <string>
+
+#include "base/basictypes.h"
+#include "base/string16.h"
+#include "url_common.h"
+
+namespace url_parse {
+
+// Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and
+// KURLGoogle.cpp still rely on this type.
+typedef char16 UTF16Char;
+
+// Component ------------------------------------------------------------------
+
+// Identifies a substring of a URL by its starting offset and its length.
+// Only the two numbers are stored here, never the text itself. A length of
+// -1 marks a component that is unspecified.
+struct Component {
+  // Creates an unspecified component: begin 0, length -1.
+  Component() : begin(0), len(-1) {}
+
+  // Creates a component from a starting offset |b| and a length |l|.
+  Component(int b, int l) : begin(b), len(l) {}
+
+  // Offset one past the last character. Only meaningful when len is not -1.
+  int end() const { return begin + len; }
+
+  // True when a length has been given (len != -1). A valid component can
+  // still have length 0, which records that it exists but is empty.
+  bool is_valid() const { return len != -1; }
+
+  // True when the component has at least one character. False means it is
+  // either empty (len == 0) or invalid (len == -1).
+  bool is_nonempty() const { return len > 0; }
+
+  // Puts the component back into the unspecified state.
+  void reset() {
+    len = -1;
+    begin = 0;
+  }
+
+  // Components are equal when both the offset and the length match.
+  bool operator==(const Component& other) const {
+    return len == other.len && begin == other.begin;
+  }
+
+  int begin;  // Offset in the string at which this component starts.
+  int len;    // Number of characters, or -1 if the component is unspecified.
+};
+
+// Returns the component for the half-open range [begin, end).
+inline Component MakeRange(int begin, int end) {
+  const int length = end - begin;
+  return Component(begin, length);
+}
+
+// Parsed ---------------------------------------------------------------------
+
+// A structure that holds the identified parts of an input URL. This structure
+// does NOT store the URL itself. The caller will have to store the URL text
+// and its corresponding Parsed structure separately.
+//
+// Typical usage would be:
+//
+//    url_parse::Parsed parsed;
+//    url_parse::Component scheme;
+//    if (!url_parse::ExtractScheme(url, url_len, &scheme))
+//      return I_CAN_NOT_FIND_THE_SCHEME_DUDE;
+//
+//    if (IsStandardScheme(url, scheme))  // Not provided by this component
+//      url_parse::ParseStandardURL(url, url_len, &parsed);
+//    else if (IsFileURL(url, scheme))    // Not provided by this component
+//      url_parse::ParseFileURL(url, url_len, &parsed);
+//    else
+//      url_parse::ParsePathURL(url, url_len, &parsed);
+//
+struct Parsed {
+  // Identifies different components.
+  enum ComponentType {
+    SCHEME,
+    USERNAME,
+    PASSWORD,
+    HOST,
+    PORT,
+    PATH,
+    QUERY,
+    REF
+  };
+
+  // The default constructor is sufficient for the components, but inner_parsed_
+  // requires special handling.
+  GURL_API Parsed();
+  GURL_API Parsed(const Parsed&);
+  GURL_API Parsed& operator=(const Parsed&);
+  GURL_API ~Parsed();
+
+  // Returns the length of the URL (the end of the last component).
+  //
+  // Note that for some invalid, non-canonical URLs, this may not be the length
+  // of the string. For example "http://": the parsed structure will only
+  // contain an entry for the four-character scheme, and it doesn't know about
+  // the "://". For all other last-components, it will return the real length.
+  GURL_API int Length() const;
+
+  // Returns the number of characters before the given component if it exists,
+  // or where the component would be if it did exist. This will return the
+  // string length if the component would be appended to the end.
+  //
+  // Note that this can get a little funny for the port, query, and ref
+  // components which have a delimiter that is not counted as part of the
+  // component. The |include_delimiter| flag controls if you want this counted
+  // as part of the component or not when the component exists.
+  //
+  // This example shows the difference between the two flags for two of these
+  // delimited components that are present (the port and query) and one that
+  // isn't (the reference). The components that this flag affects are marked
+  // with a *.
+  //                 0         1         2
+  //                 012345678901234567890
+  // Example input:  http://foo:80/?query
+  //              include_delim=true,  ...=false  ("<-" indicates different)
+  //      SCHEME: 0                    0
+  //    USERNAME: 5                    5
+  //    PASSWORD: 5                    5
+  //        HOST: 7                    7
+  //       *PORT: 10                   11 <-
+  //        PATH: 13                   13
+  //      *QUERY: 14                   15 <-
+  //        *REF: 20                   20
+  //
+  GURL_API int CountCharactersBefore(ComponentType type,
+                                     bool include_delimiter) const;
+
+  // Scheme without the colon: "http://foo/" would have a scheme of "http".
+  // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there
+  // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed
+  // to start at the beginning of the string if there are preceding whitespace
+  // or control characters.
+  Component scheme;
+
+  // Username. Specified in URLs with an @ sign before the host. See |password|.
+  Component username;
+
+  // Password. The length will be -1 if unspecified, 0 if specified but empty.
+  // Not all URLs with a username have a password, as in "http://me@host/".
+  // The password is separated from the username with a colon, as in
+  // "http://me:secret@host/".
+  Component password;
+
+  // Host name.
+  Component host;
+
+  // Port number.
+  Component port;
+
+  // Path, this is everything following the host name. Length will be -1 if
+  // unspecified. This includes the preceding slash, so the path on
+  // "http://www.google.com/asdf" is "/asdf". As a result, it is impossible to
+  // have a 0 length path, it will be -1 in cases like "http://host?foo".
+  // Note that we treat backslashes the same as slashes.
+  Component path;
+
+  // Stuff between the ? and the # after the path. This does not include the
+  // preceding ? character. Length will be -1 if unspecified, 0 if there is
+  // a question mark but no query string.
+  Component query;
+
+  // Indicated by a #, this is everything following the hash sign (not
+  // including it). If there are multiple hash signs, we'll use the last one.
+  // Length will be -1 if there is no hash sign, or 0 if there is one but
+  // nothing follows it.
+  Component ref;
+
+  // This is used for nested URL types, currently only filesystem.  If you
+  // parse a filesystem URL, the resulting Parsed will have a nested
+  // inner_parsed_ to hold the parsed inner URL's component information.
+  // For all other url types [including the inner URL], it will be NULL.
+  Parsed* inner_parsed() const {
+    return inner_parsed_;
+  }
+
+  void set_inner_parsed(const Parsed& _inner_parsed) {
+    if (!inner_parsed_)
+      inner_parsed_ = new Parsed(_inner_parsed);  // Allocate on first use.
+    else
+      *inner_parsed_ = _inner_parsed;  // Reuse the existing allocation.
+  }
+
+  void clear_inner_parsed() {
+    if (inner_parsed_) {
+      delete inner_parsed_;
+      inner_parsed_ = NULL;  // Do not keep a pointer to the freed object.
+    }
+  }
+
+ private:
+  Parsed* inner_parsed_;  // This object is owned and managed by this struct.
+};
+
+// Initialization functions ---------------------------------------------------
+//
+// These functions parse the given URL, filling in all of the structure's
+// components. These functions can not fail, they will always do their best
+// at interpreting the input given.
+//
+// The string length of the URL MUST be specified, we do not check for NULLs
+// at any point in the process, and will actually handle embedded NULLs.
+//
+// IMPORTANT: These functions do NOT hang on to the given pointer or copy it
+// in any way. See the comment above the struct.
+//
+// The 8-bit versions require UTF-8 encoding.
+
+// StandardURL is for when the scheme is known to be one that has an
+// authority (host) like "http". This function will not handle weird ones
+// like "about:" and "javascript:", or do the right thing for "file:" URLs.
+GURL_API void ParseStandardURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParseStandardURL(const char16* url, int url_len, Parsed* parsed);
+
+// PathURL is for when the scheme is known not to have an authority (host)
+// section but that aren't file URLs either. The scheme is parsed, and
+// everything after the scheme is considered as the path. This is used for
+// things like "about:" and "javascript:"
+GURL_API void ParsePathURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParsePathURL(const char16* url, int url_len, Parsed* parsed);
+
+// FileURL is for file URLs. There are some special rules for interpreting
+// these.
+GURL_API void ParseFileURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParseFileURL(const char16* url, int url_len, Parsed* parsed);
+
+// Filesystem URLs are structured differently than other URLs.
+GURL_API void ParseFileSystemURL(const char* url,
+                                 int url_len,
+                                 Parsed* parsed);
+GURL_API void ParseFileSystemURL(const char16* url,
+                                 int url_len,
+                                 Parsed* parsed);
+
+// MailtoURL is for mailto: urls. They are made up of scheme, path and query.
+GURL_API void ParseMailtoURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed);
+
+// Helper functions -----------------------------------------------------------
+
+// Locates the scheme according to the URL parser's rules. This function is
+// designed so the caller can find the scheme and call the correct Parse*
+// function according to their known scheme types.
+//
+// It also does not perform any validation on the scheme.
+//
+// This function will return true if the scheme is found and will put the
+// scheme's range into *scheme. False means no scheme could be found. Note
+// that a URL beginning with a colon has a scheme, but it is empty, so this
+// function will return true but *scheme will = (0,0).
+//
+// The scheme is found by skipping spaces and control characters at the
+// beginning, and taking everything from there to the first colon to be the
+// scheme. The character at scheme.end() will be the colon (we may enhance
+// this to handle full width colons or something, so don't count on the
+// actual character value). The character at scheme.end()+1 will be the
+// beginning of the rest of the URL, be it the authority or the path (or the
+// end of the string).
+//
+// The 8-bit version requires UTF-8 encoding.
+GURL_API bool ExtractScheme(const char* url, int url_len, Component* scheme);
+GURL_API bool ExtractScheme(const char16* url, int url_len, Component* scheme);
+
+// Returns true if ch is a character that terminates the authority segment
+// of a URL.
+GURL_API bool IsAuthorityTerminator(char16 ch);
+
+// Does a best effort parse of input |spec|, in range |auth|. If a particular
+// component is not found, it will be set to invalid.
+GURL_API void ParseAuthority(const char* spec,
+                             const Component& auth,
+                             Component* username,
+                             Component* password,
+                             Component* hostname,
+                             Component* port_num);
+GURL_API void ParseAuthority(const char16* spec,
+                             const Component& auth,
+                             Component* username,
+                             Component* password,
+                             Component* hostname,
+                             Component* port_num);
+
+// Computes the integer port value from the given port component. The port
+// component should have been identified by one of the init functions on
+// |Parsed| for the given input url.
+//
+// The return value will be a positive integer between 0 and 64K, or one of
+// the two special values below.
+enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 };
+GURL_API int ParsePort(const char* url, const Component& port);
+GURL_API int ParsePort(const char16* url, const Component& port);
+
+// Extracts the range of the file name in the given url. The path must
+// already have been computed by the parse function, and the matching URL
+// and extracted path are provided to this function. The filename is
+// defined as being everything from the last slash/backslash of the path
+// to the end of the path.
+//
+// The file name will be empty if the path is empty or there is nothing
+// following the last slash.
+//
+// The 8-bit version requires UTF-8 encoding.
+GURL_API void ExtractFileName(const char* url,
+                              const Component& path,
+                              Component* file_name);
+GURL_API void ExtractFileName(const char16* url,
+                              const Component& path,
+                              Component* file_name);
+
+// Extract the first key/value from the range defined by |*query|. Updates
+// |*query| to start at the end of the extracted key/value pair. This is
+// designed for use in a loop: you can keep calling it with the same query
+// object and it will iterate over all items in the query.
+//
+// Some key/value pairs may have the key, the value, or both be empty (for
+// example, the query string "?&"). These will be returned. Note that an empty
+// last parameter "foo.com?" or "foo.com?a&" will not be returned, this case
+// is the same as "done."
+//
+// The initial query component should not include the '?' (this is the default
+// for parsed URLs).
+//
+// If no key/value are found |*key| and |*value| will be unchanged and it will
+// return false.
+GURL_API bool ExtractQueryKeyValue(const char* url,
+                                   Component* query,
+                                   Component* key,
+                                   Component* value);
+GURL_API bool ExtractQueryKeyValue(const char16* url,
+                                   Component* query,
+                                   Component* key,
+                                   Component* value);
+
+}  // namespace url_parse
+
+#endif  // GOOGLEURL_SRC_URL_PARSE_H__
diff --git a/googleurl/url_parse_internal.h b/googleurl/url_parse_internal.h
new file mode 100644
index 0000000..32b306a
--- /dev/null
+++ b/googleurl/url_parse_internal.h
@@ -0,0 +1,112 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Contains common inline helper functions used by the URL parsing routines.
+
+#ifndef GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__
+#define GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__
+
+#include "url_parse.h"
+
+namespace url_parse {
+
+// Forward slashes and backslashes are interchangeable, for IE compatibility.
+inline bool IsURLSlash(char16 ch) {
+  return ch == '\\' || ch == '/';
+}
+
+// Tells whether |ch| should be stripped from the ends of a URL: true for the
+// space character and for every code unit that compares below it.
+inline bool ShouldTrimFromURL(char16 ch) {
+  return !(ch > ' ');
+}
+
+// Narrows the range [|*begin|, |*len|) of |spec| so that it neither starts nor
+// ends with a character that ShouldTrimFromURL() accepts. Note that |*len| is
+// *not* a count of characters following |*begin|; it is the end position in
+// |spec| (one past the last character of the range), on input and on output.
+template<typename CHAR>
+inline void TrimURL(const CHAR* spec, int* begin, int* len) {
+  int first = *begin;
+  int last = *len;
+  // Advance past leading trimmable characters.
+  for (; first < last && ShouldTrimFromURL(spec[first]); ++first) {}
+  // Back up over trailing trimmable characters. Stopping at |first| keeps an
+  // all-trimmable input from walking back before the start of the range.
+  for (; last > first && ShouldTrimFromURL(spec[last - 1]); --last) {}
+  *begin = first;
+  *len = last;
+}
+
+// Returns how many slash characters (as judged by IsURLSlash) appear in a row
+// in |str| starting at |begin_offset|, never reading at or past |str_len|.
+template<typename CHAR>
+inline int CountConsecutiveSlashes(const CHAR *str,
+                                   int begin_offset, int str_len) {
+  int pos = begin_offset;
+  for (; pos < str_len; ++pos) {
+    if (!IsURLSlash(str[pos])) break;
+  }
+  return pos - begin_offset;
+}
+
+// Internal functions in url_parse.cc that parse the path, that is, everything
+// following the authority section. The input is the range of everything
+// following the authority section, and the output is the identified ranges.
+//
+// This is designed for the file URL parser or other consumers who may do
+// special stuff at the beginning, but want regular path parsing, it just
+// maps to the internal parsing function for paths.
+void ParsePathInternal(const char* spec,
+                       const Component& path,
+                       Component* filepath,
+                       Component* query,
+                       Component* ref);
+void ParsePathInternal(const char16* spec,
+                       const Component& path,
+                       Component* filepath,
+                       Component* query,
+                       Component* ref);
+
+
+// Given a spec and a pointer to the character after the colon following the
+// scheme, this parses it and fills in the structure. Every item in the parsed
+// structure is filled EXCEPT for the scheme, which is untouched.
+void ParseAfterScheme(const char* spec,
+                      int spec_len,
+                      int after_scheme,
+                      Parsed* parsed);
+void ParseAfterScheme(const char16* spec,
+                      int spec_len,
+                      int after_scheme,
+                      Parsed* parsed);
+
+}  // namespace url_parse
+
+#endif  // GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__
diff --git a/googleurl/url_test_utils.h b/googleurl/url_test_utils.h
new file mode 100644
index 0000000..77acf12
--- /dev/null
+++ b/googleurl/url_test_utils.h
@@ -0,0 +1,78 @@
+// Copyright 2007 Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Convenience functions for string conversions.
+// These are mostly intended for use in unit tests.
+
+#ifndef GOOGLEURL_SRC_URL_TEST_UTILS_H__
+#define GOOGLEURL_SRC_URL_TEST_UTILS_H__
+
+#include <string>
+
+#include "base/string16.h"
+#include "url_canon_internal.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace url_test_utils {
+
+// Builds a string16 from a NUL-terminated wchar_t string by casting every
+// element to char16, discarding any bits that do not fit. Not meant for true
+// UTF-32 input: nothing is re-encoded as a surrogate pair.
+inline string16 WStringToUTF16(const wchar_t* src) {
+  string16 result;
+  const int count = static_cast<int>(wcslen(src));
+  int pos = 0;
+  while (pos < count)
+    result.push_back(static_cast<char16>(src[pos++]));
+  return result;
+}
+
+// Converts UTF-8 |src| to UTF-16, reporting problems via gtest expectations.
+inline string16 ConvertUTF8ToUTF16(const std::string& src) {
+  int length = static_cast<int>(src.length());
+  EXPECT_LT(length, 1024);  // Inputs are expected to stay under 1024 bytes.
+  url_canon::RawCanonOutputW<1024> output;  // Receives the UTF-16 result.
+  EXPECT_TRUE(url_canon::ConvertUTF8ToUTF16(src.data(), length, &output));
+  return string16(output.data(), output.length());  // Copy out of |output|.
+}
+
+// Converts UTF-16 |src| to UTF-8; a gtest expectation checks the conversion.
+inline std::string ConvertUTF16ToUTF8(const string16& src) {
+  std::string str;
+  url_canon::StdStringCanonOutput output(&str);  // Wraps |str| as the output.
+  EXPECT_TRUE(url_canon::ConvertUTF16ToUTF8(src.data(),
+                                            static_cast<int>(src.length()),
+                                            &output));
+  output.Complete();  // NOTE(review): presumably finalizes |str|; confirm.
+  return str;
+}
+
+}  // namespace url_test_utils
+
+#endif  // GOOGLEURL_SRC_URL_TEST_UTILS_H__
diff --git a/googleurl/url_util.h b/googleurl/url_util.h
new file mode 100644
index 0000000..32ab987
--- /dev/null
+++ b/googleurl/url_util.h
@@ -0,0 +1,229 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLEURL_SRC_URL_UTIL_H__
+#define GOOGLEURL_SRC_URL_UTIL_H__
+
+#include <string>
+
+#include "base/string16.h"
+#include "url_common.h"
+#include "url_parse.h"
+#include "url_canon.h"
+
+namespace url_util {
+
+// Init ------------------------------------------------------------------------
+
+// Initialization is NOT required, it will be implicitly initialized when first
+// used. However, this implicit initialization is NOT threadsafe. If you are
+// using this library in a threaded environment and don't have a consistent
+// "first call" (an example might be calling "AddStandardScheme" with your
+// special application-specific schemes) then you will want to call initialize
+// before spawning any threads.
+//
+// It is OK to call this function more than once, subsequent calls will simply
+// "noop", unless Shutdown() was called in the meantime. This will also be a
+// "noop" if other calls to the library have forced an initialization
+// beforehand.
+GURL_API void Initialize();
+
+// Cleanup is not required, although some strings may leak without it. For most
+// user applications, this is fine. If you're using it in a library that may
+// get loaded and unloaded, you'll want to call this to properly clean up your
+// library.
+GURL_API void Shutdown();
+
+// Schemes --------------------------------------------------------------------
+
+// Adds an application-defined scheme to the internal list of "standard" URL
+// schemes. This function is not threadsafe and can not be called concurrently
+// with any other url_util function. It will assert if the list of standard
+// schemes has been locked (see LockStandardSchemes).
+GURL_API void AddStandardScheme(const char* new_scheme);
+
+// Sets a flag to prevent future calls to AddStandardScheme from succeeding.
+//
+// This is designed to help prevent errors for multithreaded applications.
+// Normal usage would be to call AddStandardScheme for your custom schemes at
+// the beginning of program initialization, and then LockStandardSchemes. This
+// prevents future callers from mistakenly calling AddStandardScheme when the
+// program is running with multiple threads, where such usage would be
+// dangerous.
+//
+// We could have had AddStandardScheme use a lock instead, but that would add
+// some platform-specific dependencies we don't otherwise have now, and is
+// overkill considering the normal usage is so simple.
+GURL_API void LockStandardSchemes();
+
+// Locates the scheme in the given string and places it into |found_scheme|,
+// which may be NULL to indicate the caller does not care about the range.
+//
+// Returns whether the given |compare| scheme matches the scheme found in the
+// input (if any). The |compare| scheme must be a valid canonical scheme or
+// the result of the comparison is undefined.
+GURL_API bool FindAndCompareScheme(const char* str,
+                                   int str_len,
+                                   const char* compare,
+                                   url_parse::Component* found_scheme);
+GURL_API bool FindAndCompareScheme(const char16* str,
+                                   int str_len,
+                                   const char* compare,
+                                   url_parse::Component* found_scheme);
+// std::string convenience overload; forwards to the (pointer, length) form.
+inline bool FindAndCompareScheme(const std::string& str, const char* compare,
+                                 url_parse::Component* found_scheme) {
+  const int str_len = static_cast<int>(str.size());
+  return FindAndCompareScheme(str.data(), str_len, compare, found_scheme);
+}
+// string16 convenience overload; forwards to the (pointer, length) form.
+inline bool FindAndCompareScheme(const string16& str, const char* compare,
+                                 url_parse::Component* found_scheme) {
+  const int str_len = static_cast<int>(str.size());
+  return FindAndCompareScheme(str.data(), str_len, compare, found_scheme);
+}
+
+// Returns true if the given string represents a standard URL. This means that
+// the scheme is in the list of known standard schemes.
+GURL_API bool IsStandard(const char* spec,
+                         const url_parse::Component& scheme);
+GURL_API bool IsStandard(const char16* spec,
+                         const url_parse::Component& scheme);
+
+// TODO(brettw) remove this. Temporary compatibility shim that keeps the WebKit
+// build working when this version is synced via Chrome; |spec_len| is ignored.
+inline bool IsStandard(const char* spec, int spec_len,
+                       const url_parse::Component& scheme) {
+  static_cast<void>(spec_len);  // Unused; silences compiler warnings.
+  return IsStandard(spec, scheme);
+}
+
+// URL library wrappers -------------------------------------------------------
+
+// Parses the given spec according to the extracted scheme type. Normal users
+// should use the URL object, although this may be useful if performance is
+// critical and you don't want to do the heap allocation for the std::string.
+//
+// As with the url_canon::Canonicalize* functions, the charset converter can
+// be NULL to use UTF-8 (it will be faster in this case).
+//
+// Returns true if a valid URL was produced, false if not. On failure, the
+// output and parsed structures will still be filled and will be consistent,
+// but they will not represent a loadable URL.
+GURL_API bool Canonicalize(const char* spec,
+                           int spec_len,
+                           url_canon::CharsetConverter* charset_converter,
+                           url_canon::CanonOutput* output,
+                           url_parse::Parsed* output_parsed);
+GURL_API bool Canonicalize(const char16* spec,
+                           int spec_len,
+                           url_canon::CharsetConverter* charset_converter,
+                           url_canon::CanonOutput* output,
+                           url_parse::Parsed* output_parsed);
+
+// Resolves a potentially relative URL relative to the given parsed base URL.
+// The base MUST be valid. The resulting canonical URL and parsed information
+// will be placed into the given out variables.
+//
+// The relative need not be relative. If we discover that it's absolute, this
+// will produce a canonical version of that URL. See Canonicalize() for more
+// about the charset_converter.
+//
+// Returns true if the output is valid, false if the input could not produce
+// a valid URL.
+GURL_API bool ResolveRelative(const char* base_spec,
+                              int base_spec_len,
+                              const url_parse::Parsed& base_parsed,
+                              const char* relative,
+                              int relative_length,
+                              url_canon::CharsetConverter* charset_converter,
+                              url_canon::CanonOutput* output,
+                              url_parse::Parsed* output_parsed);
+GURL_API bool ResolveRelative(const char* base_spec,
+                              int base_spec_len,
+                              const url_parse::Parsed& base_parsed,
+                              const char16* relative,
+                              int relative_length,
+                              url_canon::CharsetConverter* charset_converter,
+                              url_canon::CanonOutput* output,
+                              url_parse::Parsed* output_parsed);
+
+// Replaces components in the given VALID input url. The new canonical URL info
+// is written to output and out_parsed.
+//
+// Returns true if the resulting URL is valid.
+GURL_API bool ReplaceComponents(
+    const char* spec,
+    int spec_len,
+    const url_parse::Parsed& parsed,
+    const url_canon::Replacements<char>& replacements,
+    url_canon::CharsetConverter* charset_converter,
+    url_canon::CanonOutput* output,
+    url_parse::Parsed* out_parsed);
+GURL_API bool ReplaceComponents(
+    const char* spec,
+    int spec_len,
+    const url_parse::Parsed& parsed,
+    const url_canon::Replacements<char16>& replacements,
+    url_canon::CharsetConverter* charset_converter,
+    url_canon::CanonOutput* output,
+    url_parse::Parsed* out_parsed);
+
+// String helper functions ----------------------------------------------------
+
+// Compare the lower-case form of the given string against the given ASCII
+// string.  This is useful for checking whether an input string matches some
+// token, and it is optimized to avoid intermediate string copies.
+//
+// The versions of this function that don't take a b_end assume that the b
+// string is NULL terminated.
+GURL_API bool LowerCaseEqualsASCII(const char* a_begin,
+                                   const char* a_end,
+                                   const char* b);
+GURL_API bool LowerCaseEqualsASCII(const char* a_begin,
+                                   const char* a_end,
+                                   const char* b_begin,
+                                   const char* b_end);
+GURL_API bool LowerCaseEqualsASCII(const char16* a_begin,
+                                   const char16* a_end,
+                                   const char* b);
+
+// Unescapes the given string using URL escaping rules.
+GURL_API void DecodeURLEscapeSequences(const char* input, int length,
+                                       url_canon::CanonOutputW* output);
+
+// Escapes the given string as defined by the JS method encodeURIComponent.  See
+// https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/encodeURIComponent
+GURL_API void EncodeURIComponent(const char* input, int length,
+                                 url_canon::CanonOutput* output);
+
+
+}  // namespace url_util
+
+#endif  // GOOGLEURL_SRC_URL_UTIL_H__
diff --git a/googleurl/url_util_internal.h b/googleurl/url_util_internal.h
new file mode 100644
index 0000000..38335fd
--- /dev/null
+++ b/googleurl/url_util_internal.h
@@ -0,0 +1,56 @@
+// Copyright 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLEURL_SRC_URL_UTIL_INTERNAL_H__
+#define GOOGLEURL_SRC_URL_UTIL_INTERNAL_H__
+
+#include <string>
+
+#include "base/string16.h"
+#include "url_common.h"
+#include "url_parse.h"
+
+namespace url_util {
+
+extern const char kFileScheme[];
+extern const char kFileSystemScheme[];
+extern const char kMailtoScheme[];
+
+// Given a string and a range inside the string, compares it to the given
+// lower-case |compare_to| buffer.
+bool CompareSchemeComponent(const char* spec,
+                            const url_parse::Component& component,
+                            const char* compare_to);
+bool CompareSchemeComponent(const char16* spec,
+                            const url_parse::Component& component,
+                            const char* compare_to);
+
+}  // namespace url_util
+
+#endif  // GOOGLEURL_SRC_URL_UTIL_INTERNAL_H__
author	Andreas Baumann <abaumann@yahoo.com>	2012-08-04 14:01:19 +0200
committer	Andreas Baumann <abaumann@yahoo.com>	2012-08-04 14:01:19 +0200
commit	9473c0bb8d1a69a042e1fd745fb2f76ea0b8ac27 (patch)
tree	f88532f9adc9d15514f484cdf65e21c78d72e480 /googleurl
parent	4029e28c299049e19972556eeb22cf6d15147eab (diff)
download	crawler-9473c0bb8d1a69a042e1fd745fb2f76ea0b8ac27.tar.gz crawler-9473c0bb8d1a69a042e1fd745fb2f76ea0b8ac27.tar.bz2