diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-08-04 14:01:19 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-08-04 14:01:19 +0200 |
commit | 9473c0bb8d1a69a042e1fd745fb2f76ea0b8ac27 (patch) | |
tree | f88532f9adc9d15514f484cdf65e21c78d72e480 /googleurl | |
parent | 4029e28c299049e19972556eeb22cf6d15147eab (diff) | |
download | crawler-9473c0bb8d1a69a042e1fd745fb2f76ea0b8ac27.tar.gz crawler-9473c0bb8d1a69a042e1fd745fb2f76ea0b8ac27.tar.bz2 |
added google url library
Diffstat (limited to 'googleurl')
-rw-r--r-- | googleurl/GNUmakefile | 50 | ||||
-rw-r--r-- | googleurl/LICENSE.txt | 65 | ||||
-rw-r--r-- | googleurl/README.txt | 185 | ||||
-rw-r--r-- | googleurl/base/README.txt | 2 | ||||
-rw-r--r-- | googleurl/base/basictypes.h | 88 | ||||
-rw-r--r-- | googleurl/base/logging.cc | 380 | ||||
-rw-r--r-- | googleurl/base/logging.h | 489 | ||||
-rw-r--r-- | googleurl/base/scoped_ptr.h | 322 | ||||
-rw-r--r-- | googleurl/base/string16.cc | 94 | ||||
-rw-r--r-- | googleurl/base/string16.h | 193 | ||||
-rw-r--r-- | googleurl/gurl.h | 392 | ||||
-rw-r--r-- | googleurl/url_canon.h | 912 | ||||
-rw-r--r-- | googleurl/url_canon_icu.h | 63 | ||||
-rw-r--r-- | googleurl/url_canon_internal.h | 462 | ||||
-rw-r--r-- | googleurl/url_canon_internal_file.h | 157 | ||||
-rw-r--r-- | googleurl/url_canon_ip.h | 101 | ||||
-rw-r--r-- | googleurl/url_canon_stdstring.h | 134 | ||||
-rw-r--r-- | googleurl/url_common.h | 54 | ||||
-rw-r--r-- | googleurl/url_file.h | 108 | ||||
-rw-r--r-- | googleurl/url_parse.h | 373 | ||||
-rw-r--r-- | googleurl/url_parse_internal.h | 112 | ||||
-rw-r--r-- | googleurl/url_test_utils.h | 78 | ||||
-rw-r--r-- | googleurl/url_util.h | 229 | ||||
-rw-r--r-- | googleurl/url_util_internal.h | 56 |
24 files changed, 5099 insertions, 0 deletions
diff --git a/googleurl/GNUmakefile b/googleurl/GNUmakefile new file mode 100644 index 0000000..0971a4a --- /dev/null +++ b/googleurl/GNUmakefile @@ -0,0 +1,50 @@ +TOPDIR = .. + +SUBDIRS = + +-include $(TOPDIR)/makefiles/gmake/platform.mk + +INCLUDE_CFLAGS = \ + +INCLUDE_LDFLAGS = \ + +INCLUDE_DIRS = \ + -I. + +INCLUDE_LIBS = \ + +CPP_OBJS = \ + url_canon_etc.o \ + url_canon_filesystemurl.o \ + url_canon_fileurl.o \ + url_canon_host.o \ + url_canon_icu.o \ + url_canon_internal.o \ + url_canon_ip.o \ + url_canon_mailtourl.o \ + url_canon_path.o \ + url_canon_pathurl.o \ + url_canon_query.o \ + url_canon_relative.o \ + url_canon_stdurl.o \ + url_parse.o \ + url_parse_file.o \ + url_util.o \ + gurl.o + +STATIC_LIB = \ + libgoogleurl.a + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: diff --git a/googleurl/LICENSE.txt b/googleurl/LICENSE.txt new file mode 100644 index 0000000..ac40837 --- /dev/null +++ b/googleurl/LICENSE.txt @@ -0,0 +1,65 @@ +Copyright 2007, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------------------------- + +The file url_parse.cc is based on nsURLParsers.cc from Mozilla. This file is +licensed separately as follows: + +The contents of this file are subject to the Mozilla Public License Version +1.1 (the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at +http://www.mozilla.org/MPL/ + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +for the specific language governing rights and limitations under the +License. + +The Original Code is mozilla.org code. + +The Initial Developer of the Original Code is +Netscape Communications Corporation. +Portions created by the Initial Developer are Copyright (C) 1998 +the Initial Developer. All Rights Reserved. 
+ +Contributor(s): + Darin Fisher (original author) + +Alternatively, the contents of this file may be used under the terms of +either the GNU General Public License Version 2 or later (the "GPL"), or +the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +in which case the provisions of the GPL or the LGPL are applicable instead +of those above. If you wish to allow use of your version of this file only +under the terms of either the GPL or the LGPL, and not to allow others to +use your version of this file under the terms of the MPL, indicate your +decision by deleting the provisions above and replace them with the notice +and other provisions required by the GPL or the LGPL. If you do not delete +the provisions above, a recipient may use your version of this file under +the terms of any one of the MPL, the GPL or the LGPL. diff --git a/googleurl/README.txt b/googleurl/README.txt new file mode 100644 index 0000000..d5f79a3 --- /dev/null +++ b/googleurl/README.txt @@ -0,0 +1,185 @@ + ============================== + The Google URL Parsing Library + ============================== + +This is the Google URL Parsing Library which parses and canonicalizes URLs. +Please see the LICENSE.txt file for licensing information. + +Features +======== + + * Easily embeddable: This library was written with a variety of client and + server programs in mind, so unlike most implementations of URL parsing + and canonicalization, it can be easily embedded. + + * Fast: hundreds of thousands of typical URLs can be parsed and + canonicalized per second on a modern CPU. It is much faster than, for + example, calling WinInet's corresponding functions. + + * Compatible: When possible, this library has strived for IE7 compatibility + for both general web compatibility, and so IE addons or other applications + that communicate with or embed IE will work properly. + + It supports Unix-style file URLs, as well as the more complex rules for + Windows file URLs. 
Note that total compatibility is not possible (for + example, IE6 and IE7 disagree about how to parse certain IP addresses), + and that this is more strict about certain illegal, rarely used, and + potentially dangerous constructs such as escaped control characters in + host names that IE will allow. It is typically a little less strict than + Firefox. + + +Example +======= + +An example implementation of a URL object that uses this library is provided +in src/gurl.*. This implementation uses the "application integration" layer +discussed below to interface with the low-level parsing and canonicalization +functions. + + +Building +======== + +The canonicalization files require ICU for some UTF-8 and UTF-16 conversion +macros. If your project does not use ICU, it should be straightforward to +factor out the macros and functions used in ICU; there are only a few well- +isolated things that are used. + +TODO(brettw) ADD INSTRUCTIONS FOR GETTING ICU HERE! + +logging.h and logging.cc are Windows-only because the corresponding Unix +logging system has many dependencies. This library uses few of the logging +macros, and a dummy header can easily be written that defines the +appropriate things for Unix. + + +Definitions +=========== + +"Standard URL": A URL with an "authority", which is a hostname and optionally + a port, username, and password. Most URLs are standard such as HTTP and FTP. + +"File URL": A URL that references a file on disk. There are special rules for + this type of URL. Note that it may have a hostname! "localhost" is allowed, + for example "file://localhost/foo" is the same as "file:///foo". + +"FileSystem URL": A URL referring to a file reached via the FileSystem API + described at http://www.w3.org/TR/file-system-api/. These are nested URLs, + with compound schemes of e.g. "filesystem:file:" or "filesystem:https:". + Parsed FileSystem URLs will have a nested inner_parsed() object containing + information about the inner URL. 
+ +"Path URL": This is everything else. There is no standard on how to treat these + URLs, or even what they are called. This library decomposes them into a + scheme and a path. The path is everything following the scheme. This type of + URL includes "javascript", "data", and even "mailto" (although "mailto" + might look like a standard scheme in some respects, it is not). + +Design +====== + +The library is divided into four layers. They are listed here from the lowest +to the highest; you can use any portion of the library as long as you embed the +layers below it. + +1. Parsing +---------- +At the lowest level is the parsing code. The files encompassing this are +url_parse.* and the main include file is src/url_parse.h. This code will, given +an input string, parse it into the most likely form of a URL. + +Parsing cannot fail and does no validation. The exception is the port number, +which it currently validates, but this is a bug. Given crazy input, the parser +will do its best to find the various URL components according to its rules (see +url_parse_unittest.cc for some examples). + +To use this, an application will typically use ExtractScheme to determine the +type of a given input URL, and then call one of the initialization functions: +"ParseStandardURL", "ParsePathURL", or "ParseFileURL". This will result in +a "Parsed" structure which identifies the substrings of each identified +component. + +2. Canonicalization +------------------- +At the next highest level is canonicalization. The files encompassing this are +url_canon.* and the main include file is src/url_canon.h. This code will +validate an already-parsed URL, and will convert it to a canonical form. For +example, this will convert host names to lowercase, convert IP addresses +into dotted-decimal notation, handle encoding issues, etc. + +This layer will always do its best to produce a reasonable output string, but +it may return that the string is invalid. 
For example, if there are invalid +characters in the host name, it will escape them or replace them with the +Unicode "invalid character" character, but will still report failure. This way, the program can +display error messages to the user with the output, log it, etc. and the +string will have some meaning. + +Canonicalized output is written to a CanonOutput object which is a simple +wrapper around an expanding buffer. An implementation called RawCanonOutput is +provided that writes to a raw buffer with a fixed amount statically allocated +(for performance). Applications using STL can use StdStringCanonOutput defined +in url_canon_stdstring.h which writes into a std::string. + +A normal application would call one of the four high-level functions +"CanonicalizeStandardURL", "CanonicalizeFileURL", "CanonicalizeFileSystemURL", +and "CanonicalizePathURL" depending on the type of URL in question. Lower-level +functions are also provided which will canonicalize individual parts of a URL +(for example, "CanonicalizeHost"). + +Part of this layer is the integration with the host system for IDN and encoding +conversion. An implementation that provides integration with the ICU +(http://www-306.ibm.com/software/globalization/icu/index.jsp) is provided in +src/url_canon_icu.cc. The embedder may wish to replace this file with +implementations of the functions for their own IDN library if they do not use +ICU. + +3. Application integration +-------------------------- +The canonicalization and parsing layers do not know anything about the URI +schemes supported by your application. The parsing and canonicalization +functions are very low-level, and you must call the correct function to do the +work (for example, "CanonicalizeFileURL"). + +The application integration in url_util.* provides wrappers around the +low-level parsing and canonicalization to call the correct versions for +different identified schemes. 
Embedders will want to modify this file if +necessary to suit the needs of their application. + +4. URL object +------------- +The highest level is the "URL" object that a C++ application would use to +encapsulate a URL. Embedders will typically want to provide their own URL +object that meets the requirements of their system. A reasonably complete +example implementation is provided in src/gurl.*. You may wish to use this +object, extend or modify it, or write your own. + +Whitespace +---------- +Sometimes, you may want to remove linefeeds and tabs from the content of a URL. +Some web pages, for example, expect that a URL spanning two lines should be +treated as one with the newline removed. Depending on the source of the URLs +you are canonicalizing, these newlines may or may not be trimmed off. + +If you want this behavior, call RemoveURLWhitespace before parsing. This will +remove CR, LF and TAB from the input. Note that it preserves spaces. On typical +URLs, this function produces a 10-15% speed reduction, so it is optional and +not done automatically. The example GURL object and the url_util wrapper do +this for you. + +Tests +===== + +There are a number of *_unittest.cc and *_perftest.cc files. These files are +not currently compilable as they rely on a not-included unit testing framework. +Tests are declared like this: + TEST(TestCaseName, TestName) { + ASSERT_TRUE(a); + EXPECT_EQ(a, b); + } +If you would like to compile them, it should be straightforward to define +the TEST macro (which would declare a function by combining the two arguments) +and the other macros whose behavior should be self-explanatory (EXPECT is like +an ASSERT, but does not stop the test; if you are doing this, you probably +don't care about this difference). Then you would define a .cc file that +calls all of these functions. 
diff --git a/googleurl/base/README.txt b/googleurl/base/README.txt new file mode 100644 index 0000000..311faa0 --- /dev/null +++ b/googleurl/base/README.txt @@ -0,0 +1,2 @@ +These files contain some shared code. You can define your own assertion macros +to eliminate the dependency on logging.h. diff --git a/googleurl/base/basictypes.h b/googleurl/base/basictypes.h new file mode 100644 index 0000000..b0c404d --- /dev/null +++ b/googleurl/base/basictypes.h @@ -0,0 +1,88 @@ +// Copyright 2001 - 2003 Google Inc. All Rights Reserved + +#ifndef BASE_BASICTYPES_H__ +#define BASE_BASICTYPES_H__ + +typedef unsigned char uint8; +typedef unsigned short uint16; +typedef unsigned int uint32; + +const uint8 kuint8max = (( uint8) 0xFF); +const uint32 kuint32max = ((uint32) 0xFFFFFFFF); + +// The arraysize(arr) macro returns the # of elements in an array arr. +// The expression is a compile-time constant, and therefore can be +// used in defining new arrays, for example. If you use arraysize on +// a pointer by mistake, you will get a compile-time error. +// +// One caveat is that arraysize() doesn't accept any array of an +// anonymous type or a type defined inside a function. In these rare +// cases, you have to use the unsafe ARRAYSIZE() macro below. This is +// due to a limitation in C++'s template system. The limitation might +// eventually be removed, but it hasn't happened yet. + +// This template function declaration is used in defining arraysize. +// Note that the function doesn't need an implementation, as we only +// use its type. +template <typename T, size_t N> +char (&ArraySizeHelper(T (&array)[N]))[N]; + +// That gcc wants both of these prototypes seems mysterious. VC, for +// its part, can't decide which to use (another mystery). Matching of +// template overloads: the final frontier. 
+#ifndef _MSC_VER +template <typename T, size_t N> +char (&ArraySizeHelper(const T (&array)[N]))[N]; +#endif + +#define arraysize(array) (sizeof(ArraySizeHelper(array))) + +// ARRAYSIZE performs essentially the same calculation as arraysize, +// but can be used on anonymous types or types defined inside +// functions. It's less safe than arraysize as it accepts some +// (although not all) pointers. Therefore, you should use arraysize +// whenever possible. +// +// The expression ARRAYSIZE(a) is a compile-time constant of type +// size_t. +// +// ARRAYSIZE catches a few type errors. If you see a compiler error +// +// "warning: division by zero in ..." +// +// when using ARRAYSIZE, you are (wrongfully) giving it a pointer. +// You should only use ARRAYSIZE on statically allocated arrays. +// +// The following comments are on the implementation details, and can +// be ignored by the users. +// +// ARRAYSIZE(arr) works by inspecting sizeof(arr) (the # of bytes in +// the array) and sizeof(*(arr)) (the # of bytes in one array +// element). If the former is divisible by the latter, perhaps arr is +// indeed an array, in which case the division result is the # of +// elements in the array. Otherwise, arr cannot possibly be an array, +// and we generate a compiler error to prevent the code from +// compiling. +// +// Since the size of bool is implementation-defined, we need to cast +// !(sizeof(a) & sizeof(*(a))) to size_t in order to ensure the final +// result has type size_t. +// +// This macro is not perfect as it wrongfully accepts certain +// pointers, namely where the pointer size is divisible by the pointee +// size. Since all our code has to go through a 32-bit compiler, +// where a pointer is 4 bytes, this means all pointers to a type whose +// size is 3 or greater than 4 will be (righteously) rejected. +// +// Starting with Visual C++ 2005, WinNT.h includes ARRAYSIZE. 
+#define ARRAYSIZE_UNSAFE(a) \ + ((sizeof(a) / sizeof(*(a))) / \ + static_cast<size_t>(!(sizeof(a) % sizeof(*(a))))) + +// A macro to disallow the evil copy constructor and operator= functions +// This should be used in the private: declarations for a class +#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \ + TypeName(const TypeName&); \ + void operator=(const TypeName&) + +#endif // BASE_BASICTYPES_H__ diff --git a/googleurl/base/logging.cc b/googleurl/base/logging.cc new file mode 100644 index 0000000..ab03150 --- /dev/null +++ b/googleurl/base/logging.cc @@ -0,0 +1,380 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include <ctime> +#include <iomanip> +#include <cstring> +#include <windows.h> +#include <tchar.h> +#include <algorithm> +#include "base/logging.h" + +namespace logging { + +const char* const log_severity_names[LOG_NUM_SEVERITIES] = { + "INFO", "WARNING", "ERROR", "FATAL" }; + +int min_log_level = 0; +LogLockingState lock_log_file = LOCK_LOG_FILE; +LoggingDestination logging_destination = LOG_ONLY_TO_FILE; + +const int kMaxFilteredLogLevel = LOG_WARNING; +char* log_filter_prefix = NULL; + +// which log file to use? This is initialized by InitLogging or +// will be lazily initialized to the default value when it is +// first needed. +TCHAR log_file_name[MAX_PATH] = { 0 }; + +// this file is lazily opened and the handle may be NULL +HANDLE log_file = NULL; + +// what should be prepended to each message? +bool log_process_id = false; +bool log_thread_id = false; +bool log_timestamp = true; +bool log_tickcount = false; + +// An assert handler override specified by the client to be called instead of +// the debug message dialog. +LogAssertHandlerFunction log_assert_handler = NULL; + +// The critical section is used if log file locking is false. It helps us +// avoid problems with multiple threads writing to the log file at the same +// time. +bool initialized_critical_section = false; +CRITICAL_SECTION log_critical_section; + +// When we don't use a critical section, we are using a global mutex. 
We +// need to do this because LockFileEx is not thread safe +HANDLE log_mutex = NULL; + +// Called by logging functions to ensure that debug_file is initialized +// and can be used for writing. Returns false if the file could not be +// initialized. debug_file will be NULL in this case. +bool InitializeLogFileHandle() { + if (log_file) + return true; + + if (!log_file_name[0]) { + // nobody has called InitLogging to specify a debug log file, so here we + // initialize the log file name to the default + GetModuleFileName(NULL, log_file_name, MAX_PATH); + TCHAR* last_backslash = _tcsrchr(log_file_name, '\\'); + if (last_backslash) + last_backslash[1] = 0; // name now ends with the backslash + _tcscat_s(log_file_name, _T("debug.log")); + } + + log_file = CreateFile(log_file_name, GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, + OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); + if (log_file == INVALID_HANDLE_VALUE || log_file == NULL) { + // try the current directory + log_file = CreateFile(_T(".\\debug.log"), GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, + OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); + if (log_file == INVALID_HANDLE_VALUE || log_file == NULL) { + log_file = NULL; + return false; + } + } + SetFilePointer(log_file, 0, 0, FILE_END); + return true; +} + +void InitLogMutex() { + if (!log_mutex) { + // \ is not a legal character in mutex names so we replace \ with / + std::wstring safe_name(log_file_name); + std::replace(safe_name.begin(), safe_name.end(), '\\', '/'); + std::wstring t(L"Global\\"); + t.append(safe_name); + log_mutex = ::CreateMutex(NULL, FALSE, t.c_str()); + } +} + +void InitLogging(const TCHAR* new_log_file, LoggingDestination logging_dest, + LogLockingState lock_log, OldFileDeletionState delete_old) { + if (log_file) { + // calling InitLogging twice or after some log call has already opened the + // default log file will re-initialize to the new options + CloseHandle(log_file); + log_file = NULL; + } + + lock_log_file = 
lock_log; + logging_destination = logging_dest; + + // ignore file options if logging is only to system + if (logging_destination == LOG_ONLY_TO_SYSTEM_DEBUG_LOG) + return; + + _tcscpy_s(log_file_name, MAX_PATH, new_log_file); + if (delete_old == DELETE_OLD_LOG_FILE) + DeleteFile(log_file_name); + + if (lock_log_file == LOCK_LOG_FILE) { + InitLogMutex(); + } else if (!initialized_critical_section) { + // initialize the critical section + InitializeCriticalSection(&log_critical_section); + initialized_critical_section = true; + } + + InitializeLogFileHandle(); +} + +void SetMinLogLevel(int level) { + min_log_level = level; +} + +void SetLogFilterPrefix(char* filter) { + if (log_filter_prefix) { + delete[] log_filter_prefix; + log_filter_prefix = NULL; + } + + if (filter) { + size_t size = strlen(filter)+1; + log_filter_prefix = new char[size]; + strcpy_s(log_filter_prefix, size, filter); + } +} + +void SetLogItems(bool enable_process_id, bool enable_thread_id, + bool enable_timestamp, bool enable_tickcount) { + log_process_id = enable_process_id; + log_thread_id = enable_thread_id; + log_timestamp = enable_timestamp; + log_tickcount = enable_tickcount; +} + +void SetLogAssertHandler(LogAssertHandlerFunction handler) { + log_assert_handler = handler; +} + +// Displays a message box to the user with the error message in it. For +// Windows programs, it's possible that the message loop is messed up on +// a fatal error, and creating a MessageBox will cause that message loop +// to be run. Instead, we try to spawn another process that displays its +// command line. We look for "Debug Message.exe" in the same directory as +// the application. If it exists, we use it, otherwise, we use a regular +// message box. 
+void DisplayDebugMessage(const std::string& str) { + if (str.empty()) + return; + + // look for the debug dialog program next to our application + wchar_t prog_name[MAX_PATH]; + GetModuleFileNameW(NULL, prog_name, MAX_PATH); + wchar_t* backslash = wcsrchr(prog_name, '\\'); + if (backslash) + backslash[1] = 0; + wcscat_s(prog_name, MAX_PATH, L"debug_message.exe"); + + // stupid CreateProcess requires a non-const command line and may modify it. + // We also want to use the wide string + int charcount = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, NULL, 0); + if (!charcount) + return; + scoped_array<wchar_t> cmdline(new wchar_t[charcount]); + if (!MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, cmdline.get(), charcount)) + return; + + STARTUPINFO startup_info; + memset(&startup_info, 0, sizeof(startup_info)); + startup_info.cb = sizeof(startup_info); + + PROCESS_INFORMATION process_info; + if (CreateProcessW(prog_name, cmdline.get(), NULL, NULL, false, 0, NULL, + NULL, &startup_info, &process_info)) { + WaitForSingleObject(process_info.hProcess, INFINITE); + CloseHandle(process_info.hThread); + CloseHandle(process_info.hProcess); + } else { + // debug process broken, let's just do a message box + MessageBoxW(NULL, cmdline.get(), L"Fatal error", MB_OK | MB_ICONHAND); + } +} + +LogMessage::LogMessage(const char* file, int line, LogSeverity severity, + int ctr) + : severity_(severity) { + Init(file, line); +} + +LogMessage::LogMessage(const char* file, int line, const CheckOpString& result) + : severity_(LOG_FATAL) { + Init(file, line); + stream_ << "Check failed: " << (*result.str_); +} + +LogMessage::LogMessage(const char* file, int line) + : severity_(LOG_INFO) { + Init(file, line); +} + +LogMessage::LogMessage(const char* file, int line, LogSeverity severity) + : severity_(severity) { + Init(file, line); +} + +// writes the common header info to the stream +void LogMessage::Init(const char* file, int line) { + // log only the filename + const char* last_slash = 
strrchr(file, '\\'); + if (last_slash) + file = last_slash + 1; + + stream_ << '['; + if (log_process_id) + stream_ << GetCurrentProcessId() << ':'; + if (log_thread_id) + stream_ << GetCurrentThreadId() << ':'; + if (log_timestamp) { + time_t t = time(NULL); + struct tm tm_time; + localtime_s(&tm_time, &t); + stream_ << std::setfill('0') + << std::setw(2) << 1 + tm_time.tm_mon + << std::setw(2) << tm_time.tm_mday + << '/' + << std::setw(2) << tm_time.tm_hour + << std::setw(2) << tm_time.tm_min + << std::setw(2) << tm_time.tm_sec + << ':'; + } + if (log_tickcount) + stream_ << GetTickCount() << ':'; + stream_ << log_severity_names[severity_] << ":" << file << "(" << line << ")] "; + + message_start_ = stream_.pcount(); +} + +LogMessage::~LogMessage() { + if (severity_ < min_log_level) + return; + + std::string str_newline(stream_.str(), stream_.pcount()); + str_newline.append("\r\n"); + + if (log_filter_prefix && severity_ <= kMaxFilteredLogLevel && + str_newline.compare(message_start_, strlen(log_filter_prefix), + log_filter_prefix) != 0) { + goto cleanup; + } + + if (logging_destination != LOG_ONLY_TO_FILE) + OutputDebugStringA(str_newline.c_str()); + + // write to log file + if (logging_destination != LOG_ONLY_TO_SYSTEM_DEBUG_LOG && + InitializeLogFileHandle()) { + // we can have multiple threads and/or processes, so try to prevent them from + // clobbering each other's writes + if (lock_log_file == LOCK_LOG_FILE) { + // Ensure that the mutex is initialized in case the client app did not + // call InitLogging. This is not thread safe. See below + InitLogMutex(); + + DWORD r = ::WaitForSingleObject(log_mutex, INFINITE); + DCHECK(r != WAIT_ABANDONED); + } else { + // use the critical section + if (!initialized_critical_section) { + // The client app did not call InitLogging, and so the critical section + // has not been created. 
We do this on demand, but if two threads try to + // do this at the same time, there will be a race condition to create + // the critical section. This is why InitLogging should be called from + // the main thread at the beginning of execution. + InitializeCriticalSection(&log_critical_section); + initialized_critical_section = true; + } + EnterCriticalSection(&log_critical_section); + } + + SetFilePointer(log_file, 0, 0, SEEK_END); + DWORD num_written; + WriteFile(log_file, (void*)str_newline.c_str(), (DWORD)str_newline.length(), &num_written, NULL); + + if (lock_log_file == LOCK_LOG_FILE) { + ReleaseMutex(log_mutex); + } else { + LeaveCriticalSection(&log_critical_section); + } + } + + if (severity_ == LOG_FATAL) { + // display a message or break into the debugger on a fatal error + if (::IsDebuggerPresent()) { + DebugBreak(); + } else { + if (log_assert_handler) { + log_assert_handler(std::string(stream_.str(), stream_.pcount())); + } else { + // don't use the string with the newline, get a fresh version to send to + // the debug message process + DisplayDebugMessage(std::string(stream_.str(), stream_.pcount())); + TerminateProcess(GetCurrentProcess(), 1); + } + } + } + +cleanup: + // Calling stream_.str() freezes the stream buffer. A frozen buffer will + // not be freed during strstreambuf destruction. 
+ stream_.freeze(false); +} + +void CloseLogFile() { + if (!log_file) + return; + + CloseHandle(log_file); + log_file = NULL; +} + +} // namespace logging + +std::ostream& operator<<(std::ostream& out, const wchar_t* wstr) { + if (!wstr || !wstr[0]) + return out; + + // compute the length of the buffer we'll need + int charcount = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, + NULL, 0, NULL, NULL); + if (charcount == 0) + return out; + + // convert + scoped_array<char> buf(new char[charcount]); + WideCharToMultiByte(CP_UTF8, 0, wstr, -1, buf.get(), charcount, NULL, NULL); + return out << buf.get(); +} diff --git a/googleurl/base/logging.h b/googleurl/base/logging.h new file mode 100644 index 0000000..0a69613 --- /dev/null +++ b/googleurl/base/logging.h @@ -0,0 +1,489 @@ +// Copyright 2006 Google Inc. All Rights Reserved. +// Author: brettw (Brett Wilson) + +#ifndef BASE_LOGGING_H__ +#define BASE_LOGGING_H__ + +#include <string> +#include <cstring> +#include <sstream> +#ifdef _WIN32 +#include <tchar.h> +#endif + +#include "base/basictypes.h" +#include "base/scoped_ptr.h" + +// Optional message capabilities +// ----------------------------- +// Assertion failed messages and fatal errors are displayed in a dialog box +// before the application exits. However, running this UI creates a message +// loop, which causes application messages to be processed and potentially +// dispatched to existing application windows. Since the application is in a +// bad state when this assertion dialog is displayed, these messages may not +// get processed and hang the dialog, or the application might go crazy. +// +// Therefore, it can be beneficial to display the error dialog in a separate +// process from the main application. When the logging system needs to display +// a fatal error dialog box, it will look for a program called +// "DebugMessage.exe" in the same directory as the application executable. 
It +// will run this application with the message as the command line, and will +// not include the name of the application as is traditional for easier +// parsing. +// +// The code for DebugMessage.exe is only one line. In WinMain, do: +// MessageBox(NULL, GetCommandLineW(), L"Fatal Error", 0); +// +// If DebugMessage.exe is not found, the logging code will use a normal +// MessageBox, potentially causing the problems discussed above. + + +// Instructions +// ------------ +// +// Make a bunch of macros for logging. The way to log things is to stream +// things to LOG(<a particular severity level>). E.g., +// +// LOG(INFO) << "Found " << num_cookies << " cookies"; +// +// You can also do conditional logging: +// +// LOG_IF(INFO, num_cookies > 10) << "Got lots of cookies"; +// +// The above will cause the message to be output only on those executions +// where the condition (num_cookies > 10) holds; otherwise the statement does +// nothing. +// +// There are also "debug mode" logging macros like the ones above: +// +// DLOG(INFO) << "Found cookies"; +// +// DLOG_IF(INFO, num_cookies > 10) << "Got lots of cookies"; +// +// All "debug mode" logging is compiled away to nothing for non-debug mode +// compiles. LOG_IF and development flags also work well together +// because the code can be compiled away sometimes. +// +// We also have +// +// LOG_ASSERT(assertion); +// DLOG_ASSERT(assertion); +// +// which is syntactic sugar for {,D}LOG_IF(FATAL, assert fails) << assertion; +// +// We also override the standard 'assert' to use 'DLOG_ASSERT'. +// +// The supported severity levels for macros that allow you to specify one +// are (in increasing order of severity) INFO, WARNING, ERROR, and FATAL. +// +// There is also the special severity of DFATAL, which logs FATAL in +// debug mode, ERROR in normal mode. +// +// Very important: logging a message at the FATAL severity level causes +// the program to terminate (after the message is logged). 
+ +namespace logging { + +// Where to record logging output? A flat file and/or system debug log via +// OutputDebugString. Defaults to LOG_ONLY_TO_FILE. +enum LoggingDestination { LOG_ONLY_TO_FILE, + LOG_ONLY_TO_SYSTEM_DEBUG_LOG, + LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG }; + +// Indicates that the log file should be locked when being written to. +// Often, there is no locking, which is fine for a single threaded program. +// If logging is being done from multiple threads or there can be more than +// one process doing the logging, the file should be locked during writes to +// make each log output atomic. Other writers will block. +// +// All processes writing to the log file must have their locking set for it to +// work properly. Defaults to DONT_LOCK_LOG_FILE. +enum LogLockingState { LOCK_LOG_FILE, DONT_LOCK_LOG_FILE }; + +// On startup, should we delete or append to an existing log file (if any)? +// Defaults to APPEND_TO_OLD_LOG_FILE. +enum OldFileDeletionState { DELETE_OLD_LOG_FILE, APPEND_TO_OLD_LOG_FILE }; + +// Sets the log file name and other global logging state. Calling this function +// is recommended, and is normally done at the beginning of application init. +// If you don't call it, all the flags will be initialized to their default +// values, and there is a race condition that may leak a critical section +// object if two threads try to do the first log at the same time. +// See the definition of the enums above for descriptions and default values. +// +// The default log file is initialized to "debug.log" in the application +// directory. You probably don't want this, especially since the program +// directory may not be writable on an enduser's system. 
+#ifdef _WIN32 +void InitLogging(const TCHAR* log_file, LoggingDestination logging_dest, + LogLockingState lock_log, OldFileDeletionState delete_old); +#else +void InitLogging(const char* log_file, LoggingDestination logging_dest, + LogLockingState lock_log, OldFileDeletionState delete_old); +#endif + +// Sets the log level. Anything at or above this level will be written to the +// log file/displayed to the user (if applicable). Anything below this level +// will be silently ignored. The log level defaults to 0 (everything is logged) +// if this function is not called. +void SetMinLogLevel(int level); + +// Sets the log filter prefix. Any log message below LOG_ERROR severity that +// doesn't start with this prefix will be silently ignored. The filter defaults +// to NULL (everything is logged) if this function is not called. Messages +// with severity of LOG_ERROR or higher will not be filtered. +void SetLogFilterPrefix(char* filter); + +// Sets the common items you want to be prepended to each log message. +// process and thread IDs default to off, the timestamp defaults to on. +// If this function is not called, logging defaults to writing the timestamp +// only. +void SetLogItems(bool enable_process_id, bool enable_thread_id, + bool enable_timestamp, bool enable_tickcount); + +// Sets the Log Assert Handler that will be used to notify of check failures. +// The default handler shows a dialog box, however clients can use this +// function to override with their own handling (e.g. 
a silent one for Unit +// Tests) +typedef void (*LogAssertHandlerFunction)(const std::string& str); +void SetLogAssertHandler(LogAssertHandlerFunction handler); + +typedef int LogSeverity; +const LogSeverity LOG_INFO = 0; +const LogSeverity LOG_WARNING = 1; +const LogSeverity LOG_ERROR = 2; +const LogSeverity LOG_FATAL = 3; +const LogSeverity LOG_NUM_SEVERITIES = 4; + +// LOG_DFATAL_LEVEL is LOG_FATAL in debug mode, ERROR in normal mode +#ifdef NDEBUG +const LogSeverity LOG_DFATAL_LEVEL = LOG_ERROR; +#else +const LogSeverity LOG_DFATAL_LEVEL = LOG_FATAL; +#endif + +// A few definitions of macros that don't generate much code. These are used +// by LOG() and LOG_IF, etc. Since these are used all over our code, it's +// better to have compact code for these operations. +#define COMPACT_GOOGLE_LOG_INFO \ + logging::LogMessage(__FILE__, __LINE__) +#define COMPACT_GOOGLE_LOG_WARNING \ + logging::LogMessage(__FILE__, __LINE__, logging::LOG_WARNING) +#define COMPACT_GOOGLE_LOG_ERROR \ + logging::LogMessage(__FILE__, __LINE__, logging::LOG_ERROR) +#define COMPACT_GOOGLE_LOG_FATAL \ + logging::LogMessage(__FILE__, __LINE__, logging::LOG_FATAL) +#define COMPACT_GOOGLE_LOG_DFATAL \ + logging::LogMessage(__FILE__, __LINE__, logging::LOG_DFATAL_LEVEL) + +// wingdi.h defines ERROR to be 0. When we call LOG(ERROR), it gets +// substituted with 0, and it expands to COMPACT_GOOGLE_LOG_0. To allow us +// to keep using this syntax, we define this macro to do the same thing +// as COMPACT_GOOGLE_LOG_ERROR, and also define ERROR the same way that +// the Windows SDK does for consistency. +#define ERROR 0 +#define COMPACT_GOOGLE_LOG_0 \ + logging::LogMessage(__FILE__, __LINE__, logging::LOG_ERROR) + +// We use the preprocessor's merging operator, "##", so that, e.g., +// LOG(INFO) becomes the token COMPACT_GOOGLE_LOG_INFO. 
There's some funny +// subtle difference between ostream member streaming functions (e.g., +// ostream::operator<<(int) and ostream non-member streaming functions +// (e.g., ::operator<<(ostream&, string&): it turns out that it's +// impossible to stream something like a string directly to an unnamed +// ostream. We employ a neat hack by calling the stream() member +// function of LogMessage which seems to avoid the problem. + +#define LOG(severity) COMPACT_GOOGLE_LOG_ ## severity.stream() +#define SYSLOG(severity) LOG(severity) + +#define LOG_IF(severity, condition) \ + !(condition) ? (void) 0 : logging::LogMessageVoidify() & LOG(severity) +#define SYSLOG_IF(severity, condition) LOG_IF(severity, condition) + +#define LOG_ASSERT(condition) \ + LOG_IF(FATAL, !(condition)) << "Assert failed: " #condition ". " +#define SYSLOG_ASSERT(condition) \ + SYSLOG_IF(FATAL, !(condition)) << "Assert failed: " #condition ". " + +// A container for a string pointer which can be evaluated to a bool - +// true iff the pointer is non-NULL. +struct CheckOpString { + CheckOpString(std::string* str) : str_(str) { } + // No destructor: if str_ is non-NULL, we're about to LOG(FATAL), + // so there's no point in cleaning up str_. + operator bool() const { return str_ != NULL; } + std::string* str_; +}; + +// Build the error message string. This is separate from the "Impl" +// function template because it is not performance critical and so can +// be out of line, while the "Impl" code should be inline. +template<class t1, class t2> +std::string* MakeCheckOpString(const t1& v1, const t2& v2, const char* names) { + std::ostringstream ss; + ss << names << " (" << v1 << " vs. 
" << v2 << ")"; + return new std::string(ss.str()); +} + +std::string* MakeCheckOpStringIntInt(int v1, int v2, const char* names); + +template<int, int> +std::string* MakeCheckOpString(const int& v1, const int& v2, const char* names) { + return MakeCheckOpStringIntInt(v1, v2, names); +} + +// Plus some debug-logging macros that get compiled to nothing for production +// +// DEBUG_MODE is for uses like +// if (DEBUG_MODE) foo.CheckThatFoo(); +// instead of +// #ifndef NDEBUG +// foo.CheckThatFoo(); +// #endif + +#ifndef NDEBUG + +#define DLOG(severity) LOG(severity) +#define DLOG_IF(severity, condition) LOG_IF(severity, condition) +#define DLOG_ASSERT(condition) LOG_ASSERT(condition) + +// debug-only checking. not executed in NDEBUG mode. +enum { DEBUG_MODE = 1 }; +#define DCHECK(condition) \ + LOG_IF(FATAL, !(condition)) << "Check failed: " #condition ". " + +// Helper functions for DCHECK_OP macro. +// The (int, int) specialization works around the issue that the compiler +// will not instantiate the template version of the function on values of +// unnamed enum type - see comment below. +#define DEFINE_DCHECK_OP_IMPL(name, op) \ + template <class t1, class t2> \ + inline std::string* Check##name##Impl(const t1& v1, const t2& v2, \ + const char* names) { \ + if (v1 op v2) return NULL; \ + else return MakeCheckOpString(v1, v2, names); \ + } \ + inline std::string* Check##name##Impl(int v1, int v2, const char* names) { \ + if (v1 op v2) return NULL; \ + else return MakeCheckOpString(v1, v2, names); \ + } +DEFINE_DCHECK_OP_IMPL(EQ, ==) +DEFINE_DCHECK_OP_IMPL(NE, !=) +DEFINE_DCHECK_OP_IMPL(LE, <=) +DEFINE_DCHECK_OP_IMPL(LT, < ) +DEFINE_DCHECK_OP_IMPL(GE, >=) +DEFINE_DCHECK_OP_IMPL(GT, > ) +#undef DEFINE_DCHECK_OP_IMPL + +// Helper macro for binary operators. +// Don't use this macro directly in your code, use CHECK_EQ et al below. 
+#define DCHECK_OP(name, op, val1, val2) \ + while (logging::CheckOpString _result = \ + logging::Check##name##Impl((val1), (val2), #val1 " " #op " " #val2)) \ + logging::LogMessage(__FILE__, __LINE__, _result).stream() + +// Equality/Inequality checks - compare two values, and log a LOG_FATAL message +// including the two values when the result is not as expected. The values +// must have operator<<(ostream, ...) defined. +// +// You may append to the error message like so: +// CHECK_NE(1, 2) << ": The world must be ending!"; +// +// We are very careful to ensure that each argument is evaluated exactly +// once, and that anything which is legal to pass as a function argument is +// legal here. In particular, the arguments may be temporary expressions +// which will end up being destroyed at the end of the apparent statement, +// for example: +// CHECK_EQ(string("abc")[1], 'b'); +// +// WARNING: These don't compile correctly if one of the arguments is a pointer +// and the other is NULL. To work around this, simply static_cast NULL to the +// type of the desired pointer. + +#define DCHECK_EQ(val1, val2) DCHECK_OP(EQ, ==, val1, val2) +#define DCHECK_NE(val1, val2) DCHECK_OP(NE, !=, val1, val2) +#define DCHECK_LE(val1, val2) DCHECK_OP(LE, <=, val1, val2) +#define DCHECK_LT(val1, val2) DCHECK_OP(LT, < , val1, val2) +#define DCHECK_GE(val1, val2) DCHECK_OP(GE, >=, val1, val2) +#define DCHECK_GT(val1, val2) DCHECK_OP(GT, > , val1, val2) + +// Helper functions for string comparisons. +// To avoid bloat, the definitions are in logging.cc. +#define DECLARE_DCHECK_STROP_IMPL(func, expected) \ + std::string* Check##func##expected##Impl(const char* s1, \ + const char* s2, \ + const char* names); +DECLARE_DCHECK_STROP_IMPL(strcmp, true) +DECLARE_DCHECK_STROP_IMPL(strcmp, false) +DECLARE_DCHECK_STROP_IMPL(_stricmp, true) +DECLARE_DCHECK_STROP_IMPL(_stricmp, false) +#undef DECLARE_DCHECK_STROP_IMPL + +// Helper macro for string comparisons. 
+// Don't use this macro directly in your code, use CHECK_STREQ et al below. +#define DCHECK_STROP(func, op, expected, s1, s2) \ + while (CheckOpString _result = \ + logging::Check##func##expected##Impl((s1), (s2), \ + #s1 " " #op " " #s2)) \ + LOG(FATAL) << *_result.str_ + +// String (char*) equality/inequality checks. +// CASE versions are case-insensitive. +// +// Note that "s1" and "s2" may be temporary strings which are destroyed +// by the compiler at the end of the current "full expression" +// (e.g. DCHECK_STREQ(Foo().c_str(), Bar().c_str())). + +#define DCHECK_STREQ(s1, s2) DCHECK_STROP(strcmp, ==, true, s1, s2) +#define DCHECK_STRNE(s1, s2) DCHECK_STROP(strcmp, !=, false, s1, s2) +#define DCHECK_STRCASEEQ(s1, s2) DCHECK_STROP(_stricmp, ==, true, s1, s2) +#define DCHECK_STRCASENE(s1, s2) DCHECK_STROP(_stricmp, !=, false, s1, s2) + +#define DCHECK_INDEX(I,A) DCHECK(I < (sizeof(A)/sizeof(A[0]))) +#define DCHECK_BOUND(B,A) DCHECK(B <= (sizeof(A)/sizeof(A[0]))) + +#else // NDEBUG + +#define DLOG(severity) \ + true ? (void) 0 : logging::LogMessageVoidify() & LOG(severity) + +#define DLOG_IF(severity, condition) \ + true ? (void) 0 : logging::LogMessageVoidify() & LOG(severity) + +#define DLOG_ASSERT(condition) \ + true ? (void) 0 : LOG_ASSERT(condition) + +enum { DEBUG_MODE = 0 }; + +// This macro can be followed by a sequence of stream parameters in +// non-debug mode. The DCHECK and friends macros use this so that +// the expanded expression DCHECK(foo) << "asdf" is still syntactically +// valid, even though the expression will get optimized away. 
+#define NDEBUG_EAT_STREAM_PARAMETERS \ + logging::LogMessage(__FILE__, __LINE__).stream() + +#define DCHECK(condition) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_EQ(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_NE(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_LE(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_LT(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_GE(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_GT(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_STREQ(str1, str2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_STRCASEEQ(str1, str2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_STRNE(str1, str2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_STRCASENE(str1, str2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#endif // NDEBUG + +#define NOTREACHED() DCHECK(false) + +// Redefine the standard assert to use our nice log files +#undef assert +#define assert(x) DLOG_ASSERT(x) + +// This class more or less represents a particular log message. You +// create an instance of LogMessage and then stream stuff to it. +// When you finish streaming to it, ~LogMessage is called and the +// full message gets streamed to the appropriate destination. +// +// You shouldn't actually use LogMessage's constructor to log things, +// though. You should use the LOG() macro (and variants thereof) +// above. +class LogMessage { + public: + LogMessage(const char* file, int line, LogSeverity severity, int ctr); + + // Two special constructors that generate reduced amounts of code at + // LOG call sites for common cases. + // + // Used for LOG(INFO): Implied are: + // severity = LOG_INFO, ctr = 0 + // + // Using this constructor instead of the more complex constructor above + // saves a couple of bytes per call site. 
+ LogMessage(const char* file, int line); + + // Used for LOG(severity) where severity != INFO. Implied + // are: ctr = 0 + // + // Using this constructor instead of the more complex constructor above + // saves a couple of bytes per call site. + LogMessage(const char* file, int line, LogSeverity severity); + + // A special constructor used for check failures. + // Implied severity = LOG_FATAL + LogMessage(const char* file, int line, const CheckOpString& result); + + ~LogMessage(); + + std::ostream& stream() { return stream_; } + + private: + void Init(const char* file, int line); + + LogSeverity severity_; + std::ostringstream stream_; + int message_start_; // offset of the start of the message (past prefix info). + + DISALLOW_EVIL_CONSTRUCTORS(LogMessage); +}; + +// A non-macro interface to the log facility; (useful +// when the logging level is not a compile-time constant). +inline void LogAtLevel(int const log_level, std::string const &msg) { + LogMessage(__FILE__, __LINE__, log_level).stream() << msg; +} + +// This class is used to explicitly ignore values in the conditional +// logging macros. This avoids compiler warnings like "value computed +// is not used" and "statement has no effect". +class LogMessageVoidify { + public: + LogMessageVoidify() { } + // This has to be an operator with a precedence lower than << but + // higher than ?: + void operator&(std::ostream&) { } +}; + +// Closes the log file explicitly if open. +// NOTE: Since the log file is opened as necessary by the action of logging +// statements, there's no guarantee that it will stay closed +// after this call. +void CloseLogFile(); + +} // namespace logging + +// These functions are provided as a convenience for logging, which is where we +// use streams (it is against Google style to use streams in other places). It +// is designed to allow you to emit non-ASCII Unicode strings to the log file, +// which is normally ASCII. It is relatively slow, so try not to use it for +// common cases. 
Non-ASCII characters will be converted to UTF-8 by these operators. +std::ostream& operator<<(std::ostream& out, const wchar_t* wstr); +inline std::ostream& operator<<(std::ostream& out, const std::wstring& wstr) { + return out << wstr.c_str(); +} + +#endif // BASE_LOGGING_H__ diff --git a/googleurl/base/scoped_ptr.h b/googleurl/base/scoped_ptr.h new file mode 100644 index 0000000..de0b388 --- /dev/null +++ b/googleurl/base/scoped_ptr.h @@ -0,0 +1,322 @@ +#ifndef BASE_SCOPED_PTR_H +#define BASE_SCOPED_PTR_H + +// (C) Copyright Greg Colvin and Beman Dawes 1998, 1999. +// Copyright (c) 2001, 2002 Peter Dimov +// +// Permission to copy, use, modify, sell and distribute this software +// is granted provided this copyright notice appears in all copies. +// This software is provided "as is" without express or implied +// warranty, and with no claim as to its suitability for any purpose. +// +// See http://www.boost.org/libs/smart_ptr/scoped_ptr.htm for documentation. +// + +// scoped_ptr mimics a built-in pointer except that it guarantees deletion +// of the object pointed to, either on destruction of the scoped_ptr or via +// an explicit reset(). scoped_ptr is a simple solution for simple needs; +// use shared_ptr or std::auto_ptr if your needs are more complex. + +// *** NOTE *** +// If your scoped_ptr is a class member of class FOO pointing to a +// forward declared type BAR (as shown below), then you MUST use a non-inlined +// version of the destructor. The destructor of a scoped_ptr (called from +// FOO's destructor) must have a complete definition of BAR in order to +// destroy it. Example: +// +// -- foo.h -- +// class BAR; +// +// class FOO { +// public: +// FOO(); +// ~FOO(); // Required for sources that instantiate class FOO to compile! +// +// private: +// scoped_ptr<BAR> bar_; +// }; +// +// -- foo.cc -- +// #include "foo.h" +// FOO::~FOO() {} // Empty, but must be non-inlined to FOO's class definition. 
+ +#include <cstddef> // for std::ptrdiff_t +#include <assert.h> // for assert +#include <stdlib.h> // for free() decl + +template <typename T> +class scoped_ptr { + private: + + T* ptr; + + scoped_ptr(scoped_ptr const &); + scoped_ptr & operator=(scoped_ptr const &); + + public: + + typedef T element_type; + + explicit scoped_ptr(T* p = 0): ptr(p) {} + + ~scoped_ptr() { + typedef char type_must_be_complete[sizeof(T)]; + delete ptr; + } + + void reset(T* p = 0) { + typedef char type_must_be_complete[sizeof(T)]; + + if (ptr != p) { + delete ptr; + ptr = p; + } + } + + T& operator*() const { + assert(ptr != 0); + return *ptr; + } + + T* operator->() const { + assert(ptr != 0); + return ptr; + } + + bool operator==(T* p) const { + return ptr == p; + } + + bool operator!=(T* p) const { + return ptr != p; + } + + T* get() const { + return ptr; + } + + void swap(scoped_ptr & b) { + T* tmp = b.ptr; + b.ptr = ptr; + ptr = tmp; + } + + T* release() { + T* tmp = ptr; + ptr = 0; + return tmp; + } + + private: + + // no reason to use these: each scoped_ptr should have its own object + template <typename U> bool operator==(scoped_ptr<U> const& p) const; + template <typename U> bool operator!=(scoped_ptr<U> const& p) const; +}; + +template<typename T> inline +void swap(scoped_ptr<T>& a, scoped_ptr<T>& b) { + a.swap(b); +} + +template<typename T> inline +bool operator==(T* p, const scoped_ptr<T>& b) { + return p == b.get(); +} + +template<typename T> inline +bool operator!=(T* p, const scoped_ptr<T>& b) { + return p != b.get(); +} + +// scoped_array extends scoped_ptr to arrays. Deletion of the array pointed to +// is guaranteed, either on destruction of the scoped_array or via an explicit +// reset(). Use shared_array or std::vector if your needs are more complex. 

// Sole owner of an array allocated with `new[]`. The array is released with
// `delete[]` when the scoped_array is destroyed or when reset() installs a
// different pointer. Copying is disabled.
template<typename T>
class scoped_array {
 private:

  T* ptr;

  scoped_array(scoped_array const &);
  scoped_array & operator=(scoped_array const &);

 public:

  typedef T element_type;

  // Takes ownership of |p| (which may be null).
  explicit scoped_array(T* p = 0) : ptr(p) {}

  ~scoped_array() {
    // Completeness guard in the form used by boost::checked_array_delete: the
    // array bound becomes -1 if sizeof(T) evaluates to 0, and the sizeof
    // expression references the typedef so it does not trip
    // unused-local-typedef warnings.
    typedef char type_must_be_complete[sizeof(T) ? 1 : -1];
    (void) sizeof(type_must_be_complete);
    delete[] ptr;
  }

  // Deletes the currently owned array and takes ownership of |p|.
  // Resetting to the pointer already held is a no-op.
  void reset(T* p = 0) {
    typedef char type_must_be_complete[sizeof(T) ? 1 : -1];
    (void) sizeof(type_must_be_complete);

    if (ptr != p) {
      delete [] ptr;
      ptr = p;
    }
  }

  // Element access; asserts that an array is held and that |i| is not
  // negative. No upper-bound check is possible because the length is unknown.
  T& operator[](std::ptrdiff_t i) const {
    assert(ptr != 0);
    assert(i >= 0);
    return ptr[i];
  }

  // Comparison against a raw pointer.
  bool operator==(T* p) const {
    return ptr == p;
  }

  bool operator!=(T* p) const {
    return ptr != p;
  }

  // Returns the held pointer without giving up ownership.
  T* get() const {
    return ptr;
  }

  // Exchanges the held pointers of *this and |b|.
  void swap(scoped_array & b) {
    T* tmp = b.ptr;
    b.ptr = ptr;
    ptr = tmp;
  }

  // Gives up ownership: returns the held pointer and leaves *this null.
  // The caller becomes responsible for delete[]-ing the array.
  T* release() {
    T* tmp = ptr;
    ptr = 0;
    return tmp;
  }

 private:

  // no reason to use these: each scoped_array should have its own object
  template <typename U> bool operator==(scoped_array<U> const& p) const;
  template <typename U> bool operator!=(scoped_array<U> const& p) const;
};

template<class T> inline
void swap(::scoped_array<T>& a, ::scoped_array<T>& b) {
  a.swap(b);
}

template<typename T> inline
bool operator==(T* p, const ::scoped_array<T>& b) {
  return p == b.get();
}

template<typename T> inline
bool operator!=(T* p, const ::scoped_array<T>& b) {
  return p != b.get();
}


// This class wraps the c library function free() in a class that can be
// passed as a template argument to scoped_ptr_malloc below.
class ScopedPtrMallocFree {
 public:
  inline void operator()(void* x) const {
    free(x);
  }
};

// scoped_ptr_malloc<> is similar to scoped_ptr<>, but it accepts a
// second template argument, the functor used to free the object.
+ +template<typename T, typename FreeProc = ScopedPtrMallocFree> +class scoped_ptr_malloc { + private: + + T* ptr; + + scoped_ptr_malloc(scoped_ptr_malloc const &); + scoped_ptr_malloc & operator=(scoped_ptr_malloc const &); + + public: + + typedef T element_type; + + explicit scoped_ptr_malloc(T* p = 0): ptr(p) {} + + ~scoped_ptr_malloc() { + typedef char type_must_be_complete[sizeof(T)]; + free_((void*) ptr); + } + + void reset(T* p = 0) { + typedef char type_must_be_complete[sizeof(T)]; + + if (ptr != p) { + free_((void*) ptr); + ptr = p; + } + } + + T& operator*() const { + assert(ptr != 0); + return *ptr; + } + + T* operator->() const { + assert(ptr != 0); + return ptr; + } + + bool operator==(T* p) const { + return ptr == p; + } + + bool operator!=(T* p) const { + return ptr != p; + } + + T* get() const { + return ptr; + } + + void swap(scoped_ptr_malloc & b) { + T* tmp = b.ptr; + b.ptr = ptr; + ptr = tmp; + } + + T* release() { + T* tmp = ptr; + ptr = 0; + return tmp; + } + + private: + + // no reason to use these: each scoped_ptr_malloc should have its own object + template <typename U, typename GP> + bool operator==(scoped_ptr_malloc<U, GP> const& p) const; + template <typename U, typename GP> + bool operator!=(scoped_ptr_malloc<U, GP> const& p) const; + + static FreeProc const free_; +}; + +template<typename T, typename FP> +FP const scoped_ptr_malloc<T,FP>::free_ = FP(); + +template<typename T, typename FP> inline +void swap(scoped_ptr_malloc<T,FP>& a, scoped_ptr_malloc<T,FP>& b) { + a.swap(b); +} + +template<typename T, typename FP> inline +bool operator==(T* p, const scoped_ptr_malloc<T,FP>& b) { + return p == b.get(); +} + +template<typename T, typename FP> inline +bool operator!=(T* p, const scoped_ptr_malloc<T,FP>& b) { + return p != b.get(); +} + +#endif // #ifndef BASE_SCOPED_PTR_H diff --git a/googleurl/base/string16.cc b/googleurl/base/string16.cc new file mode 100644 index 0000000..fc25809 --- /dev/null +++ b/googleurl/base/string16.cc @@ -0,0 
+1,94 @@ +// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "base/string16.h" + +#ifdef WIN32 + +#error This file should not be used on 2-byte wchar_t systems +// If this winds up being needed on 2-byte wchar_t systems, either the +// definitions below can be used, or the host system's wide character +// functions like wmemcmp can be wrapped. 
+ +#else // !WIN32 + +namespace base { + +int c16memcmp(const char16* s1, const char16* s2, size_t n) { + // We cannot call memcmp because that changes the semantics. + while (n-- > 0) { + if (*s1 != *s2) { + // We cannot use (*s1 - *s2) because char16 is unsigned. + return ((*s1 < *s2) ? -1 : 1); + } + ++s1; + ++s2; + } + return 0; +} + +size_t c16len(const char16* s) { + const char16 *s_orig = s; + while (*s) { + ++s; + } + return s - s_orig; +} + +const char16* c16memchr(const char16* s, char16 c, size_t n) { + while (n-- > 0) { + if (*s == c) { + return s; + } + ++s; + } + return 0; +} + +char16* c16memmove(char16* s1, const char16* s2, size_t n) { + return reinterpret_cast<char16*>(memmove(s1, s2, n * sizeof(char16))); +} + +char16* c16memcpy(char16* s1, const char16* s2, size_t n) { + return reinterpret_cast<char16*>(memcpy(s1, s2, n * sizeof(char16))); +} + +char16* c16memset(char16* s, char16 c, size_t n) { + char16 *s_orig = s; + while (n-- > 0) { + *s = c; + ++s; + } + return s_orig; +} + +} // namespace base + +template class std::basic_string<char16, base::string16_char_traits>; + +#endif // WIN32 diff --git a/googleurl/base/string16.h b/googleurl/base/string16.h new file mode 100644 index 0000000..ed77165 --- /dev/null +++ b/googleurl/base/string16.h @@ -0,0 +1,193 @@ +// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef BASE_STRING16_H_ +#define BASE_STRING16_H_ + +// WHAT: +// A version of std::basic_string that provides 2-byte characters even when +// wchar_t is not implemented as a 2-byte type. You can access this class as +// string16. We also define char16, which string16 is based upon. +// +// WHY: +// On Windows, wchar_t is 2 bytes, and it can conveniently handle UTF-16/UCS-2 +// data. Plenty of existing code operates on strings encoded as UTF-16. +// +// On many other platforms, sizeof(wchar_t) is 4 bytes by default. We can make +// it 2 bytes by using the GCC flag -fshort-wchar. But then std::wstring fails +// at run time, because it calls some functions (like wcslen) that come from +// the system's native C library -- which was built with a 4-byte wchar_t! +// It's wasteful to use 4-byte wchar_t strings to carry UTF-16 data, and it's +// entirely improper on those systems where the encoding of wchar_t is defined +// as UTF-32. 
+// +// Here, we define string16, which is similar to std::wstring but replaces all +// libc functions with custom, 2-byte-char compatible routines. It is capable +// of carrying UTF-16-encoded data. + +#include <string> +#include <cstdio> + +#include "base/basictypes.h" + +#ifdef WIN32 + +typedef wchar_t char16; +typedef std::wstring string16; + +#else // !WIN32 + +typedef uint16 char16; + +namespace base { + +// char16 versions of the functions required by string16_char_traits; these +// are based on the wide character functions of similar names ("w" or "wcs" +// instead of "c16"). +int c16memcmp(const char16* s1, const char16* s2, size_t n); +size_t c16len(const char16* s); +const char16* c16memchr(const char16* s, char16 c, size_t n); +char16* c16memmove(char16* s1, const char16* s2, size_t n); +char16* c16memcpy(char16* s1, const char16* s2, size_t n); +char16* c16memset(char16* s, char16 c, size_t n); + +struct string16_char_traits { + typedef char16 char_type; + typedef int int_type; + + typedef std::streamoff off_type; + typedef mbstate_t state_type; + typedef std::fpos<state_type> pos_type; + + static void assign(char_type& c1, const char_type& c2) { + c1 = c2; + } + + static bool eq(const char_type& c1, const char_type& c2) { + return c1 == c2; + } + static bool lt(const char_type& c1, const char_type& c2) { + return c1 < c2; + } + + static int compare(const char_type* s1, const char_type* s2, size_t n) { + return c16memcmp(s1, s2, n); + } + + static size_t length(const char_type* s) { + return c16len(s); + } + + static const char_type* find(const char_type* s, size_t n, + const char_type& a) { + return c16memchr(s, a, n); + } + + static char_type* move(char_type* s1, const char_type* s2, int_type n) { + return c16memmove(s1, s2, n); + } + + static char_type* copy(char_type* s1, const char_type* s2, size_t n) { + return c16memcpy(s1, s2, n); + } + + static char_type* assign(char_type* s, size_t n, char_type a) { + return c16memset(s, a, n); + } + + static 
int_type not_eof(const int_type& c) { + return eq_int_type(c, eof()) ? 0 : c; + } + + static char_type to_char_type(const int_type& c) { + return char_type(c); + } + + static int_type to_int_type(const char_type& c) { + return int_type(c); + } + + static bool eq_int_type(const int_type& c1, const int_type& c2) { + return c1 == c2; + } + + static int_type eof() { + return static_cast<int_type>(EOF); + } +}; + +} // namespace base + +// The string class will be explicitly instantiated only once, in string16.cc. +// +// std::basic_string<> in GNU libstdc++ contains a static data member, +// _S_empty_rep_storage, to represent empty strings. When an operation such +// as assignment or destruction is performed on a string, causing its existing +// data member to be invalidated, it must not be freed if this static data +// member is being used. Otherwise, it counts as an attempt to free static +// (and not allocated) data, which is a memory error. +// +// Generally, due to C++ template magic, _S_empty_rep_storage will be marked +// as a coalesced symbol, meaning that the linker will combine multiple +// instances into a single one when generating output. +// +// If a string class is used by multiple shared libraries, a problem occurs. +// Each library will get its own copy of _S_empty_rep_storage. When strings +// are passed across a library boundary for alteration or destruction, memory +// errors will result. GNU libstdc++ contains a configuration option, +// --enable-fully-dynamic-string (_GLIBCXX_FULLY_DYNAMIC_STRING), which +// disables the static data member optimization, but it's a good optimization +// and non-STL code is generally at the mercy of the system's STL +// configuration. Fully-dynamic strings are not the default for GNU libstdc++ +// libstdc++ itself or for the libstdc++ installations on the systems we care +// about, such as Mac OS X and relevant flavors of Linux. +// +// See also http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24196 . 
//
// To avoid problems, string classes need to be explicitly instantiated only
// once, in exactly one library.  All other string users see it via an "extern"
// declaration.  This is precisely how GNU libstdc++ handles
// std::basic_string<char> (string) and std::basic_string<wchar_t> (wstring).
//
// This also works around a Mac OS X linker bug in ld64-85.2.1 (Xcode 3.1.2),
// in which the linker does not fully coalesce symbols when dead code
// stripping is enabled.  This bug causes the memory errors described above
// to occur even when a std::basic_string<> does not cross shared library
// boundaries, such as in statically-linked executables.
//
// TODO(mark): File this bug with Apple and update this note with a bug number.
//
// NOTE(review): the statement below is written as an explicit instantiation
// definition ("template class ..."), not as an "extern template" declaration,
// so every translation unit that includes this header instantiates the class
// itself.  That looks at odds with the "only once" rule described above --
// confirm whether "extern" was dropped on purpose (for example because
// string16.cc is not part of this build) before changing it.

template class std::basic_string<char16, base::string16_char_traits>;

// The 16-bit string type used on non-Windows platforms: a std::basic_string
// of char16 using the custom traits class.
typedef std::basic_string<char16, base::string16_char_traits> string16;

// Stream insertion for string16; only declared here.
std::ostream& operator<<(std::ostream& out, const string16& str);

#endif  // !WIN32

#endif  // BASE_STRING16_H_

// ---------------------------------------------------------------------------
// diff --git a/googleurl/gurl.h b/googleurl/gurl.h
// new file mode 100644
// index 0000000..c6b3712
// --- /dev/null
// +++ b/googleurl/gurl.h
// @@ -0,0 +1,392 @@
// ---------------------------------------------------------------------------
// Copyright 2007, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_GURL_H__ +#define GOOGLEURL_SRC_GURL_H__ + +#include <iosfwd> +#include <string> + +#include "base/string16.h" +#include "url_canon.h" +#include "url_canon_stdstring.h" +#include "url_common.h" +#include "url_parse.h" + +class GURL { + public: + typedef url_canon::StdStringReplacements<std::string> Replacements; + typedef url_canon::StdStringReplacements<string16> ReplacementsW; + + // Creates an empty, invalid URL. + GURL_API GURL(); + + // Copy construction is relatively inexpensive, with most of the time going + // to reallocating the string. It does not re-parse. + GURL_API GURL(const GURL& other); + + // The narrow version requires the input be UTF-8. Invalid UTF-8 input will + // result in an invalid URL. + // + // The wide version should also take an encoding parameter so we know how to + // encode the query parameters. It is probably sufficient for the narrow + // version to assume the query parameter encoding should be the same as the + // input encoding. 
+ GURL_API explicit GURL(const std::string& url_string + /*, output_param_encoding*/); + GURL_API explicit GURL(const string16& url_string + /*, output_param_encoding*/); + + // Constructor for URLs that have already been parsed and canonicalized. This + // is used for conversions from KURL, for example. The caller must supply all + // information associated with the URL, which must be correct and consistent. + GURL_API GURL(const char* canonical_spec, size_t canonical_spec_len, + const url_parse::Parsed& parsed, bool is_valid); + + GURL_API ~GURL(); + + GURL_API GURL& operator=(const GURL& other); + + // Returns true when this object represents a valid parsed URL. When not + // valid, other functions will still succeed, but you will not get canonical + // data out in the format you may be expecting. Instead, we keep something + // "reasonable looking" so that the user can see how it's busted if + // displayed to them. + bool is_valid() const { + return is_valid_; + } + + // Returns true if the URL is zero-length. Note that empty URLs are also + // invalid, and is_valid() will return false for them. This is provided + // because some users may want to treat the empty case differently. + bool is_empty() const { + return spec_.empty(); + } + + // Returns the raw spec, i.e., the full text of the URL, in canonical UTF-8, + // if the URL is valid. If the URL is not valid, this will assert and return + // the empty string (for safety in release builds, to keep them from being + // misused which might be a security problem). + // + // The URL will be ASCII except the reference fragment, which may be UTF-8. + // It is guaranteed to be valid UTF-8. + // + // The exception is for empty() URLs (which are !is_valid()) but this will + // return the empty string without asserting. + // + // Used invalid_spec() below to get the unusable spec of an invalid URL. 
This + // separation is designed to prevent errors that may cause security problems + // that could result from the mistaken use of an invalid URL. + GURL_API const std::string& spec() const; + + // Returns the potentially invalid spec for a the URL. This spec MUST NOT be + // modified or sent over the network. It is designed to be displayed in error + // messages to the user, as the apperance of the spec may explain the error. + // If the spec is valid, the valid spec will be returned. + // + // The returned string is guaranteed to be valid UTF-8. + const std::string& possibly_invalid_spec() const { + return spec_; + } + + // Getter for the raw parsed structure. This allows callers to locate parts + // of the URL within the spec themselves. Most callers should consider using + // the individual component getters below. + // + // The returned parsed structure will reference into the raw spec, which may + // or may not be valid. If you are using this to index into the spec, BE + // SURE YOU ARE USING possibly_invalid_spec() to get the spec, and that you + // don't do anything "important" with invalid specs. + const url_parse::Parsed& parsed_for_possibly_invalid_spec() const { + return parsed_; + } + + // Defiant equality operator! + bool operator==(const GURL& other) const { + return spec_ == other.spec_; + } + bool operator!=(const GURL& other) const { + return spec_ != other.spec_; + } + + // Allows GURL to used as a key in STL (for example, a std::set or std::map). + bool operator<(const GURL& other) const { + return spec_ < other.spec_; + } + + // Resolves a URL that's possibly relative to this object's URL, and returns + // it. Absolute URLs are also handled according to the rules of URLs on web + // pages. + // + // It may be impossible to resolve the URLs properly. If the input is not + // "standard" (SchemeIsStandard() == false) and the input looks relative, we + // can't resolve it. In these cases, the result will be an empty, invalid + // GURL. 
+ // + // The result may also be a nonempty, invalid URL if the input has some kind + // of encoding error. In these cases, we will try to construct a "good" URL + // that may have meaning to the user, but it will be marked invalid. + // + // It is an error to resolve a URL relative to an invalid URL. The result + // will be the empty URL. + GURL_API GURL Resolve(const std::string& relative) const; + GURL_API GURL Resolve(const string16& relative) const; + + // Like Resolve() above but takes a character set encoder which will be used + // for any query text specified in the input. The charset converter parameter + // may be NULL, in which case it will be treated as UTF-8. + // + // TODO(brettw): These should be replaced with versions that take something + // more friendly than a raw CharsetConverter (maybe like an ICU character set + // name). + GURL_API GURL ResolveWithCharsetConverter( + const std::string& relative, + url_canon::CharsetConverter* charset_converter) const; + GURL_API GURL ResolveWithCharsetConverter( + const string16& relative, + url_canon::CharsetConverter* charset_converter) const; + + // Creates a new GURL by replacing the current URL's components with the + // supplied versions. See the Replacements class in url_canon.h for more. + // + // These are not particularly quick, so avoid doing mutations when possible. + // Prefer the 8-bit version when possible. + // + // It is an error to replace components of an invalid URL. The result will + // be the empty URL. + // + // Note that we use the more general url_canon::Replacements type to give + // callers extra flexibility rather than our override. + GURL_API GURL ReplaceComponents( + const url_canon::Replacements<char>& replacements) const; + GURL_API GURL ReplaceComponents( + const url_canon::Replacements<char16>& replacements) const; + + // A helper function that is equivalent to replacing the path with a slash + // and clearing out everything after that. 
We sometimes need to know just the + // scheme and the authority. If this URL is not a standard URL (it doesn't + // have the regular authority and path sections), then the result will be + // an empty, invalid GURL. Note that this *does* work for file: URLs, which + // some callers may want to filter out before calling this. + // + // It is an error to get an empty path on an invalid URL. The result + // will be the empty URL. + GURL_API GURL GetWithEmptyPath() const; + + // A helper function to return a GURL containing just the scheme, host, + // and port from a URL. Equivalent to clearing any username and password, + // replacing the path with a slash, and clearing everything after that. If + // this URL is not a standard URL, then the result will be an empty, + // invalid GURL. If the URL has neither username nor password, this + // degenerates to GetWithEmptyPath(). + // + // It is an error to get the origin of an invalid URL. The result + // will be the empty URL. + GURL_API GURL GetOrigin() const; + + // Returns true if the scheme for the current URL is a known "standard" + // scheme. Standard schemes have an authority and a path section. This + // includes file: and filesystem:, which some callers may want to filter out + // explicitly by calling SchemeIsFile[System]. + GURL_API bool IsStandard() const; + + // Returns true if the given parameter (should be lower-case ASCII to match + // the canonicalized scheme) is the scheme for this URL. This call is more + // efficient than getting the scheme and comparing it because no copies or + // object constructions are done. + GURL_API bool SchemeIs(const char* lower_ascii_scheme) const; + + // We often need to know if this is a file URL. File URLs are "standard", but + // are often treated separately by some programs. + bool SchemeIsFile() const { + return SchemeIs("file"); + } + + // FileSystem URLs need to be treated differently in some cases. 
+ bool SchemeIsFileSystem() const { + return SchemeIs("filesystem"); + } + + // If the scheme indicates a secure connection + bool SchemeIsSecure() const { + return SchemeIs("https") || + (SchemeIsFileSystem() && inner_url() && inner_url()->SchemeIsSecure()); + } + + // Returns true if the hostname is an IP address. Note: this function isn't + // as cheap as a simple getter because it re-parses the hostname to verify. + // This currently identifies only IPv4 addresses (bug 822685). + GURL_API bool HostIsIPAddress() const; + + // Getters for various components of the URL. The returned string will be + // empty if the component is empty or is not present. + std::string scheme() const { // Not including the colon. See also SchemeIs. + return ComponentString(parsed_.scheme); + } + std::string username() const { + return ComponentString(parsed_.username); + } + std::string password() const { + return ComponentString(parsed_.password); + } + // Note that this may be a hostname, an IPv4 address, or an IPv6 literal + // surrounded by square brackets, like "[2001:db8::1]". To exclude these + // brackets, use HostNoBrackets() below. + std::string host() const { + return ComponentString(parsed_.host); + } + std::string port() const { // Returns -1 if "default" + return ComponentString(parsed_.port); + } + std::string path() const { // Including first slash following host + return ComponentString(parsed_.path); + } + std::string query() const { // Stuff following '?' + return ComponentString(parsed_.query); + } + std::string ref() const { // Stuff following '#' + return ComponentString(parsed_.ref); + } + + // Existance querying. These functions will return true if the corresponding + // URL component exists in this URL. Note that existance is different than + // being nonempty. http://www.google.com/? has a query that just happens to + // be empty, and has_query() will return true. 
+ bool has_scheme() const { + return parsed_.scheme.len >= 0; + } + bool has_username() const { + return parsed_.username.len >= 0; + } + bool has_password() const { + return parsed_.password.len >= 0; + } + bool has_host() const { + // Note that hosts are special, absense of host means length 0. + return parsed_.host.len > 0; + } + bool has_port() const { + return parsed_.port.len >= 0; + } + bool has_path() const { + // Note that http://www.google.com/" has a path, the path is "/". This can + // return false only for invalid or nonstandard URLs. + return parsed_.path.len >= 0; + } + bool has_query() const { + return parsed_.query.len >= 0; + } + bool has_ref() const { + return parsed_.ref.len >= 0; + } + + // Returns a parsed version of the port. Can also be any of the special + // values defined in Parsed for ExtractPort. + GURL_API int IntPort() const; + + // Returns the port number of the url, or the default port number. + // If the scheme has no concept of port (or unknown default) returns + // PORT_UNSPECIFIED. + GURL_API int EffectiveIntPort() const; + + // Extracts the filename portion of the path and returns it. The filename + // is everything after the last slash in the path. This may be empty. + GURL_API std::string ExtractFileName() const; + + // Returns the path that should be sent to the server. This is the path, + // parameter, and query portions of the URL. It is guaranteed to be ASCII. + GURL_API std::string PathForRequest() const; + + // Returns the host, excluding the square brackets surrounding IPv6 address + // literals. This can be useful for passing to getaddrinfo(). + GURL_API std::string HostNoBrackets() const; + + // Returns true if this URL's host matches or is in the same domain as + // the given input string. For example if this URL was "www.google.com", + // this would match "com", "google.com", and "www.google.com + // (input domain should be lower-case ASCII to match the canonicalized + // scheme). 
This call is more efficient than getting the host and check + // whether host has the specific domain or not because no copies or + // object constructions are done. + // + // If function DomainIs has parameter domain_len, which means the parameter + // lower_ascii_domain does not gurantee to terminate with NULL character. + GURL_API bool DomainIs(const char* lower_ascii_domain, int domain_len) const; + + // If function DomainIs only has parameter lower_ascii_domain, which means + // domain string should be terminate with NULL character. + bool DomainIs(const char* lower_ascii_domain) const { + return DomainIs(lower_ascii_domain, + static_cast<int>(strlen(lower_ascii_domain))); + } + + // Swaps the contents of this GURL object with the argument without doing + // any memory allocations. + GURL_API void Swap(GURL* other); + + // Returns a reference to a singleton empty GURL. This object is for callers + // who return references but don't have anything to return in some cases. + // This function may be called from any thread. + GURL_API static const GURL& EmptyGURL(); + + // Returns the inner URL of a nested URL [currently only non-null for + // filesystem: URLs]. + const GURL* inner_url() const { + return inner_url_; + } + + private: + // Returns the substring of the input identified by the given component. + std::string ComponentString(const url_parse::Component& comp) const { + if (comp.len <= 0) + return std::string(); + return std::string(spec_, comp.begin, comp.len); + } + + // The actual text of the URL, in canonical ASCII form. + std::string spec_; + + // Set when the given URL is valid. Otherwise, we may still have a spec and + // components, but they may not identify valid resources (for example, an + // invalid port number, invalid characters in the scheme, etc.). + bool is_valid_; + + // Identified components of the canonical spec. + url_parse::Parsed parsed_; + + // Used for nested schemes [currently only filesystem:]. 
+ GURL* inner_url_; + + // TODO bug 684583: Add encoding for query params. +}; + +// Stream operator so GURL can be used in assertion statements. +GURL_API std::ostream& operator<<(std::ostream& out, const GURL& url); + +#endif // GOOGLEURL_SRC_GURL_H__ diff --git a/googleurl/url_canon.h b/googleurl/url_canon.h new file mode 100644 index 0000000..a3009fe --- /dev/null +++ b/googleurl/url_canon.h @@ -0,0 +1,912 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#ifndef GOOGLEURL_SRC_URL_CANON_H__ +#define GOOGLEURL_SRC_URL_CANON_H__ + +#include <string.h> +#include <stdlib.h> + +#include "base/string16.h" +#include "url_common.h" +#include "url_parse.h" + +namespace url_canon { + +// Canonicalizer output ------------------------------------------------------- + +// Base class for the canonicalizer output, this maintains a buffer and +// supports simple resizing and append operations on it. +// +// It is VERY IMPORTANT that no virtual function calls be made on the common +// code path. We only have two virtual function calls, the destructor and a +// resize function that is called when the existing buffer is not big enough. +// The derived class is then in charge of setting up our buffer which we will +// manage. +template<typename T> +class CanonOutputT { + public: + CanonOutputT() : buffer_(NULL), buffer_len_(0), cur_len_(0) { + } + virtual ~CanonOutputT() { + } + + // Implemented to resize the buffer. This function should update the buffer + // pointer to point to the new buffer, and any old data up to |cur_len_| in + // the buffer must be copied over. + // + // The new size |sz| must be larger than buffer_len_. + virtual void Resize(int sz) = 0; + + // Accessor for returning a character at a given position. The input offset + // must be in the valid range. + inline char at(int offset) const { + return buffer_[offset]; + } + + // Sets the character at the given position. The given position MUST be less + // than the length(). + inline void set(int offset, int ch) { + buffer_[offset] = ch; + } + + // Returns the number of characters currently in the buffer. + inline int length() const { + return cur_len_; + } + + // Returns the current capacity of the buffer. The length() is the number of + // characters that have been declared to be written, but the capacity() is + // the number that can be written without reallocation. 
If the caller must + // write many characters at once, it can make sure there is enough capacity, + // write the data, then use set_size() to declare the new length(). + int capacity() const { + return buffer_len_; + } + + // Called by the user of this class to get the output. The output will NOT + // be NULL-terminated. Call length() to get the + // length. + const T* data() const { + return buffer_; + } + T* data() { + return buffer_; + } + + // Shortens the URL to the new length. Used for "backing up" when processing + // relative paths. This can also be used if an external function writes a lot + // of data to the buffer (when using the "Raw" version below) beyond the end, + // to declare the new length. + // + // This MUST NOT be used to expand the size of the buffer beyond capacity(). + void set_length(int new_len) { + cur_len_ = new_len; + } + + // This is the most performance critical function, since it is called for + // every character. + void push_back(T ch) { + // In VC2005, putting this common case first speeds up execution + // dramatically because this branch is predicted as taken. + if (cur_len_ < buffer_len_) { + buffer_[cur_len_] = ch; + cur_len_++; + return; + } + + // Grow the buffer to hold at least one more item. Hopefully we won't have + // to do this very often. + if (!Grow(1)) + return; + + // Actually do the insertion. + buffer_[cur_len_] = ch; + cur_len_++; + } + + // Appends the given string to the output. + void Append(const T* str, int str_len) { + if (cur_len_ + str_len > buffer_len_) { + if (!Grow(cur_len_ + str_len - buffer_len_)) + return; + } + for (int i = 0; i < str_len; i++) + buffer_[cur_len_ + i] = str[i]; + cur_len_ += str_len; + } + + protected: + // Grows the given buffer so that it can fit at least |min_additional| + // characters. Returns true if the buffer could be resized, false on OOM. + bool Grow(int min_additional) { + static const int kMinBufferLen = 16; + int new_len = (buffer_len_ == 0) ? 
kMinBufferLen : buffer_len_; + do { + if (new_len >= (1 << 30)) // Prevent overflow below. + return false; + new_len *= 2; + } while (new_len < buffer_len_ + min_additional); + Resize(new_len); + return true; + } + + T* buffer_; + int buffer_len_; + + // Used characters in the buffer. + int cur_len_; +}; + +// Simple implementation of the CanonOutput using new[]. This class +// also supports a static buffer so if it is allocated on the stack, most +// URLs can be canonicalized with no heap allocations. +template<typename T, int fixed_capacity = 1024> +class RawCanonOutputT : public CanonOutputT<T> { + public: + RawCanonOutputT() : CanonOutputT<T>() { + this->buffer_ = fixed_buffer_; + this->buffer_len_ = fixed_capacity; + } + virtual ~RawCanonOutputT() { + if (this->buffer_ != fixed_buffer_) + delete[] this->buffer_; + } + + virtual void Resize(int sz) { + T* new_buf = new T[sz]; + memcpy(new_buf, this->buffer_, + sizeof(T) * (this->cur_len_ < sz ? this->cur_len_ : sz)); + if (this->buffer_ != fixed_buffer_) + delete[] this->buffer_; + this->buffer_ = new_buf; + this->buffer_len_ = sz; + } + + protected: + T fixed_buffer_[fixed_capacity]; +}; + +// Normally, all canonicalization output is in narrow characters. We support +// the templates so it can also be used internally if a wide buffer is +// required. +typedef CanonOutputT<char> CanonOutput; +typedef CanonOutputT<char16> CanonOutputW; + +template<int fixed_capacity> +class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {}; +template<int fixed_capacity> +class RawCanonOutputW : public RawCanonOutputT<char16, fixed_capacity> {}; + +// Character set converter ---------------------------------------------------- +// +// Converts query strings into a custom encoding. The embedder can supply an +// implementation of this class to interface with their own character set +// conversion libraries. +// +// Embedders will want to see the unit test for the ICU version. 

// Abstract interface for converting UTF-16 query text into another encoding.
// Implementations override ConvertFromUTF16().
class CharsetConverter {
 public:
  CharsetConverter() {}
  virtual ~CharsetConverter() {}

  // Converts the given input string from UTF-16 to whatever output format the
  // converter supports. This is used only for the query encoding conversion,
  // which does not fail. Instead, the converter should insert "invalid
  // character" characters in the output for invalid sequences, and do the
  // best it can.
  //
  // If the input contains a character not representable in the output
  // character set, the converter should append the HTML entity sequence in
  // decimal, (such as "&#20320;") with escaping of the ampersand, number
  // sign, and semicolon (in the previous example it would be
  // "%26%2320320%3B"). This rule is based on what IE does in this situation.
  virtual void ConvertFromUTF16(const char16* input,
                                int input_len,
                                CanonOutput* output) = 0;
};

// Whitespace -----------------------------------------------------------------

// Searches for whitespace that should be removed from the middle of URLs, and
// removes it. Removed whitespace are tabs and newlines, but NOT spaces. Spaces
// are preserved, which is what most browsers do. A pointer to the output will
// be returned, and the length of that output will be in |output_len|.
//
// This should be called before parsing if whitespace removal is desired (which
// it normally is when you are canonicalizing).
//
// If no whitespace is removed, this function will not use the buffer and will
// return a pointer to the input, to avoid the extra copy. If modification is
// required, the given |buffer| will be used and the returned pointer will
// point to the beginning of the buffer.
//
// Therefore, callers should not use the buffer, since it may actually be
// empty; use the computed pointer and |*output_len| instead.
+GURL_API const char* RemoveURLWhitespace(const char* input, int input_len, + CanonOutputT<char>* buffer, + int* output_len); +GURL_API const char16* RemoveURLWhitespace(const char16* input, int input_len, + CanonOutputT<char16>* buffer, + int* output_len); + +// IDN ------------------------------------------------------------------------ + +// Converts the Unicode input representing a hostname to ASCII using IDN rules. +// The output must fall in the ASCII range, but will be encoded in UTF-16. +// +// On success, the output will be filled with the ASCII host name and it will +// return true. Unlike most other canonicalization functions, this assumes that +// the output is empty. The beginning of the host will be at offset 0, and +// the length of the output will be set to the length of the new host name. +// +// On error, returns false. The output in this case is undefined. +GURL_API bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output); + +// Piece-by-piece canonicalizers ---------------------------------------------- +// +// These individual canonicalizers append the canonicalized versions of the +// corresponding URL component to the given std::string. The spec and the +// previously-identified range of that component are the input. The range of +// the canonicalized component will be written to the output component. +// +// These functions all append to the output so they can be chained. Make sure +// the output is empty when you start. +// +// These functions returns boolean values indicating success. On failure, they +// will attempt to write something reasonable to the output so that, if +// displayed to the user, they will recognise it as something that's messed up. +// Nothing more should ever be done with these invalid URLs, however. + +// Scheme: Appends the scheme and colon to the URL. The output component will +// indicate the range of characters up to but not including the colon. +// +// Canonical URLs always have a scheme. 
If the scheme is not present in the +// input, this will just write the colon to indicate an empty scheme. Does not +// append slashes which will be needed before any authority components for most +// URLs. +// +// The 8-bit version requires UTF-8 encoding. +GURL_API bool CanonicalizeScheme(const char* spec, + const url_parse::Component& scheme, + CanonOutput* output, + url_parse::Component* out_scheme); +GURL_API bool CanonicalizeScheme(const char16* spec, + const url_parse::Component& scheme, + CanonOutput* output, + url_parse::Component* out_scheme); + +// User info: username/password. If present, this will add the delimiters so +// the output will be "<username>:<password>@" or "<username>@". Empty +// username/password pairs, or empty passwords, will get converted to +// nonexistant in the canonical version. +// +// The components for the username and password refer to ranges in the +// respective source strings. Usually, these will be the same string, which +// is legal as long as the two components don't overlap. +// +// The 8-bit version requires UTF-8 encoding. +GURL_API bool CanonicalizeUserInfo(const char* username_source, + const url_parse::Component& username, + const char* password_source, + const url_parse::Component& password, + CanonOutput* output, + url_parse::Component* out_username, + url_parse::Component* out_password); +GURL_API bool CanonicalizeUserInfo(const char16* username_source, + const url_parse::Component& username, + const char16* password_source, + const url_parse::Component& password, + CanonOutput* output, + url_parse::Component* out_username, + url_parse::Component* out_password); + + +// This structure holds detailed state exported from the IP/Host canonicalizers. +// Additional fields may be added as callers require them. +struct CanonHostInfo { + CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {} + + // Convenience function to test if family is an IP address. 
+ bool IsIPAddress() const { return family == IPV4 || family == IPV6; } + + // This field summarizes how the input was classified by the canonicalizer. + enum Family { + NEUTRAL, // - Doesn't resemble an IP address. As far as the IP + // canonicalizer is concerned, it should be treated as a + // hostname. + BROKEN, // - Almost an IP, but was not canonicalized. This could be an + // IPv4 address where truncation occurred, or something + // containing the special characters :[] which did not parse + // as an IPv6 address. Never attempt to connect to this + // address, because it might actually succeed! + IPV4, // - Successfully canonicalized as an IPv4 address. + IPV6 // - Successfully canonicalized as an IPv6 address. + }; + Family family; + + // If |family| is IPV4, then this is the number of nonempty dot-separated + // components in the input text, from 1 to 4. If |family| is not IPV4, + // this value is undefined. + int num_ipv4_components; + + // Location of host within the canonicalized output. + // CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6. + // CanonicalizeHostVerbose() always sets it. + url_parse::Component out_host; + + // |address| contains the parsed IP Address (if any) in its first + // AddressLength() bytes, in network order. If IsIPAddress() is false + // AddressLength() will return zero and the content of |address| is undefined. + unsigned char address[16]; + + // Convenience function to calculate the length of an IP address corresponding + // to the current IP version in |family|, if any. For use with |address|. + int AddressLength() const { + return family == IPV4 ? 4 : (family == IPV6 ? 16 : 0); + } +}; + + +// Host. +// +// The 8-bit version requires UTF-8 encoding. Use this version when you only +// need to know whether canonicalization succeeded. 
+GURL_API bool CanonicalizeHost(const char* spec, + const url_parse::Component& host, + CanonOutput* output, + url_parse::Component* out_host); +GURL_API bool CanonicalizeHost(const char16* spec, + const url_parse::Component& host, + CanonOutput* output, + url_parse::Component* out_host); + +// Extended version of CanonicalizeHost, which returns additional information. +// Use this when you need to know whether the hostname was an IP address. +// A successful return is indicated by host_info->family != BROKEN. See the +// definition of CanonHostInfo above for details. +GURL_API void CanonicalizeHostVerbose(const char* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); +GURL_API void CanonicalizeHostVerbose(const char16* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); + + +// IP addresses. +// +// Tries to interpret the given host name as an IPv4 or IPv6 address. If it is +// an IP address, it will canonicalize it as such, appending it to |output|. +// Additional status information is returned via the |*host_info| parameter. +// See the definition of CanonHostInfo above for details. +// +// This is called AUTOMATICALLY from the host canonicalizer, which ensures that +// the input is unescaped and name-prepped, etc. It should not normally be +// necessary or wise to call this directly. +GURL_API void CanonicalizeIPAddress(const char* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); +GURL_API void CanonicalizeIPAddress(const char16* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); + +// Port: this function will add the colon for the port if a port is present. +// The caller can pass url_parse::PORT_UNSPECIFIED as the +// default_port_for_scheme argument if there is no default port. +// +// The 8-bit version requires UTF-8 encoding. 
+GURL_API bool CanonicalizePort(const char* spec, + const url_parse::Component& port, + int default_port_for_scheme, + CanonOutput* output, + url_parse::Component* out_port); +GURL_API bool CanonicalizePort(const char16* spec, + const url_parse::Component& port, + int default_port_for_scheme, + CanonOutput* output, + url_parse::Component* out_port); + +// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED +// if the scheme is unknown. +GURL_API int DefaultPortForScheme(const char* scheme, int scheme_len); + +// Path. If the input does not begin in a slash (including if the input is +// empty), we'll prepend a slash to the path to make it canonical. +// +// The 8-bit version assumes UTF-8 encoding, but does not verify the validity +// of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid +// characters, etc.). Normally, URLs will come in as UTF-16, so this isn't +// an issue. Somebody giving us an 8-bit path is responsible for generating +// the path that the server expects (we'll escape high-bit characters), so +// if something is invalid, it's their problem. +GURL_API bool CanonicalizePath(const char* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); +GURL_API bool CanonicalizePath(const char16* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); + +// Canonicalizes the input as a file path. This is like CanonicalizePath except +// that it also handles Windows drive specs. For example, the path can begin +// with "c|\" and it will get properly canonicalized to "C:/". +// The string will be appended to |*output| and |*out_path| will be updated. +// +// The 8-bit version requires UTF-8 encoding. 
+GURL_API bool FileCanonicalizePath(const char* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); +GURL_API bool FileCanonicalizePath(const char16* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); + +// Query: Prepends the ? if needed. +// +// The 8-bit version requires the input to be UTF-8 encoded. Incorrectly +// encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode +// "invalid character." This function cannot fail; we always just try to do +// our best for crazy input here since web pages can set it themselves. +// +// This will convert the given input into the output encoding that the given +// character set converter object provides. The converter will only be called +// if necessary; for ASCII input, no conversions are necessary. +// +// The converter can be NULL. In this case, the output encoding will be UTF-8. +GURL_API void CanonicalizeQuery(const char* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output, + url_parse::Component* out_query); +GURL_API void CanonicalizeQuery(const char16* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output, + url_parse::Component* out_query); + +// Ref: Prepends the # if needed. The output will be UTF-8 (this is the only +// canonicalizer that does not produce ASCII output). The output is +// guaranteed to be valid UTF-8. +// +// This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use +// the "Unicode replacement character" for the confusing bits and copy the rest.
+GURL_API void CanonicalizeRef(const char* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); +GURL_API void CanonicalizeRef(const char16* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); + +// Full canonicalizer --------------------------------------------------------- +// +// These functions replace any string contents, rather than append as above. +// See the above piece-by-piece functions for information specific to +// canonicalizing individual components. +// +// The output will be ASCII except the reference fragment, which may be UTF-8. +// +// The 8-bit versions require UTF-8 encoding. + +// Use for standard URLs with authorities and paths. +GURL_API bool CanonicalizeStandardURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizeStandardURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Use for file URLs. +GURL_API bool CanonicalizeFileURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizeFileURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Use for filesystem URLs. 
+GURL_API bool CanonicalizeFileSystemURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizeFileSystemURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Use for path URLs such as javascript. This does not modify the path in any +// way, for example, by escaping it. +GURL_API bool CanonicalizePathURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizePathURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Use for mailto URLs. This "canonicalizes" the url into a path and query +// component. It does not attempt to merge "to" fields. It uses UTF-8 for +// the query encoding if there is a query. This is because a mailto URL is +// really intended for an external mail program, and the encoding of a page, +// etc. which would influence a query encoding normally are irrelevant. +GURL_API bool CanonicalizeMailtoURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizeMailtoURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Part replacer -------------------------------------------------------------- + +// Internal structure used for storing separate strings for each component. +// The basic canonicalization functions use this structure internally so that +// component replacement (different strings for different components) can be +// treated on the same code path as regular canonicalization (the same string +// for each component). 
//
// A url_parse::Parsed structure usually goes along with this. Those
// components identify offsets within these strings, so that they can all be
// in the same string, or spread arbitrarily across different ones.
//
// This structure does not own any data. It is the caller's responsibility to
// ensure that the data the pointers point to stays in scope and is not
// modified.
template<typename CHAR>
struct URLComponentSource {
  // Used by callers that want to replace components: every source pointer
  // starts out NULL (meaning "no replacement"), and the caller then overrides
  // only the ones it wants to change.
  URLComponentSource() {
    scheme = username = password = host = port = path = query = ref = NULL;
  }

  // Used internally: points every component at the same spec string.
  explicit URLComponentSource(const CHAR* default_value) {
    scheme = username = password = host = port = path = query = ref =
        default_value;
  }

  // One source string per URL component. None of these are owned.
  const CHAR* scheme;
  const CHAR* username;
  const CHAR* password;
  const CHAR* host;
  const CHAR* port;
  const CHAR* path;
  const CHAR* query;
  const CHAR* ref;
};

// This structure encapsulates information on modifying a URL. Each component
// may either be left unchanged, replaced, or deleted.
//
// By default, each component is unchanged. For those components that should be
// modified, call either Set* or Clear* to modify it.
//
// The string passed to Set* functions DOES NOT GET COPIED AND MUST BE KEPT
// IN SCOPE BY THE CALLER for as long as this object exists!
//
// Prefer the 8-bit replacement version if possible since it is more efficient.
+template<typename CHAR> +class Replacements { + public: + Replacements() { + } + + // Scheme + void SetScheme(const CHAR* s, const url_parse::Component& comp) { + sources_.scheme = s; + components_.scheme = comp; + } + // Note: we don't have a ClearScheme since this doesn't make any sense. + bool IsSchemeOverridden() const { return sources_.scheme != NULL; } + + // Username + void SetUsername(const CHAR* s, const url_parse::Component& comp) { + sources_.username = s; + components_.username = comp; + } + void ClearUsername() { + sources_.username = Placeholder(); + components_.username = url_parse::Component(); + } + bool IsUsernameOverridden() const { return sources_.username != NULL; } + + // Password + void SetPassword(const CHAR* s, const url_parse::Component& comp) { + sources_.password = s; + components_.password = comp; + } + void ClearPassword() { + sources_.password = Placeholder(); + components_.password = url_parse::Component(); + } + bool IsPasswordOverridden() const { return sources_.password != NULL; } + + // Host + void SetHost(const CHAR* s, const url_parse::Component& comp) { + sources_.host = s; + components_.host = comp; + } + void ClearHost() { + sources_.host = Placeholder(); + components_.host = url_parse::Component(); + } + bool IsHostOverridden() const { return sources_.host != NULL; } + + // Port + void SetPort(const CHAR* s, const url_parse::Component& comp) { + sources_.port = s; + components_.port = comp; + } + void ClearPort() { + sources_.port = Placeholder(); + components_.port = url_parse::Component(); + } + bool IsPortOverridden() const { return sources_.port != NULL; } + + // Path + void SetPath(const CHAR* s, const url_parse::Component& comp) { + sources_.path = s; + components_.path = comp; + } + void ClearPath() { + sources_.path = Placeholder(); + components_.path = url_parse::Component(); + } + bool IsPathOverridden() const { return sources_.path != NULL; } + + // Query + void SetQuery(const CHAR* s, const 
url_parse::Component& comp) { + sources_.query = s; + components_.query = comp; + } + void ClearQuery() { + sources_.query = Placeholder(); + components_.query = url_parse::Component(); + } + bool IsQueryOverridden() const { return sources_.query != NULL; } + + // Ref + void SetRef(const CHAR* s, const url_parse::Component& comp) { + sources_.ref = s; + components_.ref = comp; + } + void ClearRef() { + sources_.ref = Placeholder(); + components_.ref = url_parse::Component(); + } + bool IsRefOverridden() const { return sources_.ref != NULL; } + + // Getters for the itnernal data. See the variables below for how the + // information is encoded. + const URLComponentSource<CHAR>& sources() const { return sources_; } + const url_parse::Parsed& components() const { return components_; } + + private: + // Returns a pointer to a static empty string that is used as a placeholder + // to indicate a component should be deleted (see below). + const CHAR* Placeholder() { + static const CHAR empty_string = 0; + return &empty_string; + } + + // We support three states: + // + // Action | Source Component + // -----------------------+-------------------------------------------------- + // Don't change component | NULL (unused) + // Replace component | (replacement string) (replacement component) + // Delete component | (non-NULL) (invalid component: (0,-1)) + // + // We use a pointer to the empty string for the source when the component + // should be deleted. + URLComponentSource<CHAR> sources_; + url_parse::Parsed components_; +}; + +// The base must be an 8-bit canonical URL. 
+GURL_API bool ReplaceStandardURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool ReplaceStandardURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Filesystem URLs can only have the path, query, or ref replaced. +// All other components will be ignored. +GURL_API bool ReplaceFileSystemURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool ReplaceFileSystemURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Replacing some parts of a file URL is not permitted. Everything except +// the host, path, query, and ref will be ignored. +GURL_API bool ReplaceFileURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool ReplaceFileURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Path URLs can only have the scheme and path replaced. All other components +// will be ignored. 
+GURL_API bool ReplacePathURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool ReplacePathURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Mailto URLs can only have the scheme, path, and query replaced. +// All other components will be ignored. +GURL_API bool ReplaceMailtoURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool ReplaceMailtoURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Relative URL --------------------------------------------------------------- + +// Given an input URL or URL fragment |fragment|, determines if it is a +// relative or absolute URL and places the result into |*is_relative|. If it is +// relative, the relevant portion of the URL will be placed into +// |*relative_component| (there may have been trimmed whitespace, for example). +// This value is passed to ResolveRelativeURL. If the input is not relative, +// this value is UNDEFINED (it may be changed by the function). +// +// Returns true on success (we successfully determined the URL is relative or +// not). Failure means that the combination of URLs doesn't make any sense. +// +// The base URL should always be canonical, therefore is ASCII. 
+GURL_API bool IsRelativeURL(const char* base, + const url_parse::Parsed& base_parsed, + const char* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + url_parse::Component* relative_component); +GURL_API bool IsRelativeURL(const char* base, + const url_parse::Parsed& base_parsed, + const char16* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + url_parse::Component* relative_component); + +// Given a canonical parsed source URL, a URL fragment known to be relative, +// and the identified relevant portion of the relative URL (computed by +// IsRelativeURL), this produces a new parsed canonical URL in |output| and +// |out_parsed|. +// +// It also requires a flag indicating whether the base URL is a file: URL +// which triggers additional logic. +// +// The base URL should be canonical and have a host (may be empty for file +// URLs) and a path. If it doesn't have these, we can't resolve relative +// URLs off of it and will return the base as the output with an error flag. +// Because it is canonical, it should also be ASCII. +// +// The query charset converter follows the same rules as CanonicalizeQuery. +// +// Returns true on success. On failure, the output will be "something +// reasonable" that will be consistent and valid, just probably not what +// was intended by the web page author or caller.
+GURL_API bool ResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const char* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed); +GURL_API bool ResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const char16* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed); + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_H__ diff --git a/googleurl/url_canon_icu.h b/googleurl/url_canon_icu.h new file mode 100644 index 0000000..736e1e9 --- /dev/null +++ b/googleurl/url_canon_icu.h @@ -0,0 +1,63 @@ +// Copyright 2011, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// ICU integration functions. + +#ifndef GOOGLEURL_SRC_URL_CANON_ICU_H__ +#define GOOGLEURL_SRC_URL_CANON_ICU_H__ + +#include "url_canon.h" + +typedef struct UConverter UConverter; + +namespace url_canon { + +// An implementation of CharsetConverter that implementations can use to +// interface the canonicalizer with ICU's conversion routines. +class ICUCharsetConverter : public CharsetConverter { + public: + // Constructs a converter using an already-existing ICU character set + // converter. This converter is NOT owned by this object; the lifetime must + // be managed by the creator such that it is alive as long as this is. + GURL_API ICUCharsetConverter(UConverter* converter); + + GURL_API virtual ~ICUCharsetConverter(); + + GURL_API virtual void ConvertFromUTF16(const char16* input, + int input_len, + CanonOutput* output); + + private: + // The ICU converter, not owned by this class. + UConverter* converter_; +}; + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_ICU_H__ diff --git a/googleurl/url_canon_internal.h b/googleurl/url_canon_internal.h new file mode 100644 index 0000000..ac5774f --- /dev/null +++ b/googleurl/url_canon_internal.h @@ -0,0 +1,462 @@ +// Copyright 2011, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is intended to be included in another C++ file where the character +// types are defined. This allows us to write mostly generic code, but not have +// template bloat because everything is inlined when anybody calls any of our +// functions.
+ +#ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_H__ +#define GOOGLEURL_SRC_URL_CANON_INTERNAL_H__ + +#include <stdlib.h> + +#include "base/logging.h" +#include "url_canon.h" + +namespace url_canon { + +// Character type handling ----------------------------------------------------- + +// Bits that identify different character types. These types identify different +// bits that are set for each 8-bit character in the kSharedCharTypeTable. +enum SharedCharTypes { + // Characters that do not require escaping in queries. Characters that do + // not have this flag will be escaped; see url_canon_query.cc + CHAR_QUERY = 1, + + // Valid in the username/password field. + CHAR_USERINFO = 2, + + // Valid in a IPv4 address (digits plus dot and 'x' for hex). + CHAR_IPV4 = 4, + + // Valid in an ASCII-representation of a hex digit (as in %-escaped). + CHAR_HEX = 8, + + // Valid in an ASCII-representation of a decimal digit. + CHAR_DEC = 16, + + // Valid in an ASCII-representation of an octal digit. + CHAR_OCT = 32, + + // Characters that do not require escaping in encodeURIComponent. Characters + // that do not have this flag will be escaped; see url_util.cc. + CHAR_COMPONENT = 64 +}; + +// This table contains the flags in SharedCharTypes for each 8-bit character. +// Some canonicalization functions have their own specialized lookup table. +// For those with simple requirements, we have collected the flags in one +// place so there are fewer lookup tables to load into the CPU cache. +// +// Using an unsigned char type has a small but measurable performance benefit +// over using a 32-bit number. +extern const unsigned char kSharedCharTypeTable[0x100]; + +// More readable wrappers around the character type lookup table. 
+inline bool IsCharOfType(unsigned char c, SharedCharTypes type) { + return !!(kSharedCharTypeTable[c] & type); +} +inline bool IsQueryChar(unsigned char c) { + return IsCharOfType(c, CHAR_QUERY); +} +inline bool IsIPv4Char(unsigned char c) { + return IsCharOfType(c, CHAR_IPV4); +} +inline bool IsHexChar(unsigned char c) { + return IsCharOfType(c, CHAR_HEX); +} +inline bool IsComponentChar(unsigned char c) { + return IsCharOfType(c, CHAR_COMPONENT); +} + +// Appends the given string to the output, escaping characters that do not +// match the given |type| in SharedCharTypes. +void AppendStringOfType(const char* source, int length, + SharedCharTypes type, + CanonOutput* output); +void AppendStringOfType(const char16* source, int length, + SharedCharTypes type, + CanonOutput* output); + +// Maps the hex numerical values 0x0 to 0xf to the corresponding ASCII digit +// that will be used to represent it. +GURL_API extern const char kHexCharLookup[0x10]; + +// This lookup table allows fast conversion between ASCII hex letters and their +// corresponding numerical value. The 8-bit range is divided up into 8 +// regions of 0x20 characters each. Each of the three character types (numbers, +// uppercase, lowercase) falls into different regions of this range. The table +// contains the amount to subtract from characters in that range to get at +// the corresponding numerical value. +// +// See HexDigitToValue for the lookup. +extern const char kCharToHexLookup[8]; + +// Assumes the input is a valid hex digit! Call IsHexChar before using this. +inline unsigned char HexCharToValue(unsigned char c) { + return c - kCharToHexLookup[c / 0x20]; +} + +// Indicates if the given character is a dot or dot equivalent, returning the +// number of characters taken by it. This will be one for a literal dot, 3 for +// an escaped dot. If the character is not a dot, this will return 0. 
+template<typename CHAR> +inline int IsDot(const CHAR* spec, int offset, int end) { + if (spec[offset] == '.') { + return 1; + } else if (spec[offset] == '%' && offset + 3 <= end && + spec[offset + 1] == '2' && + (spec[offset + 2] == 'e' || spec[offset + 2] == 'E')) { + // Found "%2e" + return 3; + } + return 0; +} + +// Returns the canonicalized version of the input character according to scheme +// rules. This is implemented alongside the scheme canonicalizer, and is +// required for relative URL resolving to test for scheme equality. +// +// Returns 0 if the input character is not a valid scheme character. +char CanonicalSchemeChar(char16 ch); + +// Write a single character, escaped, to the output. This always escapes: it +// does no checking that thee character requires escaping. +// Escaping makes sense only 8 bit chars, so code works in all cases of +// input parameters (8/16bit). +template<typename UINCHAR, typename OUTCHAR> +inline void AppendEscapedChar(UINCHAR ch, + CanonOutputT<OUTCHAR>* output) { + output->push_back('%'); + output->push_back(kHexCharLookup[(ch >> 4) & 0xf]); + output->push_back(kHexCharLookup[ch & 0xf]); +} + +// The character we'll substitute for undecodable or invalid characters. +extern const char16 kUnicodeReplacementCharacter; + +// UTF-8 functions ------------------------------------------------------------ + +// Reads one character in UTF-8 starting at |*begin| in |str| and places +// the decoded value into |*code_point|. If the character is valid, we will +// return true. If invalid, we'll return false and put the +// kUnicodeReplacementCharacter into |*code_point|. +// +// |*begin| will be updated to point to the last character consumed so it +// can be incremented in a loop and will be ready for the next character. +// (for a single-byte ASCII character, it will not be changed). +// +// Implementation is in url_canon_icu.cc. 
+GURL_API bool ReadUTFChar(const char* str, int* begin, int length, + unsigned* code_point_out); + +// Generic To-UTF-8 converter. This will call the given append method for each +// character that should be appended, with the given output method. Wrappers +// are provided below for escaped and non-escaped versions of this. +// +// The char_value must have already been checked that it's a valid Unicode +// character. +template<class Output, void Appender(unsigned char, Output*)> +inline void DoAppendUTF8(unsigned char_value, Output* output) { + if (char_value <= 0x7f) { + Appender(static_cast<unsigned char>(char_value), output); + } else if (char_value <= 0x7ff) { + // 110xxxxx 10xxxxxx + Appender(static_cast<unsigned char>(0xC0 | (char_value >> 6)), + output); + Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), + output); + } else if (char_value <= 0xffff) { + // 1110xxxx 10xxxxxx 10xxxxxx + Appender(static_cast<unsigned char>(0xe0 | (char_value >> 12)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), + output); + } else if (char_value <= 0x10FFFF) { // Max unicode code point. + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), + output); + } else { + // Invalid UTF-8 character (>20 bits). + NOTREACHED(); + } +} + +// Helper used by AppendUTF8Value below. We use an unsigned parameter so there +// are no funny sign problems with the input, but then have to convert it to +// a regular char for appending. 
+inline void AppendCharToOutput(unsigned char ch, CanonOutput* output) { + output->push_back(static_cast<char>(ch)); +} + +// Writes the given character to the output as UTF-8. This does NO checking +// of the validity of the unicode characters; the caller should ensure that +// the value it is appending is valid to append. +inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) { + DoAppendUTF8<CanonOutput, AppendCharToOutput>(char_value, output); +} + +// Writes the given character to the output as UTF-8, escaping ALL +// characters (even when they are ASCII). This does NO checking of the +// validity of the unicode characters; the caller should ensure that the value +// it is appending is valid to append. +inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) { + DoAppendUTF8<CanonOutput, AppendEscapedChar>(char_value, output); +} + +// UTF-16 functions ----------------------------------------------------------- + +// Reads one character in UTF-16 starting at |*begin| in |str| and places +// the decoded value into |*code_point|. If the character is valid, we will +// return true. If invalid, we'll return false and put the +// kUnicodeReplacementCharacter into |*code_point|. +// +// |*begin| will be updated to point to the last character consumed so it +// can be incremented in a loop and will be ready for the next character. +// (for a single-16-bit-word character, it will not be changed). +// +// Implementation is in url_canon_icu.cc. +GURL_API bool ReadUTFChar(const char16* str, int* begin, int length, + unsigned* code_point); + +// Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method. 
+inline void AppendUTF16Value(unsigned code_point, + CanonOutputT<char16>* output) { + if (code_point > 0xffff) { + output->push_back(static_cast<char16>((code_point >> 10) + 0xd7c0)); + output->push_back(static_cast<char16>((code_point & 0x3ff) | 0xdc00)); + } else { + output->push_back(static_cast<char16>(code_point)); + } +} + +// Escaping functions --------------------------------------------------------- + +// Writes the given character to the output as UTF-8, escaped. Call this +// function only when the input is wide. Returns true on success. Failure +// means there was some problem with the encoding, we'll still try to +// update the |*begin| pointer and add a placeholder character to the +// output so processing can continue. +// +// We will append the character starting at ch[begin] with the buffer ch +// being |length|. |*begin| will be updated to point to the last character +// consumed (we may consume more than one for UTF-16) so that if called in +// a loop, incrementing the pointer will move to the next character. +// +// Every single output character will be escaped. This means that if you +// give it an ASCII character as input, it will be escaped. Some code uses +// this when it knows that a character is invalid according to its rules +// for validity. If you don't want escaping for ASCII characters, you will +// have to filter them out prior to calling this function. +// +// Assumes that ch[begin] is within range in the array, but does not assume +// that any following characters are. +inline bool AppendUTF8EscapedChar(const char16* str, int* begin, int length, + CanonOutput* output) { + // UTF-16 input. Readchar16 will handle invalid characters for us and give + // us the kUnicodeReplacementCharacter, so we don't have to do special + // checking after failure, just pass through the failure to the caller. 
+ unsigned char_value; + bool success = ReadUTFChar(str, begin, length, &char_value); + AppendUTF8EscapedValue(char_value, output); + return success; +} + +// Handles UTF-8 input. See the wide version above for usage. +inline bool AppendUTF8EscapedChar(const char* str, int* begin, int length, + CanonOutput* output) { + // ReadUTF8Char will handle invalid characters for us and give us the + // kUnicodeReplacementCharacter, so we don't have to do special checking + // after failure, just pass through the failure to the caller. + unsigned ch; + bool success = ReadUTFChar(str, begin, length, &ch); + AppendUTF8EscapedValue(ch, output); + return success; +} + +// Given a '%' character at |*begin| in the string |spec|, this will decode +// the escaped value and put it into |*unescaped_value| on success (returns +// true). On failure, this will return false, and will not write into +// |*unescaped_value|. +// +// |*begin| will be updated to point to the last character of the escape +// sequence so that when called with the index of a for loop, the next time +// through it will point to the next character to be considered. On failure, +// |*begin| will be unchanged. +inline bool Is8BitChar(char c) { + (void)c; + return true; // this case is specialized to avoid a warning +} +inline bool Is8BitChar(char16 c) { + return c <= 255; +} + +template<typename CHAR> +inline bool DecodeEscaped(const CHAR* spec, int* begin, int end, + unsigned char* unescaped_value) { + if (*begin + 3 > end || + !Is8BitChar(spec[*begin + 1]) || !Is8BitChar(spec[*begin + 2])) { + // Invalid escape sequence because there's not enough room, or the + // digits are not ASCII. + return false; + } + + unsigned char first = static_cast<unsigned char>(spec[*begin + 1]); + unsigned char second = static_cast<unsigned char>(spec[*begin + 2]); + if (!IsHexChar(first) || !IsHexChar(second)) { + // Invalid hex digits, fail. + return false; + } + + // Valid escape sequence. 
+ *unescaped_value = (HexCharToValue(first) << 4) + HexCharToValue(second); + *begin += 2; + return true; +} + +// Appends the given substring to the output, escaping "some" characters that +// it feels may not be safe. It assumes the input values are all contained in +// 8-bit although it allows any type. +// +// This is used in error cases to append invalid output so that it looks +// approximately correct. Non-error cases should not call this function since +// the escaping rules are not guaranteed! +void AppendInvalidNarrowString(const char* spec, int begin, int end, + CanonOutput* output); +void AppendInvalidNarrowString(const char16* spec, int begin, int end, + CanonOutput* output); + +// Misc canonicalization helpers ---------------------------------------------- + +// Converts between UTF-8 and UTF-16, returning true on successful conversion. +// The output will be appended to the given canonicalizer output (so make sure +// it's empty if you want to replace). +// +// On invalid input, this will still write as much output as possible, +// replacing the invalid characters with the "invalid character". It will +// return false in the failure case, and the caller should not continue as +// normal. +GURL_API bool ConvertUTF16ToUTF8(const char16* input, int input_len, + CanonOutput* output); +GURL_API bool ConvertUTF8ToUTF16(const char* input, int input_len, + CanonOutputT<char16>* output); + +// Converts from UTF-16 to 8-bit using the character set converter. If the +// converter is NULL, this will use UTF-8. +void ConvertUTF16ToQueryEncoding(const char16* input, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output); + +// Applies the replacements to the given component source. The component source +// should be pre-initialized to the "old" base. That is, all pointers will +// point to the spec of the old URL, and all of the Parsed components will +// be indices into that string. 
+// +// The pointers and components in the |source| for all non-NULL strings in the +// |repl| (replacements) will be updated to reference those strings. +// Canonicalizing with the new |source| and |parsed| can then combine URL +// components from many different strings. +void SetupOverrideComponents(const char* base, + const Replacements<char>& repl, + URLComponentSource<char>* source, + url_parse::Parsed* parsed); + +// Like the above 8-bit version, except that it additionally converts the +// UTF-16 input to UTF-8 before doing the overrides. +// +// The given utf8_buffer is used to store the converted components. They will +// be appended one after another, with the parsed structure identifying the +// appropriate substrings. This buffer is a parameter because the source has +// no storage, so the buffer must have the same lifetime as the source +// parameter owned by the caller. +// +// THE CALLER MUST NOT ADD TO THE |utf8_buffer| AFTER THIS CALL. Members of +// |source| will point into this buffer, which could be invalidated if +// additional data is added and the CanonOutput resizes its buffer. +// +// Returns true on success. False means that the input was not valid UTF-16, +// although we will have still done the override with "invalid characters" in +// place of errors. +bool SetupUTF16OverrideComponents(const char* base, + const Replacements<char16>& repl, + CanonOutput* utf8_buffer, + URLComponentSource<char>* source, + url_parse::Parsed* parsed); + +// Implemented in url_canon_path.cc, these are required by the relative URL +// resolver as well, so we declare them here. 
+bool CanonicalizePartialPath(const char* spec, + const url_parse::Component& path, + int path_begin_in_output, + CanonOutput* output); +bool CanonicalizePartialPath(const char16* spec, + const url_parse::Component& path, + int path_begin_in_output, + CanonOutput* output); + +#ifndef WIN32 + +// Implementations of Windows' int-to-string conversions +GURL_API int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix); +GURL_API int _itow_s(int value, char16* buffer, size_t size_in_chars, + int radix); + +// Secure template overloads for these functions +template<size_t N> +inline int _itoa_s(int value, char (&buffer)[N], int radix) { + return _itoa_s(value, buffer, N, radix); +} + +template<size_t N> +inline int _itow_s(int value, char16 (&buffer)[N], int radix) { + return _itow_s(value, buffer, N, radix); +} + +// _strtoui64 and strtoull behave the same +inline unsigned long long _strtoui64(const char* nptr, + char** endptr, int base) { + return strtoull(nptr, endptr, base); +} + +#endif // WIN32 + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_INTERNAL_H__ diff --git a/googleurl/url_canon_internal_file.h b/googleurl/url_canon_internal_file.h new file mode 100644 index 0000000..c37c65e --- /dev/null +++ b/googleurl/url_canon_internal_file.h @@ -0,0 +1,157 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// As with url_canon_internal.h, this file is intended to be included in +// another C++ file where the template types are defined. This allows the +// programmer to use this to use these functions for their own strings +// types, without bloating the code by having inline templates used in +// every call site. +// +// *** This file must be included after url_canon_internal as we depend on some +// functions in it. *** + +#ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__ +#define GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__ + +#include "url_file.h" +#include "url_parse_internal.h" + +using namespace url_canon; + +// Given a pointer into the spec, this copies and canonicalizes the drive +// letter and colon to the output, if one is found. If there is not a drive +// spec, it won't do anything. The index of the next character in the input +// spec is returned (after the colon when a drive spec is found, the begin +// offset if one is not). 
+template<typename CHAR> +static int FileDoDriveSpec(const CHAR* spec, int begin, int end, + CanonOutput* output) { + // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo, + // (with backslashes instead of slashes as well). + int num_slashes = CountConsecutiveSlashes(spec, begin, end); + int after_slashes = begin + num_slashes; + + if (!DoesBeginWindowsDriveSpec(spec, after_slashes, end)) + return begin; // Haven't consumed any characters + + // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid + // and that it is followed by a colon/pipe. + + // Normalize Windows drive letters to uppercase + if (spec[after_slashes] >= 'a' && spec[after_slashes] <= 'z') + output->push_back(spec[after_slashes] - 'a' + 'A'); + else + output->push_back(static_cast<char>(spec[after_slashes])); + + // Normalize the character following it to a colon rather than pipe. + output->push_back(':'); + output->push_back('/'); + return after_slashes + 2; +} + +// FileDoDriveSpec will have already added the first slash, so we need to +// write everything following the slashes using the path canonicalizer. +template<typename CHAR, typename UCHAR> +static void FileDoPath(const CHAR* spec, int begin, int end, + CanonOutput* output) { + // Normalize the number of slashes after the drive letter. The path + // canonicalizer expects the input to begin in a slash already so + // doesn't check. We want to handle no-slashes. + int num_slashes = CountConsecutiveSlashes(spec, begin, end); + int after_slashes = begin + num_slashes; + + // Now use the regular path canonicalizer to canonicalize the rest of the + // path. We supply it with the path following the slashes. It won't prepend + // a slash because it assumes any nonempty path already starts with one. + // We explicitly filter out calls with no path here to prevent that case. 
+ ParsedURL::Component sub_path(after_slashes, end - after_slashes); + if (sub_path.len > 0) { + // Give it a fake output component to write into. DoCanonicalizeFile will + // compute the full path component. + ParsedURL::Component fake_output_path; + URLCanonInternal<CHAR, UCHAR>::DoPath( + spec, sub_path, output, &fake_output_path); + } +} + +template<typename CHAR, typename UCHAR> +static bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source, + const ParsedURL& parsed, + CanonOutput* output, + ParsedURL* new_parsed) { + // Things we don't set in file: URLs. + new_parsed->username = ParsedURL::Component(0, -1); + new_parsed->password = ParsedURL::Component(0, -1); + new_parsed->port = ParsedURL::Component(0, -1); + + // Scheme (known, so we don't bother running it through the more + // complicated scheme canonicalizer). + new_parsed->scheme.begin = output->length(); + output->push_back('f'); + output->push_back('i'); + output->push_back('l'); + output->push_back('e'); + new_parsed->scheme.len = output->length() - new_parsed->scheme.begin; + output->push_back(':'); + + // Write the separator for the host. + output->push_back('/'); + output->push_back('/'); + + // Append the host. For many file URLs, this will be empty. For UNC, this + // will be present. + // TODO(brettw) This doesn't do any checking for host name validity. We + // should probably handle validity checking of UNC hosts differently than + // for regular IP hosts. + bool success = URLCanonInternal<CHAR, UCHAR>::DoHost( + source.host, parsed.host, output, &new_parsed->host); + + // Write a separator for the start of the path. We'll ignore any slashes + // already at the beginning of the path. + new_parsed->path.begin = output->length(); + output->push_back('/'); + + // Copies and normalizes the "c:" at the beginning, if present. 
+ int after_drive = FileDoDriveSpec(source.path, parsed.path.begin, + parsed.path.end(), output); + + // Copies the rest of the path + FileDoPath<CHAR, UCHAR>(source.path, after_drive, parsed.path.end(), output); + new_parsed->path.len = output->length() - new_parsed->path.begin; + + // Things following the path we can use the standard canonicalizers for. + success &= URLCanonInternal<CHAR, UCHAR>::DoQuery( + source.query, parsed.query, output, &new_parsed->query); + success &= URLCanonInternal<CHAR, UCHAR>::DoRef( + source.ref, parsed.ref, output, &new_parsed->ref); + + return success; +} + +#endif // GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__ diff --git a/googleurl/url_canon_ip.h b/googleurl/url_canon_ip.h new file mode 100644 index 0000000..41da690 --- /dev/null +++ b/googleurl/url_canon_ip.h @@ -0,0 +1,101 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_URL_CANON_IP_H__ +#define GOOGLEURL_SRC_URL_CANON_IP_H__ + +#include "base/string16.h" +#include "url_canon.h" +#include "url_common.h" +#include "url_parse.h" + +namespace url_canon { + +// Searches the host name for the portions of the IPv4 address. On success, +// each component will be placed into |components| and it will return true. +// It will return false if the host can not be separated as an IPv4 address +// or if there are any non-7-bit characters or other characters that can not +// be in an IP address. (This is important so we fail as early as possible for +// common non-IP hostnames.) +// +// Not all components may exist. If there are only 3 components, for example, +// the last one will have a length of -1 or 0 to indicate it does not exist. +// +// Note that many platforms' inet_addr will ignore everything after a space +// in certain circumstances if the stuff before the space looks like an IP +// address. IE6 is included in this. We do NOT handle this case. In many cases, +// the browser's canonicalization will get run before this which converts +// spaces to %20 (in the case of IE7) or rejects them (in the case of +// Mozilla), so this code path never gets hit. Our host canonicalization will +// notice these spaces and escape them, which will make IP address finding +// fail. This seems like better behavior than stripping after a space. 
+GURL_API bool FindIPv4Components(const char* spec, + const url_parse::Component& host, + url_parse::Component components[4]); +GURL_API bool FindIPv4Components(const char16* spec, + const url_parse::Component& host, + url_parse::Component components[4]); + +// Converts an IPv4 address to a 32-bit number (network byte order). +// +// Possible return values: +// IPV4 - IPv4 address was successfully parsed. +// BROKEN - Input was formatted like an IPv4 address, but overflow occurred +// during parsing. +// NEUTRAL - Input couldn't possibly be interpreted as an IPv4 address. +// It might be an IPv6 address, or a hostname. +// +// On success, |num_ipv4_components| will be populated with the number of +// components in the IPv4 address. +GURL_API CanonHostInfo::Family IPv4AddressToNumber( + const char* spec, + const url_parse::Component& host, + unsigned char address[4], + int* num_ipv4_components); +GURL_API CanonHostInfo::Family IPv4AddressToNumber( + const char16* spec, + const url_parse::Component& host, + unsigned char address[4], + int* num_ipv4_components); + +// Converts an IPv6 address to a 128-bit number (network byte order), returning +// true on success. False means that the input was not a valid IPv6 address. +// +// NOTE that |host| is expected to be surrounded by square brackets. +// i.e. "[::1]" rather than "::1". +GURL_API bool IPv6AddressToNumber(const char* spec, + const url_parse::Component& host, + unsigned char address[16]); +GURL_API bool IPv6AddressToNumber(const char16* spec, + const url_parse::Component& host, + unsigned char address[16]); + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_IP_H__ diff --git a/googleurl/url_canon_stdstring.h b/googleurl/url_canon_stdstring.h new file mode 100644 index 0000000..d766e05 --- /dev/null +++ b/googleurl/url_canon_stdstring.h @@ -0,0 +1,134 @@ +// Copyright 2007, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This header file defines a canonicalizer output method class for STL +// strings. Because the canonicalizer tries not to be dependent on the STL, +// we have segregated it here. + +#ifndef GOOGLEURL_SRC_URL_CANON_STDSTRING_H__ +#define GOOGLEURL_SRC_URL_CANON_STDSTRING_H__ + +#include <string> +#include "url_canon.h" + +namespace url_canon { + +// Write into a std::string given in the constructor. 
This object does not own +// the string itself, and the user must ensure that the string stays alive +// throughout the lifetime of this object. +// +// The given string will be appended to; any existing data in the string will +// be preserved. The caller should reserve() the amount of data in the string +// they expect to be written. We will resize if necessary, but that's slow. +// +// Note that when canonicalization is complete, the string will likely have +// unused space at the end because we make the string very big to start out +// with (by |initial_size|). This ends up being important because resize +// operations are slow, and because the base class needs to write directly +// into the buffer. +// +// Therefore, the user should call Complete() before using the string that +// this class wrote into. +class StdStringCanonOutput : public CanonOutput { + public: + StdStringCanonOutput(std::string* str) + : CanonOutput(), + str_(str) { + cur_len_ = static_cast<int>(str_->size()); // Append to existing data. + str_->resize(str_->capacity()); + buffer_ = str_->empty() ? NULL : &(*str_)[0]; + buffer_len_ = static_cast<int>(str_->size()); + } + virtual ~StdStringCanonOutput() { + // Nothing to do, we don't own the string. + } + + // Must be called after writing has completed but before the string is used. + void Complete() { + str_->resize(cur_len_); + buffer_len_ = cur_len_; + } + + virtual void Resize(int sz) { + str_->resize(sz); + buffer_ = str_->empty() ? NULL : &(*str_)[0]; + buffer_len_ = sz; + } + + protected: + std::string* str_; +}; + +// An extension of the Replacements class that allows the setters to use +// standard strings. +// +// The strings passed as arguments are not copied and must remain valid until +// this class goes out of scope. 
+template<typename STR> +class StdStringReplacements : + public url_canon::Replacements<typename STR::value_type> { + public: + void SetSchemeStr(const STR& s) { + this->SetScheme(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetUsernameStr(const STR& s) { + this->SetUsername(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetPasswordStr(const STR& s) { + this->SetPassword(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetHostStr(const STR& s) { + this->SetHost(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetPortStr(const STR& s) { + this->SetPort(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetPathStr(const STR& s) { + this->SetPath(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetQueryStr(const STR& s) { + this->SetQuery(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetRefStr(const STR& s) { + this->SetRef(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } +}; + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_STDSTRING_H__ + diff --git a/googleurl/url_common.h b/googleurl/url_common.h new file mode 100644 index 0000000..ac045a8 --- /dev/null +++ b/googleurl/url_common.h @@ -0,0 +1,54 @@ +// Copyright 2010, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_URL_COMMON_H__ +#define GOOGLEURL_SRC_URL_COMMON_H__ + +#if !defined(GURL_IMPLEMENTATION) +#define GURL_IMPLEMENTATION 0 +#endif + +#if defined(GURL_DLL) +#if defined(WIN32) +#if GURL_IMPLEMENTATION +#define GURL_API __declspec(dllexport) +#else +#define GURL_API __declspec(dllimport) +#endif +#else +// Non-Windows DLLs. +#define GURL_API __attribute__((visibility("default"))) +#endif +#else +// Not a DLL. +#define GURL_API +#endif + +#endif // GOOGLEURL_SRC_URL_COMMON_H__ + diff --git a/googleurl/url_file.h b/googleurl/url_file.h new file mode 100644 index 0000000..cb9c89f --- /dev/null +++ b/googleurl/url_file.h @@ -0,0 +1,108 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Provides shared functions used by the internals of the parser and +// canonicalizer for file URLs. Do not use outside of these modules. + +#ifndef GOOGLEURL_SRC_URL_FILE_H__ +#define GOOGLEURL_SRC_URL_FILE_H__ + +#include "url_parse_internal.h" + +namespace url_parse { + +#ifdef WIN32 + +// We allow both "c:" and "c|" as drive identifiers. +inline bool IsWindowsDriveSeparator(char16 ch) { + return ch == ':' || ch == '|'; +} +inline bool IsWindowsDriveLetter(char16 ch) { + return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); +} + +#endif // WIN32 + +// Returns the index of the next slash in the input after the given index, or +// spec_len if the end of the input is reached. 
+template<typename CHAR> +inline int FindNextSlash(const CHAR* spec, int begin_index, int spec_len) { + int idx = begin_index; + while (idx < spec_len && !IsURLSlash(spec[idx])) + idx++; + return idx; +} + +#ifdef WIN32 + +// Returns true if the start_offset in the given spec looks like it begins a +// drive spec, for example "c:". This function explicitly handles start_offset +// values that are equal to or larger than the spec_len to simplify callers. +// +// If this returns true, the spec is guaranteed to have a valid drive letter +// plus a colon starting at |start_offset|. +template<typename CHAR> +inline bool DoesBeginWindowsDriveSpec(const CHAR* spec, int start_offset, + int spec_len) { + int remaining_len = spec_len - start_offset; + if (remaining_len < 2) + return false; // Not enough room. + if (!IsWindowsDriveLetter(spec[start_offset])) + return false; // Doesn't start with a valid drive letter. + if (!IsWindowsDriveSeparator(spec[start_offset + 1])) + return false; // Isn't followed with a drive separator. + return true; +} + +// Returns true if the start_offset in the given text looks like it begins a +// UNC path, for example "\\". This function explicitly handles start_offset +// values that are equal to or larger than the spec_len to simplify callers. +// +// When strict_slashes is set, this function will only accept backslashes as is +// standard for Windows. Otherwise, it will accept forward slashes as well +// which we use for a lot of URL handling. 
+template<typename CHAR> +inline bool DoesBeginUNCPath(const CHAR* text, + int start_offset, + int len, + bool strict_slashes) { + int remaining_len = len - start_offset; + if (remaining_len < 2) + return false; + + if (strict_slashes) + return text[start_offset] == '\\' && text[start_offset + 1] == '\\'; + return IsURLSlash(text[start_offset]) && IsURLSlash(text[start_offset + 1]); +} + +#endif // WIN32 + +} // namespace url_parse + +#endif // GOOGLEURL_SRC_URL_FILE_H__ diff --git a/googleurl/url_parse.h b/googleurl/url_parse.h new file mode 100644 index 0000000..1eb6fcb --- /dev/null +++ b/googleurl/url_parse.h @@ -0,0 +1,373 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_URL_PARSE_H__ +#define GOOGLEURL_SRC_URL_PARSE_H__ + +#include <string> + +#include "base/basictypes.h" +#include "base/string16.h" +#include "url_common.h" + +namespace url_parse { + +// Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and +// KURLGoogle.cpp still rely on this type. +typedef char16 UTF16Char; + +// Component ------------------------------------------------------------------ + +// Represents a substring for URL parsing. +struct Component { + Component() : begin(0), len(-1) {} + + // Normal constructor: takes an offset and a length. + Component(int b, int l) : begin(b), len(l) {} + + int end() const { + return begin + len; + } + + // Returns true if this component is valid, meaning the length is given. Even + // valid components may be empty to record the fact that they exist. + bool is_valid() const { + return (len != -1); + } + + // Returns true if the given component is specified and nonempty. On false, + // the component is either empty or invalid. + bool is_nonempty() const { + return (len > 0); + } + + void reset() { + begin = 0; + len = -1; + } + + bool operator==(const Component& other) const { + return begin == other.begin && len == other.len; + } + + int begin; // Byte offset in the string of this component. + int len; // Will be -1 if the component is unspecified. +}; + +// Helper that returns a component created with the given begin and ending +// points.
The ending point is non-inclusive. +inline Component MakeRange(int begin, int end) { + return Component(begin, end - begin); +} + +// Parsed --------------------------------------------------------------------- + +// A structure that holds the identified parts of an input URL. This structure +// does NOT store the URL itself. The caller will have to store the URL text +// and its corresponding Parsed structure separately. +// +// Typical usage would be: +// +// url_parse::Parsed parsed; +// url_parse::Component scheme; +// if (!url_parse::ExtractScheme(url, url_len, &scheme)) +// return I_CAN_NOT_FIND_THE_SCHEME_DUDE; +// +// if (IsStandardScheme(url, scheme)) // Not provided by this component +// url_parse::ParseStandardURL(url, url_len, &parsed); +// else if (IsFileURL(url, scheme)) // Not provided by this component +// url_parse::ParseFileURL(url, url_len, &parsed); +// else +// url_parse::ParsePathURL(url, url_len, &parsed); +// +struct Parsed { + // Identifies different components. + enum ComponentType { + SCHEME, + USERNAME, + PASSWORD, + HOST, + PORT, + PATH, + QUERY, + REF + }; + + // The default constructor is sufficient for the components, but inner_parsed_ + // requires special handling. + GURL_API Parsed(); + GURL_API Parsed(const Parsed&); + GURL_API Parsed& operator=(const Parsed&); + GURL_API ~Parsed(); + + // Returns the length of the URL (the end of the last component). + // + // Note that for some invalid, non-canonical URLs, this may not be the length + // of the string. For example "http://": the parsed structure will only + // contain an entry for the four-character scheme, and it doesn't know about + // the "://". For all other last-components, it will return the real length. + GURL_API int Length() const; + + // Returns the number of characters before the given component if it exists, + // or where the component would be if it did exist. This will return the + // string length if the component would be appended to the end.
+ // + // Note that this can get a little funny for the port, query, and ref + // components which have a delimiter that is not counted as part of the + // component. The |include_delimiter| flag controls if you want this counted + // as part of the component or not when the component exists. + // + // This example shows the difference between the two flags for two of these + // delimited components that are present (the port and query) and one that + // isn't (the reference). The components that this flag affects are marked + // with a *. + // 0 1 2 + // 012345678901234567890 + // Example input: http://foo:80/?query + // include_delim=true, ...=false ("<-" indicates different) + // SCHEME: 0 0 + // USERNAME: 5 5 + // PASSWORD: 5 5 + // HOST: 7 7 + // *PORT: 10 11 <- + // PATH: 13 13 + // *QUERY: 14 15 <- + // *REF: 20 20 + // + GURL_API int CountCharactersBefore(ComponentType type, + bool include_delimiter) const; + + // Scheme without the colon: "http://foo/" would have a scheme of "http". + // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there + // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed + // to start at the beginning of the string if there are preceding whitespace + // or control characters. + Component scheme; + + // Username. Specified in URLs with an @ sign before the host. See |password| + Component username; + + // Password. The length will be -1 if unspecified, 0 if specified but empty. + // Not all URLs with a username have a password, as in "http://me@host/". + // The password is separated from the username with a colon, as in + // "http://me:secret@host/" + Component password; + + // Host name. + Component host; + + // Port number. + Component port; + + // Path, this is everything following the host name. Length will be -1 if + // unspecified. This includes the preceding slash, so the path on + // "http://www.google.com/asdf" is "/asdf".
As a result, it is impossible to + // have a 0 length path, it will be -1 in cases like "http://host?foo". + // Note that we treat backslashes the same as slashes. + Component path; + + // Stuff between the ? and the # after the path. This does not include the + // preceding ? character. Length will be -1 if unspecified, 0 if there is + // a question mark but no query string. + Component query; + + // Indicated by a #, this is everything following the hash sign (not + // including it). If there are multiple hash signs, we'll use the last one. + // Length will be -1 if there is no hash sign, or 0 if there is one but + // nothing follows it. + Component ref; + + // This is used for nested URL types, currently only filesystem. If you + // parse a filesystem URL, the resulting Parsed will have a nested + // inner_parsed_ to hold the parsed inner URL's component information. + // For all other url types [including the inner URL], it will be NULL. + Parsed* inner_parsed() const { + return inner_parsed_; + } + + void set_inner_parsed(const Parsed& _inner_parsed) { + if (!inner_parsed_) + inner_parsed_ = new Parsed(_inner_parsed); + else + *inner_parsed_ = _inner_parsed; + } + + void clear_inner_parsed() { + if (inner_parsed_) { + delete inner_parsed_; + inner_parsed_ = NULL; + } + } + + private: + Parsed* inner_parsed_; // This object is owned and managed by this struct. +}; + +// Initialization functions --------------------------------------------------- +// +// These functions parse the given URL, filling in all of the structure's +// components. These functions can not fail, they will always do their best +// at interpreting the input given. +// +// The string length of the URL MUST be specified, we do not check for NULLs +// at any point in the process, and will actually handle embedded NULLs. +// +// IMPORTANT: These functions do NOT hang on to the given pointer or copy it +// in any way. See the comment above the struct.
+// +// The 8-bit versions require UTF-8 encoding. + +// StandardURL is for when the scheme is known to be one that has an +// authority (host) like "http". This function will not handle weird ones +// like "about:" and "javascript:", or do the right thing for "file:" URLs. +GURL_API void ParseStandardURL(const char* url, int url_len, Parsed* parsed); +GURL_API void ParseStandardURL(const char16* url, int url_len, Parsed* parsed); + +// PathURL is for when the scheme is known not to have an authority (host) +// section but that aren't file URLs either. The scheme is parsed, and +// everything after the scheme is considered as the path. This is used for +// things like "about:" and "javascript:" +GURL_API void ParsePathURL(const char* url, int url_len, Parsed* parsed); +GURL_API void ParsePathURL(const char16* url, int url_len, Parsed* parsed); + +// FileURL is for file URLs. There are some special rules for interpreting +// these. +GURL_API void ParseFileURL(const char* url, int url_len, Parsed* parsed); +GURL_API void ParseFileURL(const char16* url, int url_len, Parsed* parsed); + +// Filesystem URLs are structured differently than other URLs. +GURL_API void ParseFileSystemURL(const char* url, + int url_len, + Parsed* parsed); +GURL_API void ParseFileSystemURL(const char16* url, + int url_len, + Parsed* parsed); + +// MailtoURL is for mailto: urls. They are made up scheme,path,query +GURL_API void ParseMailtoURL(const char* url, int url_len, Parsed* parsed); +GURL_API void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed); + +// Helper functions ----------------------------------------------------------- + +// Locates the scheme according to the URL parser's rules. This function is +// designed so the caller can find the scheme and call the correct Init* +// function according to their known scheme types. +// +// It also does not perform any validation on the scheme. 
+// +// This function will return true if the scheme is found and will put the +// scheme's range into *scheme. False means no scheme could be found. Note +// that a URL beginning with a colon has a scheme, but it is empty, so this +// function will return true but *scheme will = (0,0). +// +// The scheme is found by skipping spaces and control characters at the +// beginning, and taking everything from there to the first colon to be the +// scheme. The character at scheme.end() will be the colon (we may enhance +// this to handle full width colons or something, so don't count on the +// actual character value). The character at scheme.end()+1 will be the +// beginning of the rest of the URL, be it the authority or the path (or the +// end of the string). +// +// The 8-bit version requires UTF-8 encoding. +GURL_API bool ExtractScheme(const char* url, int url_len, Component* scheme); +GURL_API bool ExtractScheme(const char16* url, int url_len, Component* scheme); + +// Returns true if ch is a character that terminates the authority segment +// of a URL. +GURL_API bool IsAuthorityTerminator(char16 ch); + +// Does a best effort parse of input |spec|, in range |auth|. If a particular +// component is not found, it will be set to invalid. +GURL_API void ParseAuthority(const char* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num); +GURL_API void ParseAuthority(const char16* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num); + +// Computes the integer port value from the given port component. The port +// component should have been identified by one of the init functions on +// |Parsed| for the given input url. +// +// The return value will be a positive integer between 0 and 64K, or one of +// the two special values below. 
+enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 }; +GURL_API int ParsePort(const char* url, const Component& port); +GURL_API int ParsePort(const char16* url, const Component& port); + +// Extracts the range of the file name in the given url. The path must +// already have been computed by the parse function, and the matching URL +// and extracted path are provided to this function. The filename is +// defined as being everything from the last slash/backslash of the path +// to the end of the path. +// +// The file name will be empty if the path is empty or there is nothing +// following the last slash. +// +// The 8-bit version requires UTF-8 encoding. +GURL_API void ExtractFileName(const char* url, + const Component& path, + Component* file_name); +GURL_API void ExtractFileName(const char16* url, + const Component& path, + Component* file_name); + +// Extract the first key/value from the range defined by |*query|. Updates +// |*query| to start at the end of the extracted key/value pair. This is +// designed for use in a loop: you can keep calling it with the same query +// object and it will iterate over all items in the query. +// +// Some key/value pairs may have the key, the value, or both be empty (for +// example, the query string "?&"). These will be returned. Note that an empty +// last parameter "foo.com?" or "foo.com?a&" will not be returned; this case +// is the same as "done." +// +// The initial query component should not include the '?' (this is the default +// for parsed URLs). +// +// If no key/value are found |*key| and |*value| will be unchanged and it will +// return false.
+GURL_API bool ExtractQueryKeyValue(const char* url, + Component* query, + Component* key, + Component* value); +GURL_API bool ExtractQueryKeyValue(const char16* url, + Component* query, + Component* key, + Component* value); + +} // namespace url_parse + +#endif // GOOGLEURL_SRC_URL_PARSE_H__ diff --git a/googleurl/url_parse_internal.h b/googleurl/url_parse_internal.h new file mode 100644 index 0000000..32b306a --- /dev/null +++ b/googleurl/url_parse_internal.h @@ -0,0 +1,112 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Contains common inline helper functions used by the URL parsing routines. + +#ifndef GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__ +#define GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__ + +#include "url_parse.h" + +namespace url_parse { + +// We treat slashes and backslashes the same for IE compatibility. +inline bool IsURLSlash(char16 ch) { + return ch == '/' || ch == '\\'; +} + +// Returns true if we should trim this character from the URL because it is a +// space or a control character. +inline bool ShouldTrimFromURL(char16 ch) { + return ch <= ' '; +} + +// Given an already-initialized begin index and length, this shrinks the range +// to eliminate "should-be-trimmed" characters. Note that the length does *not* +// indicate the length of untrimmed data from |*begin|, but rather the position +// in the input string (so the string starts at character |*begin| in the spec, +// and goes until |*len|). +template<typename CHAR> +inline void TrimURL(const CHAR* spec, int* begin, int* len) { + // Strip leading whitespace and control characters. + while (*begin < *len && ShouldTrimFromURL(spec[*begin])) + (*begin)++; + + // Strip trailing whitespace and control characters. We need the + // |*len > *begin| test for when the input string is all blanks; we don't + // want to back past the input.
+ while (*len > *begin && ShouldTrimFromURL(spec[*len - 1])) + (*len)--; +} + +// Counts the number of consecutive slashes starting at the given offset +// in the given string of the given length. +template<typename CHAR> +inline int CountConsecutiveSlashes(const CHAR *str, + int begin_offset, int str_len) { + int count = 0; + while (begin_offset + count < str_len && + IsURLSlash(str[begin_offset + count])) + ++count; + return count; +} + +// Internal functions in url_parse.cc that parse the path, that is, everything +// following the authority section. The input is the range of everything +// following the authority section, and the output is the identified ranges. +// +// This is designed for the file URL parser or other consumers who may do +// special stuff at the beginning, but want regular path parsing, it just +// maps to the internal parsing function for paths. +void ParsePathInternal(const char* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref); +void ParsePathInternal(const char16* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref); + + +// Given a spec and a pointer to the character after the colon following the +// scheme, this parses it and fills in the structure, Every item in the parsed +// structure is filled EXCEPT for the scheme, which is untouched. +void ParseAfterScheme(const char* spec, + int spec_len, + int after_scheme, + Parsed* parsed); +void ParseAfterScheme(const char16* spec, + int spec_len, + int after_scheme, + Parsed* parsed); + +} // namespace url_parse + +#endif // GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__ diff --git a/googleurl/url_test_utils.h b/googleurl/url_test_utils.h new file mode 100644 index 0000000..77acf12 --- /dev/null +++ b/googleurl/url_test_utils.h @@ -0,0 +1,78 @@ +// Copyright 2007 Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Convenience functions for string conversions. +// These are mostly intended for use in unit tests. + +#ifndef GOOGLEURL_SRC_URL_TEST_UTILS_H__ +#define GOOGLEURL_SRC_URL_TEST_UTILS_H__ + +#include <string> + +#include "base/string16.h" +#include "url_canon_internal.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace url_test_utils { + +// Converts a UTF-16 string from native wchar_t format to char16, by +// truncating the high 32 bits. 
This is not meant to handle true UTF-32 +// encoded strings. +inline string16 WStringToUTF16(const wchar_t* src) { + string16 str; + int length = static_cast<int>(wcslen(src)); + for (int i = 0; i < length; ++i) { + str.push_back(static_cast<char16>(src[i])); + } + return str; +} + +// Converts a string from UTF-8 to UTF-16 +inline string16 ConvertUTF8ToUTF16(const std::string& src) { + int length = static_cast<int>(src.length()); + EXPECT_LT(length, 1024); + url_canon::RawCanonOutputW<1024> output; + EXPECT_TRUE(url_canon::ConvertUTF8ToUTF16(src.data(), length, &output)); + return string16(output.data(), output.length()); +} + +// Converts a string from UTF-16 to UTF-8 +inline std::string ConvertUTF16ToUTF8(const string16& src) { + std::string str; + url_canon::StdStringCanonOutput output(&str); + EXPECT_TRUE(url_canon::ConvertUTF16ToUTF8(src.data(), + static_cast<int>(src.length()), + &output)); + output.Complete(); + return str; +} + +} // namespace url_test_utils + +#endif // GOOGLEURL_SRC_URL_TEST_UTILS_H__ diff --git a/googleurl/url_util.h b/googleurl/url_util.h new file mode 100644 index 0000000..32ab987 --- /dev/null +++ b/googleurl/url_util.h @@ -0,0 +1,229 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_URL_UTIL_H__ +#define GOOGLEURL_SRC_URL_UTIL_H__ + +#include <string> + +#include "base/string16.h" +#include "url_common.h" +#include "url_parse.h" +#include "url_canon.h" + +namespace url_util { + +// Init ------------------------------------------------------------------------ + +// Initialization is NOT required, it will be implicitly initialized when first +// used. However, this implicit initialization is NOT threadsafe. If you are +// using this library in a threaded environment and don't have a consistent +// "first call" (an example might be calling "AddStandardScheme" with your +// special application-specific schemes) then you will want to call initialize +// before spawning any threads. +// +// It is OK to call this function more than once, subsequent calls will simply +// "noop", unless Shutdown() was called in the mean time. This will also be a +// "noop" if other calls to the library have forced an initialization +// beforehand. +GURL_API void Initialize(); + +// Cleanup is not required, except some strings may leak. For most user +// applications, this is fine. 
If you're using it in a library that may get +// loaded and unloaded, you'll want to unload to properly clean up your +// library. +GURL_API void Shutdown(); + +// Schemes -------------------------------------------------------------------- + +// Adds an application-defined scheme to the internal list of "standard" URL +// schemes. This function is not threadsafe and can not be called concurrently +// with any other url_util function. It will assert if the list of standard +// schemes has been locked (see LockStandardSchemes). +GURL_API void AddStandardScheme(const char* new_scheme); + +// Sets a flag to prevent future calls to AddStandardScheme from succeeding. +// +// This is designed to help prevent errors for multithreaded applications. +// Normal usage would be to call AddStandardScheme for your custom schemes at +// the beginning of program initialization, and then LockStandardSchemes. This +// prevents future callers from mistakenly calling AddStandardScheme when the +// program is running with multiple threads, where such usage would be +// dangerous. +// +// We could have had AddStandardScheme use a lock instead, but that would add +// some platform-specific dependencies we don't otherwise have now, and is +// overkill considering the normal usage is so simple. +GURL_API void LockStandardSchemes(); + +// Locates the scheme in the given string and places it into |found_scheme|, +// which may be NULL to indicate the caller does not care about the range. +// +// Returns whether the given |compare| scheme matches the scheme found in the +// input (if any). The |compare| scheme must be a valid canonical scheme or +// the result of the comparison is undefined. 
+GURL_API bool FindAndCompareScheme(const char* str, + int str_len, + const char* compare, + url_parse::Component* found_scheme); +GURL_API bool FindAndCompareScheme(const char16* str, + int str_len, + const char* compare, + url_parse::Component* found_scheme); +inline bool FindAndCompareScheme(const std::string& str, + const char* compare, + url_parse::Component* found_scheme) { + return FindAndCompareScheme(str.data(), static_cast<int>(str.size()), + compare, found_scheme); +} +inline bool FindAndCompareScheme(const string16& str, + const char* compare, + url_parse::Component* found_scheme) { + return FindAndCompareScheme(str.data(), static_cast<int>(str.size()), + compare, found_scheme); +} + +// Returns true if the given string represents a standard URL. This means that +// either the scheme is in the list of known standard schemes. +GURL_API bool IsStandard(const char* spec, + const url_parse::Component& scheme); +GURL_API bool IsStandard(const char16* spec, + const url_parse::Component& scheme); + +// TODO(brettw) remove this. This is a temporary compatibility hack to avoid +// breaking the WebKit build when this version is synced via Chrome. +inline bool IsStandard(const char* spec, int spec_len, + const url_parse::Component& scheme) { + (void)spec_len; + return IsStandard(spec, scheme); +} + +// URL library wrappers ------------------------------------------------------- + +// Parses the given spec according to the extracted scheme type. Normal users +// should use the URL object, although this may be useful if performance is +// critical and you don't want to do the heap allocation for the std::string. +// +// As with the url_canon::Canonicalize* functions, the charset converter can +// be NULL to use UTF-8 (it will be faster in this case). +// +// Returns true if a valid URL was produced, false if not. On failure, the +// output and parsed structures will still be filled and will be consistent, +// but they will not represent a loadable URL. 
+GURL_API bool Canonicalize(const char* spec, + int spec_len, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); +GURL_API bool Canonicalize(const char16* spec, + int spec_len, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); + +// Resolves a potentially relative URL relative to the given parsed base URL. +// The base MUST be valid. The resulting canonical URL and parsed information +// will be placed in to the given out variables. +// +// The relative need not be relative. If we discover that it's absolute, this +// will produce a canonical version of that URL. See Canonicalize() for more +// about the charset_converter. +// +// Returns true if the output is valid, false if the input could not produce +// a valid URL. +GURL_API bool ResolveRelative(const char* base_spec, + int base_spec_len, + const url_parse::Parsed& base_parsed, + const char* relative, + int relative_length, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); +GURL_API bool ResolveRelative(const char* base_spec, + int base_spec_len, + const url_parse::Parsed& base_parsed, + const char16* relative, + int relative_length, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); + +// Replaces components in the given VALID input url. The new canonical URL info +// is written to output and out_parsed. +// +// Returns true if the resulting URL is valid. 
+GURL_API bool ReplaceComponents( + const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + const url_canon::Replacements<char>& replacements, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* out_parsed); +GURL_API bool ReplaceComponents( + const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + const url_canon::Replacements<char16>& replacements, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* out_parsed); + +// String helper functions ---------------------------------------------------- + +// Compare the lower-case form of the given string against the given ASCII +// string. This is useful for doing checking if an input string matches some +// token, and it is optimized to avoid intermediate string copies. +// +// The versions of this function that don't take a b_end assume that the b +// string is NULL terminated. +GURL_API bool LowerCaseEqualsASCII(const char* a_begin, + const char* a_end, + const char* b); +GURL_API bool LowerCaseEqualsASCII(const char* a_begin, + const char* a_end, + const char* b_begin, + const char* b_end); +GURL_API bool LowerCaseEqualsASCII(const char16* a_begin, + const char16* a_end, + const char* b); + +// Unescapes the given string using URL escaping rules. +GURL_API void DecodeURLEscapeSequences(const char* input, int length, + url_canon::CanonOutputW* output); + +// Escapes the given string as defined by the JS method encodeURIComponent. 
See +// https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/encodeURIComponent +GURL_API void EncodeURIComponent(const char* input, int length, + url_canon::CanonOutput* output); + + +} // namespace url_util + +#endif // GOOGLEURL_SRC_URL_UTIL_H__ diff --git a/googleurl/url_util_internal.h b/googleurl/url_util_internal.h new file mode 100644 index 0000000..38335fd --- /dev/null +++ b/googleurl/url_util_internal.h @@ -0,0 +1,56 @@ +// Copyright 2011, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_URL_UTIL_INTERNAL_H__ +#define GOOGLEURL_SRC_URL_UTIL_INTERNAL_H__ + +#include <string> + +#include "base/string16.h" +#include "url_common.h" +#include "url_parse.h" + +namespace url_util { + +extern const char kFileScheme[]; +extern const char kFileSystemScheme[]; +extern const char kMailtoScheme[]; + +// Given a string |spec| and a |component| range inside that string, compares +// that range to the given lower-case |compare_to| buffer. +bool CompareSchemeComponent(const char* spec, + const url_parse::Component& component, + const char* compare_to); +bool CompareSchemeComponent(const char16* spec, + const url_parse::Component& component, + const char* compare_to); + +} // namespace url_util + +#endif // GOOGLEURL_SRC_URL_UTIL_INTERNAL_H__ |