diff --git a/README.md b/README.md index cd8afeebb..4374d78f6 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,24 @@ content or workflow. Features include: Because nginx does not support dynamic loading of modules, you need to add ngx_pagespeed as a build-time dependency. +### Simple method: Using a binary Pagespeed Optimization Library + +Check out ngx_pagespeed: + + $ cd ~ + $ git clone https://github.com/pagespeed/ngx_pagespeed.git + +Download and build nginx: + + $ # check http://nginx.org/en/download.html for the latest version + $ wget http://nginx.org/download/nginx-1.2.6.tar.gz + $ tar -xvzf nginx-1.2.6.tar.gz + $ cd nginx-1.2.6/src/ + $ ./configure --add-module=$HOME/ngx_pagespeed + $ make install + +### Complex method: Building the Pagespeed Optimization Library from source + First build mod_pagespeed against the current revision we work at: $ mkdir ~/mod_pagespeed @@ -60,13 +78,11 @@ Download and build nginx: $ wget http://nginx.org/download/nginx-1.2.6.tar.gz $ tar -xvzf nginx-1.2.6.tar.gz $ cd nginx-1.2.6/src/ - $ ./configure --with-debug --add-module=$HOME/ngx_pagespeed + $ MOD_PAGESPEED_DIR="$HOME/mod_pagespeed/src" ./configure --with-debug --add-module=$HOME/ngx_pagespeed $ make install -(This assumes you put everything in your home directory; if not, change paths -appropriately. The only restriction is that the `mod_pagespeed` and -`ngx_pagespeed` directories need to have the same parent so that ngx_pagespeed -can find the pagespeed optimization library.) +This assumes you put everything in your home directory; if not, change paths +appropriately. ## How to use @@ -225,3 +241,40 @@ adjustment: replace '"ModPagespeed"' with '"pagespeed "': pagespeed RunExperiment on; pagespeed ExperimentSpec "id=3;percent=50;default"; pagespeed ExperimentSpec "id=4;percent=50"; + +## Preparing the binary distribution + +If you just want to run ngx_pagespeed you don't need this. This is +documentation on how the `psol/` directory was created and is maintained. 
+ +We redistribute precompiled libraries and the accompanying headers for the +pagespeed optimization library and its dependencies. To update the headers, +run: + + $ cd ngx_pagespeed/ + $ scripts/copy_includes.sh /path/to/mod_pagespeed/src + +This will delete `psol/include/` and recreate it from `mod_pagespeed/src` by +copying over all the headers and a few selected source files. The commit diff +should only be the changes, but it can be huge. + +To update the binaries, create a virtual machine running an old version of +Linux. The current binaries were created on two CentOS 5.4 virtual machines, +32-bit and 64-bit. Because the binaries will usually work on systems that are +more recent, it's important not to do this on your development machine. +Building the binaries meant building mod_pagespeed and pagespeed_automatic from +source, in separate directories with `BUILDTYPE=Release` on and off, and then +copying the resulting binaries over to `psol/lib/`: + + $ for buildtype in Debug Release ; do + for arch in ia32 x64 ; do + for library in + net/instaweb/automatic/pagespeed_automatic.a + out/Debug/obj.target/third_party/aprutil/libaprutil.a + out/Debug/obj.target/third_party/apr/libapr.a + out/Debug/obj.target/third_party/serf/libserf.a ; do + scp machine-${arch}:mod_pagespeed_${buildtype}/src/${library} + psol/lib/${buildtype}/linux/${arch}/ + done + done + done diff --git a/config b/config index 2f5d51e51..cbfffdff7 100644 --- a/config +++ b/config @@ -11,8 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+# +# Environment Variables (Optional): +# MOD_PAGESPEED_DIR: absolute path to the mod_pagespeed/src directory +# PSOL_BINARY: absolute path to pagespeed_automatic.a + +mod_pagespeed_dir="${MOD_PAGESPEED_DIR:-unset}" +if [ "$mod_pagespeed_dir" = "unset" ] ; then + mod_pagespeed_dir="$ngx_addon_dir/psol/include" + build_from_source=false +else + build_from_source=true +fi + +echo "mod_pagespeed_dir=$mod_pagespeed_dir" +echo "build_from_source=$build_from_source" -mod_pagespeed_dir="$ngx_addon_dir/../mod_pagespeed/src" ngx_feature="psol" ngx_feature_name="" ngx_feature_run=no @@ -30,58 +44,74 @@ uname_os=`uname` uname_arch=`uname -m` if [ $uname_os = 'Linux' ]; then - os_name='linux' + os_name='linux' elif [ $uname_os = 'Darwin' ]; then - os_name='mac' + os_name='mac' else - echo "OS not supported: $uname_os" - exit 1 + echo "OS not supported: $uname_os" + exit 1 fi if [ $uname_arch = 'x86_64' ]; then - arch_name='x64' + arch_name='x64' elif [ $uname_arch = 'x86_32' ]; then - arch_name='ia32' + arch_name='ia32' else - echo "Architecture not supported: $uname_arch" - exit 1 + echo "Architecture not supported: $uname_arch" + exit 1 fi if [ "$NGX_DEBUG" = "YES" ]; then - buildtype=Debug + buildtype=Debug else - buildtype=Release + buildtype=Release fi -pagespeed_include="$mod_pagespeed_dir - $mod_pagespeed_dir/third_party/chromium/src/ - $mod_pagespeed_dir/third_party/protobuf/src/ - $mod_pagespeed_dir/out/$buildtype/obj/gen/ - $mod_pagespeed_dir/third_party/apr/src/include/ - $mod_pagespeed_dir/third_party/apr/gen/arch/$os_name/$arch_name/include/ - $mod_pagespeed_dir/third_party/aprutil/src/include/ - $mod_pagespeed_dir/third_party/aprutil/gen/arch/$os_name/$arch_name/include/" +pagespeed_include="\ + $mod_pagespeed_dir \ + $mod_pagespeed_dir/third_party/chromium/src \ + $mod_pagespeed_dir/third_party/protobuf/src \ + $mod_pagespeed_dir/out/$buildtype/obj/gen \ + $mod_pagespeed_dir/third_party/apr/src/include \ + $mod_pagespeed_dir/third_party/aprutil/src/include \ + 
$mod_pagespeed_dir/third_party/apr/gen/arch/$os_name/$arch_name/include \ + $mod_pagespeed_dir/third_party/aprutil/gen/arch/$os_name/$arch_name/include" ngx_feature_path="$pagespeed_include" -pagespeed_automatic_dir="$mod_pagespeed_dir/net/instaweb/automatic" -pagespeed_libs="-lstdc++ $pagespeed_automatic_dir/pagespeed_automatic.a -lrt -pthread" + +if $build_from_source ; then + psol_library_binaries="\ + $mod_pagespeed_dir/net/instaweb/automatic/pagespeed_automatic.a \ + $mod_pagespeed_dir/out/$buildtype/obj.target/third_party/serf/libserf.a \ + $mod_pagespeed_dir/out/$buildtype/obj.target/third_party/aprutil/libaprutil.a \ + $mod_pagespeed_dir/out/$buildtype/obj.target/third_party/apr/libapr.a" +else + psol_library_dir="$ngx_addon_dir/psol/lib/$buildtype/$os_name/$arch_name" + psol_library_binaries="\ + $psol_library_dir/pagespeed_automatic.a \ + $psol_library_dir/libserf.a \ + $psol_library_dir/libaprutil.a \ + $psol_library_dir/libapr.a" +fi + +pagespeed_libs="-lstdc++ $psol_library_binaries -lrt -pthread" ngx_feature_libs="$pagespeed_libs" ngx_feature_test=" - GoogleString output_buffer; - net_instaweb::StringWriter write_to_string(&output_buffer); + GoogleString output_buffer; + net_instaweb::StringWriter write_to_string(&output_buffer); - net_instaweb::NullMessageHandler handler; - net_instaweb::HtmlParse html_parse(&handler); - net_instaweb::HtmlWriterFilter html_writer_filter(&html_parse); + net_instaweb::NullMessageHandler handler; + net_instaweb::HtmlParse html_parse(&handler); + net_instaweb::HtmlWriterFilter html_writer_filter(&html_parse); - html_writer_filter.set_writer(&write_to_string); - html_parse.AddFilter(&html_writer_filter); + html_writer_filter.set_writer(&write_to_string); + html_parse.AddFilter(&html_writer_filter); - html_parse.StartParse(\"http:example.com\"); - html_parse.ParseText( - \"

Test

Test Text

\n\"); - html_parse.FinishParse(); + html_parse.StartParse(\"http:example.com\"); + html_parse.ParseText( + \"

Test

Test Text

\n\"); + html_parse.FinishParse(); - printf(\"parsed as: %s\", output_buffer.c_str())" + printf(\"parsed as: %s\", output_buffer.c_str())" # Test whether we have pagespeed and can compile and link against it. . "$ngx_addon_dir/cpp_feature" @@ -89,29 +119,28 @@ ngx_feature_test=" if [ $ngx_found = yes ]; then ps_src="$ngx_addon_dir/src" ngx_addon_name=ngx_pagespeed - NGX_ADDON_DEPS="$NGX_ADDON_DEPS $ps_src/ngx_cache.h" - NGX_ADDON_DEPS="$NGX_ADDON_DEPS $ps_src/ngx_pagespeed.h" - NGX_ADDON_DEPS="$NGX_ADDON_DEPS $ps_src/ngx_base_fetch.h" - NGX_ADDON_DEPS="$NGX_ADDON_DEPS $ps_src/ngx_server_context.h" - NGX_ADDON_DEPS="$NGX_ADDON_DEPS $ps_src/ngx_rewrite_options.h" - NGX_ADDON_DEPS="$NGX_ADDON_DEPS $ps_src/ngx_rewrite_driver_factory.h" - NGX_ADDON_SRCS="$NGX_ADDON_SRCS $ps_src/../../mod_pagespeed/src/net/instaweb/apache/apr_thread_compatible_pool.cc" - NGX_ADDON_SRCS="$NGX_ADDON_SRCS $ps_src/../../mod_pagespeed/src/net/instaweb/apache/serf_url_async_fetcher.cc" - NGX_ADDON_SRCS="$NGX_ADDON_SRCS $ps_src/../../mod_pagespeed/src/net/instaweb/apache/apr_mem_cache.cc" - NGX_ADDON_SRCS="$NGX_ADDON_SRCS $ps_src/../../mod_pagespeed/src/net/instaweb/util/key_value_codec.cc" - NGX_ADDON_SRCS="$NGX_ADDON_SRCS $ps_src/../../mod_pagespeed/src/third_party/aprutil/apr_memcache2.c" - NGX_ADDON_SRCS="$NGX_ADDON_SRCS $ps_src/ngx_pagespeed.cc" - NGX_ADDON_SRCS="$NGX_ADDON_SRCS $ps_src/ngx_rewrite_driver_factory.cc" - NGX_ADDON_SRCS="$NGX_ADDON_SRCS $ps_src/ngx_rewrite_options.cc" - NGX_ADDON_SRCS="$NGX_ADDON_SRCS $ps_src/ngx_server_context.cc" - NGX_ADDON_SRCS="$NGX_ADDON_SRCS $ps_src/ngx_base_fetch.cc" - NGX_ADDON_SRCS="$NGX_ADDON_SRCS $ps_src/ngx_cache.cc" + NGX_ADDON_DEPS="$NGX_ADDON_DEPS \ + $ps_src/ngx_cache.h \ + $ps_src/ngx_pagespeed.h \ + $ps_src/ngx_base_fetch.h \ + $ps_src/ngx_server_context.h \ + $ps_src/ngx_rewrite_options.h \ + $ps_src/ngx_rewrite_driver_factory.h" + NGX_ADDON_SRCS="$NGX_ADDON_SRCS \ + $ps_src/ngx_pagespeed.cc \ + $ps_src/ngx_rewrite_driver_factory.cc \ + 
$ps_src/ngx_rewrite_options.cc \ + $ps_src/ngx_server_context.cc \ + $ps_src/ngx_base_fetch.cc \ + $ps_src/ngx_cache.cc \ + $mod_pagespeed_dir/net/instaweb/apache/apr_thread_compatible_pool.cc \ + $mod_pagespeed_dir/net/instaweb/apache/serf_url_async_fetcher.cc \ + $mod_pagespeed_dir/net/instaweb/apache/apr_mem_cache.cc \ + $mod_pagespeed_dir/net/instaweb/util/key_value_codec.cc \ + $mod_pagespeed_dir/third_party/aprutil/apr_memcache2.c" + fi HTTP_AUX_FILTER_MODULES="$HTTP_AUX_FILTER_MODULES $ngx_addon_name" - CORE_LIBS="$CORE_LIBS - $pagespeed_libs - $mod_pagespeed_dir/out/$buildtype/obj.target/third_party/serf/libserf.a - $mod_pagespeed_dir/out/$buildtype/obj.target/third_party/aprutil/libaprutil.a - $mod_pagespeed_dir/out/$buildtype/obj.target/third_party/apr/libapr.a" + CORE_LIBS="$CORE_LIBS $pagespeed_libs" CORE_INCS="$CORE_INCS $pagespeed_include" else cat << END diff --git a/psol/include/googleurl/base/basictypes.h b/psol/include/googleurl/base/basictypes.h new file mode 100644 index 000000000..b0c404d1d --- /dev/null +++ b/psol/include/googleurl/base/basictypes.h @@ -0,0 +1,88 @@ +// Copyright 2001 - 2003 Google Inc. All Rights Reserved + +#ifndef BASE_BASICTYPES_H__ +#define BASE_BASICTYPES_H__ + +typedef unsigned char uint8; +typedef unsigned short uint16; +typedef unsigned int uint32; + +const uint8 kuint8max = (( uint8) 0xFF); +const uint32 kuint32max = ((uint32) 0xFFFFFFFF); + +// The arraysize(arr) macro returns the # of elements in an array arr. +// The expression is a compile-time constant, and therefore can be +// used in defining new arrays, for example. If you use arraysize on +// a pointer by mistake, you will get a compile-time error. +// +// One caveat is that arraysize() doesn't accept any array of an +// anonymous type or a type defined inside a function. In these rare +// cases, you have to use the unsafe ARRAYSIZE() macro below. This is +// due to a limitation in C++'s template system. 
The limitation might +// eventually be removed, but it hasn't happened yet. + +// This template function declaration is used in defining arraysize. +// Note that the function doesn't need an implementation, as we only +// use its type. +template +char (&ArraySizeHelper(T (&array)[N]))[N]; + +// That gcc wants both of these prototypes seems mysterious. VC, for +// its part, can't decide which to use (another mystery). Matching of +// template overloads: the final frontier. +#ifndef _MSC_VER +template +char (&ArraySizeHelper(const T (&array)[N]))[N]; +#endif + +#define arraysize(array) (sizeof(ArraySizeHelper(array))) + +// ARRAYSIZE performs essentially the same calculation as arraysize, +// but can be used on anonymous types or types defined inside +// functions. It's less safe than arraysize as it accepts some +// (although not all) pointers. Therefore, you should use arraysize +// whenever possible. +// +// The expression ARRAYSIZE(a) is a compile-time constant of type +// size_t. +// +// ARRAYSIZE catches a few type errors. If you see a compiler error +// +// "warning: division by zero in ..." +// +// when using ARRAYSIZE, you are (wrongfully) giving it a pointer. +// You should only use ARRAYSIZE on statically allocated arrays. +// +// The following comments are on the implementation details, and can +// be ignored by the users. +// +// ARRAYSIZE(arr) works by inspecting sizeof(arr) (the # of bytes in +// the array) and sizeof(*(arr)) (the # of bytes in one array +// element). If the former is divisible by the latter, perhaps arr is +// indeed an array, in which case the division result is the # of +// elements in the array. Otherwise, arr cannot possibly be an array, +// and we generate a compiler error to prevent the code from +// compiling. +// +// Since the size of bool is implementation-defined, we need to cast +// !(sizeof(a) % sizeof(*(a))) to size_t in order to ensure the final +// result has type size_t. 
+// +// This macro is not perfect as it wrongfully accepts certain +// pointers, namely where the pointer size is divisible by the pointee +// size. Since all our code has to go through a 32-bit compiler, +// where a pointer is 4 bytes, this means all pointers to a type whose +// size is 3 or greater than 4 will be (righteously) rejected. +// +// Starting with Visual C++ 2005, WinNT.h includes ARRAYSIZE. +#define ARRAYSIZE_UNSAFE(a) \ + ((sizeof(a) / sizeof(*(a))) / \ + static_cast(!(sizeof(a) % sizeof(*(a))))) + +// A macro to disallow the evil copy constructor and operator= functions +// This should be used in the private: declarations for a class +#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \ + TypeName(const TypeName&); \ + void operator=(const TypeName&) + +#endif // BASE_BASICTYPES_H__ diff --git a/psol/include/googleurl/base/logging.h b/psol/include/googleurl/base/logging.h new file mode 100644 index 000000000..5353b59bd --- /dev/null +++ b/psol/include/googleurl/base/logging.h @@ -0,0 +1,482 @@ +// Copyright 2006 Google Inc. All Rights Reserved. +// Author: brettw (Brett Wilson) + +#ifndef BASE_LOGGING_H__ +#define BASE_LOGGING_H__ + +#include +#include +#include +#include + +#include "base/basictypes.h" +#include "base/scoped_ptr.h" + +// Optional message capabilities +// ----------------------------- +// Assertion failed messages and fatal errors are displayed in a dialog box +// before the application exits. However, running this UI creates a message +// loop, which causes application messages to be processed and potentially +// dispatched to existing application windows. Since the application is in a +// bad state when this assertion dialog is displayed, these messages may not +// get processed and hang the dialog, or the application might go crazy. +// +// Therefore, it can be beneficial to display the error dialog in a separate +// process from the main application. 
When the logging system needs to display +// a fatal error dialog box, it will look for a program called +// "DebugMessage.exe" in the same directory as the application executable. It +// will run this application with the message as the command line, and will +// not include the name of the application as is traditional for easier +// parsing. +// +// The code for DebugMessage.exe is only one line. In WinMain, do: +// MessageBox(NULL, GetCommandLineW(), L"Fatal Error", 0); +// +// If DebugMessage.exe is not found, the logging code will use a normal +// MessageBox, potentially causing the problems discussed above. + + +// Instructions +// ------------ +// +// Make a bunch of macros for logging. The way to log things is to stream +// things to LOG(). E.g., +// +// LOG(INFO) << "Found " << num_cookies << " cookies"; +// +// You can also do conditional logging: +// +// LOG_IF(INFO, num_cookies > 10) << "Got lots of cookies"; +// +// The above will cause log messages to be output on the 1st, 11th, 21st, ... +// times it is executed. Note that the special COUNTER value is used to +// identify which repetition is happening. +// +// There are also "debug mode" logging macros like the ones above: +// +// DLOG(INFO) << "Found cookies"; +// +// DLOG_IF(INFO, num_cookies > 10) << "Got lots of cookies"; +// +// All "debug mode" logging is compiled away to nothing for non-debug mode +// compiles. LOG_IF and development flags also work well together +// because the code can be compiled away sometimes. +// +// We also have +// +// LOG_ASSERT(assertion); +// DLOG_ASSERT(assertion); +// +// which is syntactic sugar for {,D}LOG_IF(FATAL, assert fails) << assertion; +// +// We also override the standard 'assert' to use 'DLOG_ASSERT'. +// +// The supported severity levels for macros that allow you to specify one +// are (in increasing order of severity) INFO, WARNING, ERROR, and FATAL. 
+// +// There is also the special severity of DFATAL, which logs FATAL in +// debug mode, ERROR in normal mode. +// +// Very important: logging a message at the FATAL severity level causes +// the program to terminate (after the message is logged). + +namespace logging { + +// Where to record logging output? A flat file and/or system debug log via +// OutputDebugString. Defaults to LOG_ONLY_TO_FILE. +enum LoggingDestination { LOG_ONLY_TO_FILE, + LOG_ONLY_TO_SYSTEM_DEBUG_LOG, + LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG }; + +// Indicates that the log file should be locked when being written to. +// Often, there is no locking, which is fine for a single threaded program. +// If logging is being done from multiple threads or there can be more than +// one process doing the logging, the file should be locked during writes to +// make each log output atomic. Other writers will block. +// +// All processes writing to the log file must have their locking set for it to +// work properly. Defaults to DONT_LOCK_LOG_FILE. +enum LogLockingState { LOCK_LOG_FILE, DONT_LOCK_LOG_FILE }; + +// On startup, should we delete or append to an existing log file (if any)? +// Defaults to APPEND_TO_OLD_LOG_FILE. +enum OldFileDeletionState { DELETE_OLD_LOG_FILE, APPEND_TO_OLD_LOG_FILE }; + +// Sets the log file name and other global logging state. Calling this function +// is recommended, and is normally done at the beginning of application init. +// If you don't call it, all the flags will be initialized to their default +// values, and there is a race condition that may leak a critical section +// object if two threads try to do the first log at the same time. +// See the definition of the enums above for descriptions and default values. +// +// The default log file is initialized to "debug.log" in the application +// directory. You probably don't want this, especially since the program +// directory may not be writable on an enduser's system. 
+void InitLogging(const TCHAR* log_file, LoggingDestination logging_dest, + LogLockingState lock_log, OldFileDeletionState delete_old); + +// Sets the log level. Anything at or above this level will be written to the +// log file/displayed to the user (if applicable). Anything below this level +// will be silently ignored. The log level defaults to 0 (everything is logged) +// if this function is not called. +void SetMinLogLevel(int level); + +// Sets the log filter prefix. Any log message below LOG_ERROR severity that +// doesn't start with this prefix will be silently ignored. The filter defaults +// to NULL (everything is logged) if this function is not called. Messages +// with severity of LOG_ERROR or higher will not be filtered. +void SetLogFilterPrefix(char* filter); + +// Sets the common items you want to be prepended to each log message. +// process and thread IDs default to off, the timestamp defaults to on. +// If this function is not called, logging defaults to writing the timestamp +// only. +void SetLogItems(bool enable_process_id, bool enable_thread_id, + bool enable_timestamp, bool enable_tickcount); + +// Sets the Log Assert Handler that will be used to notify of check failures. +// The default handler shows a dialog box, however clients can use this +// function to override with their own handling (e.g. a silent one for Unit +// Tests) +typedef void (*LogAssertHandlerFunction)(const std::string& str); +void SetLogAssertHandler(LogAssertHandlerFunction handler); + +typedef int LogSeverity; +const LogSeverity LOG_INFO = 0; +const LogSeverity LOG_WARNING = 1; +const LogSeverity LOG_ERROR = 2; +const LogSeverity LOG_FATAL = 3; +const LogSeverity LOG_NUM_SEVERITIES = 4; + +// LOG_DFATAL_LEVEL is LOG_FATAL in debug mode, ERROR in normal mode +#ifdef NDEBUG +const LogSeverity LOG_DFATAL_LEVEL = LOG_ERROR; +#else +const LogSeverity LOG_DFATAL_LEVEL = LOG_FATAL; +#endif + +// A few definitions of macros that don't generate much code. 
These are used +// by LOG() and LOG_IF, etc. Since these are used all over our code, it's +// better to have compact code for these operations. +#define COMPACT_GOOGLE_LOG_INFO \ + logging::LogMessage(__FILE__, __LINE__) +#define COMPACT_GOOGLE_LOG_WARNING \ + logging::LogMessage(__FILE__, __LINE__, logging::LOG_WARNING) +#define COMPACT_GOOGLE_LOG_ERROR \ + logging::LogMessage(__FILE__, __LINE__, logging::LOG_ERROR) +#define COMPACT_GOOGLE_LOG_FATAL \ + logging::LogMessage(__FILE__, __LINE__, logging::LOG_FATAL) +#define COMPACT_GOOGLE_LOG_DFATAL \ + logging::LogMessage(__FILE__, __LINE__, logging::LOG_DFATAL_LEVEL) + +// wingdi.h defines ERROR to be 0. When we call LOG(ERROR), it gets +// substituted with 0, and it expands to COMPACT_GOOGLE_LOG_0. To allow us +// to keep using this syntax, we define this macro to do the same thing +// as COMPACT_GOOGLE_LOG_ERROR, and also define ERROR the same way that +// the Windows SDK does for consistency. +#define ERROR 0 +#define COMPACT_GOOGLE_LOG_0 \ + logging::LogMessage(__FILE__, __LINE__, logging::LOG_ERROR) + +// We use the preprocessor's merging operator, "##", so that, e.g., +// LOG(INFO) becomes the token COMPACT_GOOGLE_LOG_INFO. There's some funny +// subtle difference between ostream member streaming functions (e.g., +// ostream::operator<<(int) and ostream non-member streaming functions +// (e.g., ::operator<<(ostream&, string&): it turns out that it's +// impossible to stream something like a string directly to an unnamed +// ostream. We employ a neat hack by calling the stream() member +// function of LogMessage which seems to avoid the problem. + +#define LOG(severity) COMPACT_GOOGLE_LOG_ ## severity.stream() +#define SYSLOG(severity) LOG(severity) + +#define LOG_IF(severity, condition) \ + !(condition) ? 
(void) 0 : logging::LogMessageVoidify() & LOG(severity) +#define SYSLOG_IF(severity, condition) LOG_IF(severity, condition) + +#define LOG_ASSERT(condition) \ + LOG_IF(FATAL, !(condition)) << "Assert failed: " #condition ". " +#define SYSLOG_ASSERT(condition) \ + SYSLOG_IF(FATAL, !(condition)) << "Assert failed: " #condition ". " + +// A container for a string pointer which can be evaluated to a bool - +// true iff the pointer is non-NULL. +struct CheckOpString { + CheckOpString(std::string* str) : str_(str) { } + // No destructor: if str_ is non-NULL, we're about to LOG(FATAL), + // so there's no point in cleaning up str_. + operator bool() const { return str_ != NULL; } + std::string* str_; +}; + +// Build the error message string. This is separate from the "Impl" +// function template because it is not performance critical and so can +// be out of line, while the "Impl" code should be inline. +template +std::string* MakeCheckOpString(const t1& v1, const t2& v2, const char* names) { + std::ostrstream ss; + ss << names << " (" << v1 << " vs. " << v2 << ")"; + return new std::string(ss.str(), ss.pcount()); +} + +extern std::string* MakeCheckOpStringIntInt(int v1, int v2, const char* names); + +template +std::string* MakeCheckOpString(const int& v1, const int& v2, const char* names) { + return MakeCheckOpStringIntInt(v1, v2, names); +} + +// Plus some debug-logging macros that get compiled to nothing for production +// +// DEBUG_MODE is for uses like +// if (DEBUG_MODE) foo.CheckThatFoo(); +// instead of +// #ifndef NDEBUG +// foo.CheckThatFoo(); +// #endif + +#ifndef NDEBUG + +#define DLOG(severity) LOG(severity) +#define DLOG_IF(severity, condition) LOG_IF(severity, condition) +#define DLOG_ASSERT(condition) LOG_ASSERT(condition) + +// debug-only checking. not executed in NDEBUG mode. +enum { DEBUG_MODE = 1 }; +#define DCHECK(condition) \ + LOG_IF(FATAL, !(condition)) << "Check failed: " #condition ". " + +// Helper functions for DCHECK_OP macro. 
+// The (int, int) specialization works around the issue that the compiler +// will not instantiate the template version of the function on values of +// unnamed enum type - see comment below. +#define DEFINE_DCHECK_OP_IMPL(name, op) \ + template \ + inline std::string* Check##name##Impl(const t1& v1, const t2& v2, \ + const char* names) { \ + if (v1 op v2) return NULL; \ + else return MakeCheckOpString(v1, v2, names); \ + } \ + inline std::string* Check##name##Impl(int v1, int v2, const char* names) { \ + if (v1 op v2) return NULL; \ + else return MakeCheckOpString(v1, v2, names); \ + } +DEFINE_DCHECK_OP_IMPL(EQ, ==) +DEFINE_DCHECK_OP_IMPL(NE, !=) +DEFINE_DCHECK_OP_IMPL(LE, <=) +DEFINE_DCHECK_OP_IMPL(LT, < ) +DEFINE_DCHECK_OP_IMPL(GE, >=) +DEFINE_DCHECK_OP_IMPL(GT, > ) +#undef DEFINE_DCHECK_OP_IMPL + +// Helper macro for binary operators. +// Don't use this macro directly in your code, use CHECK_EQ et al below. +#define DCHECK_OP(name, op, val1, val2) \ + while (logging::CheckOpString _result = \ + logging::Check##name##Impl((val1), (val2), #val1 " " #op " " #val2)) \ + logging::LogMessage(__FILE__, __LINE__, _result).stream() + +// Equality/Inequality checks - compare two values, and log a LOG_FATAL message +// including the two values when the result is not as expected. The values +// must have operator<<(ostream, ...) defined. +// +// You may append to the error message like so: +// CHECK_NE(1, 2) << ": The world must be ending!"; +// +// We are very careful to ensure that each argument is evaluated exactly +// once, and that anything which is legal to pass as a function argument is +// legal here. In particular, the arguments may be temporary expressions +// which will end up being destroyed at the end of the apparent statement, +// for example: +// CHECK_EQ(string("abc")[1], 'b'); +// +// WARNING: These don't compile correctly if one of the arguments is a pointer +// and the other is NULL. 
To work around this, simply static_cast NULL to the +// type of the desired pointer. + +#define DCHECK_EQ(val1, val2) DCHECK_OP(EQ, ==, val1, val2) +#define DCHECK_NE(val1, val2) DCHECK_OP(NE, !=, val1, val2) +#define DCHECK_LE(val1, val2) DCHECK_OP(LE, <=, val1, val2) +#define DCHECK_LT(val1, val2) DCHECK_OP(LT, < , val1, val2) +#define DCHECK_GE(val1, val2) DCHECK_OP(GE, >=, val1, val2) +#define DCHECK_GT(val1, val2) DCHECK_OP(GT, > , val1, val2) + +// Helper functions for string comparisons. +// To avoid bloat, the definitions are in logging.cc. +#define DECLARE_DCHECK_STROP_IMPL(func, expected) \ + std::string* Check##func##expected##Impl(const char* s1, \ + const char* s2, \ + const char* names); +DECLARE_DCHECK_STROP_IMPL(strcmp, true) +DECLARE_DCHECK_STROP_IMPL(strcmp, false) +DECLARE_DCHECK_STROP_IMPL(_stricmp, true) +DECLARE_DCHECK_STROP_IMPL(_stricmp, false) +#undef DECLARE_DCHECK_STROP_IMPL + +// Helper macro for string comparisons. +// Don't use this macro directly in your code, use CHECK_STREQ et al below. +#define DCHECK_STROP(func, op, expected, s1, s2) \ + while (CheckOpString _result = \ + logging::Check##func##expected##Impl((s1), (s2), \ + #s1 " " #op " " #s2)) \ + LOG(FATAL) << *_result.str_ + +// String (char*) equality/inequality checks. +// CASE versions are case-insensitive. +// +// Note that "s1" and "s2" may be temporary strings which are destroyed +// by the compiler at the end of the current "full expression" +// (e.g. DCHECK_STREQ(Foo().c_str(), Bar().c_str())). + +#define DCHECK_STREQ(s1, s2) DCHECK_STROP(strcmp, ==, true, s1, s2) +#define DCHECK_STRNE(s1, s2) DCHECK_STROP(strcmp, !=, false, s1, s2) +#define DCHECK_STRCASEEQ(s1, s2) DCHECK_STROP(_stricmp, ==, true, s1, s2) +#define DCHECK_STRCASENE(s1, s2) DCHECK_STROP(_stricmp, !=, false, s1, s2) + +#define DCHECK_INDEX(I,A) DCHECK(I < (sizeof(A)/sizeof(A[0]))) +#define DCHECK_BOUND(B,A) DCHECK(B <= (sizeof(A)/sizeof(A[0]))) + +#else // NDEBUG + +#define DLOG(severity) \ + true ? 
(void) 0 : logging::LogMessageVoidify() & LOG(severity) + +#define DLOG_IF(severity, condition) \ + true ? (void) 0 : logging::LogMessageVoidify() & LOG(severity) + +#define DLOG_ASSERT(condition) \ + true ? (void) 0 : LOG_ASSERT(condition) + +enum { DEBUG_MODE = 0 }; + +// This macro can be followed by a sequence of stream parameters in +// non-debug mode. The DCHECK and friends macros use this so that +// the expanded expression DCHECK(foo) << "asdf" is still syntactically +// valid, even though the expression will get optimized away. +#define NDEBUG_EAT_STREAM_PARAMETERS \ + logging::LogMessage(__FILE__, __LINE__).stream() + +#define DCHECK(condition) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_EQ(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_NE(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_LE(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_LT(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_GE(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_GT(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_STREQ(str1, str2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_STRCASEEQ(str1, str2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_STRNE(str1, str2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_STRCASENE(str1, str2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#endif // NDEBUG + +#define NOTREACHED() DCHECK(false) + +// Redefine the standard assert to use our nice log files +#undef assert +#define assert(x) DLOG_ASSERT(x) + +// This class more or less represents a particular log message. You +// create an instance of LogMessage and then stream stuff to it. +// When you finish streaming to it, ~LogMessage is called and the +// full message gets streamed to the appropriate destination. 
+// +// You shouldn't actually use LogMessage's constructor to log things, +// though. You should use the LOG() macro (and variants thereof) +// above. +class LogMessage { + public: + LogMessage(const char* file, int line, LogSeverity severity, int ctr); + + // Two special constructors that generate reduced amounts of code at + // LOG call sites for common cases. + // + // Used for LOG(INFO): Implied are: + // severity = LOG_INFO, ctr = 0 + // + // Using this constructor instead of the more complex constructor above + // saves a couple of bytes per call site. + LogMessage(const char* file, int line); + + // Used for LOG(severity) where severity != INFO. Implied + // are: ctr = 0 + // + // Using this constructor instead of the more complex constructor above + // saves a couple of bytes per call site. + LogMessage(const char* file, int line, LogSeverity severity); + + // A special constructor used for check failures. + // Implied severity = LOG_FATAL + LogMessage(const char* file, int line, const CheckOpString& result); + + ~LogMessage(); + + std::ostream& stream() { return stream_; } + + private: + void Init(const char* file, int line); + + LogSeverity severity_; + std::ostrstream stream_; + int message_start_; // offset of the start of the message (past prefix info). + + DISALLOW_EVIL_CONSTRUCTORS(LogMessage); +}; + +// A non-macro interface to the log facility; (useful +// when the logging level is not a compile-time constant). +inline void LogAtLevel(int const log_level, std::string const &msg) { + LogMessage(__FILE__, __LINE__, log_level).stream() << msg; +} + +// This class is used to explicitly ignore values in the conditional +// logging macros. This avoids compiler warnings like "value computed +// is not used" and "statement has no effect". 
+class LogMessageVoidify { + public: + LogMessageVoidify() { } + // This has to be an operator with a precedence lower than << but + // higher than ?: + void operator&(std::ostream&) { } +}; + +// Closes the log file explicitly if open. +// NOTE: Since the log file is opened as necessary by the action of logging +// statements, there's no guarantee that it will stay closed +// after this call. +void CloseLogFile(); + +} // namespace logging + +// These functions are provided as a convenience for logging, which is where we +// use streams (it is against Google style to use streams in other places). It +// is designed to allow you to emit non-ASCII Unicode strings to the log file, +// which is normally ASCII. It is relatively slow, so try not to use it for +// common cases. Non-ASCII characters will be converted to UTF-8 by these operators. +std::ostream& operator<<(std::ostream& out, const wchar_t* wstr); +inline std::ostream& operator<<(std::ostream& out, const std::wstring& wstr) { + return out << wstr.c_str(); +} + +#endif // BASE_LOGGING_H__ diff --git a/psol/include/googleurl/base/scoped_ptr.h b/psol/include/googleurl/base/scoped_ptr.h new file mode 100644 index 000000000..de0b388cb --- /dev/null +++ b/psol/include/googleurl/base/scoped_ptr.h @@ -0,0 +1,322 @@ +#ifndef BASE_SCOPED_PTR_H +#define BASE_SCOPED_PTR_H + +// (C) Copyright Greg Colvin and Beman Dawes 1998, 1999. +// Copyright (c) 2001, 2002 Peter Dimov +// +// Permission to copy, use, modify, sell and distribute this software +// is granted provided this copyright notice appears in all copies. +// This software is provided "as is" without express or implied +// warranty, and with no claim as to its suitability for any purpose. +// +// See http://www.boost.org/libs/smart_ptr/scoped_ptr.htm for documentation. +// + +// scoped_ptr mimics a built-in pointer except that it guarantees deletion +// of the object pointed to, either on destruction of the scoped_ptr or via +// an explicit reset().
scoped_ptr is a simple solution for simple needs; +// use shared_ptr or std::auto_ptr if your needs are more complex. + +// *** NOTE *** +// If your scoped_ptr is a class member of class FOO pointing to a +// forward declared type BAR (as shown below), then you MUST use a non-inlined +// version of the destructor. The destructor of a scoped_ptr (called from +// FOO's destructor) must have a complete definition of BAR in order to +// destroy it. Example: +// +// -- foo.h -- +// class BAR; +// +// class FOO { +// public: +// FOO(); +// ~FOO(); // Required for sources that instantiate class FOO to compile! +// +// private: +// scoped_ptr bar_; +// }; +// +// -- foo.cc -- +// #include "foo.h" +// FOO::~FOO() {} // Empty, but must be non-inlined to FOO's class definition. + +#include // for std::ptrdiff_t +#include // for assert +#include // for free() decl + +template +class scoped_ptr { + private: + + T* ptr; + + scoped_ptr(scoped_ptr const &); + scoped_ptr & operator=(scoped_ptr const &); + + public: + + typedef T element_type; + + explicit scoped_ptr(T* p = 0): ptr(p) {} + + ~scoped_ptr() { + typedef char type_must_be_complete[sizeof(T)]; + delete ptr; + } + + void reset(T* p = 0) { + typedef char type_must_be_complete[sizeof(T)]; + + if (ptr != p) { + delete ptr; + ptr = p; + } + } + + T& operator*() const { + assert(ptr != 0); + return *ptr; + } + + T* operator->() const { + assert(ptr != 0); + return ptr; + } + + bool operator==(T* p) const { + return ptr == p; + } + + bool operator!=(T* p) const { + return ptr != p; + } + + T* get() const { + return ptr; + } + + void swap(scoped_ptr & b) { + T* tmp = b.ptr; + b.ptr = ptr; + ptr = tmp; + } + + T* release() { + T* tmp = ptr; + ptr = 0; + return tmp; + } + + private: + + // no reason to use these: each scoped_ptr should have its own object + template bool operator==(scoped_ptr const& p) const; + template bool operator!=(scoped_ptr const& p) const; +}; + +template inline +void swap(scoped_ptr& a, scoped_ptr& b) { + 
a.swap(b); +} + +template inline +bool operator==(T* p, const scoped_ptr& b) { + return p == b.get(); +} + +template inline +bool operator!=(T* p, const scoped_ptr& b) { + return p != b.get(); +} + +// scoped_array extends scoped_ptr to arrays. Deletion of the array pointed to +// is guaranteed, either on destruction of the scoped_array or via an explicit +// reset(). Use shared_array or std::vector if your needs are more complex. + +template +class scoped_array { + private: + + T* ptr; + + scoped_array(scoped_array const &); + scoped_array & operator=(scoped_array const &); + + public: + + typedef T element_type; + + explicit scoped_array(T* p = 0) : ptr(p) {} + + ~scoped_array() { + typedef char type_must_be_complete[sizeof(T)]; + delete[] ptr; + } + + void reset(T* p = 0) { + typedef char type_must_be_complete[sizeof(T)]; + + if (ptr != p) { + delete [] ptr; + ptr = p; + } + } + + T& operator[](std::ptrdiff_t i) const { + assert(ptr != 0); + assert(i >= 0); + return ptr[i]; + } + + bool operator==(T* p) const { + return ptr == p; + } + + bool operator!=(T* p) const { + return ptr != p; + } + + T* get() const { + return ptr; + } + + void swap(scoped_array & b) { + T* tmp = b.ptr; + b.ptr = ptr; + ptr = tmp; + } + + T* release() { + T* tmp = ptr; + ptr = 0; + return tmp; + } + + private: + + // no reason to use these: each scoped_array should have its own object + template bool operator==(scoped_array const& p) const; + template bool operator!=(scoped_array const& p) const; +}; + +template inline +void swap(::scoped_array& a, ::scoped_array& b) { + a.swap(b); +} + +template inline +bool operator==(T* p, const ::scoped_array& b) { + return p == b.get(); +} + +template inline +bool operator!=(T* p, const ::scoped_array& b) { + return p != b.get(); +} + + +// This class wraps the c library function free() in a class that can be +// passed as a template argument to scoped_ptr_malloc below. 
+class ScopedPtrMallocFree { + public: + inline void operator()(void* x) const { + free(x); + } +}; + +// scoped_ptr_malloc<> is similar to scoped_ptr<>, but it accepts a +// second template argument, the functor used to free the object. + +template +class scoped_ptr_malloc { + private: + + T* ptr; + + scoped_ptr_malloc(scoped_ptr_malloc const &); + scoped_ptr_malloc & operator=(scoped_ptr_malloc const &); + + public: + + typedef T element_type; + + explicit scoped_ptr_malloc(T* p = 0): ptr(p) {} + + ~scoped_ptr_malloc() { + typedef char type_must_be_complete[sizeof(T)]; + free_((void*) ptr); + } + + void reset(T* p = 0) { + typedef char type_must_be_complete[sizeof(T)]; + + if (ptr != p) { + free_((void*) ptr); + ptr = p; + } + } + + T& operator*() const { + assert(ptr != 0); + return *ptr; + } + + T* operator->() const { + assert(ptr != 0); + return ptr; + } + + bool operator==(T* p) const { + return ptr == p; + } + + bool operator!=(T* p) const { + return ptr != p; + } + + T* get() const { + return ptr; + } + + void swap(scoped_ptr_malloc & b) { + T* tmp = b.ptr; + b.ptr = ptr; + ptr = tmp; + } + + T* release() { + T* tmp = ptr; + ptr = 0; + return tmp; + } + + private: + + // no reason to use these: each scoped_ptr_malloc should have its own object + template + bool operator==(scoped_ptr_malloc const& p) const; + template + bool operator!=(scoped_ptr_malloc const& p) const; + + static FreeProc const free_; +}; + +template +FP const scoped_ptr_malloc::free_ = FP(); + +template inline +void swap(scoped_ptr_malloc& a, scoped_ptr_malloc& b) { + a.swap(b); +} + +template inline +bool operator==(T* p, const scoped_ptr_malloc& b) { + return p == b.get(); +} + +template inline +bool operator!=(T* p, const scoped_ptr_malloc& b) { + return p != b.get(); +} + +#endif // #ifndef BASE_SCOPED_PTR_H diff --git a/psol/include/googleurl/base/string16.h b/psol/include/googleurl/base/string16.h new file mode 100644 index 000000000..9e0fd1de6 --- /dev/null +++ 
b/psol/include/googleurl/base/string16.h @@ -0,0 +1,192 @@ +// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef BASE_STRING16_H_ +#define BASE_STRING16_H_ + +// WHAT: +// A version of std::basic_string that provides 2-byte characters even when +// wchar_t is not implemented as a 2-byte type. You can access this class as +// string16. We also define char16, which string16 is based upon. 
+// +// WHY: +// On Windows, wchar_t is 2 bytes, and it can conveniently handle UTF-16/UCS-2 +// data. Plenty of existing code operates on strings encoded as UTF-16. +// +// On many other platforms, sizeof(wchar_t) is 4 bytes by default. We can make +// it 2 bytes by using the GCC flag -fshort-wchar. But then std::wstring fails +// at run time, because it calls some functions (like wcslen) that come from +// the system's native C library -- which was built with a 4-byte wchar_t! +// It's wasteful to use 4-byte wchar_t strings to carry UTF-16 data, and it's +// entirely improper on those systems where the encoding of wchar_t is defined +// as UTF-32. +// +// Here, we define string16, which is similar to std::wstring but replaces all +// libc functions with custom, 2-byte-char compatible routines. It is capable +// of carrying UTF-16-encoded data. + +#include + +#include "base/basictypes.h" + +#ifdef WIN32 + +typedef wchar_t char16; +typedef std::wstring string16; + +#else // !WIN32 + +typedef uint16 char16; + +namespace base { + +// char16 versions of the functions required by string16_char_traits; these +// are based on the wide character functions of similar names ("w" or "wcs" +// instead of "c16"). 
+int c16memcmp(const char16* s1, const char16* s2, size_t n); +size_t c16len(const char16* s); +const char16* c16memchr(const char16* s, char16 c, size_t n); +char16* c16memmove(char16* s1, const char16* s2, size_t n); +char16* c16memcpy(char16* s1, const char16* s2, size_t n); +char16* c16memset(char16* s, char16 c, size_t n); + +// Character traits for char16. Every operation forwards to the c16* routines +// declared above, so a basic_string built on these traits never calls the +// platform's wide-character libc functions. +struct string16_char_traits { + typedef char16 char_type; + typedef int int_type; + + typedef std::streamoff off_type; + typedef mbstate_t state_type; + typedef std::fpos<state_type> pos_type; + + static void assign(char_type& c1, const char_type& c2) { + c1 = c2; + } + + static bool eq(const char_type& c1, const char_type& c2) { + return c1 == c2; + } + static bool lt(const char_type& c1, const char_type& c2) { + return c1 < c2; + } + + static int compare(const char_type* s1, const char_type* s2, size_t n) { + return c16memcmp(s1, s2, n); + } + + static size_t length(const char_type* s) { + return c16len(s); + } + + static const char_type* find(const char_type* s, size_t n, + const char_type& a) { + return c16memchr(s, a, n); + } + + // |n| is a character count. It is a size_t, like the other bulk operations + // here and like c16memmove itself, so that a large count is not narrowed + // through int on its way to c16memmove. + static char_type* move(char_type* s1, const char_type* s2, size_t n) { + return c16memmove(s1, s2, n); + } + + static char_type* copy(char_type* s1, const char_type* s2, size_t n) { + return c16memcpy(s1, s2, n); + } + + static char_type* assign(char_type* s, size_t n, char_type a) { + return c16memset(s, a, n); + } + + // Returns |c| unchanged unless it equals eof(), in which case returns 0. + static int_type not_eof(const int_type& c) { + return eq_int_type(c, eof()) ? 0 : c; + } + + static char_type to_char_type(const int_type& c) { + return char_type(c); + } + + static int_type to_int_type(const char_type& c) { + return int_type(c); + } + + static bool eq_int_type(const int_type& c1, const int_type& c2) { + return c1 == c2; + } + + static int_type eof() { + return static_cast<int_type>(EOF); + } +}; + +} // namespace base + +// The string class will be explicitly instantiated only once, in string16.cc.
+// +// std::basic_string<> in GNU libstdc++ contains a static data member, +// _S_empty_rep_storage, to represent empty strings. When an operation such +// as assignment or destruction is performed on a string, causing its existing +// data member to be invalidated, it must not be freed if this static data +// member is being used. Otherwise, it counts as an attempt to free static +// (and not allocated) data, which is a memory error. +// +// Generally, due to C++ template magic, _S_empty_rep_storage will be marked +// as a coalesced symbol, meaning that the linker will combine multiple +// instances into a single one when generating output. +// +// If a string class is used by multiple shared libraries, a problem occurs. +// Each library will get its own copy of _S_empty_rep_storage. When strings +// are passed across a library boundary for alteration or destruction, memory +// errors will result. GNU libstdc++ contains a configuration option, +// --enable-fully-dynamic-string (_GLIBCXX_FULLY_DYNAMIC_STRING), which +// disables the static data member optimization, but it's a good optimization +// and non-STL code is generally at the mercy of the system's STL +// configuration. Fully-dynamic strings are not the default for GNU +// libstdc++ itself or for the libstdc++ installations on the systems we care +// about, such as Mac OS X and relevant flavors of Linux. +// +// See also http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24196 . +// +// To avoid problems, string classes need to be explicitly instantiated only +// once, in exactly one library. All other string users see it via an "extern" +// declaration. This is precisely how GNU libstdc++ handles +// std::basic_string<char> (string) and std::basic_string<wchar_t> (wstring). +// +// This also works around a Mac OS X linker bug in ld64-85.2.1 (Xcode 3.1.2), +// in which the linker does not fully coalesce symbols when dead code +// stripping is enabled.
This bug causes the memory errors described above +// to occur even when a std::basic_string<> does not cross shared library +// boundaries, such as in statically-linked executables. +// +// TODO(mark): File this bug with Apple and update this note with a bug number. + +extern template class std::basic_string; + +typedef std::basic_string string16; + +extern std::ostream& operator<<(std::ostream& out, const string16& str); + +#endif // !WIN32 + +#endif // BASE_STRING16_H_ diff --git a/psol/include/googleurl/src/gurl.h b/psol/include/googleurl/src/gurl.h new file mode 100644 index 000000000..803cdfe37 --- /dev/null +++ b/psol/include/googleurl/src/gurl.h @@ -0,0 +1,375 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_GURL_H__ +#define GOOGLEURL_SRC_GURL_H__ + +#include +#include + +#include "base/string16.h" +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_stdstring.h" +#include "googleurl/src/url_common.h" +#include "googleurl/src/url_parse.h" + +class GURL { + public: + typedef url_canon::StdStringReplacements Replacements; + typedef url_canon::StdStringReplacements ReplacementsW; + + // Creates an empty, invalid URL. + GURL_API GURL(); + + // Copy construction is relatively inexpensive, with most of the time going + // to reallocating the string. It does not re-parse. + GURL_API GURL(const GURL& other); + + // The narrow version requires the input be UTF-8. Invalid UTF-8 input will + // result in an invalid URL. + // + // The wide version should also take an encoding parameter so we know how to + // encode the query parameters. It is probably sufficient for the narrow + // version to assume the query parameter encoding should be the same as the + // input encoding. + GURL_API explicit GURL(const std::string& url_string + /*, output_param_encoding*/); + GURL_API explicit GURL(const string16& url_string + /*, output_param_encoding*/); + + // Constructor for URLs that have already been parsed and canonicalized. This + // is used for conversions from KURL, for example. The caller must supply all + // information associated with the URL, which must be correct and consistent. 
+ GURL_API GURL(const char* canonical_spec, size_t canonical_spec_len, + const url_parse::Parsed& parsed, bool is_valid); + + GURL_API GURL& operator=(const GURL& other); + + // Returns true when this object represents a valid parsed URL. When not + // valid, other functions will still succeed, but you will not get canonical + // data out in the format you may be expecting. Instead, we keep something + // "reasonable looking" so that the user can see how it's busted if + // displayed to them. + bool is_valid() const { + return is_valid_; + } + + // Returns true if the URL is zero-length. Note that empty URLs are also + // invalid, and is_valid() will return false for them. This is provided + // because some users may want to treat the empty case differently. + bool is_empty() const { + return spec_.empty(); + } + + // Returns the raw spec, i.e., the full text of the URL, in canonical UTF-8, + // if the URL is valid. If the URL is not valid, this will assert and return + // the empty string (for safety in release builds, to keep them from being + // misused which might be a security problem). + // + // The URL will be ASCII except the reference fragment, which may be UTF-8. + // It is guaranteed to be valid UTF-8. + // + // The exception is for is_empty() URLs (which are !is_valid()) but this will + // return the empty string without asserting. + // + // Use possibly_invalid_spec() below to get the unusable spec of an invalid + // URL. This separation is designed to prevent errors that may cause security + // problems that could result from the mistaken use of an invalid URL. + GURL_API const std::string& spec() const; + + // Returns the potentially invalid spec for the URL. This spec MUST NOT be + // modified or sent over the network. It is designed to be displayed in error + // messages to the user, as the appearance of the spec may explain the error. + // If the spec is valid, the valid spec will be returned. + // + // The returned string is guaranteed to be valid UTF-8.
+ const std::string& possibly_invalid_spec() const { + return spec_; + } + + // Getter for the raw parsed structure. This allows callers to locate parts + // of the URL within the spec themselves. Most callers should consider using + // the individual component getters below. + // + // The returned parsed structure will reference into the raw spec, which may + // or may not be valid. If you are using this to index into the spec, BE + // SURE YOU ARE USING possibly_invalid_spec() to get the spec, and that you + // don't do anything "important" with invalid specs. + const url_parse::Parsed& parsed_for_possibly_invalid_spec() const { + return parsed_; + } + + // Equality and inequality compare the stored spec strings directly; the + // validity flag and parsed components are not consulted. + bool operator==(const GURL& other) const { + return spec_ == other.spec_; + } + bool operator!=(const GURL& other) const { + return spec_ != other.spec_; + } + + // Allows GURL to be used as a key in STL containers (for example, a std::set + // or std::map). Ordering is the ordering of the stored spec strings. + bool operator<(const GURL& other) const { + return spec_ < other.spec_; + } + + // Resolves a URL that's possibly relative to this object's URL, and returns + // it. Absolute URLs are also handled according to the rules of URLs on web + // pages. + // + // It may be impossible to resolve the URLs properly. If the input is not + // "standard" (IsStandard() == false) and the input looks relative, we + // can't resolve it. In these cases, the result will be an empty, invalid + // GURL. + // + // The result may also be a nonempty, invalid URL if the input has some kind + // of encoding error. In these cases, we will try to construct a "good" URL + // that may have meaning to the user, but it will be marked invalid. + // + // It is an error to resolve a URL relative to an invalid URL. The result + // will be the empty URL.
+ GURL_API GURL Resolve(const std::string& relative) const; + GURL_API GURL Resolve(const string16& relative) const; + + // Like Resolve() above but takes a character set encoder which will be used + // for any query text specified in the input. The charset converter parameter + // may be NULL, in which case it will be treated as UTF-8. + // + // TODO(brettw): These should be replaced with versions that take something + // more friendly than a raw CharsetConverter (maybe like an ICU character set + // name). + GURL_API GURL ResolveWithCharsetConverter( + const std::string& relative, + url_canon::CharsetConverter* charset_converter) const; + GURL_API GURL ResolveWithCharsetConverter( + const string16& relative, + url_canon::CharsetConverter* charset_converter) const; + + // Creates a new GURL by replacing the current URL's components with the + // supplied versions. See the Replacements class in url_canon.h for more. + // + // These are not particularly quick, so avoid doing mutations when possible. + // Prefer the 8-bit version when possible. + // + // It is an error to replace components of an invalid URL. The result will + // be the empty URL. + // + // Note that we use the more general url_canon::Replacements type to give + // callers extra flexibility rather than our override. + GURL_API GURL ReplaceComponents( + const url_canon::Replacements& replacements) const; + GURL_API GURL ReplaceComponents( + const url_canon::Replacements& replacements) const; + + // A helper function that is equivalent to replacing the path with a slash + // and clearing out everything after that. We sometimes need to know just the + // scheme and the authority. If this URL is not a standard URL (it doesn't + // have the regular authority and path sections), then the result will be + // an empty, invalid GURL. Note that this *does* work for file: URLs, which + // some callers may want to filter out before calling this. + // + // It is an error to get an empty path on an invalid URL. 
The result + // will be the empty URL. + GURL_API GURL GetWithEmptyPath() const; + + // A helper function to return a GURL containing just the scheme, host, + // and port from a URL. Equivalent to clearing any username and password, + // replacing the path with a slash, and clearing everything after that. If + // this URL is not a standard URL, then the result will be an empty, + // invalid GURL. If the URL has neither username nor password, this + // degenerates to GetWithEmptyPath(). + // + // It is an error to get the origin of an invalid URL. The result + // will be the empty URL. + GURL_API GURL GetOrigin() const; + + // Returns true if the scheme for the current URL is a known "standard" + // scheme. Standard schemes have an authority and a path section. This + // includes file:, which some callers may want to filter out explicitly by + // calling SchemeIsFile. + GURL_API bool IsStandard() const; + + // Returns true if the given parameter (should be lower-case ASCII to match + // the canonicalized scheme) is the scheme for this URL. This call is more + // efficient than getting the scheme and comparing it because no copies or + // object constructions are done. + GURL_API bool SchemeIs(const char* lower_ascii_scheme) const; + + // We often need to know if this is a file URL. File URLs are "standard", but + // are often treated separately by some programs. + bool SchemeIsFile() const { + return SchemeIs("file"); + } + + // If the scheme indicates a secure connection + bool SchemeIsSecure() const { + return SchemeIs("https"); + } + + // Returns true if the hostname is an IP address. Note: this function isn't + // as cheap as a simple getter because it re-parses the hostname to verify. + // This currently identifies only IPv4 addresses (bug 822685). + GURL_API bool HostIsIPAddress() const; + + // Getters for various components of the URL. The returned string will be + // empty if the component is empty or is not present. 
+ std::string scheme() const { // Not including the colon. See also SchemeIs. + return ComponentString(parsed_.scheme); + } + std::string username() const { + return ComponentString(parsed_.username); + } + std::string password() const { + return ComponentString(parsed_.password); + } + // Note that this may be a hostname, an IPv4 address, or an IPv6 literal + // surrounded by square brackets, like "[2001:db8::1]". To exclude these + // brackets, use HostNoBrackets() below. + std::string host() const { + return ComponentString(parsed_.host); + } + std::string port() const { // Empty if no port is present; see IntPort(). + return ComponentString(parsed_.port); + } + std::string path() const { // Including first slash following host + return ComponentString(parsed_.path); + } + std::string query() const { // Stuff following '?' + return ComponentString(parsed_.query); + } + std::string ref() const { // Stuff following '#' + return ComponentString(parsed_.ref); + } + + // Existence querying. These functions will return true if the corresponding + // URL component exists in this URL. Note that existence is different from + // being nonempty. http://www.google.com/? has a query that just happens to + // be empty, and has_query() will return true. + bool has_scheme() const { + return parsed_.scheme.len >= 0; + } + bool has_username() const { + return parsed_.username.len >= 0; + } + bool has_password() const { + return parsed_.password.len >= 0; + } + bool has_host() const { + // Note that hosts are special, absence of host means length 0. + return parsed_.host.len > 0; + } + bool has_port() const { + return parsed_.port.len >= 0; + } + bool has_path() const { + // Note that "http://www.google.com/" has a path, the path is "/". This can + // return false only for invalid or nonstandard URLs.
+ return parsed_.path.len >= 0; + } + bool has_query() const { + return parsed_.query.len >= 0; + } + bool has_ref() const { + return parsed_.ref.len >= 0; + } + + // Returns a parsed version of the port. Can also be any of the special + // values defined in Parsed for ExtractPort. + GURL_API int IntPort() const; + + // Returns the port number of the url, or the default port number. + // If the scheme has no concept of port (or unknown default) returns + // PORT_UNSPECIFIED. + GURL_API int EffectiveIntPort() const; + + // Extracts the filename portion of the path and returns it. The filename + // is everything after the last slash in the path. This may be empty. + GURL_API std::string ExtractFileName() const; + + // Returns the path that should be sent to the server. This is the path, + // parameter, and query portions of the URL. It is guaranteed to be ASCII. + GURL_API std::string PathForRequest() const; + + // Returns the host, excluding the square brackets surrounding IPv6 address + // literals. This can be useful for passing to getaddrinfo(). + GURL_API std::string HostNoBrackets() const; + + // Returns true if this URL's host matches or is in the same domain as + // the given input string. For example if this URL was "www.google.com", + // this would match "com", "google.com", and "www.google.com" + // (input domain should be lower-case ASCII to match the canonicalized + // host). This call is more efficient than getting the host and checking + // whether the host has the specific domain because no copies or + // object constructions are done. + // + // This overload takes an explicit domain_len, so lower_ascii_domain is not + // required to be NUL-terminated. + GURL_API bool DomainIs(const char* lower_ascii_domain, int domain_len) const; + + // This overload takes only lower_ascii_domain, which must be NUL-terminated + // because its length is computed with strlen().
+ bool DomainIs(const char* lower_ascii_domain) const { + return DomainIs(lower_ascii_domain, + static_cast(strlen(lower_ascii_domain))); + } + + // Swaps the contents of this GURL object with the argument without doing + // any memory allocations. + GURL_API void Swap(GURL* other); + + // Returns a reference to a singleton empty GURL. This object is for callers + // who return references but don't have anything to return in some cases. + // This function may be called from any thread. + GURL_API static const GURL& EmptyGURL(); + + private: + // Returns the substring of the input identified by the given component. + std::string ComponentString(const url_parse::Component& comp) const { + if (comp.len <= 0) + return std::string(); + return std::string(spec_, comp.begin, comp.len); + } + + // The actual text of the URL, in canonical ASCII form. + std::string spec_; + + // Set when the given URL is valid. Otherwise, we may still have a spec and + // components, but they may not identify valid resources (for example, an + // invalid port number, invalid characters in the scheme, etc.). + bool is_valid_; + + // Identified components of the canonical spec. + url_parse::Parsed parsed_; + + // TODO bug 684583: Add encoding for query params. +}; + +// Stream operator so GURL can be used in assertion statements. +GURL_API std::ostream& operator<<(std::ostream& out, const GURL& url); + +#endif // GOOGLEURL_SRC_GURL_H__ diff --git a/psol/include/googleurl/src/url_canon.h b/psol/include/googleurl/src/url_canon.h new file mode 100644 index 000000000..e2cfb55e0 --- /dev/null +++ b/psol/include/googleurl/src/url_canon.h @@ -0,0 +1,872 @@ +// Copyright 2007, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#ifndef GOOGLEURL_SRC_URL_CANON_H__ +#define GOOGLEURL_SRC_URL_CANON_H__ + +#include +#include + +#include "base/string16.h" +#include "googleurl/src/url_common.h" +#include "googleurl/src/url_parse.h" + +namespace url_canon { + +// Canonicalizer output ------------------------------------------------------- + +// Base class for the canonicalizer output, this maintains a buffer and +// supports simple resizing and append operations on it. 
//
// It is VERY IMPORTANT that no virtual function calls be made on the common
// code path. We only have two virtual function calls, the destructor and a
// resize function that is called when the existing buffer is not big enough.
// The derived class is then in charge of setting up our buffer which we will
// manage.
//
// T is the character type stored in the buffer.
template<typename T>
class CanonOutputT {
 public:
  // Starts with no buffer and zero length; storage is obtained through
  // Resize() the first time something is appended.
  CanonOutputT() : buffer_(NULL), buffer_len_(0), cur_len_(0) {
  }
  virtual ~CanonOutputT() {
  }

  // Implemented to resize the buffer. This function should update the buffer
  // pointer to point to the new buffer, and any old data up to |cur_len_| in
  // the buffer must be copied over.
  //
  // The new size |sz| must be larger than buffer_len_.
  virtual void Resize(int sz) = 0;

  // Accessor for returning a character at a given position. The input offset
  // must be in the valid range; it is not checked. The result has type T (not
  // char) so that characters of a wide buffer are returned untruncated.
  inline T at(int offset) const {
    return buffer_[offset];
  }

  // Sets the character at the given position. The given position MUST be less
  // than the length(); it is not checked.
  inline void set(int offset, T ch) {
    buffer_[offset] = ch;
  }

  // Returns the number of characters currently in the buffer.
  inline int length() const {
    return cur_len_;
  }

  // Returns the current capacity of the buffer. The length() is the number of
  // characters that have been declared to be written, but the capacity() is
  // the number that can be written without reallocation. If the caller must
  // write many characters at once, it can make sure there is enough capacity,
  // write the data, then use set_length() to declare the new length().
  int capacity() const {
    return buffer_len_;
  }

  // Called by the user of this class to get the output. The output will NOT
  // be NULL-terminated. Call length() to get the length.
  const T* data() const {
    return buffer_;
  }
  T* data() {
    return buffer_;
  }

  // Shortens the URL to the new length. Used for "backing up" when processing
  // relative paths. This can also be used if an external function writes a lot
  // of data to the buffer (when using the "Raw" version below) beyond the end,
  // to declare the new length.
  //
  // This MUST NOT be used to expand the size of the buffer beyond capacity().
  // The value is stored as-is; nothing here validates it.
  void set_length(int new_len) {
    cur_len_ = new_len;
  }

  // This is the most performance critical function, since it is called for
  // every character.
  void push_back(T ch) {
    // In VC2005, putting this common case first speeds up execution
    // dramatically because this branch is predicted as taken.
    if (cur_len_ < buffer_len_) {
      buffer_[cur_len_] = ch;
      cur_len_++;
      return;
    }

    // Grow the buffer to hold at least one more item. Hopefully we won't have
    // to do this very often. If growing fails the character is dropped.
    if (!Grow(1))
      return;

    // Actually do the insertion.
    buffer_[cur_len_] = ch;
    cur_len_++;
  }

  // Appends |str_len| characters starting at |str| to the output. If the
  // buffer has to grow and Grow() fails, nothing is appended.
  void Append(const T* str, int str_len) {
    if (cur_len_ + str_len > buffer_len_) {
      if (!Grow(cur_len_ + str_len - buffer_len_))
        return;
    }
    for (int i = 0; i < str_len; i++)
      buffer_[cur_len_ + i] = str[i];
    cur_len_ += str_len;
  }

 protected:
  // Grows the buffer so that it can fit at least |min_additional| characters
  // beyond the current capacity. The capacity is doubled (starting from
  // kMinBufferLen when there is no buffer yet) until it is large enough, and
  // the new size is handed to Resize(). Returns true once Resize() has been
  // called, false if the capacity would have to reach 2^30 elements.
  bool Grow(int min_additional) {
    static const int kMinBufferLen = 16;
    int new_len = (buffer_len_ == 0) ? kMinBufferLen : buffer_len_;
    do {
      if (new_len >= (1 << 30))  // Prevent overflow below.
        return false;
      new_len *= 2;
    } while (new_len < buffer_len_ + min_additional);
    Resize(new_len);
    return true;
  }

  // Storage and its capacity in elements; both are managed by the derived
  // class through Resize().
  T* buffer_;
  int buffer_len_;

  // Used characters in the buffer.
  int cur_len_;
};

// Simple implementation of the CanonOutput using new[]. This class
// also supports a static buffer so if it is allocated on the stack, most
// URLs can be canonicalized with no heap allocations.
+template +class RawCanonOutputT : public CanonOutputT { + public: + RawCanonOutputT() : CanonOutputT() { + this->buffer_ = fixed_buffer_; + this->buffer_len_ = fixed_capacity; + } + virtual ~RawCanonOutputT() { + if (this->buffer_ != fixed_buffer_) + delete[] this->buffer_; + } + + virtual void Resize(int sz) { + T* new_buf = new T[sz]; + memcpy(new_buf, this->buffer_, + sizeof(T) * (this->cur_len_ < sz ? this->cur_len_ : sz)); + if (this->buffer_ != fixed_buffer_) + delete[] this->buffer_; + this->buffer_ = new_buf; + this->buffer_len_ = sz; + } + + protected: + T fixed_buffer_[fixed_capacity]; +}; + +// Normally, all canonicalization output is in narrow characters. We support +// the templates so it can also be used internally if a wide buffer is +// required. +typedef CanonOutputT CanonOutput; +typedef CanonOutputT CanonOutputW; + +template +class RawCanonOutput : public RawCanonOutputT {}; +template +class RawCanonOutputW : public RawCanonOutputT {}; + +// Character set converter ---------------------------------------------------- +// +// Converts query strings into a custom encoding. The embedder can supply an +// implementation of this class to interface with their own character set +// conversion libraries. +// +// Embedders will want to see the unit test for the ICU version. + +class CharsetConverter { + public: + CharsetConverter() {} + virtual ~CharsetConverter() {} + + // Converts the given input string from UTF-16 to whatever output format the + // converter supports. This is used only for the query encoding conversion, + // which does not fail. Instead, the converter should insert "invalid + // character" characters in the output for invalid sequences, and do the + // best it can. 
+ // + // If the input contains a character not representable in the output + // character set, the converter should append the HTML entity sequence in + // decimal, (such as "你") with escaping of the ampersand, number + // sign, and semicolon (in the previous example it would be + // "%26%2320320%3B"). This rule is based on what IE does in this situation. + virtual void ConvertFromUTF16(const char16* input, + int input_len, + CanonOutput* output) = 0; +}; + +// Whitespace ----------------------------------------------------------------- + +// Searches for whitespace that should be removed from the middle of URLs, and +// removes it. Removed whitespace are tabs and newlines, but NOT spaces. Spaces +// are preserved, which is what most browsers do. A pointer to the output will +// be returned, and the length of that output will be in |output_len|. +// +// This should be called before parsing if whitespace removal is desired (which +// it normally is when you are canonicalizing). +// +// If no whitespace is removed, this function will not use the buffer and will +// return a pointer to the input, to avoid the extra copy. If modification is +// required, the given |buffer| will be used and the returned pointer will +// point to the beginning of the buffer. +// +// Therefore, callers should not use the buffer, since it may actuall be empty, +// use the computed pointer and |*output_len| instead. +GURL_API const char* RemoveURLWhitespace(const char* input, int input_len, + CanonOutputT* buffer, + int* output_len); +GURL_API const char16* RemoveURLWhitespace(const char16* input, int input_len, + CanonOutputT* buffer, + int* output_len); + +// IDN ------------------------------------------------------------------------ + +// Converts the Unicode input representing a hostname to ASCII using IDN rules. +// The output must fall in the ASCII range, but will be encoded in UTF-16. 
+// +// On success, the output will be filled with the ASCII host name and it will +// return true. Unlike most other canonicalization functions, this assumes that +// the output is empty. The beginning of the host will be at offset 0, and +// the length of the output will be set to the length of the new host name. +// +// On error, returns false. The output in this case is undefined. +GURL_API bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output); + +// Piece-by-piece canonicalizers ---------------------------------------------- +// +// These individual canonicalizers append the canonicalized versions of the +// corresponding URL component to the given std::string. The spec and the +// previously-identified range of that component are the input. The range of +// the canonicalized component will be written to the output component. +// +// These functions all append to the output so they can be chained. Make sure +// the output is empty when you start. +// +// These functions return boolean values indicating success. On failure, they +// will attempt to write something reasonable to the output so that, if +// displayed to the user, they will recognise it as something that's messed up. +// Nothing more should ever be done with these invalid URLs, however. + +// Scheme: Appends the scheme and colon to the URL. The output component will +// indicate the range of characters up to but not including the colon. +// +// Canonical URLs always have a scheme. If the scheme is not present in the +// input, this will just write the colon to indicate an empty scheme. Does not +// append slashes which will be needed before any authority components for most +// URLs. +// +// The 8-bit version requires UTF-8 encoding.
+GURL_API bool CanonicalizeScheme(const char* spec, + const url_parse::Component& scheme, + CanonOutput* output, + url_parse::Component* out_scheme); +GURL_API bool CanonicalizeScheme(const char16* spec, + const url_parse::Component& scheme, + CanonOutput* output, + url_parse::Component* out_scheme); + +// User info: username/password. If present, this will add the delimiters so +// the output will be ":@" or "@". Empty +// username/password pairs, or empty passwords, will get converted to +// nonexistant in the canonical version. +// +// The components for the username and password refer to ranges in the +// respective source strings. Usually, these will be the same string, which +// is legal as long as the two components don't overlap. +// +// The 8-bit version requires UTF-8 encoding. +GURL_API bool CanonicalizeUserInfo(const char* username_source, + const url_parse::Component& username, + const char* password_source, + const url_parse::Component& password, + CanonOutput* output, + url_parse::Component* out_username, + url_parse::Component* out_password); +GURL_API bool CanonicalizeUserInfo(const char16* username_source, + const url_parse::Component& username, + const char16* password_source, + const url_parse::Component& password, + CanonOutput* output, + url_parse::Component* out_username, + url_parse::Component* out_password); + + +// This structure holds detailed state exported from the IP/Host canonicalizers. +// Additional fields may be added as callers require them. +struct CanonHostInfo { + CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {} + + // Convenience function to test if family is an IP address. + bool IsIPAddress() const { return family == IPV4 || family == IPV6; } + + // This field summarizes how the input was classified by the canonicalizer. + enum Family { + NEUTRAL, // - Doesn't resemble an IP address. As far as the IP + // canonicalizer is concerned, it should be treated as a + // hostname. 
+ BROKEN, // - Almost an IP, but was not canonicalized. This could be an + // IPv4 address where truncation occurred, or something + // containing the special characters :[] which did not parse + // as an IPv6 address. Never attempt to connect to this + // address, because it might actually succeed! + IPV4, // - Successfully canonicalized as an IPv4 address. + IPV6, // - Successfully canonicalized as an IPv6 address. + }; + Family family; + + // If |family| is IPV4, then this is the number of nonempty dot-separated + // components in the input text, from 1 to 4. If |family| is not IPV4, + // this value is undefined. + int num_ipv4_components; + + // Location of host within the canonicalized output. + // CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6. + // CanonicalizeHostVerbose() always sets it. + url_parse::Component out_host; +}; + + +// Host. +// +// The 8-bit version requires UTF-8 encoding. Use this version when you only +// need to know whether canonicalization succeeded. +GURL_API bool CanonicalizeHost(const char* spec, + const url_parse::Component& host, + CanonOutput* output, + url_parse::Component* out_host); +GURL_API bool CanonicalizeHost(const char16* spec, + const url_parse::Component& host, + CanonOutput* output, + url_parse::Component* out_host); + +// Extended version of CanonicalizeHost, which returns additional information. +// Use this when you need to know whether the hostname was an IP address. +// A successful return is indicated by host_info->family != BROKEN. See the +// definition of CanonHostInfo above for details. +GURL_API void CanonicalizeHostVerbose(const char* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); +GURL_API void CanonicalizeHostVerbose(const char16* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); + + +// IP addresses. +// +// Tries to interpret the given host name as an IPv4 or IPv6 address. 
If it is +// an IP address, it will canonicalize it as such, appending it to |output|. +// Additional status information is returned via the |*host_info| parameter. +// See the definition of CanonHostInfo above for details. +// +// This is called AUTOMATICALLY from the host canonicalizer, which ensures that +// the input is unescaped and name-prepped, etc. It should not normally be +// necessary or wise to call this directly. +GURL_API void CanonicalizeIPAddress(const char* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); +GURL_API void CanonicalizeIPAddress(const char16* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); + +// Port: this function will add the colon for the port if a port is present. +// The caller can pass url_parse::PORT_UNSPECIFIED as the +// default_port_for_scheme argument if there is no default port. +// +// The 8-bit version requires UTF-8 encoding. +GURL_API bool CanonicalizePort(const char* spec, + const url_parse::Component& port, + int default_port_for_scheme, + CanonOutput* output, + url_parse::Component* out_port); +GURL_API bool CanonicalizePort(const char16* spec, + const url_parse::Component& port, + int default_port_for_scheme, + CanonOutput* output, + url_parse::Component* out_port); + +// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED +// if the scheme is unknown. +GURL_API int DefaultPortForScheme(const char* scheme, int scheme_len); + +// Path. If the input does not begin in a slash (including if the input is +// empty), we'll prepend a slash to the path to make it canonical. +// +// The 8-bit version assumes UTF-8 encoding, but does not verify the validity +// of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid +// characters, etc.). Normally, URLs will come in as UTF-16, so this isn't +// an issue. 
Somebody giving us an 8-bit path is responsible for generating +// the path that the server expects (we'll escape high-bit characters), so +// if something is invalid, it's their problem. +GURL_API bool CanonicalizePath(const char* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); +GURL_API bool CanonicalizePath(const char16* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); + +// Canonicalizes the input as a file path. This is like CanonicalizePath except +// that it also handles Windows drive specs. For example, the path can begin +// with "c|\" and it will get properly canonicalized to "C:/". +// The string will be appended to |*output| and |*out_path| will be updated. +// +// The 8-bit version requires UTF-8 encoding. +GURL_API bool FileCanonicalizePath(const char* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); +GURL_API bool FileCanonicalizePath(const char16* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); + +// Query: Prepends the ? if needed. +// +// The 8-bit version requires the input to be UTF-8 encoding. Incorrectly +// encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode +// "invalid character." This function can not fail, we always just try to do +// our best for crazy input here since web pages can set it themselves. +// +// This will convert the given input into the output encoding that the given +// character set converter object provides. The converter will only be called +// if necessary, for ASCII input, no conversions are necessary. +// +// The converter can be NULL. In this case, the output encoding will be UTF-8. 
+GURL_API void CanonicalizeQuery(const char* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output, + url_parse::Component* out_query); +GURL_API void CanonicalizeQuery(const char16* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output, + url_parse::Component* out_query); + +// Ref: Prepends the # if needed. The output will be UTF-8 (this is the only +// canonicalizer that does not produce ASCII output). The output is +// guaranteed to be valid UTF-8. +// +// This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use +// the "Unicode replacement character" for the confusing bits and copy the rest. +GURL_API void CanonicalizeRef(const char* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); +GURL_API void CanonicalizeRef(const char16* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); + +// Full canonicalizer --------------------------------------------------------- +// +// These functions replace any string contents, rather than append as above. +// See the above piece-by-piece functions for information specific to +// canonicalizing individual components. +// +// The output will be ASCII except the reference fragment, which may be UTF-8. +// +// The 8-bit versions require UTF-8 encoding. + +// Use for standard URLs with authorities and paths. +GURL_API bool CanonicalizeStandardURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizeStandardURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Use for file URLs. 
+GURL_API bool CanonicalizeFileURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizeFileURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Use for path URLs such as javascript. This does not modify the path in any +// way, for example, by escaping it. +GURL_API bool CanonicalizePathURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizePathURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Use for mailto URLs. This "canonicalizes" the url into a path and query +// component. It does not attempt to merge "to" fields. It uses UTF-8 for +// the query encoding if there is a query. This is because a mailto URL is +// really intended for an external mail program, and the encoding of a page, +// etc. which would influence a query encoding normally are irrelevant. +GURL_API bool CanonicalizeMailtoURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizeMailtoURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Part replacer -------------------------------------------------------------- + +// Internal structure used for storing separate strings for each component. +// The basic canonicalization functions use this structure internally so that +// component replacement (different strings for different components) can be +// treated on the same code path as regular canonicalization (the same string +// for each component).
//
// A url_parse::Parsed structure usually goes along with this. Those
// components identify offsets within these strings, so that they can all be
// in the same string, or spread arbitrarily across different ones.
//
// This structure does not own any data. It is the caller's responsibility to
// ensure that the data the pointers point to stays in scope and is not
// modified.
//
// CHAR is the character type of the source strings.
template<typename CHAR>
struct URLComponentSource {
  // Constructor normally used by callers wishing to replace components. This
  // will make them all NULL, which is no replacement. The caller would then
  // override the components they want to replace.
  URLComponentSource()
      : scheme(NULL),
        username(NULL),
        password(NULL),
        host(NULL),
        port(NULL),
        path(NULL),
        query(NULL),
        ref(NULL) {
  }

  // Constructor normally used internally to initialize all the components to
  // point to the same spec. Explicit, so a bare string pointer is never
  // converted to a component source by accident.
  explicit URLComponentSource(const CHAR* default_value)
      : scheme(default_value),
        username(default_value),
        password(default_value),
        host(default_value),
        port(default_value),
        path(default_value),
        query(default_value),
        ref(default_value) {
  }

  // One borrowed source-string pointer per URL component.
  const CHAR* scheme;
  const CHAR* username;
  const CHAR* password;
  const CHAR* host;
  const CHAR* port;
  const CHAR* path;
  const CHAR* query;
  const CHAR* ref;
};

// This structure encapsulates information on modifying a URL. Each component
// may either be left unchanged, replaced, or deleted.
//
// By default, each component is unchanged. For those components that should be
// modified, call either Set* or Clear* to modify it.
//
// The string passed to Set* functions DOES NOT GET COPIED AND MUST BE KEPT
// IN SCOPE BY THE CALLER for as long as this object exists!
//
// Prefer the 8-bit replacement version if possible since it is more efficient.
+template +class Replacements { + public: + Replacements() { + } + + // Scheme + void SetScheme(const CHAR* s, const url_parse::Component& comp) { + sources_.scheme = s; + components_.scheme = comp; + } + // Note: we don't have a ClearScheme since this doesn't make any sense. + bool IsSchemeOverridden() const { return sources_.scheme != NULL; } + + // Username + void SetUsername(const CHAR* s, const url_parse::Component& comp) { + sources_.username = s; + components_.username = comp; + } + void ClearUsername() { + sources_.username = Placeholder(); + components_.username = url_parse::Component(); + } + bool IsUsernameOverridden() const { return sources_.username != NULL; } + + // Password + void SetPassword(const CHAR* s, const url_parse::Component& comp) { + sources_.password = s; + components_.password = comp; + } + void ClearPassword() { + sources_.password = Placeholder(); + components_.password = url_parse::Component(); + } + bool IsPasswordOverridden() const { return sources_.password != NULL; } + + // Host + void SetHost(const CHAR* s, const url_parse::Component& comp) { + sources_.host = s; + components_.host = comp; + } + void ClearHost() { + sources_.host = Placeholder(); + components_.host = url_parse::Component(); + } + bool IsHostOverridden() const { return sources_.host != NULL; } + + // Port + void SetPort(const CHAR* s, const url_parse::Component& comp) { + sources_.port = s; + components_.port = comp; + } + void ClearPort() { + sources_.port = Placeholder(); + components_.port = url_parse::Component(); + } + bool IsPortOverridden() const { return sources_.port != NULL; } + + // Path + void SetPath(const CHAR* s, const url_parse::Component& comp) { + sources_.path = s; + components_.path = comp; + } + void ClearPath() { + sources_.path = Placeholder(); + components_.path = url_parse::Component(); + } + bool IsPathOverridden() const { return sources_.path != NULL; } + + // Query + void SetQuery(const CHAR* s, const url_parse::Component& comp) { + 
sources_.query = s; + components_.query = comp; + } + void ClearQuery() { + sources_.query = Placeholder(); + components_.query = url_parse::Component(); + } + bool IsQueryOverridden() const { return sources_.query != NULL; } + + // Ref + void SetRef(const CHAR* s, const url_parse::Component& comp) { + sources_.ref = s; + components_.ref = comp; + } + void ClearRef() { + sources_.ref = Placeholder(); + components_.ref = url_parse::Component(); + } + bool IsRefOverridden() const { return sources_.ref != NULL; } + + // Getters for the itnernal data. See the variables below for how the + // information is encoded. + const URLComponentSource& sources() const { return sources_; } + const url_parse::Parsed& components() const { return components_; } + + private: + // Returns a pointer to a static empty string that is used as a placeholder + // to indicate a component should be deleted (see below). + const CHAR* Placeholder() { + static const CHAR empty_string = 0; + return &empty_string; + } + + // We support three states: + // + // Action | Source Component + // -----------------------+-------------------------------------------------- + // Don't change component | NULL (unused) + // Replace component | (replacement string) (replacement component) + // Delete component | (non-NULL) (invalid component: (0,-1)) + // + // We use a pointer to the empty string for the source when the component + // should be deleted. + URLComponentSource sources_; + url_parse::Parsed components_; +}; + +// The base must be an 8-bit canonical URL. 
+GURL_API bool ReplaceStandardURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool ReplaceStandardURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Replacing some parts of a file URL is not permitted. Everything except +// the host, path, query, and ref will be ignored. +GURL_API bool ReplaceFileURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool ReplaceFileURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Path URLs can only have the scheme and path replaced. All other components +// will be ignored. +GURL_API bool ReplacePathURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool ReplacePathURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Mailto URLs can only have the scheme, path, and query replaced. +// All other components will be ignored. 
+GURL_API bool ReplaceMailtoURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool ReplaceMailtoURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Relative URL --------------------------------------------------------------- + +// Given an input URL or URL fragment |fragment|, determines if it is a +// relative or absolute URL and places the result into |*is_relative|. If it is +// relative, the relevant portion of the URL will be placed into +// |*relative_component| (there may have been trimmed whitespace, for example). +// This value is passed to ResolveRelativeURL. If the input is not relative, +// this value is UNDEFINED (it may be changed by the functin). +// +// Returns true on success (we successfully determined the URL is relative or +// not). Failure means that the combination of URLs doesn't make any sense. +// +// The base URL should always be canonical, therefore is ASCII. +GURL_API bool IsRelativeURL(const char* base, + const url_parse::Parsed& base_parsed, + const char* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + url_parse::Component* relative_component); +GURL_API bool IsRelativeURL(const char* base, + const url_parse::Parsed& base_parsed, + const char16* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + url_parse::Component* relative_component); + +// Given a canonical parsed source URL, a URL fragment known to be relative, +// and the identified relevant portion of the relative URL (computed by +// IsRelativeURL), this produces a new parsed canonical URL in |output| and +// |out_parsed|. +// +// It also requires a flag indicating whether the base URL is a file: URL +// which triggers additional logic. 
+// +// The base URL should be canonical and have a host (may be empty for file +// URLs) and a path. If it doesn't have these, we can't resolve relative +// URLs off of it and will return the base as the output with an error flag. +// Because it is canonical, it should also be ASCII. +// +// The query charset converter follows the same rules as CanonicalizeQuery. +// +// Returns true on success. On failure, the output will be "something +// reasonable" that will be consistent and valid, just probably not what +// was intended by the web page author or caller. +GURL_API bool ResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const char* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed); +GURL_API bool ResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const char16* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed); + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_H__ diff --git a/psol/include/googleurl/src/url_canon_icu.h b/psol/include/googleurl/src/url_canon_icu.h new file mode 100644 index 000000000..6bc52c394 --- /dev/null +++ b/psol/include/googleurl/src/url_canon_icu.h @@ -0,0 +1,63 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// ICU integration functions. + +#ifndef GOOGLEURL_SRC_URL_CANON_ICU_H__ +#define GOOGLEURL_SRC_URL_CANON_ICU_H__ + +#include "googleurl/src/url_canon.h" + +typedef struct UConverter UConverter; + +namespace url_canon { + +// An implementation of CharsetConverter that implementations can use to +// interface the canonicalizer with ICU's conversion routines. +class ICUCharsetConverter : public CharsetConverter { + public: + // Constructs a converter using an already-existing ICU character set + // converter. This converter is NOT owned by this object; the lifetime must + // be managed by the creator such that it is alive as long as this is. + // NOTE(review): this single-argument constructor is not marked explicit, + // so a UConverter* converts implicitly -- confirm that is intended.
+ GURL_API ICUCharsetConverter(UConverter* converter); + + GURL_API virtual ~ICUCharsetConverter() {} + + GURL_API virtual void ConvertFromUTF16(const char16* input, + int input_len, + CanonOutput* output); + + private: + // The ICU converter, not owned by this class. + UConverter* converter_; +}; + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_ICU_H__ diff --git a/psol/include/googleurl/src/url_canon_internal.h b/psol/include/googleurl/src/url_canon_internal.h new file mode 100644 index 000000000..572d6a890 --- /dev/null +++ b/psol/include/googleurl/src/url_canon_internal.h @@ -0,0 +1,460 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is intended to be included in another C++ file where the character +// types are defined. This allows us to write mostly generic code, but not have +// template bloat because everything is inlined when anybody calls any of our +// functions. + +#ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_H__ +#define GOOGLEURL_SRC_URL_CANON_INTERNAL_H__ + +#include + +#include "base/logging.h" +#include "googleurl/src/url_canon.h" + +namespace url_canon { + +// Character type handling ----------------------------------------------------- + +// Bits that identify different character types. These types identify different +// bits that are set for each 8-bit character in the kSharedCharTypeTable. +enum SharedCharTypes { + // Characters that do not require escaping in queries. Characters that do + // not have this flag will be escaped; see url_canon_query.cc + CHAR_QUERY = 1, + + // Valid in the username/password field. + CHAR_USERINFO = 2, + + // Valid in an IPv4 address (digits plus dot and 'x' for hex). + CHAR_IPV4 = 4, + + // Valid in an ASCII-representation of a hex digit (as in %-escaped). + CHAR_HEX = 8, + + // Valid in an ASCII-representation of a decimal digit. + CHAR_DEC = 16, + + // Valid in an ASCII-representation of an octal digit. + CHAR_OCT = 32, + + // Characters that do not require escaping in encodeURIComponent. Characters + // that do not have this flag will be escaped; see url_util.cc.
+ CHAR_COMPONENT = 64, +}; + +// This table contains the flags in SharedCharTypes for each 8-bit character. +// Some canonicalization functions have their own specialized lookup table. +// For those with simple requirements, we have collected the flags in one +// place so there are fewer lookup tables to load into the CPU cache. +// +// Using an unsigned char type has a small but measurable performance benefit +// over using a 32-bit number. +extern const unsigned char kSharedCharTypeTable[0x100]; + +// More readable wrappers around the character type lookup table. +inline bool IsCharOfType(unsigned char c, SharedCharTypes type) { + return !!(kSharedCharTypeTable[c] & type); +} +inline bool IsQueryChar(unsigned char c) { + return IsCharOfType(c, CHAR_QUERY); +} +inline bool IsIPv4Char(unsigned char c) { + return IsCharOfType(c, CHAR_IPV4); +} +inline bool IsHexChar(unsigned char c) { + return IsCharOfType(c, CHAR_HEX); +} +inline bool IsComponentChar(unsigned char c) { + return IsCharOfType(c, CHAR_COMPONENT); +} + +// Appends the given string to the output, escaping characters that do not +// match the given |type| in SharedCharTypes. +void AppendStringOfType(const char* source, int length, + SharedCharTypes type, + CanonOutput* output); +void AppendStringOfType(const char16* source, int length, + SharedCharTypes type, + CanonOutput* output); + +// Maps the hex numerical values 0x0 to 0xf to the corresponding ASCII digit +// that will be used to represent it. +extern const char kHexCharLookup[0x10]; + +// This lookup table allows fast conversion between ASCII hex letters and their +// corresponding numerical value. The 8-bit range is divided up into 8 +// regions of 0x20 characters each. Each of the three character types (numbers, +// uppercase, lowercase) falls into different regions of this range. The table +// contains the amount to subtract from characters in that range to get at +// the corresponding numerical value. 
+// +// See HexCharToValue for the lookup. +extern const char kCharToHexLookup[8]; + +// Assumes the input is a valid hex digit! Call IsHexChar before using this. +inline unsigned char HexCharToValue(unsigned char c) { + return c - kCharToHexLookup[c / 0x20]; +} + +// Indicates if the given character is a dot or dot equivalent, returning the +// number of characters taken by it. This will be one for a literal dot, 3 for +// an escaped dot. If the character is not a dot, this will return 0. +template +inline int IsDot(const CHAR* spec, int offset, int end) { + if (spec[offset] == '.') { + return 1; + } else if (spec[offset] == '%' && offset + 3 <= end && + spec[offset + 1] == '2' && + (spec[offset + 2] == 'e' || spec[offset + 2] == 'E')) { + // Found "%2e" + return 3; + } + return 0; +} + +// Returns the canonicalized version of the input character according to scheme +// rules. This is implemented alongside the scheme canonicalizer, and is +// required for relative URL resolving to test for scheme equality. +// +// Returns 0 if the input character is not a valid scheme character. +char CanonicalSchemeChar(char16 ch); + +// Write a single character, escaped, to the output. This always escapes: it +// does no checking that the character requires escaping. +// Escaping makes sense only for 8-bit chars, so code works in all cases of +// input parameters (8/16bit). +template +inline void AppendEscapedChar(UINCHAR ch, + CanonOutputT* output) { + output->push_back('%'); + output->push_back(kHexCharLookup[(ch >> 4) & 0xf]); + output->push_back(kHexCharLookup[ch & 0xf]); +} + +// The character we'll substitute for undecodable or invalid characters. +extern const char16 kUnicodeReplacementCharacter; + +// UTF-8 functions ------------------------------------------------------------ + +// Reads one character in UTF-8 starting at |*begin| in |str| and places +// the decoded value into |*code_point|. If the character is valid, we will +// return true.
If invalid, we'll return false and put the +// kUnicodeReplacementCharacter into |*code_point|. +// +// |*begin| will be updated to point to the last character consumed so it +// can be incremented in a loop and will be ready for the next character. +// (for a single-byte ASCII character, it will not be changed). +// +// Implementation is in url_canon_icu.cc. +bool ReadUTFChar(const char* str, int* begin, int length, + unsigned* code_point_out); + +// Generic To-UTF-8 converter. This will call the given append method for each +// character that should be appended, with the given output method. Wrappers +// are provided below for escaped and non-escaped versions of this. +// +// The char_value must have already been checked that it's a valid Unicode +// character. +template +inline void DoAppendUTF8(unsigned char_value, Output* output) { + if (char_value <= 0x7f) { + Appender(static_cast(char_value), output); + } else if (char_value <= 0x7ff) { + // 110xxxxx 10xxxxxx + Appender(static_cast(0xC0 | (char_value >> 6)), + output); + Appender(static_cast(0x80 | (char_value & 0x3f)), + output); + } else if (char_value <= 0xffff) { + // 1110xxxx 10xxxxxx 10xxxxxx + Appender(static_cast(0xe0 | (char_value >> 12)), + output); + Appender(static_cast(0x80 | ((char_value >> 6) & 0x3f)), + output); + Appender(static_cast(0x80 | (char_value & 0x3f)), + output); + } else if (char_value <= 0x10FFFF) { // Max unicode code point. + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + Appender(static_cast(0xf0 | (char_value >> 18)), + output); + Appender(static_cast(0x80 | ((char_value >> 12) & 0x3f)), + output); + Appender(static_cast(0x80 | ((char_value >> 6) & 0x3f)), + output); + Appender(static_cast(0x80 | (char_value & 0x3f)), + output); + } else { + // Invalid UTF-8 character (>20 bits). + NOTREACHED(); + } +} + +// Helper used by AppendUTF8Value below. 
We use an unsigned parameter so there +// are no funny sign problems with the input, but then have to convert it to +// a regular char for appending. +inline void AppendCharToOutput(unsigned char ch, CanonOutput* output) { + output->push_back(static_cast(ch)); +} + +// Writes the given character to the output as UTF-8. This does NO checking +// of the validity of the unicode characters; the caller should ensure that +// the value it is appending is valid to append. +inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) { + DoAppendUTF8(char_value, output); +} + +// Writes the given character to the output as UTF-8, escaping ALL +// characters (even when they are ASCII). This does NO checking of the +// validity of the unicode characters; the caller should ensure that the value +// it is appending is valid to append. +inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) { + DoAppendUTF8(char_value, output); +} + +// UTF-16 functions ----------------------------------------------------------- + +// Reads one character in UTF-16 starting at |*begin| in |str| and places +// the decoded value into |*code_point|. If the character is valid, we will +// return true. If invalid, we'll return false and put the +// kUnicodeReplacementCharacter into |*code_point|. +// +// |*begin| will be updated to point to the last character consumed so it +// can be incremented in a loop and will be ready for the next character. +// (for a single-16-bit-word character, it will not be changed). +// +// Implementation is in url_canon_icu.cc. +bool ReadUTFChar(const char16* str, int* begin, int length, + unsigned* code_point); + +// Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method. 
+inline void AppendUTF16Value(unsigned code_point, + CanonOutputT* output) { + if (code_point > 0xffff) { + output->push_back(static_cast((code_point >> 10) + 0xd7c0)); + output->push_back(static_cast((code_point & 0x3ff) | 0xdc00)); + } else { + output->push_back(static_cast(code_point)); + } +} + +// Escaping functions --------------------------------------------------------- + +// Writes the given character to the output as UTF-8, escaped. Call this +// function only when the input is wide. Returns true on success. Failure +// means there was some problem with the encoding; we'll still try to +// update the |*begin| pointer and add a placeholder character to the +// output so processing can continue. +// +// We will append the character starting at str[*begin] with the buffer str +// being |length| long. |*begin| will be updated to point to the last character +// consumed (we may consume more than one for UTF-16) so that if called in +// a loop, incrementing the pointer will move to the next character. +// +// Every single output character will be escaped. This means that if you +// give it an ASCII character as input, it will be escaped. Some code uses +// this when it knows that a character is invalid according to its rules +// for validity. If you don't want escaping for ASCII characters, you will +// have to filter them out prior to calling this function. +// +// Assumes that str[*begin] is within range in the array, but does not assume +// that any following characters are. +inline bool AppendUTF8EscapedChar(const char16* str, int* begin, int length, + CanonOutput* output) { + // UTF-16 input. ReadUTFChar will handle invalid characters for us and give + // us the kUnicodeReplacementCharacter, so we don't have to do special + // checking after failure, just pass through the failure to the caller.
+ unsigned char_value; + bool success = ReadUTFChar(str, begin, length, &char_value); + AppendUTF8EscapedValue(char_value, output); + return success; +} + +// Handles UTF-8 input. See the wide version above for usage. +inline bool AppendUTF8EscapedChar(const char* str, int* begin, int length, + CanonOutput* output) { + // ReadUTFChar will handle invalid characters for us and give us the + // kUnicodeReplacementCharacter, so we don't have to do special checking + // after failure, just pass through the failure to the caller. + unsigned ch; + bool success = ReadUTFChar(str, begin, length, &ch); + AppendUTF8EscapedValue(ch, output); + return success; +} + +// Given a '%' character at |*begin| in the string |spec|, this will decode +// the escaped value and put it into |*unescaped_value| on success (returns +// true). On failure, this will return false, and will not write into +// |*unescaped_value|. +// +// |*begin| will be updated to point to the last character of the escape +// sequence so that when called with the index of a for loop, the next time +// through it will point to the next character to be considered. On failure, +// |*begin| will be unchanged. +inline bool Is8BitChar(char c) { + return true; // this case is specialized to avoid a warning +} +inline bool Is8BitChar(char16 c) { + return c <= 255; +} + +template +inline bool DecodeEscaped(const CHAR* spec, int* begin, int end, + unsigned char* unescaped_value) { + if (*begin + 3 > end || + !Is8BitChar(spec[*begin + 1]) || !Is8BitChar(spec[*begin + 2])) { + // Invalid escape sequence because there's not enough room, or the + // digits do not fit in 8 bits. + return false; + } + + unsigned char first = static_cast(spec[*begin + 1]); + unsigned char second = static_cast(spec[*begin + 2]); + if (!IsHexChar(first) || !IsHexChar(second)) { + // Invalid hex digits, fail. + return false; + } + + // Valid escape sequence.
+ *unescaped_value = (HexCharToValue(first) << 4) + HexCharToValue(second); + *begin += 2; + return true; +} + +// Appends the given substring to the output, escaping "some" characters that +// it feels may not be safe. It assumes the input values are all contained in +// 8-bit although it allows any type. +// +// This is used in error cases to append invalid output so that it looks +// approximately correct. Non-error cases should not call this function since +// the escaping rules are not guaranteed! +void AppendInvalidNarrowString(const char* spec, int begin, int end, + CanonOutput* output); +void AppendInvalidNarrowString(const char16* spec, int begin, int end, + CanonOutput* output); + +// Misc canonicalization helpers ---------------------------------------------- + +// Converts between UTF-8 and UTF-16, returning true on successful conversion. +// The output will be appended to the given canonicalizer output (so make sure +// it's empty if you want to replace). +// +// On invalid input, this will still write as much output as possible, +// replacing the invalid characters with the "invalid character". It will +// return false in the failure case, and the caller should not continue as +// normal. +bool ConvertUTF16ToUTF8(const char16* input, int input_len, + CanonOutput* output); +bool ConvertUTF8ToUTF16(const char* input, int input_len, + CanonOutputT* output); + +// Converts from UTF-16 to 8-bit using the character set converter. If the +// converter is NULL, this will use UTF-8. +void ConvertUTF16ToQueryEncoding(const char16* input, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output); + +// Applies the replacements to the given component source. The component source +// should be pre-initialized to the "old" base. That is, all pointers will +// point to the spec of the old URL, and all of the Parsed components will +// be indices into that string. 
+// +// The pointers and components in the |source| for all non-NULL strings in the +// |repl| (replacements) will be updated to reference those strings. +// Canonicalizing with the new |source| and |parsed| can then combine URL +// components from many different strings. +void SetupOverrideComponents(const char* base, + const Replacements& repl, + URLComponentSource* source, + url_parse::Parsed* parsed); + +// Like the above 8-bit version, except that it additionally converts the +// UTF-16 input to UTF-8 before doing the overrides. +// +// The given utf8_buffer is used to store the converted components. They will +// be appended one after another, with the parsed structure identifying the +// appropriate substrings. This buffer is a parameter because the source has +// no storage, so the buffer must have the same lifetime as the source +// parameter owned by the caller. +// +// THE CALLER MUST NOT ADD TO THE |utf8_buffer| AFTER THIS CALL. Members of +// |source| will point into this buffer, which could be invalidated if +// additional data is added and the CanonOutput resizes its buffer. +// +// Returns true on success. False means that the input was not valid UTF-16, +// although we will have still done the override with "invalid characters" in +// place of errors. +bool SetupUTF16OverrideComponents(const char* base, + const Replacements& repl, + CanonOutput* utf8_buffer, + URLComponentSource* source, + url_parse::Parsed* parsed); + +// Implemented in url_canon_path.cc, these are required by the relative URL +// resolver as well, so we declare them here.
+bool CanonicalizePartialPath(const char* spec, + const url_parse::Component& path, + int path_begin_in_output, + CanonOutput* output); +bool CanonicalizePartialPath(const char16* spec, + const url_parse::Component& path, + int path_begin_in_output, + CanonOutput* output); + +#ifndef WIN32 + +// Implementations of Windows' int-to-string conversions +int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix); +int _itow_s(int value, char16* buffer, size_t size_in_chars, int radix); + +// Secure template overloads for these functions +template +inline int _itoa_s(int value, char (&buffer)[N], int radix) { + return _itoa_s(value, buffer, N, radix); +} + +template +inline int _itow_s(int value, char16 (&buffer)[N], int radix) { + return _itow_s(value, buffer, N, radix); +} + +// _strtoui64 and strtoull behave the same +inline unsigned long long _strtoui64(const char* nptr, + char** endptr, int base) { + return strtoull(nptr, endptr, base); +} + +#endif // WIN32 + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_INTERNAL_H__ diff --git a/psol/include/googleurl/src/url_canon_internal_file.h b/psol/include/googleurl/src/url_canon_internal_file.h new file mode 100644 index 000000000..63a9c5b85 --- /dev/null +++ b/psol/include/googleurl/src/url_canon_internal_file.h @@ -0,0 +1,157 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// As with url_canon_internal.h, this file is intended to be included in +// another C++ file where the template types are defined. This allows the +// programmer to use these functions for their own string +// types, without bloating the code by having inline templates used in +// every call site. +// +// *** This file must be included after url_canon_internal.h as we depend on some +// functions in it. *** + +#ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__ +#define GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__ + +#include "googleurl/src/url_file.h" +#include "googleurl/src/url_parse_internal.h" + +using namespace url_canon; + +// Given a pointer into the spec, this copies and canonicalizes the drive +// letter and colon to the output, if one is found. If there is not a drive +// spec, it won't do anything. The index of the next character in the input +// spec is returned (after the colon when a drive spec is found, the begin +// offset if one is not).
+template +static int FileDoDriveSpec(const CHAR* spec, int begin, int end, + CanonOutput* output) { + // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo, + // (with backslashes instead of slashes as well). + int num_slashes = CountConsecutiveSlashes(spec, begin, end); + int after_slashes = begin + num_slashes; + + if (!DoesBeginWindowsDriveSpec(spec, after_slashes, end)) + return begin; // Haven't consumed any characters + + // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid + // and that it is followed by a colon/pipe. + + // Normalize Windows drive letters to uppercase + if (spec[after_slashes] >= 'a' && spec[after_slashes] <= 'z') + output->push_back(spec[after_slashes] - 'a' + 'A'); + else + output->push_back(static_cast(spec[after_slashes])); + + // Normalize the character following it to a colon rather than pipe. + output->push_back(':'); + output->push_back('/'); + return after_slashes + 2; +} + +// FileDoDriveSpec (or the caller) will have already added the leading slash, so we need to +// write everything following the slashes using the path canonicalizer. +template +static void FileDoPath(const CHAR* spec, int begin, int end, + CanonOutput* output) { + // Normalize the number of slashes after the drive letter. The path + // canonicalizer expects the input to begin in a slash already so + // doesn't check. We also want to handle input with no slashes. + int num_slashes = CountConsecutiveSlashes(spec, begin, end); + int after_slashes = begin + num_slashes; + + // Now use the regular path canonicalizer to canonicalize the rest of the + // path. We supply it with the path following the slashes. It won't prepend + // a slash because it assumes any nonempty path already starts with one. + // We explicitly filter out calls with no path here to prevent that case. + ParsedURL::Component sub_path(after_slashes, end - after_slashes); + if (sub_path.len > 0) { + // Give it a fake output component to write into.
DoCanonicalizeFile will + // compute the full path component. + ParsedURL::Component fake_output_path; + URLCanonInternal::DoPath( + spec, sub_path, output, &fake_output_path); + } +} + +template +static bool DoCanonicalizeFileURL(const URLComponentSource& source, + const ParsedURL& parsed, + CanonOutput* output, + ParsedURL* new_parsed) { + // Things we don't set in file: URLs. + new_parsed->username = ParsedURL::Component(0, -1); + new_parsed->password = ParsedURL::Component(0, -1); + new_parsed->port = ParsedURL::Component(0, -1); + + // Scheme (known, so we don't bother running it through the more + // complicated scheme canonicalizer). + new_parsed->scheme.begin = output->length(); + output->push_back('f'); + output->push_back('i'); + output->push_back('l'); + output->push_back('e'); + new_parsed->scheme.len = output->length() - new_parsed->scheme.begin; + output->push_back(':'); + + // Write the separator for the host. + output->push_back('/'); + output->push_back('/'); + + // Append the host. For many file URLs, this will be empty. For UNC, this + // will be present. + // TODO(brettw) This doesn't do any checking for host name validity. We + // should probably handle validity checking of UNC hosts differently than + // for regular IP hosts. + bool success = URLCanonInternal::DoHost( + source.host, parsed.host, output, &new_parsed->host); + + // Write a separator for the start of the path. We'll ignore any slashes + // already at the beginning of the path. + new_parsed->path.begin = output->length(); + output->push_back('/'); + + // Copies and normalizes the "c:" at the beginning, if present. + int after_drive = FileDoDriveSpec(source.path, parsed.path.begin, + parsed.path.end(), output); + + // Copies the rest of the path + FileDoPath(source.path, after_drive, parsed.path.end(), output); + new_parsed->path.len = output->length() - new_parsed->path.begin; + + // Things following the path we can use the standard canonicalizers for. 
+ success &= URLCanonInternal::DoQuery( + source.query, parsed.query, output, &new_parsed->query); + success &= URLCanonInternal::DoRef( + source.ref, parsed.ref, output, &new_parsed->ref); + + return success; +} + +#endif // GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__ diff --git a/psol/include/googleurl/src/url_canon_ip.h b/psol/include/googleurl/src/url_canon_ip.h new file mode 100644 index 000000000..0a01c9f8e --- /dev/null +++ b/psol/include/googleurl/src/url_canon_ip.h @@ -0,0 +1,101 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_URL_CANON_IP_H__ +#define GOOGLEURL_SRC_URL_CANON_IP_H__ + +#include "base/string16.h" +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_common.h" +#include "googleurl/src/url_parse.h" + +namespace url_canon { + +// Searches the host name for the portions of the IPv4 address. On success, +// each component will be placed into |components| and it will return true. +// It will return false if the host can not be separated as an IPv4 address +// or if there are any non-7-bit characters or other characters that can not +// be in an IP address. (This is important so we fail as early as possible for +// common non-IP hostnames.) +// +// Not all components may exist. If there are only 3 components, for example, +// the last one will have a length of -1 or 0 to indicate it does not exist. +// +// Note that many platforms' inet_addr will ignore everything after a space +// in certain circumstances if the stuff before the space looks like an IP +// address. IE6 is included in this. We do NOT handle this case. In many cases, +// the browser's canonicalization will get run before this which converts +// spaces to %20 (in the case of IE7) or rejects them (in the case of +// Mozilla), so this code path never gets hit. Our host canonicalization will +// notice these spaces and escape them, which will make IP address finding +// fail. This seems like better behavior than stripping after a space.
+GURL_API bool FindIPv4Components(const char* spec, + const url_parse::Component& host, + url_parse::Component components[4]); +GURL_API bool FindIPv4Components(const char16* spec, + const url_parse::Component& host, + url_parse::Component components[4]); + +// Converts an IPv4 address to a 32-bit number (network byte order). +// +// Possible return values: +// IPV4 - IPv4 address was successfully parsed. +// BROKEN - Input was formatted like an IPv4 address, but overflow occurred +// during parsing. +// NEUTRAL - Input couldn't possibly be interpreted as an IPv4 address. +// It might be an IPv6 address, or a hostname. +// +// On success, |num_ipv4_components| will be populated with the number of +// components in the IPv4 address. +GURL_API CanonHostInfo::Family IPv4AddressToNumber( + const char* spec, + const url_parse::Component& host, + unsigned char address[4], + int* num_ipv4_components); +GURL_API CanonHostInfo::Family IPv4AddressToNumber( + const char16* spec, + const url_parse::Component& host, + unsigned char address[4], + int* num_ipv4_components); + +// Converts an IPv6 address to a 128-bit number (network byte order), returning +// true on success. False means that the input was not a valid IPv6 address. +// +// NOTE that |host| is expected to be surrounded by square brackets. +// i.e. "[::1]" rather than "::1". +GURL_API bool IPv6AddressToNumber(const char* spec, + const url_parse::Component& host, + unsigned char address[16]); +GURL_API bool IPv6AddressToNumber(const char16* spec, + const url_parse::Component& host, + unsigned char address[16]); + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_IP_H__ diff --git a/psol/include/googleurl/src/url_canon_stdstring.h b/psol/include/googleurl/src/url_canon_stdstring.h new file mode 100644 index 000000000..c43b77760 --- /dev/null +++ b/psol/include/googleurl/src/url_canon_stdstring.h @@ -0,0 +1,134 @@ +// Copyright 2007, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This header file defines a canonicalizer output method class for STL +// strings. Because the canonicalizer tries not to be dependent on the STL, +// we have segregated it here. + +#ifndef GOOGLEURL_SRC_URL_CANON_STDSTRING_H__ +#define GOOGLEURL_SRC_URL_CANON_STDSTRING_H__ + +#include +#include "googleurl/src/url_canon.h" + +namespace url_canon { + +// Write into a std::string given in the constructor. 
This object does not own +// the string itself, and the user must ensure that the string stays alive +// throughout the lifetime of this object. +// +// The given string will be appended to; any existing data in the string will +// be preserved. The caller should reserve() the amount of data in the string +// they expect to be written. We will resize if necessary, but that's slow. +// +// Note that when canonicalization is complete, the string will likely have +// unused space at the end because we make the string very big to start out +// with (by |initial_size|). This ends up being important because resize +// operations are slow, and because the base class needs to write directly +// into the buffer. +// +// Therefore, the user should call Complete() before using the string that +// this class wrote into. +class StdStringCanonOutput : public CanonOutput { + public: + StdStringCanonOutput(std::string* str) + : CanonOutput(), + str_(str) { + cur_len_ = static_cast(str_->size()); // Append to existing data. + str_->resize(str_->capacity()); + buffer_ = &(*str_)[0]; + buffer_len_ = static_cast(str_->size()); + } + virtual ~StdStringCanonOutput() { + // Nothing to do, we don't own the string. + } + + // Must be called after writing has completed but before the string is used. + void Complete() { + str_->resize(cur_len_); + buffer_len_ = cur_len_; + } + + virtual void Resize(int sz) { + str_->resize(sz); + buffer_ = &(*str_)[0]; + buffer_len_ = sz; + } + + protected: + std::string* str_; +}; + +// An extension of the Replacements class that allows the setters to use +// standard strings. +// +// The strings passed as arguments are not copied and must remain valid until +// this class goes out of scope. 
+template +class StdStringReplacements : + public url_canon::Replacements { + public: + void SetSchemeStr(const STR& s) { + this->SetScheme(s.data(), + url_parse::Component(0, static_cast(s.length()))); + } + void SetUsernameStr(const STR& s) { + this->SetUsername(s.data(), + url_parse::Component(0, static_cast(s.length()))); + } + void SetPasswordStr(const STR& s) { + this->SetPassword(s.data(), + url_parse::Component(0, static_cast(s.length()))); + } + void SetHostStr(const STR& s) { + this->SetHost(s.data(), + url_parse::Component(0, static_cast(s.length()))); + } + void SetPortStr(const STR& s) { + this->SetPort(s.data(), + url_parse::Component(0, static_cast(s.length()))); + } + void SetPathStr(const STR& s) { + this->SetPath(s.data(), + url_parse::Component(0, static_cast(s.length()))); + } + void SetQueryStr(const STR& s) { + this->SetQuery(s.data(), + url_parse::Component(0, static_cast(s.length()))); + } + void SetRefStr(const STR& s) { + this->SetRef(s.data(), + url_parse::Component(0, static_cast(s.length()))); + } +}; + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_STDSTRING_H__ + diff --git a/psol/include/googleurl/src/url_common.h b/psol/include/googleurl/src/url_common.h new file mode 100644 index 000000000..ac045a8ce --- /dev/null +++ b/psol/include/googleurl/src/url_common.h @@ -0,0 +1,54 @@ +// Copyright 2010, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_URL_COMMON_H__ +#define GOOGLEURL_SRC_URL_COMMON_H__ + +#if !defined(GURL_IMPLEMENTATION) +#define GURL_IMPLEMENTATION 0 +#endif + +#if defined(GURL_DLL) +#if defined(WIN32) +#if GURL_IMPLEMENTATION +#define GURL_API __declspec(dllexport) +#else +#define GURL_API __declspec(dllimport) +#endif +#else +// Non-Windows DLLs. +#define GURL_API __attribute__((visibility("default"))) +#endif +#else +// Not a DLL. +#define GURL_API +#endif + +#endif // GOOGLEURL_SRC_URL_COMMON_H__ + diff --git a/psol/include/googleurl/src/url_file.h b/psol/include/googleurl/src/url_file.h new file mode 100644 index 000000000..c1b8ac9c5 --- /dev/null +++ b/psol/include/googleurl/src/url_file.h @@ -0,0 +1,108 @@ +// Copyright 2007, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Provides shared functions used by the internals of the parser and +// canonicalizer for file URLs. Do not use outside of these modules. + +#ifndef GOOGLEURL_SRC_URL_FILE_H__ +#define GOOGLEURL_SRC_URL_FILE_H__ + +#include "googleurl/src/url_parse_internal.h" + +namespace url_parse { + +#ifdef WIN32 + +// We allow both "c:" and "c|" as drive identifiers. 
+inline bool IsWindowsDriveSeparator(char16 ch) { + return ch == ':' || ch == '|'; +} +inline bool IsWindowsDriveLetter(char16 ch) { + return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); +} + +#endif // WIN32 + +// Returns the index of the next slash in the input after the given index, or +// spec_len if the end of the input is reached. +template +inline int FindNextSlash(const CHAR* spec, int begin_index, int spec_len) { + int idx = begin_index; + while (idx < spec_len && !IsURLSlash(spec[idx])) + idx++; + return idx; +} + +#ifdef WIN32 + +// Returns true if the start_offset in the given spec looks like it begins a +// drive spec, for example "c:". This function explicitly handles start_offset +// values that are equal to or larger than the spec_len to simplify callers. +// +// If this returns true, the spec is guaranteed to have a valid drive letter +// plus a colon starting at |start_offset|. +template +inline bool DoesBeginWindowsDriveSpec(const CHAR* spec, int start_offset, + int spec_len) { + int remaining_len = spec_len - start_offset; + if (remaining_len < 2) + return false; // Not enough room. + if (!IsWindowsDriveLetter(spec[start_offset])) + return false; // Doesn't start with a valid drive letter. + if (!IsWindowsDriveSeparator(spec[start_offset + 1])) + return false; // Isn't followed with a drive separator. + return true; +} + +// Returns true if the start_offset in the given text looks like it begins a +// UNC path, for example "\\". This function explicitly handles start_offset +// values that are equal to or larger than the spec_len to simplify callers. +// +// When strict_slashes is set, this function will only accept backslashes as is +// standard for Windows. Otherwise, it will accept forward slashes as well +// which we use for a lot of URL handling. 
+template +inline bool DoesBeginUNCPath(const CHAR* text, + int start_offset, + int len, + bool strict_slashes) { + int remaining_len = len - start_offset; + if (remaining_len < 2) + return false; + + if (strict_slashes) + return text[start_offset] == '\\' && text[start_offset + 1] == '\\'; + return IsURLSlash(text[start_offset]) && IsURLSlash(text[start_offset + 1]); +} + +#endif // WIN32 + +} // namespace url_parse + +#endif // GOOGLEURL_SRC_URL_FILE_H__ diff --git a/psol/include/googleurl/src/url_parse.h b/psol/include/googleurl/src/url_parse.h new file mode 100644 index 000000000..64bb223ee --- /dev/null +++ b/psol/include/googleurl/src/url_parse.h @@ -0,0 +1,336 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_URL_PARSE_H__ +#define GOOGLEURL_SRC_URL_PARSE_H__ + +#include + +#include "base/basictypes.h" +#include "base/string16.h" +#include "googleurl/src/url_common.h" + +namespace url_parse { + +// Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and +// KURLGoogle.cpp still rely on this type. +typedef char16 UTF16Char; + +// Component ------------------------------------------------------------------ + +// Represents a substring for URL parsing. +struct Component { + Component() : begin(0), len(-1) {} + + // Normal constructor: takes an offset and a length. + Component(int b, int l) : begin(b), len(l) {} + + int end() const { + return begin + len; + } + + // Returns true if this component is valid, meaning the length is given. Even + // valid components may be empty to record the fact that they exist. + bool is_valid() const { + return (len != -1); + } + + // Returns true if the component is specified and nonempty; on false, it + // is either empty or invalid. + bool is_nonempty() const { + return (len > 0); + } + + void reset() { + begin = 0; + len = -1; + } + + bool operator==(const Component& other) const { + return begin == other.begin && len == other.len; + } + + int begin; // Byte offset in the string of this component. + int len; // Will be -1 if the component is unspecified. +}; + +// Helper that returns a component created with the given begin and ending +// points.
The ending point is non-inclusive. +inline Component MakeRange(int begin, int end) { + return Component(begin, end - begin); +} + +// Parsed --------------------------------------------------------------------- + +// A structure that holds the identified parts of an input URL. This structure +// does NOT store the URL itself. The caller will have to store the URL text +// and its corresponding Parsed structure separately. +// +// Typical usage would be: +// +// url_parse::Parsed parsed; +// url_parse::Component scheme; +// if (!url_parse::ExtractScheme(url, url_len, &scheme)) +// return I_CAN_NOT_FIND_THE_SCHEME_DUDE; +// +// if (IsStandardScheme(url, scheme)) // Not provided by this component +// url_parse::ParseStandardURL(url, url_len, &parsed); +// else if (IsFileURL(url, scheme)) // Not provided by this component +// url_parse::ParseFileURL(url, url_len, &parsed); +// else +// url_parse::ParsePathURL(url, url_len, &parsed); +// +struct Parsed { + // Identifies different components. + enum ComponentType { + SCHEME, + USERNAME, + PASSWORD, + HOST, + PORT, + PATH, + QUERY, + REF, + }; + + // The default constructor is sufficient for the components. + GURL_API Parsed(); + + // Returns the length of the URL (the end of the last component). + // + // Note that for some invalid, non-canonical URLs, this may not be the length + // of the string. For example "http://": the parsed structure will only + // contain an entry for the four-character scheme, and it doesn't know about + // the "://". For all other last-components, it will return the real length. + GURL_API int Length() const; + + // Returns the number of characters before the given component if it exists, + // or where the component would be if it did exist. This will return the + // string length if the component would be appended to the end. + // + // Note that this can get a little funny for the port, query, and ref + // components which have a delimiter that is not counted as part of the + // component.
The |include_delimiter| flag controls if you want this counted + // as part of the component or not when the component exists. + // + // This example shows the difference between the two flags for two of these + // delimited components that are present (the port and query) and one that + // isn't (the reference). The components that this flag affects are marked + // with a *. + // 0 1 2 + // 012345678901234567890 + // Example input: http://foo:80/?query + // include_delim=true, ...=false ("<-" indicates different) + // SCHEME: 0 0 + // USERNAME: 5 5 + // PASSWORD: 5 5 + // HOST: 7 7 + // *PORT: 10 11 <- + // PATH: 13 13 + // *QUERY: 14 15 <- + // *REF: 20 20 + // + GURL_API int CountCharactersBefore(ComponentType type, + bool include_delimiter) const; + + // Scheme without the colon: "http://foo/" would have a scheme of "http". + // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there + // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed + // to start at the beginning of the string if there are preceding whitespace + // or control characters. + Component scheme; + + // Username. Specified in URLs with an @ sign before the host. See |password| + Component username; + + // Password. The length will be -1 if unspecified, 0 if specified but empty. + // Not all URLs with a username have a password, as in "http://me@host/". + // The password is separated from the username with a colon, as in + // "http://me:secret@host/" + Component password; + + // Host name. + Component host; + + // Port number. + Component port; + + // Path, this is everything following the host name. Length will be -1 if + // unspecified. This includes the preceding slash, so the path on + // "http://www.google.com/asdf" is "/asdf". As a result, it is impossible to + // have a 0 length path, it will be -1 in cases like "http://host?foo". + // Note that we treat backslashes the same as slashes. + Component path; + + // Stuff between the ?
and the # after the path. This does not include the + // preceding ? character. Length will be -1 if unspecified, 0 if there is + // a question mark but no query string. + Component query; + + // Indicated by a #, this is everything following the hash sign (not + // including it). If there are multiple hash signs, we'll use the last one. + // Length will be -1 if there is no hash sign, or 0 if there is one but + // nothing follows it. + Component ref; +}; + +// Initialization functions --------------------------------------------------- +// +// These functions parse the given URL, filling in all of the structure's +// components. These functions can not fail, they will always do their best +// at interpreting the input given. +// +// The string length of the URL MUST be specified, we do not check for NULLs +// at any point in the process, and will actually handle embedded NULLs. +// +// IMPORTANT: These functions do NOT hang on to the given pointer or copy it +// in any way. See the comment above the struct. +// +// The 8-bit versions require UTF-8 encoding. + +// StandardURL is for when the scheme is known to be one that has an +// authority (host) like "http". This function will not handle weird ones +// like "about:" and "javascript:", or do the right thing for "file:" URLs. +GURL_API void ParseStandardURL(const char* url, int url_len, Parsed* parsed); +GURL_API void ParseStandardURL(const char16* url, int url_len, Parsed* parsed); + +// PathURL is for when the scheme is known not to have an authority (host) +// section but that aren't file URLs either. The scheme is parsed, and +// everything after the scheme is considered as the path. This is used for +// things like "about:" and "javascript:" +GURL_API void ParsePathURL(const char* url, int url_len, Parsed* parsed); +GURL_API void ParsePathURL(const char16* url, int url_len, Parsed* parsed); + +// FileURL is for file URLs. There are some special rules for interpreting +// these.
+GURL_API void ParseFileURL(const char* url, int url_len, Parsed* parsed); +GURL_API void ParseFileURL(const char16* url, int url_len, Parsed* parsed); + +// MailtoURL is for mailto: urls. They are made up scheme,path,query +GURL_API void ParseMailtoURL(const char* url, int url_len, Parsed* parsed); +GURL_API void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed); + +// Helper functions ----------------------------------------------------------- + +// Locates the scheme according to the URL parser's rules. This function is +// designed so the caller can find the scheme and call the correct Init* +// function according to their known scheme types. +// +// It also does not perform any validation on the scheme. +// +// This function will return true if the scheme is found and will put the +// scheme's range into *scheme. False means no scheme could be found. Note +// that a URL beginning with a colon has a scheme, but it is empty, so this +// function will return true but *scheme will = (0,0). +// +// The scheme is found by skipping spaces and control characters at the +// beginning, and taking everything from there to the first colon to be the +// scheme. The character at scheme.end() will be the colon (we may enhance +// this to handle full width colons or something, so don't count on the +// actual character value). The character at scheme.end()+1 will be the +// beginning of the rest of the URL, be it the authority or the path (or the +// end of the string). +// +// The 8-bit version requires UTF-8 encoding. +GURL_API bool ExtractScheme(const char* url, int url_len, Component* scheme); +GURL_API bool ExtractScheme(const char16* url, int url_len, Component* scheme); + +// Returns true if ch is a character that terminates the authority segment +// of a URL. +GURL_API bool IsAuthorityTerminator(char16 ch); + +// Does a best effort parse of input |spec|, in range |auth|. If a particular +// component is not found, it will be set to invalid. 
+GURL_API void ParseAuthority(const char* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num); +GURL_API void ParseAuthority(const char16* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num); + +// Computes the integer port value from the given port component. The port +// component should have been identified by one of the init functions on +// |Parsed| for the given input url. +// +// The return value will be a positive integer between 0 and 64K, or one of +// the two special values below. +enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 }; +GURL_API int ParsePort(const char* url, const Component& port); +GURL_API int ParsePort(const char16* url, const Component& port); + +// Extracts the range of the file name in the given url. The path must +// already have been computed by the parse function, and the matching URL +// and extracted path are provided to this function. The filename is +// defined as being everything from the last slash/backslash of the path +// to the end of the path. +// +// The file name will be empty if the path is empty or there is nothing +// following the last slash. +// +// The 8-bit version requires UTF-8 encoding. +GURL_API void ExtractFileName(const char* url, + const Component& path, + Component* file_name); +GURL_API void ExtractFileName(const char16* url, + const Component& path, + Component* file_name); + +// Extract the first key/value from the range defined by |*query|. Updates +// |*query| to start at the end of the extracted key/value pair. This is +// designed for use in a loop: you can keep calling it with the same query +// object and it will iterate over all items in the query. +// +// Some key/value pairs may have the key, the value, or both be empty (for +// example, the query string "?&"). These will be returned. Note that an empty +// last parameter "foo.com?" 
or foo.com?a&" will not be returned, this case +// is the same as "done." +// +// The initial query component should not include the '?' (this is the default +// for parsed URLs). +// +// If no key/value are found |*key| and |*value| will be unchanged and it will +// return false. +GURL_API bool ExtractQueryKeyValue(const char* url, + Component* query, + Component* key, + Component* value); +GURL_API bool ExtractQueryKeyValue(const char16* url, + Component* query, + Component* key, + Component* value); + +} // namespace url_parse + +#endif // GOOGLEURL_SRC_URL_PARSE_H__ diff --git a/psol/include/googleurl/src/url_parse_internal.h b/psol/include/googleurl/src/url_parse_internal.h new file mode 100644 index 000000000..61bd0687f --- /dev/null +++ b/psol/include/googleurl/src/url_parse_internal.h @@ -0,0 +1,112 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Contains common inline helper functions used by the URL parsing routines. + +#ifndef GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__ +#define GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__ + +#include "googleurl/src/url_parse.h" + +namespace url_parse { + +// We treat slashes and backslashes the same for IE compatibility. +inline bool IsURLSlash(char16 ch) { + return ch == '/' || ch == '\\'; +} + +// Returns true if we should trim this character from the URL because it is a +// space or a control character. +inline bool ShouldTrimFromURL(char16 ch) { + return ch <= ' '; +} + +// Given an already-initialized begin index and length, this shrinks the range +// to eliminate "should-be-trimmed" characters. Note that the length does *not* +// indicate the length of untrimmed data from |*begin|, but rather the position +// in the input string (so the string starts at character |*begin| in the spec, +// and goes until |*len|). +template +inline void TrimURL(const CHAR* spec, int* begin, int* len) { + // Strip leading whitespace and control characters. + while (*begin < *len && ShouldTrimFromURL(spec[*begin])) + (*begin)++; + + // Strip trailing whitespace and control characters. We need the >i test for + // when the input string is all blanks; we don't want to back past the input.
+ while (*len > *begin && ShouldTrimFromURL(spec[*len - 1])) + (*len)--; +} + +// Counts the number of consecutive slashes starting at the given offset +// in the given string of the given length. +template +inline int CountConsecutiveSlashes(const CHAR *str, + int begin_offset, int str_len) { + int count = 0; + while (begin_offset + count < str_len && + IsURLSlash(str[begin_offset + count])) + ++count; + return count; +} + +// Internal functions in url_parse.cc that parse the path, that is, everything +// following the authority section. The input is the range of everything +// following the authority section, and the output is the identified ranges. +// +// This is designed for the file URL parser or other consumers who may do +// special stuff at the beginning, but want regular path parsing, it just +// maps to the internal parsing function for paths. +void ParsePathInternal(const char* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref); +void ParsePathInternal(const char16* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref); + + +// Given a spec and a pointer to the character after the colon following the +// scheme, this parses it and fills in the structure, Every item in the parsed +// structure is filled EXCEPT for the scheme, which is untouched. +void ParseAfterScheme(const char* spec, + int spec_len, + int after_scheme, + Parsed* parsed); +void ParseAfterScheme(const char16* spec, + int spec_len, + int after_scheme, + Parsed* parsed); + +} // namespace url_parse + +#endif // GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__ diff --git a/psol/include/googleurl/src/url_test_utils.h b/psol/include/googleurl/src/url_test_utils.h new file mode 100644 index 000000000..6278e3fbb --- /dev/null +++ b/psol/include/googleurl/src/url_test_utils.h @@ -0,0 +1,78 @@ +// Copyright 2007 Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Convenience functions for string conversions. +// These are mostly intended for use in unit tests. + +#ifndef GOOGLEURL_SRC_URL_TEST_UTILS_H__ +#define GOOGLEURL_SRC_URL_TEST_UTILS_H__ + +#include + +#include "base/string16.h" +#include "googleurl/src/url_canon_internal.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace url_test_utils { + +// Converts a UTF-16 string from native wchar_t format to char16, by +// truncating the high 32 bits. 
This is not meant to handle true UTF-32 +// encoded strings. +inline string16 WStringToUTF16(const wchar_t* src) { + string16 str; + int length = static_cast(wcslen(src)); + for (int i = 0; i < length; ++i) { + str.push_back(static_cast(src[i])); + } + return str; +} + +// Converts a string from UTF-8 to UTF-16 +inline string16 ConvertUTF8ToUTF16(const std::string& src) { + int length = static_cast(src.length()); + EXPECT_LT(length, 1024); + url_canon::RawCanonOutputW<1024> output; + EXPECT_TRUE(url_canon::ConvertUTF8ToUTF16(src.data(), length, &output)); + return string16(output.data(), output.length()); +} + +// Converts a string from UTF-16 to UTF-8 +inline std::string ConvertUTF16ToUTF8(const string16& src) { + std::string str; + url_canon::StdStringCanonOutput output(&str); + EXPECT_TRUE(url_canon::ConvertUTF16ToUTF8(src.data(), + static_cast(src.length()), + &output)); + output.Complete(); + return str; +} + +} // namespace url_test_utils + +#endif // GOOGLEURL_SRC_URL_TEST_UTILS_H__ diff --git a/psol/include/googleurl/src/url_util.h b/psol/include/googleurl/src/url_util.h new file mode 100644 index 000000000..9e53d2d32 --- /dev/null +++ b/psol/include/googleurl/src/url_util.h @@ -0,0 +1,228 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_URL_UTIL_H__ +#define GOOGLEURL_SRC_URL_UTIL_H__ + +#include + +#include "base/string16.h" +#include "googleurl/src/url_common.h" +#include "googleurl/src/url_parse.h" +#include "googleurl/src/url_canon.h" + +namespace url_util { + +// Init ------------------------------------------------------------------------ + +// Initialization is NOT required, it will be implicitly initialized when first +// used. However, this implicit initialization is NOT threadsafe. If you are +// using this library in a threaded environment and don't have a consistent +// "first call" (an example might be calling "AddStandardScheme" with your +// special application-specific schemes) then you will want to call initialize +// before spawning any threads. +// +// It is OK to call this function more than once, subsequent calls will simply +// "noop", unless Shutdown() was called in the mean time. This will also be a +// "noop" if other calls to the library have forced an initialization +// beforehand. +GURL_API void Initialize(); + +// Cleanup is not required, except some strings may leak. For most user +// applications, this is fine. 
If you're using it in a library that may get +// loaded and unloaded, you'll want to unload to properly clean up your +// library. +GURL_API void Shutdown(); + +// Schemes -------------------------------------------------------------------- + +// Adds an application-defined scheme to the internal list of "standard" URL +// schemes. This function is not threadsafe and can not be called concurrently +// with any other url_util function. It will assert if the list of standard +// schemes has been locked (see LockStandardSchemes). +GURL_API void AddStandardScheme(const char* new_scheme); + +// Sets a flag to prevent future calls to AddStandardScheme from succeeding. +// +// This is designed to help prevent errors for multithreaded applications. +// Normal usage would be to call AddStandardScheme for your custom schemes at +// the beginning of program initialization, and then LockStandardSchemes. This +// prevents future callers from mistakenly calling AddStandardScheme when the +// program is running with multiple threads, where such usage would be +// dangerous. +// +// We could have had AddStandardScheme use a lock instead, but that would add +// some platform-specific dependencies we don't otherwise have now, and is +// overkill considering the normal usage is so simple. +GURL_API void LockStandardSchemes(); + +// Locates the scheme in the given string and places it into |found_scheme|, +// which may be NULL to indicate the caller does not care about the range. +// +// Returns whether the given |compare| scheme matches the scheme found in the +// input (if any). The |compare| scheme must be a valid canonical scheme or +// the result of the comparison is undefined. 
+GURL_API bool FindAndCompareScheme(const char* str, + int str_len, + const char* compare, + url_parse::Component* found_scheme); +GURL_API bool FindAndCompareScheme(const char16* str, + int str_len, + const char* compare, + url_parse::Component* found_scheme); +inline bool FindAndCompareScheme(const std::string& str, + const char* compare, + url_parse::Component* found_scheme) { + return FindAndCompareScheme(str.data(), static_cast(str.size()), + compare, found_scheme); +} +inline bool FindAndCompareScheme(const string16& str, + const char* compare, + url_parse::Component* found_scheme) { + return FindAndCompareScheme(str.data(), static_cast(str.size()), + compare, found_scheme); +} + +// Returns true if the given string represents a standard URL. This means that +// the scheme is in the list of known standard schemes. +GURL_API bool IsStandard(const char* spec, + const url_parse::Component& scheme); +GURL_API bool IsStandard(const char16* spec, + const url_parse::Component& scheme); + +// TODO(brettw) remove this. This is a temporary compatibility hack to avoid +// breaking the WebKit build when this version is synced via Chrome. +// |spec_len| is ignored; the call forwards to the two-argument overload. +inline bool IsStandard(const char* spec, int spec_len, + const url_parse::Component& scheme) { + return IsStandard(spec, scheme); +} + +// URL library wrappers ------------------------------------------------------- + +// Parses the given spec according to the extracted scheme type. Normal users +// should use the URL object, although this may be useful if performance is +// critical and you don't want to do the heap allocation for the std::string. +// +// As with the url_canon::Canonicalize* functions, the charset converter can +// be NULL to use UTF-8 (it will be faster in this case). +// +// Returns true if a valid URL was produced, false if not. On failure, the +// output and parsed structures will still be filled and will be consistent, +// but they will not represent a loadable URL. 
+GURL_API bool Canonicalize(const char* spec, + int spec_len, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); +GURL_API bool Canonicalize(const char16* spec, + int spec_len, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); + +// Resolves a potentially relative URL relative to the given parsed base URL. +// The base MUST be valid. The resulting canonical URL and parsed information +// will be placed in to the given out variables. +// +// The relative need not be relative. If we discover that it's absolute, this +// will produce a canonical version of that URL. See Canonicalize() for more +// about the charset_converter. +// +// Returns true if the output is valid, false if the input could not produce +// a valid URL. +GURL_API bool ResolveRelative(const char* base_spec, + int base_spec_len, + const url_parse::Parsed& base_parsed, + const char* relative, + int relative_length, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); +GURL_API bool ResolveRelative(const char* base_spec, + int base_spec_len, + const url_parse::Parsed& base_parsed, + const char16* relative, + int relative_length, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); + +// Replaces components in the given VALID input url. The new canonical URL info +// is written to output and out_parsed. +// +// Returns true if the resulting URL is valid. 
+GURL_API bool ReplaceComponents( + const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + const url_canon::Replacements& replacements, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* out_parsed); +GURL_API bool ReplaceComponents( + const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + const url_canon::Replacements& replacements, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* out_parsed); + +// String helper functions ---------------------------------------------------- + +// Compare the lower-case form of the given string against the given ASCII +// string. This is useful for doing checking if an input string matches some +// token, and it is optimized to avoid intermediate string copies. +// +// The versions of this function that don't take a b_end assume that the b +// string is NULL terminated. +GURL_API bool LowerCaseEqualsASCII(const char* a_begin, + const char* a_end, + const char* b); +GURL_API bool LowerCaseEqualsASCII(const char* a_begin, + const char* a_end, + const char* b_begin, + const char* b_end); +GURL_API bool LowerCaseEqualsASCII(const char16* a_begin, + const char16* a_end, + const char* b); + +// Unescapes the given string using URL escaping rules. +GURL_API void DecodeURLEscapeSequences(const char* input, int length, + url_canon::CanonOutputW* output); + +// Escapes the given string as defined by the JS method encodeURIComponent. 
See +// https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/encodeURIComponent +GURL_API void EncodeURIComponent(const char* input, int length, + url_canon::CanonOutput* output); + + +} // namespace url_util + +#endif // GOOGLEURL_SRC_URL_UTIL_H__ diff --git a/psol/include/net/instaweb/apache/add_headers_fetcher.h b/psol/include/net/instaweb/apache/add_headers_fetcher.h new file mode 100644 index 000000000..6a410f2a5 --- /dev/null +++ b/psol/include/net/instaweb/apache/add_headers_fetcher.h @@ -0,0 +1,57 @@ +// Copyright 2012 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jefftk@google.com (Jeff Kaufman) + +// This class is a simple wrapper around another fetcher that adds headers to +// requests based on settings in the rewrite options before passing them on to +// the backend fetcher. 
+ +#ifndef NET_INSTAWEB_APACHE_ADD_HEADERS_FETCHER_H_ +#define NET_INSTAWEB_APACHE_ADD_HEADERS_FETCHER_H_ + +#include "net/instaweb/http/public/url_async_fetcher.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { + +class AsyncFetch; +class RewriteOptions; +class MessageHandler; + +class AddHeadersFetcher : public UrlAsyncFetcher { + public: + AddHeadersFetcher(const RewriteOptions* options, + UrlAsyncFetcher* backend_fetcher); + virtual ~AddHeadersFetcher(); + + virtual bool SupportsHttps() const { + return backend_fetcher_->SupportsHttps(); + } + + virtual void Fetch(const GoogleString& url, + MessageHandler* message_handler, + AsyncFetch* callback); + + private: + const RewriteOptions* const options_; + UrlAsyncFetcher* const backend_fetcher_; + + DISALLOW_COPY_AND_ASSIGN(AddHeadersFetcher); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_ADD_HEADERS_FETCHER_H_ diff --git a/psol/include/net/instaweb/apache/apache_cache.h b/psol/include/net/instaweb/apache/apache_cache.h new file mode 100644 index 000000000..2b35779dc --- /dev/null +++ b/psol/include/net/instaweb/apache/apache_cache.h @@ -0,0 +1,76 @@ +// Copyright 2011 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_APACHE_APACHE_CACHE_H_ +#define NET_INSTAWEB_APACHE_APACHE_CACHE_H_ + +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class ApacheConfig; +class ApacheRewriteDriverFactory; +class CacheInterface; +class FileCache; +class FileSystemLockManager; +class MessageHandler; +class NamedLockManager; +class SharedMemLockManager; + +// The ApacheCache encapsulates a cache-sharing model where a user specifies +// a file-cache path per virtual-host. With each file-cache object we keep +// a locking mechanism and an optional per-process LRUCache. +class ApacheCache { + public: + static const char kFileCache[]; + static const char kLruCache[]; + + ApacheCache(const StringPiece& path, + const ApacheConfig& config, + ApacheRewriteDriverFactory* factory); + ~ApacheCache(); + CacheInterface* l1_cache() { return l1_cache_.get(); } + CacheInterface* l2_cache() { return l2_cache_.get(); } + NamedLockManager* lock_manager() { return lock_manager_; } + + void RootInit(); + void ChildInit(); + void GlobalCleanup(MessageHandler* handler); // only called in root process + + private: + void FallBackToFileBasedLocking(); + + GoogleString path_; + + ApacheRewriteDriverFactory* factory_; + scoped_ptr shared_mem_lock_manager_; + scoped_ptr file_system_lock_manager_; + NamedLockManager* lock_manager_; + FileCache* file_cache_; // owned by l2 cache + scoped_ptr l1_cache_; + scoped_ptr l2_cache_; +}; + +// CACHE_STATISTICS is #ifdef'd to facilitate experiments with whether +// tracking the detailed stats & histograms has a QPS impact. Set it +// to 0 to turn it off. 
+#define CACHE_STATISTICS 1 + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_APACHE_CACHE_H_ diff --git a/psol/include/net/instaweb/apache/apache_config.h b/psol/include/net/instaweb/apache/apache_config.h new file mode 100644 index 000000000..19f097a5b --- /dev/null +++ b/psol/include/net/instaweb/apache/apache_config.h @@ -0,0 +1,371 @@ +// Copyright 2010 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_APACHE_APACHE_CONFIG_H_ +#define NET_INSTAWEB_APACHE_APACHE_CONFIG_H_ + +#include "net/instaweb/rewriter/public/rewrite_options.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class Hasher; + +// Establishes a context for VirtualHosts and directory-scoped +// options, either via .htaccess or .... +class ApacheConfig : public RewriteOptions { + public: + enum RefererStatisticsOutputLevel { + kFast, + kSimple, + kOrganized, + }; + + static const char kClassName[]; + + static bool ParseRefererStatisticsOutputLevel( + const StringPiece& in, RefererStatisticsOutputLevel* out); + + static void Initialize(); + static void Terminate(); + + explicit ApacheConfig(const StringPiece& dir); + ApacheConfig(); + ~ApacheConfig() {} + + // Human-readable description of what this configuration is for. 
This + // may be a directory, or a string indicating a combination of directives + // for multiple directories. + StringPiece description() const { return description_; } + void set_description(const StringPiece& x) { x.CopyToString(&description_); } + + int64 file_cache_clean_interval_ms() const { + return file_cache_clean_interval_ms_.value(); + } + void set_file_cache_clean_interval_ms(int64 x) { + set_option(x, &file_cache_clean_interval_ms_); + } + int64 file_cache_clean_size_kb() const { + return file_cache_clean_size_kb_.value(); + } + void set_file_cache_clean_size_kb(int64 x) { + set_option(x, &file_cache_clean_size_kb_); + } + int64 file_cache_clean_inode_limit() const { + return file_cache_clean_inode_limit_.value(); + } + void set_file_cache_clean_inode_limit(int64 x) { + set_option(x, &file_cache_clean_inode_limit_); + } + int64 lru_cache_byte_limit() const { + return lru_cache_byte_limit_.value(); + } + void set_lru_cache_byte_limit(int64 x) { + set_option(x, &lru_cache_byte_limit_); + } + int64 lru_cache_kb_per_process() const { + return lru_cache_kb_per_process_.value(); + } + void set_lru_cache_kb_per_process(int64 x) { + set_option(x, &lru_cache_kb_per_process_); + } + int64 slurp_flush_limit() const { + return slurp_flush_limit_.value(); + } + void set_slurp_flush_limit(int64 x) { + set_option(x, &slurp_flush_limit_); + } + bool use_shared_mem_locking() const { + return use_shared_mem_locking_.value(); + } + void set_use_shared_mem_locking(bool x) { + set_option(x, &use_shared_mem_locking_); + } + bool collect_referer_statistics() const { + return collect_referer_statistics_.value(); + } + void set_collect_referer_statistics(bool x) { + set_option(x, &collect_referer_statistics_); + } + bool hash_referer_statistics() const { + return hash_referer_statistics_.value(); + } + void set_hash_referer_statistics(bool x) { + set_option(x, &hash_referer_statistics_); + } + bool statistics_enabled() const { + return statistics_enabled_.value(); + } + void 
set_statistics_enabled(bool x) { + set_option(x, &statistics_enabled_); + } + bool statistics_logging_enabled() const { + return statistics_logging_enabled_.value(); + } + void set_statistics_logging_enabled(bool x) { + set_option(x, &statistics_logging_enabled_); + } + const GoogleString& statistics_logging_file() const { + return statistics_logging_file_.value(); + } + const GoogleString& statistics_logging_charts_css() const { + return statistics_logging_charts_css_.value(); + } + const GoogleString& statistics_logging_charts_js() const { + return statistics_logging_charts_js_.value(); + } + void set_statistics_logging_file(GoogleString x) { + set_option(x, &statistics_logging_file_); + } + int64 statistics_logging_interval_ms() const { + return statistics_logging_interval_ms_.value(); + } + void set_statistics_logging_interval_ms(int64 x) { + set_option(x, &statistics_logging_interval_ms_); + } + bool slurp_read_only() const { + return slurp_read_only_.value(); + } + void set_slurp_read_only(bool x) { + set_option(x, &slurp_read_only_); + } + bool rate_limit_background_fetches() const { + return rate_limit_background_fetches_.value(); + } + RefererStatisticsOutputLevel referer_statistics_output_level() const { + return referer_statistics_output_level_.value(); + } + void set_referer_statistics_output_level(RefererStatisticsOutputLevel x) { + set_option(x, &referer_statistics_output_level_); + } + const GoogleString& file_cache_path() const { + return file_cache_path_.value(); + } + void set_file_cache_path(GoogleString x) { + set_option(x, &file_cache_path_); + } + const GoogleString& memcached_servers() const { + return memcached_servers_.value(); + } + void set_memcached_servers(GoogleString x) { + set_option(x, &memcached_servers_); + } + int memcached_threads() const { + return memcached_threads_.value(); + } + void set_memcached_threads(int x) { + set_option(x, &memcached_threads_); + } + int memcached_timeout_us() const { + return 
memcached_timeout_us_.value(); + } + bool has_memcached_timeout_us() const { + return memcached_timeout_us_.was_set(); + } + void set_memcached_timeout_us(int x) { + set_option(x, &memcached_timeout_us_); + } + const GoogleString& slurp_directory() const { + return slurp_directory_.value(); + } + void set_slurp_directory(GoogleString x) { + set_option(x, &slurp_directory_); + } + const GoogleString& fetcher_proxy() const { + return fetcher_proxy_.value(); + } + void set_fetcher_proxy(GoogleString x) { + set_option(x, &fetcher_proxy_); + } + + // Cache flushing configuration. + void set_cache_flush_poll_interval_sec(int64 num_seconds) { + set_option(num_seconds, &cache_flush_poll_interval_sec_); + } + int64 cache_flush_poll_interval_sec() const { + return cache_flush_poll_interval_sec_.value(); + } + void set_cache_flush_filename(const StringPiece& sp) { + set_option(sp.as_string(), &cache_flush_filename_); + } + const GoogleString& cache_flush_filename() const { + return cache_flush_filename_.value(); + } + + // If this is set to true, we'll turn on our fallback proxy-like behavior + // on non-.pagespeed. URLs without changing the main fetcher from Serf + // (the way the slurp options would). + bool test_proxy() const { + return test_proxy_.value(); + } + void set_test_proxy(bool x) { + set_option(x, &test_proxy_); + } + + // This configures the fetcher we use for fallback handling if test_proxy() + // is on: + // - If this is empty, we use the usual mod_pagespeed fetcher + // (e.g. Serf) + // - If it's non-empty, the fallback URLs will be fetched from the given + // slurp directory. mod_pagespeed resource fetches, however, will still + // use the usual fetcher (e.g. Serf). 
+ GoogleString test_proxy_slurp() const { + return test_proxy_slurp_.value(); + } + + // Helper functions + bool slurping_enabled() const { + return !slurp_directory().empty(); + } + + bool slurping_enabled_read_only() const { + return slurping_enabled() && slurp_read_only(); + } + + bool experimental_fetch_from_mod_spdy() const { + return experimental_fetch_from_mod_spdy_.value(); + } + + // Make an identical copy of these options and return it. + virtual ApacheConfig* Clone() const; + + // Returns a suitably down cast version of 'instance' if it is an instance + // of this class, NULL if not. + static const ApacheConfig* DynamicCast(const RewriteOptions* instance); + static ApacheConfig* DynamicCast(RewriteOptions* instance); + + // Name of the actual type of this instance as a poor man's RTTI. + virtual const char* class_name() const; + + protected: + template class ApacheOption : public OptionTemplateBase { + public: + ApacheOption() {} + + // Sets value_ from value_string. + virtual bool SetFromString(const GoogleString& value_string) { + T value; + bool success = ApacheConfig::ParseFromString(value_string, &value); + if (success) { + this->set(value); + } + return success; + } + + virtual GoogleString Signature(const Hasher* hasher) const { + return ApacheConfig::OptionSignature(this->value(), hasher); + } + + virtual GoogleString ToString() const { + return ApacheConfig::ToString(this->value()); + } + + private: + DISALLOW_COPY_AND_ASSIGN(ApacheOption); + }; + + private: + // Keeps the properties added by this subclass. These are merged into + // RewriteOptions::all_properties_ during Initialize(). + static Properties* apache_properties_; + + // Adds an option to apache_properties_. + // + // TODO(jmarantz): rename this to avoid coinciding with private + // method RewriteOptions::add_option. This is done for now so + // review-diffs are readable, at the cost of a small non-functional + // follow-up refactor. 
+ template + static void add_option(typename OptionClass::ValueType default_value, + OptionClass RewriteOptionsSubclass::*offset, + const char* id, + OptionEnum option_enum) { + AddProperty(default_value, offset, id, option_enum, apache_properties_); + } + + void InitializeSignaturesAndDefaults(); + static void AddProperties(); + void Init(); + + static bool ParseFromString(const GoogleString& value_string, + RefererStatisticsOutputLevel* value) { + return ParseRefererStatisticsOutputLevel(value_string, value); + } + + static GoogleString OptionSignature(RefererStatisticsOutputLevel x, + const Hasher* hasher) { + // TODO(sriharis): This is what we had so far due to implicit cast to int. + // Do we need something better now? + return IntegerToString(x); + } + + static GoogleString ToString(RefererStatisticsOutputLevel x) { + // TODO(sriharis): This is what we had so far due to implicit cast to int. + // Do we need something better now? + return IntegerToString(x); + } + + GoogleString description_; + RewriteOptions options_; + + Option fetcher_proxy_; + Option file_cache_path_; + + // comma-separated list of host[:port]. See AprMemCache::AprMemCache + // for code that parses it. 
+ Option fetch_https_; + Option memcached_servers_; + Option slurp_directory_; + Option statistics_logging_file_; + Option statistics_logging_charts_css_; + Option statistics_logging_charts_js_; + Option cache_flush_filename_; + Option test_proxy_slurp_; + + ApacheOption referer_statistics_output_level_; + + Option collect_referer_statistics_; + Option hash_referer_statistics_; + Option slurp_read_only_; + Option statistics_enabled_; + Option statistics_logging_enabled_; + Option test_proxy_; + Option use_shared_mem_locking_; + Option rate_limit_background_fetches_; + Option experimental_fetch_from_mod_spdy_; + + Option memcached_threads_; + Option memcached_timeout_us_; + + Option file_cache_clean_inode_limit_; + Option file_cache_clean_interval_ms_; + Option file_cache_clean_size_kb_; + Option lru_cache_byte_limit_; + Option lru_cache_kb_per_process_; + Option slurp_flush_limit_; + Option statistics_logging_interval_ms_; + // If cache_flush_poll_interval_sec_<=0 then we turn off polling for + // cache-flushes. + Option cache_flush_poll_interval_sec_; + + DISALLOW_COPY_AND_ASSIGN(ApacheConfig); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_APACHE_CONFIG_H_ diff --git a/psol/include/net/instaweb/apache/apache_logging_includes.h b/psol/include/net/instaweb/apache/apache_logging_includes.h new file mode 100644 index 000000000..8d04fabce --- /dev/null +++ b/psol/include/net/instaweb/apache/apache_logging_includes.h @@ -0,0 +1,38 @@ +// Copyright 2012 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: morlovich@google.com (Maksim Orlovich) +// +// Makes sure we include Apache's http_log.h without conflicting with +// Google LOG() macros, and with proper per-module logging support in +// Apache 2.4 + +#ifndef NET_INSTAWEB_APACHE_APACHE_LOGGING_INCLUDES_H_ +#define NET_INSTAWEB_APACHE_APACHE_LOGGING_INCLUDES_H_ + +// When HAVE_SYSLOG is defined, apache http_log.h will include syslog.h, which +// #defines LOG_* as numbers. This conflicts with definitions of the LOG(x) +// macros in Chromium base. +#undef HAVE_SYSLOG +#include "http_log.h" + +// Apache >= 2.4 expect us to use the APLOG_USE_MODULE macro in order to +// permit per-module log-level configuration. +#ifdef APLOG_USE_MODULE +extern "C" { +APLOG_USE_MODULE(pagespeed); +} +#endif + +#endif // NET_INSTAWEB_APACHE_APACHE_LOGGING_INCLUDES_H_ diff --git a/psol/include/net/instaweb/apache/apache_message_handler.h b/psol/include/net/instaweb/apache/apache_message_handler.h new file mode 100644 index 000000000..fecf454fc --- /dev/null +++ b/psol/include/net/instaweb/apache/apache_message_handler.h @@ -0,0 +1,91 @@ +// Copyright 2010 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Author: sligocki@google.com (Shawn Ligocki) + +#ifndef NET_INSTAWEB_APACHE_APACHE_MESSAGE_HANDLER_H_ +#define NET_INSTAWEB_APACHE_APACHE_MESSAGE_HANDLER_H_ + +#include + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/google_message_handler.h" +#include "net/instaweb/util/public/message_handler.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +struct server_rec; + +namespace net_instaweb { + +class AbstractMutex; +class SharedCircularBuffer; +class Timer; +class Writer; + +// Implementation of an HTML parser message handler that uses Apache +// logging to emit messages. +class ApacheMessageHandler : public MessageHandler { + public: + // version is a string added to each message. + // Timer is used to generate timestamp for messages in shared memory. + ApacheMessageHandler(const server_rec* server, const StringPiece& version, + Timer* timer, AbstractMutex* mutex); + + // Installs a signal handler for common crash signals that tries to print + // out a backtrace. + static void InstallCrashHandler(server_rec* global_server); + + // When we initialize ApacheMessageHandler in ApacheRewriteDriverFactory, + // SharedCircularBuffer of ApacheRewriteDriverFactory is not initialized yet. + // We need to set buffer_ later in RootInit() or ChildInit(). + void set_buffer(SharedCircularBuffer* buff); + // Stores |pid| formatted as "[pid]" in pid_string_. + void SetPidString(const int64 pid) { + pid_string_ = StrCat("[", Integer64ToString(pid), "]"); + } + // Dump contents of SharedCircularBuffer. 
+ bool Dump(Writer* writer); + + protected: + virtual void MessageVImpl(MessageType type, const char* msg, va_list args); + + virtual void FileMessageVImpl(MessageType type, const char* filename, + int line, const char* msg, va_list args); + + private: + int GetApacheLogLevel(MessageType type); + GoogleString Format(const char* msg, va_list args); + + const server_rec* server_rec_; + const GoogleString version_; + // This timer is used to prepend time when writing a message + // to SharedCircularBuffer. + Timer* timer_; + scoped_ptr mutex_; + // String "[pid]". + GoogleString pid_string_; + // This handler is for internal use. + // Some functions of SharedCircularBuffer need a MessageHandler as argument. + // We do not want to pass in another ApacheMessageHandler, which could cause + // an infinite loop. + GoogleMessageHandler handler_; + SharedCircularBuffer* buffer_; + + DISALLOW_COPY_AND_ASSIGN(ApacheMessageHandler); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_APACHE_MESSAGE_HANDLER_H_ diff --git a/psol/include/net/instaweb/apache/apache_rewrite_driver_factory.h b/psol/include/net/instaweb/apache/apache_rewrite_driver_factory.h new file mode 100644 index 000000000..8ed622355 --- /dev/null +++ b/psol/include/net/instaweb/apache/apache_rewrite_driver_factory.h @@ -0,0 +1,473 @@ +// Copyright 2010 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Author: jmarantz@google.com (Joshua Marantz) +// lsong@google.com (Libo Song) + +#ifndef NET_INSTAWEB_APACHE_APACHE_REWRITE_DRIVER_FACTORY_H_ +#define NET_INSTAWEB_APACHE_APACHE_REWRITE_DRIVER_FACTORY_H_ + +#include +#include +#include + +#include "net/instaweb/rewriter/public/rewrite_driver_factory.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/md5_hasher.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +struct apr_pool_t; +struct request_rec; +struct server_rec; + +namespace net_instaweb { + +class AbstractSharedMem; +class ApacheCache; +class ApacheConfig; +class ApacheMessageHandler; +class ApacheServerContext; +class AprMemCache; +class AsyncCache; +class CacheInterface; +class FileSystem; +class Hasher; +class MessageHandler; +class ModSpdyFetchController; +class NamedLockManager; +class QueuedWorkerPool; +class RewriteDriver; +class RewriteOptions; +class SerfUrlAsyncFetcher; +class ServerContext; +class SharedCircularBuffer; +class SharedMemRefererStatistics; +class SharedMemStatistics; +class SlowWorker; +class StaticJavascriptManager; +class Statistics; +class Timer; +class UrlAsyncFetcher; +class UrlFetcher; +class UrlPollableAsyncFetcher; +class Writer; + +// Creates an Apache RewriteDriver. +class ApacheRewriteDriverFactory : public RewriteDriverFactory { + public: + static const char kMemcached[]; + static const char kStaticJavaScriptPrefix[]; + + ApacheRewriteDriverFactory(server_rec* server, const StringPiece& version); + virtual ~ApacheRewriteDriverFactory(); + + virtual Hasher* NewHasher(); + + // Returns the fetcher that will be used by the filters to load any + // resources they need. This either matches the resource manager's + // async fetcher or is NULL in case we are configured in a way that + // all fetches will succeed immediately. 
Must be called after the fetchers + // have been computed + UrlPollableAsyncFetcher* SubResourceFetcher(); + + GoogleString hostname_identifier() { return hostname_identifier_; } + + AbstractSharedMem* shared_mem_runtime() const { + return shared_mem_runtime_.get(); + } + SharedMemRefererStatistics* shared_mem_referer_statistics() const { + return shared_mem_referer_statistics_.get(); + } + // Give access to apache_message_handler_ for the cases we need + // to use ApacheMessageHandler rather than MessageHandler. + // e.g. Use ApacheMessageHandler::Dump() + // This is a better choice than cast from MessageHandler. + ApacheMessageHandler* apache_message_handler() { + return apache_message_handler_; + } + // For shared memory resources the general setup we follow is to have the + // first running process (aka the root) create the necessary segments and + // fill in their shared data structures, while processes created to actually + // handle requests attach to already existing shared data structures. + // + // During normal server startup[1], RootInit() is called from the Apache hooks + // in the root process for the first task, and then ChildInit() is called in + // any child process. + // + // Keep in mind, however, that when fork() is involved a process may + // effectively see both calls, in which case the 'ChildInit' call would + // come second and override the previous root status. Both calls are also + // invoked in the debug single-process mode (httpd -X). + // + // Note that these are not static methods --- they are invoked on every + // ApacheRewriteDriverFactory instance, which exist for the global + // configuration as well as all the vhosts. + // + // [1] Besides normal startup, Apache also uses a temporary process to + // syntax check the config file. That basically looks like a complete + // normal startup and shutdown to the code. 
+ bool is_root_process() const { return is_root_process_; } + void RootInit(); + void ChildInit(); + + void DumpRefererStatistics(Writer* writer); + + SlowWorker* slow_worker() { return slow_worker_.get(); } + + // Build global shared-memory statistics. This is invoked if at least + // one server context (global or VirtualHost) enables statistics. + Statistics* MakeGlobalSharedMemStatistics(bool logging, + int64 logging_interval_ms, + const GoogleString& logging_file); + + // Creates and ::Initializes a shared memory statistics object. + SharedMemStatistics* AllocateAndInitSharedMemStatistics( + const StringPiece& name, const bool logging, + const int64 logging_interval_ms, const GoogleString& logging_file); + + ApacheServerContext* MakeApacheServerContext(server_rec* server); + + // Makes fetches from PSA to origin-server request + // accept-encoding:gzip, even when used in a context when we want + // cleartext. We'll decompress as we read the content if needed. + void set_fetch_with_gzip(bool x) { fetch_with_gzip_ = x; } + bool fetch_with_gzip() const { return fetch_with_gzip_; } + + // Tracks the size of resources fetched from origin and populates the + // X-Original-Content-Length header for resources derived from them. + void set_track_original_content_length(bool x) { + track_original_content_length_ = x; + } + bool track_original_content_length() const { + return track_original_content_length_; + } + + void set_num_rewrite_threads(int x) { num_rewrite_threads_ = x; } + int num_rewrite_threads() const { return num_rewrite_threads_; } + void set_num_expensive_rewrite_threads(int x) { + num_expensive_rewrite_threads_ = x; + } + int num_expensive_rewrite_threads() const { + return num_expensive_rewrite_threads_; + } + + void set_message_buffer_size(int x) { + message_buffer_size_ = x; + } + + // When Serf gets a system error during polling, to avoid spamming + // the log we just print the number of outstanding fetch URLs. 
To + // debug this it's useful to print the complete set of URLs, in + // which case this should be turned on. + void list_outstanding_urls_on_error(bool x) { + list_outstanding_urls_on_error_ = x; + } + + bool use_per_vhost_statistics() const { + return use_per_vhost_statistics_; + } + + void set_use_per_vhost_statistics(bool x) { + use_per_vhost_statistics_ = x; + } + + bool enable_property_cache() const { + return enable_property_cache_; + } + + void set_enable_property_cache(bool x) { + enable_property_cache_ = x; + } + + // If true, virtual hosts should inherit global configuration. + bool inherit_vhost_config() const { + return inherit_vhost_config_; + } + + void set_inherit_vhost_config(bool x) { + inherit_vhost_config_ = x; + } + + bool disable_loopback_routing() const { + return disable_loopback_routing_; + } + + void set_disable_loopback_routing(bool x) { + disable_loopback_routing_ = x; + } + + bool install_crash_handler() const { + return install_crash_handler_; + } + + void set_install_crash_handler(bool x) { + install_crash_handler_ = x; + } + + // Finds a Cache for the file_cache_path in the config. If none exists, + // creates one, using all the other parameters in the ApacheConfig. + // Currently, no checking is done that the other parameters (e.g. cache + // size, cleanup interval, etc.) are consistent. + ApacheCache* GetCache(ApacheConfig* config); + + // Create a new AprMemCache from the given hostname[:port] specification. + AprMemCache* NewAprMemCache(const GoogleString& spec); + + // Makes a memcached-based cache if the configuration contains a + // memcached server specification. The l2_cache passed in is used + // to handle puts/gets for huge (>1M) values. NULL is returned if + // memcached is not specified for this server. + // + // If a non-null CacheInterface* is returned, its ownership is transferred + // to the caller and must be freed on destruction. 
+ CacheInterface* GetMemcached(ApacheConfig* config, CacheInterface* l2_cache); + + // Returns the filesystem metadata cache for the given config's specification + // (if it has one). NULL is returned if no cache is specified. + CacheInterface* GetFilesystemMetadataCache(ApacheConfig* config); + + // Stops any further Gets from occuring in the Async cache. This is used to + // help wind down activity during a shutdown. + void StopAsyncGets(); + + // Finds a fetcher for the settings in this config, sharing with + // existing fetchers if possible, otherwise making a new one (and + // its required thread). + UrlAsyncFetcher* GetFetcher(ApacheConfig* config); + + // As above, but just gets a Serf fetcher --- not a slurp fetcher or a rate + // limiting one, etc. + SerfUrlAsyncFetcher* GetSerfFetcher(ApacheConfig* config); + + // Notification of apache tearing down a context (vhost or top-level) + // corresponding to given ApacheServerContext. Returns true if it was + // the last context. + bool PoolDestroyed(ApacheServerContext* rm); + + // Create a new RewriteOptions. In this implementation it will be an + // ApacheConfig. + virtual RewriteOptions* NewRewriteOptions(); + + // As above, but set a name on the ApacheConfig noting that it came from + // a query. + virtual RewriteOptions* NewRewriteOptionsForQuery(); + + // Initializes all the statistics objects created transitively by + // ApacheRewriteDriverFactory, including apache-specific and + // platform-independent statistics. + static void InitStats(Statistics* statistics); + static void Initialize(); + static void Terminate(); + + // Print out details of all the connections to memcached servers. + void PrintMemCacheStats(GoogleString* out); + + // If needed, sets session fetchers on the driver to do the following: + // a) Adds custom headers when configured in RewriteOptions. + // b) Route requests directly to this very server when they are not + // configured to be external. 
+ // c) Route requests to mod_spdy's slave connection code if configured to. + void ApplySessionFetchers(ApacheServerContext* manager, + RewriteDriver* driver, request_rec* req); + + // Returns true if we should handle request as SPDY. + // This happens in two cases: + // 1) It's actually a SPDY request using mod_spdy + // 2) The header X-PSA-Optimize-For-SPDY is present, with any value. + static bool TreatRequestAsSpdy(request_rec* req); + + // Parses a comma-separated list of HTTPS options. If successful, applies + // the options to the fetcher and returns true. If the options were invalid, + // *error_message is populated and false is returned. + // + // It is *not* considered an error in this context to attempt to enable HTTPS + // when support is not compiled in. However, an error message will be logged + // in the server log, and the option-setting will have no effect. + bool SetHttpsOptions(StringPiece directive, GoogleString* error_message); + + protected: + virtual UrlFetcher* DefaultUrlFetcher(); + virtual UrlAsyncFetcher* DefaultAsyncUrlFetcher(); + virtual void StopCacheActivity(); + + // Provide defaults. + virtual MessageHandler* DefaultHtmlParseMessageHandler(); + virtual MessageHandler* DefaultMessageHandler(); + virtual FileSystem* DefaultFileSystem(); + virtual Timer* DefaultTimer(); + virtual void SetupCaches(ServerContext* resource_manager); + virtual NamedLockManager* DefaultLockManager(); + virtual QueuedWorkerPool* CreateWorkerPool(WorkerPoolName name); + + // Disable the Resource Manager's filesystem since we have a + // write-through http_cache. + virtual bool ShouldWriteResourcesToFileSystem() { return false; } + + // This helper method contains init procedures invoked by both RootInit() + // and ChildInit() + void ParentOrChildInit(); + // Initialize SharedCircularBuffer and pass it to ApacheMessageHandler and + // ApacheHtmlParseMessageHandler. is_root is true if this is invoked from + // root (ie. parent) process. 
+ void SharedCircularBufferInit(bool is_root); + // Initialize shared_mem_referer_statistics_; is_root should be true if this + // is invoked from the root (i.e. parent) process + void SharedMemRefererStatisticsInit(bool is_root); + + // Release all the resources. It also calls the base class ShutDown to release + // the base class resources. + virtual void ShutDown(); + + // Initializes the StaticJavascriptManager. + virtual void InitStaticJavascriptManager( + StaticJavascriptManager* static_js_manager); + + private: + // Updates num_rewrite_threads_ and num_expensive_rewrite_threads_ + // with sensible values if they are not explicitly set. + void AutoDetectThreadCounts(); + + apr_pool_t* pool_; + server_rec* server_rec_; + scoped_ptr shared_mem_statistics_; + scoped_ptr shared_mem_runtime_; + scoped_ptr shared_circular_buffer_; + scoped_ptr slow_worker_; + + // TODO(jmarantz): These options could be consolidated in a protobuf or + // some other struct, which would keep them distinct from the rest of the + // state. Note also that some of the options are in the base class, + // RewriteDriverFactory, so we'd have to sort out how that worked. + GoogleString version_; + + bool statistics_frozen_; + bool is_root_process_; + bool fetch_with_gzip_; + bool track_original_content_length_; + bool list_outstanding_urls_on_error_; + + scoped_ptr shared_mem_referer_statistics_; + + // hostname_identifier_ equals to "server_hostname:port" of Apache, + // it's used to distinguish the name of shared memory, + // so that each vhost has its own SharedCircularBuffer. + const GoogleString hostname_identifier_; + // This will be assigned to message_handler_ when message_handler() or + // html_parse_message_handler is invoked for the first time. + // We keep an extra link because we need to refer them as + // ApacheMessageHandlers rather than just MessageHandler in initialization + // process. 
+ ApacheMessageHandler* apache_message_handler_; + // This will be assigned to html_parse_message_handler_ when + // html_parse_message_handler() is invoked for the first time. + // Note that apache_message_handler_ and apache_html_parse_message_handler + // writes to the same shared memory which is owned by the factory. + ApacheMessageHandler* apache_html_parse_message_handler_; + + // Once ServerContexts are initialized via + // RewriteDriverFactory::InitServerContext, they will be + // managed by the RewriteDriverFactory. But in the root Apache process + // the ServerContexts will never be initialized. We track these here + // so that ApacheRewriteDriverFactory::ChildInit can iterate over all + // the managers that need to be ChildInit'd, and so that we can free + // the managers in the Root process that were never ChildInit'd. + typedef std::set ApacheServerContextSet; + ApacheServerContextSet uninitialized_managers_; + + // If true, we'll have a separate statistics object for each vhost + // (along with a global aggregate), rather than just a single object + // aggregating all of them. + bool use_per_vhost_statistics_; + + // Enable the property cache. + bool enable_property_cache_; + + // Inherit configuration from global context into vhosts. + bool inherit_vhost_config_; + + // If false (default) we will redirect all fetches to unknown hosts to + // localhost. + bool disable_loopback_routing_; + + // If true, we'll install a signal handler that prints backtraces. + bool install_crash_handler_; + + // true iff we ran through AutoDetectThreadCounts() + bool thread_counts_finalized_; + + // These are <= 0 if we should autodetect. + int num_rewrite_threads_; + int num_expensive_rewrite_threads_; + + int max_mod_spdy_fetch_threads_; + + // Size of shared circular buffer for displaying Info messages in + // /mod_pagespeed_messages. + int message_buffer_size_; + + // File-Caches are expensive. Just allocate one per distinct file-cache path. 
+ // At the moment there is no consistency checking for other parameters. Note + // that the LRUCache is instantiated inside the ApacheCache, so we get a new + // LRUCache for each distinct file-cache path. Also note that only the + // file-cache path is used as the key in this map. Other parameters changed, + // such as lru cache size or file cache clean interval, are taken from the + // first file-cache found configured to one address. + // + // TODO(jmarantz): Consider instantiating one LRUCache per process. + typedef std::map PathCacheMap; + PathCacheMap path_cache_map_; + + // memcache connections are expensive. Just allocate one per + // distinct server-list. At the moment there is no consistency + // checking for other parameters. Note that each memcached + // interface share the thread allocation, based on the + // ModPagespeedMemcachedThreads settings first encountered for + // a particular server-set. + // + // The QueuedWorkerPool for async cache-gets is shared among all + // memcached connections. + // + // The CacheInterface* value in the MemcacheMap now includes, + // depending on options, instances of CacheBatcher, AsyncCache, + // and CacheStats. Explicit lists of AprMemCache instances and + // AsyncCache objects are also included, as they require extra + // treatment during startup and shutdown. + typedef std::map MemcachedMap; + MemcachedMap memcached_map_; + scoped_ptr memcached_pool_; + std::vector memcache_servers_; + std::vector async_caches_; + + // Serf fetchers are expensive -- they each cost a thread. Allocate + // one for each proxy/slurp-setting. Currently there is no + // consistency checking for fetcher timeout. + typedef std::map FetcherMap; + FetcherMap fetcher_map_; + typedef std::map SerfFetcherMap; + SerfFetcherMap serf_fetcher_map_; + MD5Hasher cache_hasher_; + + // Helps coordinate direct-to-mod_spdy fetches. 
+ scoped_ptr mod_spdy_fetch_controller_; + + GoogleString https_options_; + + DISALLOW_COPY_AND_ASSIGN(ApacheRewriteDriverFactory); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_APACHE_REWRITE_DRIVER_FACTORY_H_ diff --git a/psol/include/net/instaweb/apache/apache_server_context.h b/psol/include/net/instaweb/apache/apache_server_context.h new file mode 100644 index 000000000..02886899a --- /dev/null +++ b/psol/include/net/instaweb/apache/apache_server_context.h @@ -0,0 +1,191 @@ +// Copyright 2011 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_APACHE_APACHE_SERVER_CONTEXT_H_ +#define NET_INSTAWEB_APACHE_APACHE_SERVER_CONTEXT_H_ + +#include "net/instaweb/apache/apache_config.h" +#include "net/instaweb/rewriter/public/server_context.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +struct server_rec; + +namespace net_instaweb { + +class AbstractMutex; +class ApacheRewriteDriverFactory; +class Histogram; +class RewriteDriverPool; +class RewriteStats; +class SharedMemStatistics; +class Statistics; +class UrlAsyncFetcherStats; +class Variable; + +// Creates an Apache-specific ServerContext. 
This differs from base class +// that it incorporates by adding per-VirtualHost configuration, including: +// - file-cache path & limits +// - default RewriteOptions. +// Additionally, there are startup semantics for apache's prefork model +// that require a phased initialization. +class ApacheServerContext : public ServerContext { + public: + ApacheServerContext(ApacheRewriteDriverFactory* factory, + server_rec* server, + const StringPiece& version); + virtual ~ApacheServerContext(); + + GoogleString hostname_identifier() { return hostname_identifier_; } + ApacheRewriteDriverFactory* apache_factory() { return apache_factory_; } + ApacheConfig* config(); + bool InitFileCachePath(); + + // These return configuration objects that hold settings from + // and sections of configuration. + // They initialize lazily, so are not thread-safe; however they are only + // meant to be used during configuration parsing. These methods should be + // called only if there is actually a need to put something in them, since + // otherwise we may end up constructing separate SPDY vs. non-SPDY + // configurations needlessly. + ApacheConfig* SpdyConfigOverlay(); + ApacheConfig* NonSpdyConfigOverlay(); + + // These return true if the given overlays were constructed (in response + // to having something in config files to put in them). + bool has_spdy_config_overlay() const { + return spdy_config_overlay_.get() != NULL; + } + + bool has_non_spdy_config_overlay() const { + return non_spdy_config_overlay_.get() != NULL; + } + + // These two take ownership of their parameters. + void set_spdy_config_overlay(ApacheConfig* x) { + spdy_config_overlay_.reset(x); + } + + void set_non_spdy_config_overlay(ApacheConfig* x) { + non_spdy_config_overlay_.reset(x); + } + + // Returns special configuration that should be used for SPDY sessions + // instead of config(). Returns NULL if config() should be used instead. 
+ ApacheConfig* SpdyConfig() { return spdy_specific_config_.get(); } + + // Pool to pass to NewRewriteDriverFromPool to get a RewriteDriver configured + // with SPDY-specific options. May be NULL in case there is no spdy-specific + // configuration. + RewriteDriverPool* spdy_driver_pool() { return spdy_driver_pool_; } + + // This should be called after all configuration parsing is done to collapse + // configuration inside the config overlays into actual ApacheConfig objects. + // It will also compute signatures when done. + void CollapseConfigOverlaysAndComputeSignatures(); + + // Initialize this ServerContext to have its own statistics domain. + // Must be called after global_statistics has been created and had + // ::Initialize called on it. + void CreateLocalStatistics(Statistics* global_statistics); + + // Should be called after the child process is forked. + void ChildInit(); + + bool initialized() const { return initialized_; } + + // Called on notification from Apache on child exit. Returns true + // if this is the last ServerContext that exists. + bool PoolDestroyed(); + + // Poll; if we haven't checked the timestamp of + // $FILE_PREFIX/cache.flush in the past + // cache_flush_poll_interval_sec_ (default 5) seconds do so, and if + // the timestamp has expired then update the + // cache_invalidation_timestamp in global_options, thus flushing the + // cache. + // + // TODO(jmarantz): allow configuration of this option. + // TODO(jmarantz): allow a URL-based mechanism to flush cache, even if + // we implement it by simply writing the cache.flush file so other + // servers can see it. Note that using shared-memory is not a great + // plan because we need the cache-invalidation to persist across server + // restart. + void PollFilesystemForCacheFlush(); + + // Accumulate in a histogram the amount of time spent rewriting HTML. + // TODO(sligocki): Remove in favor of RewriteStats::rewrite_latency_histogram. 
+ void AddHtmlRewriteTimeUs(int64 rewrite_time_us); + + static void InitStats(Statistics* statistics); + + const server_rec* server() const { return server_rec_; } + + private: + bool UpdateCacheFlushTimestampMs(int64 timestamp_ms); + + ApacheRewriteDriverFactory* apache_factory_; + server_rec* server_rec_; + GoogleString version_; + + // hostname_identifier_ equals to "server_hostname:port" of Apache, + // it's used to distinguish the name of shared memory, + // so that each vhost has its own SharedCircularBuffer. + GoogleString hostname_identifier_; + + bool initialized_; + + // Non-NULL if we have per-vhost stats. + scoped_ptr split_statistics_; + + // May be NULL. Owned by *split_statistics_. + SharedMemStatistics* local_statistics_; + + // These are non-NULL if we have per-vhost stats. + scoped_ptr local_rewrite_stats_; + scoped_ptr stats_fetcher_; + + // May be NULL. Constructed once we see things in config files that should + // be stored in these. + scoped_ptr spdy_config_overlay_; + scoped_ptr non_spdy_config_overlay_; + + // May be NULL if we don't have any special settings for when using SPDY. + scoped_ptr spdy_specific_config_; + + // Owned by ServerContext via a call to ManageRewriteDriverPool. + // May be NULL if we don't have a spdy-specific configuration. + RewriteDriverPool* spdy_driver_pool_; + + Histogram* html_rewrite_time_us_histogram_; + + // State used to implement periodic polling of $FILE_PREFIX/cache.flush. + // last_cache_flush_check_sec_ is ctor-initialized to 0 so the first + // time we Poll we will read the file. 
+ scoped_ptr cache_flush_mutex_; + int64 last_cache_flush_check_sec_; // seconds since 1970 + + Variable* cache_flush_count_; + Variable* cache_flush_timestamp_ms_; + + DISALLOW_COPY_AND_ASSIGN(ApacheServerContext); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_APACHE_SERVER_CONTEXT_H_ diff --git a/psol/include/net/instaweb/apache/apache_slurp.h b/psol/include/net/instaweb/apache/apache_slurp.h new file mode 100644 index 000000000..a59d08ac9 --- /dev/null +++ b/psol/include/net/instaweb/apache/apache_slurp.h @@ -0,0 +1,32 @@ +// Copyright 2010 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_APACHE_APACHE_SLURP_H_ +#define NET_INSTAWEB_APACHE_APACHE_SLURP_H_ + +struct request_rec; + +namespace net_instaweb { + +class ApacheServerContext; + +// Loads the URL based on the fetchers and other infrastructure in the +// factory. +void SlurpUrl(ApacheServerContext* manager, request_rec* r); + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_APACHE_SLURP_H_ diff --git a/psol/include/net/instaweb/apache/apache_thread_system.h b/psol/include/net/instaweb/apache/apache_thread_system.h new file mode 100644 index 000000000..e5c93b0a7 --- /dev/null +++ b/psol/include/net/instaweb/apache/apache_thread_system.h @@ -0,0 +1,49 @@ +/* + * Copyright 2011 Google Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: morlovich@google.com (Maksim Orlovich) +// +// A wrapper around PthreadThreadSystem for use in Apache that takes care of +// some signal masking issues that arise in prefork. We prefer pthreads to APR +// as APR mutex, etc., creation requires pools which are generally thread +// unsafe, introducing some additional risks. + +#ifndef NET_INSTAWEB_APACHE_APACHE_THREAD_SYSTEM_H_ +#define NET_INSTAWEB_APACHE_APACHE_THREAD_SYSTEM_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/pthread_thread_system.h" + +namespace net_instaweb { + +class Timer; + +class ApacheThreadSystem : public PthreadThreadSystem { + public: + ApacheThreadSystem(); + virtual ~ApacheThreadSystem(); + virtual Timer* NewTimer(); + + protected: + virtual void BeforeThreadRunHook(); + + private: + DISALLOW_COPY_AND_ASSIGN(ApacheThreadSystem); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_APACHE_THREAD_SYSTEM_H_ diff --git a/psol/include/net/instaweb/apache/apr_file_system.h b/psol/include/net/instaweb/apache/apr_file_system.h new file mode 100644 index 000000000..aa6782656 --- /dev/null +++ b/psol/include/net/instaweb/apache/apr_file_system.h @@ -0,0 +1,103 @@ +// Copyright 2010 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: lsong@google.com (Libo Song) + +#ifndef NET_INSTAWEB_APACHE_APR_FILE_SYSTEM_H_ +#define NET_INSTAWEB_APACHE_APR_FILE_SYSTEM_H_ + +#include "apr.h" + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/file_system.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string_util.h" + +struct apr_finfo_t; +struct apr_pool_t; + +namespace net_instaweb { + +class AbstractMutex; +class MessageHandler; +class ThreadSystem; + +void AprReportError(MessageHandler* message_handler, const char* filename, + int line, const char* message, int error_code); + +class AprFileSystem : public FileSystem { + public: + AprFileSystem(apr_pool_t* pool, ThreadSystem* thread_system); + ~AprFileSystem(); + + virtual int MaxPathLength(const StringPiece& base) const; + virtual InputFile* OpenInputFile( + const char* file, MessageHandler* message_handler); + virtual OutputFile* OpenOutputFileHelper( + const char* file, bool append, MessageHandler* message_handler); + // See FileSystem interface for specifics of OpenTempFile. + virtual OutputFile* OpenTempFileHelper(const StringPiece& prefix_name, + MessageHandler* message_handler); + + virtual bool ListContents(const StringPiece& dir, StringVector* files, + MessageHandler* handler); + // Like POSIX 'mkdir', makes a directory only if parent directory exists. + // Fails if directory_name already exists or parent directory doesn't exist. 
+ virtual bool MakeDir(const char* directory_path, MessageHandler* handler); + virtual bool RemoveDir(const char* directory_path, + MessageHandler* message_handler); + virtual bool RemoveFile(const char* filename, + MessageHandler* message_handler); + virtual bool RenameFileHelper(const char* old_filename, + const char* new_filename, + MessageHandler* message_handler); + + virtual bool Atime(const StringPiece& path, + int64* timestamp_sec, MessageHandler* handler); + virtual bool Mtime(const StringPiece& path, + int64* timestamp_sec, MessageHandler* handler); + // Report the disk utilization of the file specified by path. Note that disk + // utilization could differ from the apparent size of the file as it depends + // on the underlying file system and default block size. + virtual bool Size(const StringPiece& path, int64* size, + MessageHandler* handler); + virtual BoolOrError Exists(const char* path, MessageHandler* handler); + virtual BoolOrError IsDir(const char* path, MessageHandler* handler); + + virtual BoolOrError TryLock(const StringPiece& lock_name, + MessageHandler* handler); + virtual BoolOrError TryLockWithTimeout(const StringPiece& lock_name, + int64 timeout_ms, + MessageHandler* handler); + virtual bool Unlock(const StringPiece& lock_name, MessageHandler* handler); + + private: + // Used by *time and Size methods to get file info. + bool Stat(const StringPiece& path, + apr_finfo_t* file_info, apr_int32_t field_wanted, + MessageHandler* handler); + + apr_pool_t* pool_; + + // We use a mutex to protect the pool above when calling into apr's file + // system ops, which might otherwise access it concurrently in an unsafe + // way. 
+ scoped_ptr mutex_; + + DISALLOW_COPY_AND_ASSIGN(AprFileSystem); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_APR_FILE_SYSTEM_H_ diff --git a/psol/include/net/instaweb/apache/apr_mem_cache.cc b/psol/include/net/instaweb/apache/apr_mem_cache.cc new file mode 100644 index 000000000..7dbadc01d --- /dev/null +++ b/psol/include/net/instaweb/apache/apr_mem_cache.cc @@ -0,0 +1,487 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+// Author: jmarantz@google.com (Joshua Marantz)
+
+#include "net/instaweb/apache/apr_mem_cache.h"
+
+#include "apr_pools.h"
+#include "base/logging.h"
+#include "net/instaweb/apache/apr_thread_compatible_pool.h"
+#include "net/instaweb/util/public/cache_interface.h"
+#include "net/instaweb/util/public/hasher.h"
+#include "net/instaweb/util/public/hostname_util.h"
+#include "net/instaweb/util/public/key_value_codec.h"
+#include "net/instaweb/util/public/message_handler.h"
+#include "net/instaweb/util/public/shared_string.h"
+#include "net/instaweb/util/public/statistics.h"
+#include "net/instaweb/util/public/string_util.h"
+#include "net/instaweb/util/public/timer.h"
+#include "net/instaweb/util/stack_buffer.h"
+#include "third_party/aprutil/apr_memcache2.h"
+
+namespace net_instaweb {
+
+namespace {
+
+// Defaults copied from Apache 2.4 src distribution:
+// src/modules/cache/mod_socache_memcache.c
+const int kDefaultMemcachedPort = 11211;
+const int kDefaultServerMin = 0;   // minimum # client sockets to open
+const int kDefaultServerSmax = 1;  // soft max # client connections to open
+const char kMemCacheTimeouts[] = "memcache_timeouts";
+const char kLastErrorCheckpointMs[] = "memcache_last_error_checkpoint_ms";
+const char kErrorBurstSize[] = "memcache_error_burst_size";
+
+// time-to-live of a client connection. There is a bug in the APR
+// implementation, where the TTL argument to apr_memcache2_server_create was
+// being interpreted in microseconds, rather than seconds.
+//
+// See: http://mail-archives.apache.org/mod_mbox/apr-dev/201209.mbox/browser
+// and: http://svn.apache.org/viewvc?view=revision&revision=1390530
+//
+// TODO(jmarantz): figure out somehow if that fix is applied, and if so,
+// do not multiply by 1M.
+const int kDefaultServerTtlUs = 600*1000*1000;
+
+const int kTimeoutUnset = -1;
+
+}  // namespace
+
+AprMemCache::AprMemCache(const StringPiece& servers, int thread_limit,
+                         Hasher* hasher, Statistics* statistics,
+                         Timer* timer, MessageHandler* handler)
+    : valid_server_spec_(false),
+      thread_limit_(thread_limit),
+      timeout_us_(kTimeoutUnset),
+      pool_(NULL),
+      memcached_(NULL),
+      hasher_(hasher),
+      timer_(timer),
+      timeouts_(statistics->GetVariable(kMemCacheTimeouts)),
+      last_error_checkpoint_ms_(statistics->GetVariable(
+          kLastErrorCheckpointMs)),
+      error_burst_size_(statistics->GetVariable(kErrorBurstSize)),
+      is_machine_local_(true),
+      message_handler_(handler), last_apr_error_(NULL) {  // never left dangling
+  servers.CopyToString(&server_spec_);
+  pool_ = AprCreateThreadCompatiblePool(NULL);
+
+  // Get our hostname for the is_machine_local_ analysis below.
+  GoogleString hostname(GetHostname());
+
+  // Don't try to connect on construction; we don't want to bother creating
+  // connections to the memcached servers in the root process. But do parse
+  // the server spec so we can determine its validity.
+  //
+  // TODO(jmarantz): consider doing an initial connect/disconnect during
+  // config parsing to get better error reporting on Apache startup.
+  StringPieceVector server_vector;
+  SplitStringPieceToVector(servers, ",", &server_vector, true);
+  bool success = true;
+  for (int i = 0, n = server_vector.size(); i < n; ++i) {
+    StringPieceVector host_port;
+    int port = kDefaultMemcachedPort;
+    SplitStringPieceToVector(server_vector[i], ":", &host_port, true);
+    bool ok = false;
+    if (host_port.size() == 1) {
+      ok = true;
+    } else if (host_port.size() == 2) {
+      ok = StringToInt(host_port[1].as_string(), &port);
+    }
+    if (ok) {
+      // If any host isn't "localhost" then the machine isn't local.
+      is_machine_local_ &= IsLocalhost(host_port[0], hostname);
+      host_port[0].CopyToString(StringVectorAdd(&hosts_));
+      ports_.push_back(port);
+    } else {
+      message_handler_->Message(kError, "Invalid memcached server: %s",
+                                server_vector[i].as_string().c_str());
+      success = false;
+    }
+  }
+  valid_server_spec_ = success && !server_vector.empty();
+}
+
+AprMemCache::~AprMemCache() {
+  apr_pool_destroy(pool_);
+}
+
+void AprMemCache::InitStats(Statistics* statistics) {
+  statistics->AddVariable(kMemCacheTimeouts);
+  statistics->AddVariable(kLastErrorCheckpointMs);
+  statistics->AddVariable(kErrorBurstSize);
+}
+
+bool AprMemCache::Connect() {
+  apr_status_t status =
+      apr_memcache2_create(pool_, hosts_.size(), 0, &memcached_);
+  bool success = false;
+  if ((status == APR_SUCCESS) && !hosts_.empty()) {
+    success = true;
+    CHECK_EQ(hosts_.size(), ports_.size());
+    for (int i = 0, n = hosts_.size(); i < n; ++i) {
+      apr_memcache2_server_t* server = NULL;
+      status = apr_memcache2_server_create(
+          pool_, hosts_[i].c_str(), ports_[i],
+          kDefaultServerMin, kDefaultServerSmax,
+          thread_limit_, kDefaultServerTtlUs, &server);
+      if ((status != APR_SUCCESS) ||
+          ((status = apr_memcache2_add_server(memcached_, server)) !=
+           APR_SUCCESS)) {  // status must hold the APR code, not a bool.
+        char buf[kStackBufferSize];
+        apr_strerror(status, buf, sizeof(buf));
+        message_handler_->Message(
+            kError, "Failed to attach memcached server %s:%d %s (%d)",
+            hosts_[i].c_str(), ports_[i], buf, status);
+        success = false;
+      } else {
+        if (timeout_us_ != kTimeoutUnset) {
+          apr_memcache2_set_timeout_microseconds(memcached_, timeout_us_);
+        }
+        servers_.push_back(server);
+      }
+    }
+  }
+  return success;
+}
+
+void AprMemCache::DecodeValueMatchingKeyAndCallCallback(
+    const GoogleString& key, const char* data, size_t data_len,
+    const char* calling_method, Callback* callback) {
+  SharedString key_and_value;
+  key_and_value.Assign(data, data_len);
+  GoogleString actual_key;
+  if (key_value_codec::Decode(&key_and_value, &actual_key, callback->value())) {
+    if (key == actual_key) {
+      ValidateAndReportResult(actual_key, CacheInterface::kAvailable, callback);
+    } else {
+      message_handler_->Message(
+          kError, "AprMemCache::%s key collision %s != %s",
+          calling_method, key.c_str(), actual_key.c_str());
+      ValidateAndReportResult(key, CacheInterface::kNotFound, callback);
+    }
+  } else {
+    message_handler_->Message(
+        kError, "AprMemCache::%s decoding error on key %s",
+        calling_method, key.c_str());
+    ValidateAndReportResult(key, CacheInterface::kNotFound, callback);
+  }
+}
+
+void AprMemCache::Get(const GoogleString& key, Callback* callback) {
+  if (!IsHealthy()) {
+    ValidateAndReportResult(key, CacheInterface::kNotFound, callback);
+    return;
+  }
+  apr_pool_t* data_pool = NULL;  // Per-lookup scratch pool; destroyed below.
+  apr_pool_create(&data_pool, pool_);
+  CHECK(data_pool != NULL) << "apr_pool_t data_pool allocation failure";
+  GoogleString hashed_key = hasher_->Hash(key);
+  char* data;
+  apr_size_t data_len;
+  apr_status_t status = apr_memcache2_getp(
+      memcached_, data_pool, hashed_key.c_str(), &data, &data_len, NULL);
+  if (status == APR_SUCCESS) {
+    DecodeValueMatchingKeyAndCallCallback(key, data, data_len, "Get", callback);
+  } else {
+    if (status != APR_NOTFOUND) {
+      RecordError();
+      char buf[kStackBufferSize];
+      apr_strerror(status, buf, sizeof(buf));
+      message_handler_->Message(
+          kError, "AprMemCache::Get error: %s (%d) on key %s",
+          buf, status, key.c_str());
+      if (status == APR_TIMEUP) {
+        timeouts_->Add(1);
+      }
+    }
+    ValidateAndReportResult(key, CacheInterface::kNotFound, callback);
+  }
+  apr_pool_destroy(data_pool);
+}
+
+void AprMemCache::MultiGet(MultiGetRequest* request) {
+  if (!IsHealthy()) {
+    ReportMultiGetNotFound(request);
+    return;
+  }
+
+  // apr_memcache2_multgetp documentation indicates it may clear the
+  // temp_pool inside the function. Thus it is risky to pass the same
+  // pool for both temp_pool and data_pool, as we need to read the
+  // data after the call.
+  apr_pool_t* data_pool = NULL;
+  apr_pool_create(&data_pool, pool_);
+  CHECK(data_pool != NULL) << "apr_pool_t data_pool allocation failure";
+  apr_pool_t* temp_pool = NULL;
+  apr_pool_create(&temp_pool, pool_);
+  CHECK(temp_pool != NULL) << "apr_pool_t temp_pool allocation failure";
+  apr_hash_t* hash_table = apr_hash_make(data_pool);
+  StringVector hashed_keys;
+
+  for (int i = 0, n = request->size(); i < n; ++i) {
+    GoogleString hashed_key = hasher_->Hash((*request)[i].key);
+    hashed_keys.push_back(hashed_key);
+    apr_memcache2_add_multget_key(data_pool, hashed_key.c_str(), &hash_table);
+  }
+
+  apr_status_t status = apr_memcache2_multgetp(memcached_, temp_pool, data_pool,
+                                               hash_table);
+  apr_pool_destroy(temp_pool);
+  bool error_recorded = false;
+  if (status == APR_SUCCESS) {
+    for (int i = 0, n = request->size(); i < n; ++i) {
+      CacheInterface::KeyCallback* key_callback = &(*request)[i];
+      const GoogleString& key = key_callback->key;
+      Callback* callback = key_callback->callback;
+      const GoogleString& hashed_key = hashed_keys[i];
+      apr_memcache2_value_t* value = static_cast<apr_memcache2_value_t*>(
+          apr_hash_get(hash_table, hashed_key.data(), hashed_key.size()));
+      if (value == NULL) {
+        status = APR_NOTFOUND;
+      } else {
+        status = value->status;
+      }
+      if (status == APR_SUCCESS) {
+        DecodeValueMatchingKeyAndCallCallback(key, value->data, value->len,
+                                              "MultiGet", callback);
+      } else {
+        if (status != APR_NOTFOUND) {
+          if (!error_recorded) {
+            // Only count 1 error towards threshold on MultiGet failure.
+            error_recorded = true;
+            RecordError();
+          }
+          char buf[kStackBufferSize];
+          apr_strerror(status, buf, sizeof(buf));
+          message_handler_->Message(
+              kError, "AprMemCache::MultiGet error: %s (%d) on key %s",
+              buf, status, key.c_str());
+          if (status == APR_TIMEUP) {
+            timeouts_->Add(1);
+          }
+        }
+        ValidateAndReportResult(key, CacheInterface::kNotFound, callback);
+      }
+    }
+    delete request;
+  } else {
+    RecordError();
+    char buf[kStackBufferSize];
+    apr_strerror(status, buf, sizeof(buf));
+    message_handler_->Message(
+        kError, "AprMemCache::MultiGet error: %s (%d) on %d keys",
+        buf, status, static_cast<int>(request->size()));
+    ReportMultiGetNotFound(request);
+  }
+  apr_pool_destroy(data_pool);
+}
+
+void AprMemCache::PutHelper(const GoogleString& key,
+                            SharedString* key_and_value) {
+  // I believe apr_memcache2_set erroneously takes a char* for the value.
+  // Hence we const_cast.
+  GoogleString hashed_key = hasher_->Hash(key);
+  apr_status_t status = apr_memcache2_set(
+      memcached_, hashed_key.c_str(),
+      const_cast<char*>(key_and_value->data()), key_and_value->size(),
+      0, 0);
+  if (status != APR_SUCCESS) {
+    RecordError();
+    char buf[kStackBufferSize];
+    apr_strerror(status, buf, sizeof(buf));
+
+    int value_size = key_value_codec::GetValueSizeFromKeyAndKeyValue(
+        key, *key_and_value);
+    message_handler_->Message(
+        kError, "AprMemCache::Put error: %s (%d) on key %s, value-size %d",
+        buf, status, key.c_str(), value_size);
+  }
+}
+
+void AprMemCache::PutWithKeyInValue(const GoogleString& key,
+                                    SharedString* key_and_value) {
+  if (!IsHealthy()) {
+    return;
+  }
+  PutHelper(key, key_and_value);
+}
+
+void AprMemCache::Put(const GoogleString& key, SharedString* value) {
+  if (!IsHealthy()) {
+    return;
+  }
+
+  SharedString key_and_value;
+  if (key_value_codec::Encode(key, value, &key_and_value)) {
+    PutHelper(key, &key_and_value);
+  } else {
+    message_handler_->Message(
+        kError, "AprMemCache::Put error: key size %d too large, first "
+        "100 bytes of key is: %s",
+        static_cast<int>(key.size()), key.substr(0, 100).c_str());
+  }
+}
+
+void AprMemCache::Delete(const GoogleString& key) {
+  if (!IsHealthy()) {
+    return;
+  }
+
+  // Note that deleting a key whose value exceeds our size threshold
+  // will not actually remove it from the fallback cache. However, it
+  // will remove our sentinel indicating that it's in the fallback cache,
+  // and therefore it will be functionally deleted.
+  //
+  // TODO(jmarantz): determine whether it's better to defensively delete
+  // it from the fallback cache even though most data will not be, thus
+  // incurring file system overhead for small data deleted from memcached.
+  //
+  // Another option would be to issue a Get before the Delete to see
+  // if it's in the fallback cache, but that would send more load to
+  // memcached, possibly transferring significant amounts of data that
+  // will be tossed.
+
+  GoogleString hashed_key = hasher_->Hash(key);
+  apr_status_t status = apr_memcache2_delete(memcached_, hashed_key.c_str(), 0);
+  if ((status != APR_SUCCESS) && (status != APR_NOTFOUND)) {
+    RecordError();
+    char buf[kStackBufferSize];
+    apr_strerror(status, buf, sizeof(buf));
+    message_handler_->Message(
+        kError, "AprMemCache::Delete error: %s (%d) on key %s", buf, status,
+        key.c_str());
+  }
+}
+
+bool AprMemCache::GetStatus(GoogleString* buffer) {
+  apr_pool_t* temp_pool = NULL;
+  apr_pool_create(&temp_pool, pool_);
+  CHECK(temp_pool != NULL) << "apr_pool_t allocation failure";
+  bool ret = true;
+  for (int i = 0, n = servers_.size(); i < n; ++i) {
+    apr_memcache2_stats_t* stats;
+    apr_status_t status = apr_memcache2_stats(servers_[i], temp_pool, &stats);
+    if (status == APR_SUCCESS) {
+      StrAppend(buffer, "memcached server ", hosts_[i], ":",
+                IntegerToString(ports_[i]), " version ", stats->version);
+      StrAppend(buffer, " pid ", IntegerToString(stats->pid), " up ",
+                IntegerToString(stats->uptime), " seconds \n");
+      StrAppend(buffer, "bytes: ",
+                Integer64ToString(stats->bytes), "\n");
+      StrAppend(buffer, "bytes_read: ",
+                Integer64ToString(stats->bytes_read), "\n");
+      StrAppend(buffer, "bytes_written: ",
+                Integer64ToString(stats->bytes_written), "\n");
+      StrAppend(buffer, "cmd_get: ",
+                IntegerToString(stats->cmd_get), "\n");
+      StrAppend(buffer, "cmd_set: ",
+                IntegerToString(stats->cmd_set), "\n");
+      StrAppend(buffer, "connection_structures: ",
+                IntegerToString(stats->connection_structures), "\n");
+      StrAppend(buffer, "curr_connections: ",
+                IntegerToString(stats->curr_connections), "\n");
+      StrAppend(buffer, "curr_items: ",
+                IntegerToString(stats->curr_items), "\n");
+      StrAppend(buffer, "evictions: ",
+                Integer64ToString(stats->evictions), "\n");
+      StrAppend(buffer, "get_hits: ",
+                IntegerToString(stats->get_hits), "\n");
+      StrAppend(buffer, "get_misses: ",
+                IntegerToString(stats->get_misses), "\n");
+      StrAppend(buffer, "limit_maxbytes: ",
+                IntegerToString(stats->limit_maxbytes), "\n");
+      StrAppend(buffer, "pointer_size: ",
+                IntegerToString(stats->pointer_size), "\n");
+      StrAppend(buffer, "rusage_system: ",
+                Integer64ToString(stats->rusage_system), "\n");
+      StrAppend(buffer, "rusage_user: ",
+                Integer64ToString(stats->rusage_user), "\n");
+      StrAppend(buffer, "threads: ",
+                IntegerToString(stats->threads), "\n");
+      StrAppend(buffer, "total_connections: ",
+                IntegerToString(stats->total_connections), "\n");
+      StrAppend(buffer, "total_items: ",
+                IntegerToString(stats->total_items), "\n");
+      StrAppend(buffer, "\n");
+      // TODO(jmarantz): add the rest of the stats from http://apr.apache.org
+      // /docs/apr-util/1.4/structapr__memcache__stats__t.html
+    } else {
+      ret = false;
+    }
+  }
+  apr_pool_destroy(temp_pool);
+  return ret;
+}
+
+void AprMemCache::RecordError() {
+  // Note that we are sharing state with other Apache child processes,
+  // and we use Statistics Variables to determine our current health
+  // status. In Apache those are implemented via shared memory.
+  int64 time_ms = timer_->NowMs();
+  int64 last_error_checkpoint_ms = last_error_checkpoint_ms_->Get();
+  int64 delta_ms = time_ms - last_error_checkpoint_ms;
+
+  // The first time we catch an error we'll set the time of the error.
+  // We'll keep counting errors for 30 seconds declaring sickness when
+  // we reach 4. That's an approximation because there will be
+  // cross-process races between accesses of the time & counts.
+  //
+  // When we get to 30 seconds since the start of the error burst we
+  // clear everything & start counting again.
+  if (delta_ms > kHealthCheckpointIntervalMs) {
+    last_error_checkpoint_ms_->Set(time_ms);
+    error_burst_size_->Set(1);
+  } else {
+    error_burst_size_->Add(1);
+  }
+}
+
+bool AprMemCache::IsHealthy() const {
+  if (shutdown_.value()) {
+    return false;
+  }
+  int64 time_ms = timer_->NowMs();
+  int64 last_error_checkpoint_ms = last_error_checkpoint_ms_->Get();
+  int64 delta_ms = time_ms - last_error_checkpoint_ms;
+  int64 error_burst_size = error_burst_size_->Get();
+
+  if (delta_ms > kHealthCheckpointIntervalMs) {
+    if (error_burst_size >= kMaxErrorBurst) {
+      // We were sick, but now it seems enough time has expired to
+      // see whether we've recovered.
+      message_handler_->Message(
+          kInfo, "AprMemCache::IsHealthy error: Attempting to recover");
+    }
+    error_burst_size_->Set(0);
+    return true;
+  }
+  return error_burst_size < kMaxErrorBurst;
+}
+
+void AprMemCache::ShutDown() {
+  shutdown_.set_value(true);
+}
+
+void AprMemCache::set_timeout_us(int timeout_us) {
+  timeout_us_ = timeout_us;
+  if ((memcached_ != NULL) && (timeout_us != kTimeoutUnset)) {
+    apr_memcache2_set_timeout_microseconds(memcached_, timeout_us);
+  }
+}
+
+}  // namespace net_instaweb
diff --git a/psol/include/net/instaweb/apache/apr_mem_cache.h b/psol/include/net/instaweb/apache/apr_mem_cache.h
new file mode 100644
index 000000000..f80a618c7
--- /dev/null
+++ b/psol/include/net/instaweb/apache/apr_mem_cache.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: jmarantz@google.com (Joshua Marantz)
+
+#ifndef NET_INSTAWEB_APACHE_APR_MEM_CACHE_H_
+#define NET_INSTAWEB_APACHE_APR_MEM_CACHE_H_
+
+#include <cstddef>
+#include <vector>
+
+#include "net/instaweb/util/public/atomic_bool.h"
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/cache_interface.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/util/public/string_util.h"
+#include "net/instaweb/util/public/timer.h"
+
+struct apr_memcache2_t;
+struct apr_memcache2_server_t;
+struct apr_pool_t;
+
+namespace net_instaweb {
+
+class Hasher;
+class MessageHandler;
+class SharedString;
+class Statistics;
+class Variable;
+
+// Interface to memcached via the apr_memcache2*, as documented in
+// http://apr.apache.org/docs/apr-util/1.4/group___a_p_r___util___m_c.html.
+//
+// While this class derives from CacheInterface, it is a blocking
+// implementation, suitable for instantiating underneath an AsyncCache.
+class AprMemCache : public CacheInterface {
+ public:
+  // Experimentally it seems values larger than 1M bytes result in
+  // a failure, e.g. from load-tests:
+  //     [Fri Jul 20 10:29:34 2012] [error] [mod_pagespeed 0.10.0.0-1699 @1522]
+  //     AprMemCache::Put error: Internal error on key
+  //     http://example.com/image.jpg, value-size 1393146
+  // External to this class, we use a fallback cache (in Apache a FileCache) to
+  // handle too-large requests. This is managed by class FallbackCache in
+  // ../util.
+  static const size_t kValueSizeThreshold = 1 * 1000 * 1000;
+
+  // Amount of time after a burst of errors to retry memcached operations.
+  static const int64 kHealthCheckpointIntervalMs = 30 * Timer::kSecondMs;
+
+  // Maximum number of errors tolerated within kHealthCheckpointIntervalMs,
+  // after which AprMemCache will declare itself unhealthy for
+  // kHealthCheckpointIntervalMs.
+  static const int64 kMaxErrorBurst = 4;
+
+  // servers is a comma-separated list of host[:port] where port defaults
+  // to 11211, the memcached default.
+  //
+  // thread_limit is used to provide apr_memcache2_server_create with
+  // a hard maximum number of client connections to open.
+  AprMemCache(const StringPiece& servers, int thread_limit, Hasher* hasher,
+              Statistics* statistics, Timer* timer, MessageHandler* handler);
+  ~AprMemCache();
+
+  static void InitStats(Statistics* statistics);
+
+  const GoogleString& server_spec() const { return server_spec_; }
+
+  // As mentioned above, Get and MultiGet are blocking in this implementation.
+  virtual void Get(const GoogleString& key, Callback* callback);
+  virtual void Put(const GoogleString& key, SharedString* value);
+  virtual void Delete(const GoogleString& key);
+  virtual void MultiGet(MultiGetRequest* request);
+
+  // Connects to the configured servers, returning whether every one of
+  // them was attached successfully.
+  bool Connect();
+
+  bool valid_server_spec() const { return valid_server_spec_; }
+
+  // Get detailed status in a string, returning false if the server
+  // failed to return status.
+  bool GetStatus(GoogleString* status_string);
+
+  virtual const char* Name() const { return "AprMemCache"; }
+  virtual bool IsBlocking() const { return true; }
+
+  // Records in statistics that a system error occurred, helping it detect
+  // when it's unhealthy if they are too frequent.
+  void RecordError();
+
+  // Determines whether memcached is healthy enough to attempt another
+  // operation. Note that even though there may be multiple shards,
+  // some of which are healthy and some not, we don't currently track
+  // errors on a per-shard basis, so we effectively declare all the
+  // memcached instances unhealthy if any of them are.
+  virtual bool IsHealthy() const;
+
+  // Close down the connection to the memcached servers.
+  virtual void ShutDown();
+
+  virtual bool MustEncodeKeyInValueOnPut() const { return true; }
+  virtual void PutWithKeyInValue(const GoogleString& key,
+                                 SharedString* key_and_value);
+
+  // Sets the I/O timeout in microseconds. This should be called at
+  // setup time and not while there are operations in flight.
+  void set_timeout_us(int timeout_us);
+
+ private:
+  void DecodeValueMatchingKeyAndCallCallback(
+      const GoogleString& key, const char* data, size_t data_len,
+      const char* calling_method, Callback* callback);
+
+  // Puts a value that's already encoded with the key into the cache, without
+  // checking health first. This is meant to be called from Put and
+  // PutWithKeyInValue, which will do the health check.
+  void PutHelper(const GoogleString& key, SharedString* key_and_value);
+
+  StringVector hosts_;
+  std::vector<int> ports_;
+  GoogleString server_spec_;
+  bool valid_server_spec_;
+  int thread_limit_;
+  int timeout_us_;
+  apr_pool_t* pool_;
+  apr_memcache2_t* memcached_;
+  std::vector<apr_memcache2_server_t*> servers_;
+  Hasher* hasher_;
+  Timer* timer_;
+  AtomicBool shutdown_;
+
+  Variable* timeouts_;
+  Variable* last_error_checkpoint_ms_;
+  Variable* error_burst_size_;
+
+  bool is_machine_local_;
+  MessageHandler* message_handler_;
+
+  // When memcached is killed, we will generate errors for every cache
+  // operation. To bound the amount of logging we do, we keep track
+  // of the last time when we issued a log message for an APR failure.
+  // We use a Statistic here for this so that it's shared across
+  // Apache processes.
+  //
+  // Note that we have some messages indicating a potential functional issue
+  // (e.g. key collision) and a variety of places where we print messages
+  // because the Apr routine failed. We are grouping together Apr failures
+  // for Get, Put, Delete, and MultiGet. We might at some point wish to
+  // track the last time we sent a message for each of those.
+  Variable* last_apr_error_;
+
+  DISALLOW_COPY_AND_ASSIGN(AprMemCache);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_APACHE_APR_MEM_CACHE_H_
diff --git a/psol/include/net/instaweb/apache/apr_thread_compatible_pool.cc b/psol/include/net/instaweb/apache/apr_thread_compatible_pool.cc
new file mode 100644
index 000000000..c57c325cb
--- /dev/null
+++ b/psol/include/net/instaweb/apache/apr_thread_compatible_pool.cc
@@ -0,0 +1,54 @@
+// Copyright 2012 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: morlovich@google.com (Maksim Orlovich)
+//         jmarantz@google.com (Joshua Marantz) (refactoring only)
+
+#include "net/instaweb/apache/apr_thread_compatible_pool.h"
+
+#include <cstddef>
+#include "apr_pools.h"
+#include "base/logging.h"
+#include "net/instaweb/util/stack_buffer.h"
+
+namespace net_instaweb {
+
+apr_pool_t* AprCreateThreadCompatiblePool(apr_pool_t* parent_pool) {
+  // Creates a pool that can be used in any thread, even when run in
+  // Apache prefork.
+  //
+  // 1) Concurrent allocations from the same pools are not (thread)safe.
+  // 2) Concurrent allocations from different pools using the same allocator
+  //    are not safe unless the allocator has a mutex set.
+  // 3) prefork's pchild pool (which is our ancestor) has an allocator without
+  //    a mutex set.
+  //
+  // Note: the above is all about the release version of the pool code, the
+  // checking one has some additional locking!
+  apr_pool_t* pool = NULL;
+  apr_allocator_t* allocator = NULL;
+  CHECK(apr_allocator_create(&allocator) == APR_SUCCESS);
+  apr_status_t status =
+      apr_pool_create_ex(&pool, parent_pool, NULL /*abortfn*/, allocator);
+  if ((status != APR_SUCCESS) || (pool == NULL)) {
+    char buf[kStackBufferSize];
+    apr_strerror(status, buf, sizeof(buf));
+    CHECK_EQ(APR_SUCCESS, status) << "apr_pool_create_ex failed: " << buf;
+    CHECK(pool != NULL) << "apr_pool_create_ex failed: " << buf;
+  }
+  apr_allocator_owner_set(allocator, pool);
+  return pool;
+}
+
+}  // namespace net_instaweb
diff --git a/psol/include/net/instaweb/apache/apr_thread_compatible_pool.h b/psol/include/net/instaweb/apache/apr_thread_compatible_pool.h
new file mode 100644
index 000000000..e6da0b846
--- /dev/null
+++ b/psol/include/net/instaweb/apache/apr_thread_compatible_pool.h
@@ -0,0 +1,46 @@
+// Copyright 2012 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// +// Author: morlovich@google.com (Maksim Orlovich) +// jmarantz@google.com (Joshua Marantz) (refactoring only) + +#ifndef NET_INSTAWEB_APACHE_APR_THREAD_COMPATIBLE_POOL_H_ +#define NET_INSTAWEB_APACHE_APR_THREAD_COMPATIBLE_POOL_H_ + +#include "apr_pools.h" + +namespace net_instaweb { + +// Creates a pool that can be used in any thread, even when run in +// Apache prefork. +// +// 1) This method must be called from startup phase only +// 2) Each pool must be accessed only from a single thread (or otherwise +// have its access serialized) +// 3) Different pools returned by this function may be safely used concurrently. +// 4) It's OK to just use ap_pool_create to create child pools of this one from +// multiple threads; those will be re-entrant too (but pools created merely +// as children of Apache's pools will not be reentrant in prefork) +// +// In short, pools returned by this method are not fully threadsafe, but +// at least they are not thread-hostile, which is what you get with +// apr_pool_create in Prefork. +// +// Note: the above is all about the release version of the pool code, the +// checking one has some additional locking! +apr_pool_t* AprCreateThreadCompatiblePool(apr_pool_t* parent_pool); + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_APR_THREAD_COMPATIBLE_POOL_H_ diff --git a/psol/include/net/instaweb/apache/apr_timer.h b/psol/include/net/instaweb/apache/apr_timer.h new file mode 100644 index 000000000..f2752edbb --- /dev/null +++ b/psol/include/net/instaweb/apache/apr_timer.h @@ -0,0 +1,38 @@ +// Copyright 2010 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: jmarantz@google.com (Joshua Marantz)
+//         lsong@google.com (Libo Song)
+
+#ifndef NET_INSTAWEB_APACHE_APR_TIMER_H_
+#define NET_INSTAWEB_APACHE_APR_TIMER_H_
+
+#include "net/instaweb/util/public/timer.h"
+
+#include "net/instaweb/util/public/basictypes.h"
+// NOTE(review): header-scope using-declaration; exposes Timer to all includers.
+using net_instaweb::Timer;
+
+namespace net_instaweb {
+// Timer subclass used by the Apache port; methods are defined out of line.
+class AprTimer : public Timer {
+ public:
+  virtual ~AprTimer();
+  virtual int64 NowUs() const;
+  virtual void SleepUs(int64 us);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_APACHE_APR_TIMER_H_
diff --git a/psol/include/net/instaweb/apache/header_util.h b/psol/include/net/instaweb/apache/header_util.h
new file mode 100644
index 000000000..4d956dd7e
--- /dev/null
+++ b/psol/include/net/instaweb/apache/header_util.h
@@ -0,0 +1,65 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_APACHE_HEADER_UTIL_H_ +#define NET_INSTAWEB_APACHE_HEADER_UTIL_H_ + +struct request_rec; + +namespace net_instaweb { + +class RequestHeaders; +class ResponseHeaders; + +// Converts Apache header structure into RequestHeaders. +void ApacheRequestToRequestHeaders(const request_rec& request, + RequestHeaders* request_headers); + +// Converts Apache header structure (request.headers_out) into ResponseHeaders +// headers. If err_headers is not NULL then request.err_headers_out is copied +// into it. In the event that headers == err_headers, the headers from +// request.err_headers_out will be appended to the list of headers, but no +// merging occurs. +void ApacheRequestToResponseHeaders(const request_rec& request, + ResponseHeaders* headers, + ResponseHeaders* err_headers); + + + +// Converts ResponseHeaders into an Apache request's headers_out table. +void ResponseHeadersToApacheRequest(const ResponseHeaders& response_headers, + bool ok_to_disable_downstream_headers, + request_rec* request); + +// Converts ResponseHeaders (headers and err_headers) into Apache request +// headers (headers_out and err_headers_out respectively). Either headers or +// err_headers may be NULL but both cannot be. Unlike in +// ApacheRequestToResponseHeaders it does not make sense for headers to equal +// err_headers since it will result in duplicate headers being written. +void AddResponseHeadersToRequest(const ResponseHeaders* headers, + const ResponseHeaders* err_headers, + bool ok_to_disable_downstream_headers, + request_rec* request); + +// Remove downstream filters that might corrupt our caching headers. 
+void DisableDownstreamHeaderFilters(request_rec* request); + +// Debug utility for printing Apache headers to stdout +void PrintHeaders(request_rec* request); + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_HEADER_UTIL_H_ diff --git a/psol/include/net/instaweb/apache/instaweb_context.h b/psol/include/net/instaweb/apache/instaweb_context.h new file mode 100644 index 000000000..c56ebd81d --- /dev/null +++ b/psol/include/net/instaweb/apache/instaweb_context.h @@ -0,0 +1,169 @@ +// Copyright 2010 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jmarantz@google.com (Joshua Marantz) +// lsong@google.com (Libo Song) + +#ifndef NET_INSTAWEB_APACHE_INSTAWEB_CONTEXT_H_ +#define NET_INSTAWEB_APACHE_INSTAWEB_CONTEXT_H_ + +#include "net/instaweb/automatic/public/html_detector.h" +#include "net/instaweb/http/public/content_type.h" +#include "net/instaweb/http/public/response_headers.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/property_cache.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" +#include "net/instaweb/util/public/string_writer.h" +#include "net/instaweb/util/public/thread_system.h" + +// The httpd header must be after the +// apache_rewrite_driver_factory.h. Otherwise, the compiler will +// complain "strtoul_is_not_a_portable_function_use_strtol_instead". 
+#include "httpd.h" +#include "apr_pools.h" + +struct apr_bucket_brigade; +struct request_rec; +struct server_rec; + +namespace net_instaweb { + +class ApacheServerContext; +class GzipInflater; +class RequestHeaders; +class RewriteDriver; +class RewriteOptions; + +const char kPagespeedOriginalUrl[] = "mod_pagespeed_original_url"; + +// Tracks a single property-cache lookup. +class PropertyCallback : public PropertyPage { + public: + PropertyCallback(RewriteDriver* driver, + ThreadSystem* thread_system, + const StringPiece& key); + + virtual void Done(bool success); + + void BlockUntilDone(); + + private: + RewriteDriver* driver_; + GoogleString url_; + bool done_; + scoped_ptr mutex_; + scoped_ptr condvar_; + DISALLOW_COPY_AND_ASSIGN(PropertyCallback); +}; + +// Context for an HTML rewrite. +// +// One is created for responses that appear to be HTML (although there is +// a basic sanity check that the first non-space char is '<'). +// +// The rewriter will put the rewritten content into the output string when +// flushed or finished. We call Flush when we see the FLUSH bucket, and +// call Finish when we see the EOS bucket. +// +// TODO(sligocki): Factor out similarities between this and ProxyFetch. +class InstawebContext { + public: + enum ContentEncoding { kNone, kGzip, kDeflate, kOther }; + enum ContentDetectionState { kStart, kHtml, kNotHtml }; + + // Takes ownership of request_headers. 
+ InstawebContext(request_rec* request, + RequestHeaders* request_headers, + const ContentType& content_type, + ApacheServerContext* server_context, + const GoogleString& base_url, + bool using_spdy, + bool use_custom_options, + const RewriteOptions& options); + ~InstawebContext(); + + void Rewrite(const char* input, int size); + void Flush(); + void Finish(); + + apr_bucket_brigade* bucket_brigade() const { return bucket_brigade_; } + ContentEncoding content_encoding() const { return content_encoding_; } + ApacheServerContext* apache_server_context() { return server_context_; } + const GoogleString& output() { return output_; } + bool empty() const { return output_.empty(); } + void clear() { output_.clear(); } // TODO(jmarantz): needed? + + ResponseHeaders* response_headers() { + return &response_headers_; + } + + bool sent_headers() { return sent_headers_; } + void set_sent_headers(bool sent) { sent_headers_ = sent; } + + // Populates response_headers_ with the request's headers_out table. + void PopulateHeaders(request_rec* request); + + // Looks up the apache server context from the server rec. + // TODO(jmarantz): Is there a better place to put this? It needs to + // be used by both mod_instaweb.cc and instaweb_handler.cc. + static ApacheServerContext* ServerContextFromServerRec(server_rec* server); + + // Returns a fetchable URI from a request, using the request pool. + static const char* MakeRequestUrl(const RewriteOptions& options, + request_rec* request); + + bool modify_caching_headers() const { return modify_caching_headers_; } + + private: + void ComputeContentEncoding(request_rec* request); + + // Start a new property cache lookup. The caller is responsible for cleaning + // up the returned PropertyCallback*. + PropertyCallback* InitiatePropertyCacheLookup(); + void ProcessBytes(const char* input, int size); + + // Checks to see if there was a Furious cookie sent with the request.
+ // If there was not, set one, and add a Set-Cookie header to the + // response headers. + // If there was one, make sure to set the options state appropriately. + void SetFuriousStateAndCookie(request_rec* request, RewriteOptions* options); + + static apr_status_t Cleanup(void* object); + + GoogleString output_; // content after instaweb rewriting. + apr_bucket_brigade* bucket_brigade_; + ContentEncoding content_encoding_; + const ContentType content_type_; + + ApacheServerContext* server_context_; + RewriteDriver* rewrite_driver_; + StringWriter string_writer_; + scoped_ptr inflater_; + HtmlDetector html_detector_; + GoogleString absolute_url_; + scoped_ptr request_headers_; + ResponseHeaders response_headers_; + bool started_parse_; + bool sent_headers_; + bool populated_headers_; + bool modify_caching_headers_; + + DISALLOW_COPY_AND_ASSIGN(InstawebContext); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_INSTAWEB_CONTEXT_H_ diff --git a/psol/include/net/instaweb/apache/instaweb_handler.h b/psol/include/net/instaweb/apache/instaweb_handler.h new file mode 100644 index 000000000..300a415be --- /dev/null +++ b/psol/include/net/instaweb/apache/instaweb_handler.h @@ -0,0 +1,52 @@ +// Copyright 2010 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: lsong@google.com (Libo Song) +// jmarantz@google.com (Joshua Marantz) +// +// The Apache handler for rewritten resources and a couple of other Apache hooks.
+ +#ifndef NET_INSTAWEB_APACHE_INSTAWEB_HANDLER_H_ +#define NET_INSTAWEB_APACHE_INSTAWEB_HANDLER_H_ + +#include "apr_pools.h" // for apr_status_t +// The httpd header must be after the instaweb_context.h. Otherwise, +// the compiler will complain +// "strtoul_is_not_a_portable_function_use_strtol_instead". +#include "httpd.h" + +namespace net_instaweb { + +// Was this request made by mod_pagespeed itself? If so, we should not try to +// handle it, just let Apache deal with it like normal. +bool is_pagespeed_subrequest(request_rec* request); + +// Handle mod_pagespeed-specific requests. Handles both .pagespeed. rewritten +// resources and /mod_pagespeed_statistics, /mod_pagespeed_beacon, etc. +// TODO(sligocki): Why not make each of these different handlers? +apr_status_t instaweb_handler(request_rec* request); + +// Save the original URL as a request "note" before mod_rewrite has +// a chance to corrupt mod_pagespeed's generated URLs, which would +// prevent instaweb_handler from being able to decode the resource. +apr_status_t save_url_hook(request_rec *request); + +// By default, apache imposes limitations on URL segments of around +// 256 characters that appear to correspond to filename limitations. +// To prevent that, we hook map_to_storage for our own purposes. +apr_status_t instaweb_map_to_storage(request_rec* request); + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_INSTAWEB_HANDLER_H_ diff --git a/psol/include/net/instaweb/apache/interface_mod_spdy.h b/psol/include/net/instaweb/apache/interface_mod_spdy.h new file mode 100644 index 000000000..23211625d --- /dev/null +++ b/psol/include/net/instaweb/apache/interface_mod_spdy.h @@ -0,0 +1,64 @@ +// Copyright 2012 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: morlovich@google.com (Maksim Orlovich) +// +// Interfaces with mod_spdy's exported functions. + +#ifndef NET_INSTAWEB_APACHE_INTERFACE_MOD_SPDY_H_ +#define NET_INSTAWEB_APACHE_INTERFACE_MOD_SPDY_H_ + +#include "util_filter.h" + +#include "third_party/mod_spdy/src/mod_spdy/apache/slave_connection_api.h" + +struct conn_rec; + +namespace net_instaweb { + +// Needs to be called from a ap_hook_optional_fn_retrieve hook. +void attach_mod_spdy(); + +// If the connection is using SPDY with mod_spdy, returns the protocol +// version. Otherwise, returns 0. +int mod_spdy_get_spdy_version(conn_rec* conn); + +// See mod_spdy's slave_connection_api.h for description of the methods below. +// These are merely forwarding wrappers with some CHECKS. +// Note that this method will return NULL if the relevant mod_spdy methods +// weren't found registered with Apache. Others, however, will CHECK-fail +// (since there is no sensible way to call them if this method failed); +// except you can always safely mod_spdy_destroy_slave_connection_factory(NULL). 
+spdy_slave_connection_factory* mod_spdy_create_slave_connection_factory( + conn_rec* master_connection); +void mod_spdy_destroy_slave_connection_factory( + spdy_slave_connection_factory* factory); + +spdy_slave_connection* mod_spdy_create_slave_connection( + spdy_slave_connection_factory* factory, + ap_filter_rec_t* input_filter, + void* input_filter_ctx, + ap_filter_rec_t* output_filter, + void* output_filter_ctx); + +void mod_spdy_run_slave_connection(spdy_slave_connection* conn); +void mod_spdy_destroy_slave_connection(spdy_slave_connection* conn); + +// Returns true if given connection is using HTTPS. +// (This is actually a mod_ssl function). +bool mod_ssl_is_https(conn_rec* conn); + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_INTERFACE_MOD_SPDY_H_ diff --git a/psol/include/net/instaweb/apache/log_message_handler.h b/psol/include/net/instaweb/apache/log_message_handler.h new file mode 100644 index 000000000..773a9af98 --- /dev/null +++ b/psol/include/net/instaweb/apache/log_message_handler.h @@ -0,0 +1,49 @@ +// Copyright 2010 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Author: jmarantz@google.com (Joshua Marantz) +// Author: sligocki@google.com (Shawn Ligocki) + +#ifndef NET_INSTAWEB_APACHE_LOG_MESSAGE_HANDLER_H_ +#define NET_INSTAWEB_APACHE_LOG_MESSAGE_HANDLER_H_ + +#include // for std::min +#include "apr_pools.h" + +#include "net/instaweb/util/public/string_util.h" + +struct server_rec; + +namespace net_instaweb { + +namespace log_message_handler { + +// Install a log message handler that routes LOG() messages to the +// apache error log. Should be called once at startup. +void Install(apr_pool_t* pool); + +// The log_message_handler is not attached to a specific server_rec, so the +// LogLevel is not automatically set for it. Every server_rec instance +// should call AddServerConfig and let us decide what level to log at. +// Currently we set it to the min LogLevel. +void AddServerConfig(const server_rec* server, const StringPiece& version); + +// Free the memory from the log message handler +void ShutDown(); + +} // namespace log_message_handler + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_LOG_MESSAGE_HANDLER_H_ diff --git a/psol/include/net/instaweb/apache/loopback_route_fetcher.h b/psol/include/net/instaweb/apache/loopback_route_fetcher.h new file mode 100644 index 000000000..0d547a5fc --- /dev/null +++ b/psol/include/net/instaweb/apache/loopback_route_fetcher.h @@ -0,0 +1,70 @@ +// Copyright 2012 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Author: morlovich@google.com (Maksim Orlovich) +// +// This fetcher routes requests to hosts that are not explicitly mentioned in +// the DomainLawyer via the loopback. + +#ifndef NET_INSTAWEB_APACHE_LOOPBACK_ROUTE_FETCHER_H_ +#define NET_INSTAWEB_APACHE_LOOPBACK_ROUTE_FETCHER_H_ + +#include "net/instaweb/http/public/url_async_fetcher.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" + +struct apr_sockaddr_t; + +namespace net_instaweb { + +class AsyncFetch; +class RewriteOptions; +class MessageHandler; + +// See file comment. +class LoopbackRouteFetcher : public UrlAsyncFetcher { + public: + // Does not take ownership of anything. own_port is the port the incoming + // request came in on. If the backend_fetcher does actual fetching (and is + // not merely simulating it for testing purposes) it should be the Serf + // fetcher, as others may not direct requests this class produces properly. + // (As this fetcher may produce requests that need to connect to 127.0.0.1 + // but have a Host: and URL from somewhere else). + LoopbackRouteFetcher(const RewriteOptions* options, + int own_port, + UrlAsyncFetcher* backend_fetcher); + virtual ~LoopbackRouteFetcher(); + + virtual bool SupportsHttps() const { + return backend_fetcher_->SupportsHttps(); + } + + virtual void Fetch(const GoogleString& url, + MessageHandler* message_handler, + AsyncFetch* fetch); + + // Returns true if the given address is an IPv4 or IPv6 loopback. 
+ static bool IsLoopbackAddr(const apr_sockaddr_t* addr); + + private: + const RewriteOptions* const options_; + int own_port_; + UrlAsyncFetcher* const backend_fetcher_; + + DISALLOW_COPY_AND_ASSIGN(LoopbackRouteFetcher); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_LOOPBACK_ROUTE_FETCHER_H_ diff --git a/psol/include/net/instaweb/apache/mod_instaweb.h b/psol/include/net/instaweb/apache/mod_instaweb.h new file mode 100644 index 000000000..a771893b8 --- /dev/null +++ b/psol/include/net/instaweb/apache/mod_instaweb.h @@ -0,0 +1,27 @@ +// Copyright 2010 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jefftk@google.com (Jeff Kaufman) + +#ifndef NET_INSTAWEB_APACHE_MOD_INSTAWEB_H_ +#define NET_INSTAWEB_APACHE_MOD_INSTAWEB_H_ + +#include "http_config.h" +#include "httpd.h" + +extern "C" { +extern module AP_MODULE_DECLARE_DATA pagespeed_module; +} + +#endif // NET_INSTAWEB_APACHE_MOD_INSTAWEB_H_ diff --git a/psol/include/net/instaweb/apache/mod_spdy_fetch_controller.h b/psol/include/net/instaweb/apache/mod_spdy_fetch_controller.h new file mode 100644 index 000000000..ae6b66542 --- /dev/null +++ b/psol/include/net/instaweb/apache/mod_spdy_fetch_controller.h @@ -0,0 +1,67 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: morlovich@google.com (Maksim Orlovich) +// +// ModSpdyFetchController coordinates a threadpool and a rate controller between +// multiple ModSpdyFetcher objects. The basic usage pattern is that +// ModSpdyFetcher::Fetch calls ModSpdyFetchController::ScheduleBlockingFetch, +// which will then cause ModSpdyFetcher::BlockingFetch to be called on a +// thread in a hopefully intelligent manner. + +#ifndef NET_INSTAWEB_APACHE_MOD_SPDY_FETCH_CONTROLLER_H_ +#define NET_INSTAWEB_APACHE_MOD_SPDY_FETCH_CONTROLLER_H_ + +#include "net/instaweb/http/public/rate_controller.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/queued_worker_pool.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { + +class AsyncFetch; +class MessageHandler; +class ModSpdyFetcher; +class Statistics; +class ThreadSystem; + +class ModSpdyFetchController { + public: + // Note: RateController::InitStats must have been called before using this. + ModSpdyFetchController(int num_threads, + ThreadSystem* thread_system, + Statistics* statistics); + ~ModSpdyFetchController(); + + // Arranges for fetcher->BlockingFetch to be called on our thread pool. + void ScheduleBlockingFetch( + ModSpdyFetcher* fetcher, const GoogleString& url, + MessageHandler* message_handler, AsyncFetch* fetch); + + // TODO(morlovich): Add a ShutDown(), with semantics matching those + // of UrlAsyncFetcher::ShutDown, and invoked similarly. 
+ + private: + class FetchDispatcher; + + RateController rate_controller_; + QueuedWorkerPool thread_pool_; + DISALLOW_COPY_AND_ASSIGN(ModSpdyFetchController); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_MOD_SPDY_FETCH_CONTROLLER_H_ diff --git a/psol/include/net/instaweb/apache/mod_spdy_fetcher.h b/psol/include/net/instaweb/apache/mod_spdy_fetcher.h new file mode 100644 index 000000000..2fd0ee49c --- /dev/null +++ b/psol/include/net/instaweb/apache/mod_spdy_fetcher.h @@ -0,0 +1,84 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: morlovich@google.com (Maksim Orlovich) +// +// A fetcher that talks to mod_spdy for requests matching a certain +// domain (and passes the rest to fallthrough fetcher). + +#ifndef NET_INSTAWEB_APACHE_MOD_SPDY_FETCHER_H_ +#define NET_INSTAWEB_APACHE_MOD_SPDY_FETCHER_H_ + +#include "net/instaweb/http/public/url_async_fetcher.h" + +#include "httpd.h" + + +#include "net/instaweb/apache/interface_mod_spdy.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" + +struct request_rec; +struct spdy_slave_connection_factory; + +namespace net_instaweb { + +class AsyncFetch; +class MessageHandler; +class ModSpdyFetchController; +class RewriteDriver; + +class ModSpdyFetcher : public UrlAsyncFetcher { + public: + // Initializes various filters this fetcher needs for operation. 
+ // This must be called from within a register hooks implementation. + static void Initialize(); + + ModSpdyFetcher(ModSpdyFetchController* controller, + request_rec* req, RewriteDriver* driver); + virtual ~ModSpdyFetcher(); + + virtual void Fetch(const GoogleString& url, + MessageHandler* message_handler, + AsyncFetch* fetch); + + // Returns true if a ModSpdyFetcher should be installed as a session fetcher + // on a given connection. + static bool ShouldUseOn(request_rec* req); + + // TODO(morlovich): Implement virtual void ShutDown(), + // and give a good story on session fetchers and fetcher shutdowns in general. + + private: + friend class ModSpdyFetchController; + + // The actual implementation of fetching code, normally called by + // ModSpdyFetchController. + void BlockingFetch(const GoogleString& url, + MessageHandler* message_handler, + AsyncFetch* fetch); + + ModSpdyFetchController* controller_; + spdy_slave_connection_factory* connection_factory_; + UrlAsyncFetcher* fallback_fetcher_; + GoogleString own_origin_; // empty if we couldn't figure it out. + + DISALLOW_COPY_AND_ASSIGN(ModSpdyFetcher); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_MOD_SPDY_FETCHER_H_ diff --git a/psol/include/net/instaweb/apache/serf_url_async_fetcher.cc b/psol/include/net/instaweb/apache/serf_url_async_fetcher.cc new file mode 100644 index 000000000..0f569af5f --- /dev/null +++ b/psol/include/net/instaweb/apache/serf_url_async_fetcher.cc @@ -0,0 +1,1432 @@ +// Copyright 2010 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jmarantz@google.com (Joshua Marantz) +// lsong@google.com (Libo Song) + +// TODO(jmarantz): Avoid initiating fetches for resources already in flight. +// The challenge is that we would want to call all the callbacks that indicated +// interest in a particular URL once the callback completed. Alternatively, +// this could be done in a level above the URL fetcher. + +#include "net/instaweb/apache/serf_url_async_fetcher.h" + +#include +#include +#include + +#include "apr_strings.h" +#include "apr_pools.h" +#include "apr_thread_proc.h" +#include "base/logging.h" +#include "net/instaweb/apache/apr_thread_compatible_pool.h" +#include "net/instaweb/http/public/async_fetch.h" +#include "net/instaweb/http/public/meta_data.h" +#include "net/instaweb/http/public/request_headers.h" +#include "net/instaweb/http/public/response_headers.h" +#include "net/instaweb/http/public/response_headers_parser.h" +#include "net/instaweb/public/global_constants.h" +#include "net/instaweb/public/version.h" +#include "net/instaweb/util/public/abstract_mutex.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/condvar.h" +#include "net/instaweb/util/public/message_handler.h" +#include "net/instaweb/util/public/pool.h" +#include "net/instaweb/util/public/pool_element.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/statistics.h" +#include "net/instaweb/util/public/string_util.h" +#include "net/instaweb/util/public/thread_system.h" +#include "net/instaweb/util/public/timer.h" +#include "third_party/serf/src/serf.h" + +// This is an easy way to turn on lots of debug messages. Note that this +// is somewhat verbose. 
+#define SERF_DEBUG(x) + +namespace { + +enum HttpsOptions { + kEnableHttps = 1 << 0, + kAllowSelfSigned = 1 << 1, + kAllowUnknownCertificateAuthority = 1 << 2, + kAllowCertificateNotYetValid = 1 << 3, +}; + +const char kFetchMethod[] = "GET"; +} // namespace + +extern "C" { + // Declares new functions added to + // src/third_party/serf/instaweb_context.c +serf_bucket_t* serf_request_bucket_request_create_for_host( + serf_request_t *request, + const char *method, + const char *uri, + serf_bucket_t *body, + serf_bucket_alloc_t *allocator, const char* host); + +int serf_connection_is_in_error_state(serf_connection_t* connection); +} // extern "C" + +namespace net_instaweb { + +const char SerfStats::kSerfFetchRequestCount[] = "serf_fetch_request_count"; +const char SerfStats::kSerfFetchByteCount[] = "serf_fetch_bytes_count"; +const char SerfStats::kSerfFetchTimeDurationMs[] = + "serf_fetch_time_duration_ms"; +const char SerfStats::kSerfFetchCancelCount[] = "serf_fetch_cancel_count"; +const char SerfStats::kSerfFetchActiveCount[] = + "serf_fetch_active_count"; +const char SerfStats::kSerfFetchTimeoutCount[] = "serf_fetch_timeout_count"; +const char SerfStats::kSerfFetchFailureCount[] = "serf_fetch_failure_count"; +const char SerfStats::kSerfFetchCertErrors[] = "serf_fetch_cert_errors"; + +GoogleString GetAprErrorString(apr_status_t status) { + char error_str[1024]; + apr_strerror(status, error_str, sizeof(error_str)); + return error_str; +} + +// TODO(lsong): Move this to a separate file. Necessary? +class SerfFetch : public PoolElement { + public: + // TODO(lsong): make use of request_headers. 
+ SerfFetch(const GoogleString& url, + AsyncFetch* async_fetch, + MessageHandler* message_handler, + Timer* timer) + : fetcher_(NULL), + timer_(timer), + str_url_(url), + async_fetch_(async_fetch), + parser_(async_fetch->response_headers()), + status_line_read_(false), + one_byte_read_(false), + has_saved_byte_(false), + saved_byte_('\0'), + message_handler_(message_handler), + pool_(NULL), // filled in once assigned to a thread, to use its pool. + bucket_alloc_(NULL), + connection_(NULL), + bytes_received_(0), + fetch_start_ms_(0), + fetch_end_ms_(0), + using_https_(false), + ssl_context_(NULL), + ssl_error_message_(NULL) { + } + + ~SerfFetch() { + DCHECK(async_fetch_ == NULL); + if (connection_ != NULL) { + serf_connection_close(connection_); + } + if (pool_ != NULL) { + apr_pool_destroy(pool_); + } + } + + // Start the fetch. It returns immediately. This can only be run when + // locked with fetcher->mutex_. + bool Start(SerfUrlAsyncFetcher* fetcher); + + const char* str_url() { return str_url_.c_str(); } + + // This must be called while holding SerfUrlAsyncFetcher's mutex_. + void Cancel() { + if (connection_ != NULL) { + // We can get here either because we're canceling the connection ourselves + // or because Serf detected an error. + // + // If we canceled/timed out, we want to close the serf connection so it + // doesn't call us back, as we will detach from the async_fetch_ shortly. + // + // If Serf detected an error we also want to clean up as otherwise it will + // keep re-detecting it, which will interfere with other jobs getting + // handled (until we finally cleanup the old fetch and close things in + // ~SerfFetch). + serf_connection_close(connection_); + connection_ = NULL; + } + + CallCallback(false); + } + + // Calls the callback supplied by the user. This needs to happen + // exactly once. In some error cases it appears that Serf calls + // HandleResponse multiple times on the same object. 
+ // + // This must be called while holding SerfUrlAsyncFetcher's mutex_. + void CallCallback(bool success) { + if (ssl_error_message_ != NULL) { + success = false; + } + + if (async_fetch_ == NULL) { + LOG(FATAL) << "BUG: Serf callback called more than once on same fetch " + << str_url() << " (" << this << "). Please report this " + << "at http://code.google.com/p/modpagespeed/issues/"; + } else { + CallbackDone(success); + fetch_end_ms_ = timer_->NowMs(); + fetcher_->FetchComplete(this); + } + } + + void CallbackDone(bool success) { + // fetcher_==NULL if Start is called during shutdown. + if (fetcher_ != NULL) { + if (!success) { + fetcher_->failure_count_->Add(1); + } + if (fetcher_->track_original_content_length() && + !async_fetch_->response_headers()->Has( + HttpAttributes::kXOriginalContentLength)) { + async_fetch_->extra_response_headers()->SetOriginalContentLength( + bytes_received_); + } + } + async_fetch_->Done(success); + // We should always NULL the async_fetch_ out after calling otherwise we + // could get weird double calling errors. + async_fetch_ = NULL; + } + + // If last poll of this fetch's connection resulted in an error, clean it up. + // Must be called after serf_context_run, with fetcher's mutex_ held. + void CleanupIfError() { + if ((connection_ != NULL) && + serf_connection_is_in_error_state(connection_)) { + message_handler_->Message( + kInfo, "Serf cleanup for error'd fetch of: %s", str_url()); + Cancel(); + } + } + + int64 TimeDuration() const { + if ((fetch_start_ms_ != 0) && (fetch_end_ms_ != 0)) { + return fetch_end_ms_ - fetch_start_ms_; + } else { + return 0; + } + } + int64 fetch_start_ms() const { return fetch_start_ms_; } + + size_t bytes_received() const { return bytes_received_; } + MessageHandler* message_handler() { return message_handler_; } + + private: + // Static functions used in callbacks. + + // The code under SERF_HTTPS_FETCHING was contributed by Devin Anderson + // (surfacepatterns@gmail.com). 
+ // + // Note this must be ifdef'd because calling serf_bucket_ssl_decrypt_create + // requires ssl_buckets.c in the link. ssl_buckets.c requires openssl. +#if SERF_HTTPS_FETCHING + static apr_status_t SSLCertError(void *data, int failures, + const serf_ssl_certificate_t *cert) { + return static_cast(data)->HandleSSLCertErrors(failures, 0); + } + + static apr_status_t SSLCertChainError( + void *data, int failures, int error_depth, + const serf_ssl_certificate_t * const *certs, + apr_size_t certs_count) { + return static_cast(data)->HandleSSLCertErrors(failures, + error_depth); + } +#endif + + static apr_status_t ConnectionSetup( + apr_socket_t* socket, serf_bucket_t **read_bkt, serf_bucket_t **write_bkt, + void* setup_baton, apr_pool_t* pool) { + SerfFetch* fetch = static_cast(setup_baton); + *read_bkt = serf_bucket_socket_create(socket, fetch->bucket_alloc_); +#if SERF_HTTPS_FETCHING + if (fetch->using_https_) { + *read_bkt = serf_bucket_ssl_decrypt_create(*read_bkt, + fetch->ssl_context_, + fetch->bucket_alloc_); + if (fetch->ssl_context_ == NULL) { + fetch->ssl_context_ = serf_bucket_ssl_decrypt_context_get(*read_bkt); + } + + serf_ssl_server_cert_callback_set(fetch->ssl_context_, SSLCertError, + fetch); + + serf_ssl_server_cert_chain_callback_set(fetch->ssl_context_, + SSLCertError, SSLCertChainError, + fetch); + + serf_ssl_set_hostname(fetch->ssl_context_, fetch->url_.hostinfo); + *write_bkt = serf_bucket_ssl_encrypt_create(*write_bkt, + fetch->ssl_context_, + fetch->bucket_alloc_); + } +#endif + return APR_SUCCESS; + } + + static void ClosedConnection(serf_connection_t* conn, + void* closed_baton, + apr_status_t why, + apr_pool_t* pool) { + SerfFetch* fetch = static_cast(closed_baton); + if (why != APR_SUCCESS) { + fetch->message_handler_->Warning( + fetch->str_url_.c_str(), 0, "Connection close (code=%d %s).", + why, GetAprErrorString(why).c_str()); + } + // Connection is closed. 
+ fetch->connection_ = NULL; + } + + static serf_bucket_t* AcceptResponse(serf_request_t* request, + serf_bucket_t* stream, + void* acceptor_baton, + apr_pool_t* pool) { + // Get the per-request bucket allocator. + serf_bucket_alloc_t* bucket_alloc = serf_request_get_alloc(request); + // Create a barrier so the response doesn't eat us! + // From the comment in Serf: + // ### the stream does not have a barrier, this callback should generally + // ### add a barrier around the stream before incorporating it into a + // ### response bucket stack. + // ... i.e. the passed bucket becomes owned rather than + // ### borrowed. + serf_bucket_t* bucket = serf_bucket_barrier_create(stream, bucket_alloc); + return serf_bucket_response_create(bucket, bucket_alloc); + } + + static apr_status_t HandleResponse(serf_request_t* request, + serf_bucket_t* response, + void* handler_baton, + apr_pool_t* pool) { + SerfFetch* fetch = static_cast(handler_baton); + return fetch->HandleResponse(response); + } + + static bool MoreDataAvailable(apr_status_t status) { + // This OR is structured like this to make debugging easier, as it's + // not obvious when looking at the status mask which of these conditions + // is hit. + if (APR_STATUS_IS_EAGAIN(status)) { + return true; + } + return APR_STATUS_IS_EINTR(status); + } + + static bool IsStatusOk(apr_status_t status) { + return ((status == APR_SUCCESS) || + APR_STATUS_IS_EOF(status) || + MoreDataAvailable(status)); + } + +#if SERF_HTTPS_FETCHING + // Called to report SSL certificate errors that have been detected. + // The function returns SUCCESS in all cases, but sets ssl_error_message_ + // non-null for errors as a signal to ReadHeaders that we should not let + // any output through. + // + // Interpretation of two of the error conditions is configurable: + // 'allow_unknown_certificate_authority' and 'allow_self_signed'.
+ apr_status_t HandleSSLCertErrors(int errors, int failure_depth) { + // TODO(jmarantz): is there value in logging the errors and failure_depth + // formals here? + + // Note that HandleSSLCertErrors can be called multiple times for + // a single request. As far as I can tell, there is value in + // recording only one of these. For now, I have set up the logic + // so only the last error will be printed lazily, in ReadHeaders. + if (((errors & SERF_SSL_CERT_SELF_SIGNED) != 0) && + !fetcher_->allow_self_signed()) { + ssl_error_message_ = "SSL certificate is self-signed"; + } else if (((errors & SERF_SSL_CERT_UNKNOWNCA) != 0) && + !fetcher_->allow_unknown_certificate_authority()) { + ssl_error_message_ = + "SSL certificate has an unknown certificate authority"; + } else if (((errors & SERF_SSL_CERT_NOTYETVALID) != 0) && + !fetcher_->allow_certificate_not_yet_valid()) { + ssl_error_message_ = "SSL certificate is not yet valid"; + } else if (errors & SERF_SSL_CERT_EXPIRED) { + ssl_error_message_ = "SSL certificate is expired"; + } else if (errors & SERF_SSL_CERT_UNKNOWN_FAILURE) { + ssl_error_message_ = "SSL certificate has an unknown error"; + } + // Fall-through here implies success. + + // TODO(jmarantz): I think the design of this system indicates + // that we should be returning APR_EGENERAL on failure. However I + // have found that doesn't work properly, at least for + // SERF_SSL_CERT_SELF_SIGNED. The request does not terminate + // quickly but instead times out. Thus we return APR_SUCCESS + // but change the status_code to 404, report an error, and suppress + // the output. + // + // TODO(jmarantz): consider aiding diagnosability by changing the + // 404 to a 401 (Unauthorized) or 418 (I'm a teapot), or 459 (nginx + // internal cert error code). + + return APR_SUCCESS; + } +#endif + + // The handler MUST process data from the response bucket until the + // bucket's read function states it would block (APR_STATUS_IS_EAGAIN).
+ // The handler is invoked only when new data arrives. If no further data + // arrives, and the handler does not process all available data, then the + // system can result in a deadlock around the unprocessed, but read, data. + apr_status_t HandleResponse(serf_bucket_t* response) { + if (response == NULL) { + message_handler_->Message( + kInfo, "serf HandlerReponse called with NULL response for %s", + str_url()); + CallCallback(false); + return APR_EGENERAL; + } + + // The response-handling code must be robust to packets coming in all at + // once, one byte at a time, or anything in between. EAGAIN indicates + // that more data is available in the socket so another read should + // be issued before returning. + apr_status_t status = APR_EAGAIN; + while (MoreDataAvailable(status) && (async_fetch_ != NULL) && + !parser_.headers_complete()) { + if (!status_line_read_) { + status = ReadStatusLine(response); + } + + if (status_line_read_ && !one_byte_read_) { + status = ReadOneByteFromBody(response); + } + + if (one_byte_read_ && !parser_.headers_complete()) { + status = ReadHeaders(response); + } + } + + if (parser_.headers_complete()) { + status = ReadBody(response); + } + + if ((async_fetch_ != NULL) && + ((APR_STATUS_IS_EOF(status) && parser_.headers_complete()) || + (status == APR_EGENERAL))) { + bool success = (IsStatusOk(status) && parser_.headers_complete()); + if (!parser_.headers_complete() && (async_fetch_ != NULL)) { + // Be careful not to leave headers in inconsistent state in some error + // conditions. 
+ async_fetch_->response_headers()->Clear(); + } + CallCallback(success); + } + return status; + } + + apr_status_t ReadStatusLine(serf_bucket_t* response) { + serf_status_line status_line; + apr_status_t status = serf_bucket_response_status(response, &status_line); + ResponseHeaders* response_headers = async_fetch_->response_headers(); + if (status == APR_SUCCESS) { + response_headers->SetStatusAndReason( + static_cast(status_line.code)); + response_headers->set_major_version(status_line.version / 1000); + response_headers->set_minor_version(status_line.version % 1000); + status_line_read_ = true; + } + return status; + } + + // Know what's weird? You have do a body-read to get access to the + // headers. You need to read 1 byte of body to force an FSM inside + // Serf to parse the headers. Then you can parse the headers and + // finally read the rest of the body. I know, right? + // + // The simpler approach, and likely what the Serf designers intended, + // is that you read the entire body first, and then read the headers. + // But if you are trying to stream the data as its fetched through some + // kind of function that needs to know the content-type, then it's + // really a drag to have to wait till the end of the body to get the + // content type. + apr_status_t ReadOneByteFromBody(serf_bucket_t* response) { + apr_size_t len = 0; + const char* data = NULL; + apr_status_t status = serf_bucket_read(response, 1, &data, &len); + if (!APR_STATUS_IS_EINTR(status) && IsStatusOk(status)) { + one_byte_read_ = true; + if (len == 1) { + has_saved_byte_ = true; + saved_byte_ = data[0]; + } + } + return status; + } + + // Once that one byte is read from the body, we can go ahead and + // parse the headers. The dynamics of this appear that for N + // headers we'll get 2N calls to serf_bucket_read: one each for + // attribute names & values. 
+ apr_status_t ReadHeaders(serf_bucket_t* response) { + serf_bucket_t* headers = serf_bucket_response_get_headers(response); + const char* data = NULL; + apr_size_t len = 0; + apr_status_t status = serf_bucket_read(headers, SERF_READ_ALL_AVAIL, + &data, &len); + + // Feed valid chunks to the header parser --- but skip empty ones, + // which can occur for value-less headers, since otherwise they'd + // look like parse errors. + if (IsStatusOk(status) && (len > 0)) { + if (parser_.ParseChunk(StringPiece(data, len), message_handler_)) { + if (parser_.headers_complete()) { + ResponseHeaders* response_headers = async_fetch_->response_headers(); + if (ssl_error_message_ != NULL) { + response_headers->set_status_code(HttpStatus::kNotFound); + message_handler_->Message(kInfo, "%s: %s", str_url_.c_str(), + ssl_error_message_); + fetcher_->cert_errors_->Add(1); + has_saved_byte_ = false; + } + + if (fetcher_->track_original_content_length()) { + // Set X-Original-Content-Length, if Content-Length is available. + int64 content_length; + if (response_headers->FindContentLength(&content_length)) { + response_headers->SetOriginalContentLength(content_length); + } + } + // Stream the one byte read from ReadOneByteFromBody to writer. + if (has_saved_byte_) { + ++bytes_received_; + if (!async_fetch_->Write(StringPiece(&saved_byte_, 1), + message_handler_)) { + status = APR_EGENERAL; + } + } + } + } else { + status = APR_EGENERAL; + } + } + return status; + } + + // Once headers are complete we can get the body. The dynamics of this + // are likely dependent on everything on the network between the client + // and server, but for a 10k buffer I seem to frequently get 8k chunks. 
+ apr_status_t ReadBody(serf_bucket_t* response) { + apr_status_t status = APR_EAGAIN; + const char* data = NULL; + apr_size_t len = 0; + apr_size_t bytes_to_flush = 0; + while (MoreDataAvailable(status) && (async_fetch_ != NULL)) { + status = serf_bucket_read(response, SERF_READ_ALL_AVAIL, &data, &len); + bytes_received_ += len; + bytes_to_flush += len; + if (IsStatusOk(status) && (len != 0) && + !async_fetch_->Write(StringPiece(data, len), message_handler_)) { + status = APR_EGENERAL; + } + } + if ((bytes_to_flush != 0) && !async_fetch_->Flush(message_handler_)) { + status = APR_EGENERAL; + } + return status; + } + + // Ensures that a user-agent string is included, and that the mod_pagespeed + // version is appended. + void FixUserAgent() { + // Supply a default user-agent if none is present, and in any case + // append on a 'serf' suffix. + GoogleString user_agent; + ConstStringStarVector v; + RequestHeaders* request_headers = async_fetch_->request_headers(); + if (request_headers->Lookup(HttpAttributes::kUserAgent, &v)) { + for (int i = 0, n = v.size(); i < n; ++i) { + if (i != 0) { + user_agent += " "; + } + if (v[i] != NULL) { + user_agent += *(v[i]); + } + } + request_headers->RemoveAll(HttpAttributes::kUserAgent); + } + if (user_agent.empty()) { + user_agent += "Serf/" SERF_VERSION_STRING; + } + GoogleString version = StrCat( + " ", kModPagespeedSubrequestUserAgent, + "/" MOD_PAGESPEED_VERSION_STRING "-" LASTCHANGE_STRING); + if (!StringPiece(user_agent).ends_with(version)) { + user_agent += version; + } + request_headers->Add(HttpAttributes::kUserAgent, user_agent); + } + + static apr_status_t SetupRequest(serf_request_t* request, + void* setup_baton, + serf_bucket_t** req_bkt, + serf_response_acceptor_t* acceptor, + void** acceptor_baton, + serf_response_handler_t* handler, + void** handler_baton, + apr_pool_t* pool) { + SerfFetch* fetch = static_cast(setup_baton); + const char* url_path = apr_uri_unparse(pool, &fetch->url_, + APR_URI_UNP_OMITSITEPART); + 
+ // If there is an explicit Host header, then override the + // host field in the Serf structure, as we will not be able + // to override it after it is created; only append to it. + // + // Serf automatically populates the Host field based on the + // URL, and provides no mechanism to override it, except + // by hacking source. We hacked source. + // + // See src/third_party/serf/src/instaweb_context.c + ConstStringStarVector v; + const char* host = NULL; + RequestHeaders* request_headers = fetch->async_fetch_->request_headers(); + if (request_headers->Lookup(HttpAttributes::kHost, &v) && + (v.size() == 1) && (v[0] != NULL)) { + host = v[0]->c_str(); + } + + fetch->FixUserAgent(); + + *req_bkt = serf_request_bucket_request_create_for_host( + request, kFetchMethod, + url_path, NULL, + serf_request_get_alloc(request), host); + serf_bucket_t* hdrs_bkt = serf_bucket_request_get_headers(*req_bkt); + + // Add other headers from the caller's request. Skip the "Host:" header + // because it's set above. + for (int i = 0; i < request_headers->NumAttributes(); ++i) { + const GoogleString& name = request_headers->Name(i); + const GoogleString& value = request_headers->Value(i); + if (!(StringCaseEqual(name, HttpAttributes::kHost))) { + // Note: *_setn() stores a pointer to name and value instead of a + // copy of those values. So name and value must have long lifetimes. + // In this case, we depend on request_headers being unchanged for + // the lifetime of hdrs_bkt, which is a documented requirement of + // the UrlAsyncFetcher interface. + serf_bucket_headers_setn(hdrs_bkt, name.c_str(), value.c_str()); + } + } + + *acceptor = SerfFetch::AcceptResponse; + *acceptor_baton = fetch; + *handler = SerfFetch::HandleResponse; + *handler_baton = fetch; + return APR_SUCCESS; + } + + bool ParseUrl() { + apr_status_t status = 0; + status = apr_uri_parse(pool_, str_url_.c_str(), &url_); + if (status != APR_SUCCESS) { + return false; // Failed to parse URL. 
+ } + if (!fetcher_->allow_https() && StringCaseEqual(url_.scheme, "https")) { + return false; + } + if (!url_.port) { + url_.port = apr_uri_port_of_scheme(url_.scheme); + } + if (!url_.path) { + url_.path = apr_pstrdup(pool_, "/"); + } + return true; + } + + SerfUrlAsyncFetcher* fetcher_; + Timer* timer_; + const GoogleString str_url_; + AsyncFetch* async_fetch_; + ResponseHeadersParser parser_; + bool status_line_read_; + bool one_byte_read_; + bool has_saved_byte_; + char saved_byte_; + MessageHandler* message_handler_; + + apr_pool_t* pool_; + serf_bucket_alloc_t* bucket_alloc_; + apr_uri_t url_; + serf_connection_t* connection_; + size_t bytes_received_; + int64 fetch_start_ms_; + int64 fetch_end_ms_; + + // Variables used for HTTPS connection handling + bool using_https_; + serf_ssl_context_t* ssl_context_; + const char* ssl_error_message_; + + DISALLOW_COPY_AND_ASSIGN(SerfFetch); +}; + +class SerfThreadedFetcher : public SerfUrlAsyncFetcher { + public: + SerfThreadedFetcher(SerfUrlAsyncFetcher* parent, const char* proxy) : + SerfUrlAsyncFetcher(parent, proxy), + thread_id_(NULL), + initiate_mutex_(parent->thread_system()->NewMutex()), + initiate_fetches_(new SerfFetchPool()), + initiate_fetches_nonempty_(initiate_mutex_->NewCondvar()), + thread_finish_(false), + thread_started_(false) { + } + + ~SerfThreadedFetcher() { + // Let the thread terminate naturally by telling it to unblock, + // then waiting for it to finish its next active Poll operation. + { + // Indicate termination and unblock the worker thread so it can clean up. 
+ ScopedMutex lock(initiate_mutex_.get()); + if (thread_started_) { + thread_finish_ = true; + initiate_fetches_nonempty_->Signal(); + } else { + LOG(INFO) << "Serf threaded not actually started, quick shutdown."; + return; + } + } + + LOG(INFO) << "Waiting for threaded serf fetcher to terminate"; + apr_status_t ignored_retval; + apr_thread_join(&ignored_retval, thread_id_); + + // Under normal circumstances there shouldn't be any active fetches at + // this point. However, in practice we may have some lingering fetches that + // have timed out, and we need to clean those up properly before we can + // exit. We try to do this gracefully, but fall back to graceless cleanup + // if that fails. + + // Before we can clean up, we must make sure we haven't initiated any + // fetches that haven't moved to the active pool yet. This should not + // happen, but we're exercising undue caution here. We do this by just + // moving them across. From this point, calls to InitiateFetch(...) are + // illegal, but we should be invoking this destructor from the only thread + // that could have called InitiateFetch anyhow. + TransferFetchesAndCheckDone(false); + // Although Cancel will be called in the base class destructor, we + // want to call it here as well, as it will make it easier for the + // thread to terminate. + CancelActiveFetches(); + completed_fetches_.DeleteAll(); + initiate_fetches_->DeleteAll(); + } + + void StartThread() { + CHECK_EQ(APR_SUCCESS, + apr_thread_create(&thread_id_, NULL, SerfThreadFn, this, pool_)); + thread_started_ = true; + } + + // Called from mainline to queue up a fetch for the thread. If the + // thread is idle then we can unlock it. + void InitiateFetch(SerfFetch* fetch) { + ScopedMutex lock(initiate_mutex_.get()); + + // We delay thread startup until we actually want to fetch something + // to avoid problems with ITK. 
+ if (!thread_started_) { + StartThread(); + } + + // TODO(jmaessen): Consider adding an awaiting_nonempty_ flag to avoid + // spurious calls to Signal(). + bool signal = initiate_fetches_->empty(); + initiate_fetches_->Add(fetch); + if (signal) { + initiate_fetches_nonempty_->Signal(); + } + } + + void ShutDown() { + // See comments in the destructor above.. The big difference is that + // because we set shutdown_ to true new jobs can't actually come in. + { + // Acquisition order is initiate before hold, see e.g. AnyPendingFetches() + ScopedMutex hold_initiate(initiate_mutex_.get()); + ScopedMutex hold(mutex_); + set_shutdown(true); + if (!thread_started_) { + return; + } + } + TransferFetchesAndCheckDone(false); + CancelActiveFetches(); + } + + protected: + bool AnyPendingFetches() { + ScopedMutex lock(initiate_mutex_.get()); + // NOTE: We must hold both mutexes to avoid the case where we miss a fetch + // in transit. + return !initiate_fetches_->empty() || + SerfUrlAsyncFetcher::AnyPendingFetches(); + } + + private: + static void* SerfThreadFn(apr_thread_t* thread_id, void* context) { + SerfThreadedFetcher* stc = static_cast(context); + CHECK_EQ(thread_id, stc->thread_id_); + stc->SerfThread(); + return NULL; + } + + // Transfer fetches from initiate_fetches_ to active_fetches_. If there's no + // new fetches to initiate, check whether the Apache thread is trying to shut + // down the worker thread, and return true to indicate "done". Doesn't do any + // work if initiate_fetches_ is empty, but in that case if block_on_empty is + // true it will perform a bounded wait for initiate_fetches_nonempty_. Called + // by worker thread and during thread cleanup. + bool TransferFetchesAndCheckDone(bool block_on_empty) { + // Use a temp to minimize the amount of time we hold the + // initiate_mutex_ lock, so that the parent thread doesn't get + // blocked trying to initiate fetches. 
+ scoped_ptr xfer_fetches(NULL); + { + ScopedMutex lock(initiate_mutex_.get()); + // We must do this checking under the initiate_mutex_ lock. + if (initiate_fetches_->empty()) { + // No new work to do now. + if (!block_on_empty || thread_finish_) { + return thread_finish_; + } else { + // Wait until some work shows up. Note that after the wait we still + // must actually check that there's some work to be done. + initiate_fetches_nonempty_->TimedWait(Timer::kSecondMs); + if (initiate_fetches_->empty()) { + // On timeout / false wakeup, return control to caller; we might be + // finished or have other things to attend to. + return thread_finish_; + } + } + } + xfer_fetches.reset(new SerfFetchPool()); + + // Take mutex_ before relinquishing initiate_mutex_. This guarantees that + // AnyPendingFetches cannot see us in the time between emptying + // initiate_fetches_ and inserting into active_fetches_. At that time, it + // can look as though no fetch work is occurring. Note that we obtain + // mutex_ before performing the swap (but after creating the new pool) + // because additional fetches might arrive in the mean time. This was + // causing problems with timeout in TestThreeThreaded under valgrind, + // because we'd block waiting for mutex_ after a single fetch had been + // initiated, but not obtain mutex_ until after several more fetches + // arrived (at which point we'd go into the poll loop without initiating + // all available fetches). + mutex_->Lock(); + xfer_fetches.swap(initiate_fetches_); + } + + // Now that we've unblocked the parent thread, we can leisurely + // queue up the fetches, employing the proper lock for the active_fetches_ + // set. Actually we expect we wll never have contention on this mutex + // from the thread. 
+ while (!xfer_fetches->empty()) { + SerfFetch* fetch = xfer_fetches->RemoveOldest(); + if (StartFetch(fetch)) { + SERF_DEBUG(LOG(INFO) << "Adding threaded fetch to url " + << fetch->str_url() + << " (" << active_fetches_.size() << ")"); + } + } + mutex_->Unlock(); + return false; + } + + void SerfThread() { + // Make sure we don't get yet-another copy of signals used by Apache to + // shutdown here, to avoid double-free. + // TODO(morlovich): Port this to use ThreadSystem stuff, and have + // ApacheThreadSystem take care of this automatically. + apr_setup_signal_thread(); + + // Initially there's no active fetch work to be done. + int num_active_fetches = 0; + while (!TransferFetchesAndCheckDone(num_active_fetches == 0)) { + // If initiate_fetches is empty, and there's no current active fetch + // work to do, we'll block in the above call. Otherwise the call will + // start initiated fetches (if any) without blocking. + + // We set the poll interval to try to start new fetches promptly from the + // observer's perspective (ie .1s is perceptible, so we try to make sure + // new fetches are started after at most half that time). The downside is + // that we don't hand off control to serf / the OS for long periods when + // fetches are active but no data is arriving. We trust that doesn't + // happen often. + // TODO(jmaessen): Break out of Poll before timeout if work becomes + // available, so that we initiate new fetches as promptly as possible + // while continuing to serve the old ones. This would let us dial the + // poll interval up high (to multiple seconds). The classic trick here is + // to set up a pipe/FIFO/socket and add it to the set of things being + // read, then use a write to force wakeup. But will serf support this + // kind of thing? + const int64 kPollIntervalMs = Timer::kSecondMs / 20; + // If active_fetches_ is empty, we will not do any work and won't block + // here. 
num_active_fetches will be 0, and we'll block in the next + // call to TransferFetches above. + num_active_fetches = Poll(kPollIntervalMs); + SERF_DEBUG(LOG(INFO) << "Finished polling from serf thread (" + << this << ")"); + } + } + + apr_thread_t* thread_id_; + + // protects initiate_fetches_, initiate_fetches_nonempty_, thread_finish_ + // and thread_started_. + scoped_ptr initiate_mutex_; + // pushed in the main thread; popped by TransferFetches(). + scoped_ptr initiate_fetches_; + // condvar that indicates that initiate_fetches_ has become nonempty. During + // normal operation, only the serf worker thread consumes initiated fetches + // (this can change during thread shutdown), but the usual condition variable + // caveats apply: Just because the condition variable indicates + // initiate_fetches_nonempty_ doesn't mean it's true, and a waiting thread + // must check initiate_fetches_ explicitly while holding initiate_mutex_. + scoped_ptr initiate_fetches_nonempty_; + + // Flag to signal worker to finish working and terminate. + bool thread_finish_; + + // True if we actually started the worker thread. Protected by initiate_mutex_ + bool thread_started_; + + DISALLOW_COPY_AND_ASSIGN(SerfThreadedFetcher); +}; + +bool SerfFetch::Start(SerfUrlAsyncFetcher* fetcher) { + // Note: this is called in the thread's context, so this is when we do + // the pool ops. + fetcher_ = fetcher; + apr_pool_create(&pool_, fetcher_->pool()); + bucket_alloc_ = serf_bucket_allocator_create(pool_, NULL, NULL); + + fetch_start_ms_ = timer_->NowMs(); + // Parse and validate the URL. 
+ if (!ParseUrl()) { + return false; + } + + using_https_ = StringCaseEqual("https", url_.scheme); + DCHECK(fetcher->allow_https() || !using_https_); + + apr_status_t status = serf_connection_create2(&connection_, + fetcher_->serf_context(), + url_, + ConnectionSetup, this, + ClosedConnection, this, + pool_); + if (status != APR_SUCCESS) { + message_handler_->Error(str_url_.c_str(), 0, + "Error status=%d (%s) serf_connection_create2", + status, GetAprErrorString(status).c_str()); + return false; + } + serf_connection_request_create(connection_, SetupRequest, this); + + // Start the fetch. It will connect to the remote host, send the request, + // and accept the response, without blocking. + status = serf_context_run(fetcher_->serf_context(), 0, fetcher_->pool()); + + if (status == APR_SUCCESS || APR_STATUS_IS_TIMEUP(status)) { + return true; + } else { + message_handler_->Error(str_url_.c_str(), 0, + "serf_context_run error status=%d (%s)", + status, GetAprErrorString(status).c_str()); + return false; + } +} + + +// Set up the proxy for all the connections in the context. The proxy is in the +// format of hostname:port. +bool SerfUrlAsyncFetcher::SetupProxy(const char* proxy) { + apr_status_t status = 0; + if (proxy == NULL || *proxy == '\0') { + return true; // No proxy to be set. 
+ } + + apr_sockaddr_t* proxy_address = NULL; + apr_port_t proxy_port; + char* proxy_host; + char* proxy_scope; + status = apr_parse_addr_port(&proxy_host, &proxy_scope, &proxy_port, proxy, + pool_); + if (status != APR_SUCCESS || proxy_host == NULL || proxy_port == 0 || + (status = apr_sockaddr_info_get(&proxy_address, proxy_host, APR_UNSPEC, + proxy_port, 0, pool_)) != APR_SUCCESS) { + return false; + } + serf_config_proxy(serf_context_, proxy_address); + return true; +} + +SerfUrlAsyncFetcher::SerfUrlAsyncFetcher(const char* proxy, apr_pool_t* pool, + ThreadSystem* thread_system, + Statistics* statistics, Timer* timer, + int64 timeout_ms, + MessageHandler* message_handler) + : pool_(NULL), + thread_system_(thread_system), + timer_(timer), + mutex_(NULL), + serf_context_(NULL), + threaded_fetcher_(NULL), + active_count_(NULL), + request_count_(NULL), + byte_count_(NULL), + time_duration_ms_(NULL), + cancel_count_(NULL), + timeout_count_(NULL), + failure_count_(NULL), + cert_errors_(NULL), + timeout_ms_(timeout_ms), + force_threaded_(false), + shutdown_(false), + list_outstanding_urls_on_error_(false), + track_original_content_length_(false), + https_options_(0), + message_handler_(message_handler) { + CHECK(statistics != NULL); + request_count_ = + statistics->GetVariable(SerfStats::kSerfFetchRequestCount); + byte_count_ = statistics->GetVariable(SerfStats::kSerfFetchByteCount); + time_duration_ms_ = + statistics->GetVariable(SerfStats::kSerfFetchTimeDurationMs); + cancel_count_ = statistics->GetVariable(SerfStats::kSerfFetchCancelCount); + active_count_ = statistics->GetVariable(SerfStats::kSerfFetchActiveCount); + timeout_count_ = statistics->GetVariable(SerfStats::kSerfFetchTimeoutCount); + failure_count_ = statistics->GetVariable(SerfStats::kSerfFetchFailureCount); + cert_errors_ = statistics->GetVariable(SerfStats::kSerfFetchCertErrors); + Init(pool, proxy); + threaded_fetcher_ = new SerfThreadedFetcher(this, proxy); +} + 
+SerfUrlAsyncFetcher::SerfUrlAsyncFetcher(SerfUrlAsyncFetcher* parent, + const char* proxy) + : pool_(NULL), + thread_system_(parent->thread_system_), + timer_(parent->timer_), + mutex_(NULL), + serf_context_(NULL), + threaded_fetcher_(NULL), + active_count_(parent->active_count_), + request_count_(parent->request_count_), + byte_count_(parent->byte_count_), + time_duration_ms_(parent->time_duration_ms_), + cancel_count_(parent->cancel_count_), + timeout_count_(parent->timeout_count_), + failure_count_(parent->failure_count_), + cert_errors_(parent->cert_errors_), + timeout_ms_(parent->timeout_ms()), + force_threaded_(parent->force_threaded_), + shutdown_(false), + list_outstanding_urls_on_error_(parent->list_outstanding_urls_on_error_), + track_original_content_length_(parent->track_original_content_length_), + https_options_(parent->https_options_), + message_handler_(parent->message_handler_) { + Init(parent->pool(), proxy); +} + +SerfUrlAsyncFetcher::~SerfUrlAsyncFetcher() { + CancelActiveFetches(); + completed_fetches_.DeleteAll(); + int orphaned_fetches = active_fetches_.size(); + if (orphaned_fetches != 0) { + message_handler_->Message( + kError, "SerfFetcher destructed with %d orphaned fetches.", + orphaned_fetches); + if (active_count_ != NULL) { + active_count_->Add(-orphaned_fetches); + } + if (cancel_count_ != NULL) { + cancel_count_->Add(orphaned_fetches); + } + } + + active_fetches_.DeleteAll(); + if (threaded_fetcher_ != NULL) { + delete threaded_fetcher_; + } + delete mutex_; + apr_pool_destroy(pool_); // also calls apr_allocator_destroy on the allocator +} + +void SerfUrlAsyncFetcher::ShutDown() { + // Note that we choose not to delete the threaded_fetcher_ to avoid worrying + // about races on its deletion. 
+ if (threaded_fetcher_ != NULL) { + threaded_fetcher_->ShutDown(); + } + + ScopedMutex lock(mutex_); + shutdown_ = true; + CancelActiveFetchesMutexHeld(); +} + +void SerfUrlAsyncFetcher::Init(apr_pool_t* parent_pool, const char* proxy) { + // Here, we give each our Serf threads' (main and work) separate pools + // with separate threadsafe allocators. + pool_ = AprCreateThreadCompatiblePool(parent_pool); + mutex_ = thread_system_->NewMutex(); + serf_context_ = serf_context_create(pool_); + + if (!SetupProxy(proxy)) { + message_handler_->Message(kError, "Proxy failed: %s", proxy); + } +} + +void SerfUrlAsyncFetcher::CancelActiveFetches() { + ScopedMutex lock(mutex_); + CancelActiveFetchesMutexHeld(); +} + +void SerfUrlAsyncFetcher::CancelActiveFetchesMutexHeld() { + // If there are still active requests, cancel them. + int num_canceled = 0; + while (!active_fetches_.empty()) { + // Canceling a fetch requires that the fetch reside in active_fetches_, + // but can invalidate iterators pointing to the affected fetch. To avoid + // trouble, we simply ask for the oldest element, knowing it will go away. 
+ SerfFetch* fetch = active_fetches_.oldest(); + LOG(WARNING) << "Aborting fetch of " << fetch->str_url(); + fetch->Cancel(); + ++num_canceled; + } + + if (num_canceled != 0) { + if (cancel_count_ != NULL) { + cancel_count_->Add(num_canceled); + } + } +} + +bool SerfUrlAsyncFetcher::StartFetch(SerfFetch* fetch) { + bool started = !shutdown_ && fetch->Start(this); + if (started) { + active_fetches_.Add(fetch); + active_count_->Add(1); + } else { + LOG(WARNING) << "Fetch failed to start: " << fetch->str_url(); + fetch->CallbackDone(false); + delete fetch; + } + return started; +} + +void SerfUrlAsyncFetcher::Fetch(const GoogleString& url, + MessageHandler* message_handler, + AsyncFetch* async_fetch) { + async_fetch = EnableInflation(async_fetch, NULL /* blacklist */); + SerfFetch* fetch = new SerfFetch(url, async_fetch, message_handler, timer_); + + request_count_->Add(1); + if (force_threaded_ || async_fetch->EnableThreaded()) { + message_handler->Message(kInfo, "Initiating async fetch for %s", + url.c_str()); + threaded_fetcher_->InitiateFetch(fetch); + } else { + message_handler->Message(kInfo, "Initiating blocking fetch for %s", + url.c_str()); + { + ScopedMutex mutex(mutex_); + StartFetch(fetch); + } + } +} + +void SerfUrlAsyncFetcher::PrintActiveFetches( + MessageHandler* handler) const { + ScopedMutex mutex(mutex_); + for (SerfFetchPool::const_iterator p = active_fetches_.begin(), + e = active_fetches_.end(); p != e; ++p) { + SerfFetch* fetch = *p; + handler->Message(kInfo, "Active fetch: %s", + fetch->str_url()); + } +} + +// If active_fetches_ is empty, this does no work and returns 0. +int SerfUrlAsyncFetcher::Poll(int64 max_wait_ms) { + // Run serf polling up to microseconds. + ScopedMutex mutex(mutex_); + if (!active_fetches_.empty()) { + apr_status_t status = + serf_context_run(serf_context_, 1000*max_wait_ms, pool_); + completed_fetches_.DeleteAll(); + if (APR_STATUS_IS_TIMEUP(status)) { + // Remove expired fetches from the front of the queue. 
+ // This relies on the insertion-ordering guarantee + // provided by the Pool iterator. + int64 stale_cutoff = timer_->NowMs() - timeout_ms_; + int timeouts = 0; + // This loop calls Cancel, which deletes a fetch and thus invalidates + // iterators; we thus rely on retrieving oldest(). + while (!active_fetches_.empty()) { + SerfFetch* fetch = active_fetches_.oldest(); + if (fetch->fetch_start_ms() >= stale_cutoff) { + // This and subsequent fetches are still active, so we're done. + break; + } + message_handler_->Message( + kWarning, "Fetch timed out: %s (%ld) waiting for %ld ms", + fetch->str_url(), + static_cast(active_fetches_.size()), // NOLINT + static_cast(max_wait_ms)); // NOLINT + timeouts++; + // Note that canceling the fetch will ultimately call FetchComplete and + // delete it from the pool. + fetch->Cancel(); + } + if ((timeouts > 0) && (timeout_count_ != NULL)) { + timeout_count_->Add(timeouts); + } + } + bool success = ((status == APR_SUCCESS) || APR_STATUS_IS_TIMEUP(status)); + // TODO(jmarantz): provide the success status to the caller if there is a + // need. + if (!success && !active_fetches_.empty()) { + // TODO(jmarantz): I have a new theory that we are getting + // behind when our self-directed URL fetches queue up multiple + // requests for the same URL, which might be sending the Serf + // library into an n^2 situation with its polling, even though + // we are using an rb_tree to hold the active fetches. We + // should fix this by keeping a map from url->SerfFetch, where + // we'd have to store lists of Callback*, ResponseHeader*, Writer* so + // all interested parties were updated if and when the fetch finally + // completed. + // NOTE(jmaessen): this is actually hard because all the above data is + // process-local, and the multiple requests are likely cross-process. + // + // In the meantime by putting more detail into the log here, we'll + // know whether we are accumulating active fetches to make the + // server fall over. 
+ message_handler_->Message( + kError, + "Serf status %d(%s) polling for %ld %s fetches for %g seconds", + status, GetAprErrorString(status).c_str(), + static_cast(active_fetches_.size()), // NOLINT + (threaded_fetcher_ == NULL) ? "threaded" : "non-blocking", + max_wait_ms/1.0e3); + if (list_outstanding_urls_on_error_) { + int64 now_ms = timer_->NowMs(); + for (Pool::iterator p = active_fetches_.begin(), + e = active_fetches_.end(); p != e; ++p) { + SerfFetch* fetch = *p; + int64 age_ms = now_ms - fetch->fetch_start_ms(); + message_handler_->Message(kError, "URL %s active for %ld ms", + fetch->str_url(), + static_cast(age_ms)); // NOLINT + } + } + CleanupFetchesWithErrors(); + } + } + return active_fetches_.size(); +} + +void SerfUrlAsyncFetcher::FetchComplete(SerfFetch* fetch) { + // We do not have a ScopedMutex in FetchComplete, because it is only + // called from Poll and CancelActiveFetches, which have ScopedMutexes. + // Note that SerfFetch::Cancel is currently not exposed from outside this + // class. 
+ active_fetches_.Remove(fetch); + completed_fetches_.Add(fetch); + fetch->message_handler()->Message(kInfo, "Fetch complete: %s", + fetch->str_url()); + if (time_duration_ms_) { + time_duration_ms_->Add(fetch->TimeDuration()); + } + if (byte_count_) { + byte_count_->Add(fetch->bytes_received()); + } + if (active_count_) { + active_count_->Add(-1); + } +} + +bool SerfUrlAsyncFetcher::AnyPendingFetches() { + ScopedMutex lock(mutex_); + return !active_fetches_.empty(); +} + +int SerfUrlAsyncFetcher:: ApproximateNumActiveFetches() { + ScopedMutex lock(mutex_); + return active_fetches_.size(); +} + +bool SerfUrlAsyncFetcher::WaitForActiveFetches( + int64 max_ms, MessageHandler* message_handler, WaitChoice wait_choice) { + bool ret = true; + if ((threaded_fetcher_ != NULL) && (wait_choice != kMainlineOnly)) { + ret &= threaded_fetcher_->WaitForActiveFetchesHelper( + max_ms, message_handler); + } + if (wait_choice != kThreadedOnly) { + ret &= WaitForActiveFetchesHelper(max_ms, message_handler); + } + return ret; +} + +bool SerfUrlAsyncFetcher::WaitForActiveFetchesHelper( + int64 max_ms, MessageHandler* message_handler) { + bool any_pending_fetches = AnyPendingFetches(); + if (any_pending_fetches) { + int64 now_ms = timer_->NowMs(); + int64 end_ms = now_ms + max_ms; + while ((now_ms < end_ms) && any_pending_fetches) { + int64 remaining_ms = end_ms - now_ms; + SERF_DEBUG(LOG(INFO) << "Blocking process waiting " << remaining_ms + << "ms for " << ApproximateNumActiveFetches() + << " fetches to complete"); + SERF_DEBUG(PrintActiveFetches(message_handler)); + Poll(remaining_ms); + now_ms = timer_->NowMs(); + any_pending_fetches = AnyPendingFetches(); + } + if (any_pending_fetches) { + message_handler->Message( + kError, "Serf timeout waiting for fetches to complete:"); + PrintActiveFetches(message_handler); + return false; + } + SERF_DEBUG(LOG(INFO) << "Serf successfully completed " + << ApproximateNumActiveFetches() << " active fetches"); + } + return true; +} + +void 
SerfUrlAsyncFetcher::CleanupFetchesWithErrors() { + // Create a copy of list of active fetches, as we may have to cancel + // some failed ones, modifying the list. + std::vector fetches; + for (SerfFetchPool::iterator i = active_fetches_.begin(); + i != active_fetches_.end(); ++i) { + fetches.push_back(*i); + } + + // Check each fetch to see if it needs cleanup because its serf connection + // got into an error state. + for (int i = 0, size = fetches.size(); i < size; ++i) { + fetches[i]->CleanupIfError(); + } +} + +void SerfUrlAsyncFetcher::InitStats(Statistics* statistics) { + statistics->AddVariable(SerfStats::kSerfFetchRequestCount); + statistics->AddVariable(SerfStats::kSerfFetchByteCount); + statistics->AddVariable(SerfStats::kSerfFetchTimeDurationMs); + statistics->AddVariable(SerfStats::kSerfFetchCancelCount); + statistics->AddVariable(SerfStats::kSerfFetchActiveCount); + statistics->AddVariable(SerfStats::kSerfFetchTimeoutCount); + statistics->AddVariable(SerfStats::kSerfFetchFailureCount); + statistics->AddVariable(SerfStats::kSerfFetchCertErrors); +} + +void SerfUrlAsyncFetcher::set_list_outstanding_urls_on_error(bool x) { + list_outstanding_urls_on_error_ = x; + if (threaded_fetcher_ != NULL) { + threaded_fetcher_->set_list_outstanding_urls_on_error(x); + } +} + +void SerfUrlAsyncFetcher::set_track_original_content_length(bool x) { + track_original_content_length_ = x; + if (threaded_fetcher_ != NULL) { + threaded_fetcher_->set_track_original_content_length(x); + } +} + +bool SerfUrlAsyncFetcher::ParseHttpsOptions(StringPiece directive, + uint32* options, + GoogleString* error_message) { + StringPieceVector keywords; + SplitStringPieceToVector(directive, ",", &keywords, true); + uint32 https_options = 0; + for (int i = 0, n = keywords.size(); i < n; ++i) { + StringPiece keyword = keywords[i]; + if (keyword == "enable") { + https_options |= kEnableHttps; + } else if (keyword == "disable") { + https_options &= ~static_cast(kEnableHttps); + } else if 
(keyword == "allow_self_signed") { + https_options |= kAllowSelfSigned; + } else if (keyword == "allow_unknown_certificate_authority") { + https_options |= kAllowUnknownCertificateAuthority; + } else if (keyword == "allow_certificate_not_yet_valid") { + https_options |= kAllowCertificateNotYetValid; + } else { + StrAppend(error_message, + "Invalid HTTPS keyword: ", keyword, ", legal options are: " + SERF_HTTPS_KEYWORDS); + return false; + } + } + *options = https_options; + return true; +} + +bool SerfUrlAsyncFetcher::SetHttpsOptions(StringPiece directive) { + GoogleString error_message; + if (!ParseHttpsOptions(directive, &https_options_, &error_message)) { + message_handler_->Message(kError, "%s", error_message.c_str()); + return false; + } + +#if !SERF_HTTPS_FETCHING + if (allow_https()) { + message_handler_->Message(kError, "HTTPS fetching has not been compiled " + "into the binary, so it has not been enabled."); + https_options_ = 0; + } +#endif + if (threaded_fetcher_ != NULL) { + threaded_fetcher_->set_https_options(https_options_); + } + return true; +} + +bool SerfUrlAsyncFetcher::allow_https() const { + return ((https_options_ & kEnableHttps) != 0); +} + +bool SerfUrlAsyncFetcher::allow_self_signed() const { + return ((https_options_ & kAllowSelfSigned) != 0); +} + +bool SerfUrlAsyncFetcher::allow_unknown_certificate_authority() const { + return ((https_options_ & kAllowUnknownCertificateAuthority) != 0); +} + +bool SerfUrlAsyncFetcher::allow_certificate_not_yet_valid() const { + return ((https_options_ & kAllowCertificateNotYetValid) != 0); +} + +bool SerfUrlAsyncFetcher::SupportsHttps() const { + return allow_https(); +} + +} // namespace net_instaweb diff --git a/psol/include/net/instaweb/apache/serf_url_async_fetcher.h b/psol/include/net/instaweb/apache/serf_url_async_fetcher.h new file mode 100644 index 000000000..c1dd925e4 --- /dev/null +++ b/psol/include/net/instaweb/apache/serf_url_async_fetcher.h @@ -0,0 +1,251 @@ +// Copyright 2010 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jmarantz@google.com (Joshua Marantz) +// lsong@google.com (Libo Song) + +#ifndef NET_INSTAWEB_APACHE_SERF_URL_ASYNC_FETCHER_H_ +#define NET_INSTAWEB_APACHE_SERF_URL_ASYNC_FETCHER_H_ + +#include + +#include "net/instaweb/http/public/url_pollable_async_fetcher.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/pool.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" +#include "net/instaweb/util/public/thread_system.h" + +// To enable HTTPS fetching with serf, we must link against OpenSSL, +// which is a large library with licensing restrictions not known to +// be wholly in line with the Apache license. To enable HTTPS fetching: +// 1. Set SERF_HTTPS_FETCHING to 1 here +// 2. Uncomment the references to openssl.gyp and ssl_buckets.c in +// src/third_party/serf/serf.gyp. +// 3. Uncomment both references to openssl in src/DEPS.
+// +// If this is enabled, then the HTTPS fetching can be tested with +// install/apache_https_fetch_test.sh +#ifndef SERF_HTTPS_FETCHING +#define SERF_HTTPS_FETCHING 0 +#endif + +struct apr_pool_t; +struct serf_context_t; + +namespace net_instaweb { + +class AsyncFetch; +class MessageHandler; +class Statistics; +class SerfFetch; +class SerfThreadedFetcher; +class Timer; +class Variable; + +struct SerfStats { + static const char kSerfFetchRequestCount[]; + static const char kSerfFetchByteCount[]; + static const char kSerfFetchTimeDurationMs[]; + static const char kSerfFetchCancelCount[]; + static const char kSerfFetchActiveCount[]; + static const char kSerfFetchTimeoutCount[]; + static const char kSerfFetchFailureCount[]; + static const char kSerfFetchCertErrors[]; +}; + +// Identifies the set of HTTPS keywords. This is used in error messages emitted +// both from the config parser in this module, and in the directives table in +// mod_instaweb.cc which must be statically constructed using a compile-time +// concatenation. Hence this must be a literal string and not a const char*. +#define SERF_HTTPS_KEYWORDS \ + "enable,disable,allow_self_signed," \ + "allow_unknown_certificate_authority,allow_certificate_not_yet_valid" + +// TODO(sligocki): Serf does not seem to act appropriately in IPv6 +// environments, fix and test this. +// Specifically: +// (1) It does not attempt to fall-back to IPv4 if IPv6 connection fails; +// (2) It may not correctly signal failure, which causes the incoming +// connection to hang.
+class SerfUrlAsyncFetcher : public UrlPollableAsyncFetcher { + public: + enum WaitChoice { + kThreadedOnly, + kMainlineOnly, + kThreadedAndMainline + }; + + SerfUrlAsyncFetcher(const char* proxy, apr_pool_t* pool, + ThreadSystem* thread_system, + Statistics* statistics, Timer* timer, int64 timeout_ms, + MessageHandler* handler); + SerfUrlAsyncFetcher(SerfUrlAsyncFetcher* parent, const char* proxy); + virtual ~SerfUrlAsyncFetcher(); + + static void InitStats(Statistics* statistics); + + // Stops all active fetches and prevents further fetches from starting + // (they will instead quickly call back to ->Done(false)). + virtual void ShutDown(); + + virtual bool SupportsHttps() const; + + virtual void Fetch(const GoogleString& url, + MessageHandler* message_handler, + AsyncFetch* callback); + + virtual int Poll(int64 max_wait_ms); + + bool WaitForActiveFetches(int64 max_milliseconds, + MessageHandler* message_handler, + WaitChoice wait_choice); + + // Remove the completed fetch from the active fetch set, and put it into a + // completed fetch list to be cleaned up. + void FetchComplete(SerfFetch* fetch); + apr_pool_t* pool() const { return pool_; } + serf_context_t* serf_context() const { return serf_context_; } + + void PrintActiveFetches(MessageHandler* handler) const; + virtual int64 timeout_ms() { return timeout_ms_; } + ThreadSystem* thread_system() { return thread_system_; } + + // By default, the Serf fetcher will call + // UrlAsyncFetcher::Callback::EnableThreaded() to determine whether + // a particular URL fetch should be executed in the fetcher thread. + // + // Setting this variable causes the fetches to be threaded independent + // of the value of UrlAsyncFetcher::Callback::EnableThreaded(). + void set_force_threaded(bool x) { force_threaded_ = x; } + + // Indicates that Serf should enumerate failing URLs whenever the underlying + // Serf library reports an error.
+ void set_list_outstanding_urls_on_error(bool x); + + // Indicates that Serf should track the original content length for + // fetched resources. + bool track_original_content_length() const { + return track_original_content_length_; + } + void set_track_original_content_length(bool x); + + // Indicates that direct HTTPS fetching should be allowed, and how picky + // to be about certificates. The directive is a comma-separated list of + // these keywords: + // enable + // disable + // allow_self_signed + // allow_unknown_certificate_authority + // allow_certificate_not_yet_valid + // Returns 'false' if the directive does not parse properly. + bool SetHttpsOptions(StringPiece directive); + + // Validates the correctness of an https directive. Exposed as a static + // method for early exit on mis-specified pagespeed.conf. + static bool ValidateHttpsOptions(StringPiece directive, + GoogleString* error_message) { + uint32 options; + return ParseHttpsOptions(directive, &options, error_message); + } + + protected: + typedef Pool SerfFetchPool; + + // Determines whether https is allowed in the current configuration. + inline bool allow_https() const; + inline bool allow_self_signed() const; + inline bool allow_unknown_certificate_authority() const; + inline bool allow_certificate_not_yet_valid() const; + + void set_https_options(uint32 https_options) { + https_options_ = https_options; + } + + void Init(apr_pool_t* parent_pool, const char* proxy); + bool SetupProxy(const char* proxy); + + // Start a SerfFetch. Takes ownership of fetch and makes sure callback is + // called even if fetch fails to start. + // + // mutex_ must be held before calling StartFetch. + bool StartFetch(SerfFetch* fetch); + + // AnyPendingFetches is accurate only at the time of call; this is + // used conservatively during shutdown. It counts fetches that have been + // requested by some thread, and can include fetches for which no action + // has yet been taken (i.e. fetches that are not active).
+ virtual bool AnyPendingFetches(); + // ApproximateNumActiveFetches can under- or over-count and is used only for + // error reporting. + int ApproximateNumActiveFetches(); + + void CancelActiveFetches(); + void CancelActiveFetchesMutexHeld(); + bool WaitForActiveFetchesHelper(int64 max_ms, + MessageHandler* message_handler); + + // This cleans up the serf resources for fetches that errored out. + // Must be called only immediately after running the serf event loop. + // Must be called with mutex_ held. + void CleanupFetchesWithErrors(); + + // These must be accessed with mutex_ held. + bool shutdown() const { return shutdown_; } + void set_shutdown(bool s) { shutdown_ = s; } + + apr_pool_t* pool_; + ThreadSystem* thread_system_; + Timer* timer_; + + // mutex_ protects serf_context_ and active_fetches_. + ThreadSystem::CondvarCapableMutex* mutex_; + serf_context_t* serf_context_; + SerfFetchPool active_fetches_; + + typedef std::vector FetchVector; + SerfFetchPool completed_fetches_; + SerfThreadedFetcher* threaded_fetcher_; + + // This is protected because it's updated along with active_fetches_, + // which happens in subclass SerfThreadedFetcher as well as this class. + Variable* active_count_; + + private: + friend class SerfFetch; // To access stats variables below. + + static bool ParseHttpsOptions(StringPiece directive, uint32* options, + GoogleString* error_message); + + Variable* request_count_; + Variable* byte_count_; + Variable* time_duration_ms_; + Variable* cancel_count_; + Variable* timeout_count_; + Variable* failure_count_; + Variable* cert_errors_; + const int64 timeout_ms_; + bool force_threaded_; + bool shutdown_; + bool list_outstanding_urls_on_error_; + bool track_original_content_length_; + uint32 https_options_; // Composed of HttpsOptions ORed together. 
+ MessageHandler* message_handler_; + + DISALLOW_COPY_AND_ASSIGN(SerfUrlAsyncFetcher); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_APACHE_SERF_URL_ASYNC_FETCHER_H_ diff --git a/psol/include/net/instaweb/automatic/public/blink_flow_critical_line.h b/psol/include/net/instaweb/automatic/public/blink_flow_critical_line.h new file mode 100644 index 000000000..fc1e63d74 --- /dev/null +++ b/psol/include/net/instaweb/automatic/public/blink_flow_critical_line.h @@ -0,0 +1,175 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: pulkitg@google.com (Pulkit Goyal) + +#ifndef NET_INSTAWEB_AUTOMATIC_PUBLIC_BLINK_FLOW_CRITICAL_LINE_H_ +#define NET_INSTAWEB_AUTOMATIC_PUBLIC_BLINK_FLOW_CRITICAL_LINE_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/google_url.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class AsyncFetch; +class BlinkCriticalLineData; +class BlinkCriticalLineDataFinder; +class LogRecord; +class PropertyPage; +class ProxyFetchPropertyCallbackCollector; +class ProxyFetchFactory; +class ServerContext; +class RewriteOptions; +class Statistics; +class TimedVariable; + +// This class manages the blink flow for looking up BlinkCriticalLineData in +// cache, modifying the options for passthru and triggering asynchronous +// lookups to compute the critical line and insert it into cache. +class BlinkFlowCriticalLine { + public: + // These strings identify sync-points for reproducing races between foreground + // serving request and background blink computation requests in tests. 
+ static const char kBackgroundComputationDone[]; + static const char kUpdateResponseCodeDone[]; + + static void Start(const GoogleString& url, + AsyncFetch* base_fetch, + RewriteOptions* options, + ProxyFetchFactory* factory, + ServerContext* manager, + ProxyFetchPropertyCallbackCollector* property_callback); + + virtual ~BlinkFlowCriticalLine(); + + static void InitStats(Statistics* statistics); + + static const char kNumBlinkHtmlCacheHits[]; + static const char kNumBlinkHtmlCacheMisses[]; + static const char kNumBlinkSharedFetchesStarted[]; + static const char kNumBlinkSharedFetchesCompleted[]; + static const char kNumComputeBlinkCriticalLineDataCalls[]; + static const char kNumBlinkHtmlMatches[]; + static const char kNumBlinkHtmlMismatches[]; + static const char kNumBlinkHtmlMismatchesCacheDeletes[]; + static const char kNumBlinkHtmlSmartdiffMatches[]; + static const char kNumBlinkHtmlSmartdiffMismatches[]; + + private: + BlinkFlowCriticalLine(const GoogleString& url, + AsyncFetch* base_fetch, + RewriteOptions* options, + ProxyFetchFactory* factory, + ServerContext* manager, + ProxyFetchPropertyCallbackCollector* property_callback); + + // Sets request start time. + void SetStartRequestTimings(); + + // Sets the server side response start time. + void SetResponseStartTime(); + + // Function called by the callback collector whenever property cache lookup + // is done. Based on the result, it will call either + // BlinkCriticalLineDataHit() or BlinkCriticalLineDataMiss(). + void BlinkCriticalLineDataLookupDone( + ProxyFetchPropertyCallbackCollector* collector); + + // Serves the critical html content to the client and triggers the proxy fetch + // for non cacheable content. + void BlinkCriticalLineDataHit(); + + // Serves the request in passthru mode and triggers a background request to + // compute BlinkCriticalLineData. + void BlinkCriticalLineDataMiss(); + + // Creates a rewrite driver and triggers proxy fetch. 
+ // critical_line_data_found indicates whether it is a cache hit case, while + // serve_non_critical means that the non-critical content still needs to be + // served (i.e., it has not been served yet). + void TriggerProxyFetch(bool critical_line_data_found, + bool serve_non_critical); + + void WriteResponseStartAndLookUpTimings(); + + // Serves all the panel contents including critical html, critical images json + // and non critical json. This is the case when there are no cacheable panels + // in the page. + void ServeAllPanelContents(); + + // Serves critical panel contents including critical html and + // critical images json. This is the case when there are cacheable panels + // in the page. + void ServeCriticalPanelContents(); + + // Sends critical html to the client. + void SendCriticalHtml(const GoogleString& critical_json_str); + + // Sends inline images json to the client. + void SendInlineImagesJson(const GoogleString& pushed_images_str); + + // Sends non critical json to the client. + void SendNonCriticalJson(GoogleString* non_critical_json_str); + + // Sends the lazyload filter javascript code. + void SendLazyloadImagesJs(); + + void WriteString(const StringPiece& str); + + int64 GetTimeElapsedFromStartRequest(); + + GoogleString GetAddTimingScriptString(const GoogleString& timing_str, + int64 time_ms); + + void Flush(); + + // Modify the rewrite options to be used in the background and user-facing + // request when BlinkCriticalLineData is found in the cache. + void SetFilterOptions(RewriteOptions* options) const; + + // Returns true if property cache has last response code as non 200. + bool IsLastResponseCodeInvalid(PropertyPage* page); + + // Convenience method to access the log record from base_fetch_'s request + // context.
+ LogRecord* log_record(); + + GoogleString url_; + GoogleUrl google_url_; + GoogleString critical_html_; + AsyncFetch* base_fetch_; + RewriteOptions* options_; + ProxyFetchFactory* factory_; + ServerContext* manager_; + ProxyFetchPropertyCallbackCollector* property_callback_; + scoped_ptr blink_critical_line_data_; + BlinkCriticalLineDataFinder* finder_; + int64 request_start_time_ms_; + int64 time_to_start_blink_flow_critical_line_ms_; + int64 time_to_critical_line_data_look_up_done_ms_; + + TimedVariable* num_blink_html_cache_hits_; + TimedVariable* num_blink_shared_fetches_started_; + + DISALLOW_COPY_AND_ASSIGN(BlinkFlowCriticalLine); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_AUTOMATIC_PUBLIC_BLINK_FLOW_CRITICAL_LINE_H_ diff --git a/psol/include/net/instaweb/automatic/public/flush_early_flow.h b/psol/include/net/instaweb/automatic/public/flush_early_flow.h new file mode 100644 index 000000000..502e360db --- /dev/null +++ b/psol/include/net/instaweb/automatic/public/flush_early_flow.h @@ -0,0 +1,132 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: mmohabey@google.com (Megha Mohabey) + +#ifndef NET_INSTAWEB_AUTOMATIC_PUBLIC_FLUSH_EARLY_FLOW_H_ +#define NET_INSTAWEB_AUTOMATIC_PUBLIC_FLUSH_EARLY_FLOW_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" +#include "net/instaweb/util/public/string_writer.h" + +namespace net_instaweb { + +class AsyncFetch; +class FlushEarlyInfo; +class Histogram; +class MessageHandler; +class ProxyFetchPropertyCallbackCollector; +class ProxyFetchFactory; +class ServerContext; +class RewriteDriver; +class Statistics; +class TimedVariable; + +// FlushEarlyFlow manages the flow for the rewriters which flush a response to +// the client before receiving a response from the origin server. If a request +// can be responded to early, then FlushEarlyFlow is initiated. It also has +// helper functions to update the property cache with the response headers which +// are used when a request is responded to early. +class FlushEarlyFlow { + public: + static const char kNumRequestsFlushedEarly[]; + static const char kNumResourcesFlushedEarly[]; + static const char kFlushEarlyRewriteLatencyMs[]; + static const char kNumFlushEarlyHttpStatusCodeDeemedUnstable[]; + + static void Start( + const GoogleString& url, + AsyncFetch** base_fetch, + RewriteDriver* driver, + ProxyFetchFactory* factory, + ProxyFetchPropertyCallbackCollector* property_callback); + + static void InitStats(Statistics* stats); + + virtual ~FlushEarlyFlow(); + + private: + class FlushEarlyAsyncFetch; + // Flushes some response for this request before receiving the fetch response + // from the origin server. + void FlushEarly(); + + // Cancels the flush early flow. 
+ void Cancel(); + + FlushEarlyFlow(const GoogleString& url, + AsyncFetch* base_fetch, + FlushEarlyAsyncFetch* flush_early_fetch, + RewriteDriver* driver, + ProxyFetchFactory* factory, + ProxyFetchPropertyCallbackCollector* property_cache_callback); + + // Generates a dummy head with subresources and counts the number of resources + // which can be flushed early. + void GenerateDummyHeadAndCountResources( + const FlushEarlyInfo& flush_early_info); + + // Generates response headers from previous values stored in property cache. + void GenerateResponseHeaders(const FlushEarlyInfo& flush_early_info); + + GoogleString GetHeadString(const FlushEarlyInfo& flush_early_info, + const char* css_format, + const char* js_format); + + // Callback that is invoked after we rewrite the early head. + // start_time_ms indicates the time we started rewriting the flush early + // head. This is set to -1 if is_experimental_hit is false. + void FlushEarlyRewriteDone(int64 start_time_ms, + RewriteDriver* flush_early_driver); + + void Write(const StringPiece& val); + + // Writes the script content to base_fetch. + void WriteScript(const GoogleString& script_content); + + // Write the external script to base fetch.
+ void WriteExternalScript(const GoogleString& script_url); + + GoogleString url_; + GoogleString dummy_head_; + StringWriter dummy_head_writer_; + int num_resources_flushed_; + int num_rewritten_resources_; + int64 average_fetch_time_; + + AsyncFetch* base_fetch_; + FlushEarlyAsyncFetch* flush_early_fetch_; + RewriteDriver* driver_; + ProxyFetchFactory* factory_; + ServerContext* manager_; + ProxyFetchPropertyCallbackCollector* property_cache_callback_; + bool should_flush_early_lazyload_script_; + bool should_flush_early_js_defer_script_; + MessageHandler* handler_; + + TimedVariable* num_requests_flushed_early_; + TimedVariable* num_resources_flushed_early_; + TimedVariable* num_flush_early_http_status_code_deemed_unstable_; + Histogram* flush_early_rewrite_latency_ms_; + + DISALLOW_COPY_AND_ASSIGN(FlushEarlyFlow); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_AUTOMATIC_PUBLIC_FLUSH_EARLY_FLOW_H_ diff --git a/psol/include/net/instaweb/automatic/public/html_detector.h b/psol/include/net/instaweb/automatic/public/html_detector.h new file mode 100644 index 000000000..4eb20edc6 --- /dev/null +++ b/psol/include/net/instaweb/automatic/public/html_detector.h @@ -0,0 +1,100 @@ +/* + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: morlovich@google.com (Maksim Orlovich), +// sligocki@google.com (Shawn Ligocki) +// +// This contains HtmlDetector, which tries to heuristically detect whether +// content a server claims to be HTML actually is HTML (it sometimes isn't). + +#ifndef NET_INSTAWEB_AUTOMATIC_PUBLIC_HTML_DETECTOR_H_ +#define NET_INSTAWEB_AUTOMATIC_PUBLIC_HTML_DETECTOR_H_ + +#include "base/logging.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +// This class tries to heuristically detect whether something that claims to +// be HTML is likely to be. For now, it merely looks at whether the first +// non-whitespace/non-BOM character is <. +// +// Typical usage: +// HtmlDetector detect_html_; +// +// if (!detect_html_.already_decided() && +// detect_html_.ConsiderInput(data)) { +// GoogleString buffered; +// detect_html_.ReleaseBuffered(&buffered); +// if (detect_html_.probable_html()) { +// do html-specific bits with buffered +// } else { +// do non-html things with buffered +// } +// } +// +// if (detect_html_.already_decided()) { +// do appropriate things with data based on detect_html_.probable_html() +// } +class HtmlDetector { + public: + HtmlDetector(); + ~HtmlDetector(); + + // Processes the data, trying to determine if it's HTML or not. If there is + // enough evidence to make a decision, returns true. + // + // If true is returned, already_decided() will be true as well, and hence + // probable_html() will be accessible. buffer_ will not be changed. + // + // If false is returned, data will be accumulated inside buffer_. + // + // Precondition: !already_decided() + bool ConsiderInput(const StringPiece& data); + + // Returns true if we have seen enough input to make a guess as to whether + // it's HTML or not.
+ bool already_decided() const { return already_decided_; } + + // Precondition: already_decided() true (or ConsiderInput returning true). + bool probable_html() const { + DCHECK(already_decided_); + return probable_html_; + } + + // Transfers any data that was buffered by ConsiderInput calls that returned + // false into *out_buffer. The old value of out_buffer is overwritten, and + // HtmlDetector's internal buffers are cleared. + void ReleaseBuffered(GoogleString* out_buffer); + + // Forces already_decided() to true, and probable_html() to match is_html. + // + // Precondition: !already_decided() + void ForceDecision(bool is_html); + + private: + GoogleString buffer_; + bool already_decided_; + bool probable_html_; // valid only if already_decided_. + + DISALLOW_COPY_AND_ASSIGN(HtmlDetector); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_AUTOMATIC_PUBLIC_HTML_DETECTOR_H_ diff --git a/psol/include/net/instaweb/automatic/public/proxy_fetch.h b/psol/include/net/instaweb/automatic/public/proxy_fetch.h new file mode 100644 index 000000000..ab20f7113 --- /dev/null +++ b/psol/include/net/instaweb/automatic/public/proxy_fetch.h @@ -0,0 +1,451 @@ +/* + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: sligocki@google.com (Shawn Ligocki) +// +// NOTE: This interface is actively under development and may be +// changed extensively. 
Contact us at mod-pagespeed-discuss@googlegroups.com +// if you are interested in using it. + +#ifndef NET_INSTAWEB_AUTOMATIC_PUBLIC_PROXY_FETCH_H_ +#define NET_INSTAWEB_AUTOMATIC_PUBLIC_PROXY_FETCH_H_ + +#include +#include +#include + +#include "net/instaweb/automatic/public/html_detector.h" +#include "net/instaweb/http/public/async_fetch.h" +#include "net/instaweb/http/public/meta_data.h" +#include "net/instaweb/util/public/queued_worker_pool.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/property_cache.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class AbstractClientState; +class AbstractMutex; +class CacheUrlAsyncFetcher; +class Function; +class MessageHandler; +class ProxyFetch; +class ProxyFetchPropertyCallbackCollector; +class QueuedAlarm; +class ServerContext; +class ResponseHeaders; +class RewriteDriver; +class RewriteOptions; +class Timer; + +// Factory for creating and starting ProxyFetches. Must outlive all +// ProxyFetches it creates. +class ProxyFetchFactory { + public: + explicit ProxyFetchFactory(ServerContext* manager); + ~ProxyFetchFactory(); + + // Convenience method that calls CreateNewProxyFetch and then StartFetch() on + // the resulting fetch. + void StartNewProxyFetch( + const GoogleString& url, + AsyncFetch* async_fetch, + RewriteDriver* driver, + ProxyFetchPropertyCallbackCollector* property_callback, + AsyncFetch* original_content_fetch); + + // Creates a new proxy fetch and passes it to the fetcher to start it. If the + // UrlNamer doesn't authorize this url it calls CleanUp() on the driver, + // Detach() on the property callback, Done() on the async_fetch and + // original_content_fetch, and returns NULL. + // + // If you're using a fetcher for the original request content you should use + // StartNewProxyFetch() instead. 
CreateNewProxyFetch is for callers who will + // not be calling StartFetch() and instead will call HeadersComplete(), + // Write(), Flush(), and Done() as they get data in from another source. + ProxyFetch* CreateNewProxyFetch( + const GoogleString& url, + AsyncFetch* async_fetch, + RewriteDriver* driver, + ProxyFetchPropertyCallbackCollector* property_callback, + AsyncFetch* original_content_fetch); + + MessageHandler* message_handler() const { return handler_; } + + private: + friend class ProxyFetch; + + // Helps track the status of in-flight ProxyFetches. These are intended for + // use only by ProxyFetch. + // + // TODO(jmarantz): Enumerate outstanding fetches in server status page. + void RegisterNewFetch(ProxyFetch* proxy_fetch); + void RegisterFinishedFetch(ProxyFetch* proxy_fetch); + + ServerContext* manager_; + Timer* timer_; + MessageHandler* handler_; + + scoped_ptr outstanding_proxy_fetches_mutex_; + std::set outstanding_proxy_fetches_; + + DISALLOW_COPY_AND_ASSIGN(ProxyFetchFactory); +}; + +// Tracks a single property-cache lookup. These lookups are initiated +// immediately upon handling the request, in parallel with determining +// domain-specific RewriteOptions and fetching the HTTP headers for the HTML. +// +// Request handling can proceed in parallel with the property-cache lookups, +// including RewriteOptions lookup and initiating the HTTP fetch. However, +// handling incoming bytes will be blocked waiting for property-cache lookups +// to complete. +class ProxyFetchPropertyCallback : public PropertyPage { + public: + // The cache type associated with this callback. + enum CacheType { + kPagePropertyCache, + kClientPropertyCache + }; + + ProxyFetchPropertyCallback(CacheType cache_type, + const StringPiece& key, + ProxyFetchPropertyCallbackCollector* collector, + AbstractMutex* mutex); + + CacheType cache_type() const { return cache_type_; } + + // Delegates to collector_'s IsCacheValid.
+ virtual bool IsCacheValid(int64 write_timestamp_ms) const; + + virtual void Done(bool success); + + private: + CacheType cache_type_; + ProxyFetchPropertyCallbackCollector* collector_; + GoogleString url_; + DISALLOW_COPY_AND_ASSIGN(ProxyFetchPropertyCallback); +}; + +// Tracks a collection of property-cache lookups occurring in parallel. +class ProxyFetchPropertyCallbackCollector { + public: + ProxyFetchPropertyCallbackCollector(ServerContext* manager, + const StringPiece& url, + const RewriteOptions* options); + virtual ~ProxyFetchPropertyCallbackCollector(); + + // Add a callback to be handled by this collector. + // Transfers ownership of the callback to the collector. + void AddCallback(ProxyFetchPropertyCallback* callback); + + // In our flow, we initiate the property-cache lookup prior to + // creating a proxy-fetch, so that RewriteOptions lookup can proceed + // in parallel. If/when we determine that ProxyFetch is associated + // with HTML content, we connect it to this callback. Note that if + // the property cache lookups have completed, this will result in + // a direct call into proxy_fetch->PropertyCacheComplete. + void ConnectProxyFetch(ProxyFetch* proxy_fetch); + + // If for any reason we decide *not* to initiate a ProxyFetch for a + // request, then we need to 'detach' this request so that we can + // delete it once it completes, rather than waiting for a + // ProxyFetch to be inserted. The status code of the response is passed from + // ProxyFetch to the Collector. In case the status code is unknown then pass + // RewriteDriver::kStatusCodeUnknown. + void Detach(HttpStatus::Code status_code); + + // Returns the collected PropertyPage with the corresponding cache_type. + // Ownership of the object is transferred to the caller. + PropertyPage* GetPropertyPage( + ProxyFetchPropertyCallback::CacheType cache_type); + + // Returns the collected PropertyPage with the corresponding cache_type. + // Ownership of the object is retained by collector.
+ PropertyPage* GetPropertyPageWithoutOwnership( + ProxyFetchPropertyCallback::CacheType cache_type); + + // In our flow, property-page will be available via RewriteDriver only after + // ProxyFetch is set. But there may be instances where the result may be + // required even before proxy-fetch is created. Any task that depends on the + // PropertyCache result will be executed as soon as PropertyCache lookup is + // done. + // func is guaranteed to execute after PropertyCache lookup has completed, as + // long as ProxyFetch is not set before PropertyCache lookup is done. One + // should use PropertyCache result via RewriteDriver if some other thread can + // initiate SetProxyFetch(). + void AddPostLookupTask(Function* func); + + // If options_ is NULL returns true. Else, returns true if (url_, + // write_timestamp_ms) is valid as per URL cache invalidation entries in + // options_. + bool IsCacheValid(int64 write_timestamp_ms) const; + + // Called by a ProxyFetchPropertyCallback when the former is complete. + void Done(ProxyFetchPropertyCallback* callback, bool success); + + // Updates the status code of response in property cache. + void UpdateStatusCodeInPropertyCache(); + + private: + std::set pending_callbacks_; + std::map + property_pages_; + scoped_ptr mutex_; + ServerContext* server_context_; + GoogleString url_; + bool detached_; // protected by mutex_. + bool done_; // protected by mutex_. + bool success_; // protected by mutex_; accessed after quiescence. + ProxyFetch* proxy_fetch_; // protected by mutex_. + // protected by mutex_. + scoped_ptr > post_lookup_task_vector_; + const RewriteOptions* options_; // protected by mutex_; + HttpStatus::Code status_code_; // status_code_ of the response. + + DISALLOW_COPY_AND_ASSIGN(ProxyFetchPropertyCallbackCollector); +}; + +// Manages a single fetch of an HTML or resource file from the original server. +// If it is an HTML file, it is rewritten.
+// Fetch is initialized by calling ProxyFetchFactory::StartNewProxyFetch(). +// For fetching pagespeed rewritten resources, use ResourceFetch. +// This is only meant to be used by ProxyInterface. +// +// Takes ownership of custom_options. +// +// The ProxyFetch passes through non-HTML directly to base_writer. +// +// For HTML, the sequence is this: +// 1. HeadersComplete is called, allowing us to establish we've got HTML. +// 2. Some number of calls to Write occur. +// 3. Optional: Flush is called, followed by more Writes. Repeat. +// 4. Done is called. +// These virtual methods are called from some arbitrary thread, e.g. a +// dedicated fetcher thread. We use a QueuedWorkerPool::Sequence to +// offload them to a worker-thread. This implementation bundles together +// multiple Writes, and depending on the timing, may move Flushes to +// follow Writes and collapse multiple Flushes into one. +class ProxyFetch : public SharedAsyncFetch { + public: + // These strings identify sync-points for reproducing races between + // PropertyCache lookup completion and Origin HTML Fetch completion. + static const char kCollectorDone[]; + static const char kCollectorPrefix[]; + static const char kCollectorReady[]; + static const char kCollectorDelete[]; + static const char kCollectorDetach[]; + static const char kCollectorDoneDelete[]; + + // These strings identify sync-points for introducing races between + // PropertyCache lookup completion and HeadersComplete. + static const char kHeadersSetupRaceAlarmQueued[]; + static const char kHeadersSetupRaceDone[]; + static const char kHeadersSetupRaceFlush[]; + static const char kHeadersSetupRacePrefix[]; + static const char kHeadersSetupRaceWait[]; + + // Number of milliseconds to wait, in a test, for an event that we + // are hoping does not occur, specifically an inappropriate call to + // base_fetch()->HeadersComplete() while we are still mutating + // response headers in SetupForHtml. + // + // This is used only for testing. 
+ static const int kTestSignalTimeoutMs = 200; + + protected: + // protected interface from AsyncFetch. + virtual void HandleHeadersComplete(); + virtual bool HandleWrite(const StringPiece& content, MessageHandler* handler); + virtual bool HandleFlush(MessageHandler* handler); + virtual void HandleDone(bool success); + virtual bool IsCachedResultValid(const ResponseHeaders& headers); + + private: + friend class ProxyFetchFactory; + friend class ProxyFetchPropertyCallbackCollector; + friend class MockProxyFetch; + + // Called by ProxyFetchPropertyCallbackCollector when all property-cache + // fetches are complete. This function takes ownership of collector. + virtual void PropertyCacheComplete( + bool success, ProxyFetchPropertyCallbackCollector* collector); + + // Returns the AbstractClientState object carried by the property cache + // callback collector, if any. Returns NULL if no AbstractClientState + // is found. This method assumes that the property cache is enabled and + // the client state property cache lookup has completed. + AbstractClientState* GetClientState( + ProxyFetchPropertyCallbackCollector* collector); + + // If cross_domain is true, we're requested under a domain different from + // the underlying host, using proxy mode in UrlNamer. + ProxyFetch(const GoogleString& url, + bool cross_domain, + ProxyFetchPropertyCallbackCollector* property_cache_callback, + AsyncFetch* async_fetch, + AsyncFetch* original_content_fetch, + RewriteDriver* driver, + ServerContext* manager, + Timer* timer, + ProxyFetchFactory* factory); + virtual ~ProxyFetch(); + + const RewriteOptions* Options(); + + // Once we have decided this is HTML, begin parsing and set headers. + void SetupForHtml(); + + // Adds a pagespeed header to response_headers if enabled. + void AddPagespeedHeader(); + + // Sets up driver_, registering the writer and start parsing url. + // Returns whether we started parsing successfully or not. 
+ bool StartParse(); + + // Start the fetch which includes preparing the request. + void StartFetch(); + + // Actually do the fetch, called from callback of StartFetch. + void DoFetch(); + + // Handles buffered HTML writes, flushes, and done calls + // in the QueuedWorkerPool::Sequence sequence_. + void ExecuteQueued(); + + // Schedules the task to run any buffered work, if needed. Assumes mutex + // held. + void ScheduleQueueExecutionIfNeeded(); + + // Frees up the RewriteDriver (via FinishParse or ReleaseRewriteDriver), + // calls the callback (nulling out callback_ to ensure that we don't + // do it again), notifies the ProxyInterface that the fetch is + // complete, and deletes the ProxyFetch. + void Finish(bool success); + + // Used to wrap up the FinishParseAsync invocation. + void CompleteFinishParse(bool success); + + // Callback we give to ExecuteFlushIfRequestedAsync to notify us when + // it's done with its work. + void FlushDone(); + + // Management functions for idle_alarm_. Must only be called from + // within sequence_. + + // Cancels any previous alarm. + void CancelIdleAlarm(); + + // Cancels previous alarm and starts next one. + void QueueIdleAlarm(); + + // Handler for the alarm; run in sequence_. + void HandleIdleAlarm(); + + GoogleString url_; + ServerContext* server_context_; + Timer* timer_; + + scoped_ptr cache_fetcher_; + + // True if we're handling a cross-domain request in proxy mode, which + // should do some additional checking. + bool cross_domain_; + + // Does page claim to be "Content-Type: text/html"? (It may be lying) + bool claims_html_; + + // Has a call to StartParse succeeded? We'll only do this if we actually + // decide it is HTML. + bool started_parse_; + + // Has a call to RewriteDriver::ParseText been made yet. + bool parse_text_called_; + + // Tracks whether Done() has been called. + bool done_called_; + + HtmlDetector html_detector_; + + // Tracks a set of outstanding property-cache lookups. 
This is NULLed + // when the property-cache completes or when we detach it. We use + // this to detach the callback if we decide we don't care about the + // property-caches because we discovered we are not working with HTML. + ProxyFetchPropertyCallbackCollector* property_cache_callback_; + + // Fetch where raw original headers and contents are sent. + // To contrast, base_fetch() is sent rewritten contents and headers. + // If NULL, original_content_fetch_ is ignored. + AsyncFetch* original_content_fetch_; + + // ProxyFetch is responsible for getting RewriteDrivers from the pool and + // putting them back. + RewriteDriver* driver_; + + // True if we have queued up ExecuteQueued but did not + // execute it yet. + bool queue_run_job_created_; + + // As the UrlAsyncFetcher calls our Write & Flush methods, we collect + // the text in text_queue_, and note the Flush call in + // network_flush_outstanding_, returning control to the fetcher as quickly + // as possible so it can continue to process incoming network traffic. + // + // We offload the handling of the incoming text events to a + // QueuedWorkerPool::Sequence. Note that we may receive a new chunk + // of text while we are still processing an old chunk. The sequentiality + // is preserved by QueuedWorkerPool::Sequence. + // + // The Done callback is also indirected through this Sequence. + scoped_ptr mutex_; + StringStarVector text_queue_; + bool network_flush_outstanding_; + QueuedWorkerPool::Sequence* sequence_; + + // done_outstanding_ will be true if we got called with ::Done but didn't + // invoke Finish yet. + bool done_outstanding_; + + // finishing_ is true if we started Finish, perhaps doing FinishParseAsync. + // Accessed only from within context of sequence_. + bool finishing_; + + // done_result_ is used to store the result of ::Done if we're deferring + // handling it until the driver finishes handling a Flush.
+ bool done_result_; + + // We may also end up receiving new events in between calling FlushAsync + // and getting the callback called. In that case, we want to hold off + // on actually dispatching things queued up above. + bool waiting_for_flush_to_finish_; + + // Alarm used to keep track of inactivity, in order to help issue + // flushes. Must only be accessed from the thread context of sequence_ + QueuedAlarm* idle_alarm_; + + ProxyFetchFactory* factory_; + + // Whether PrepareRequest() to url_namer succeeded. + bool prepare_success_; + + DISALLOW_COPY_AND_ASSIGN(ProxyFetch); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_AUTOMATIC_PUBLIC_PROXY_FETCH_H_ diff --git a/psol/include/net/instaweb/automatic/public/proxy_interface.h b/psol/include/net/instaweb/automatic/public/proxy_interface.h new file mode 100644 index 000000000..1433deaa3 --- /dev/null +++ b/psol/include/net/instaweb/automatic/public/proxy_interface.h @@ -0,0 +1,135 @@ +/* + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: sligocki@google.com (Shawn Ligocki) +// +// Simple interface for running Page Speed Automatic as a proxy. +// +// When implementing a Page Speed Automatic proxy, simply construct a +// ProxyInterface at start up time and call Fetch for every +// requested resource. 
Fetch decides how to deal with requests +// (pagespeed resources will be computed, HTML pages will be proxied +// and rewritten, and other resources will just be proxied). + +#ifndef NET_INSTAWEB_AUTOMATIC_PUBLIC_PROXY_INTERFACE_H_ +#define NET_INSTAWEB_AUTOMATIC_PUBLIC_PROXY_INTERFACE_H_ + +#include "net/instaweb/http/public/url_async_fetcher.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class AsyncFetch; +class GoogleUrl; +class MessageHandler; +class ProxyFetchPropertyCallbackCollector; +class ProxyFetchFactory; +class ServerContext; +class RewriteOptions; +class Statistics; +class TimedVariable; +class Timer; + +// TODO(sligocki): Rename as per style-guide. +class ProxyInterface : public UrlAsyncFetcher { + public: + ProxyInterface(const StringPiece& hostname, int port, + ServerContext* manager, Statistics* stats); + virtual ~ProxyInterface(); + + // Initializes statistics variables associated with this class. + static void InitStats(Statistics* statistics); + + // All requests use this interface. We decide internally whether the + // request is a pagespeed resource, HTML page to be rewritten or another + // resource to be proxied directly. + virtual void Fetch(const GoogleString& requested_url, + MessageHandler* handler, + AsyncFetch* async_fetch); + + // Callback function passed to UrlNamer to finish handling requests once we + // have rewrite_options for requests that are being proxied. + void ProxyRequestCallback( + bool is_resource_fetch, + GoogleUrl* request_url, + AsyncFetch* async_fetch, + RewriteOptions* domain_options, + RewriteOptions* query_options, + MessageHandler* handler); + + // Is this url_string well-formed enough to proxy through? 
+ bool IsWellFormedUrl(const GoogleUrl& url); + + static const char kBlinkRequestCount[]; + static const char kBlinkCriticalLineRequestCount[]; + + // Initiates the PropertyCache look up. + virtual ProxyFetchPropertyCallbackCollector* InitiatePropertyCacheLookup( + bool is_resource_fetch, + const GoogleUrl& request_url, + RewriteOptions* options, + AsyncFetch* async_fetch); + + private: + friend class ProxyInterfaceTest; + + // Handle requests that are being proxied. + // * HTML requests are rewritten. + // * Resource requests are proxied verbatim. + void ProxyRequest(bool is_resource_fetch, + const GoogleUrl& requested_url, + AsyncFetch* async_fetch, + MessageHandler* handler); + + // If the URL and port are for this server, don't proxy those (to avoid + // infinite fetching loops). This might be the favicon or something... + bool UrlAndPortMatchThisServer(const GoogleUrl& url); + + // References to unowned objects. + ServerContext* server_context_; // thread-safe + UrlAsyncFetcher* fetcher_; // thread-safe + Timer* timer_; // thread-safe + MessageHandler* handler_; // thread-safe + + // This server's hostname and port (to avoid making circular requests). + // TODO(sligocki): This assumes we will only be called as one hostname, + // there could be multiple DNS entries pointing at us. + const GoogleString hostname_; + const int port_; + + // Varz variables + // Total requests. + TimedVariable* all_requests_; + // Total Pagespeed requests. + TimedVariable* pagespeed_requests_; + // Blink requests. + TimedVariable* blink_requests_; + // Blink requests in the critical line flow. + TimedVariable* blink_critical_line_requests_; + // Rejected requests counter. 
+ TimedVariable* rejected_requests_; + + scoped_ptr proxy_fetch_factory_; + + DISALLOW_COPY_AND_ASSIGN(ProxyInterface); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_AUTOMATIC_PUBLIC_PROXY_INTERFACE_H_ diff --git a/psol/include/net/instaweb/automatic/public/resource_fetch.h b/psol/include/net/instaweb/automatic/public/resource_fetch.h new file mode 100644 index 000000000..2f2a86d4d --- /dev/null +++ b/psol/include/net/instaweb/automatic/public/resource_fetch.h @@ -0,0 +1,138 @@ +/* + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: sligocki@google.com (Shawn Ligocki) +// +// NOTE: This interface is actively under development and may be +// changed extensively. Contact us at mod-pagespeed-discuss@googlegroups.com +// if you are interested in using it. + +#ifndef NET_INSTAWEB_AUTOMATIC_PUBLIC_RESOURCE_FETCH_H_ +#define NET_INSTAWEB_AUTOMATIC_PUBLIC_RESOURCE_FETCH_H_ + +#include "net/instaweb/http/public/async_fetch.h" +#include "net/instaweb/http/public/request_context.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/google_url.h" + +namespace net_instaweb { + +class MessageHandler; +class ServerContext; +class RewriteDriver; +class RewriteDriverPool; +class RewriteOptions; +class SyncFetcherAdapterCallback; +class Timer; + +// Manages a single fetch of a pagespeed rewritten resource. 
+// Fetch is initialized by calling ResourceFetch::Start() +// +// TODO(sligocki): Rename to PagespeedResourceFetch or something else ... +class ResourceFetch : public SharedAsyncFetch { + public: + // Start an async fetch for pagespeed resource. Response will be streamed + // to async_fetch. + // + // If custom_options is not NULL, takes ownership of it and can mutate it. + static void Start(const GoogleUrl& url, + RewriteOptions* custom_options, + // This is intentionally not set in RewriteOptions because + // it is not so much an option as request-specific info + // similar to User-Agent (also not an option). + bool using_spdy, + ServerContext* server_context, + AsyncFetch* async_fetch); + + // Fetch a pagespeed resource in a blocking fashion. Response will be + // streamed back to async_fetch, but this function will not return until + // fetch has completed. + // + // You'll probably want to use GetDriver to construct the driver passed in + // to this method, in order to properly apply experiment info encoded into + // the URL into settings. + // + // Returns true iff the fetch succeeded and thus response headers and + // contents were sent to async_fetch. + static bool BlockingFetch(const GoogleUrl& url, + ServerContext* server_context, + RewriteDriver* driver, + SyncFetcherAdapterCallback* async_fetch); + + // Creates a rewrite_driver suitable for passing to BlockingFetch + // (or StartWithDriver) incorporating any experiment settings. + // If custom_options is not NULL, takes ownership of it and can mutate it; + // otherwise the driver will be made using driver_pool with matching options + // as the pre-experiment starting point. + // + // Exactly one of custom_options and driver_pool must be non-NULL.
+ static RewriteDriver* GetDriver(const GoogleUrl& url, + RewriteOptions* custom_options, + RewriteDriverPool* driver_pool, + bool using_spdy, + ServerContext* server_context, + const RequestContextPtr& request_ctx); + + protected: + // Protected interface from AsyncFetch. + virtual void HandleHeadersComplete(); + virtual void HandleDone(bool success); + + private: + enum CleanupMode { + kAutoCleanupDriver, + kDontAutoCleanupDriver + }; + + ResourceFetch(const GoogleUrl& url, CleanupMode cleanup_mode, + RewriteDriver* driver, Timer* timer, + MessageHandler* handler, AsyncFetch* async_fetch); + virtual ~ResourceFetch(); + + // Same as Start(), but takes the RewriteDriver to use. + // cleanup_mode determines whether ResourceFetch will call Cleanup() + // on the driver itself. If it's set to kAutoCleanupDriver, the driver should + // not be used by the caller after this call. Otherwise, it may be used by + // the caller, but it's responsible for calling Cleanup() once done with it. + static void StartWithDriver(const GoogleUrl& url, + CleanupMode cleanup_mode, + ServerContext* manager, + RewriteDriver* driver, + AsyncFetch* async_fetch); + + // If we're running an experiment and the url specifies an experiment spec, + // set custom_options to use that experiment spec. If custom_options is NULL + // one will be allocated and the caller takes ownership of it. 
+ static void ApplyFuriousOptions(const ServerContext* server_context, + const GoogleUrl& url, + RewriteDriverPool* driver_pool, + RewriteOptions** custom_options); + + GoogleUrl resource_url_; + RewriteDriver* driver_; + Timer* timer_; + MessageHandler* message_handler_; + + int64 start_time_ms_; + int redirect_count_; + CleanupMode cleanup_mode_; + + DISALLOW_COPY_AND_ASSIGN(ResourceFetch); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_AUTOMATIC_PUBLIC_RESOURCE_FETCH_H_ diff --git a/psol/include/net/instaweb/automatic/public/static_rewriter.h b/psol/include/net/instaweb/automatic/public/static_rewriter.h new file mode 100644 index 000000000..40f1ea4ad --- /dev/null +++ b/psol/include/net/instaweb/automatic/public/static_rewriter.h @@ -0,0 +1,96 @@ +/* + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_AUTOMATIC_PUBLIC_STATIC_REWRITER_H_ +#define NET_INSTAWEB_AUTOMATIC_PUBLIC_STATIC_REWRITER_H_ + +#include "net/instaweb/rewriter/public/rewrite_driver_factory.h" +#include "net/instaweb/rewriter/public/rewrite_gflags.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/simple_stats.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class FileSystem; +class Hasher; +class MessageHandler; +class ServerContext; +class Statistics; +class Timer; +class UrlAsyncFetcher; +class UrlFetcher; +class Writer; + +// Implements a baseline RewriteDriverFactory with the simplest possible +// options for cache, fetchers, & system interface. +// +// TODO(jmarantz): fill out enough functionality so that this will be +// a functional static rewriter that could optimize an HTML file +// passed as a command-line parameter or via stdin. +class FileRewriter : public RewriteDriverFactory { + public: + FileRewriter(const RewriteGflags* gflags, + bool echo_errors_to_stdout); + virtual ~FileRewriter(); + virtual Hasher* NewHasher(); + virtual UrlFetcher* DefaultUrlFetcher(); + virtual UrlAsyncFetcher* DefaultAsyncUrlFetcher(); + virtual MessageHandler* DefaultHtmlParseMessageHandler(); + virtual MessageHandler* DefaultMessageHandler(); + virtual FileSystem* DefaultFileSystem(); + virtual Timer* DefaultTimer(); + virtual void SetupCaches(ServerContext* resource_manager); + virtual Statistics* statistics(); + + private: + const RewriteGflags* gflags_; + SimpleStats simple_stats_; + bool echo_errors_to_stdout_; + + DISALLOW_COPY_AND_ASSIGN(FileRewriter); +}; + +// Encapsulates the instantiation of a FileRewriter & a simple one-shot +// interface to rewrite some HTML text. 
+class StaticRewriter { + public: + StaticRewriter(int* argc, char*** argv); + StaticRewriter(); + ~StaticRewriter(); + + bool ParseText(const StringPiece& text, + const StringPiece& url, + const StringPiece& id, + const StringPiece& output_dir, + Writer* writer); + + FileSystem* file_system(); + MessageHandler* message_handler(); + + private: + RewriteGflags gflags_; + FileRewriter file_rewriter_; + ServerContext* server_context_; + + DISALLOW_COPY_AND_ASSIGN(StaticRewriter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_AUTOMATIC_PUBLIC_STATIC_REWRITER_H_ diff --git a/psol/include/net/instaweb/htmlparse/html_event.h b/psol/include/net/instaweb/htmlparse/html_event.h new file mode 100644 index 000000000..482e1c23d --- /dev/null +++ b/psol/include/net/instaweb/htmlparse/html_event.h @@ -0,0 +1,220 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_HTMLPARSE_HTML_EVENT_H_ +#define NET_INSTAWEB_HTMLPARSE_HTML_EVENT_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/htmlparse/public/html_element.h" +#include "net/instaweb/htmlparse/public/html_filter.h" +#include "net/instaweb/htmlparse/public/html_node.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { + +class HtmlEvent { + public: + explicit HtmlEvent(int line_number) : line_number_(line_number) { + } + virtual ~HtmlEvent(); + virtual void Run(HtmlFilter* filter) = 0; + virtual void ToString(GoogleString* buffer) = 0; + + // If this is a StartElement event, returns the HtmlElement that is being + // started. Otherwise returns NULL. + virtual HtmlElement* GetElementIfStartEvent() { return NULL; } + + // If this is an EndElement event, returns the HtmlElement that is being + // ended. Otherwise returns NULL. + virtual HtmlElement* GetElementIfEndEvent() { return NULL; } + + virtual HtmlLeafNode* GetLeafNode() { return NULL; } + virtual HtmlNode* GetNode() { return NULL; } + virtual HtmlCharactersNode* GetCharactersNode() { return NULL; } + void DebugPrint(); + + int line_number() const { return line_number_; } + + private: + int line_number_; + + DISALLOW_COPY_AND_ASSIGN(HtmlEvent); +}; + +class HtmlStartDocumentEvent: public HtmlEvent { + public: + explicit HtmlStartDocumentEvent(int line_number) : HtmlEvent(line_number) {} + virtual void Run(HtmlFilter* filter) { filter->StartDocument(); } + virtual void ToString(GoogleString* str) { *str += "StartDocument"; } + + private: + DISALLOW_COPY_AND_ASSIGN(HtmlStartDocumentEvent); +}; + +class HtmlEndDocumentEvent: public HtmlEvent { + public: + explicit HtmlEndDocumentEvent(int line_number) : HtmlEvent(line_number) {} + virtual void Run(HtmlFilter* filter) { filter->EndDocument(); } + virtual void ToString(GoogleString* str) { *str += "EndDocument"; } + + private: + 
DISALLOW_COPY_AND_ASSIGN(HtmlEndDocumentEvent); +}; + +class HtmlStartElementEvent: public HtmlEvent { + public: + HtmlStartElementEvent(HtmlElement* element, int line_number) + : HtmlEvent(line_number), + element_(element) { + } + virtual void Run(HtmlFilter* filter) { filter->StartElement(element_); } + virtual void ToString(GoogleString* str) { + *str += "StartElement "; + *str += element_->name_str(); + } + virtual HtmlElement* GetElementIfStartEvent() { return element_; } + virtual HtmlElement* GetNode() { return element_; } + private: + HtmlElement* element_; + + DISALLOW_COPY_AND_ASSIGN(HtmlStartElementEvent); +}; + +class HtmlEndElementEvent: public HtmlEvent { + public: + HtmlEndElementEvent(HtmlElement* element, int line_number) + : HtmlEvent(line_number), + element_(element) { + } + virtual void Run(HtmlFilter* filter) { filter->EndElement(element_); } + virtual void ToString(GoogleString* str) { + *str += "EndElement "; + *str += element_->name_str(); + } + virtual HtmlElement* GetElementIfEndEvent() { return element_; } + virtual HtmlElement* GetNode() { return element_; } + private: + HtmlElement* element_; + + DISALLOW_COPY_AND_ASSIGN(HtmlEndElementEvent); +}; + +class HtmlLeafNodeEvent: public HtmlEvent { + public: + explicit HtmlLeafNodeEvent(int line_number) : HtmlEvent(line_number) { } + virtual HtmlNode* GetNode() { return GetLeafNode(); } + + private: + DISALLOW_COPY_AND_ASSIGN(HtmlLeafNodeEvent); +}; + +class HtmlIEDirectiveEvent: public HtmlLeafNodeEvent { + public: + HtmlIEDirectiveEvent(HtmlIEDirectiveNode* directive, int line_number) + : HtmlLeafNodeEvent(line_number), + directive_(directive) { + } + virtual void Run(HtmlFilter* filter) { filter->IEDirective(directive_); } + virtual void ToString(GoogleString* str) { + *str += "IEDirective "; + *str += directive_->contents(); + } + virtual HtmlLeafNode* GetLeafNode() { return directive_; } + private: + HtmlIEDirectiveNode* directive_; + + DISALLOW_COPY_AND_ASSIGN(HtmlIEDirectiveEvent); +}; 
+ +class HtmlCdataEvent: public HtmlLeafNodeEvent { + public: + HtmlCdataEvent(HtmlCdataNode* cdata, int line_number) + : HtmlLeafNodeEvent(line_number), + cdata_(cdata) { + } + virtual void Run(HtmlFilter* filter) { filter->Cdata(cdata_); } + virtual void ToString(GoogleString* str) { + *str += "Cdata "; + *str += cdata_->contents(); + } + virtual HtmlLeafNode* GetLeafNode() { return cdata_; } + private: + HtmlCdataNode* cdata_; + + DISALLOW_COPY_AND_ASSIGN(HtmlCdataEvent); +}; + +class HtmlCommentEvent: public HtmlLeafNodeEvent { + public: + HtmlCommentEvent(HtmlCommentNode* comment, int line_number) + : HtmlLeafNodeEvent(line_number), + comment_(comment) { + } + virtual void Run(HtmlFilter* filter) { filter->Comment(comment_); } + virtual void ToString(GoogleString* str) { + *str += "Comment "; + *str += comment_->contents(); + } + virtual HtmlLeafNode* GetLeafNode() { return comment_; } + + private: + HtmlCommentNode* comment_; + + DISALLOW_COPY_AND_ASSIGN(HtmlCommentEvent); +}; + +class HtmlCharactersEvent: public HtmlLeafNodeEvent { + public: + HtmlCharactersEvent(HtmlCharactersNode* characters, int line_number) + : HtmlLeafNodeEvent(line_number), + characters_(characters) { + } + virtual void Run(HtmlFilter* filter) { filter->Characters(characters_); } + virtual void ToString(GoogleString* str) { + *str += "Characters "; + *str += characters_->contents(); + } + virtual HtmlLeafNode* GetLeafNode() { return characters_; } + virtual HtmlCharactersNode* GetCharactersNode() { return characters_; } + private: + HtmlCharactersNode* characters_; + + DISALLOW_COPY_AND_ASSIGN(HtmlCharactersEvent); +}; + +class HtmlDirectiveEvent: public HtmlLeafNodeEvent { + public: + HtmlDirectiveEvent(HtmlDirectiveNode* directive, int line_number) + : HtmlLeafNodeEvent(line_number), + directive_(directive) { + } + virtual void Run(HtmlFilter* filter) { filter->Directive(directive_); } + virtual void ToString(GoogleString* str) { + *str += "Directive: "; + *str += 
directive_->contents(); + } + virtual HtmlLeafNode* GetLeafNode() { return directive_; } + private: + HtmlDirectiveNode* directive_; + + DISALLOW_COPY_AND_ASSIGN(HtmlDirectiveEvent); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTMLPARSE_HTML_EVENT_H_ diff --git a/psol/include/net/instaweb/htmlparse/html_lexer.h b/psol/include/net/instaweb/htmlparse/html_lexer.h new file mode 100644 index 000000000..b89c5b8f9 --- /dev/null +++ b/psol/include/net/instaweb/htmlparse/html_lexer.h @@ -0,0 +1,237 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_ +#define NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_ + +#include +#include "net/instaweb/htmlparse/public/html_name.h" +#include "net/instaweb/htmlparse/public/doctype.h" +#include "net/instaweb/htmlparse/public/html_element.h" +#include "net/instaweb/http/public/content_type.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/printf_format.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class HtmlParse; + +// Constructs a re-entrant HTML lexer. This lexer minimally parses tags, +// attributes, and comments. It is intended to parse the Wild West of the +// Web. 
It's designed to be tolerant of syntactic transgressions, merely +// passing through unparseable chunks as Characters. +// +// TODO(jmarantz): refactor this with html_parse, so that this class owns +// the symbol table and the event queue, and no longer needs to mutually +// depend on HtmlParse. That will make it easier to unit-test. +class HtmlLexer { + public: + explicit HtmlLexer(HtmlParse* html_parse); + ~HtmlLexer(); + + // Initialize a new parse session, id is only used for error messages. + void StartParse(const StringPiece& id, const ContentType& content_type); + + // Parse a chunk of text, adding events to the parser by calling + // html_parse_->AddEvent(...). + void Parse(const char* text, int size); + + // Completes parse, reporting any leftover text as a final HtmlCharacterEvent. + void FinishParse(); + + // Determines whether a tag should be terminated in HTML. + bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const; + + // Determines whether a tag can be terminated briefly (e.g. ) + bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const; + + // Determines whether it's OK to leave a tag unclosed. + bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const; + + // Print element stack to stdout (for debugging). + void DebugPrintStack(); + + // Returns the current lowest-level parent element in the element stack, or + // NULL if the stack is empty. + HtmlElement* Parent() const; + + // Return the current assumed doctype of the document (based on the content + // type and any HTML directives encountered so far). + const DocType& doctype() const { return doctype_; } + + // Sets the limit on the maximum number of bytes that should be parsed. + void set_size_limit(int64 x) { size_limit_ = x; } + + // Indicates whether we have exceeded the limit on the maximum number of bytes + // that we should parse. 
+ bool size_limit_exceeded() const { return size_limit_exceeded_; } + + private: + // Most of these routines expect c to be the last character of literal_ + inline void EvalStart(char c); + inline void EvalTag(char c); + inline void EvalTagOpen(char c); + inline void EvalTagClose(char c); + inline void EvalTagCloseTerminate(char c); + inline void EvalTagBriefClose(char c); + inline void EvalTagBriefCloseAttr(char c); + inline void EvalCommentStart1(char c); + inline void EvalCommentStart2(char c); + inline void EvalCommentBody(char c); + inline void EvalCommentEnd1(char c); + inline void EvalCommentEnd2(char c); + inline void EvalCdataStart1(char c); + inline void EvalCdataStart2(char c); + inline void EvalCdataStart3(char c); + inline void EvalCdataStart4(char c); + inline void EvalCdataStart5(char c); + inline void EvalCdataStart6(char c); + inline void EvalCdataBody(char c); + inline void EvalCdataEnd1(char c); + inline void EvalCdataEnd2(char c); + inline void EvalAttribute(char c); + inline void EvalAttrName(char c); + inline void EvalAttrEq(char c); + inline void EvalAttrVal(char c); + inline void EvalAttrValSq(char c); + inline void EvalAttrValDq(char c); + inline void EvalLiteralTag(char c); + inline void EvalDirective(char c); + + // Makes an element based on token_, which will be parsed as the tag + // name. + void MakeElement(); + + void MakeAttribute(bool has_value); + void FinishAttribute(char c, bool has_value, bool brief_close); + + void EmitCdata(); + void EmitComment(); + void EmitLiteral(); + void EmitTagOpen(bool allow_implicit_close); // expects element_ != NULL. + void EmitTagClose(HtmlElement::CloseStyle close_style); + void EmitTagBriefClose(); + void EmitDirective(); + void Restart(char c); + + // Emits a syntax error message. + void SyntaxError(const char* format, ...) INSTAWEB_PRINTF_FORMAT(2, 3); + + // Tries to find a HTML element on the stack matching a tag. 
If it + // finds it, it pops all the intervening elements off the stack, + // issuing warnings for each discarded tag, the matching element is + // also popped off the stack, and returned. + // + // If the tag is not matched, then no mutations are done to the stack, + // and NULL is returned. + // + // The tag name should be interned. + // TODO(jmarantz): use type system + HtmlElement* PopElementMatchingTag(const StringPiece& tag); + + HtmlElement* PopElement(); + void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style); + + // Minimal i18n analysis. With utf-8 and gb2312 we can do this + // context-free, and thus the method can be static. If we add + // more encodings we may need to turn this into a non-static method. + static inline bool IsI18nChar(char c) {return (((c) & 0x80) != 0); } + + // Determines whether a character can be used in a tag name as first char ... + static inline bool IsLegalTagFirstChar(char c); + // ... or subsequent char. + static inline bool IsLegalTagChar(char c); + + // Determines whether a character can be used in an attribute name. + static inline bool IsLegalAttrNameChar(char c); + + // The lexer is implemented as a pure state machine. There is + // no lookahead. The state is understood primarily in this + // enum, although there are a few state flavors that are managed + // by the other member variables, notably: has_attr_value_ and + // attr_name_.empty(). Those could be eliminated by adding + // a few more explicit states. + enum State { + START, + TAG, // "<" + TAG_CLOSE, // " + TAG_ATTR_VALDQ, // '), + // briefly (e.g.
), or explicitly (...
). The + // Lexer will always record the way it parsed a tag, but synthesized + // elements will have AUTO_CLOSE, and rewritten elements may + // no longer qualify for the closing style with which they were + // parsed. + enum CloseStyle { + AUTO_CLOSE, // synthesized tag, or not yet closed in source + IMPLICIT_CLOSE, // E.g. + EXPLICIT_CLOSE, // E.g. anchor + BRIEF_CLOSE, // E.g. + UNCLOSED // Was never closed in source + }; + + // Various ways things can be quoted (or not) + enum QuoteStyle { + NO_QUOTE, + SINGLE_QUOTE, + DOUBLE_QUOTE + }; + + class Attribute : public InlineSListElement { + public: + // A large quantity of HTML in the wild has attributes that are + // improperly escaped. Browsers are generally tolerant of this. + // But we want to avoid corrupting pages we do not understand. + + // The result of DecodedValueOrNull() and escaped_value() is still + // owned by this, and will be invalidated by a subsequent call to + // SetValue() or SetUnescapedValue + + // Returns the attribute name, which is not guaranteed to be case-folded. + // Compare keyword() to the Keyword constant found in html_name.h for + // fast attribute comparisons. + const char* name_str() const { return name_.c_str(); } + + // Returns the HTML keyword enum. If this attribute name is not + // recognized, returns HtmlName::kNotAKeyword, and you can examine + // name_str(). + HtmlName::Keyword keyword() const { return name_.keyword(); } + + HtmlName name() const { return name_; } + void set_name(const HtmlName& name) { name_ = name; } + + // Returns the value in its original directly from the HTML source. + // This may have HTML escapes in it, such as "&". + const char* escaped_value() const { return escaped_value_.get(); } + + // The result of DecodedValueOrNull() is still owned by this, and + // will be invalidated by a subsequent call to SetValue(). 
+ // + // The result will be a NUL-terminated string containing the value of the + // attribute, or NULL if the attribute has no value at all (this is + // distinct from having the empty string for a value), or there is + // a decoding error. E.g. + // --> "val" + // --> "&" + // --> "" + // --> NULL + // --> NULL (decoding_error()==true) + // + // Returns the unescaped value, suitable for directly operating on + // in filters as URLs or other data. Note that decoding_error() is + // true if the parsed value from HTML could not be decoded. This + // might occur if: + // - the charset is not known + // - the charset is not supported. Currently none are supported and + // only values that fall in 7-bit ascii can be interpreted. + // - the charset is known & supported but the value does not appear to be + // legal. + // + // The decoded value uses 8-bit characters to represent any unicode + // code-point less than 256. + const char* DecodedValueOrNull() const { + if (!decoded_value_computed_) { + ComputeDecodedValue(); + } + return decoded_value_.get(); + } + + void set_decoding_error(bool x) { decoding_error_ = x; } + bool decoding_error() const { + if (!decoded_value_computed_) { + ComputeDecodedValue(); + } + return decoding_error_; + } + + // See comment about quote on constructor for Attribute. + // Returns the quotation mark associated with this URL. + QuoteStyle quote_style() const { return quote_style_; } + + // Textual form of quote for printing. + const char* quote_str() const; + + // Two related methods to modify the value of attribute (eg to rewrite + // dest of src or href). As with the constructor, copies the string in, + // so caller retains ownership of value. + // + // A StringPiece pointing to an empty string (that is, a char array {'\0'}) + // indicates that the attribute value is the empty string (e.g. ); however, a StringPiece with a data() pointer of NULL indicates + // that the attribute has no value at all (e.g. ). 
This is an + // important distinction. + // + // Note that passing a value containing NULs in the middle will cause + // breakage, but this isn't currently checked for. + // TODO(mdsteele): Perhaps we should check for this? + + // Sets the value of the attribute. No HTML escaping is expected. + // This call causes the HTML-escaped value to be automatically computed + // by scanning the value and escaping any characters required in HTML + // attributes. + void SetValue(const StringPiece& value); + + // Sets the escaped value. This is intended to be called from the HTML + // Lexer, and results in the Value being computed automatically by + // scanning the value for escape sequences. + void SetEscapedValue(const StringPiece& value); + + void set_quote_style(QuoteStyle new_quote_style) { + quote_style_ = new_quote_style; + } + + friend class HtmlElement; + + private: + void ComputeDecodedValue() const; + + // This should only be called from AddAttribute + Attribute(const HtmlName& name, const StringPiece& escaped_value, + QuoteStyle quote_style); + + static inline void CopyValue(const StringPiece& src, + scoped_array* dst); + + HtmlName name_; + QuoteStyle quote_style_ : 8; + mutable bool decoding_error_; + mutable bool decoded_value_computed_; + + // Attribute value represented as ascii and + // HTML-escape-sequences, typically parsed directly from an HTML + // file. This is the canonical representation, and it can handle + // any arbitrary multi-byte characters. + // + // Note that it is acceptable to have 8-bit characters in escape + // sequences (typically iso8859). However we will not be able to + // decode such attributes. + scoped_array escaped_value_; + + // An 8-bit representation of the escaped_value. Escape sequences + // that contain character-codes >= 256 are not decoded, and will + // result in decoding_error_==true. Also note that a literal 8-bit + // code in escaped_value_ cannot be decoded either. 
+ // + // We can get fewer decoding errors if we are careful to track the + // character-encoding for the document, and implement some of the + // popular ones, e.g. utf8, gb2312 and iso8859. Note that failing + // to decode an attribute value does not impact our ability to + // parse and reserialize the document. It just prevents us from + // looking at the decoded value, which is a requirement primarily + // for tags referencing URLs, e.g. . + // + // Note that we do not decode non-ASCII characters but we can + // represent them in escaped_value_. We can get 8-bit characters + // into decoded_value_ via  etc. + mutable scoped_array decoded_value_; + + DISALLOW_COPY_AND_ASSIGN(Attribute); + }; + + typedef InlineSList AttributeList; + typedef InlineSList::Iterator AttributeIterator; + typedef InlineSList::ConstIterator AttributeConstIterator; + + virtual ~HtmlElement(); + + virtual bool live() const { return (data_.get() != NULL) && data_->live_; } + virtual void MarkAsDead(const HtmlEventListIterator& end); + + // Add a copy of an attribute to this element. The attribute may come + // from this element, or another one. + void AddAttribute(const Attribute& attr); + + // Unconditionally add attribute, copying value. + // For binary attributes (those without values) use value=NULL. + // TODO(sligocki): StringPiece(NULL) seems fragile because what it is or + // how it's treated is not docutmented. + // + // Doesn't check for attribute duplication (which is illegal in html). + // + // The value, if non-null, is assumed to be unescaped. See also + // AddEscapedAttribute. + void AddAttribute(const HtmlName& name, + const StringPiece& decoded_value, + QuoteStyle quote_style); + // As AddAttribute, but assumes value has been escaped for html output. + void AddEscapedAttribute(const HtmlName& name, + const StringPiece& escaped_value, + QuoteStyle quote_style); + + // Remove the attribute with the given name. 
Return true if the attribute + // was deleted, false if it wasn't there to begin with. + bool DeleteAttribute(HtmlName::Keyword keyword); + + // Look up attribute by name. NULL if no attribute exists. + // Use this for attributes whose value you might want to change + // after lookup. + const Attribute* FindAttribute(HtmlName::Keyword keyword) const; + Attribute* FindAttribute(HtmlName::Keyword keyword) { + const HtmlElement* const_this = this; + const Attribute* result = const_this->FindAttribute(keyword); + return const_cast(result); + } + + // Look up decoded attribute value by name. + // Returns NULL if: + // 1. no attribute exists + // 2. the attribute has no value. + // 3. the attribute has a value, but it cannot currently be safely decoded. + // If you care about this distinction, call FindAttribute. + // Use this only if you don't intend to change the attribute value; + // if you might change the attribute value, use FindAttribute instead + // (this avoids a double lookup). + const char* AttributeValue(HtmlName::Keyword name) const { + const Attribute* attribute = FindAttribute(name); + if (attribute != NULL) { + return attribute->DecodedValueOrNull(); + } + return NULL; + } + + // Look up escaped attribute value by name. + // Returns NULL if: + // 1. no attribute exists + // 2. the attribute has no value. + // If you care about this distinction, call FindAttribute. + // Use this only if you don't intend to change the attribute value; + // if you might change the attribute value, use FindAttribute instead + // (this avoids a double lookup). + const char* EscapedAttributeValue(HtmlName::Keyword name) const { + const Attribute* attribute = FindAttribute(name); + if (attribute != NULL) { + return attribute->escaped_value(); + } + return NULL; + } + + // Returns the element tag name, which is not guaranteed to be + // case-folded. Compare keyword() to the Keyword constant found in + // html_name.h for fast tag name comparisons. 
+ const char* name_str() const { return data_->name_.c_str(); } + + // Returns the HTML keyword enum. If this tag name is not + // recognized, returns HtmlName::kNotAKeyword, and you can + // examine name_str(). + HtmlName::Keyword keyword() const { return data_->name_.keyword(); } + + const HtmlName& name() const { return data_->name_; } + + // Changing that tag of an element should only occur if the caller knows + // that the old attributes make sense for the new tag. E.g. a div could + // be changed to a span. + void set_name(const HtmlName& new_tag) { data_->name_ = new_tag; } + + const AttributeList& attributes() const { return data_->attributes_; } + AttributeList* mutable_attributes() { return &data_->attributes_; } + + friend class HtmlParse; + friend class HtmlLexer; + + CloseStyle close_style() const { return data_->close_style_; } + void set_close_style(CloseStyle style) { data_->close_style_ = style; } + + // Render an element as a string for debugging. This is not + // intended as a fully legal serialization. + void ToString(GoogleString* buf) const; + void DebugPrint() const; + + int begin_line_number() const { return data_->begin_line_number_; } + int end_line_number() const { return data_->end_line_number_; } + + protected: + virtual void SynthesizeEvents(const HtmlEventListIterator& iter, + HtmlEventList* queue); + + virtual HtmlEventListIterator begin() const { return data_->begin_; } + virtual HtmlEventListIterator end() const { return data_->end_; } + + private: + // All of the data associated with an HtmlElement is indirected through this + // class, so we can delete it on Flush after a CloseElement event. + struct Data { + Data(const HtmlName& name, + const HtmlEventListIterator& begin, + const HtmlEventListIterator& end); + ~Data(); + + // Pack four fields into 64 bits using bitfields. Warning: this + // stuff is quite sensitive to details, so make sure to look at + // object sizes before changing! 
Interleaving the 24-bit and + // 8-bit member variables gives a total size of 8 bytes for these + // 4 variables on a gcc 64-bit compile. But putting the two + // 24-bit integers together gives a total size of 16 bytes, so + // we interleave. + // + // HtmlParse::DeleteElement will set live_ to false without + // deleting element->data_. Flushing an ElementClose deletes + // data_ but HtmlElement knows that null data_ implies !live(). + unsigned begin_line_number_ : 24; + unsigned live_ : 8; + unsigned end_line_number_ : 24; + CloseStyle close_style_ : 8; + + HtmlName name_; + AttributeList attributes_; + HtmlEventListIterator begin_; + HtmlEventListIterator end_; + }; + + // Begin/end event iterators are used by HtmlParse to keep track + // of the span of events underneath an element. This is primarily to + // help delete the element. Events are not public. + void set_begin(const HtmlEventListIterator& begin) { data_->begin_ = begin; } + void set_end(const HtmlEventListIterator& end) { data_->end_ = end; } + + void set_begin_line_number(int line) { data_->begin_line_number_ = line; } + void set_end_line_number(int line) { data_->end_line_number_ = line; } + + // construct via HtmlParse::NewElement + HtmlElement(HtmlElement* parent, const HtmlName& name, + const HtmlEventListIterator& begin, + const HtmlEventListIterator& end); + + // HtmlElement data is held in HtmlElement::Data*, which is freed + // when a CloseElement is Flushed. The pointers themselves are + // retained and can correctly answer element->IsRewritable() and + // element->is_live(), but the rest of the data (attributes etc) + // is deleted. 
+ void FreeData() { data_.reset(NULL); } + + scoped_ptr data_; + + DISALLOW_COPY_AND_ASSIGN(HtmlElement); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_ELEMENT_H_ diff --git a/psol/include/net/instaweb/htmlparse/public/html_filter.h b/psol/include/net/instaweb/htmlparse/public/html_filter.h new file mode 100644 index 000000000..bb1b120ac --- /dev/null +++ b/psol/include/net/instaweb/htmlparse/public/html_filter.h @@ -0,0 +1,100 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_FILTER_H_ +#define NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_FILTER_H_ + +namespace net_instaweb { + +class HtmlCdataNode; +class HtmlCharactersNode; +class HtmlCommentNode; +class HtmlDirectiveNode; +class HtmlElement; +class HtmlIEDirectiveNode; + +class HtmlFilter { + public: + HtmlFilter(); + virtual ~HtmlFilter(); + + // Starts a new document. Filters should clear their state in this function, + // as the same Filter instance may be used for multiple HTML documents. + virtual void StartDocument() = 0; + // Note: EndDocument will be called imediately before the last Flush call. + virtual void EndDocument() = 0; + + // When an HTML element is encountered during parsing, each filter's + // StartElement method is called. The HtmlElement lives for the entire + // duration of the document. 
+ // + // TODO(jmarantz): consider passing handles rather than pointers and + // reference-counting them instead to save memory on long documents. + virtual void StartElement(HtmlElement* element) = 0; + virtual void EndElement(HtmlElement* element) = 0; + + // Called for CDATA blocks (e.g. ) + virtual void Cdata(HtmlCdataNode* cdata) = 0; + + // Called for HTML comments that aren't IE directives (e.g. ). + virtual void Comment(HtmlCommentNode* comment) = 0; + + // Called for an IE directive; typically used for CSS styling. + // See http://msdn.microsoft.com/en-us/library/ms537512(VS.85).aspx + // + // TODO(mdsteele): Should we try to maintain the nested structure of + // the conditionals, in the same way that we maintain nesting of elements? + virtual void IEDirective(HtmlIEDirectiveNode* directive) = 0; + + // Called for raw characters between tags. + virtual void Characters(HtmlCharactersNode* characters) = 0; + + // Called for HTML directives (e.g. ). + virtual void Directive(HtmlDirectiveNode* directive) = 0; + + // Notifies the Filter that a flush is occurring. A filter that's + // generating streamed output should flush at this time. A filter + // that's mutating elements can mutate any element seen since the + // most recent flush; once an element is flushed it is already on + // the wire to its destination and it's too late to mutate. Flush + // is initiated by an application calling HttpParse::Flush(). + // + // Flush() is called after all other handlers during a HttpParse::Flush(). + virtual void Flush() = 0; + + // Invoked by rewrite driver where all filters should determine whether + // they are enabled for this request. + virtual void DetermineEnabled() = 0; + + // Intended to be called from DetermineEnabled implementations in filters. + // Returns whether a filter is enabled. + bool is_enabled() const { return is_enabled_; } + + // The name of this filter -- used for logging and debugging. 
+ virtual const char* Name() const = 0; + + protected: + void set_is_enabled(bool is_enabled) { is_enabled_ = is_enabled; } + + private: + bool is_enabled_; +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_FILTER_H_ diff --git a/psol/include/net/instaweb/htmlparse/public/html_keywords.h b/psol/include/net/instaweb/htmlparse/public/html_keywords.h new file mode 100644 index 000000000..d184a5254 --- /dev/null +++ b/psol/include/net/instaweb/htmlparse/public/html_keywords.h @@ -0,0 +1,191 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_KEYWORDS_H_ +#define NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_KEYWORDS_H_ + +#include +#include +#include +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/htmlparse/public/html_name.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class HtmlKeywords { + public: + // Initialize a singleton instance of this class. This call is + // inherently thread unsafe, but only the first time it is called. + // If multi-threaded programs call this function before spawning + // threads then there will be no races. + static void Init(); + + // Tear down the singleton instance of this class, freeing any + // allocated memory. This call is inherently thread unsafe. 
+ static void ShutDown(); + + // Returns an HTML keyword as a string, or NULL if not a keyword. + static const char* KeywordToString(HtmlName::Keyword keyword) { + return singleton_->keyword_vector_[keyword]; + } + + // Take a raw text and escape it so it's safe for an HTML attribute, + // e.g. a&b --> a&b + static StringPiece Escape(const StringPiece& unescaped, GoogleString* buf) { + return singleton_->EscapeHelper(unescaped, buf); + } + + // Take escaped text and unescape it so its value can be interpreted, + // e.g. "http://myhost.com/p?v&w" --> "http://myhost.com/p?v&w" + // + // *decoding_error is set to true if the escaped string could not be + // safely transformed into a simple stream of bytes. + // + // TODO(jmarantz): Support a variant where we unescape to UTF-8. + static StringPiece Unescape(const StringPiece& escaped, GoogleString* buf, + bool* decoding_error) { + return singleton_->UnescapeHelper(escaped, buf, decoding_error); + } + + // Note that Escape and Unescape are not guaranteed to be inverses of + // one another. For example, Unescape("")=="&", but Escape("&")="&". + // However, note that Unescape(Escape(s)) == s. + // + // Another case to be wary of is when the argument to Unescape is not + // properly escaped. The result will be that the string is returned + // unmodified. For example, Unescape("a&b")=="a&b", butthen re-escaping + // that will give "a&b". Hence, the careful maintainer of an HTML + // parsing and rewriting system will need to maintain the original escaped + // text parsed from HTML files, and pass that to browsers. + + // Determines whether an open tag of type k1 should be automatically closed + // if a StartElement for tag k2 is encountered. E.g. should + // be transformed to . 
+ static bool IsAutoClose(HtmlName::Keyword k1, HtmlName::Keyword k2) { + return std::binary_search(singleton_->auto_close_.begin(), + singleton_->auto_close_.end(), + MakeKeywordPair(k1, k2)); + } + + // Determines whether an open tag of type k1 should be automatically closed + // if an EndElement for tag k2 is encountered. E.g. should + // be transformed into . + static bool IsContained(HtmlName::Keyword k1, HtmlName::Keyword k2) { + return std::binary_search(singleton_->contained_.begin(), + singleton_->contained_.end(), + MakeKeywordPair(k1, k2)); + } + + // Determines whether the specified HTML keyword is closed automatically + // by the parser if the close-tag is omitted. E.g. must be closed, + // but formatting elements such as

do not need to be closed. Also note + // the distinction with tags which are *implicitly* closed in HTML such as + // and
. + static bool IsOptionallyClosedTag(HtmlName::Keyword keyword) { + return std::binary_search(singleton_->optionally_closed_.begin(), + singleton_->optionally_closed_.end(), + keyword); + } + + private: + typedef int32 KeywordPair; // Encoded via shift & OR. + typedef std::vector KeywordPairVec; + typedef std::vector KeywordVec; + + HtmlKeywords(); + const char* UnescapeAttributeValue(); + void InitEscapeSequences(); + void InitAutoClose(); + void InitContains(); + void InitOptionallyClosedKeywords(); + + // Translate the escape sequence and append the corresponding character + // into *buf. + // + // accumulate_numeric_code==true means that the sequence has been accumulated + // into numeric_value and that will be used to form a character for appending + // to *buf. + // + // accumulate_numeric_code==false means that the sequence is in 'escape' and + // that will be looked up in the keyword tables to get the character to append + // to *buf. + // + // was_terminated indicates that the escape-sequence was properly terminated + // by a semicolon. This affects handling of unknown escape sequences, where + // we will need to retain the ";". + // + // Returns false iff the escape-sequence is a valid multi-byte sequence, + // which we can't currently represent in our 8-bit format. + bool TryUnescape(bool accumulate_numeric_code, + uint32 numeric_value, + const GoogleString& escape, + bool was_terminated, + GoogleString* buf) const; + + // Encodes two keyword enums as a KeywordPair, represented as an int32. + static KeywordPair MakeKeywordPair(HtmlName::Keyword k1, + HtmlName::Keyword k2) { + return (static_cast(k1) << 16) | static_cast(k2); + } + + // Adds all combinations of the members of k1_list and k2_list to + // kmap. The lists are represented as space-delimited keywords. + // E.g. if k1_list="a b" and k2_list="c d", then this adds (a,c), + // (b,c), (a,d), (b,d) to kmap. 
+ void AddCrossProduct(const StringPiece& k1_list, const StringPiece& k2_list, + KeywordPairVec* kmap); + void AddAutoClose(const StringPiece& k1_list, const StringPiece& k2_list) { + AddCrossProduct(k1_list, k2_list, &auto_close_); + } + void AddContained(const StringPiece& k1_list, const StringPiece& k2_list) { + AddCrossProduct(k1_list, k2_list, &contained_); + } + + // Adds every space-delimited token in klist to kset. + void AddToSet(const StringPiece& klist, KeywordVec* kset); + + static HtmlKeywords* singleton_; + + StringPiece EscapeHelper(const StringPiece& unescaped, + GoogleString* buf) const; + StringPiece UnescapeHelper(const StringPiece& escaped, + GoogleString* buf, + bool* decoding_error) const; + + typedef std::map StringStringMapInsensitive; + typedef std::map StringStringMapSensitive; + StringStringMapInsensitive unescape_insensitive_map_; + StringStringMapSensitive unescape_sensitive_map_; + StringStringMapSensitive escape_map_; + CharStarVector keyword_vector_; + + // These vectors of KeywordPair and Keyword are sorted numerically during + // construction to enable binary-search during parsing. + KeywordPairVec auto_close_; + KeywordPairVec contained_; + KeywordVec optionally_closed_; + + DISALLOW_COPY_AND_ASSIGN(HtmlKeywords); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_KEYWORDS_H_ diff --git a/psol/include/net/instaweb/htmlparse/public/html_name.h b/psol/include/net/instaweb/htmlparse/public/html_name.h new file mode 100644 index 000000000..58961d846 --- /dev/null +++ b/psol/include/net/instaweb/htmlparse/public/html_name.h @@ -0,0 +1,274 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_NAME_H_ +#define NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_NAME_H_ + +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +// HTML names are case insensitive. However, in the parser, we keep +// the original parsed case of the name, in addition to the html +// keyword enumeration, if any. Thus for both tags and attribute +// names, we have an enum representation which is used in filters +// for scanning, plus we have the original string representation. +class HtmlName { + public: + // We keep both attribute names and tag names in the same space + // for convenience. This list must be kept in alpha-order and + // in sync with the static array in html_name.cc. + // + // Note that this list does not need to cover all HTML keywords -- + // only the ones that we are interested in for rewriting. 
+ enum Keyword { + kXml, // ?Xml + kA, + kAbbr, + kAction, + kAddress, + kAlt, + kArea, + kArticle, + kAside, + kAsync, + kAudio, + kAutocomplete, + kAutofocus, + kAutoplay, + kB, + kBackground, + kBase, + kBdi, + kBdo, + kBlockquote, + kBody, + kBr, + kButton, + kCharset, + kChecked, + kCite, + kClass, + kCode, + kCol, + kColgroup, + kColspan, + kCommand, + kContent, + kControls, + kData, + kDataSrc, + kDd, + kDeclare, + kDefaultchecked, + kDefaultselected, + kDefer, + kDel, + kDetails, + kDfn, + kDir, + kDisabled, + kDisplay, + kDiv, + kDl, + kDt, + kEm, + kEmbed, + kEnctype, + kEvent, + kFieldset, + kFont, + kFooter, + kFor, + kForm, + kFormaction, + kFormnovalidate, + kFrame, + kFrameborder, + kH1, + kH2, + kH3, + kH4, + kH5, + kH6, + kHead, + kHeader, + kHeight, + kHgroup, + kHr, + kHref, + kHtml, + kHttpEquiv, + kI, + kIcon, + kId, + kIframe, + kImg, + kIndeterminate, + kIns, + kInput, + kIsmap, + kKbd, + kKeygen, + kKeytype, + kLang, + kLanguage, + kLi, + kLink, + kLoop, + kManifest, + kMark, + kMarquee, + kMedia, + kMenu, + kMeta, + kMethod, + kMultiple, + kMuted, + kName, + kNav, + kNoembed, + kNohref, + kNoresize, + kNoscript, + kNovalidate, + kObject, + kOl, + kOnclick, + kOnload, + kOpen, + kOptgroup, + kOption, + kOther, + kP, + kPagespeedBlankSrc, + kPagespeedHighResSrc, + kPagespeedIframe, + kPagespeedLazySrc, + kPagespeedLowResSrc, + kPagespeedLscExpiry, + kPagespeedLscHash, + kPagespeedLscUrl, + kPagespeedNoDefer, + kPagespeedNoTransform, + kPagespeedOrigSrc, + kPagespeedOrigType, + kPagespeedSize, + kParam, + kPre, + kProfile, + kQ, + kReadonly, + kRel, + kRequired, + kReversed, + kRowspan, + kRp, + kRt, + kRuby, + kS, + kSamp, + kScoped, + kScript, + kScrolling, + kSeamless, + kSection, + kSelect, + kSelected, + kShape, + kSmall, + kSource, + kSpan, + kSrc, + kStrong, + kStyle, + kSub, + kTable, + kTag, + kTbody, + kTd, + kTest, + kTextarea, + kTfoot, + kTh, + kThead, + kTime, + kTr, + kTrack, + kType, + kU, + kUl, + kValuetype, + kVar, + kVideo, 
+ kWbr, + kWidth, + kWrap, + kXmp, + kNotAKeyword + }; + + // Constructs an HTML name given a keyword, which can be + // HtmlName::kNotAKeyword, and 'const char* str'. 'str' + // is used to retain the case-sensitive spelling of the + // keyword. The storage for 'str' must be managed, and + // must be guaranteed valid throughout the life of the HtmlName. + HtmlName(Keyword keyword, const char* str) + : keyword_(keyword), c_str_(str) { + } + + // Returns the keyword enumeration for this HTML Name. Note that + // keyword lookup is case-insensitive. + Keyword keyword() const { return keyword_; } + + // Return the atom string, which may not be case folded. + const char* c_str() const { return c_str_; } + + // Limited iterator (not an STL iterator). Example usage: + // for (HtmlName::Iterator iter; !iter.AtEnd(); iter.Next()) { + // use(iter.keyword(), iter.name()); + // } + class Iterator { + public: + Iterator() : index_(-1) { Next(); } + bool AtEnd() const; + void Next(); + Keyword keyword() const; + const char* name() const; + + private: + int index_; + + // Implicit copy and assign ok. The members can be safely copied by bits. + }; + + static int num_keywords(); + static Keyword Lookup(const StringPiece& name); + + private: + friend class HtmlNameTest; + + Keyword keyword_; + const char* c_str_; + + // Implicit copy and assign ok. The members can be safely copied by bits. +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_NAME_H_ diff --git a/psol/include/net/instaweb/htmlparse/public/html_node.h b/psol/include/net/instaweb/htmlparse/public/html_node.h new file mode 100644 index 000000000..fe1d89e04 --- /dev/null +++ b/psol/include/net/instaweb/htmlparse/public/html_node.h @@ -0,0 +1,250 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: mdsteele@google.com (Matthew D. Steele) + +#ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_NODE_H_ +#define NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_NODE_H_ + +#include +#include + +#include "base/logging.h" +#include "net/instaweb/util/public/arena.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class HtmlElement; +class HtmlEvent; + +typedef std::list HtmlEventList; +typedef HtmlEventList::iterator HtmlEventListIterator; + +// Base class for HtmlElement and HtmlLeafNode +class HtmlNode { + public: + virtual ~HtmlNode(); + friend class HtmlParse; + + HtmlElement* parent() const { return parent_; } + virtual bool live() const = 0; + + // Marks a node as dead. The queue's end iterator should be passed in, + // to remove references to stale iterators, and to force IsRewritable to + // return false. + virtual void MarkAsDead(const HtmlEventListIterator& end) = 0; + + void* operator new(size_t size, Arena* arena) { + return arena->Allocate(size); + } + + void operator delete(void* ptr, Arena* arena) { + LOG(FATAL) << "HtmlNode must not be deleted directly."; + } + + protected: + // TODO(jmarantz): jmaessen suggests instantiating the html nodes + // without parents and computing them from context at the time they + // are instantiated from the lexer. This is a little more difficult + // when synthesizing new nodes, however. 
We assert sanity, however, + // when calling HtmlParse::ApplyFilter. + explicit HtmlNode(HtmlElement* parent) : parent_(parent) {} + + // Create new event object(s) representing this node, and insert them into + // the queue just before the given iterator; also, update this node object as + // necessary so that begin() and end() will return iterators pointing to + // the new event(s). The line number for each event should probably be -1. + virtual void SynthesizeEvents(const HtmlEventListIterator& iter, + HtmlEventList* queue) = 0; + + // Return an iterator pointing to the first event associated with this node. + virtual HtmlEventListIterator begin() const = 0; + // Return an iterator pointing to the last event associated with this node. + virtual HtmlEventListIterator end() const = 0; + + // Version that affects visibility of the destructor. + void operator delete(void* ptr) { + LOG(FATAL) << "HtmlNode must not be deleted directly."; + } + + private: + friend class HtmlLexer; + friend class HtmlTestingPeer; + + // Note: setting the parent doesn't change the DOM -- it just updates + // the pointer. This is intended to be called only from the DOM manipulation + // methods in HtmlParse. 
+ void set_parent(HtmlElement* parent) { parent_ = parent; } + + HtmlElement* parent_; + DISALLOW_COPY_AND_ASSIGN(HtmlNode); +}; + +class HtmlLeafNode : public HtmlNode { + public: + virtual ~HtmlLeafNode(); + virtual bool live() const { return (data_.get() != NULL) && data_->is_live_; } + virtual void MarkAsDead(const HtmlEventListIterator& end); + + const GoogleString& contents() const { return data_->contents_; } + virtual HtmlEventListIterator begin() const { + return data_->iter_; + } + virtual HtmlEventListIterator end() const { + return data_->iter_; + } + void set_iter(const HtmlEventListIterator& iter) { + data_->iter_ = iter; + } + + void FreeData() { data_.reset(NULL); } + + protected: + HtmlLeafNode(HtmlElement* parent, const HtmlEventListIterator& iter, + const StringPiece& contents); + + // Write-access to the contents is protected by default, and made + // accessible by subclasses that need to expose this method. + GoogleString* mutable_contents() { return &data_->contents_; } + + private: + struct Data { + Data(const HtmlEventListIterator& iter, const StringPiece& contents) + : contents_(contents.data(), contents.size()), + is_live_(true), + iter_(iter) { + } + GoogleString contents_; + bool is_live_; + HtmlEventListIterator iter_; + }; + + scoped_ptr data_; +}; + +// Leaf node representing a CDATA section +class HtmlCdataNode : public HtmlLeafNode { + public: + virtual ~HtmlCdataNode(); + friend class HtmlParse; + + protected: + virtual void SynthesizeEvents(const HtmlEventListIterator& iter, + HtmlEventList* queue); + + private: + HtmlCdataNode(HtmlElement* parent, + const StringPiece& contents, + const HtmlEventListIterator& iter) + : HtmlLeafNode(parent, iter, contents) { + } + + DISALLOW_COPY_AND_ASSIGN(HtmlCdataNode); +}; + +// Leaf node representing raw characters in HTML +class HtmlCharactersNode : public HtmlLeafNode { + public: + virtual ~HtmlCharactersNode(); + void Append(const StringPiece& str) { + mutable_contents()->append(str.data(), 
str.size()); + } + friend class HtmlParse; + + // Expose writable contents for Characters nodes. + using HtmlLeafNode::mutable_contents; + + protected: + virtual void SynthesizeEvents(const HtmlEventListIterator& iter, + HtmlEventList* queue); + + private: + HtmlCharactersNode(HtmlElement* parent, + const StringPiece& contents, + const HtmlEventListIterator& iter) + : HtmlLeafNode(parent, iter, contents) { + } + + DISALLOW_COPY_AND_ASSIGN(HtmlCharactersNode); +}; + +// Leaf node representing an HTML comment +class HtmlCommentNode : public HtmlLeafNode { + public: + virtual ~HtmlCommentNode(); + friend class HtmlParse; + + protected: + virtual void SynthesizeEvents(const HtmlEventListIterator& iter, + HtmlEventList* queue); + + private: + HtmlCommentNode(HtmlElement* parent, + const StringPiece& contents, + const HtmlEventListIterator& iter) + : HtmlLeafNode(parent, iter, contents) { + } + + DISALLOW_COPY_AND_ASSIGN(HtmlCommentNode); +}; + +// Leaf node representing an HTML IE directive +class HtmlIEDirectiveNode : public HtmlLeafNode { + public: + virtual ~HtmlIEDirectiveNode(); + friend class HtmlParse; + + protected: + virtual void SynthesizeEvents(const HtmlEventListIterator& iter, + HtmlEventList* queue); + + private: + HtmlIEDirectiveNode(HtmlElement* parent, + const StringPiece& contents, + const HtmlEventListIterator& iter) + : HtmlLeafNode(parent, iter, contents) { + } + + DISALLOW_COPY_AND_ASSIGN(HtmlIEDirectiveNode); +}; + +// Leaf node representing an HTML directive +class HtmlDirectiveNode : public HtmlLeafNode { + public: + virtual ~HtmlDirectiveNode(); + friend class HtmlParse; + + protected: + virtual void SynthesizeEvents(const HtmlEventListIterator& iter, + HtmlEventList* queue); + + private: + HtmlDirectiveNode(HtmlElement* parent, + const StringPiece& contents, + const HtmlEventListIterator& iter) + : HtmlLeafNode(parent, iter, contents) { + } + + DISALLOW_COPY_AND_ASSIGN(HtmlDirectiveNode); +}; + +} // namespace net_instaweb + +#endif // 
NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_NODE_H_ diff --git a/psol/include/net/instaweb/htmlparse/public/html_parse.h b/psol/include/net/instaweb/htmlparse/public/html_parse.h new file mode 100644 index 000000000..a786df9d3 --- /dev/null +++ b/psol/include/net/instaweb/htmlparse/public/html_parse.h @@ -0,0 +1,453 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_ +#define NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_ + +#include +#include +#include +#include +#include + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/htmlparse/public/html_element.h" +#include "net/instaweb/htmlparse/public/html_name.h" +#include "net/instaweb/htmlparse/public/html_node.h" +#include "net/instaweb/http/public/content_type.h" +#include "net/instaweb/util/public/arena.h" +#include "net/instaweb/util/public/google_url.h" +#include "net/instaweb/util/public/printf_format.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" +#include "net/instaweb/util/public/symbol_table.h" + +namespace net_instaweb { + +class DocType; +class HtmlEvent; +class HtmlFilter; +class HtmlLexer; +class MessageHandler; +class Timer; + +typedef std::set ConstHtmlEventSet; + +// TODO(jmarantz): rename HtmlParse to HtmlContext. 
The actual +// parsing occurs in HtmlLexer, and this class is dominated by methods +// to manipulate DOM as it streams through. +class HtmlParse { + public: + explicit HtmlParse(MessageHandler* message_handler); + virtual ~HtmlParse(); + + // Application methods for parsing functions and adding filters + + // Add a new html filter to the filter-chain, without taking ownership + // of it. + void AddFilter(HtmlFilter* filter); + + // Initiate a chunked parsing session. Finish with FinishParse. The + // url is only used to resolve relative URLs; the contents are not + // directly fetched. The caller must supply the text and call ParseText. + // + // Returns whether the URL is valid. + bool StartParse(const StringPiece& url) { + return StartParseWithType(url, kContentTypeHtml); + } + bool StartParseWithType(const StringPiece& url, + const ContentType& content_type) { + return StartParseId(url, url, content_type); + } + + // Returns whether the google_url() URL is valid. + bool is_url_valid() const { return url_valid_; } + + // Mostly useful for file-based rewriters so that messages can reference + // the HTML file and produce navigable errors. + // + // Returns whether the URL is valid. + virtual bool StartParseId(const StringPiece& url, const StringPiece& id, + const ContentType& content_type); + + // Parses an arbitrary block of an html file, queuing up the events. Call + // Flush to send the events through the Filter. + // + // To parse an entire file, first call StartParse(), then call + // ParseText on the file contents (in whatever size chunks are convenient), + // then call FinishParse(). + // + // It is invalid to call ParseText when the StartParse* routines returned + // false. + void ParseText(const char* content, int size) { + ParseTextInternal(content, size); + } + void ParseText(const StringPiece& sp) { + ParseTextInternal(sp.data(), sp.size()); + } + + // Flush the currently queued events through the filters. 
It is desirable + // for large web pages, particularly dynamically generated ones, to start + // getting delivered to the browser as soon as they are ready. On the + // other hand, rewriting is more powerful when more of the content can + // be considered for image/css/js spriting. This method should be called + // when the controlling network process wants to induce a new chunk of + // output. The less you call this function the better the rewriting will + // be. + // + // It is invalid to call Flush when the StartParse* routines returned + // false. + // + // If this is called from a Filter, the request will be deferred until after + // currently active filters are completed. + virtual void Flush(); + + // Finish a chunked parsing session. This also induces a Flush. + // + // It is invalid to call FinishParse when the StartParse* routines returned + // false. + virtual void FinishParse(); + + + // Utility methods for implementing filters + + HtmlCdataNode* NewCdataNode(HtmlElement* parent, + const StringPiece& contents); + HtmlCharactersNode* NewCharactersNode(HtmlElement* parent, + const StringPiece& literal); + HtmlCommentNode* NewCommentNode(HtmlElement* parent, + const StringPiece& contents); + HtmlDirectiveNode* NewDirectiveNode(HtmlElement* parent, + const StringPiece& contents); + HtmlIEDirectiveNode* NewIEDirectiveNode(HtmlElement* parent, + const StringPiece& contents); + + // DOM-manipulation methods. + // TODO(sligocki): Find Javascript equivalents and list them or even change + // our names to be consistent. + + // TODO(mdsteele): Rename these methods to e.g. InsertNodeBeforeNode. + // This and downstream filters will then see inserted elements but upstream + // filters will not. + // Note: In Javascript the first is called insertBefore and takes the arg + // in the opposite order. + // Note: new_node must not already be in the DOM. 
+ void InsertElementBeforeElement(const HtmlNode* existing_node, + HtmlNode* new_node); + void InsertElementAfterElement(const HtmlNode* existing_node, + HtmlNode* new_node); + + // Add a new child element at the beginning or end of existing_parent's + // children. Named after Javascript's appendChild method. + // Note: new_child must not already be in the DOM. + void PrependChild(const HtmlElement* existing_parent, HtmlNode* new_child); + void AppendChild(const HtmlElement* existing_parent, HtmlNode* new_child); + + // Insert a new element before the current one. current_ remains unchanged. + // Note: new_node must not already be in the DOM. + void InsertElementBeforeCurrent(HtmlNode* new_node); + + // Insert a new element after the current one, moving current_ to the new + // element. In a Filter, the flush-loop will advance past this on + // the next iteration. + // Note: new_node must not already be in the DOM. + void InsertElementAfterCurrent(HtmlNode* new_node); + + // Enclose element around two elements in a sequence. The first + // element must be the same as, or precede the last element in the + // event-stream, and this is not checked, but the two elements do + // not need to be adjacent. They must have the same parent to start + // with. + bool AddParentToSequence(HtmlNode* first, HtmlNode* last, + HtmlElement* new_parent); + + // Moves current node (and all children) to an already-existing parent, + // where they will be placed as the last elements in that parent. + // Returns false if the operation could not be performed because either + // the node or its parent was partially or wholly flushed. + // Note: Will not work if called from StartElement() event. + // + // This differs from AppendChild() because it moves the current node, + // which is already in the DOM, rather than adding a new node. + bool MoveCurrentInto(HtmlElement* new_parent); + + // Moves current node (and all children) directly before existing_node. 
+ // Note: Will not work if called from StartElement() event. + // + // This differs from InsertElementBeforeElement() because it moves the + // current node, which is already in the DOM, rather than adding a new node. + bool MoveCurrentBefore(HtmlNode* existing_node); + + // If the given node is rewritable, delete it and all of its children (if + // any) and return true; otherwise, do nothing and return false. + // Note: Javascript appears to use removeChild for this. + bool DeleteElement(HtmlNode* node); + + // Delete a parent element, retaining any children and moving them to + // reside under the parent's parent. + bool DeleteSavingChildren(HtmlElement* element); + + // Determines whether the element, in the context of its flush + // window, has children. If the element is not rewritable, or + // has not been closed yet, or inserted into the DOM event stream, + // then 'false' is returned. + // + // Note that the concept of the Flush Window is important because the + // knowledge of an element's children is not limited to the current + // event being presented to a Filter. A Filter can call this method + // in the StartElement of an event to see if any children are going + // to be coming. Of course, if the StartElement is at the end of a + // Flush window, then we won't know about the children, but IsRewritable + // will also be false. + bool HasChildrenInFlushWindow(HtmlElement* element); + + // If possible, replace the existing node with the new node and return true; + // otherwise, do nothing and return false. + bool ReplaceNode(HtmlNode* existing_node, HtmlNode* new_node); + + // Creates an another element with the same name and attributes as in_element. + // Does not duplicate the children or insert it anywhere. 
+ HtmlElement* CloneElement(HtmlElement* in_element); + + HtmlElement* NewElement(HtmlElement* parent, const StringPiece& str) { + return NewElement(parent, MakeName(str)); + } + HtmlElement* NewElement(HtmlElement* parent, HtmlName::Keyword keyword) { + return NewElement(parent, MakeName(keyword)); + } + HtmlElement* NewElement(HtmlElement* parent, const HtmlName& name); + + void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword, + const StringPiece& value) { + return element->AddAttribute(MakeName(keyword), value, + HtmlElement::DOUBLE_QUOTE); + } + void AddEscapedAttribute(HtmlElement* element, HtmlName::Keyword keyword, + const StringPiece& escaped_value) { + return element->AddEscapedAttribute(MakeName(keyword), escaped_value, + HtmlElement::DOUBLE_QUOTE); + } + void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword, + int value) { + return AddAttribute(element, keyword, IntegerToString(value)); + } + void SetAttributeName(HtmlElement::Attribute* attribute, + HtmlName::Keyword keyword) { + attribute->set_name(MakeName(keyword)); + } + + HtmlName MakeName(const StringPiece& str); + HtmlName MakeName(HtmlName::Keyword keyword); + + bool IsRewritable(const HtmlNode* node) const; + + void ClearElements(); + + // Log the HtmlEvent queue_ to the message_handler_ for debugging. + void DebugLogQueue(); + + // Print the HtmlEvent queue_ to stdout for debugging. + void DebugPrintQueue(); + + // Implementation helper with detailed knowledge of html parsing libraries + friend class HtmlLexer; + + // Determines whether a tag should be terminated in HTML, e.g. . + // We do not expect to see a close-tag for meta and should never insert one. + bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const; + + // An optionally closed tag ranges from

, which is typically not closed, + // but we infer the closing from context. Also consider , which usually + // is closed but not always. E.g. www.google.com does not close its html tag. + bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const; + + // Determines whether a tag allows brief termination in HTML, e.g. + bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const; + + MessageHandler* message_handler() const { return message_handler_; } + // Gets the current location information; typically to help with error + // messages. + const char* url() const { return url_.c_str(); } + // Gets a parsed GoogleUrl& corresponding to url(). + const GoogleUrl& google_url() const { return google_url_; } + const char* id() const { return id_.c_str(); } + int line_number() const { return line_number_; } + // Returns URL (or id) and line number as a string, to be used in messages. + GoogleString UrlLine() const { + return StringPrintf("%s:%d", id(), line_number()); + } + + // Return the current assumed doctype of the document (based on the content + // type and any HTML directives encountered so far). + const DocType& doctype() const; + + // Interface for any caller to report an error message via the message handler + void Info(const char* filename, int line, const char* msg, ...) + INSTAWEB_PRINTF_FORMAT(4, 5); + void Warning(const char* filename, int line, const char* msg, ...) + INSTAWEB_PRINTF_FORMAT(4, 5); + void Error(const char* filename, int line, const char* msg, ...) + INSTAWEB_PRINTF_FORMAT(4, 5); + void FatalError(const char* filename, int line, const char* msg, ...) 
+ INSTAWEB_PRINTF_FORMAT(4, 5); + + void InfoV(const char* file, int line, const char *msg, va_list args); + void WarningV(const char* file, int line, const char *msg, va_list args); + void ErrorV(const char* file, int line, const char *msg, va_list args); + void FatalErrorV(const char* file, int line, const char* msg, va_list args); + + // Report error message with current parsing filename and linenumber. + void InfoHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); + void WarningHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); + void ErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); + void FatalErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); + + // If set_log_rewrite_timing(true) has been called, logs the given message + // at info level with a timeset offset from the parsing start time, + void ShowProgress(const char* message); + + void InfoHereV(const char *msg, va_list args) { + InfoV(id_.c_str(), line_number_, msg, args); + } + void WarningHereV(const char *msg, va_list args) { + WarningV(id_.c_str(), line_number_, msg, args); + } + void ErrorHereV(const char *msg, va_list args) { + ErrorV(id_.c_str(), line_number_, msg, args); + } + void FatalErrorHereV(const char* msg, va_list args) { + FatalErrorV(id_.c_str(), line_number_, msg, args); + } + + void AddElement(HtmlElement* element, int line_number); + void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style, + int line_number); + + // Run a filter on the current queue of parse nodes. + void ApplyFilter(HtmlFilter* filter); + + // Provide timer to helping to report timing of each filter. You must also + // set_log_rewrite_timing(true) to turn on this reporting. + void set_timer(Timer* timer) { timer_ = timer; } + void set_log_rewrite_timing(bool x) { log_rewrite_timing_ = x; } + + // Adds a filter to be called during parsing as new events are added. + // Takes ownership of the HtmlFilter passed in. 
+ void add_event_listener(HtmlFilter* listener); + + // Inserts a comment before or after the current node. The function tries to + // pick an intelligent place depending on the document structure and + // whether the current node is a start-element, end-element, or a leaf. + void InsertComment(const StringPiece& sp); + + // Sets the limit on the maximum number of bytes that should be parsed. + void set_size_limit(int64 x); + // Returns whether we have exceeded the size limit. + bool size_limit_exceeded() const; + + protected: + typedef std::vector FilterVector; + typedef std::list FilterList; + + // HtmlParse::FinishParse() is equivalent to the sequence of + // BeginFinishParse(); Flush(); EndFinishParse(). + // Split up to permit asynchronous versions. + void BeginFinishParse(); + void EndFinishParse(); + + // Returns the number of events on the event queue. + size_t GetEventQueueSize(); + + // Move the entire contents of extra_events onto the end of the event queue. + void AppendEventsToQueue(HtmlEventList* extra_events); + + // Move the entire event queue after the first event in event_set to the end + // of tail. Return that event, or NULL if there was none. + HtmlEvent* SplitQueueOnFirstEventInSet(const ConstHtmlEventSet& event_set, + HtmlEventList* tail); + + // Return the EndElementEvent for this element, or NULL if it doesn't exist + // yet. + HtmlEvent* GetEndElementEvent(const HtmlElement* element); + + virtual void ParseTextInternal(const char* content, int size); + + // Allow filters to determine whether they are enabled for this request. 
+ void DetermineEnabledFilters(FilterVector* filters) const; + + private: + void ApplyFilterHelper(HtmlFilter* filter); + HtmlEventListIterator Last(); // Last element in queue + bool IsInEventWindow(const HtmlEventListIterator& iter) const; + void InsertElementBeforeEvent(const HtmlEventListIterator& event, + HtmlNode* new_node); + void InsertElementAfterEvent(const HtmlEventListIterator& event, + HtmlNode* new_node); + bool MoveCurrentBeforeEvent(const HtmlEventListIterator& move_to); + bool IsDescendantOf(const HtmlNode* possible_child, + const HtmlNode* possible_parent); + void SanityCheck(); + void CheckEventParent(HtmlEvent* event, HtmlElement* expect, + HtmlElement* actual); + void CheckParentFromAddEvent(HtmlEvent* event); + void FixParents(const HtmlEventListIterator& begin, + const HtmlEventListIterator& end_inclusive, + HtmlElement* new_parent); + void CoalesceAdjacentCharactersNodes(); + void ClearEvents(); + void EmitQueue(MessageHandler* handler); + + // Visible for testing only, via HtmlTestingPeer + friend class HtmlTestingPeer; + void AddEvent(HtmlEvent* event); + void SetCurrent(HtmlNode* node); + void set_coalesce_characters(bool x) { coalesce_characters_ = x; } + size_t symbol_table_size() const { + return string_table_.string_bytes_allocated(); + } + + FilterVector event_listeners_; + SymbolTableSensitive string_table_; + FilterVector filters_; + HtmlLexer* lexer_; + Arena nodes_; + HtmlEventList queue_; + HtmlEventListIterator current_; + // Have we deleted current? Then we shouldn't do certain manipulations to it. + MessageHandler* message_handler_; + GoogleString url_; + GoogleUrl google_url_; + GoogleString id_; // Per-request identifier string used in error messages. + int line_number_; + bool deleted_current_; + bool need_sanity_check_; + bool coalesce_characters_; + bool need_coalesce_characters_; + bool url_valid_; + bool log_rewrite_timing_; // Should we time the speed of parsing? 
+ bool running_filters_; + int64 parse_start_time_us_; + Timer* timer_; + int first_filter_; + + DISALLOW_COPY_AND_ASSIGN(HtmlParse); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_ diff --git a/psol/include/net/instaweb/htmlparse/public/html_parse_test_base.h b/psol/include/net/instaweb/htmlparse/public/html_parse_test_base.h new file mode 100644 index 000000000..ab9563a5b --- /dev/null +++ b/psol/include/net/instaweb/htmlparse/public/html_parse_test_base.h @@ -0,0 +1,168 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) + +// Infrastructure for testing html parsing and rewriting. 
+ +#ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_TEST_BASE_H_ +#define NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_TEST_BASE_H_ + +#include "net/instaweb/htmlparse/public/html_parse.h" +#include "net/instaweb/htmlparse/public/html_writer_filter.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/gtest.h" +#include "net/instaweb/util/public/mock_message_handler.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" +#include "net/instaweb/util/public/string_writer.h" + +namespace net_instaweb { + +class HtmlParseTestBaseNoAlloc : public testing::Test { + protected: + static const char kTestDomain[]; + static const char kXhtmlDtd[]; // DOCTYPE string for claiming XHTML + + HtmlParseTestBaseNoAlloc() + : write_to_string_(&output_buffer_), + added_filter_(false) { + } + virtual ~HtmlParseTestBaseNoAlloc(); + + // To make the tests more concise, we generally omit the ... + // tags bracketing the input. The libxml parser will add those in + // if we don't have them. To avoid having that make the test data more + // verbose, we automatically add them in the test infrastructure, both + // for stimulus and expected response. + // + // This flag controls whether we also add ... tags. In + // the case html_parse_test, we go ahead and add them in. In the + // case of the rewriter tests, we want to explicitly control/observe + // the head and the body so we don't add the body tags in + // automatically. So classes that derive from HtmlParseTestBase must + // override this variable to indicate which they prefer. + virtual bool AddBody() const = 0; + + // If true, prepends "\n" and appends "\n" to input text + // prior to parsing it. This was originally done for consistency with + // libxml2 but that's long since been made irrelevant and we should probably + // just stop doing it. 
Adding the virtual function here should help us + // incrementally update tests & their gold results. + virtual bool AddHtmlTags() const { return true; } + + // Set a doctype string (e.g. "") to be inserted before the + // rest of the document (for the current test only). If none is set, it + // defaults to the empty string. + void SetDoctype(const StringPiece& directive) { + directive.CopyToString(&doctype_string_); + } + + virtual GoogleString AddHtmlBody(const StringPiece& html) { + GoogleString ret; + if (AddHtmlTags()) { + ret = AddBody() ? "\n" : "\n"; + StrAppend(&ret, html, (AddBody() ? "\n\n" : "\n")); + } else { + html.CopyToString(&ret); + } + return ret; + } + + // Check that the output HTML is serialized to string-compare + // precisely with the input. + void ValidateNoChanges(const StringPiece& case_id, + const GoogleString& html_input) { + ValidateExpected(case_id, html_input, html_input); + } + + // Fail to ValidateNoChanges. + void ValidateNoChangesFail(const StringPiece& case_id, + const GoogleString& html_input) { + ValidateExpectedFail(case_id, html_input, html_input); + } + + void SetupWriter() { + SetupWriter(&html_writer_filter_); + } + + void SetupWriter(scoped_ptr* html_writer_filter) { + output_buffer_.clear(); + if (html_writer_filter->get() == NULL) { + html_writer_filter->reset(new HtmlWriterFilter(html_parse())); + (*html_writer_filter)->set_writer(&write_to_string_); + html_parse()->AddFilter(html_writer_filter->get()); + } + } + + // Parse html_input, the result is stored in output_buffer_. + void Parse(const StringPiece& case_id, const GoogleString& html_input) { + // HtmlParser needs a valid HTTP URL to evaluate relative paths, + // so we create a dummy URL. + GoogleString dummy_url = StrCat(kTestDomain, case_id, ".html"); + ParseUrl(dummy_url, html_input); + } + + // Parse given an explicit URL rather than an id to build URL around. 
+ virtual void ParseUrl(const StringPiece& url, const StringPiece& html_input); + + // Validate that the output HTML serializes as specified in + // 'expected', which might not be identical to the input. + // Also, returns true if result came out as expected. + bool ValidateExpected(const StringPiece& case_id, + const GoogleString& html_input, + const GoogleString& expected); + + // Same as ValidateExpected, but with an explicit URL rather than an id. + bool ValidateExpectedUrl(const StringPiece& url, + const GoogleString& html_input, + const GoogleString& expected); + + // Fail to ValidateExpected. + void ValidateExpectedFail(const StringPiece& case_id, + const GoogleString& html_input, + const GoogleString& expected); + + virtual HtmlParse* html_parse() = 0; + + MockMessageHandler message_handler_; + StringWriter write_to_string_; + GoogleString output_buffer_; + bool added_filter_; + scoped_ptr html_writer_filter_; + GoogleString doctype_string_; + + private: + DISALLOW_COPY_AND_ASSIGN(HtmlParseTestBaseNoAlloc); +}; + +class HtmlParseTestBase : public HtmlParseTestBaseNoAlloc { + public: + HtmlParseTestBase() : html_parse_(&message_handler_) { + }; + protected: + virtual HtmlParse* html_parse() { return &html_parse_; } + + HtmlParse html_parse_; + + private: + DISALLOW_COPY_AND_ASSIGN(HtmlParseTestBase); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_TEST_BASE_H_ diff --git a/psol/include/net/instaweb/htmlparse/public/html_writer_filter.h b/psol/include/net/instaweb/htmlparse/public/html_writer_filter.h new file mode 100644 index 000000000..456015798 --- /dev/null +++ b/psol/include/net/instaweb/htmlparse/public/html_writer_filter.h @@ -0,0 +1,106 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: jmarantz@google.com (Joshua Marantz)
+
+#ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_WRITER_FILTER_H_
+#define NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_WRITER_FILTER_H_
+
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/htmlparse/public/html_element.h"
+#include "net/instaweb/htmlparse/public/html_filter.h"
+#include "net/instaweb/htmlparse/public/html_name.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+class HtmlCdataNode;
+class HtmlCharactersNode;
+class HtmlCommentNode;
+class HtmlDirectiveNode;
+class HtmlIEDirectiveNode;
+class HtmlParse;
+class Writer;
+// HtmlFilter that writes the HTML events it receives out through a Writer.
+class HtmlWriterFilter : public HtmlFilter {
+ public:
+  explicit HtmlWriterFilter(HtmlParse* html_parse);
+
+  void set_writer(Writer* writer) { writer_ = writer; }
+  virtual ~HtmlWriterFilter();
+
+  virtual void StartDocument();
+  virtual void EndDocument();
+  virtual void StartElement(HtmlElement* element);
+  virtual void EndElement(HtmlElement* element);
+  virtual void Cdata(HtmlCdataNode* cdata);
+  virtual void Comment(HtmlCommentNode* comment);
+  virtual void IEDirective(HtmlIEDirectiveNode* directive);
+  virtual void Characters(HtmlCharactersNode* characters);
+  virtual void Directive(HtmlDirectiveNode* directive);
+  virtual void Flush();
+  virtual void DetermineEnabled();
+
+  void set_max_column(int max_column) { max_column_ = max_column; }
+  void set_case_fold(bool case_fold) { case_fold_ = case_fold; }
+
+  virtual const char* Name() const { return "HtmlWriter"; }
+
+ protected:
+  // Clear various variables for rewriting a new html file.
+  virtual void Clear();
+
+  Writer* writer() { return writer_; }
+
+  // Terminates the current lazy close element if it is not already terminated.
+  void TerminateLazyCloseElement();
+
+ private:
+  void EmitBytes(const StringPiece& str);
+
+  // Emits an HTML name, possibly case-folded depending on the
+  // caller-specified option.
+  void EmitName(const HtmlName& name);
+
+  HtmlElement::CloseStyle GetCloseStyle(HtmlElement* element);
+
+  // Escapes arbitrary text as HTML, e.g. turning & into &amp;.  If quoteChar
+  // is non-zero, e.g. '"', then it would escape " as well.
+  void EncodeBytes(const GoogleString& val, int quoteChar);
+
+  HtmlParse* html_parse_;
+  Writer* writer_;
+
+  // Helps writer exploit shortcuts like <tag/> rather than writing
+  // <tag></tag>.  At the end of StartElement, we defer writing the ">"
+  // until we see what's coming next.  If it's the matching end_tag, then
+  // we can emit />.  If something else comes first, then we have to
+  // first emit the delayed ">" before continuing.
+  HtmlElement* lazy_close_element_;
+
+  int column_;
+  int max_column_;
+  int write_errors_;
+  bool case_fold_;
+  GoogleString case_fold_buffer_;
+
+  DISALLOW_COPY_AND_ASSIGN(HtmlWriterFilter);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_WRITER_FILTER_H_
diff --git a/psol/include/net/instaweb/htmlparse/public/logging_html_filter.h b/psol/include/net/instaweb/htmlparse/public/logging_html_filter.h
new file mode 100644
index 000000000..a6e74b056
--- /dev/null
+++ b/psol/include/net/instaweb/htmlparse/public/logging_html_filter.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: jmaessen@google.com (Jan Maessen)
+
+// html_filter that passes data through unmodified, but
+// logs statistics about the data as it goes by.
+// It should be possible to create many instances of this
+// class and insert them at different points in the rewriting flow.
+// Goal is to log:
+//   NUM_EXPLICIT_CLOSED - <tag> </tag> pairs
+//   NUM_IMPLICIT_CLOSED - <tag> for implicitly-closed tag
+//   NUM_BRIEF_CLOSED - <tag/>
+//   NUM_CLOSED - Sum of above three
+//   NUM_UNCLOSED - <tag> without matching </tag>
+//   NUM_SPURIOUS_CLOSED - </tag> without preceding <tag>; UNCOUNTED RIGHT NOW!
+//   NUM_TAGS - Total number of opening tags
+//   NUM_CDATA - cdata sections
+//   NUM_COMMENTS - comments
+//   NUM_DIRECTIVES - directives
+//   NUM_DOCUMENTS - started documents
+//   NUM_IE_DIRECTIVES - ie directives
+// Reporting:
+//     We report this information via a StatisticsLog:
+//     filter.LogStatistics(log).  NOTE(review): earlier text also mentioned
+//     before.Equals(after); no Equals() is declared in this class -- confirm.
+
+#ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_LOGGING_HTML_FILTER_H_
+#define NET_INSTAWEB_HTMLPARSE_PUBLIC_LOGGING_HTML_FILTER_H_
+
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/htmlparse/public/empty_html_filter.h"
+
+namespace net_instaweb {
+
+class HtmlCdataNode;
+class HtmlCommentNode;
+class HtmlDirectiveNode;
+class HtmlElement;
+class HtmlIEDirectiveNode;
+class StatisticsLog;
+
+class LoggingFilter : public EmptyHtmlFilter {
+ public:
+  // Internal names of statistics; the values index the stats_ array.
+  // NOTE: must match string names in kStatisticNames at top of
+  // logging_html_filter.c
+  enum Statistic {
+    MIN_STAT = 0,  // Same value as the first statistic below.
+    NUM_EXPLICIT_CLOSED = 0,
+    NUM_IMPLICIT_CLOSED,
+    NUM_BRIEF_CLOSED,
+    NUM_CLOSED,
+    NUM_UNCLOSED,
+    NUM_SPURIOUS_CLOSED,
+    NUM_TAGS,
+    NUM_CDATA,
+    NUM_COMMENTS,
+    NUM_DIRECTIVES,
+    NUM_DOCUMENTS,
+    NUM_IE_DIRECTIVES,
+    MAX_STAT  // One past the last statistic; sizes stats_.
+  };
+
+  LoggingFilter();
+
+  // HtmlFilter methods.
+  virtual void StartDocument();
+  virtual void StartElement(HtmlElement* element);
+  virtual void EndElement(HtmlElement* element);
+  virtual void Cdata(HtmlCdataNode* cdata);
+  virtual void Comment(HtmlCommentNode* comment);
+  virtual void IEDirective(HtmlIEDirectiveNode* directive);
+  virtual void Directive(HtmlDirectiveNode* directive);
+  virtual const char* Name() const { return "Logging"; }
+
+  // Getter for individual statistics; NO BOUNDS CHECKS (pass < MAX_STAT).
+  inline int get(const Statistic statistic) const {
+    return stats_[statistic];
+  }
+
+  // Logging, diffing, and aggregation
+
+  // Report all statistics to statistics_log.
+  void LogStatistics(StatisticsLog *statistics_log) const;
+
+  void Reset();
+
+ private:
+  int stats_[MAX_STAT];
+
+  DISALLOW_COPY_AND_ASSIGN(LoggingFilter);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_HTMLPARSE_PUBLIC_LOGGING_HTML_FILTER_H_
diff --git a/psol/include/net/instaweb/htmlparse/public/statistics_log.h b/psol/include/net/instaweb/htmlparse/public/statistics_log.h
new file mode 100644
index 000000000..6b6f2085b
--- /dev/null
+++ b/psol/include/net/instaweb/htmlparse/public/statistics_log.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: jmaessen@google.com (Jan Maessen)
+
+#ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_STATISTICS_LOG_H_
+#define NET_INSTAWEB_HTMLPARSE_PUBLIC_STATISTICS_LOG_H_
+
+#include "net/instaweb/util/public/basictypes.h"
+
+namespace net_instaweb {
+// Abstract sink for named integer statistics; implement LogStat to use it.
+class StatisticsLog {
+ public:
+  StatisticsLog() { }
+  virtual ~StatisticsLog();
+  virtual void LogStat(const char *statName, int value) = 0;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(StatisticsLog);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_HTMLPARSE_PUBLIC_STATISTICS_LOG_H_
diff --git a/psol/include/net/instaweb/http/public/async_fetch.h b/psol/include/net/instaweb/http/public/async_fetch.h
new file mode 100644
index 000000000..4a4f64c05
--- /dev/null
+++ b/psol/include/net/instaweb/http/public/async_fetch.h
@@ -0,0 +1,384 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: jmarantz@google.com (Joshua Marantz)
+//         sligocki@google.com (Shawn Ligocki)
+//
+// AsyncFetch represents the context of a single fetch.
+
+#ifndef NET_INSTAWEB_HTTP_PUBLIC_ASYNC_FETCH_H_
+#define NET_INSTAWEB_HTTP_PUBLIC_ASYNC_FETCH_H_
+
+#include "net/instaweb/http/public/http_value.h"
+#include "net/instaweb/http/public/request_context.h"
+#include "net/instaweb/http/public/request_headers.h"
+#include "net/instaweb/http/public/response_headers.h"
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/util/public/string_util.h"
+#include "net/instaweb/util/public/writer.h"
+
+
+namespace net_instaweb {
+
+class LogRecord;
+class MessageHandler;
+class Variable;
+
+// Abstract base class for encapsulating streaming, asynchronous HTTP fetches.
+//
+// If you want to fetch a resource, implement this interface, create an
+// instance and call UrlAsyncFetcher::Fetch() with it.
+//
+// It combines the 3 callbacks we expect to get from fetchers
+// (Write, Flush and Done) and adds a HeadersComplete indicator that is
+// useful in any place where we want to deal with and send headers before
+// Write or Done are called.
+//
+// Note that it automatically invokes HeadersComplete before the first call to
+// Write, Flush or Done.
+class AsyncFetch : public Writer {
+ public:
+  AsyncFetch();
+  explicit AsyncFetch(const RequestContextPtr& request_ctx);
+
+  virtual ~AsyncFetch();
+
+  // Called when ResponseHeaders have been set, but before writing contents.
+  // Contract: Must be called (at most once) before Write, Flush or Done.
+  // Automatically invoked (if necessary) before the first call to Write,
+  // Flush, or Done.  This interface is intended for callers (e.g. Fetchers).
+  // Implementors of the AsyncFetch interface must override
+  // HandleHeadersComplete.
+  void HeadersComplete();
+
+  // Fetch complete.  This interface is intended for callers
+  // (e.g. Fetchers).  Implementors must override HandleDone.
+  void Done(bool success);
+
+  // Data available.  This interface is intended for callers.  Implementors
+  // must override HandleWrite and HandleFlush.
+  virtual bool Write(const StringPiece& content, MessageHandler* handler);
+  virtual bool Flush(MessageHandler* handler);
+
+  // Is the cache entry corresponding to headers valid? Default is that it is
+  // valid. Sub-classes can provide specific implementations, e.g., based on
+  // cache invalidation timestamp in domain specific options.
+  // Used by CacheUrlAsyncFetcher.
+  // TODO(nikhilmadan): Consider making this pure virtual (it is already
+  // virtual) so that subclass authors are forced to look at this function.
+  virtual bool IsCachedResultValid(const ResponseHeaders& headers) {
+    return true;
+  }
+
+  // Returns a pointer to the request-headers, lazily constructing
+  // them if needed.  If they are constructed here (as opposed to
+  // being set with set_request_headers) then they will be owned by
+  // the class instance.
+  RequestHeaders* request_headers();
+
+  // Sets the request-headers to the specified pointer.  The caller must
+  // guarantee that the pointed-to headers remain valid as long as the
+  // AsyncFetch is running.
+  //
+  // Does not take ownership of headers.
+  void set_request_headers(RequestHeaders* headers);
+
+  // Same as above, but takes ownership.
+  void SetRequestHeadersTakingOwnership(RequestHeaders* headers);
+
+  // Returns the request_headers as a const pointer: it is required
+  // that the RequestHeaders be pre-initialized via non-const
+  // request_headers() or via set_request_headers before calling this.
+  const RequestHeaders* request_headers() const;
+
+  // See doc for request_headers and set_request_headers.
+  ResponseHeaders* response_headers();
+  void set_response_headers(ResponseHeaders* headers);
+
+  // Returns extra response headers which may be modified between
+  // calls to HeadersComplete() and Done().  This is used to allow
+  // a fetch to provide additional headers which cannot be determined
+  // when HeadersComplete() has been invoked, e.g., X-Original-Content-Length.
+  // This is needed because it is not safe for the producer to modify
+  // response_headers() once HeadersComplete() has been called.
+  ResponseHeaders* extra_response_headers();
+  void set_extra_response_headers(ResponseHeaders* headers);
+
+  virtual bool EnableThreaded() const { return false; }
+
+  // Indicates whether the request is a background fetch. These can be scheduled
+  // differently by the fetcher.
+  virtual bool IsBackgroundFetch() const { return false; }
+
+  // Resets the 'headers_complete_' flag.
+  // TODO(jmarantz): should this also clear the response headers?
+  virtual void Reset() { headers_complete_ = false; }
+
+  bool headers_complete() const { return headers_complete_; }
+
+  // Returns logging information in a string eg. c1:0;c2:2;hf:45;.
+  // c1 is cache 1, c2 is cache 2, hf is headers fetch.
+  GoogleString LoggingString();
+
+  // Returns the request context associated with this fetch, if any, or
+  // NULL if no request context exists.
+  virtual const RequestContextPtr& request_context() { return request_ctx_; }
+
+  // Returns a pointer to a log record that wraps this fetch's logging
+  // info.
+  virtual LogRecord* log_record();
+
+ protected:
+  virtual bool HandleWrite(const StringPiece& sp, MessageHandler* handler) = 0;
+  virtual bool HandleFlush(MessageHandler* handler) = 0;
+  virtual void HandleDone(bool success) = 0;
+  virtual void HandleHeadersComplete() = 0;
+
+ private:
+  RequestHeaders* request_headers_;
+  ResponseHeaders* response_headers_;
+  ResponseHeaders* extra_response_headers_;
+  RequestContextPtr request_ctx_;
+  bool owns_request_headers_;
+  bool owns_response_headers_;
+  bool owns_extra_response_headers_;
+  bool headers_complete_;
+
+  DISALLOW_COPY_AND_ASSIGN(AsyncFetch);
+};
+
+// Class to represent an Async fetch that collects the response-data into
+// a string, which can be accessed via buffer() and cleared via Reset().
+//
+// TODO(jmarantz): move StringAsyncFetch into its own file.
+class StringAsyncFetch : public AsyncFetch {
+ public:
+  // TODO(marq): Remove constructors lacking a request context.
+  StringAsyncFetch() : buffer_pointer_(&buffer_) { Init(); }
+
+  explicit StringAsyncFetch(const RequestContextPtr& request_ctx)
+      : AsyncFetch(request_ctx), buffer_pointer_(&buffer_) {
+    Init();
+  }
+  // The next two collect into a caller-supplied buffer instead of buffer_.
+  explicit StringAsyncFetch(GoogleString* buffer) : buffer_pointer_(buffer) {
+    Init();
+  }
+
+  StringAsyncFetch(const RequestContextPtr& request_ctx, GoogleString* buffer)
+      : AsyncFetch(request_ctx), buffer_pointer_(buffer) {
+    Init();
+  }
+
+  virtual ~StringAsyncFetch();
+  // Appends each chunk to the target buffer; always reports success.
+  virtual bool HandleWrite(const StringPiece& content,
+                           MessageHandler* handler) {
+    content.AppendToString(buffer_pointer_);
+    return true;
+  }
+  virtual bool HandleFlush(MessageHandler* handler) { return true; }
+  virtual void HandleHeadersComplete() {}
+  virtual void HandleDone(bool success) {
+    success_ = success;
+    done_ = true;
+  }
+
+  bool success() const { return success_; }
+  bool done() const { return done_; }
+  const GoogleString& buffer() const { return *buffer_pointer_; }
+  // Clears the buffer, all three header sets and the done/success flags.
+  virtual void Reset() {
+    done_ = false;
+    success_ = false;
+    buffer_pointer_->clear();
+    response_headers()->Clear();
+    extra_response_headers()->Clear();
+    request_headers()->Clear();
+    AsyncFetch::Reset();
+  }
+
+ protected:
+  // For subclasses that need to use complex logic to set success_ and done_.
+  // Most subclasses should not need these.
+  void set_success(bool success) { success_ = success; }
+  void set_done(bool done) { done_ = done; }
+
+ private:
+  void Init() {
+    success_ = false;
+    done_ = false;
+  }
+
+  GoogleString buffer_;
+  GoogleString* buffer_pointer_;
+  bool success_;
+  bool done_;
+
+  DISALLOW_COPY_AND_ASSIGN(StringAsyncFetch);
+};
+
+// Creates an AsyncFetch object using an existing Writer* object,
+// which is used to delegate Write and Flush operations.  This class is still
+// abstract: inheritors must implement HandleDone and HandleHeadersComplete.
+class AsyncFetchUsingWriter : public AsyncFetch {
+ public:
+  AsyncFetchUsingWriter(const RequestContextPtr& request_context,
+                        Writer* writer)
+      : AsyncFetch(request_context),
+        writer_(writer) {}
+  virtual ~AsyncFetchUsingWriter();
+
+ protected:
+  virtual bool HandleWrite(const StringPiece& sp, MessageHandler* handler);
+  virtual bool HandleFlush(MessageHandler* handler);
+
+ private:
+  Writer* writer_;
+  DISALLOW_COPY_AND_ASSIGN(AsyncFetchUsingWriter);
+};
+
+// Creates an AsyncFetch object using an existing AsyncFetch*,
+// sharing the response & request headers, and by default delegating
+// all 4 Handle methods to the base fetch.  Any one of them can
+// be overridden by inheritors of this class.
+class SharedAsyncFetch : public AsyncFetch {
+ public:
+  explicit SharedAsyncFetch(AsyncFetch* base_fetch);
+  virtual ~SharedAsyncFetch();
+
+  AsyncFetch* base_fetch() { return base_fetch_; }
+  const AsyncFetch* base_fetch() const { return base_fetch_; }
+
+  virtual const RequestContextPtr& request_context() {
+    return base_fetch_->request_context();
+  }
+
+ protected:
+  virtual void HandleDone(bool success) {
+    base_fetch_->Done(success);
+  }
+
+  virtual bool HandleWrite(const StringPiece& content,
+                           MessageHandler* handler) {
+    return base_fetch_->Write(content, handler);
+  }
+
+  virtual bool HandleFlush(MessageHandler* handler) {
+    return base_fetch_->Flush(handler);
+  }
+
+  virtual void HandleHeadersComplete() {
+    base_fetch_->HeadersComplete();
+  }
+  // NOTE(review): next three are public in AsyncFetch, protected here.
+  virtual bool EnableThreaded() const {
+    return base_fetch_->EnableThreaded();
+  }
+
+  virtual bool IsCachedResultValid(const ResponseHeaders& headers) {
+    return base_fetch_->IsCachedResultValid(headers);
+  }
+
+  virtual bool IsBackgroundFetch() const {
+    return base_fetch_->IsBackgroundFetch();
+  }
+
+ private:
+  AsyncFetch* base_fetch_;
+  DISALLOW_COPY_AND_ASSIGN(SharedAsyncFetch);
+};
+
+// Creates a SharedAsyncFetch object using an existing AsyncFetch and a fallback
+// value that is used in case the
fetched response is an error. Note that in
+// case the fetched response is an error and we have a non-empty fallback value,
+// we completely ignore the fetched response.
+// Also, note that this class gets deleted when HandleDone is called.
+class FallbackSharedAsyncFetch : public SharedAsyncFetch {
+ public:
+  // Warning header to be added if a stale response is served.
+  static const char kStaleWarningHeaderValue[];
+
+  FallbackSharedAsyncFetch(AsyncFetch* base_fetch, HTTPValue* fallback,
+                           MessageHandler* handler);
+  virtual ~FallbackSharedAsyncFetch();
+  // Optional counter; x may be NULL (see fallback_responses_served_).
+  void set_fallback_responses_served(Variable* x) {
+    fallback_responses_served_ = x;
+  }
+
+  bool serving_fallback() const { return serving_fallback_; }
+
+ protected:
+  virtual void HandleDone(bool success);
+  virtual bool HandleWrite(const StringPiece& content, MessageHandler* handler);
+  virtual bool HandleFlush(MessageHandler* handler);
+  virtual void HandleHeadersComplete();
+
+ private:
+  // Note that this is only used while serving the fallback response.
+  MessageHandler* handler_;
+  HTTPValue fallback_;
+  bool serving_fallback_;
+  Variable* fallback_responses_served_;  // may be NULL.
+
+  DISALLOW_COPY_AND_ASSIGN(FallbackSharedAsyncFetch);
+};
+
+// Creates a SharedAsyncFetch object using an existing AsyncFetch and a cached
+// value (that may be stale) that is used to conditionally check if the resource
+// at the origin has changed. If the resource hasn't changed and we get a 304,
+// we serve the cached response, thus avoiding the download of the entire
+// content.
+// Note that if you want the conditionally validated resource to be treated
+// as newly fetched with the original ttl, you should use this fetch such that
+// the fixing of date headers happens in the base fetch.
+// Also, note that this class gets deleted when HandleDone is called.
+class ConditionalSharedAsyncFetch : public SharedAsyncFetch {
+ public:
+  ConditionalSharedAsyncFetch(AsyncFetch* base_fetch, HTTPValue* cached_value,
+                              MessageHandler* handler);
+  virtual ~ConditionalSharedAsyncFetch();
+  // Optional counter; x may be NULL (see num_conditional_refreshes_).
+  void set_num_conditional_refreshes(Variable* x) {
+    num_conditional_refreshes_ = x;
+  }
+
+ protected:
+  virtual void HandleDone(bool success);
+  virtual bool HandleWrite(const StringPiece& content, MessageHandler* handler);
+  virtual bool HandleFlush(MessageHandler* handler);
+  virtual void HandleHeadersComplete();
+
+ private:
+  // Note that this is only used while serving the cached response.
+  MessageHandler* handler_;
+  HTTPValue cached_value_;
+  // Indicates that we received a 304 from the origin and are serving out the
+  // cached value.
+  bool serving_cached_value_;
+  // Indicates that we added conditional headers to the request.
+  bool added_conditional_headers_to_request_;
+
+  Variable* num_conditional_refreshes_;  // may be NULL.
+
+  DISALLOW_COPY_AND_ASSIGN(ConditionalSharedAsyncFetch);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_HTTP_PUBLIC_ASYNC_FETCH_H_
diff --git a/psol/include/net/instaweb/http/public/bot_checker.h b/psol/include/net/instaweb/http/public/bot_checker.h
new file mode 100755
index 000000000..96d62f6d6
--- /dev/null
+++ b/psol/include/net/instaweb/http/public/bot_checker.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +// Author: fangfei@google.com (Fangfei Zhou) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_BOT_CHECKER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_BOT_CHECKER_H_ + +#include "net/instaweb/util/public/string_util.h" +namespace net_instaweb { + +// BOTs names are case sensitive. +// We keep the exact BOT names in .gperf table. +class BotChecker { + public: + static bool Lookup(const StringPiece& user_agent); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_BOT_CHECKER_H_ diff --git a/psol/include/net/instaweb/http/public/cache_url_async_fetcher.h b/psol/include/net/instaweb/http/public/cache_url_async_fetcher.h new file mode 100644 index 000000000..22c4c1b1c --- /dev/null +++ b/psol/include/net/instaweb/http/public/cache_url_async_fetcher.h @@ -0,0 +1,137 @@ +/* + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: sligocki@google.com (Shawn Ligocki) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_CACHE_URL_ASYNC_FETCHER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_CACHE_URL_ASYNC_FETCHER_H_ + +#include "net/instaweb/http/public/url_async_fetcher.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { + +class AsyncFetch; +class Histogram; +class HTTPCache; +class MessageHandler; +class Variable; + +// Composes an asynchronous URL fetcher with an http cache, to +// generate an asynchronous caching URL fetcher. 
+// +// This fetcher will asynchronously check the cache. If the url +// is found in cache and is still valid, the fetch's callback will be +// called right away. Otherwise (if fetcher != NULL) an async fetch +// will be performed in the fetcher, the result of which will be written +// into the cache. In case the fetch fails and there is a stale response +// in the cache, we serve the stale response. +// +// If fetcher == NULL, this will only perform a cache lookup and then call +// the callback immediately. +// +// TODO(sligocki): In order to use this for fetching resources for rewriting +// we'd need to integrate resource locking in this class. Do we want that? +class CacheUrlAsyncFetcher : public UrlAsyncFetcher { + public: + CacheUrlAsyncFetcher(HTTPCache* cache, UrlAsyncFetcher* fetcher) + : http_cache_(cache), + fetcher_(fetcher), + backend_first_byte_latency_(NULL), + fallback_responses_served_(NULL), + num_conditional_refreshes_(NULL), + respect_vary_(false), + ignore_recent_fetch_failed_(false), + serve_stale_if_fetch_error_(false), + default_cache_html_(false) { + } + virtual ~CacheUrlAsyncFetcher(); + + virtual bool SupportsHttps() const { return fetcher_->SupportsHttps(); } // NOTE(review): dereferences fetcher_ unconditionally, yet the class comment allows fetcher == NULL -- confirm callers. + + virtual void Fetch(const GoogleString& url, + MessageHandler* message_handler, + AsyncFetch* base_fetch); + + HTTPCache* http_cache() const { return http_cache_; } + UrlAsyncFetcher* fetcher() const { return fetcher_; } + + void set_backend_first_byte_latency_histogram(Histogram* x) { + backend_first_byte_latency_ = x; + } + + Histogram* backend_first_byte_latency_histogram() const { + return backend_first_byte_latency_; + } + + void set_fallback_responses_served(Variable* x) { + fallback_responses_served_ = x; + } + + Variable* fallback_responses_served() const { + return fallback_responses_served_; + } + + void set_num_conditional_refreshes(Variable* x) { + num_conditional_refreshes_ = x; + } + + Variable* num_conditional_refreshes() const { + return num_conditional_refreshes_;
+ } + + void set_respect_vary(bool x) { respect_vary_ = x; } + bool respect_vary() const { return respect_vary_; } + + void set_ignore_recent_fetch_failed(bool x) { + ignore_recent_fetch_failed_ = x; + } + bool ignore_recent_fetch_failed() const { + return ignore_recent_fetch_failed_; + } + + void set_serve_stale_if_fetch_error(bool x) { + serve_stale_if_fetch_error_ = x; + } + + bool serve_stale_if_fetch_error() const { + return serve_stale_if_fetch_error_; + } + + void set_default_cache_html(bool x) { default_cache_html_ = x; } + bool default_cache_html() const { return default_cache_html_; } + + private: + // Not owned by CacheUrlAsyncFetcher. + HTTPCache* http_cache_; + UrlAsyncFetcher* fetcher_; // may be NULL per the class comment (cache-only lookups). + + Histogram* backend_first_byte_latency_; // may be NULL. + Variable* fallback_responses_served_; // may be NULL. + Variable* num_conditional_refreshes_; // may be NULL. + + bool respect_vary_; + bool ignore_recent_fetch_failed_; + bool serve_stale_if_fetch_error_; + bool default_cache_html_; + + DISALLOW_COPY_AND_ASSIGN(CacheUrlAsyncFetcher); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_CACHE_URL_ASYNC_FETCHER_H_ diff --git a/psol/include/net/instaweb/http/public/content_type.h b/psol/include/net/instaweb/http/public/content_type.h new file mode 100644 index 000000000..27ecb3b0a --- /dev/null +++ b/psol/include/net/instaweb/http/public/content_type.h @@ -0,0 +1,132 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: sligocki@google.com (Shawn Ligocki) +// +// A collection of content-types and their attributes. + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_CONTENT_TYPE_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_CONTENT_TYPE_H_ + +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +struct ContentType { + public: + // The MIME types we process. + enum Type { + kHtml, + kXhtml, + kCeHtml, // See http://en.wikipedia.org/wiki/CE-HTML + kJavascript, + kCss, + kText, + kXml, + kPng, + kGif, + kJpeg, + kSwf, + kWebp, + kJson, + kPdf, + kVideo, + kOctetStream, // Binary resources. + kOther, // Used to specify a new local ContentType in one test file. + }; + + // Returns the maximum extension length of any resource types our filters + // can create. Does not count the ".". + // See RewriteDriver::CreateOutputResourceWithPath() + static int MaxProducedExtensionLength(); + + const char* mime_type() const { return mime_type_; } + // TODO(sligocki): Stop returning '.' in file_extension(). + const char* file_extension() const { return file_extension_; } + Type type() const { return type_; } + + // Return true iff this content type is HTML, or XHTML, or some other such + // thing (e.g. CE-HTML) that we can rewrite. + bool IsHtmlLike() const; + + // Return true iff this content type is XML of some kind (either XHTML or + // some other XML). + bool IsXmlLike() const; + + // Return true iff this content type is Flash. + bool IsFlash() const; + + // Return true iff this content type is Image. + bool IsImage() const; + + // Return true iff this content type is Video. + bool IsVideo() const; + + // These fields should be private; we leave them public only so we can use + // struct literals in content_type.cc. Other code should use the above + // accessor methods instead of accessing these fields directly.
+ const char* mime_type_; + const char* file_extension_; // includes ".", e.g. ".ext" + Type type_; +}; + +// HTML-like (i.e. rewritable) text: +extern const ContentType& kContentTypeHtml; +extern const ContentType& kContentTypeXhtml; +extern const ContentType& kContentTypeCeHtml; +// Other text: +extern const ContentType& kContentTypeJavascript; +extern const ContentType& kContentTypeCss; +extern const ContentType& kContentTypeText; +extern const ContentType& kContentTypeXml; +extern const ContentType& kContentTypeJson; +// Images: +extern const ContentType& kContentTypePng; +extern const ContentType& kContentTypeGif; +extern const ContentType& kContentTypeJpeg; +extern const ContentType& kContentTypeSwf; +extern const ContentType& kContentTypeWebp; +// Pdf: +extern const ContentType& kContentTypePdf; + +// Binary/octet-stream. +extern const ContentType& kBinaryOctetStream; + +// Given a name (file or url), see if it has the canonical extension +// corresponding to a particular content type. +const ContentType* NameExtensionToContentType(const StringPiece& name); +const ContentType* MimeTypeToContentType(const StringPiece& mime_type); + +// Extracts mime_type and charset from a string of the form +// "<mime_type>; charset=<charset>". +// If mime_type or charset is not specified, they will be populated +// with the empty string. +// Returns true if either a mime_type or a charset was extracted. +bool ParseContentType(const StringPiece& content_type_str, + GoogleString* mime_type, + GoogleString* charset); + +// Splits comma-separated string to elements and tries to match each one with +// a recognized content type. The out set will be cleared first and must be +// present.
+void MimeTypeListToContentTypeSet( + const GoogleString& in, + std::set<const ContentType*>* out); + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_CONTENT_TYPE_H_ diff --git a/psol/include/net/instaweb/http/public/counting_url_async_fetcher.h b/psol/include/net/instaweb/http/public/counting_url_async_fetcher.h new file mode 100644 index 000000000..592c4fbda --- /dev/null +++ b/psol/include/net/instaweb/http/public/counting_url_async_fetcher.h @@ -0,0 +1,69 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) +// +// Wraps an asynchronous fetcher, but keeps track of success/failure count.
+ +#ifndef NET_INSTAWEB_HTTP_PUBLIC_COUNTING_URL_ASYNC_FETCHER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_COUNTING_URL_ASYNC_FETCHER_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/http/public/url_async_fetcher.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { + +class AsyncFetch; +class MessageHandler; + +class CountingUrlAsyncFetcher : public UrlAsyncFetcher { + public: + explicit CountingUrlAsyncFetcher(UrlAsyncFetcher* fetcher) + : fetcher_(fetcher) { + Clear(); // The counters have no other initializer here; presumably Clear() zeroes them -- confirm in the .cc. + } + virtual ~CountingUrlAsyncFetcher(); + + void set_fetcher(UrlAsyncFetcher* fetcher) { fetcher_ = fetcher; } + + virtual bool SupportsHttps() const { return fetcher_->SupportsHttps(); } // Delegates to the wrapped fetcher. + + virtual void Fetch(const GoogleString& url, + MessageHandler* message_handler, + AsyncFetch* fetch); + + int fetch_count() const { return fetch_count_; } + int byte_count() const { return byte_count_; } + int failure_count() const { return failure_count_; } + + void Clear(); + + class CountingFetch; + friend class CountingFetch; + + private: + UrlAsyncFetcher* fetcher_; // Wrapped fetcher; ownership is not visible in this header. + int fetch_count_; + int byte_count_; + int failure_count_; + + DISALLOW_COPY_AND_ASSIGN(CountingUrlAsyncFetcher); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_COUNTING_URL_ASYNC_FETCHER_H_ diff --git a/psol/include/net/instaweb/http/public/external_url_fetcher.h b/psol/include/net/instaweb/http/public/external_url_fetcher.h new file mode 100644 index 000000000..f05b68165 --- /dev/null +++ b/psol/include/net/instaweb/http/public/external_url_fetcher.h @@ -0,0 +1,90 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Authors: jmarantz@google.com (Joshua Marantz) +// vchudnov@google.com (Victor Chudnovsky) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_EXTERNAL_URL_FETCHER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_EXTERNAL_URL_FETCHER_H_ + +#include "net/instaweb/http/public/request_context.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +#include "net/instaweb/http/public/url_fetcher.h" + +namespace net_instaweb { + +class MessageHandler; +class RequestHeaders; +class ResponseHeaders; +class Writer; + +// Runs an external command ('wget' by default, or 'curl') via popen +// for blocking URL fetches. + +// TODO(vchudnov): Incorporate NetcatUrlFetcher functionality into +// this class. +class ExternalUrlFetcher : public UrlFetcher { + public: + ExternalUrlFetcher() {} + virtual ~ExternalUrlFetcher() {} + + // TODO(sligocki): Allow protocol version number (e.g. HTTP/1.1) + // and request type (e.g. GET, POST, etc.) to be specified. + virtual bool StreamingFetchUrl(const GoogleString& url, + const RequestHeaders& request_headers, + ResponseHeaders* response_headers, + Writer* writer, + MessageHandler* message_handler, + const RequestContextPtr& request_context); + + + // Default user agent to use. + static const char kDefaultUserAgent[]; + + // Sets the path of the external binary used for fetching. + void set_binary(const GoogleString& binary); + + + protected: + // Appends to escaped_headers one header line for each Name, Value + // pair in request_headers.
+ virtual void AppendHeaders(const RequestHeaders& request_headers, + StringVector* escaped_headers); + + GoogleString binary_; // Presumably the path assigned by set_binary() -- confirm in the .cc. + + private: + virtual const char* GetFetchLabel() = 0; + + // Returns the external command to run in order to fetch a URL. The + // URL and the vector of header lines must be already escaped in + // escaped_url and escaped_headers, respectively. In addition to the + // specified headers, the User-Agent is also explicitly set to the + // value of user_agent, unless the latter is NULL. + virtual GoogleString ConstructFetchCommand( + const GoogleString& escaped_url, + const char* user_agent, + const StringVector& escaped_headers) = 0; + + DISALLOW_COPY_AND_ASSIGN(ExternalUrlFetcher); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_EXTERNAL_URL_FETCHER_H_ diff --git a/psol/include/net/instaweb/http/public/fake_url_async_fetcher.h b/psol/include/net/instaweb/http/public/fake_url_async_fetcher.h new file mode 100644 index 000000000..8a35748d3 --- /dev/null +++ b/psol/include/net/instaweb/http/public/fake_url_async_fetcher.h @@ -0,0 +1,70 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) +// +// UrlFetcher is an interface for asynchronously fetching urls. The +// caller must supply a callback to be called when the fetch is complete.
+ +#ifndef NET_INSTAWEB_HTTP_PUBLIC_FAKE_URL_ASYNC_FETCHER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_FAKE_URL_ASYNC_FETCHER_H_ + +#include "net/instaweb/http/public/url_pollable_async_fetcher.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { + +class AsyncFetch; +class MessageHandler; +class UrlFetcher; + +// Constructs an async fetcher using a synchronous fetcher, blocking +// on a fetch and then calling the 'done' callback directly. It's also +// possible to construct a real async interface using a synchronous +// fetcher in a thread, but this does not do that: it blocks. +// +// This is intended for functional regression tests only. +class FakeUrlAsyncFetcher : public UrlPollableAsyncFetcher { + public: + explicit FakeUrlAsyncFetcher(UrlFetcher* url_fetcher) + : url_fetcher_(url_fetcher), + fetcher_supports_https_(true) { + } + virtual ~FakeUrlAsyncFetcher(); + + virtual bool SupportsHttps() const { return fetcher_supports_https_; } + + virtual void Fetch(const GoogleString& url, + MessageHandler* message_handler, + AsyncFetch* fetch); + + // Since the underlying fetcher is blocking, there can never be + // any outstanding fetches. + virtual int Poll(int64 max_wait_ms) { return 0; } + + void set_fetcher_supports_https(bool val) { fetcher_supports_https_ = val; } + + private: + UrlFetcher* url_fetcher_; + bool fetcher_supports_https_; + + DISALLOW_COPY_AND_ASSIGN(FakeUrlAsyncFetcher); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_FAKE_URL_ASYNC_FETCHER_H_ diff --git a/psol/include/net/instaweb/http/public/fetcher_test.h b/psol/include/net/instaweb/http/public/fetcher_test.h new file mode 100644 index 000000000..604659dac --- /dev/null +++ b/psol/include/net/instaweb/http/public/fetcher_test.h @@ -0,0 +1,190 @@ +/* + * Copyright 2010 Google Inc.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) + +// Unit-test framework for wget fetcher + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_FETCHER_TEST_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_FETCHER_TEST_H_ + +#include <utility> // for pair +#include <vector> + +#include "base/logging.h" +#include "net/instaweb/http/public/async_fetch.h" +#include "net/instaweb/http/public/request_context.h" +#include "net/instaweb/http/public/url_async_fetcher.h" +#include "net/instaweb/http/public/url_fetcher.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/google_message_handler.h" +#include "net/instaweb/util/public/gtest.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" +#include "net/instaweb/util/public/thread_system.h" + +namespace net_instaweb { + +class MessageHandler; +class RequestHeaders; +class ResponseHeaders; +class SimpleStats; +class Writer; + +class FetcherTest : public testing::Test { + protected: + static const char kStartDate[]; + static const char kHtmlContent[]; + static const char kGoodUrl[]; + static const char kNotCachedUrl[]; + static const char kBadUrl[]; + static const char kHeaderName[]; + static const char kHeaderValue[]; + static const char kErrorMessage[]; + + FetcherTest(); + + static void SetUpTestCase(); + static void TearDownTestCase(); + + // Helpful classes for testing.
+ + // This mock fetcher will only fetch kGoodUrl, returning kHtmlContent. + // If you ask for any other URL it will fail. + class MockFetcher : public UrlFetcher { + public: + MockFetcher() : num_fetches_(0) {} + + virtual bool StreamingFetchUrl(const GoogleString& url, + const RequestHeaders& request_headers, + ResponseHeaders* response_headers, + Writer* response_writer, + MessageHandler* message_handler, + const RequestContextPtr& request_context); + + int num_fetches() const { return num_fetches_; } + + private: + bool Populate(const char* cache_control, ResponseHeaders* response_headers, + Writer* writer, MessageHandler* message_handler); + + int num_fetches_; + + DISALLOW_COPY_AND_ASSIGN(MockFetcher); + }; + + // This is a pseudo-asynchronous interface to MockFetcher. It performs + // fetches instantly, but defers calling the callback until the user + // calls CallCallbacks(). Then it will execute the deferred callbacks. + class MockAsyncFetcher : public UrlAsyncFetcher { + public: + explicit MockAsyncFetcher(UrlFetcher* url_fetcher) + : url_fetcher_(url_fetcher) {} + + virtual void Fetch(const GoogleString& url, + MessageHandler* handler, + AsyncFetch* fetch); + + void CallCallbacks(); + + private: + UrlFetcher* url_fetcher_; + std::vector<std::pair<bool, AsyncFetch*> > deferred_callbacks_; + + DISALLOW_COPY_AND_ASSIGN(MockAsyncFetcher); + }; + + // Callback that just checks correct Done status and keeps track of whether + // it has been called yet or not.
+ class CheckCallback : public StringAsyncFetch { + public: + CheckCallback(const RequestContextPtr& ctx, bool expect_success, + bool* callback_called) + : StringAsyncFetch(ctx), + expect_success_(expect_success), + callback_called_(callback_called) { + } + + virtual void HandleDone(bool success) { + *callback_called_ = true; + CHECK_EQ(expect_success_, success); + ValidateMockFetcherResponse(success, true, buffer(), *response_headers()); + delete this; + } + + bool expect_success_; + bool* callback_called_; + + private: + DISALLOW_COPY_AND_ASSIGN(CheckCallback); + }; + + static void ValidateMockFetcherResponse( + bool success, bool check_error_message, const GoogleString& content, + const ResponseHeaders& response_headers); + + // Do a URL fetch, and return the number of times the mock fetcher + // had to be run to perform the fetch. + // Note: You must override sync_fetcher() to return the correct fetcher. + int CountFetchesSync(const StringPiece& url, bool expect_success, + bool check_error_message); + // Use an explicit fetcher (you don't need to override sync_fetcher()). + int CountFetchesSync(const StringPiece& url, UrlFetcher* fetcher, + bool expect_success, bool check_error_message); + + // Initiate an async URL fetch, and return the number of times the mock + // fetcher had to be run to perform the fetch. + // Note: You must override async_fetcher() to return the correct fetcher. + int CountFetchesAsync(const StringPiece& url, bool expect_success, + bool* callback_called); + + // Override these to allow CountFetchesSync or Async respectively. + // These are not abstract (= 0) because they only need to be overridden by + // classes which want to use CountFetchesSync/Async without specifying the + // fetcher in each call.
+ virtual UrlFetcher* sync_fetcher() { + LOG(FATAL) << "sync_fetcher() must be overridden before use."; + return NULL; + }; + virtual UrlAsyncFetcher* async_fetcher() { + LOG(FATAL) << "async_fetcher() must be overridden before use."; + return NULL; + }; + + GoogleString TestFilename() { + return (GTestSrcDir() + + "/net/instaweb/http/testdata/google.http"); + } + + // This validation code is hard-coded to the http request capture in + // testdata/google.http. + void ValidateOutput(const GoogleString& content, + const ResponseHeaders& response_headers); + + GoogleMessageHandler message_handler_; + MockFetcher mock_fetcher_; + MockAsyncFetcher mock_async_fetcher_; + scoped_ptr<ThreadSystem> thread_system_; + static SimpleStats* statistics_; + + private: + DISALLOW_COPY_AND_ASSIGN(FetcherTest); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_FETCHER_TEST_H_ diff --git a/psol/include/net/instaweb/http/public/headers.h b/psol/include/net/instaweb/http/public/headers.h new file mode 100644 index 000000000..cc891e5da --- /dev/null +++ b/psol/include/net/instaweb/http/public/headers.h @@ -0,0 +1,163 @@ +// Copyright 2011 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_HEADERS_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_HEADERS_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/proto_util.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class MessageHandler; +class NameValue; +class StringMultiMapInsensitive; +class Writer; + +// Read/write API for HTTP headers (shared base class) +template<class Proto> class Headers { + public: + Headers(); + virtual ~Headers(); + + virtual void Clear(); + + int major_version() const; + bool has_major_version() const; + int minor_version() const; + void set_major_version(int major_version); + void set_minor_version(int minor_version); + + // Raw access for random access to attribute name/value pairs. + int NumAttributes() const; + const GoogleString& Name(int i) const; + const GoogleString& Value(int i) const; + + // Lookup attributes with provided name. Attribute values are stored in + // values. Returns true iff there were any attributes with provided name. + // + // Note that Lookup, though declared const, is NOT thread-safe. This + // is because it lazily generates a map. + // TODO(jmarantz): this is a problem waiting to happen, but I believe it + // will not be a problem in the immediate future. We can refactor our way + // around this problem by moving the Map to an explicit separate class that + // can be instantiated to assist with Lookups and Remove. But that should + // be done in a separate CL from the one I'm typing into now. + bool Lookup(const StringPiece& name, ConstStringStarVector* values) const; + + // Looks up a single attribute value. Returns NULL if the attribute is + // not found, or if more than one attribute is found. + const char* Lookup1(const StringPiece& name) const; + + // Does there exist a header with given name.
+ bool Has(const StringPiece& name) const; + + // Is value one of the values in Lookup(name)? + bool HasValue(const StringPiece& name, const StringPiece& value) const; + + // NumAttributeNames is also const but not thread-safe. + int NumAttributeNames() const; + + // Remove all instances of cookie_name in all the cookie headers. + // Empty cookie headers will be removed. + // It might be better for performance if this function is called after + // checking that the cookie is present. + // CAVEAT: Double quoted values are not necessarily treated as one token. + // Please refer to the test cases in headers_cookie_util_test.cc for more + // details. + void RemoveCookie(const StringPiece& cookie_name); + + // Adds a new header, even if a header with the 'name' exists already. + virtual void Add(const StringPiece& name, const StringPiece& value); + + // Remove headers by name and value. Return true if anything was removed. + // Note: If the original headers were: + // attr: val1 + // attr: val2 + // attr: val3 + // and you Remove(attr, val2), your new headers will be: + // attr: val1, val3 (if attr is a comma-separated field) + // and - + // attr: val1 + // attr: val3 (otherwise). + virtual bool Remove(const StringPiece& name, const StringPiece& value); + + // Removes all headers by name. Return true if anything was removed. + virtual bool RemoveAll(const StringPiece& name); + + // Removes all headers whose name is in |names|. + // Return true if anything was removed. + virtual bool RemoveAllFromSet(const StringSetInsensitive& names); + + // Removes all headers whose name is in |names|. + static void RemoveFromHeaders(const StringSetInsensitive& names, + protobuf::RepeatedPtrField<NameValue>* headers); + + // Removes all headers whose name starts with prefix. + virtual void RemoveAllWithPrefix(const StringPiece& prefix); + + // Similar to RemoveAll followed by Add. Note that the attribute + // order may be changed as a side effect of this operation.
+ virtual void Replace(const StringPiece& name, const StringPiece& value); + + // Merge headers. Replaces all headers specified both here and in + // other with the version in other. Useful for updating headers + // when receiving 304 Not Modified responses. + // Note: This is order-scrambling. + virtual void UpdateFrom(const Headers<Proto>& other); + + // Serialize HTTP header to a binary stream. + virtual bool WriteAsBinary(Writer* writer, MessageHandler* message_handler); + + // Read HTTP header from a binary string. + virtual bool ReadFromBinary(const StringPiece& buf, MessageHandler* handler); + + // Serialize HTTP headers in HTTP format so it can be re-parsed + virtual bool WriteAsHttp(Writer* writer, MessageHandler* handler) const; + + protected: + void PopulateMap() const; // const is a lie, mutates map_. + + // We have two representations for the name/value pairs. The + // HttpResponseHeader protobuf contains a simple string-pair vector, but + // lacks a fast associative lookup. So we will build structures for + // associative lookup lazily, and keep them up-to-date if they are + // present. + mutable scoped_ptr<StringMultiMapInsensitive> map_; + scoped_ptr<Proto> proto_; + + private: + bool IsCommaSeparatedField(const StringPiece& name) const; + + // If name is a comma-separated field (above), then split value at commas, + // and add name, val for each of the comma-separated values + // (removing whitespace and commas). + // Otherwise, add the name, value pair to the map_. + // const is a lie + // NOTE: the map will contain the comma-split values, but the protobuf + // will contain the original pairs including comma-separated values.
+ void AddToMap(const StringPiece& name, const StringPiece& value) const; + + + DISALLOW_COPY_AND_ASSIGN(Headers); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_HEADERS_H_ diff --git a/psol/include/net/instaweb/http/public/http_cache.h b/psol/include/net/instaweb/http/public/http_cache.h new file mode 100644 index 000000000..b4d66e246 --- /dev/null +++ b/psol/include/net/instaweb/http/public/http_cache.h @@ -0,0 +1,339 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_HTTP_CACHE_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_HTTP_CACHE_H_ + +#include "base/logging.h" +#include "net/instaweb/http/public/http_value.h" +#include "net/instaweb/http/public/meta_data.h" +#include "net/instaweb/http/public/request_context.h" +#include "net/instaweb/http/public/response_headers.h" +#include "net/instaweb/util/public/atomic_bool.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string_util.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { + +class CacheInterface; +class Hasher; +class LogRecord; +class MessageHandler; +class RequestHeaders; +class Statistics; +class Timer; +class Variable; + +// Implements HTTP caching semantics, including cache expiration and +// retention of the originally served cache headers.
+class HTTPCache { + public: + // Names of statistics variables: exported for tests. + static const char kCacheTimeUs[]; + static const char kCacheHits[]; + static const char kCacheMisses[]; + static const char kCacheFallbacks[]; + static const char kCacheExpirations[]; + static const char kCacheInserts[]; + static const char kCacheDeletes[]; + + // The prefix used for Etags. + static const char kEtagPrefix[]; + + // Format that is used while generating Etags. + static const char kEtagFormat[]; + + // Does not take ownership of any inputs. + HTTPCache(CacheInterface* cache, Timer* timer, Hasher* hasher, + Statistics* stats); + virtual ~HTTPCache(); + + // When a lookup is done in the HTTP Cache, it returns one of these values. + enum FindResult { + kFound, + kNotFound, + // Helps avoid frequent refetching of resources which have error status + // codes or are not cacheable. + kRecentFetchFailed, + kRecentFetchNotCacheable, + }; + + virtual void set_hasher(Hasher* hasher) { hasher_ = hasher; } + + // Class to handle an asynchronous cache lookup response. + // + // TODO(jmarantz): consider inheriting from AsyncFetch with an implementation + // of Write/Flush/HeadersComplete -- we'd have to make Done take true/false so + // this would impact callers. + class Callback { + public: + explicit Callback(const RequestContextPtr& request_ctx) + : response_headers_(NULL), + owns_response_headers_(false), + request_ctx_(request_ctx) { + } + virtual ~Callback(); + virtual void Done(FindResult find_result) = 0; + // A method that allows client Callbacks to apply invalidation checks. We + // first (in http_cache.cc) check whether the entry is expired using normal + // http semantics, and if it is not expired, then this check is called -- + // thus callbacks can apply any further invalidation semantics it wants on + // otherwise valid entries. But there's no way for a callback to override + // when the HTTP semantics say the entry is expired.
+ // + // See also OptionsAwareHTTPCacheCallback in rewrite_driver.h for an + // implementation you probably want to use. + virtual bool IsCacheValid(const GoogleString& key, + const ResponseHeaders& headers) = 0; + + // A method that allows client Callbacks to check if the response in cache + // is fresh enough, in addition to it being valid. This is used while + // freshening resources to check that the response in cache is not only + // valid, but is also not going to expire anytime soon. + // Note that if the response in cache is valid but not fresh, the HTTPCache + // calls Callback::Done with find_result = kNotFound and fills in + // fallback_http_value() with the cached response. + virtual bool IsFresh(const ResponseHeaders& headers) { return true; } + + // Overrides the cache ttl of the cached response with the given value. Note + // that this has no effect if the returned value is negative or less than + // the cache ttl of the stored value. + virtual int64 OverrideCacheTtlMs(const GoogleString& key) { return -1; } + + // TODO(jmarantz): specify the dataflow between http_value and + // response_headers.
+ HTTPValue* http_value() { return &http_value_; } + ResponseHeaders* response_headers() { + if (response_headers_ == NULL) { + response_headers_ = new ResponseHeaders; + owns_response_headers_ = true; + } + return response_headers_; + } + const ResponseHeaders* response_headers() const { + return const_cast<Callback*>(this)->response_headers(); + } + void set_response_headers(ResponseHeaders* headers) { + DCHECK(!owns_response_headers_); + if (owns_response_headers_) { + delete response_headers_; + } + response_headers_ = headers; + owns_response_headers_ = false; + } + HTTPValue* fallback_http_value() { return &fallback_http_value_; } + + LogRecord* log_record(); + const RequestContextPtr& request_context() { return request_ctx_; } + + virtual void SetTimingMs(int64 timing_value_ms); + + private: + HTTPValue http_value_; + // Stale value that can be used in case a fetch fails. Note that Find() + // may fill in a stale value here but it will still return kNotFound. + HTTPValue fallback_http_value_; + ResponseHeaders* response_headers_; + bool owns_response_headers_; + RequestContextPtr request_ctx_; + + DISALLOW_COPY_AND_ASSIGN(Callback); + }; + + // Makes the cache ignore put requests that do not record successes. + virtual void SetIgnoreFailurePuts(); + + // Non-blocking Find. Calls callback when done. 'handler' must all + // stay valid until callback->Done() is called. + virtual void Find(const GoogleString& key, MessageHandler* handler, + Callback* callback); + + // Note that Put takes a non-const pointer for HTTPValue so it can + // bump the reference count. + virtual void Put(const GoogleString& key, HTTPValue* value, + MessageHandler* handler); + + // Note that Put takes a non-const pointer for ResponseHeaders* so it + // can update the caching fields prior to storing. + // If you call this method, you must be certain that the outgoing + // request was not sent with Authorization:.
+ virtual void Put(const GoogleString& key, ResponseHeaders* headers, + const StringPiece& content, MessageHandler* handler); + + // Deletes an element in the cache. + virtual void Delete(const GoogleString& key); + + virtual void set_force_caching(bool force) { force_caching_ = force; } + bool force_caching() const { return force_caching_; } + virtual void set_disable_html_caching_on_https(bool x) { + disable_html_caching_on_https_ = x; + } + Timer* timer() const { return timer_; } + + // Tell the HTTP Cache to remember that a particular key is not cacheable + // because the URL was marked with Cache-Control 'nocache' or Cache-Control + // 'private'. We would like to avoid DOSing the origin server or spinning our + // own wheels trying to re-fetch this resource. + // The not-cacheable setting will be 'remembered' for + // remember_not_cacheable_ttl_seconds_. + // Note that we remember whether the response was originally a "200 OK" so + // that we can check if the cache TTL can be overridden. + virtual void RememberNotCacheable(const GoogleString& key, + bool is_200_status_code, + MessageHandler* handler); + + // Tell the HTTP Cache to remember that a particular key is not cacheable + // because fetching the associated URL failed. + // + // The not-cacheable setting will be 'remembered' for + // remember_fetch_failed_ttl_seconds_. + virtual void RememberFetchFailed(const GoogleString& key, + MessageHandler* handler); + + // Tell the HTTP Cache to remember that we had to give up on doing a + // background fetch due to load. This will remember it for + // remember_fetch_dropped_ttl_seconds_. + virtual void RememberFetchDropped(const GoogleString& key, + MessageHandler* handler); + + // Indicates if the response is within the cacheable size limit. Clients of + // HTTPCache must check if they will be eventually able to cache their entries + // before buffering them in memory. If the content length header is not found + // then consider it as cacheable.
This could be a chunked response. + bool IsCacheableContentLength(ResponseHeaders* headers) const; + // Indicates if the response body is within the cacheable size limit. If the + // response headers do not have content length header, then the clients of + // HTTPCache must check if the received response body is of cacheable size + // before buffering them in memory. + bool IsCacheableBodySize(int64 body_size) const; + + // Initialize statistics variables for the cache + static void InitStats(Statistics* statistics); + + // Returns true if the resource is already at the point of expiration + // (or not cacheable by us), and would never be used if inserted into the + // cache. Otherwise, returns false. If the entry was rejected because of + // expiration but would otherwise have been cacheable, this also increments + // the cache expirations statistic. + // + // request_headers is used to check for resources requested with + // authorization. It is OK to pass NULL if you're certain that the fetch + // was done without authorization headers. 
+ bool IsAlreadyExpired(const RequestHeaders* request_headers, + const ResponseHeaders& headers); + + Variable* cache_time_us() { return cache_time_us_; } + Variable* cache_hits() { return cache_hits_; } + Variable* cache_misses() { return cache_misses_; } + Variable* cache_fallbacks() { return cache_fallbacks_; } + Variable* cache_expirations() { return cache_expirations_; } + Variable* cache_inserts() { return cache_inserts_; } + Variable* cache_deletes() { return cache_deletes_; } + + int64 remember_not_cacheable_ttl_seconds() { + return remember_not_cacheable_ttl_seconds_; + } + + virtual void set_remember_not_cacheable_ttl_seconds(int64 value) { + DCHECK_LE(0, value); + if (value >= 0) { + remember_not_cacheable_ttl_seconds_ = value; + } + } + + int64 remember_fetch_failed_ttl_seconds() { + return remember_fetch_failed_ttl_seconds_; + } + + virtual void set_remember_fetch_failed_ttl_seconds(int64 value) { + DCHECK_LE(0, value); + if (value >= 0) { + remember_fetch_failed_ttl_seconds_ = value; + } + } + + int64 remember_fetch_dropped_ttl_seconds() { + return remember_fetch_dropped_ttl_seconds_; + } + + virtual void set_remember_fetch_dropped_ttl_seconds(int64 value) { + DCHECK_LE(0, value); + if (value >= 0) { + remember_fetch_dropped_ttl_seconds_ = value; + } + } + + int max_cacheable_response_content_length() { + return max_cacheable_response_content_length_; + } + + virtual void set_max_cacheable_response_content_length(int64 value); + + virtual const char* Name() const { return name_.c_str(); } + + protected: + virtual void PutInternal(const GoogleString& key, int64 start_us, + HTTPValue* value); + + private: + friend class HTTPCacheCallback; + friend class WriteThroughHTTPCache; + + bool IsCurrentlyValid(const RequestHeaders* request_headers, + const ResponseHeaders& headers, int64 now_ms); + + bool MayCacheUrl(const GoogleString& url, const ResponseHeaders& headers); + // Requires either content or value to be non-NULL. + // Applies changes to headers. 
If the headers are actually changed or if value + // is NULL then it builds and returns a new HTTPValue. If content is NULL + // then content is extracted from value. + HTTPValue* ApplyHeaderChangesForPut( + const GoogleString& key, int64 start_us, const StringPiece* content, + ResponseHeaders* headers, HTTPValue* value, MessageHandler* handler); + void UpdateStats(FindResult result, bool has_fallback, int64 delta_us); + void RememberFetchFailedorNotCacheableHelper( + const GoogleString& key, MessageHandler* handler, HttpStatus::Code code, + int64 ttl_sec); + + CacheInterface* cache_; // Owned by the caller. + Timer* timer_; + Hasher* hasher_; + bool force_caching_; + // Whether to disable caching of HTML content fetched via https. + bool disable_html_caching_on_https_; + Variable* cache_time_us_; + Variable* cache_hits_; + Variable* cache_misses_; + Variable* cache_fallbacks_; + Variable* cache_expirations_; + Variable* cache_inserts_; + Variable* cache_deletes_; + GoogleString name_; + int64 remember_not_cacheable_ttl_seconds_; + int64 remember_fetch_failed_ttl_seconds_; + int64 remember_fetch_dropped_ttl_seconds_; + int64 max_cacheable_response_content_length_; + AtomicBool ignore_failure_puts_; + + DISALLOW_COPY_AND_ASSIGN(HTTPCache); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_HTTP_CACHE_H_ diff --git a/psol/include/net/instaweb/http/public/http_dump_url_async_writer.h b/psol/include/net/instaweb/http/public/http_dump_url_async_writer.h new file mode 100644 index 000000000..7cdd504b0 --- /dev/null +++ b/psol/include/net/instaweb/http/public/http_dump_url_async_writer.h @@ -0,0 +1,81 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: sligocki@google.com (Shawn Ligocki) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_HTTP_DUMP_URL_ASYNC_WRITER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_HTTP_DUMP_URL_ASYNC_WRITER_H_ + +#include "net/instaweb/http/public/url_async_fetcher.h" +#include "net/instaweb/http/public/http_dump_url_fetcher.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class AsyncFetch; +class FileSystem; +class MessageHandler; +class Timer; + +// HttpDumpUrlAsyncWriter checks whether the HTTP dump is available on the +// filesystem. If not, it fetches it from another fetcher (e.g. one that +// uses the network) and writes it to the filesystem so that HttpDumpUrlFetcher +// can find it. +class HttpDumpUrlAsyncWriter : public UrlAsyncFetcher { + public: + HttpDumpUrlAsyncWriter(const StringPiece& root_dir, + UrlAsyncFetcher* base_fetcher, + FileSystem* file_system, + Timer* timer) + : dump_fetcher_(root_dir, file_system, timer), + base_fetcher_(base_fetcher), + file_system_(file_system), + accept_gzip_(true) { + root_dir.CopyToString(&root_dir_); + } + virtual ~HttpDumpUrlAsyncWriter(); + + virtual bool SupportsHttps() const { return base_fetcher_->SupportsHttps(); } + + // This is a synchronous/blocking implementation. + virtual void Fetch(const GoogleString& url, + MessageHandler* handler, + AsyncFetch* base_fetch); + + // Controls whether we will request and save gzipped content to the + // file system.
Note that http_dump_url_fetcher will inflate on + // read if its caller does not want gzipped output. + void set_accept_gzip(bool x) { accept_gzip_ = x; } + + private: + // Helper class to manage individual fetches. + class DumpFetch; + + HttpDumpUrlFetcher dump_fetcher_; + // Used to fetch urls that aren't in the dump yet. + UrlAsyncFetcher* base_fetcher_; + GoogleString root_dir_; // Root directory of the HTTP dumps. + FileSystem* file_system_; + bool accept_gzip_; + + DISALLOW_COPY_AND_ASSIGN(HttpDumpUrlAsyncWriter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_HTTP_DUMP_URL_ASYNC_WRITER_H_ diff --git a/psol/include/net/instaweb/http/public/http_dump_url_fetcher.h b/psol/include/net/instaweb/http/public/http_dump_url_fetcher.h new file mode 100644 index 000000000..83e673f8a --- /dev/null +++ b/psol/include/net/instaweb/http/public/http_dump_url_fetcher.h @@ -0,0 +1,110 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +// Author: sligocki@google.com (Shawn Ligocki) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_HTTP_DUMP_URL_FETCHER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_HTTP_DUMP_URL_FETCHER_H_ + +#include "net/instaweb/http/public/request_context.h" +#include "net/instaweb/http/public/url_fetcher.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/file_system.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class GoogleUrl; +class MessageHandler; +class RequestHeaders; +class ResponseHeaders; +class Timer; +class Writer; + +// TODO(sligocki): Can we forward declare these somehow? +// class FileSystem; +// class FileSystem::InputFile; + +// HttpDumpUrlFetcher fetches raw HTTP dumps from the filesystem. +// These dumps could be compressed or chunked; the fetcher does not +// decompress or de-chunk them. +class HttpDumpUrlFetcher : public UrlFetcher { + public: + // When the slurped data is gzipped, but request headers are made + // that don't include 'gzip' in an Accept-Encoding header, then + // this fetcher inflates the gzipped output as it streams. It + // also captures the original gzipped size in this attribute in + // the response headers. + static const char kGzipContentLengthAttribute[]; + + HttpDumpUrlFetcher(const StringPiece& root_dir, FileSystem* file_system, + Timer* timer); + virtual ~HttpDumpUrlFetcher(); + + // Converts URL into filename the way that Latency Lab does. + // Note: root_dir must be standardized to have a / at end already. + static bool GetFilenameFromUrl(const StringPiece& root_dir, + const GoogleUrl& url, + GoogleString* filename, + MessageHandler* message_handler); + + // Non-static version that uses the fetcher's root dir.
+ bool GetFilename(const GoogleUrl& url, + GoogleString* filename, + MessageHandler* message_handler) { + return GetFilenameFromUrl(root_dir_, url, filename, message_handler); + } + + // This is a synchronous/blocking implementation. + virtual bool StreamingFetchUrl(const GoogleString& url, + const RequestHeaders& request_headers, + ResponseHeaders* response_headers, + Writer* fetched_content_writer, + MessageHandler* message_handler, + const RequestContextPtr& request_context); + + // Parse file into response_headers and response_writer as if it were bytes + // off the wire. + bool ParseFile(FileSystem::InputFile* file, + ResponseHeaders* response_headers, + Writer* response_writer, + MessageHandler* handler); + + // Helper function to return a generic error response. + void RespondError(ResponseHeaders* response_headers, Writer* response_writer, + MessageHandler* handler); + + // Print URLs each time they are fetched. + void set_print_urls(bool on); + + private: + GoogleString root_dir_; // Root directory of the HTTP dumps. + FileSystem* file_system_; + Timer* timer_; + + // Response to use if something goes wrong. + GoogleString error_body_; + + scoped_ptr urls_; + + DISALLOW_COPY_AND_ASSIGN(HttpDumpUrlFetcher); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_HTTP_DUMP_URL_FETCHER_H_ diff --git a/psol/include/net/instaweb/http/public/http_dump_url_writer.h b/psol/include/net/instaweb/http/public/http_dump_url_writer.h new file mode 100644 index 000000000..b4e0b5913 --- /dev/null +++ b/psol/include/net/instaweb/http/public/http_dump_url_writer.h @@ -0,0 +1,82 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: sligocki@google.com (Shawn Ligocki) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_HTTP_DUMP_URL_WRITER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_HTTP_DUMP_URL_WRITER_H_ + +#include "net/instaweb/http/public/http_dump_url_fetcher.h" +#include "net/instaweb/http/public/request_context.h" +#include "net/instaweb/http/public/url_fetcher.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class FileSystem; +class MessageHandler; +class RequestHeaders; +class ResponseHeaders; +class Timer; +class Writer; + +// HttpDumpUrlWriter checks to see whether the HTTP dump is available on the +// filesystem. If not, it fetches it from another fetcher (e.g. one that +// uses the network) and writes it to the filesystem so that HttpDumpUrlFetcher +// can find it. +class HttpDumpUrlWriter : public UrlFetcher { + public: + HttpDumpUrlWriter(const StringPiece& root_dir, UrlFetcher* base_fetcher, + FileSystem* file_system, Timer* timer) + : dump_fetcher_(root_dir, file_system, timer), + base_fetcher_(base_fetcher), + file_system_(file_system), + accept_gzip_(true) { + root_dir.CopyToString(&root_dir_); + } + virtual ~HttpDumpUrlWriter(); + + // This is a synchronous/blocking implementation.
+ virtual bool StreamingFetchUrl(const GoogleString& url, + const RequestHeaders& request_headers, + ResponseHeaders* response_headers, + Writer* response_writer, + MessageHandler* message_handler, + const RequestContextPtr& request_context); + + // Controls whether we will request and save gzipped content to the + // file system. Note that http_dump_url_fetcher will inflate on + // read if its caller does not want gzipped output. + void set_accept_gzip(bool x) { accept_gzip_ = x; } + + // Print URLs each time they are fetched. + void set_print_urls(bool on) { dump_fetcher_.set_print_urls(on); } + + private: + HttpDumpUrlFetcher dump_fetcher_; + UrlFetcher* base_fetcher_; // Used to fetch urls that aren't in the dump yet. + GoogleString root_dir_; // Root directory of the HTTP dumps. + FileSystem* file_system_; + bool accept_gzip_; + + DISALLOW_COPY_AND_ASSIGN(HttpDumpUrlWriter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_HTTP_DUMP_URL_WRITER_H_ diff --git a/psol/include/net/instaweb/http/public/http_response_parser.h b/psol/include/net/instaweb/http/public/http_response_parser.h new file mode 100644 index 000000000..17768760b --- /dev/null +++ b/psol/include/net/instaweb/http/public/http_response_parser.h @@ -0,0 +1,76 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +// Author: sligocki@google.com (Shawn Ligocki) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_HTTP_RESPONSE_PARSER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_HTTP_RESPONSE_PARSER_H_ + +#include // for FILE +#include "net/instaweb/util/public/basictypes.h" +// TODO(sligocki): Find a way to forward declare FileSystem::InputFile. +#include "net/instaweb/http/public/response_headers_parser.h" +#include "net/instaweb/util/public/file_system.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class MessageHandler; +class ResponseHeaders; +class Writer; + +// Helper class to facilitate parsing a raw streaming HTTP response including +// headers and body. +class HttpResponseParser { + public: + HttpResponseParser(ResponseHeaders* response_headers, Writer* writer, + MessageHandler* handler) + : reading_headers_(true), + ok_(true), + response_headers_(response_headers), + writer_(writer), + handler_(handler), + parser_(response_headers) { + } + + // Parse complete HTTP response from a file. + bool ParseFile(FileSystem::InputFile* file); + + // Parse complete HTTP response from a FILE stream. + // TODO(sligocki): We need a Readable abstraction (like Writer) + bool Parse(FILE* stream); + + // Read a chunk of HTTP response, populating response_headers and call + // writer on output body, returning true if the status is ok. 
+ bool ParseChunk(const StringPiece& data); + + bool ok() const { return ok_; } + bool headers_complete() const { return parser_.headers_complete(); } + + private: + bool reading_headers_; + bool ok_; + ResponseHeaders* response_headers_; + Writer* writer_; + MessageHandler* handler_; + ResponseHeadersParser parser_; + + DISALLOW_COPY_AND_ASSIGN(HttpResponseParser); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_HTTP_RESPONSE_PARSER_H_ diff --git a/psol/include/net/instaweb/http/public/http_value.h b/psol/include/net/instaweb/http/public/http_value.h new file mode 100644 index 000000000..e15c9237e --- /dev/null +++ b/psol/include/net/instaweb/http/public/http_value.h @@ -0,0 +1,120 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_HTTP_VALUE_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_HTTP_VALUE_H_ + +#include // for size_t +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/shared_string.h" +#include "net/instaweb/util/public/string_util.h" +#include "net/instaweb/util/public/writer.h" + +namespace net_instaweb { + +class ResponseHeaders; +class MessageHandler; + +// Provides shared, ref-counted, copy-on-write storage for HTTP +// contents, to aid sharing between active fetches and filters, and +// the cache, which from which data may be evicted at any time. 
+class HTTPValue : public Writer { + public: + HTTPValue() : contents_size_(0) {} + + // Clears the value (both headers and content). + void Clear(); + + // Returns true if this HTTPValue is empty. + bool Empty() const { return storage_.empty(); } + + // Sets the HTTP headers for this value. This method may only + // be called once and must be called either before or after all of the + // contents are set (using the streaming interface Write). + // + // If Clear() is called, then SetHeaders() can be called once again. + // + // A non-const pointer is required for the response headers so that + // the cache fields can be updated if necessary. + void SetHeaders(ResponseHeaders* headers); + + // Writes contents into the HTTPValue object. Write can be called + // multiple times to append more data, and can be called before + // or after SetHeaders. However, SetHeaders cannot be interleaved + // in between calls to Write. + virtual bool Write(const StringPiece& str, MessageHandler* handler); + virtual bool Flush(MessageHandler* handler); + + // Retrieves the headers, returning false if empty. + bool ExtractHeaders(ResponseHeaders* headers, MessageHandler* handler) const; + + // Retrieves the contents, returning false if empty. Note that the + // contents are only guaranteed valid as long as the HTTPValue + // object is in scope. + bool ExtractContents(StringPiece* str) const; + + // Tests whether this reference is the only active one to the string object. + bool unique() const { return storage_.unique(); } + + // Assigns the storage of an HTTPValue based on the provided storage. This + // can be used for a cache Get. Returns false if the string is not + // well-formed. + // + // Extracts the headers into the provided ResponseHeaders buffer. + bool Link(SharedString* src, ResponseHeaders* headers, + MessageHandler* handler); + + // Links two HTTPValues together, using the contents of 'src' and discarding + // the contents of this.
+ void Link(HTTPValue* src) { + if (src != this) { + storage_ = src->storage_; // SharedString links via assignment. + contents_size_ = src->contents_size(); + } + } + + // Access the shared string, for insertion into a cache via Put. + SharedString* share() { return &storage_; } + + size_t size() const { return storage_.size(); } + int64 contents_size() { return contents_size_; } + + private: + friend class HTTPValueTest; + + // Must be called with storage_ non-empty. + char type_identifier() const { return *storage_.data(); } + + unsigned int SizeOfFirstChunk() const; + void SetSizeOfFirstChunk(unsigned int size); + int64 ComputeContentsSize() const; + + // Disconnects this HTTPValue from other HTTPValues that may share the + // underlying storage, allowing a new buffer. + void CopyOnWrite(); + + SharedString storage_; + // Member variable to keep the size of body in storage. + int64 contents_size_; + + DISALLOW_COPY_AND_ASSIGN(HTTPValue); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_HTTP_VALUE_H_ diff --git a/psol/include/net/instaweb/http/public/http_value_writer.h b/psol/include/net/instaweb/http/public/http_value_writer.h new file mode 100644 index 000000000..5f949e807 --- /dev/null +++ b/psol/include/net/instaweb/http/public/http_value_writer.h @@ -0,0 +1,64 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +// Author: mmohabey@google.com (Megha Mohabey) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_HTTP_VALUE_WRITER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_HTTP_VALUE_WRITER_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class HTTPCache; +class HTTPValue; +class MessageHandler; +class ResponseHeaders; + +// Wrapper for buffering an HTTPValue. HTTPValueWriter ensures that an +// HTTPValue which cannot eventually be cached is not buffered. +class HTTPValueWriter { + public: + HTTPValueWriter(HTTPValue* value, HTTPCache* cache) + : value_(value), + cache_(cache), + has_buffered_(true) {} + + void SetHeaders(ResponseHeaders* headers); + + bool Write(const StringPiece& str, MessageHandler* handler); + + bool has_buffered() const { return has_buffered_; } + + // Checks if the http_value should be buffered or not depending on whether we + // can eventually cache it. It also clears the http_value if it cannot be + // buffered. Note that this only checks the size constraints, not cache + // headers. + bool CheckCanCacheElseClear(ResponseHeaders* headers); + + // Checks if we can write the string to the HTTPValue without going over + // limits. + bool CanCacheContent(const StringPiece& str) const; + + private: + HTTPValue* value_; + HTTPCache* cache_; + bool has_buffered_; + DISALLOW_COPY_AND_ASSIGN(HTTPValueWriter); +}; + +} // namespace net_instaweb +#endif // NET_INSTAWEB_HTTP_PUBLIC_HTTP_VALUE_WRITER_H_ diff --git a/psol/include/net/instaweb/http/public/inflating_fetch.h b/psol/include/net/instaweb/http/public/inflating_fetch.h new file mode 100644 index 000000000..829331741 --- /dev/null +++ b/psol/include/net/instaweb/http/public/inflating_fetch.h @@ -0,0 +1,97 @@ +/* + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_INFLATING_FETCH_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_INFLATING_FETCH_H_ + +#include "net/instaweb/http/public/async_fetch.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/gzip_inflater.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class MessageHandler; + +// This Fetch layer helps work with origin servers that serve gzipped +// content even when request-headers do not include +// accept-encoding:gzip. In that scenario, this class inflates the +// content and strips the content-encoding:gzip response header. +// +// Some servers will serve gzipped content even to clients that didn't +// ask for it. Depending on the serving environment, we may also want +// to ask backend servers for gzipped content even if we want cleartext +// to be sent to the Write methods. Users of this class can force this +// by calling EnableGzipFromBackend. +class InflatingFetch : public SharedAsyncFetch { + public: + explicit InflatingFetch(AsyncFetch* fetch); + virtual ~InflatingFetch(); + + // Use this one cautiously, since it may cause resources to be corrupted + // if you use it with anything other than the IPRO path. + void set_inflation_content_type_blacklist( + const std::set& bypass_set) { + inflation_content_type_blacklist_ = bypass_set; + } + + // Adds accept-encoding:gzip to the request headers sent to the + // origin.
The data is inflated as we Write it. If deflate + // or gzip was already in the request then this has no effect. + void EnableGzipFromBackend(); + + protected: + // If inflation is required, inflates and passes bytes to the linked fetch, + // otherwise just passes bytes. + virtual bool HandleWrite(const StringPiece& sp, MessageHandler* handler); + + // Analyzes headers and depending on the request settings and flags will + // either set up the inflater or not. + virtual void HandleHeadersComplete(); + virtual void HandleDone(bool success); + virtual void Reset(); + + private: + void InitInflater(GzipInflater::InflateType, const StringPiece& value); + + // If this returns true, it means that we should not inflate incoming data and + // should pass it to the caller as is, since that is what the caller requested. + bool IsCompressionAllowedInRequest(); + + scoped_ptr<GzipInflater> inflater_; + + // Caching gate inside IsCompressionAllowedInRequest(). + bool request_checked_for_accept_encoding_; + + // Will be set to true if accepted encoding included gzip and/or deflate. + bool compression_desired_; + + // Whether any kind of error happened to the inflater. Once set to true, never + // gets reset. + bool inflate_failure_; + + // Set of content types that will not be inflated. + std::set inflation_content_type_blacklist_; + + DISALLOW_COPY_AND_ASSIGN(InflatingFetch); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_INFLATING_FETCH_H_ diff --git a/psol/include/net/instaweb/http/public/log_record.h b/psol/include/net/instaweb/http/public/log_record.h new file mode 100644 index 000000000..55dc77a6e --- /dev/null +++ b/psol/include/net/instaweb/http/public/log_record.h @@ -0,0 +1,148 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: marq@google.com (Mark Cogan) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_LOG_RECORD_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_LOG_RECORD_H_ + +#include "net/instaweb/http/public/logging_proto.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/gtest_prod.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +// If your .cc file needs to use the types declared in logging_proto.h, +// you must also include net/instaweb/http/public/logging_proto_impl.h +// See that header file for an explanation of why this is necessary. + + +namespace net_instaweb { + +class AbstractMutex; + +// This class is a wrapper around a protobuf used to collect logging +// information. It also provides a simple aggregation mechanism for +// collecting the ids of applied rewriters. +// +// Care and feeding of log records: +// (1) All logging must be done through log records. No class should +// have static members of any logging proto class. Log records +// can either create the logging protos, or will take ownership of them. +// (2) All access and manipulation of log data must be guarded by the log +// record's mutex. Commonly repeated logging operations should be factored +// into functions in this class (and be so guarded therein). +// (3) In most cases, log records should be created and owned by request +// contexts. 
+ +// Subclasses may wrap some other type of protobuf; they must still provide +// access to a LogRecord instance, however. +class LogRecord { + public: + // Construct a LogRecord with a new LoggingInfo proto and caller- + // supplied mutex. This class takes ownership of the mutex. + explicit LogRecord(AbstractMutex* mutex); + virtual ~LogRecord(); + + // Log a rewriter (identified by an id string) as having been applied to + // the request being logged. These ids will be aggregated and written to the + // protobuf when Finalize() is called. + void LogAppliedRewriter(const char* rewriter_id); + + // This should be called when all logging activity on the log record is + // complete. If a subclass of this class uses other aggregate data structures + // or other intermediates before writing to the wrapped data structure, + // it should do those writes in FinalizeImpl. mutex_ guards this. + void Finalize(); + + // Return the LoggingInfo proto wrapped by this class. Calling code must + // guard any reads and writes to this using mutex(). + virtual LoggingInfo* logging_info(); + + // Mutex-guarded log mutation convenience methods. The rule of thumb is that + // if a single-field update to a logging proto occurs multiple times, it + // should be factored out into a method on this class. + void SetBlinkRequestFlow(int flow); + void SetIsOriginalResourceCacheable(bool cacheable); + void SetTimingRequestStartMs(int64 ms); + void SetTimingFetchMs(int64 ms); + + // Mutex-guarded log-writing operations. Derived classes should override + // *Impl methods. Returns false if the log write attempt failed. + bool WriteLog(); + // Update the log record with Blink-specific information, then write the + // log as if WriteLog() was called. + bool WriteLogForBlink(const GoogleString& user_agent); + + // If log-writing needs to occur in the context of an existing lock, + // these methods may be used. Returns false if write attempt failed. 
+ bool WriteLogWhileLocked(); + bool WriteLogForBlinkWhileLocked(const GoogleString& user_agent); + + // Return the mutex associated with this instance. Calling code should + // guard reads and writes of LogRecords + AbstractMutex* mutex() { return mutex_.get(); } + + protected: + // Non-initializing default constructor for subclasses. Subclasses that invoke + // this constructor should implement and call their own initializer that + // instantiates the wrapped logging proto and calls set_mutex with a valid + // Mutex object. + LogRecord(); + + void set_mutex(AbstractMutex* m); + + // Returns a comma-joined string concatenating the contents of + // applied_rewriters_ + GoogleString ConcatenatedRewriterString(); + + // Implementation methods for subclasses to override. + // Implements logging an applied rewriter. + virtual void LogAppliedRewriterImpl(const char* rewriter_id); + // Implements finalization. + virtual void FinalizeImpl(); + // Implements writing a log, base implementation is a no-op. Returns false if + // writing failed. + virtual bool WriteLogImpl() { return true; } + // Implements writing the Blink log, base implementation is a no-op. Returns + // false if writing failed. + virtual bool WriteLogForBlinkImpl(const GoogleString& user_agent) { + return true; + } + + // True if Finalize() has been called. mutex_ guards this. + bool finalized() { return finalized_; } + FRIEND_TEST(LogRecordTest, NoAppliedRewriters); + + private: + // Called on construction. + void InitLogging(); + + StringSet applied_rewriters_; + + scoped_ptr logging_info_; + bool finalized_; + // This must be set. Implementation constructors must minimally default this + // to a NullMutex. 
+ scoped_ptr mutex_; + + DISALLOW_COPY_AND_ASSIGN(LogRecord); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_LOG_RECORD_H_ diff --git a/psol/include/net/instaweb/http/public/logging_proto.h b/psol/include/net/instaweb/http/public/logging_proto.h new file mode 100644 index 000000000..b8b096b22 --- /dev/null +++ b/psol/include/net/instaweb/http/public/logging_proto.h @@ -0,0 +1,31 @@ +// Copyright 2012 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Mark Cogan (marq@google.com) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_LOGGING_PROTO_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_LOGGING_PROTO_H_ + + +namespace net_instaweb { + +class BlinkInfo; +class LoggingInfo; +class MetadataCacheInfo; +class TimingInfo; + +} + + +#endif // NET_INSTAWEB_HTTP_PUBLIC_LOGGING_PROTO_H_ diff --git a/psol/include/net/instaweb/http/public/logging_proto_impl.h b/psol/include/net/instaweb/http/public/logging_proto_impl.h new file mode 100644 index 000000000..dbff004c0 --- /dev/null +++ b/psol/include/net/instaweb/http/public/logging_proto_impl.h @@ -0,0 +1,32 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: Mark Cogan (marq@google.com) + +// This header should be included (in addition to log_record.h) whenever +// code needs access to the implementations of the logging_info protobuf +// This file needs to be separate from log_record.h because both apache's +// httpd.h and any pb.h #define incompatible |OK| macros. + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_LOGGING_PROTO_IMPL_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_LOGGING_PROTO_IMPL_H_ + + +#include "net/instaweb/http/logging.pb.h" + + + +#endif // NET_INSTAWEB_HTTP_PUBLIC_LOGGING_PROTO_IMPL_H_ diff --git a/psol/include/net/instaweb/http/public/meta_data.h b/psol/include/net/instaweb/http/public/meta_data.h new file mode 100644 index 000000000..bd0bcfe92 --- /dev/null +++ b/psol/include/net/instaweb/http/public/meta_data.h @@ -0,0 +1,168 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) +// +// Meta-data associated with a rewriting resource. 
This is +// primarily a key-value store, but additionally we want to +// get easy access to the cache expiration time. + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_META_DATA_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_META_DATA_H_ + +namespace net_instaweb { + +// Global constants for common HTTP attribute names and values. +// +// TODO(jmarantz): Proactively change all the occurrences of the static strings +// to use these shared constants. +struct HttpAttributes { + static const char kAcceptEncoding[]; + static const char kAuthorization[]; + static const char kCacheControl[]; + static const char kConnection[]; + static const char kContentEncoding[]; + static const char kContentLanguage[]; + static const char kContentLength[]; + static const char kContentType[]; + static const char kCookie[]; + static const char kCookie2[]; + static const char kDate[]; + static const char kDeflate[]; + static const char kDnt[]; + static const char kEtag[]; + static const char kExpires[]; + static const char kGzip[]; + static const char kHost[]; + static const char kIfModifiedSince[]; + static const char kIfNoneMatch[]; + static const char kLastModified[]; + static const char kLocation[]; + static const char kNoCache[]; + static const char kPragma[]; + static const char kProxyAuthorization[]; + static const char kReferer[]; // sic + static const char kServer[]; + static const char kSetCookie[]; + static const char kSetCookie2[]; + static const char kTransferEncoding[]; + static const char kUserAgent[]; + static const char kVary[]; + static const char kWarning[]; + static const char kXmlHttpRequest[]; + static const char kXAssociatedContent[]; + static const char kXForwardedFor[]; + static const char kXForwardedProto[]; + static const char kXGooglePagespeedClientId[]; + static const char kXGoogleRequestEventId[]; + // If this header's value matches the configured blocking rewrite key, then + // all rewrites are completed before the response is sent to the client. 
+ static const char kXPsaBlockingRewrite[]; + + // If this header is present on an incoming request it will be treated as if + // it came over a SPDY connection for purposes of applying special + // configuration or optimizations. + static const char kXPsaOptimizeForSpdy[]; + + // This header is set on optional fetches that got dropped due to load. + static const char kXPsaLoadShed[]; + static const char kXRequestedWith[]; + + // This header is set on optimized responses to indicate the original + // content length. + static const char kXOriginalContentLength[]; + static const char kXUACompatible[]; +}; + +namespace HttpStatus { +// Http status codes. +// Grokked from http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html +enum Code { + kContinue = 100, + kSwitchingProtocols = 101, + + kOK = 200, + kCreated = 201, + kAccepted = 202, + kNonAuthoritative = 203, + kNoContent = 204, + kResetContent = 205, + kPartialContent = 206, + + kMultipleChoices = 300, + kMovedPermanently = 301, + kFound = 302, + kSeeOther = 303, + kNotModified = 304, + kUseProxy = 305, + kSwitchProxy = 306, // In old spec; no longer used. + kTemporaryRedirect = 307, + + kBadRequest = 400, + kUnauthorized = 401, + kPaymentRequired = 402, + kForbidden = 403, + kNotFound = 404, + kMethodNotAllowed = 405, + kNotAcceptable = 406, + kProxyAuthRequired = 407, + kRequestTimeout = 408, + kConflict = 409, + kGone = 410, + kLengthRequired = 411, + kPreconditionFailed = 412, + kEntityTooLarge = 413, + kUriTooLong = 414, + kUnsupportedMediaType = 415, + kRangeNotSatisfiable = 416, + kExpectationFailed = 417, + kImATeapot = 418, + + kInternalServerError = 500, + kNotImplemented = 501, + kBadGateway = 502, + kUnavailable = 503, + kGatewayTimeout = 504, + kHttpVersionNotSupported = 505, + + // Instaweb-specific proxy failure constants. 
+ kProxyPublisherFailure = 520, + kProxyFailure = 521, + kProxyConfigurationFailure = 522, + kProxyDeclinedRequest = 523, + + // Instaweb-specific response codes: these are intentionally chosen to be + // outside the normal HTTP range, but we consider these response codes + // to be 'cacheable' in our own cache. + kRememberFetchFailedStatusCode = 10001, + // Note that this includes all non-200 status code responses that are not + // cacheable. + kRememberNotCacheableStatusCode = 10002, + // This includes all 200 status code responses that are not cacheable. + kRememberNotCacheableAnd200StatusCode = 10003, + // Status code used when the actual status code of the response is unknown at + // the time of ProxyFetchPropertyCallbackCollector::Detach(). + kUnknownStatusCode = 10004, +}; + +// Transform a status code into the equivalent reason phrase. +const char* GetReasonPhrase(Code rc); + +} // namespace HttpStatus + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_META_DATA_H_ diff --git a/psol/include/net/instaweb/http/public/mock_callback.h b/psol/include/net/instaweb/http/public/mock_callback.h new file mode 100644 index 000000000..00bb631a2 --- /dev/null +++ b/psol/include/net/instaweb/http/public/mock_callback.h @@ -0,0 +1,61 @@ +/* + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: sligocki@google.com (Shawn Ligocki) + +// Callbacks used for testing. 
+ +#ifndef NET_INSTAWEB_HTTP_PUBLIC_MOCK_CALLBACK_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_MOCK_CALLBACK_H_ + +#include "net/instaweb/http/public/async_fetch.h" +#include "net/instaweb/http/public/request_context.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/gtest.h" + +namespace net_instaweb { + +// Callback that can be used for testing resource fetches which makes sure +// that Done() is called exactly once and with the expected success value. +// Can be used multiple times by calling Reset in between. +class ExpectStringAsyncFetch : public StringAsyncFetch { + public: + explicit ExpectStringAsyncFetch(bool expect_success) + : expect_success_(expect_success) {} + ExpectStringAsyncFetch(bool expect_success, + const RequestContextPtr& request_context) + : StringAsyncFetch(request_context), expect_success_(expect_success) {} + virtual ~ExpectStringAsyncFetch() { + EXPECT_TRUE(done()); + } + + virtual void HandleDone(bool success) { + EXPECT_FALSE(done()) << "Already Done; perhaps you reused without Reset()"; + StringAsyncFetch::HandleDone(success); + EXPECT_EQ(expect_success_, success); + } + + void set_expect_success(bool x) { expect_success_ = x; } + + private: + bool expect_success_; + + DISALLOW_COPY_AND_ASSIGN(ExpectStringAsyncFetch); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_MOCK_CALLBACK_H_ diff --git a/psol/include/net/instaweb/http/public/mock_url_fetcher.h b/psol/include/net/instaweb/http/public/mock_url_fetcher.h new file mode 100644 index 000000000..776f96f9f --- /dev/null +++ b/psol/include/net/instaweb/http/public/mock_url_fetcher.h @@ -0,0 +1,168 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: sligocki@google.com (Shawn Ligocki) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_MOCK_URL_FETCHER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_MOCK_URL_FETCHER_H_ + +#include +#include "net/instaweb/http/public/request_context.h" +#include "net/instaweb/http/public/response_headers.h" +#include "net/instaweb/http/public/url_fetcher.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class MessageHandler; +class RequestHeaders; +class Timer; +class Writer; + +// Simple UrlFetcher meant for tests, you can set responses for individual URLs. +// Meant only for testing. +class MockUrlFetcher : public UrlFetcher { + public: + MockUrlFetcher() : enabled_(true), fail_on_unexpected_(true), + update_date_headers_(false), omit_empty_writes_(false), + fail_after_headers_(false), verify_host_header_(false), + split_writes_(false), timer_(NULL) {} + virtual ~MockUrlFetcher(); + + void SetResponse(const StringPiece& url, + const ResponseHeaders& response_header, + const StringPiece& response_body); + + // Adds a new response-header attribute name/value pair to an existing + // response. If the response does not already exist, the method check-fails. 
+ void AddToResponse(const StringPiece& url, + const StringPiece& name, + const StringPiece& value); + + // Set a conditional response which will either respond with the supplied + // response_headers and response_body or a simple 304 Not Modified depending + // upon last_modified_time and conditional GET "If-Modified-Since" headers. + void SetConditionalResponse(const StringPiece& url, + int64 last_modified_date, + const GoogleString& etag, + const ResponseHeaders& response_header, + const StringPiece& response_body); + + // Fetching unset URLs will cause EXPECT failures as well as return false. + virtual bool StreamingFetchUrl(const GoogleString& url, + const RequestHeaders& request_headers, + ResponseHeaders* response_headers, + Writer* response_writer, + MessageHandler* message_handler, + const RequestContextPtr& request_context); + + // Indicates that the specified URL should respond with headers and data, + // but still return a 'false' status. This is similar to a live fetcher + // that times out or disconnects while streaming data. + // + // This differs from set_fail_after_headers in that it's specific to a + // URL, and writes the body first before returning failure. + void SetResponseFailure(const StringPiece& url); + + // Clear all set responses. + void Clear(); + + // Remove a single response. Will be a no-op if no response was set for url. + void RemoveResponse(const StringPiece& url); + + // When disabled, fetcher will fail (but not crash) for all requests. + // Use to simulate temporarily not having access to resources, for example. + void Disable() { enabled_ = false; } + void Enable() { enabled_ = true; } + + // Set to false if you don't want the fetcher to EXPECT fail on unfound URL. + // Useful in MockUrlFetcher unittest :) + void set_fail_on_unexpected(bool x) { fail_on_unexpected_ = x; } + + // Update response header's Date using supplied timer. + // Note: Must set_timer(). 
+ void set_update_date_headers(bool x) { update_date_headers_ = x; } + + // If set to true (defaults to false) the fetcher will not emit writes of + // length 0. + void set_omit_empty_writes(bool x) { omit_empty_writes_ = x; } + + // If set to true (defaults to false) the fetcher will fail after outputting + // the headers. See also SetResponseFailure which fails after writing + // the body. + void set_fail_after_headers(bool x) { fail_after_headers_ = x; } + + // If set to true (defaults to false) the fetcher will verify that the Host: + // header is present, and matches the host/port of the requested URL. + void set_verify_host_header(bool x) { verify_host_header_ = x; } + + void set_timer(Timer* timer) { timer_ = timer; } + + // If true then each time the fetcher writes it will split the write in half + // and write each half separately. This is needed to test that Ajax's + // RecordingFetch caches writes properly and recovers from failure. + void set_split_writes(bool val) { split_writes_ = val; } + + private: + class HttpResponse { + public: + HttpResponse(int64 last_modified_time, const GoogleString& etag, + const ResponseHeaders& in_header, const StringPiece& in_body) + : last_modified_time_(last_modified_time), + etag_(etag), + body_(in_body.data(), in_body.size()), + success_(true) { + header_.CopyFrom(in_header); + } + + const int64 last_modified_time() const { return last_modified_time_; } + const GoogleString& etag() const { return etag_; } + const ResponseHeaders& header() const { return header_; } + ResponseHeaders* mutable_header() { return &header_; } + const GoogleString& body() const { return body_; } + void set_success(bool success) { success_ = success; } + bool success() const { return success_; } + + private: + int64 last_modified_time_; + GoogleString etag_; + ResponseHeaders header_; + GoogleString body_; + bool success_; + + DISALLOW_COPY_AND_ASSIGN(HttpResponse); + }; + typedef std::map ResponseMap; + + ResponseMap response_map_; + + bool 
enabled_; + bool fail_on_unexpected_; // Should we EXPECT if unexpected url called? + bool update_date_headers_; // Should we update Date headers from timer? + bool omit_empty_writes_; // Should we call ->Write with length 0? + bool fail_after_headers_; // Should we call Done(false) after headers? + bool verify_host_header_; // Should we verify the Host: header? + bool split_writes_; // Should we turn one write into multiple? + Timer* timer_; // Timer to use for updating header dates. + + DISALLOW_COPY_AND_ASSIGN(MockUrlFetcher); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_MOCK_URL_FETCHER_H_ diff --git a/psol/include/net/instaweb/http/public/rate_controller.h b/psol/include/net/instaweb/http/public/rate_controller.h new file mode 100644 index 000000000..a66a7b8d4 --- /dev/null +++ b/psol/include/net/instaweb/http/public/rate_controller.h @@ -0,0 +1,111 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: nikhilmadan@google.com (Nikhil Madan) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_RATE_CONTROLLER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_RATE_CONTROLLER_H_ + +#include + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/ref_counted_ptr.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { + +class AbstractMutex; +class AsyncFetch; +class MessageHandler; +class Statistics; +class ThreadSystem; +class TimedVariable; +class UrlAsyncFetcher; +class Variable; + +// Controller which limits the number of outgoing fetches per domain. If the +// fetch is for a user-facing request, this sends the request out anyway and +// updates the count for number of outgoing fetches. +// For non-user facing requests, this checks that the number of outgoing fetches +// for this domain is less than the limit. If less than the limit, it sends +// the fetch out and updates the count. If greater than the per-domain limit, +// and if the global queue size is within the limit, it queues the request up. +// However, if the global queue size is above the limit, it drops the request. +// If a request is dropped, the response will have HttpAttributes::kXPsaLoadShed +// set on the response headers. +// +// Note: this requires working statistics to work. +class RateController { + public: + static const char kQueuedFetchCount[]; + static const char kDroppedFetchCount[]; + static const char kCurrentGlobalFetchQueueSize[]; + + RateController(int max_global_queue_size, + int per_host_outgoing_request_threshold, + int per_host_queued_request_threshold, + ThreadSystem* thread_system, + Statistics* statistics); + + virtual ~RateController(); + + // Applies our shaping policies, and either (eventually) asks fetcher to + // fetch the given URL or drops it. 
+ void Fetch(UrlAsyncFetcher* fetcher, + const GoogleString& url, + MessageHandler* message_handler, + AsyncFetch* fetch); + + // Initializes statistics variables associated with this class. + static void InitStats(Statistics* statistics); + + private: + class HostFetchInfo; + class CustomFetch; + friend class CustomFetch; + + typedef RefCountedPtr HostFetchInfoPtr; + + typedef std::map HostFetchInfoMap; + + // Delete the fetch info from fetch_info_map_ if possible. + void DeleteFetchInfoIfPossible(const HostFetchInfoPtr& fetch_info); + + // The maximum permissible size of the global queue. + const int max_global_queue_size_; + // The maximum number of outgoing requests allowed per host. + const int per_host_outgoing_request_threshold_; + // The maximum number of queued requests allowed per host. + const int per_host_queued_request_threshold_; + ThreadSystem* thread_system_; + + // Map containing per-host information tracking outgoing and queued fetches. + HostFetchInfoMap fetch_info_map_; + scoped_ptr mutex_; + + TimedVariable* queued_fetch_count_; + TimedVariable* dropped_fetch_count_; + // Using a variable here, since we want to be able to track this in the server + // statistics. + Variable* current_global_fetch_queue_size_; + + DISALLOW_COPY_AND_ASSIGN(RateController); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_RATE_CONTROLLER_H_ diff --git a/psol/include/net/instaweb/http/public/rate_controlling_url_async_fetcher.h b/psol/include/net/instaweb/http/public/rate_controlling_url_async_fetcher.h new file mode 100644 index 000000000..3c273cc70 --- /dev/null +++ b/psol/include/net/instaweb/http/public/rate_controlling_url_async_fetcher.h @@ -0,0 +1,71 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: nikhilmadan@google.com (Nikhil Madan) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_RATE_CONTROLLING_URL_ASYNC_FETCHER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_RATE_CONTROLLING_URL_ASYNC_FETCHER_H_ + +#include "net/instaweb/http/public/url_async_fetcher.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { + +class AsyncFetch; +class MessageHandler; +class RateController; +class Statistics; +class ThreadSystem; + +// Fetcher that uses RateController to limit amount of background fetches +// we direct to a fetcher it wraps per domain. See RateController documentation +// for more details. +class RateControllingUrlAsyncFetcher : public UrlAsyncFetcher { + public: + // Does not take ownership of 'fetcher'. + // RateController::InitStats must have been called during stats initialization + // phase. 
+ RateControllingUrlAsyncFetcher(UrlAsyncFetcher* fetcher, + int max_global_queue_size, + int per_host_outgoing_request_threshold, + int per_host_queued_request_threshold, + ThreadSystem* thread_system, + Statistics* statistics); + + virtual ~RateControllingUrlAsyncFetcher(); + + virtual bool SupportsHttps() const { + return base_fetcher_->SupportsHttps(); + } + + virtual void Fetch(const GoogleString& url, + MessageHandler* message_handler, + AsyncFetch* fetch); + + virtual void ShutDown(); + + private: + UrlAsyncFetcher* base_fetcher_; + scoped_ptr rate_controller_; + + DISALLOW_COPY_AND_ASSIGN(RateControllingUrlAsyncFetcher); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_RATE_CONTROLLING_URL_ASYNC_FETCHER_H_ diff --git a/psol/include/net/instaweb/http/public/reflecting_test_fetcher.h b/psol/include/net/instaweb/http/public/reflecting_test_fetcher.h new file mode 100644 index 000000000..c9571f9dc --- /dev/null +++ b/psol/include/net/instaweb/http/public/reflecting_test_fetcher.h @@ -0,0 +1,64 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: morlovich@google.com (Maksim Orlovich) +// +// Contains ReflectingTestFetcher, which just echoes its input. Meant for use in +// unit tests. 
+ +#ifndef NET_INSTAWEB_HTTP_PUBLIC_REFLECTING_TEST_FETCHER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_REFLECTING_TEST_FETCHER_H_ + +#include "net/instaweb/http/public/async_fetch.h" +#include "net/instaweb/http/public/meta_data.h" +#include "net/instaweb/http/public/request_headers.h" +#include "net/instaweb/http/public/response_headers.h" +#include "net/instaweb/http/public/url_async_fetcher.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { + +class MessageHandler; + +// A fetcher that reflects headers it gets back into response headers, +// and the URL inside body. We use it to test that we are setting proper +// headers when we are generating requests ourselves. +class ReflectingTestFetcher : public UrlAsyncFetcher { + public: + ReflectingTestFetcher() {} + virtual ~ReflectingTestFetcher() {} + + virtual void Fetch(const GoogleString& url, + MessageHandler* message_handler, + AsyncFetch* fetch) { + RequestHeaders* in = fetch->request_headers(); + ResponseHeaders* out = fetch->response_headers(); + out->SetStatusAndReason(HttpStatus::kOK); + for (int i = 0; i < in->NumAttributes(); ++i) { + out->Add(in->Name(i), in->Value(i)); + } + fetch->Write(url, message_handler); + fetch->Done(true); + } + + private: + DISALLOW_COPY_AND_ASSIGN(ReflectingTestFetcher); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_REFLECTING_TEST_FETCHER_H_ diff --git a/psol/include/net/instaweb/http/public/request_context.h b/psol/include/net/instaweb/http/public/request_context.h new file mode 100644 index 000000000..436e0ecc2 --- /dev/null +++ b/psol/include/net/instaweb/http/public/request_context.h @@ -0,0 +1,86 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: piatek@google.com (Michael Piatek) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_REQUEST_CONTEXT_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_REQUEST_CONTEXT_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/ref_counted_ptr.h" +#include "net/instaweb/util/public/scoped_ptr.h" + +namespace net_instaweb { + +class AbstractMutex; +class LogRecord; +class RequestContext; +class RequestTrace; +class ThreadSystem; + +typedef RefCountedPtr<RequestContext> RequestContextPtr; + +// A class which wraps state associated with a request. +// +// This object should be reference counted, wrapped in a RequestContextPtr. We +// use reference counting because, depending on the timing of asynchronous +// rewrites, RPC calls, and so on, a RequestContext may outlive the original +// HTTP request serving, or not. Reference counting avoids the complexity of +// explicit transfer of ownership in these cases. +class RequestContext : public RefCounted<RequestContext> { + public: + // |logging_mutex| will be passed to the request context's LogRecord, which + // will take ownership of it. If you will be doing logging in a real + // (threaded) environment, pass in a real mutex. If not, a NullMutex is fine. + explicit RequestContext(AbstractMutex* logging_mutex); + + // TODO(marq): Move this test context factory to a test-specific file. + // Makes a request context for running tests.
+ static RequestContextPtr NewTestRequestContext(ThreadSystem* thread_system); + + RequestTrace* trace_context() { return trace_context_.get(); } + // Takes ownership of the given context. + void set_trace_context(RequestTrace* x); + + // The log record for this request, created when the request context is. + LogRecord* log_record(); + + protected: + // The default constructor will not create a LogRecord. Subclass constructors + // must do this explicitly. + RequestContext(); + + // The log record can only be set once. This should only be used by a subclass + // during initialization. + void set_log_record(LogRecord* l); + + // Destructors in refcounted classes should be protected. + virtual ~RequestContext(); + REFCOUNT_FRIEND_DECLARATION(RequestContext); + + private: + // Always non-NULL. + scoped_ptr<LogRecord> log_record_; + + // Logs tracing events. + scoped_ptr<RequestTrace> trace_context_; + + DISALLOW_COPY_AND_ASSIGN(RequestContext); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_REQUEST_CONTEXT_H_ diff --git a/psol/include/net/instaweb/http/public/request_headers.h b/psol/include/net/instaweb/http/public/request_headers.h new file mode 100644 index 000000000..ff7004df5 --- /dev/null +++ b/psol/include/net/instaweb/http/public/request_headers.h @@ -0,0 +1,71 @@ +// Copyright 2011 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_REQUEST_HEADERS_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_REQUEST_HEADERS_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/http/public/headers.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class HttpRequestHeaders; +class MessageHandler; +class Writer; + +// Read/write API for HTTP request (RequestHeaders is a misnomer). +class RequestHeaders : public Headers<HttpRequestHeaders> { + public: + enum Method { kOptions, kGet, kHead, kPost, kPut, kDelete, kTrace, kConnect, + kPatch, kError }; + + RequestHeaders(); + + void Clear(); + void CopyFrom(const RequestHeaders& other); + + GoogleString ToString() const; + Method method() const; + const char* method_string() const; + void set_method(Method method); + + // This is encoded message body, a rewriter or fetcher + // may opt to translate to entity-body only after removing + // header which has encoding information. + const GoogleString& message_body() const; + void set_message_body(const GoogleString& data); + + using Headers::WriteAsHttp; + bool WriteAsHttp(const StringPiece& url, Writer* writer, + MessageHandler* handler) const; + + // Determines whether a request header accepts gzipped content. + bool AcceptsGzip() const; + + // Returns true if these request headers are for an XmlHttp request (i.e. ajax + // request). This mechanism is not reliable because sometimes this header is + // not set even for XmlHttp requests.
+ bool IsXmlHttpRequest() const; + + private: + DISALLOW_COPY_AND_ASSIGN(RequestHeaders); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_REQUEST_HEADERS_H_ diff --git a/psol/include/net/instaweb/http/public/response_headers.h b/psol/include/net/instaweb/http/public/response_headers.h new file mode 100644 index 000000000..028b2dc15 --- /dev/null +++ b/psol/include/net/instaweb/http/public/response_headers.h @@ -0,0 +1,320 @@ +// Copyright 2011 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_RESPONSE_HEADERS_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_RESPONSE_HEADERS_H_ + +#include "net/instaweb/http/public/content_type.h" +#include "net/instaweb/http/public/headers.h" +#include "net/instaweb/http/public/meta_data.h" // HttpAttributes, HttpStatus +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" +#include "net/instaweb/util/public/timer.h" + +namespace net_instaweb { + +class HttpResponseHeaders; +class RequestHeaders; +class MessageHandler; +class Writer; + +// Read/write API for HTTP response headers. +class ResponseHeaders : public Headers<HttpResponseHeaders> { + public: + // The number of milliseconds of cache TTL we assign to resources that + // are "likely cacheable" (e.g. images, js, css, not html) and have no + // explicit cache ttl or expiration date.
+ static const int64 kImplicitCacheTtlMs = 5 * Timer::kMinuteMs; + + ResponseHeaders(); + virtual ~ResponseHeaders(); + + // This will set Date and (if supplied in the first place, Expires) + // header to now if the delta of date header wrt now_ms is more than + // a tolerance. Leaves the ComputeCaching state dirty if it came in + // dirty, or clean if it came in clean. + void FixDateHeaders(int64 now_ms); + + virtual void Clear(); + + void CopyFrom(const ResponseHeaders& other); + + // Add a new header. + virtual void Add(const StringPiece& name, const StringPiece& value); + + // Merge the new content_type with what is already in the headers. + // Returns true if the existing content-type header was changed. + bool MergeContentType(const StringPiece& content_type); + + // Remove headers by name and value. + virtual bool Remove(const StringPiece& name, const StringPiece& value); + + // Remove all headers by name. + virtual bool RemoveAll(const StringPiece& name); + + // Remove all headers whose name is in |names|. + virtual bool RemoveAllFromSet(const StringSetInsensitive& names); + + // Similar to RemoveAll followed by Add. Note that the attribute + // order may be changed as a side effect of this operation. + virtual void Replace(const StringPiece& name, const StringPiece& value); + + // Merge headers. Replaces all headers specified both here and in + // other with the version in other. Useful for updating headers + // when receiving 304 Not Modified responses. + // Note: We must use Headers<HttpResponseHeaders> instead of ResponseHeaders + // so that we don't expose the base UpdateFrom (and to avoid "hiding" errors). + virtual void UpdateFrom(const Headers<HttpResponseHeaders>& other); + + // Initializes the response headers with the one in proto, clearing the + // existing fields. + void UpdateFromProto(const HttpResponseHeaders& proto); + + // Serialize HTTP response header to a binary stream.
+ virtual bool WriteAsBinary(Writer* writer, MessageHandler* message_handler); + + // Read HTTP response header from a binary string. Note that this + // is distinct from HTTP response-header parsing, which is in + // ResponseHeadersParser. + virtual bool ReadFromBinary(const StringPiece& buf, MessageHandler* handler); + + // Serialize HTTP response header in HTTP format so it can be re-parsed. + virtual bool WriteAsHttp(Writer* writer, MessageHandler* handler) const; + + // Compute caching information. The current time is used to compute + // the absolute time when a cache resource will expire. The timestamp + // is in milliseconds since 1970. It is an error to call any of the + // accessors before ComputeCaching is called. + void ComputeCaching(); + bool IsCacheable() const; + + // Returns true if these response headers indicate the response is cacheable + // if it was fetched w/o special authorization headers. + // + // Generally you want to use IsProxyCacheableGivenRequest() instead which will + // also take the request headers into account, unless you know the request + // was synthesized with known headers which do not include authorization. + bool IsProxyCacheable() const; + + // Returns true if these response headers indicate the response is cacheable + // if it was fetched with given 'req_headers'. + bool IsProxyCacheableGivenRequest(const RequestHeaders& req_headers) const; + + // Note(sligocki): I think CacheExpirationTimeMs will return 0 if !IsCacheable + // TODO(sligocki): Look through callsites and make sure this is being + // interpreted correctly. + int64 CacheExpirationTimeMs() const; + + // Set Date, Cache-Control and Expires headers appropriately. + // If cache_control_suffix is provided it is appended onto the + // Cache-Control: "max-age=%d" string. + // For example, cache_control_suffix = ", private" or ", no-cache, no-store".
+ void SetDateAndCaching(int64 date_ms, int64 ttl_ms, + const StringPiece& cache_control_suffix); + void SetDateAndCaching(int64 date_ms, int64 ttl_ms) { + SetDateAndCaching(date_ms, ttl_ms, ""); + } + // Returns Cache-Control header values that we might need to preserve. This + // function is meant to be used with SetDateAndCaching. It currently looks for + // and returns no-transform and no-store if found. + GoogleString CacheControlValuesToPreserve(); + + // Set a time-based header, converting ms since epoch to a string. + void SetTimeHeader(const StringPiece& header, int64 time_ms); + void SetDate(int64 date_ms) { SetTimeHeader(HttpAttributes::kDate, date_ms); } + void SetLastModified(int64 last_modified_ms) { + SetTimeHeader(HttpAttributes::kLastModified, last_modified_ms); + } + + // Sets the cache-control max-age to the specified value leaving the remaining + // Cache-Control attributes the same. This also updates the Expires header + // appropriately. Note that all existing max-age values are removed. + void SetCacheControlMaxAge(int64 ttl_ms); + + // Sets the original content length header, used to relay information on + // the original size of optimized resources. + void SetOriginalContentLength(int64 content_length); + + // Removes cookie headers, and returns true if any changes were made. + bool Sanitize(); + + // Copies the HttpResponseHeaders proto from the response headers to the given + // input after removing the Set-Cookie fields. 
+ void GetSanitizedProto(HttpResponseHeaders* proto) const; + + // TODO(jmarantz): consider an alternative representation + bool headers_complete() const { return has_status_code(); } + + int status_code() const; + bool has_status_code() const; + void set_status_code(const int code); + const char* reason_phrase() const; + void set_reason_phrase(const StringPiece& reason_phrase); + int64 implicit_cache_ttl_ms() const { return implicit_cache_ttl_ms_; } + void set_implicit_cache_ttl_ms(const int64 ttl) { + implicit_cache_ttl_ms_ = ttl; + } + + int64 last_modified_time_ms() const; + int64 date_ms() const; // Timestamp from Date header. + bool has_date_ms() const; + int64 cache_ttl_ms() const; + bool is_implicitly_cacheable() const; + + GoogleString ToString() const; + + // Sets the status code and reason_phrase based on an internal table. + void SetStatusAndReason(HttpStatus::Code code); + + void DebugPrint() const; + + // Parses an arbitrary string into milliseconds since 1970 + static bool ParseTime(const char* time_str, int64* time_ms); + + // Returns true if our status denotes the request failing. + inline bool IsErrorStatus() { + int status = status_code(); + return status >= 400 && status <= 599; + } + + // Returns true if our status denotes a server side error. + inline bool IsServerErrorStatus() { + int status = status_code(); + return status >= 500 && status <= 599; + } + + // Determines whether a response header is marked as gzipped. + bool IsGzipped() const; + bool WasGzippedLast() const; + + // Get ContentType. NULL if none set or it isn't in our predefined set of + // known content types. + const ContentType* DetermineContentType() const; + + // Does this header have an HTML-like Content-Type (HTML, XHTML, ...). + bool IsHtmlLike() const { + const ContentType* type = DetermineContentType(); + return (type != NULL && type->IsHtmlLike()); + } + + // Get the charset. Empty string if none set in a Content-Type header. 
+ GoogleString DetermineCharset() const; + + // Determine both the charset and content-type as above. See + // DetermineContentType() and DetermineCharset() for details. + // You may also pass in NULL for those of _out parameters you do not + // need (but in that case the individual functions would be more convenient) + void DetermineContentTypeAndCharset(const ContentType** content_type_out, + GoogleString* charset_out) const; + + + // Parses a date header such as HttpAttributes::kDate or + // HttpAttributes::kExpires, returning the timestamp as + // number of milliseconds since 1970. + bool ParseDateHeader(const StringPiece& attr, int64* date_ms) const; + + // Returns true if the date header is later than time_ms. Used in invalidation + // of http cache. + bool IsDateLaterThan(int64 time_ms) const { + return date_ms() > time_ms; + } + + // Parses the first line of an HTTP response, including the "HTTP/". + void ParseFirstLine(const StringPiece& first_line); + + // Parses the first line of an HTTP response, skipping the "HTTP/". + void ParseFirstLineHelper(const StringPiece& first_line); + + // Set whole first line. + void set_first_line(int major_version, int minor_version, int status_code, + const StringPiece& reason_phrase) { + set_major_version(major_version); + set_minor_version(minor_version); + set_status_code(status_code); + set_reason_phrase(reason_phrase); + } + + // Returns whether or not we can cache these headers if we take into + // account the Vary: headers. Note that we consider Vary: Cookie as cacheable + // if request_has_cookie is false. + bool VaryCacheable(bool request_has_cookie) const; + + // Finds Content-Length in the response headers, returning true and putting + // it in *content_length if successful. + bool FindContentLength(int64* content_length) const; + + // Force cache the response with the given TTL even if it is private. Note + // that this does not change any of the headers. 
The values of cache_ttl_ms, + // IsCacheable and IsProxyCacheable are updated once ComputeCaching() is + // called. + // Note that for responses which were originally cacheable, the effective + // cache TTL is the maximum of the original TTL and ttl_ms. + // For responses which were originally uncacheable, the new cache TTL is + // ttl_ms. + void ForceCaching(int64 ttl_ms); + + // Update the caching headers if the response has been force cached. + bool UpdateCacheHeadersIfForceCached(); + + // Returns estimated size in bytes of these headers (if transferred over + // HTTP, not SPDY or other protocols). This is an estimate because it may not + // properly account for things like spacing around : or whether multiple + // headers were on a single or multiple lines. + int64 SizeEstimate() const; + + // Returns true if the response headers have cookies and false otherwise. + // If cookies are found then it sets them in cookie_str in javascript array + // format. + bool GetCookieString(GoogleString* cookie_str) const; + + // Returns true if the response headers have a cookie attribute with the given + // name. values gives the associated values. + // name=value results in "value" in values. + // name= results in "" in values. + // name results in nothing being added to values. + // The return value is true in all the above cases. + // It is a limitation of this API that a cookie value of "name=value;name" is + // indistinguishable from a cookie value of "name=value". + bool HasCookie(StringPiece name, StringPieceVector* values) const; + + private: + // Parse the original and fresh content types, and add a new header based + // on the two of them, giving preference to the original. + // e.g. if the original specified charset=UTF-8 and the new one specified + // charset=UTF-16, the resulting header would have charset=UTF-8. + // Returns true if the headers were changed.
+ bool CombineContentTypes(const StringPiece& orig, const StringPiece& fresh); + + friend class ResponseHeadersTest; + bool cache_fields_dirty_; + + // The number of milliseconds of cache TTL we assign to resources that are + // likely cacheable and have no explicit cache ttl or expiration date. + int64 implicit_cache_ttl_ms_; + + // The number of milliseconds of cache TTL for which we should cache the + // response even if it was originally uncacheable. + int64 force_cache_ttl_ms_; + // Indicates if the response was force cached. + bool force_cached_; + + DISALLOW_COPY_AND_ASSIGN(ResponseHeaders); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_RESPONSE_HEADERS_H_ diff --git a/psol/include/net/instaweb/http/public/response_headers_parser.h b/psol/include/net/instaweb/http/public/response_headers_parser.h new file mode 100644 index 000000000..6c72e4c2b --- /dev/null +++ b/psol/include/net/instaweb/http/public/response_headers_parser.h @@ -0,0 +1,58 @@ +// Copyright 2011 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_RESPONSE_HEADERS_PARSER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_RESPONSE_HEADERS_PARSER_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class MessageHandler; +class ResponseHeaders; + +// Parses a stream of HTTP header text into a ResponseHeaders instance. +class ResponseHeadersParser { + public: + explicit ResponseHeadersParser(ResponseHeaders* rh) : response_headers_(rh) { + Clear(); + } + + void Clear(); + + // Parse a chunk of HTTP response header. Returns number of bytes consumed. + int ParseChunk(const StringPiece& text, MessageHandler* handler); + + bool headers_complete() const { return headers_complete_; } + void set_headers_complete(bool x) { headers_complete_ = x; } + + private: + ResponseHeaders* response_headers_; + + bool parsing_http_; + bool parsing_value_; + bool headers_complete_; + GoogleString parse_name_; + GoogleString parse_value_; + + DISALLOW_COPY_AND_ASSIGN(ResponseHeadersParser); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_RESPONSE_HEADERS_PARSER_H_ diff --git a/psol/include/net/instaweb/http/public/semantic_type.h b/psol/include/net/instaweb/http/public/semantic_type.h new file mode 100644 index 000000000..5aee2d5a8 --- /dev/null +++ b/psol/include/net/instaweb/http/public/semantic_type.h @@ -0,0 +1,70 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jefftk@google.com (Jeff Kaufman) +// +// A collection of content-types and their attributes. + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_SEMANTIC_TYPE_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_SEMANTIC_TYPE_H_ + +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { +namespace semantic_type { + +// When we see a url, we are pretty sure what kind of resource it points to from +// context. See resource_tag_scanner.h for the definitions of the categories. +// They are broader categories than ContentType::Type because contextual +// information is limited. <img src=...> may be kPng, kGif, kJpeg, kWebp, or +// another image type. Another difference is that content type represents the +// actual type we found when we fetched the resource, while the semantic type is +// just what we expect to find when we do. If the webmaster writes something +// like <img src=song.mp3> and song.mp3 is a css file served with content type +// text/javascript, the semantic type will be kImage, the content type will be +// kJavascript, and we'll ignore the extension (mp3) and actual contents of the +// file (which will look like css). +enum Category { + kScript, + kImage, + kStylesheet, + kOtherResource, + kHyperlink, + kPrefetch, + kUndefined +}; + +// Determine the value of the category enum corresponding to the given string. +// Case insensitive. Valid categories are: +// Script +// Image +// Stylesheet +// OtherResource +// - This is any other url that will be automatically loaded by the browser +// along with the main page. For example, the 'manifest' attribute of the +// 'html' element or the 'src' attribute of an 'iframe' element. +// Prefetch +// - This is to prefetch the given url or dns-prefetch for the given domain. +// Hyperlink +// - A link to another page or other resource that a browser wouldn't +// normally load in connection to this page.
For example the 'href' +// attribute of an 'a' element. +bool ParseCategory(const StringPiece& category_str, Category* category); + +} // namespace semantic_type +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_SEMANTIC_TYPE_H_ diff --git a/psol/include/net/instaweb/http/public/sync_fetcher_adapter.h b/psol/include/net/instaweb/http/public/sync_fetcher_adapter.h new file mode 100644 index 000000000..773b752ac --- /dev/null +++ b/psol/include/net/instaweb/http/public/sync_fetcher_adapter.h @@ -0,0 +1,62 @@ +// Copyright 2010 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This permits the use of any UrlPollableAsyncFetcher as a synchronous fetcher. + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_SYNC_FETCHER_ADAPTER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_SYNC_FETCHER_ADAPTER_H_ + +#include "net/instaweb/http/public/request_context.h" +#include "net/instaweb/http/public/url_fetcher.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { + +class MessageHandler; +class RequestHeaders; +class ResponseHeaders; +class ThreadSystem; +class Timer; +class UrlPollableAsyncFetcher; +class Writer; + +class SyncFetcherAdapter : public UrlFetcher { + public: + // Note: the passed in async fetcher should use a timeout similar to + // fetcher_timeout_ms (or none at all). 
+ SyncFetcherAdapter(Timer* timer, + int64 fetcher_timeout_ms, + UrlPollableAsyncFetcher* async_fetcher, + ThreadSystem* thread_system); + virtual ~SyncFetcherAdapter(); + virtual bool StreamingFetchUrl(const GoogleString& url, + const RequestHeaders& request_headers, + ResponseHeaders* response_headers, + Writer* fetched_content_writer, + MessageHandler* message_handler, + const RequestContextPtr& request_context); + + private: + Timer* timer_; + int64 fetcher_timeout_ms_; + UrlPollableAsyncFetcher* async_fetcher_; + ThreadSystem* thread_system_; + + DISALLOW_COPY_AND_ASSIGN(SyncFetcherAdapter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_SYNC_FETCHER_ADAPTER_H_ diff --git a/psol/include/net/instaweb/http/public/sync_fetcher_adapter_callback.h b/psol/include/net/instaweb/http/public/sync_fetcher_adapter_callback.h new file mode 100644 index 000000000..cd8656102 --- /dev/null +++ b/psol/include/net/instaweb/http/public/sync_fetcher_adapter_callback.h @@ -0,0 +1,100 @@ +// Copyright 2010 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Author: jmarantz@google.com (Joshua Marantz) +// lsong@google.com (Libo Song) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_SYNC_FETCHER_ADAPTER_CALLBACK_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_SYNC_FETCHER_ADAPTER_CALLBACK_H_ + +#include "net/instaweb/http/public/async_fetch.h" +#include "net/instaweb/http/public/request_context.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string_util.h" +#include "net/instaweb/util/public/writer.h" + +namespace net_instaweb { + +class AbstractMutex; +class MessageHandler; +class ThreadSystem; + +// Class to help run an asynchronous fetch synchronously with a timeout. +class SyncFetcherAdapterCallback : public AsyncFetch { + public: + SyncFetcherAdapterCallback(ThreadSystem* thread_system, Writer* writer, + const RequestContextPtr& request_context); + virtual ~SyncFetcherAdapterCallback(); + + // When implementing a synchronous fetch with a timeout based on an + // underlying asynchronous mechanism, we need to ensure that we don't + // write to freed memory if the Done callback fires after the timeout. + // + // So we need to make sure the Writer and Response Buffers are owned + // by this Callback class, which will forward the output and headers + // to the caller *if* it has not been released by the time the callback + // is called. + // + // If this object may be accessed from multiple threads (e.g. due to + // async rewrites), you should use LockIfNotReleased() and Unlock() + // to guard access to these. + + // When the 'owner' of this callback -- the code that calls 'new' -- + // is done with it, it can call release. This will only delete the + // callback if Done() has been called. Otherwise it will stay around + // waiting for Done() to be called, and only then will it be deleted.
+ // + // When Release is called prior to Done(), the writer and response_headers + // will be NULLed out in this structure so they will not be updated when + // Done() is finally called. + void Release(); + + bool done() const; + bool success() const; + bool released() const; + + // If this fetcher hasn't yet been released(), returns true with mutex_ held. + // Otherwise, returns false with the mutex_ released. These methods + // should be used to guard accesses to writer() and response_headers(). + bool LockIfNotReleased(); + + // Releases mutex acquired by a successful LockIfNotReleased() call. + void Unlock(); + + protected: + virtual void HandleDone(bool success); + virtual bool HandleWrite(const StringPiece& content, + MessageHandler* handler) { + return writer_->Write(content, handler); + } + virtual bool HandleFlush(MessageHandler* handler) { + return writer_->Flush(handler); + } + virtual void HandleHeadersComplete() { + } + + private: + scoped_ptr<AbstractMutex> mutex_; + bool done_; + bool success_; + bool released_; + scoped_ptr<Writer> writer_; + + DISALLOW_COPY_AND_ASSIGN(SyncFetcherAdapterCallback); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_SYNC_FETCHER_ADAPTER_CALLBACK_H_ diff --git a/psol/include/net/instaweb/http/public/url_async_fetcher.h b/psol/include/net/instaweb/http/public/url_async_fetcher.h new file mode 100644 index 000000000..4e4a247d2 --- /dev/null +++ b/psol/include/net/instaweb/http/public/url_async_fetcher.h @@ -0,0 +1,99 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_HTTP_PUBLIC_URL_ASYNC_FETCHER_H_ +#define NET_INSTAWEB_HTTP_PUBLIC_URL_ASYNC_FETCHER_H_ + +#include <set> + +#include "net/instaweb/http/public/content_type.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { + +class AsyncFetch; +class MessageHandler; + +// UrlAsyncFetcher is an interface for asynchronously fetching URLs. +// The results of a fetch are asynchronously passed back to the callbacks +// in the supplied AsyncFetch object. +class UrlAsyncFetcher { + public: + static const int64 kUnspecifiedTimeout; + + // Default statistics group name. + static const char kStatisticsGroup[]; + + virtual ~UrlAsyncFetcher(); + + // Asynchronously fetch a URL, set the response headers and stream the + // contents to fetch and call fetch->Done() when the fetch finishes. + // + // There is an unchecked contract that response_headers are set before the + // response_writer or callback are used. + // Caution, several implementations do not satisfy this contract (but should). + // + // TODO(sligocki): GoogleString -> GoogleUrl or at least StringPiece. + // TODO(sligocki): Include the URL in the fetch, like the request headers. + virtual void Fetch(const GoogleString& url, + MessageHandler* message_handler, + AsyncFetch* fetch) = 0; + + // Determine if the fetcher supports fetching using HTTPS. By default we + // assume a fetcher can. + virtual bool SupportsHttps() const { return true; } + + // Returns a maximum time that we will allow fetches to take, or + // kUnspecifiedTimeout (the default) if we don't promise to timeout fetches. + virtual int64 timeout_ms() { return kUnspecifiedTimeout; } + + // Stops all active fetches and prevents further fetches from starting, + // calling back to ->Done(false).
+ // + // Base-class implementation is empty for forward compatibility. + virtual void ShutDown(); + + // Always requests content from servers using gzip. If the request headers + // do not accept that encoding, then it will be decompressed while streaming. + void set_fetch_with_gzip(bool x) { fetch_with_gzip_ = x; } + bool fetch_with_gzip() const { return fetch_with_gzip_; } + + // Returns a new InflatingFetch to handle auto-inflating the + // response if needed. + // Use inflation_content_type_blacklist cautiously, it may cause resources to + // be corrupted if you use it with anything other than the IPRO path. + AsyncFetch* EnableInflation( + AsyncFetch* fetch, + const std::set<const ContentType*>* inflation_content_type_blacklist) + const; + + protected: + // Put this in protected to make sure nobody constructs this class except + // for subclasses. + UrlAsyncFetcher() : fetch_with_gzip_(false) {} + + private: + bool fetch_with_gzip_; + + DISALLOW_COPY_AND_ASSIGN(UrlAsyncFetcher); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_HTTP_PUBLIC_URL_ASYNC_FETCHER_H_ diff --git a/psol/include/net/instaweb/http/public/url_async_fetcher_stats.h b/psol/include/net/instaweb/http/public/url_async_fetcher_stats.h new file mode 100644 index 000000000..6d0fb8742 --- /dev/null +++ b/psol/include/net/instaweb/http/public/url_async_fetcher_stats.h @@ -0,0 +1,84 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */
+
+// Author: morlovichz@google.com (Maksim Orlovich)
+// Wrapper around a UrlAsyncFetcher that adds statistics and histograms.
+
+#ifndef NET_INSTAWEB_HTTP_PUBLIC_URL_ASYNC_FETCHER_STATS_H_
+#define NET_INSTAWEB_HTTP_PUBLIC_URL_ASYNC_FETCHER_STATS_H_
+
+#include "net/instaweb/http/public/url_async_fetcher.h"
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+class AsyncFetch;
+class Histogram;
+class MessageHandler;
+class Statistics;
+class Timer;
+class Variable;
+
+class UrlAsyncFetcherStats : public UrlAsyncFetcher {
+ public:
+  // Creates a fetcher that delegates to base_fetcher, while collecting
+  // statistics. The variables will be prefixed with 'prefix', which must
+  // have been passed to ::InitStats during statistics initialization process.
+  //
+  // Note that base_fetcher should not have fetch_with_gzip() as it would break
+  // usage metering; if you want that functionality you should turn it off on
+  // base_fetcher and turn it on in UrlAsyncFetcherStats.
+  //
+  // Does not own base_fetcher (so you can have multiple UrlAsyncFetcherStats
+  // objects around a single UrlAsyncFetcher object).
+  UrlAsyncFetcherStats(StringPiece prefix,
+                       UrlAsyncFetcher* base_fetcher,
+                       Timer* timer,
+                       Statistics* statistics);
+  virtual ~UrlAsyncFetcherStats();
+
+  // This must be called once for every unique prefix used with
+  // UrlAsyncFetcherStats.
+  static void InitStats(StringPiece prefix, Statistics* statistics);
+
+  // Reimplementation of UrlAsyncFetcher methods. See base class
+  // for API specifications.
+  virtual bool SupportsHttps() const;
+  virtual void Fetch(const GoogleString& url,
+                     MessageHandler* message_handler,
+                     AsyncFetch* fetch);
+  virtual int64 timeout_ms();
+  virtual void ShutDown();
+
+ private:
+  class StatsAsyncFetch;
+
+  UrlAsyncFetcher* base_fetcher_;  // Not owned (see constructor comment).
+  Timer* timer_;
+
+  Histogram* fetch_latency_us_histogram_;
+  Variable* fetches_;
+  Variable* bytes_fetched_;
+  Variable* approx_header_bytes_fetched_;
+
+  DISALLOW_COPY_AND_ASSIGN(UrlAsyncFetcherStats);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_HTTP_PUBLIC_URL_ASYNC_FETCHER_STATS_H_
diff --git a/psol/include/net/instaweb/http/public/url_fetcher.h b/psol/include/net/instaweb/http/public/url_fetcher.h
new file mode 100644
index 000000000..e0b2098b1
--- /dev/null
+++ b/psol/include/net/instaweb/http/public/url_fetcher.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: sligocki@google.com (Shawn Ligocki)
+//
+// UrlFetcher is an interface for fetching urls.
+//
+// TODO(jmarantz): Consider asynchronous fetches. This may not require
+// a change in interface; we would simply always return 'false' if the
+// url contents is not already cached. We may want to consider a richer
+// return-value enum to distinguish illegal URLs from invalid ones, from
+// ones where the fetch is in-progress. Or maybe the caller doesn't care.
+
+#ifndef NET_INSTAWEB_HTTP_PUBLIC_URL_FETCHER_H_
+#define NET_INSTAWEB_HTTP_PUBLIC_URL_FETCHER_H_
+
+#include "net/instaweb/http/public/request_context.h"
+#include "net/instaweb/util/public/string.h"
+
+namespace net_instaweb {
+
+class MessageHandler;
+class RequestHeaders;
+class ResponseHeaders;
+class Writer;
+
+class UrlFetcher {
+ public:
+  virtual ~UrlFetcher();
+
+  // Fetch a URL, streaming the output to response_writer, and returning
+  // the headers in response_headers. Returns true if the fetch succeeded.
+  virtual bool StreamingFetchUrl(const GoogleString& url,
+                                 const RequestHeaders& request_headers,
+                                 ResponseHeaders* response_headers,
+                                 Writer* response_writer,
+                                 MessageHandler* message_handler,
+                                 const RequestContextPtr& request_context) = 0;
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_HTTP_PUBLIC_URL_FETCHER_H_
diff --git a/psol/include/net/instaweb/http/public/url_pollable_async_fetcher.h b/psol/include/net/instaweb/http/public/url_pollable_async_fetcher.h
new file mode 100644
index 000000000..a5ae2f852
--- /dev/null
+++ b/psol/include/net/instaweb/http/public/url_pollable_async_fetcher.h
@@ -0,0 +1,39 @@
+// Copyright 2010 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: morlovich@google.com (Maksim Orlovich)
+//
+// UrlPollableAsyncFetchers allow a client to block on asynchronous resource
+// fetches.
+
+#ifndef NET_INSTAWEB_HTTP_PUBLIC_URL_POLLABLE_ASYNC_FETCHER_H_
+#define NET_INSTAWEB_HTTP_PUBLIC_URL_POLLABLE_ASYNC_FETCHER_H_
+
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/http/public/url_async_fetcher.h"
+
+namespace net_instaweb {
+
+class UrlPollableAsyncFetcher : public UrlAsyncFetcher {
+ public:
+  virtual ~UrlPollableAsyncFetcher();
+
+  // Poll the active fetches, returning the number of fetches still
+  // outstanding. NOTE(review): max_wait_ms is presumably a wait bound in ms.
+  virtual int Poll(int64 max_wait_ms) = 0;
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_HTTP_PUBLIC_URL_POLLABLE_ASYNC_FETCHER_H_
diff --git a/psol/include/net/instaweb/http/public/user_agent_matcher.h b/psol/include/net/instaweb/http/public/user_agent_matcher.h
new file mode 100644
index 000000000..176d56611
--- /dev/null
+++ b/psol/include/net/instaweb/http/public/user_agent_matcher.h
@@ -0,0 +1,122 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NET_INSTAWEB_HTTP_PUBLIC_USER_AGENT_MATCHER_H_
+#define NET_INSTAWEB_HTTP_PUBLIC_USER_AGENT_MATCHER_H_
+
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/re2.h"
+#include "net/instaweb/util/public/string_util.h"
+#include "net/instaweb/util/public/fast_wildcard_group.h"
+
+namespace net_instaweb {
+
+class RequestHeaders;
+
+// This class contains various user agent based checks.
+// Currently all of these are based on simple wildcard white- and black-lists.
+//
+// TODO(sriharis): Split the functionality here into two: a matcher that
+// pulls out all relevant information from UA strings (browser-family, version,
+// mobile/tablet/desktop, etc.), and a query interface that can be used by
+// clients.
+class UserAgentMatcher {
+ public:
+  enum BlinkRequestType {
+    kBlinkWhiteListForDesktop,
+    kBlinkBlackListForDesktop,
+    kBlinkWhiteListForMobile,
+    kDoesNotSupportBlinkForMobile,
+    kNullOrEmpty,
+    kDoesNotSupportBlink,
+  };
+
+  enum PrefetchMechanism {
+    kPrefetchNotSupported,
+    kPrefetchLinkRelSubresource,
+    kPrefetchImageTag,
+    kPrefetchObjectTag,
+    kPrefetchLinkScriptTag,
+  };
+
+  UserAgentMatcher();
+  virtual ~UserAgentMatcher();
+
+  bool IsIe(const StringPiece& user_agent) const;
+  bool IsIe6(const StringPiece& user_agent) const;
+  bool IsIe7(const StringPiece& user_agent) const;
+  bool IsIe6or7(const StringPiece& user_agent) const {
+    return IsIe6(user_agent) || IsIe7(user_agent);
+  };
+  bool IsIe9(const StringPiece& user_agent) const;
+
+  virtual bool SupportsImageInlining(const StringPiece& user_agent) const;
+
+  // Returns the request type for the given request. The return type currently
+  // supports desktop, mobile and not supported.
+  virtual BlinkRequestType GetBlinkRequestType(
+      const char* user_agent, const RequestHeaders* request_headers) const;
+
+  // Returns the supported prefetch mechanism depending upon the user agent.
+  PrefetchMechanism GetPrefetchMechanism(
+      const StringPiece& user_agent,
+      const RequestHeaders* request_headers) const;
+
+  bool SupportsJsDefer(const StringPiece& user_agent, bool allow_mobile) const;
+  bool SupportsWebp(const StringPiece& user_agent) const;
+  bool SupportsWebpLosslessAlpha(const StringPiece& user_agent) const;
+
+  // IE9 does not implement <link rel=dns-prefetch ...>. Instead it does DNS
+  // preresolution when it sees <link rel=prefetch ...>. This method returns
+  // true if the browser supports DNS prefetch using rel=prefetch.
+  // Refer: http://blogs.msdn.com/b/ie/archive/2011/03/17/internet-explorer-9-network-performance-improvements.aspx NOLINT
+  bool SupportsDnsPrefetchUsingRelPrefetch(const StringPiece& user_agent) const;
+  bool SupportsDnsPrefetch(const StringPiece& user_agent) const;
+
+  virtual bool IsMobileUserAgent(const StringPiece& user_agent) const;
+  virtual bool IsMobileRequest(
+      const StringPiece& user_agent,
+      const RequestHeaders* request_headers) const;
+
+  virtual bool IsAndroidUserAgent(const StringPiece& user_agent) const;
+
+  // Returns false if this is not a Chrome user agent, or parsing the
+  // string build number fails.
+  virtual bool GetChromeBuildNumber(const StringPiece& user_agent, int* major,
+                                    int* minor, int* build, int* patch) const;
+
+  virtual bool SupportsSplitHtml(const StringPiece& user_agent,
+                                 bool allow_mobile) const;
+
+ private:
+  FastWildcardGroup supports_image_inlining_;
+  FastWildcardGroup blink_desktop_whitelist_;
+  FastWildcardGroup blink_desktop_blacklist_;
+  FastWildcardGroup blink_mobile_whitelist_;
+  FastWildcardGroup supports_webp_;
+  FastWildcardGroup supports_webp_lossless_alpha_;
+  FastWildcardGroup mobile_user_agents_;
+  FastWildcardGroup supports_prefetch_link_rel_subresource_;
+  FastWildcardGroup supports_prefetch_image_tag_;
+  FastWildcardGroup supports_prefetch_link_script_tag_;
+  FastWildcardGroup supports_dns_prefetch_;
+
+  const RE2 chrome_version_pattern_;
+
+  DISALLOW_COPY_AND_ASSIGN(UserAgentMatcher);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_HTTP_PUBLIC_USER_AGENT_MATCHER_H_
diff --git a/psol/include/net/instaweb/http/public/user_agent_matcher_test.h b/psol/include/net/instaweb/http/public/user_agent_matcher_test.h
new file mode 100644
index 000000000..5aedb80e1
--- /dev/null
+++ b/psol/include/net/instaweb/http/public/user_agent_matcher_test.h
@@ -0,0 +1,124 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NET_INSTAWEB_HTTP_PUBLIC_USER_AGENT_MATCHER_TEST_H_
+#define NET_INSTAWEB_HTTP_PUBLIC_USER_AGENT_MATCHER_TEST_H_
+
+namespace net_instaweb {
+namespace UserAgentStrings {
+
+// User Agent strings are from http://www.useragentstring.com/.
+// IE: http://www.useragentstring.com/pages/Internet Explorer/
+// FireFox: http://www.useragentstring.com/pages/Firefox/
+// Chrome: http://www.useragentstring.com/pages/Chrome/
+// And there are many more.
+
+const char kAndroidHCUserAgent[] =
+    "Mozilla/5.0 (Linux; U; Android 3.2; en-us; Sony Tablet S Build/THMAS11000)"
+    " AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13";
+const char kAndroidICSUserAgent[] =
+    "Mozilla/5.0 (Linux; U; Android 4.0.1; en-us; Galaxy Nexus Build/ICL27) "
+    "AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30";
+const char kAndroidNexusSUserAgent[] =  // NOTE: no space before AppleWebKit
+    "Mozilla/5.0 (Linux; U; Android 2.3.3; en-gb; Nexus S Build/GRI20)"
+    "AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1";
+const char kChromeUserAgent[] =
+    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) "
+    "AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.A.B.C Safari/525.13";
+const char kChrome9UserAgent[] =  // Not webp capable
+    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) "
+    "AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.19 Safari/534.13";
+const char kChrome15UserAgent[] =  // Not webp capable
+    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) "
+    "AppleWebKit/534.13 (KHTML, like Gecko) Chrome/15.0.597.19 Safari/534.13";
+const char kChrome18UserAgent[] =  // webp capable
+    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) "
+    "AppleWebKit/534.13 (KHTML, like Gecko) Chrome/18.0.597.19 Safari/534.13";
+const char kChrome12UserAgent[] =  // webp capable
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_4) "
+    "AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.100 Safari/534.30";
+const char kAndroidChrome18UserAgent[] =  // webp broken
+    "Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) "
+    "AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile "
+    "Safari/535.19";
+const char kAndroidChrome21UserAgent[] =  // webp fixed (string is a hack)
+    "Mozilla/5.0 (Linux; Android 4.1.4; Galaxy Nexus Build/IMM76B) "
+    "AppleWebKit/535.19 (KHTML, like Gecko) Chrome/21.0.1025.133 Mobile "
+    "Safari/535.19";
+const char kIPhoneChrome21UserAgent[] =  // no webp on iOS
+    "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0_1 like Mac OS X; en-us) "
+    "AppleWebKit/534.46.0 (KHTML, like Gecko) CriOS/21.0.1180.82 "
+    "Mobile/10A523 Safari/7534.48.3";
+const char kOpera1101UserAgent[] =  // Not webp capable
+    "Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01";
+const char kOpera1110UserAgent[] =  // webp capable
+    "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.8.99 Version/11.10";
+const char kFirefoxUserAgent[] =
+    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) "
+    "Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10";
+const char kFirefox1UserAgent[] =
+    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.7) "
+    "Gecko/20060909 Firefox/1.5.0.7 MG (Novarra-Vision/6.1)";
+const char kFirefox5UserAgent[] =
+    "Mozilla/5.0 (X11; U; Linux i586; de; rv:5.0) Gecko/20100101 Firefox/5.0";
+const char kIe6UserAgent[] =
+    "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1;"
+    " .NET CLR 2.0.50727)";
+const char kIe7UserAgent[] =
+    "Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)";
+const char kIe8UserAgent[] =
+    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64;"
+    " Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729;"
+    " .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2;"
+    " .NET4.0C; .NET4.0E; FDM)";
+const char kIe9UserAgent[] =  // NOTE(review): "WIndows", "))" look like typos
+    "Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))";
+const char kIPhoneUserAgent[] =
+    "Apple iPhone OS v2.1.1 CoreMedia v1.0.0.5F138";
+const char kIPhone4Safari[] =
+    "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0_1 like Mac OS X) AppleWebKit/534.46"
+    " (KHTML, like Gecko) Version/5.1 Mobile/9A405 Safari/7534.48.3";
+const char kNokiaUserAgent[] =
+    "Nokia2355/1.0 (JN100V0200.nep) UP.Browser/6.2.2.1.c.1.108 (GUI) MMP/2.0";
+const char kOpera5UserAgent[] =
+    "Opera/5.0 (SunOS 5.8 sun4u; U) [en]";
+const char kOpera8UserAgent[] =
+    "Opera/8.01 (J2ME/MIDP; Opera Mini/1.1.2666/1724; en; U; ssr)";
+const char kPSPUserAgent[] =
+    "Mozilla/4.0 (PSP (PlayStation Portable); 2.00)";
+const char kSafariUserAgent[] =
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.51.22 "
+    "(KHTML, like Gecko) Version/5.1.1 Safari/534.51.22";
+const char kOperaMobi9[] =
+    "Opera/9.51 Beta (Microsoft Windows; PPC; Opera Mobi/1718; U; en)";
+const char kFirefoxNokiaN800[] = /* This is a tablet */
+    "Mozilla/5.0 (X11; U; Linux armv6l; en-US; rv:1.9a6pre) Gecko/20070810 "
+    "Firefox/3.0a1 Tablet browser 0.1.16 RX-34_2007SE_4.2007.38-2";
+const char kIPadUserAgent[] =
+    "Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) "
+    "AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 "
+    "Mobile/7B334b Safari/531.21.10";
+const char kNexus7ChromeUserAgent[] =  // NOTE: no space before "(KHTML"
+    "Mozilla/5.0 (Linux; Android 4.2; Nexus 7 Build/JOP32C) AppleWebKit/535.19"
+    "(KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19";
+const char kAcceptHeaderValueNonMobile[] = "text/html";
+const char kAcceptHeaderValueMobile[] =
+    "text/html,application/vnd.wap.xhtml+xml";
+const char kXWapProfile[] = "x-wap-profile";
+const char kXWapProfileHeaderValue[] = "http://foo.bar.xml";
+
+}  // namespace UserAgentStrings
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_HTTP_PUBLIC_USER_AGENT_MATCHER_TEST_H_
diff --git a/psol/include/net/instaweb/http/public/wait_url_async_fetcher.h b/psol/include/net/instaweb/http/public/wait_url_async_fetcher.h
new file mode 100644
index 000000000..c1cb0928d
--- /dev/null
+++ b/psol/include/net/instaweb/http/public/wait_url_async_fetcher.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: sligocki@google.com (Shawn Ligocki)
+
+#ifndef NET_INSTAWEB_HTTP_PUBLIC_WAIT_URL_ASYNC_FETCHER_H_
+#define NET_INSTAWEB_HTTP_PUBLIC_WAIT_URL_ASYNC_FETCHER_H_
+
+#include <vector>
+
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/scoped_ptr.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/http/public/url_async_fetcher.h"
+
+namespace net_instaweb {
+
+class AsyncFetch;
+class AbstractMutex;
+class MessageHandler;
+class UrlFetcher;
+
+// Fake UrlAsyncFetcher which waits to call underlying blocking fetcher until
+// you explicitly call CallCallbacks().
+class WaitUrlAsyncFetcher : public UrlAsyncFetcher {
+ public:
+  WaitUrlAsyncFetcher(UrlFetcher* url_fetcher, AbstractMutex* mutex)
+      : url_fetcher_(url_fetcher),
+        pass_through_mode_(false),
+        mutex_(mutex) {
+  }
+  virtual ~WaitUrlAsyncFetcher();
+
+  // Initiate fetches that will finish when CallCallbacks is called.
+  virtual void Fetch(const GoogleString& url,
+                     MessageHandler* handler,
+                     AsyncFetch* fetch);
+
+  // Call all callbacks from previously initiated fetches.
+  void CallCallbacks();
+
+  // Sets a mode where no waiting occurs -- fetches propagate immediately.
+  // The previous mode is returned. When turning pass-through mode on,
+  // any pending callbacks are called.
+  bool SetPassThroughMode(bool pass_through_mode);
+
+ private:
+  class DelayedFetch;
+
+  bool CallCallbacksAndSwitchModesHelper(bool new_mode);
+
+  UrlFetcher* url_fetcher_;
+  std::vector<DelayedFetch*> delayed_fetches_;
+  bool pass_through_mode_;
+  scoped_ptr<AbstractMutex> mutex_;  // Holds the mutex passed to the ctor.
+
+  DISALLOW_COPY_AND_ASSIGN(WaitUrlAsyncFetcher);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_HTTP_PUBLIC_WAIT_URL_ASYNC_FETCHER_H_
diff --git a/psol/include/net/instaweb/http/public/wget_url_fetcher.h b/psol/include/net/instaweb/http/public/wget_url_fetcher.h
new file mode 100644
index 000000000..23bade10c
--- /dev/null
+++ b/psol/include/net/instaweb/http/public/wget_url_fetcher.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Authors: jmarantz@google.com (Joshua Marantz)
+//          vchudnov@google.com (Victor Chudnovsky)
+
+#ifndef NET_INSTAWEB_HTTP_PUBLIC_WGET_URL_FETCHER_H_
+#define NET_INSTAWEB_HTTP_PUBLIC_WGET_URL_FETCHER_H_
+
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/util/public/string_util.h"
+#include "net/instaweb/http/public/external_url_fetcher.h"
+
+namespace net_instaweb {
+
+class WgetUrlFetcher : public ExternalUrlFetcher {
+ public:
+  WgetUrlFetcher();
+  virtual ~WgetUrlFetcher() {}
+
+ private:  // NOTE(review): presumably ExternalUrlFetcher hooks -- confirm.
+  virtual GoogleString ConstructFetchCommand(
+      const GoogleString& escaped_url,
+      const char* user_agent,
+      const StringVector& escaped_headers);
+  virtual const char* GetFetchLabel();
+
+  DISALLOW_COPY_AND_ASSIGN(WgetUrlFetcher);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_HTTP_PUBLIC_WGET_URL_FETCHER_H_
diff --git a/psol/include/net/instaweb/http/public/write_through_http_cache.h b/psol/include/net/instaweb/http/public/write_through_http_cache.h
new file mode 100644
index 000000000..f141448d0
--- /dev/null
+++ b/psol/include/net/instaweb/http/public/write_through_http_cache.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: nikhilmadan@google.com (Nikhil Madan)
+
+#ifndef NET_INSTAWEB_HTTP_PUBLIC_WRITE_THROUGH_HTTP_CACHE_H_
+#define NET_INSTAWEB_HTTP_PUBLIC_WRITE_THROUGH_HTTP_CACHE_H_
+
+#include <cstddef>
+
+#include "net/instaweb/http/public/http_cache.h"
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/scoped_ptr.h"
+#include "net/instaweb/util/public/string.h"
+
+namespace net_instaweb {
+
+class CacheInterface;
+class Hasher;
+class HTTPValue;
+class MessageHandler;
+class Statistics;
+class Timer;
+
+// Composes two cache interfaces to form a two level http cache.
+class WriteThroughHTTPCache : public HTTPCache {
+ public:
+  static const size_t kUnlimited;
+
+  // Takes ownership of both caches passed in.
+  WriteThroughHTTPCache(CacheInterface* cache1, CacheInterface* cache2,
+                        Timer* timer, Hasher* hasher, Statistics* statistics);
+
+  virtual ~WriteThroughHTTPCache();
+
+  // Implements HTTPCache::SetIgnoreFailurePuts().
+  virtual void SetIgnoreFailurePuts();
+
+  // Implements HTTPCache::Find().
+  virtual void Find(const GoogleString& key, MessageHandler* handler,
+                    Callback* callback);
+
+  // Implements HTTPCache::Delete().
+  virtual void Delete(const GoogleString& key);
+
+  // Implements HTTPCache::set_force_caching().
+  virtual void set_force_caching(bool force);
+
+  // Implements HTTPCache::set_hasher(). Forwards the hasher to both levels.
+  virtual void set_hasher(Hasher* hasher) {
+    cache1_->set_hasher(hasher);
+    cache2_->set_hasher(hasher);
+  }
+
+  // Implements HTTPCache::set_disable_html_caching_on_https().
+  virtual void set_disable_html_caching_on_https(bool x);
+
+  // Implements HTTPCache::set_remember_not_cacheable_ttl_seconds().
+  virtual void set_remember_not_cacheable_ttl_seconds(int64 value);
+
+  // Implements HTTPCache::set_remember_fetch_failed_ttl_seconds().
+  virtual void set_remember_fetch_failed_ttl_seconds(int64 value);
+
+  // Implements HTTPCache::set_remember_fetch_dropped_ttl_seconds().
+  virtual void set_remember_fetch_dropped_ttl_seconds(int64 value);
+
+  // Implements HTTPCache::set_max_cacheable_response_content_length().
+  virtual void set_max_cacheable_response_content_length(int64 value);
+
+  // Implements HTTPCache::RememberNotCacheable().
+  virtual void RememberNotCacheable(const GoogleString& key,
+                                    bool is_200_status_code,
+                                    MessageHandler * handler);
+
+  // Implements HTTPCache::RememberFetchFailed().
+  virtual void RememberFetchFailed(const GoogleString& key,
+                                   MessageHandler * handler);
+
+  // Implements HTTPCache::RememberFetchDropped().
+  virtual void RememberFetchDropped(const GoogleString& key,
+                                    MessageHandler * handler);
+
+  // By default, all data goes into both cache1 and cache2. But
+  // if you only want to put small items in cache1, you can set the
+  // size limit. Note that both the key and value will count
+  // toward the size.
+  void set_cache1_limit(size_t limit) { cache1_size_limit_ = limit; }
+
+  virtual const char* Name() const { return name_.c_str(); }
+
+ protected:
+  // Implements HTTPCache::PutInternal().
+  virtual void PutInternal(const GoogleString& key, int64 start_us,
+                           HTTPValue* value);
+
+ private:
+  void PutInCache1(const GoogleString& key, HTTPValue* value);
+
+  scoped_ptr<HTTPCache> cache1_;
+  scoped_ptr<HTTPCache> cache2_;
+  size_t cache1_size_limit_;
+  GoogleString name_;
+
+  DISALLOW_COPY_AND_ASSIGN(WriteThroughHTTPCache);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_HTTP_PUBLIC_WRITE_THROUGH_HTTP_CACHE_H_
diff --git a/psol/include/net/instaweb/js/public/js_keywords.h b/psol/include/net/instaweb/js/public/js_keywords.h
new file mode 100644
index 000000000..f2635e3aa
--- /dev/null
+++ b/psol/include/net/instaweb/js/public/js_keywords.h
@@ -0,0 +1,141 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: jmarantz@google.com
+//
+// This is based on third_party/libpagespeed/src/pagespeed/js/js_minify.cc by
+// mdsteele@google.com
+
+#ifndef NET_INSTAWEB_JS_PUBLIC_JS_KEYWORDS_H_
+#define NET_INSTAWEB_JS_PUBLIC_JS_KEYWORDS_H_
+
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+class JsKeywords {
+ public:
+  enum Type {
+    // literals
+    kNull,
+    kTrue,
+    kFalse,
+
+    // keywords
+    kBreak,
+    kCase,
+    kCatch,
+    kConst,
+    kDefault,
+    kFinally,
+    kFor,
+    kInstanceof,
+    kNew,
+    kVar,
+    kContinue,
+    kFunction,
+    kReturn,
+    kVoid,
+    kDelete,
+    kIf,
+    kThis,
+    kDo,
+    kWhile,
+    kElse,
+    kIn,
+    kSwitch,
+    kThrow,
+    kTry,
+    kTypeof,
+    kWith,
+    kDebugger,
+
+    // reserved for future use
+    kClass,
+    kEnum,
+    kExport,
+    kExtends,
+    kImport,
+    kSuper,
+
+    // reserved for future use in strict code
+    kImplements,
+    kInterface,
+    kLet,
+    kPackage,
+    kPrivate,
+    kProtected,
+    kPublic,
+    kStatic,
+    kYield,
+
+    // Sentinel value for gperf. Must follow all keywords; see IsAKeyword().
+    kNotAKeyword,
+
+    // Other types of lexical tokens; returned by lexer, but not gperf.
+    kComment,
+    kWhitespace,
+    kLineSeparator,
+    kRegex,
+    kStringLiteral,
+    kNumber,
+    kOperator,
+    kIdentifier,
+    kEndOfInput
+  };
+
+  static bool IsAKeyword(Type type) { return type < kNotAKeyword; }
+
+  enum Flag {
+    kNone,
+    kIsValue,
+    kIsReservedNonStrict,
+    kIsReservedStrict
+  };
+
+  // Finds a Keyword based on a keyword string. If not found, returns
+  // kNotAKeyword. Otherwise, this always returns a Type for which
+  // IsAKeyword is true. NOTE(review): presumably also sets *flag -- confirm.
+  static Type Lookup(const StringPiece& name, Flag* flag);
+
+ private:
+  friend class JsLexer;
+
+  // Limited iterator (not an STL iterator). Example usage:
+  //    for (JsKeywords::Iterator iter; !iter.AtEnd(); iter.Next()) {
+  //      use(iter.keyword(), iter.name());
+  //    }
+  class Iterator {
+   public:
+    Iterator() : index_(-1) { Next(); }
+    bool AtEnd() const;
+    void Next();
+    Type keyword() const;
+    const char* name() const;
+
+   private:
+    int index_;
+
+    // Implicit copy and assign ok. The members can be safely copied by bits.
+  };
+
+  // Returns the number of keywords recognized by the Lookup function. This is
+  // used by the Lexer to size the keyword-string array prior to iterating over
+  // the keywords to populate it.
+  static int num_keywords();
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_JS_PUBLIC_JS_KEYWORDS_H_
diff --git a/psol/include/net/instaweb/js/public/js_lexer.h b/psol/include/net/instaweb/js/public/js_lexer.h
new file mode 100644
index 000000000..a94bdd848
--- /dev/null
+++ b/psol/include/net/instaweb/js/public/js_lexer.h
@@ -0,0 +1,106 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: jmarantz@google.com
+//
+// This is based on third_party/libpagespeed/src/pagespeed/js/js_minify.cc by
+// mdsteele@google.com
+
+#ifndef NET_INSTAWEB_JS_PUBLIC_JS_LEXER_H_
+#define NET_INSTAWEB_JS_PUBLIC_JS_LEXER_H_
+
+#include "net/instaweb/js/public/js_keywords.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+// Lexical analysis class for Javascript.
+class JsLexer {
+ public:
+  JsLexer();
+  void Lex(const StringPiece& contents);
+  const char* keyword_string(JsKeywords::Type keyword) {
+    return keyword_vector_[static_cast<int>(keyword)];
+  }
+
+  // Grabs the next token from the stream.
+  JsKeywords::Type NextToken(StringPiece* token);
+
+  // Was there an error in the stream?
+  bool error() const { return error_; }
+
+ private:
+  // Method used to determine whether we are still in a particular
+  // Lexer state.
+  typedef bool (JsLexer::*LexicalPredicate)(uint8 ch, int index);
+
+  JsKeywords::Type IdentifierOrKeyword(const StringPiece& name);
+  JsKeywords::Type NumberOrDot(const StringPiece& number_or_dot);
+
+  // Walks through input text looking for the end of the current token.
+  // When predicate(char, index) returns false, the token is over, and
+  // the character bounds of the token are presumably stored in '*token'
+  // (NOTE(review): earlier text described a callback 'fn' -- confirm).
+  //
+  // If 'include_last_char' is specified, then the terminating
+  // character is included in that StringPiece. If
+  // ok_to_terminate_with_eof is false and the input text ends before
+  // predicate() returns false, then an error is signaled (Lex() returns
+  // void, so presumably via error()). However, the in-progress token is
+  // still reported.
+  void Consume(LexicalPredicate predicate,
+               bool include_last_char,
+               bool ok_to_terminate_with_eof,
+               StringPiece* token);
+
+  bool IsSpace(uint8 ch, int index);
+  bool IsLineSeparator(uint8 ch, int index);
+  bool IsNumber(uint8 ch, int index);
+  bool InBlockComment(uint8 ch, int index);
+  bool InSingleLineComment(uint8 ch, int index);
+  bool InIdentifier(uint8 ch, int index);
+  bool InOperator(uint8 ch, int index);
+  bool InString(uint8 ch, int index);
+  bool InRegex(uint8 ch, int index);
+
+  // Returns 'true' if this is the start of an identifier.
+  bool IdentifierStart(uint8 ch);
+
+  // If the character is a backslash, updates backslash_mode_ and returns
+  // true, so the caller can skip over the next character, as indicated by
+  // lexical context.
+  bool ProcessBackslash(uint8 ch);
+
+  JsKeywords::Type ConsumeSlash(StringPiece* token);
+
+  StringPiece input_;
+  int index_;
+  int prev_char_;
+  int token_start_;
+  int token_start_index_;
+  int dot_count_;
+  bool error_;
+  bool backslash_mode_;
+  bool last_token_may_end_value_;
+  bool within_brackets_;
+  bool seen_a_dot_;
+
+  CharStarVector keyword_vector_;
+
+  DISALLOW_COPY_AND_ASSIGN(JsLexer);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_JS_PUBLIC_JS_LEXER_H_
diff --git a/psol/include/net/instaweb/js/public/js_minify.h b/psol/include/net/instaweb/js/public/js_minify.h
new file mode 100644
index 000000000..baa9cde4d
--- /dev/null
+++ b/psol/include/net/instaweb/js/public/js_minify.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: mmohabey@google.com (Megha Mohabey) + +#ifndef NET_INSTAWEB_JS_PUBLIC_JS_MINIFY_H_ +#define NET_INSTAWEB_JS_PUBLIC_JS_MINIFY_H_ + +#include "pagespeed/js/js_minify.h" + +#endif // NET_INSTAWEB_JS_PUBLIC_JS_MINIFY_H_ diff --git a/psol/include/net/instaweb/public/global_constants.h b/psol/include/net/instaweb/public/global_constants.h new file mode 100644 index 000000000..719e831ee --- /dev/null +++ b/psol/include/net/instaweb/public/global_constants.h @@ -0,0 +1,63 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: pradnya@google.com (Pradnya Karbhari) + +// Header file that includes global constants to be used in net/instaweb. + +#ifndef NET_INSTAWEB_PUBLIC_GLOBAL_CONSTANTS_H_ +#define NET_INSTAWEB_PUBLIC_GLOBAL_CONSTANTS_H_ + +namespace { + +// Time of day used in Chromium when running javascript in deterministic mode +// (with flag --no-js-randomness). We use the same time of day for slurping, +// validation and measurement in order to maintain consistency. +static const double kChromiumTimeOfDay = 1204251968254LL; + +const char kModPagespeedHeader[] = "X-Mod-Pagespeed"; +const char kPageSpeedHeader[] = "X-Page-Speed"; + +// String added to all subrequests (with version string). 
+const char kModPagespeedSubrequestUserAgent[] = "mod_pagespeed"; + +const char kWPTUserAgentIdentifier[] = "PTST"; + +// The name of the header used to specify the rewriters that were +// applied to the current request. +const char kPsaRewriterHeader[] = "X-PSA-Rewriter"; + +// The name of the header that publishers can use to send the time when the +// cacheable content on the page was last modified. This is used by the +// prioritize_visible_content filter to invalidate its cache. +const char kPsaLastModified[] = "X-PSA-Last-Modified"; + +// Noscript element that redirects to ModPagespeed=noscript. This is applied +// when a filter that inserts custom javascript is enabled. +const char kNoScriptRedirectFormatter[] = + "

Please click here " + "if you are not redirected within a few seconds.
"; + +// Link tag to be inserted on noscript redirect so that original URL is +// considered canonical. +const char kLinkRelCanonicalFormatter[] = + ""; + +} // namespace + +#endif // NET_INSTAWEB_PUBLIC_GLOBAL_CONSTANTS_H_ diff --git a/psol/include/net/instaweb/rewriter/google_analytics_snippet.h b/psol/include/net/instaweb/rewriter/google_analytics_snippet.h new file mode 100644 index 000000000..6f86e6ab7 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/google_analytics_snippet.h @@ -0,0 +1,79 @@ +/** + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// DO NOT EDIT. 
Generated by ./google_analytics_snippet_gen.py + +namespace net_instaweb { + +const char kGaSnippetGetTracker[] = "_modpagespeed_getRewriteTracker("; + +const char kGaSnippetPrefix[] = + "var _gaq = _gaq || [];\n" + "(function(){\n" + " function functionName(fn) {\n" + " var name = /\\W*function\\s+([\\w\\$]+)\\(/.exec(fn);\n" + " if (!name)\n" + " return 'No name';\n" + " return name[1];\n" + " }\n" + " var nameSpace = '_gat';\n" + " var existingGat = window[nameSpace];\n" + " if (existingGat && typeof existingGat['_getTracker'] == 'function') {\n" + " return;\n" + " }\n" + "\n" + " var gaqAccounts = [];\n" + " function setAccount(acct, prefix) {\n" + " if (gaqAccounts[prefix] != acct) {\n" + " gaqAccounts[prefix] = acct;\n" + " _gaq.push([prefix + '_setAccount', acct]);\n" + " }\n" + " }\n" + "\n" + " window['_modpagespeed_getRewriteTracker'] = function(tracker_acct,\n" + " tracker_name) {\n" + " var prefix = tracker_name ? tracker_name + '.' : '';\n" + "\n" + " function deferTrackerFunc(fn) {\n" + " return function() {\n" + " setAccount(tracker_acct, prefix);\n" + " var pushArgs = [fn];\n" + " [].push.apply(pushArgs, arguments);\n" + " _gaq.push(pushArgs);\n" + " };\n" + " }\n" + " var pageTrackerMethodNames = [\n"; + +const char kGaSnippetSuffix[] = + " ];\n" + " var pageTracker = {\n" + " initData: function() {},\n" + " };\n" + " for (var i=pageTrackerMethodNames.length; i--;) {\n" + " var n = pageTrackerMethodNames[i];\n" + " pageTracker[n] = deferTrackerFunc(prefix + n);\n" + " }\n" + " return pageTracker;\n" + " };\n" + "\n" + " var ga = document.createElement('script');\n" + " ga.type = 'text/javascript'; ga.async = true;\n" + " ga.src = ('https:' == document.location.protocol ? 
'https://ssl' :\n" + " 'http://www') + '.google-analytics.com/ga.js';\n" + " var s = document.getElementsByTagName('script')[0];\n" + " s.parentNode.insertBefore(ga, s);\n" + "})();\n"; +} // namespace net_instaweb diff --git a/psol/include/net/instaweb/rewriter/image_testing_peer.h b/psol/include/net/instaweb/rewriter/image_testing_peer.h new file mode 100644 index 000000000..fffd39361 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/image_testing_peer.h @@ -0,0 +1,47 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_REWRITER_IMAGE_TESTING_PEER_H_ +#define NET_INSTAWEB_REWRITER_IMAGE_TESTING_PEER_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/rewriter/public/image.h" + +namespace net_instaweb { + +class ImageDim; + +class ImageTestingPeer { // Static pass-throughs to Image::SetResizedDimensions() and Image::ShouldConvertToProgressive(); presumably a friend of Image for tests -- confirm in image.h. + public: + ImageTestingPeer() { } + + static void SetResizedDimensions(const ImageDim& dim, Image* image) { // Forwards dim to image->SetResizedDimensions(); image is dereferenced unchecked. + image->SetResizedDimensions(dim); + } + + static bool ShouldConvertToProgressive(int64 quality, Image* image) { // Returns image->ShouldConvertToProgressive(quality) unchanged. + return image->ShouldConvertToProgressive(quality); + } + + private: + DISALLOW_COPY_AND_ASSIGN(ImageTestingPeer); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_IMAGE_TESTING_PEER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/add_head_filter.h b/psol/include/net/instaweb/rewriter/public/add_head_filter.h new file mode 100644 index 000000000..7e5116ed2 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/add_head_filter.h @@ -0,0 +1,55 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_ADD_HEAD_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_ADD_HEAD_FILTER_H_ + +#include "net/instaweb/htmlparse/public/empty_html_filter.h" +#include "net/instaweb/util/public/basictypes.h" + +namespace net_instaweb { +class HtmlElement; +class HtmlParse; + +// Adds a 'head' element before the 'body', if none was found +// during parsing. This enables downstream filters to assume +// that there will be a head. NOTE(review): combine_multiple_heads is not described here; presumably it merges several head elements into one -- confirm in the .cc. +class AddHeadFilter : public EmptyHtmlFilter { + public: + explicit AddHeadFilter(HtmlParse* parser, bool combine_multiple_heads); // NOTE(review): 'explicit' on a two-argument constructor has no effect before C++11. + virtual ~AddHeadFilter(); + + virtual void StartDocument(); + virtual void StartElement(HtmlElement* element); + virtual void EndDocument(); + virtual void EndElement(HtmlElement* element); + virtual void Flush(); + virtual const char* Name() const { return "AddHead"; } + + private: + HtmlParse* html_parse_; + bool combine_multiple_heads_; + bool found_head_; + HtmlElement* head_element_; + + DISALLOW_COPY_AND_ASSIGN(AddHeadFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_ADD_HEAD_FILTER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/add_instrumentation_filter.h b/psol/include/net/instaweb/rewriter/public/add_instrumentation_filter.h new file mode 100644 index 000000000..f06e7ac0c --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/add_instrumentation_filter.h @@ -0,0 +1,73 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: abliss@google.com (Adam Bliss) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_ADD_INSTRUMENTATION_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_ADD_INSTRUMENTATION_FILTER_H_ + +#include "net/instaweb/htmlparse/public/empty_html_filter.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { + +class HtmlElement; +class RewriteDriver; +class Statistics; +class Variable; + +// Injects javascript instrumentation for monitoring page-rendering time. +class AddInstrumentationFilter : public EmptyHtmlFilter { + public: + static const char kLoadTag[]; + static const char kUnloadTag[]; + static GoogleString* kUnloadScriptFormatXhtml; // NOTE(review): non-const static pointer despite the k-prefix; where it is initialized is not visible here. + static GoogleString* kTailScriptFormatXhtml; // Same caveat as kUnloadScriptFormatXhtml. + + // Counters. + static const char kInstrumentationScriptAddedCount[]; + + explicit AddInstrumentationFilter(RewriteDriver* driver); + virtual ~AddInstrumentationFilter(); + + static void InitStats(Statistics* statistics); + + virtual void StartDocument(); + virtual void StartElement(HtmlElement* element); + virtual void EndElement(HtmlElement* element); + virtual const char* Name() const { return "AddInstrumentation"; } + + protected: + // The total number of times the instrumentation script is added. + Variable* instrumentation_script_added_count_; + + private: + // Adds a script node to the given element using the specified tag name. + // NOTE(review): earlier wording also mentioned a format; no such parameter exists. 
+ void AddScriptNode(HtmlElement* element, const GoogleString& tag_name); + + RewriteDriver* driver_; + bool found_head_; + bool added_tail_script_; + bool added_unload_script_; + + DISALLOW_COPY_AND_ASSIGN(AddInstrumentationFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_ADD_INSTRUMENTATION_FILTER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/ajax_rewrite_context.h b/psol/include/net/instaweb/rewriter/public/ajax_rewrite_context.h new file mode 100755 index 000000000..6adf44ec7 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/ajax_rewrite_context.h @@ -0,0 +1,183 @@ +/* + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: nikhilmadan@google.com (Nikhil Madan) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_AJAX_REWRITE_CONTEXT_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_AJAX_REWRITE_CONTEXT_H_ + +#include "net/instaweb/http/public/async_fetch.h" +#include "net/instaweb/http/public/content_type.h" +#include "net/instaweb/http/public/http_value.h" +#include "net/instaweb/http/public/http_value_writer.h" +#include "net/instaweb/http/public/response_headers.h" +#include "net/instaweb/rewriter/public/output_resource_kind.h" +#include "net/instaweb/rewriter/public/resource.h" +#include "net/instaweb/rewriter/public/server_context.h" +#include "net/instaweb/rewriter/public/resource_slot.h" +#include "net/instaweb/rewriter/public/rewrite_context.h" +#include "net/instaweb/rewriter/public/rewrite_options.h" +#include "net/instaweb/rewriter/public/single_rewrite_context.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/proto_util.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class InputInfo; +class MessageHandler; +class RewriteDriver; +class RewriteFilter; +class Statistics; +class UrlAsyncFetcher; +class Variable; + +// A resource-slot created for an ajax rewrite. This has an empty render method. +// Note that this class is usually used as a RefCountedPtr and gets deleted when +// there are no references remaining. +class AjaxRewriteResourceSlot : public ResourceSlot { + public: + explicit AjaxRewriteResourceSlot(const ResourcePtr& resource); + + // Implements ResourceSlot::Render(). Per the class comment, this does nothing. + virtual void Render(); + + // Implements ResourceSlot::LocationString(); always returns "ajax". + virtual GoogleString LocationString() { return "ajax"; } + + protected: + virtual ~AjaxRewriteResourceSlot(); // Protected; presumably so destruction only happens via the ref-count -- confirm in ResourceSlot. + + private: + DISALLOW_COPY_AND_ASSIGN(AjaxRewriteResourceSlot); +}; + +// Context that is used for an ajax rewrite. 
+class AjaxRewriteContext : public SingleRewriteContext { + public: + // Stats variable name to keep track of how often in-place falls back to + // stream (due to a large resource) when Options->in_place_wait_for_optimized + // is true. + static const char kInPlaceOversizedOptStream[]; + + AjaxRewriteContext(RewriteDriver* driver, const StringPiece& url); + virtual ~AjaxRewriteContext(); + + // Implements SingleRewriteContext::RewriteSingle(). + virtual void RewriteSingle(const ResourcePtr& input, + const OutputResourcePtr& output); + // Implements RewriteContext::id(). + virtual const char* id() const { return RewriteOptions::kAjaxRewriteId; } + // Implements RewriteContext::kind(). + virtual OutputResourceKind kind() const { return kRewrittenResource; } + // Implements RewriteContext::DecodeFetchUrls(). + virtual bool DecodeFetchUrls(const OutputResourcePtr& output_resource, + MessageHandler* message_handler, + GoogleUrlStarVector* url_vector); + // Implements RewriteContext::StartFetchReconstruction(). + virtual void StartFetchReconstruction(); + + static void InitStats(Statistics* statistics); + + bool perform_http_fetch() const { return perform_http_fetch_; } + void set_perform_http_fetch(bool x) { perform_http_fetch_ = x; } + + private: + friend class RecordingFetch; + // Implements RewriteContext::Harvest(). + virtual void Harvest(); + void StartFetchReconstructionParent(); + // Implements RewriteContext::FixFetchFallbackHeaders(). + virtual void FixFetchFallbackHeaders(ResponseHeaders* headers); + // Implements RewriteContext::FetchTryFallback(). + virtual void FetchTryFallback(const GoogleString& url, + const StringPiece& hash); + // Implements RewriteContext::FetchCallbackDone(). + virtual void FetchCallbackDone(bool success); + + RewriteFilter* GetRewriteFilter(const ContentType& type); + + // Update the date and expiry time based on the InputInfos. 
+ void UpdateDateAndExpiry(const protobuf::RepeatedPtrField<InputInfo>& inputs, + int64* date_ms, int64* expiry_ms); + + RewriteDriver* driver_; + GoogleString url_; + // Boolean indicating whether or not the resource was rewritten successfully. + bool is_rewritten_; + // The hash of the rewritten resource. Note that this should only be used if + // is_rewritten_ is true. This may be empty. + GoogleString rewritten_hash_; + + // Information needed for nested rewrites. + ResourcePtr input_resource_; + OutputResourcePtr output_resource_; + + scoped_ptr<UrlAsyncFetcher> cache_fetcher_; // NOTE(review): element type restored from the UrlAsyncFetcher forward declaration -- verify against upstream. + + // Should we fetch the contents if cache lookup fails? + bool perform_http_fetch_; + + DISALLOW_COPY_AND_ASSIGN(AjaxRewriteContext); +}; + +// Records the fetch into the provided resource and passes through events to the +// underlying writer, response headers and callback. +class RecordingFetch : public SharedAsyncFetch { + public: + RecordingFetch(AsyncFetch* async_fetch, + const ResourcePtr& resource, + AjaxRewriteContext* context, + MessageHandler* handler); + + virtual ~RecordingFetch(); + + // Implements SharedAsyncFetch::HandleHeadersComplete(). + virtual void HandleHeadersComplete(); + // Implements SharedAsyncFetch::HandleWrite(). + virtual bool HandleWrite(const StringPiece& content, MessageHandler* handler); + // Implements SharedAsyncFetch::HandleFlush(). + virtual bool HandleFlush(MessageHandler* handler); + // Implements SharedAsyncFetch::HandleDone(). + virtual void HandleDone(bool success); + + private: + void FreeDriver(); + + bool CanAjaxRewrite(); + + // By default RecordingFetch streams back the original content to the browser. + // If this returns false then the RecordingFetch should cache the original + // content but not stream it. 
+ bool ShouldStream(); + + MessageHandler* handler_; + ResourcePtr resource_; + AjaxRewriteContext* context_; + bool can_ajax_rewrite_; + bool streaming_; + HTTPValue cache_value_; + HTTPValueWriter cache_value_writer_; + ResponseHeaders saved_headers_; + Variable* in_place_oversized_opt_stream_; + DISALLOW_COPY_AND_ASSIGN(RecordingFetch); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_AJAX_REWRITE_CONTEXT_H_ diff --git a/psol/include/net/instaweb/rewriter/public/association_transformer.h b/psol/include/net/instaweb/rewriter/public/association_transformer.h new file mode 100644 index 000000000..26a04db92 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/association_transformer.h @@ -0,0 +1,129 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: sligocki@google.com (Shawn Ligocki) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_ASSOCIATION_TRANSFORMER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_ASSOCIATION_TRANSFORMER_H_ + +#include <map> + +#include "net/instaweb/rewriter/public/css_tag_scanner.h" +#include "net/instaweb/rewriter/public/resource.h" +#include "net/instaweb/rewriter/public/resource_slot.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/gtest_prod.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class GoogleUrl; +class MessageHandler; + +// Transformer that uses a std::map to specify which URLs to rewrite to +// which other URLs. +// Used by CssFilter to rewrite subresources in CSS even when it cannot +// be parsed, by using AssociationSlots to update the map before transforming. +class AssociationTransformer : public CssTagScanner::Transformer { + public: + // base_url is the URL all CSS url()s should be absolutified against, + // this is generally the URL for the CSS file or HTML file for inline CSS. + // backup_transformer is another transformer to be applied if no + // association has been set in AssociationTransformer's map_. It may be + // set to NULL if no backup is needed. + // + // base_url, backup_transformer and handler must live longer than + // AssociationTransformer. + AssociationTransformer(const GoogleUrl* base_url, + CssTagScanner::Transformer* backup_transformer, + MessageHandler* handler) + : base_url_(base_url), backup_transformer_(backup_transformer), + handler_(handler) {} + virtual ~AssociationTransformer(); + + // Map is exposed so that you can set associations. + // Each key -> value specifies that every instance of the absolute URL + // key should be transformed to the absolute URL value. + StringStringMap* map() { return &map_; } + + // To do the actual transformation. 
Call CssTagScanner::TransformUrls() + // with this AssociationTransformer which will call Transform() on all URLs. + // Transform will look up all (absolutified) URLs in map_ and rewrite them + // if present (otherwise it will pass them to the backup_transformer_). + virtual TransformStatus Transform(const StringPiece& in, GoogleString* out); + + private: + // Mapping of input URLs to output URLs. + StringStringMap map_; + + // Base URL for CSS file, needed to absolutify URLs in Transform. + const GoogleUrl* base_url_; + + // Transformer to be applied to URLs we don't rewrite. For example, we might + // want to make sure we absolutify all URLs, even if we don't rewrite them. + CssTagScanner::Transformer* backup_transformer_; + + MessageHandler* handler_; + + FRIEND_TEST(AssociationTransformerTest, TransformsCorrectly); + + DISALLOW_COPY_AND_ASSIGN(AssociationTransformer); +}; + +// Extremely simple slot which just sets an association in a std::map when +// it is Render()ed. It associates the key (input URL) with this slot's +// resource URL (the output URL). +// Can be used to set AssociationTransformer::map() so that +// AssociationTransformer::Transform() will rewrite the rendered URLs. +class AssociationSlot : public ResourceSlot { + public: + // Note: map must outlive AssociationSlot. + AssociationSlot(ResourcePtr resource, + StringStringMap* map, const StringPiece& key) + : ResourceSlot(resource), map_(map) { + key.CopyToString(&key_); + } + virtual ~AssociationSlot(); + + // All Render() calls are from the same thread, so this doesn't need to be + // thread-safe. + virtual void Render() { + if (!disable_rendering()) { + (*map_)[key_] = resource()->url(); + } + } + + virtual void DirectSetUrl(const StringPiece& url) { // Unlike Render(), this writes the map entry without consulting disable_rendering(). + url.CopyToString(&((*map_)[key_])); + } + + virtual GoogleString LocationString() { + // TODO(sligocki): Improve quality of this diagnostic. + // Also improve CssResourceSlot::LocationString() which is identical. 
+ return "Inside CSS"; + } + + private: + StringStringMap* map_; + GoogleString key_; + + DISALLOW_COPY_AND_ASSIGN(AssociationSlot); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_ASSOCIATION_TRANSFORMER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/base_tag_filter.h b/psol/include/net/instaweb/rewriter/public/base_tag_filter.h new file mode 100644 index 000000000..acfc02913 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/base_tag_filter.h @@ -0,0 +1,55 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_BASE_TAG_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_BASE_TAG_FILTER_H_ + +#include "net/instaweb/htmlparse/public/empty_html_filter.h" +#include "net/instaweb/util/public/basictypes.h" + +namespace net_instaweb { + +class HtmlElement; +class RewriteDriver; + +// Add this filter into the HtmlParse chain to add a base +// tag into the head section of an HTML document. 
+class BaseTagFilter : public EmptyHtmlFilter { + public: + explicit BaseTagFilter(RewriteDriver* driver) + : added_base_tag_(false), + driver_(driver) {} + + virtual ~BaseTagFilter(); + + virtual void StartDocument() { // Resets added_base_tag_ to false. + added_base_tag_ = false; + } + virtual void StartElement(HtmlElement* element); + virtual const char* Name() const { return "BaseTag"; } + + private: + bool added_base_tag_; // Starts false; presumably set once StartElement() has inserted the tag -- confirm in the .cc. + RewriteDriver* driver_; // Stored as given to the constructor; this class does not delete it here. + + DISALLOW_COPY_AND_ASSIGN(BaseTagFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_BASE_TAG_FILTER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/beacon_critical_images_finder.h b/psol/include/net/instaweb/rewriter/public/beacon_critical_images_finder.h new file mode 100644 index 000000000..9b35abf57 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/beacon_critical_images_finder.h @@ -0,0 +1,55 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +// Author: jud@google.com (Jud Porter) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_BEACON_CRITICAL_IMAGES_FINDER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_BEACON_CRITICAL_IMAGES_FINDER_H_ + +#include "net/instaweb/rewriter/public/critical_images_finder.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class RewriteDriver; +class Statistics; + +// Support critical (above the fold) image detection through a javascript beacon +// on the client. 
+// TODO(jud): This class is not yet implemented. +class BeaconCriticalImagesFinder : public CriticalImagesFinder { + public: + static const char kBeaconCohort[]; + + explicit BeaconCriticalImagesFinder(Statistics* stats); + virtual ~BeaconCriticalImagesFinder(); + + virtual bool IsMeaningful() const { + // TODO(jud): This class is not implemented yet; change this when + // it is functional. + return false; + } + + virtual void ComputeCriticalImages(StringPiece url, + RewriteDriver* driver); + + virtual const char* GetCriticalImagesCohort() const { // Always returns kBeaconCohort. + return kBeaconCohort; + } +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_BEACON_CRITICAL_IMAGES_FINDER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/blink_background_filter.h b/psol/include/net/instaweb/rewriter/public/blink_background_filter.h new file mode 100644 index 000000000..566cf69fc --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/blink_background_filter.h @@ -0,0 +1,51 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: rahulbansal@google.com (Rahul Bansal) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_BLINK_BACKGROUND_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_BLINK_BACKGROUND_FILTER_H_ + +#include "net/instaweb/htmlparse/public/empty_html_filter.h" +#include "net/instaweb/rewriter/public/script_tag_scanner.h" +#include "net/instaweb/util/public/basictypes.h" + +namespace net_instaweb { + +class HtmlElement; +class RewriteDriver; +class RewriteOptions; + +// This class does the preprocessing required to apply blink. It overrides only StartElement() and Name(); note that Name() reports "ProcessBlinkInBackgroundFilter", not the class name. +class BlinkBackgroundFilter : public EmptyHtmlFilter { + public: + explicit BlinkBackgroundFilter(RewriteDriver* rewrite_driver); + virtual ~BlinkBackgroundFilter(); + + virtual void StartElement(HtmlElement* element); + virtual const char* Name() const { return "ProcessBlinkInBackgroundFilter"; } + + private: + RewriteDriver* rewrite_driver_; + const RewriteOptions* rewrite_options_; // NOTE(review): presumably taken from rewrite_driver_ in the constructor -- confirm in the .cc. + ScriptTagScanner script_tag_scanner_; + + DISALLOW_COPY_AND_ASSIGN(BlinkBackgroundFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_BLINK_BACKGROUND_FILTER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/blink_critical_line_data_finder.h b/psol/include/net/instaweb/rewriter/public/blink_critical_line_data_finder.h new file mode 100644 index 000000000..73704ce48 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/blink_critical_line_data_finder.h @@ -0,0 +1,63 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: pulkitg@google.com (Pulkit Goyal) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_BLINK_CRITICAL_LINE_DATA_FINDER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_BLINK_CRITICAL_LINE_DATA_FINDER_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class BlinkCriticalLineData; +class PropertyPage; +class ResponseHeaders; +class RewriteDriver; + +// Finds BlinkCriticalLineData from the given html content. This information +// will be used by BlinkFlowCriticalLine. +// TODO(pulkitg): Rethink about the naming and structure of this class. +class BlinkCriticalLineDataFinder { + public: + static const char kBlinkCohort[]; + BlinkCriticalLineDataFinder(); + virtual ~BlinkCriticalLineDataFinder(); + + // Gets BlinkCriticalLineData from the given PropertyPage. + virtual BlinkCriticalLineData* ExtractBlinkCriticalLineData( + int64 cache_time_ms, PropertyPage* page, int64 now_ms, bool diff_enabled, + bool propagate_cache_deletes); + + // Computes BlinkCriticalLineData for the given html content. 
+ virtual void ComputeBlinkCriticalLineData( + const GoogleString& computed_hash, + const GoogleString& computed_hash_smart_diff, + const StringPiece html_content, + const ResponseHeaders* response_headers, + RewriteDriver* driver); + + virtual void PropagateCacheDeletes(const GoogleString& key); + + private: + DISALLOW_COPY_AND_ASSIGN(BlinkCriticalLineDataFinder); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_BLINK_CRITICAL_LINE_DATA_FINDER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/blink_filter.h b/psol/include/net/instaweb/rewriter/public/blink_filter.h new file mode 100644 index 000000000..d383b4d4a --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/blink_filter.h @@ -0,0 +1,90 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: rahulbansal@google.com (Rahul Bansal) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_BLINK_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_BLINK_FILTER_H_ + +#include + +#include "net/instaweb/htmlparse/public/html_writer_filter.h" +#include "net/instaweb/rewriter/blink_critical_line_data.pb.h" +#include "net/instaweb/rewriter/public/blink_util.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/property_cache.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" +#include "net/instaweb/util/public/string_writer.h" +#include "net/instaweb/util/public/json.h" + +namespace net_instaweb { + +class HtmlElement; +class RewriteDriver; +class RewriteOptions; + +// This class extracts the non cacheable panels, looks up the non critical +// content in property cache and sends it to the client, +class BlinkFilter : public HtmlWriterFilter { + public: + // TODO(rahulbansal): Move these consts to appropriate file. + static const char kBlinkCriticalLineDataPropertyName[]; + static const char kBlinkCohort[]; + static const char kRefreshPageJs[]; + + explicit BlinkFilter(RewriteDriver* rewrite_driver); + virtual ~BlinkFilter(); + + void StartDocument(); + void StartElement(HtmlElement* element); + void EndElement(HtmlElement* element); + void EndDocument(); + void WriteString(StringPiece str); + void Flush(); + virtual const char* Name() const { return "BlinkFilter"; } + + private: + void SendCookies(); + void SendNonCriticalJson(GoogleString* str); + void ServeNonCriticalPanelContents(); + void SendNonCacheableObject(const Json::Value& json); + void ObtainBlinkCriticalLineData(); + void HandleLastModifiedChange(); + // Produces a custom xpath relative to the body or relative to the nearest + // ancestor with an id (if there is one). Xpath comprises of the tag name + // and the id (if it exists) or the position of the elements. 
+ GoogleString GetXpathOfCurrentElement(HtmlElement* element); + + RewriteDriver* rewrite_driver_; // We do not own this. + const RewriteOptions* rewrite_options_; // We do not own this. + AttributesToNonCacheableValuesMap attribute_non_cacheable_values_map_; + std::vector panel_number_num_instances_; + GoogleString buffer_; + StringWriter string_writer_; + const HtmlElement* current_non_cacheable_element_; // We do not own this. + GoogleString current_panel_id_; + const PropertyCache::Cohort* cohort_; // We do not own this. + BlinkCriticalLineData blink_critical_line_data_; + bool abort_filter_; + std::vector num_children_stack_; + + DISALLOW_COPY_AND_ASSIGN(BlinkFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_BLINK_FILTER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/blink_util.h b/psol/include/net/instaweb/rewriter/public/blink_util.h new file mode 100644 index 000000000..c6bb28fc7 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/blink_util.h @@ -0,0 +1,119 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: gagansingh@google.com (Gagan Singh) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_BLINK_UTIL_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_BLINK_UTIL_H_ + +#include +#include +#include + +#include "net/instaweb/util/public/json.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class AsyncFetch; +class GoogleUrl; +class HtmlElement; +class Panel; +class PanelSet; +class ServerContext; +class RewriteOptions; +class UserAgentMatcher; + +typedef std::map PanelIdToSpecMap; +typedef std::multimap, + StringCompareInsensitive> AttributesToNonCacheableValuesMap; + +namespace BlinkUtil { + +const char kContiguous[] = "contiguous"; +const char kCritical[] = "critical"; +const char kPanelId[] = "panel-id"; +const char kImages[] = "images"; +const char kInstanceHtml[] = "instance_html"; +const char kStartBodyMarker[] = ""; +const char kEndBodyTag[] = ""; +const char kLayoutMarker[] = ""; +const char kJsonCachePrefix[] = "json:"; +// TODO(mmohabey): Use RewriteDriver::kStatusCodePropertyName here. +const char kBlinkResponseCodePropertyName[] = "blink_last_response_code"; +const char kXpath[] = "xpath"; +// TODO(rahulbansal): Use these constants everywhere in the code from here. +const char kBlinkCohort[] = "blink"; +const char kBlinkCriticalLineDataPropertyName[] = "blink_critical_line_data"; +const char kComputeVisibleTextFilterOutputEndMarker[] = + ""; + +// Checks whether the user agent is allowed to go into the blink flow. +bool IsUserAgentAllowedForBlink(AsyncFetch* async_fetch, + const RewriteOptions* options, + const char* user_agent, + const UserAgentMatcher& user_agent_matcher); + +// Checks whether the request for 'url' is a valid blink request. +bool IsBlinkRequest(const GoogleUrl& url, + AsyncFetch* async_fetch, + const RewriteOptions* options, + const char* user_agent, + const UserAgentMatcher& user_agent_matcher_); + +// Checks if blink critical line flow can be applied. 
+bool ShouldApplyBlinkFlowCriticalLine( + const ServerContext* manager, + const RewriteOptions* options); + +// Returns true if json has only miscellaneous(like 'contiguous') +// atributes. +bool IsJsonEmpty(const Json::Value& json); + +// Clears the json array if all objects are empty. +void ClearArrayIfAllEmpty(Json::Value* json); + +// Computes panel id to specification map and returns if any non cacheable +// panels are present. +bool ComputePanels(const PanelSet* panel_set_, + PanelIdToSpecMap* panel_id_to_spec); + +// Escapes < and > with __psa_lt; and __psa_gt; respectively. +void EscapeString(GoogleString* str); + +// TODO(rahulbansal): Move this function to net/instaweb/util/string_util +bool StripTrailingNewline(GoogleString* s); + +// Populates the attributes to non cacheable values map. +void PopulateAttributeToNonCacheableValuesMap( + const RewriteOptions* rewrite_options, + const GoogleUrl& url, + AttributesToNonCacheableValuesMap* attribute_non_cacheable_values_map, + std::vector* panel_number_num_instances); + +// Returns panel number for non cacheable element. If cacheable returns -1. +int GetPanelNumberForNonCacheableElement( + const AttributesToNonCacheableValuesMap& attribute_non_cacheable_values_map, + const HtmlElement* element); + +// Gets panel id for the given panel instance. +GoogleString GetPanelId(int panel_number, int instance_number); +} // namespace BlinkUtil + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_BLINK_UTIL_H_ diff --git a/psol/include/net/instaweb/rewriter/public/cache_extender.h b/psol/include/net/instaweb/rewriter/public/cache_extender.h new file mode 100644 index 000000000..05ffc4787 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/cache_extender.h @@ -0,0 +1,93 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_CACHE_EXTENDER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_CACHE_EXTENDER_H_ + +#include "net/instaweb/rewriter/public/resource.h" // for ResourcePtr +#include "net/instaweb/rewriter/public/server_context.h" +#include "net/instaweb/rewriter/public/resource_slot.h" +#include "net/instaweb/rewriter/public/rewrite_filter.h" +#include "net/instaweb/rewriter/public/rewrite_options.h" +#include "net/instaweb/rewriter/public/rewrite_result.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class HtmlElement; +class ResponseHeaders; +class RewriteContext; +class RewriteDriver; +class Statistics; +class Variable; + +// Rewrites resources without changing their content -- just their +// URLs and headers. The original intent of this filter was limited +// to cache extension. However, its scope has been expanded to include +// domain sharding and moving static resources to cookieless domains or +// CDNs. +// +// TODO(jmarantz): rename this class to something more generic, like +// RenameUrlFilter or ProxyUrlFilter. 
+class CacheExtender : public RewriteFilter { + public: + static const char kCacheExtensions[]; + static const char kNotCacheable[]; + + explicit CacheExtender(RewriteDriver* driver); + virtual ~CacheExtender(); + + static void InitStats(Statistics* statistics); + + virtual void StartDocumentImpl() {} + virtual void StartElementImpl(HtmlElement* element); + virtual void EndElementImpl(HtmlElement* element) {} + + virtual const char* Name() const { return "CacheExtender"; } + virtual const char* id() const { return RewriteOptions::kCacheExtenderId; } + + // Creates a nested rewrite for given parent and slot, and returns it. + // The result is not registered with the parent. + RewriteContext* MakeNestedContext(RewriteContext* parent, + const ResourceSlotPtr& slot); + + protected: + virtual bool ComputeOnTheFly() const; + virtual RewriteContext* MakeRewriteContext(); + + private: + class Context; + friend class Context; + + RewriteResult RewriteLoadedResource(const ResourcePtr& input_resource, + const OutputResourcePtr& output_resource); + + bool ShouldRewriteResource( + const ResponseHeaders* headers, int64 now_ms, + const ResourcePtr& input_resource, const StringPiece& url) const; + + Variable* extension_count_; + Variable* not_cacheable_count_; + + DISALLOW_COPY_AND_ASSIGN(CacheExtender); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_CACHE_EXTENDER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/collapse_whitespace_filter.h b/psol/include/net/instaweb/rewriter/public/collapse_whitespace_filter.h new file mode 100644 index 000000000..e1e0be76b --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/collapse_whitespace_filter.h @@ -0,0 +1,62 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: mdsteele@google.com (Matthew D. Steele) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_COLLAPSE_WHITESPACE_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_COLLAPSE_WHITESPACE_FILTER_H_ + +#include + +#include "net/instaweb/htmlparse/public/empty_html_filter.h" +#include "net/instaweb/htmlparse/public/html_name.h" +#include "net/instaweb/util/public/basictypes.h" + +namespace net_instaweb { +class HtmlParse; +class HtmlElement; +class HtmlCharactersNode; + +// Reduce the size of the HTML by collapsing whitespace (except within certain +// tags, e.g.
 and 
+//   
+//    
+//   
+//  
+// 
+//
+// The script above, which converts pagespeed_iframe to iframe, is deferred by
+// JsDeferDisabledJavascriptFilter, so loading of the iframe is also deferred.
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_DEFER_IFRAME_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_DEFER_IFRAME_FILTER_H_
+
+#include "net/instaweb/htmlparse/public/empty_html_filter.h"
+#include "net/instaweb/util/public/basictypes.h"
+
+namespace net_instaweb {
+
+class HtmlElement;
+class RewriteDriver;
+class StaticJavascriptManager;
+
+class DeferIframeFilter : public EmptyHtmlFilter {
+ public:
+  static const char kDeferIframeInit[];
+  static const char kDeferIframeIframeJs[];
+  explicit DeferIframeFilter(RewriteDriver* driver);
+  virtual ~DeferIframeFilter();
+
+  virtual void StartDocument();
+  virtual void StartElement(HtmlElement* element);
+  virtual void EndElement(HtmlElement* element);
+  virtual void DetermineEnabled();
+
+  virtual const char* Name() const { return "DeferIframe"; }
+
+ private:
+  RewriteDriver* driver_;
+  StaticJavascriptManager* static_js_manager_;
+  bool script_inserted_;
+
+  DISALLOW_COPY_AND_ASSIGN(DeferIframeFilter);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_DEFER_IFRAME_FILTER_H_
diff --git a/psol/include/net/instaweb/rewriter/public/delay_images_filter.h b/psol/include/net/instaweb/rewriter/public/delay_images_filter.h
new file mode 100644
index 000000000..ab5bcaf12
--- /dev/null
+++ b/psol/include/net/instaweb/rewriter/public/delay_images_filter.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: pulkitg@google.com (Pulkit Goyal)
+//
+// Contains implementation of DelayImagesFilter, which delays all the high
+// quality images whose low quality inlined data url are available within their
+// respective image tag like
+// .
+//
+// This filter extracts such low res data urls and generates a map from them.
+// This map will be embedded inside HTML at the end of body tag with a script
+// whose function is to put low res src into respective image tag. Another
+// script which replaces low quality images with high quality images is also
+// embedded.
+//
+// This filter will work in conjunction with image_rewrite_filter which
+// generates data url for low quality images and embeds them with their
+// respective img tags.
+//
+// To avoid drastic reflows, we also need to switch on insert_image_dimensions.
+//
+// Html input to this filter looks like:
+// 
+//  
+//  
+//  
+//   
+//  
+// 
+//
+// The html input above looks like this because the image_rewrite_filter has
+// already replaced  with
+// .
+//
+// Output for the above html will be:
+// 
+//  
+//   
+//  
+//  
+//   
+//   
+//  
+// 
+//
+// Bottom-of-page script actually includes the image data for the low-resolution
+// images, and those are put in place as soon as control reaches there. High
+// quality images are downloaded after all the low quality images are placed
+// by delay script.
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_DELAY_IMAGES_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_DELAY_IMAGES_FILTER_H_
+
+#include "net/instaweb/htmlparse/public/empty_html_filter.h"
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+class HtmlElement;
+class RewriteDriver;
+class StaticJavascriptManager;
+class Statistics;
+
+class DelayImagesFilter : public EmptyHtmlFilter {
+ public:
+  static const char kDelayImagesSuffix[];
+  static const char kDelayImagesInlineSuffix[];
+  static const char kOnloadFunction[];
+
+  explicit DelayImagesFilter(RewriteDriver* driver);
+  virtual ~DelayImagesFilter();
+
+  virtual void StartDocument();
+  virtual void EndDocument();
+  virtual void EndElement(HtmlElement* element);
+
+  virtual const char* Name() const { return "DelayImages"; }
+
+  static void InitStats(Statistics* statistics);
+  static void Terminate();
+
+ private:
+  // Creates a script node containing the kDelayImagesSuffix js and appends it
+  // just after element.
+  void InsertDelayImagesJS(HtmlElement* element);
+
+  // Creates a script node containing the kDelayImagesInlineSuffix js and
+  // appends it just after element.
+  void InsertDelayImagesInlineJS(HtmlElement* element);
+
+  RewriteDriver* driver_;
+  StaticJavascriptManager* static_js_manager_;
+
+  // pagespeed_low_res_src will be added to the low_res_data_map_ while
+  // low_res_map_inserted_ is false. As soon as low_res_map_inserted_ is true,
+  // there is no further addition to low_res_data_map_.
+  bool low_res_map_inserted_;
+  int num_low_res_inlined_images_;
+  StringStringMap low_res_data_map_;
+
+  // If true, the image url is replaced in place with the low res base64
+  // encoded url; otherwise low_res_data_map_ containing the low res images is
+  // inserted at the end of the body tag.
+  bool insert_low_res_images_inplace_;
+
+  // is_experimental_enabled_ is set to true if
+  // enable_inline_preview_images_experimental is true.
+  bool is_experimental_enabled_;
+  DISALLOW_COPY_AND_ASSIGN(DelayImagesFilter);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_DELAY_IMAGES_FILTER_H_
diff --git a/psol/include/net/instaweb/rewriter/public/detect_reflow_js_defer_filter.h b/psol/include/net/instaweb/rewriter/public/detect_reflow_js_defer_filter.h
new file mode 100644
index 000000000..5b211834e
--- /dev/null
+++ b/psol/include/net/instaweb/rewriter/public/detect_reflow_js_defer_filter.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: atulvasu@google.com (Atul Vasu)
+//         sriharis@google.com (Srihari Sukumaran)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_DETECT_REFLOW_JS_DEFER_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_DETECT_REFLOW_JS_DEFER_FILTER_H_
+
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/htmlparse/public/empty_html_filter.h"
+
+namespace net_instaweb {
+
+class RewriteDriver;
+class HtmlElement;
+class Statistics;
+
+// Similar to JsDeferDisabledFilter, but adds some extra js to figure out
+// potential page rendering reflows due to deferred script execution.
+class DetectReflowJsDeferFilter : public EmptyHtmlFilter {
+ public:
+  explicit DetectReflowJsDeferFilter(RewriteDriver* driver);
+  virtual ~DetectReflowJsDeferFilter();
+
+  virtual void StartDocument();
+  virtual void StartElement(HtmlElement* element);
+  virtual void EndElement(HtmlElement* element);
+  virtual void EndDocument();
+  virtual const char* Name() const { return "DetectReflowJsDeferFilter"; }
+
+  void InsertDetectReflowCode(HtmlElement* element);
+
+  static void InitStats(Statistics* statistics);
+  static void Terminate();
+
+ private:
+  RewriteDriver* rewrite_driver_;
+
+  // Presumably set once the end-of-BODY script is written; TODO confirm in .cc.
+  bool script_written_;
+  bool defer_js_enabled_;
+  bool debug_;
+
+  DISALLOW_COPY_AND_ASSIGN(DetectReflowJsDeferFilter);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_DETECT_REFLOW_JS_DEFER_FILTER_H_
diff --git a/psol/include/net/instaweb/rewriter/public/deterministic_js_filter.h b/psol/include/net/instaweb/rewriter/public/deterministic_js_filter.h
new file mode 100644
index 000000000..d78aa9bf5
--- /dev/null
+++ b/psol/include/net/instaweb/rewriter/public/deterministic_js_filter.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: mmohabey@google.com (Megha Mohabey)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_DETERMINISTIC_JS_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_DETERMINISTIC_JS_FILTER_H_
+
+#include "net/instaweb/htmlparse/public/empty_html_filter.h"
+#include "net/instaweb/util/public/basictypes.h"
+
+namespace net_instaweb {
+
+class HtmlElement;
+class RewriteDriver;
+
+// Injects javascript at the beginning of the head tag to make script execution
+// deterministic. The JS redefines functions like Math.random and Date. This
+// filter is useful for testing and measurement but does not provide any latency
+// gains. A head element is added if it is not already present in the html.
+class DeterministicJsFilter : public EmptyHtmlFilter {
+ public:
+  explicit DeterministicJsFilter(RewriteDriver* driver);
+  virtual ~DeterministicJsFilter();
+
+  virtual void StartDocument();
+  virtual void StartElement(HtmlElement* element);
+  virtual const char* Name() const { return "DeterministicJs"; }
+
+ private:
+  RewriteDriver* driver_;
+  bool found_head_;
+
+  DISALLOW_COPY_AND_ASSIGN(DeterministicJsFilter);
+};
+
+}  // namespace net_instaweb
+
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_DETERMINISTIC_JS_FILTER_H_
diff --git a/psol/include/net/instaweb/rewriter/public/div_structure_filter.h b/psol/include/net/instaweb/rewriter/public/div_structure_filter.h
new file mode 100644
index 000000000..43f08b5eb
--- /dev/null
+++ b/psol/include/net/instaweb/rewriter/public/div_structure_filter.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: jhoch@google.com (Jason R. Hoch)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_DIV_STRUCTURE_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_DIV_STRUCTURE_FILTER_H_
+
+#include 
+#include "net/instaweb/htmlparse/public/empty_html_filter.h"
+#include "net/instaweb/util/public/string.h"
+
+namespace net_instaweb {
+
+class HtmlElement;
+
+// This filter as it stands adds to all anchor href's a special query parameter,
+// unique for each link, representing vaguely the link's location on a page,
+// based on the div structure of the page.
+//
+// In its current simple/functional form, the query parameters are of the form
+// "0.1.0.3", a sort of series of DOM-coordinates of a DOM restricted to 
+// and elements. This example could be the 4th link in the first div +// of the second div of the first main div, or the 2nd link (following 2 divs) +// in the first div in the first div (following one link) in the first top-level +// div. +// +// TODO(jhoch): Next step is to encode/condense these parameter values (at the +// very least use a base higher than 10). +class DivStructureFilter : public EmptyHtmlFilter { + public: + static const char kParamName[]; + + explicit DivStructureFilter(); + virtual ~DivStructureFilter(); + + virtual void StartDocument(); + virtual void StartElement(HtmlElement* element); + virtual void EndElement(HtmlElement* element); + virtual const char* Name() const { return "DivStructureFilter"; } + + static GoogleString GetDivCountStackEncoding( + const std::vector& div_count_stack); + + private: + std::vector div_count_stack_; +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_DIV_STRUCTURE_FILTER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/domain_lawyer.h b/psol/include/net/instaweb/rewriter/public/domain_lawyer.h new file mode 100644 index 000000000..fc639ff64 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/domain_lawyer.h @@ -0,0 +1,320 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) +// +// This class manages the relationships between domains and resources. 
+//
+// The Lawyer keeps track of which domains we are allowed to rewrite, including
+// whether multiple resources can be bundled together.
+//
+// The Lawyer keeps track of domain mappings to move resources onto a CDN or
+// onto a cookieless domain.
+//
+// The Lawyer keeps track of domain sharding, for distributing resources across
+// equivalent domains to improve browser download parallelism.
+//
+// The class here holds state based on the configuration files
+// (e.g. Apache .conf).
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_DOMAIN_LAWYER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_DOMAIN_LAWYER_H_
+
+#include <map>
+#include <vector>
+
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+class GoogleUrl;
+class MessageHandler;
+
+class DomainLawyer {
+ public:
+  DomainLawyer() : can_rewrite_domains_(false) {}
+  ~DomainLawyer();
+
+  // Determines whether a resource can be rewritten, and returns the domain
+  // that it should be written to. The domain and the path of the resolved
+  // request are considered - first just the domain, then the domain plus the
+  // root of the path, and so on down the path until a match is found or the
+  // path is exhausted; this is done because we can map to a domain plus a
+  // path and we want to retain the previous behavior of 'working' when a
+  // mapped-to domain was provided. If the resource_url is relative (has no
+  // domain) then the resource can always be written, and will share the domain
+  // of the original request.
+  //
+  // The resource_url is considered relative to original_request. Generally
+  // it is always accessible to rewrite resources in the same domain as the
+  // original.
+  //
+  // Note: The mapped domain name will not incorporate any sharding.
+  // This is handled by ShardDomain().
+  //
+  // The returned mapped_domain_name will always end with a slash on success.
+  // The returned resolved_request incorporates rewrite-domain mapping and
+  // the original URL.
+  //
+  // Returns false on failure.
+  //
+  // This is used both for domain authorization and domain rewriting,
+  // but not domain sharding.
+  //
+  // See also IsDomainAuthorized, which can be used to determine
+  // domain authorization without performing a mapping.
+  bool MapRequestToDomain(const GoogleUrl& original_request,
+                          const StringPiece& resource_url,
+                          GoogleString* mapped_domain_name,
+                          GoogleUrl* resolved_request,
+                          MessageHandler* handler) const;
+
+  // Given the context of an HTTP request to 'original_request',
+  // checks whether 'domain_to_check' is authorized for rewriting.
+  //
+  // For example, if we are rewriting http://www.myhost.com/index.html,
+  // then all resources from www.myhost.com are implicitly authorized
+  // for rewriting. Additionally, any domains specified via
+  // AddDomain() are also authorized.
+  bool IsDomainAuthorized(const GoogleUrl& original_request,
+                          const GoogleUrl& domain_to_check) const;
+
+
+  // Returns true if the given origin (domain:port) is one that we were
+  // explicitly told about in any form --- e.g. as a rewrite domain, origin
+  // domain, simple domain, or a shard.
+  //
+  // Note that this method returning true does not mean that resources from the
+  // given domain should be rewritten.
+  bool IsOriginKnown(const GoogleUrl& domain_to_check) const;
+
+  // Maps an origin resource; just prior to fetching it. This fails
+  // if the input URL is not valid. It succeeds even if there is no
+  // mapping done. You must compare 'in' to 'out' to determine if
+  // mapping was done.
+  //
+  // *is_proxy is set to true if the origin-domain was established via
+  // AddProxyDomainMapping.
+  bool MapOrigin(const StringPiece& in, GoogleString* out,
+                 bool* is_proxy) const;
+  bool MapOriginUrl(const GoogleUrl& gurl, GoogleString* out,
+                    bool* is_proxy) const;
+
+  // The methods below this comment are intended to be run only
+  // at configuration time.
+
+  // Adds a simple domain to the set that can be rewritten. No
+  // mapping or sharding will be performed. Returns false if the
+  // domain syntax was not acceptable. Wildcards (*, ?) may be used in
+  // the domain_name. Careless use of wildcards can expose the user to
+  // XSS attacks.
+  bool AddDomain(const StringPiece& domain_name, MessageHandler* handler);
+
+  // Adds a domain mapping, to assist with serving resources from
+  // cookieless domains or CDNs. This implicitly calls AddDomain(to_domain)
+  // and AddDomain(from_domain) if necessary. If either 'to' or 'from' has
+  // invalid syntax then this function returns false and has no effect.
+  //
+  // Wildcards may not be used in the to_domain, but they can be used
+  // in the from_domains.
+  //
+  // This routine can be called multiple times for the same to_domain. If
+  // the 'from' domains overlap due to wildcards, this will not be detected.
+  bool AddRewriteDomainMapping(const StringPiece& to_domain,
+                               const StringPiece& comma_separated_from_domains,
+                               MessageHandler* handler);
+
+  // Adds domain mappings that handle both http and https urls for the given
+  // from_domain_name. No wildcards may be used in either domain, and both
+  // must be protocol-free and should not have port numbers.
+  //
+  // This routine can be called multiple times for the same to_domain.
+  bool AddTwoProtocolRewriteDomainMapping(const StringPiece& to_domain_name,
+                                          const StringPiece& from_domain_name,
+                                          MessageHandler* handler);
+
+  // Adds a domain mapping, to assist with fetching resources from locally
+  // significant names/ip-addresses.
+  //
+  // Wildcards may not be used in the to_domain, but they can be used
+  // in the from_domains.
+  //
+  // This routine can be called multiple times for the same to_domain. If
+  // the 'from' domains overlap due to wildcards, this will not be detected.
+  //
+  // It is invalid to use the same origin_domain in AddProxyDomainMapping
+  // and as the to_domain of AddOriginDomainMapping. The latter requires
+  // a Host: request-header on fetches, whereas the former will not get one.
+  bool AddOriginDomainMapping(const StringPiece& to_domain,
+                              const StringPiece& comma_separated_from_domains,
+                              MessageHandler* handler);
+
+  // Adds a mapping to enable proxying & optimizing resources hosted
+  // on a domain we do not control, going back to the origin to
+  // fetch them.
+  //
+  // Wildcards may not be used in the proxy_domain or origin_domain.
+  //
+  // Subdirectories should normally be used in both the proxy_domain and
+  // origin_domain. This is not a strict requirement. If you fully
+  // control the entire origin domain and are dedicating a proxy domain
+  // for the sole use of that origin domain then subdirectories are not needed.
+  //
+  // The proxy_domain must be running mod_pagespeed and configured
+  // consistently. The resources will be referenced from this domain
+  // in CSS and HTML files.
+  //
+  // The origin_domain does not need to run mod_pagespeed; it is used
+  // to fetch the resources.
+  //
+  // It is invalid to use the same origin_domain in AddProxyDomainMapping
+  // and to_domain of AddOriginDomainMapping. The latter requires
+  // overriding the Host: request-header on fetches.
+  bool AddProxyDomainMapping(const StringPiece& proxy_domain,
+                             const StringPiece& origin_domain,
+                             MessageHandler* handler);
+
+  // Adds domain mappings that handle fetches on both http and https for the
+  // given from_domain. No wildcards may be used in either domain, and both
+  // must be protocol-free and should not have port numbers.
+  //
+  // This routine can be called multiple times for the same to_domain.
+  bool AddTwoProtocolOriginDomainMapping(const StringPiece& to_domain_name,
+                                         const StringPiece& from_domain_name,
+                                         MessageHandler* handler);
+
+  // Specifies domain-sharding. This implicitly calls AddDomain(to_domain).
+  //
+  // Wildcards may not be used in the to_domain or the from_domain.
+  bool AddShard(const StringPiece& to_domain,
+                const StringPiece& comma_separated_shards,
+                MessageHandler* handler);
+
+  // Computes a domain shard based on a passed-in hash, returning true
+  // if the domain was sharded. Output argument 'sharded_domain' is
+  // only updated when the return value is true.
+  //
+  // The hash is an explicit uint32 so that we get the same shard for a
+  // resource, whether the server is 32-bit or 64-bit. If we have
+  // 5 shards and used size_t for hashes, then we'd wind up with different
+  // shards on 32-bit and 64-bit machines and that would reduce cacheability
+  // of the sharded resources.
+  bool ShardDomain(const StringPiece& domain_name, uint32 hash,
+                   GoogleString* sharded_domain) const;
+
+  // Merge the domains declared in src into this. There are no exclusions, so
+  // this is really just aggregating the mappings and authorizations declared in
+  // both domains. When the same domain is mapped in 'this' and 'src', 'src'
+  // wins.
+  void Merge(const DomainLawyer& src);
+
+  // Determines whether a resource of the given domain name is going
+  // to change due to RewriteDomain mapping or domain sharding. Note
+  // that this does not account for the actual domain shard selected.
+  bool WillDomainChange(const StringPiece& domain_name) const;
+
+  // Determines whether any resources might be domain-mapped, either
+  // via sharding or rewriting.
+  bool can_rewrite_domains() const { return can_rewrite_domains_; }
+
+  // Visible for testing.
+  int num_wildcarded_domains() const { return wildcarded_domains_.size(); }
+
+  // Determines whether two domains have been declared as serving the same
+  // content by the user, via Rewrite or Shard mapping.
+  bool DoDomainsServeSameContent(const StringPiece& domain1,
+                                 const StringPiece& domain2) const;
+
+  // Finds domains rewritten to this domain. Includes only non-wildcarded
+  // domains. from_domains is empty if no mapping found.
+  void FindDomainsRewrittenTo(
+      const GoogleUrl& domain_name,
+      ConstStringStarVector* from_domains) const;
+
+  // Computes a signature for the DomainLawyer object including containing
+  // classes (Domain).
+  GoogleString Signature() const;
+
+  // Computes a string representation meant for debugging purposes only.
+  // (The format might change in unpredictable ways and is not meant for
+  // machine consumption).
+  // Each domain will appear on a separate line, and each line will be prefixed
+  // with 'line_prefix'.
+  GoogleString ToString(StringPiece line_prefix) const;
+
+  // Version that's easier to call from debugger.
+  GoogleString ToString() const { return ToString(StringPiece()); }
+
+ private:
+  class Domain;
+
+  typedef bool (Domain::*SetDomainFn)(Domain* domain, MessageHandler* handler);
+
+  static GoogleString NormalizeDomainName(const StringPiece& domain_name);
+
+  static bool IsSchemeSafeToMapTo(const StringPiece& domain_name,
+                                  bool allow_https_scheme);
+
+  bool MapDomainHelper(
+      const StringPiece& to_domain_name,
+      const StringPiece& comma_separated_from_domains,
+      SetDomainFn set_domain_fn,
+      bool allow_wildcards,
+      bool allow_map_to_https,
+      bool authorize,
+      MessageHandler* handler);
+
+  bool MapUrlHelper(const Domain& from_domain,
+                    const Domain& to_domain,
+                    const GoogleUrl& gurl,
+                    GoogleUrl* mapped_gurl) const;
+
+  bool DomainNameToTwoProtocols(const StringPiece& domain_name,
+                                GoogleString* http_url,
+                                GoogleString* https_url);
+
+  bool TwoProtocolDomainHelper(
+      const StringPiece& to_domain_name,
+      const StringPiece& from_domain_name,
+      SetDomainFn set_domain_fn,
+      bool authorize,
+      MessageHandler* handler);
+
+  Domain* AddDomainHelper(const StringPiece& domain_name,
+                          bool warn_on_duplicate,
+                          bool authorize,
+                          bool is_proxy,
+                          MessageHandler* handler);
+  Domain* CloneAndAdd(const Domain* src);
+
+  Domain* FindDomain(const GoogleUrl& gurl) const;
+
+  // Map-order is important as ordering is taken into consideration while
+  // constructing the signature of the domain lawyer.
+  typedef std::map<GoogleString, Domain*> DomainMap;  // see AddDomainHelper
+  DomainMap domain_map_;
+  typedef std::vector<Domain*> DomainVector;  // see AddDomainHelper
+  DomainVector wildcarded_domains_;
+  bool can_rewrite_domains_;
+  // If you add more fields here, please be sure to update Merge().
+
+  DISALLOW_COPY_AND_ASSIGN(DomainLawyer);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_DOMAIN_LAWYER_H_
diff --git a/psol/include/net/instaweb/rewriter/public/domain_rewrite_filter.h b/psol/include/net/instaweb/rewriter/public/domain_rewrite_filter.h
new file mode 100644
index 000000000..d0c251a1b
--- /dev/null
+++ b/psol/include/net/instaweb/rewriter/public/domain_rewrite_filter.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: jmarantz@google.com (Joshua Marantz)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_DOMAIN_REWRITE_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_DOMAIN_REWRITE_FILTER_H_
+
+#include "net/instaweb/rewriter/public/common_filter.h"
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+class HtmlElement;
+class GoogleUrl;
+class RewriteDriver;
+class Statistics;
+class Variable;
+
+// Filter that rewrites URL domains for resources that are not
+// otherwise rewritten. For example, the user may want to
+// domain-shard adding a hash to their URL leaves, or domain shard
+// resources that are not cacheable.
+class DomainRewriteFilter : public CommonFilter {
+ public:
+  DomainRewriteFilter(RewriteDriver* rewrite_driver, Statistics* stats);
+  ~DomainRewriteFilter();
+  static void InitStats(Statistics* statistics);
+  virtual void StartDocumentImpl();
+  virtual void StartElementImpl(HtmlElement* element);
+  virtual void EndElementImpl(HtmlElement* element);
+
+  virtual const char* Name() const { return "DomainRewrite"; }
+
+  enum RewriteResult {  // Outcome reported by Rewrite().
+    kRewroteDomain,
+    kDomainUnchanged,
+    kFail,
+  };
+
+  // Rewrites the specified URL (which might be relative to the base tag)
+  // into an absolute sharded url.
+  //
+  // Absolute URL output_url will be set if kRewroteDomain or
+  // kDomainUnchanged is returned.
+  RewriteResult Rewrite(const StringPiece& input_url,
+                        const GoogleUrl& base_url,
+                        bool apply_sharding,
+                        GoogleString* output_url);
+
+ private:
+  // Stats on how much domain-rewriting we've done.
+  Variable* rewrite_count_;
+  bool client_domain_rewriter_script_written_;
+
+  DISALLOW_COPY_AND_ASSIGN(DomainRewriteFilter);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_DOMAIN_REWRITE_FILTER_H_
diff --git a/psol/include/net/instaweb/rewriter/public/elide_attributes_filter.h b/psol/include/net/instaweb/rewriter/public/elide_attributes_filter.h
new file mode 100644
index 000000000..4c24cff9e
--- /dev/null
+++ b/psol/include/net/instaweb/rewriter/public/elide_attributes_filter.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: mdsteele@google.com (Matthew D. Steele)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_ELIDE_ATTRIBUTES_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_ELIDE_ATTRIBUTES_FILTER_H_
+
+#include <map>
+#include <set>
+
+#include "net/instaweb/htmlparse/public/empty_html_filter.h"
+#include "net/instaweb/htmlparse/public/html_name.h"
+#include "net/instaweb/util/public/basictypes.h"
+
+namespace net_instaweb {
+class HtmlElement;
+class HtmlParse;
+
+// Remove attributes and attribute values that can be safely elided.
+class ElideAttributesFilter : public EmptyHtmlFilter {
+ public:
+  explicit ElideAttributesFilter(HtmlParse* html_parse);
+  virtual ~ElideAttributesFilter();
+
+  virtual void StartElement(HtmlElement* element);
+  virtual const char* Name() const { return "ElideAttributes"; }
+
+ private:
+  struct AttrValue {
+    const char* attr_value;
+    bool requires_version_5;  // Default value only exists in (X)HTML 5.
+  };
+
+  typedef std::set<HtmlName::Keyword> KeywordSet;
+  typedef std::map<HtmlName::Keyword, KeywordSet> KeywordSetMap;
+  typedef std::map<HtmlName::Keyword, AttrValue> ValueMap;
+  typedef std::map<HtmlName::Keyword, ValueMap> ValueMapMap;
+
+  HtmlParse* html_parse_;
+  KeywordSetMap one_value_attrs_map_;  // tag/attrs with only one possible value
+  ValueMapMap default_value_map_;  // tag/attrs with default values
+
+  DISALLOW_COPY_AND_ASSIGN(ElideAttributesFilter);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_ELIDE_ATTRIBUTES_FILTER_H_
diff --git a/psol/include/net/instaweb/rewriter/public/file_input_resource.h b/psol/include/net/instaweb/rewriter/public/file_input_resource.h
new file mode 100644
index 000000000..643ca66a0
--- /dev/null
+++ b/psol/include/net/instaweb/rewriter/public/file_input_resource.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: sligocki@google.com (Shawn Ligocki)
+//
+// Input resource created based on a local file.
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FILE_INPUT_RESOURCE_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_FILE_INPUT_RESOURCE_H_
+
+#include "net/instaweb/rewriter/public/resource.h"
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+struct ContentType;
+class InputInfo;
+class MessageHandler;
+class ResponseHeaders;
+class RewriteOptions;
+class ServerContext;
+
+class FileInputResource : public Resource {
+ public:
+  FileInputResource(ServerContext* server_context,
+                    const RewriteOptions* options,
+                    const ContentType* type,
+                    const StringPiece& url,
+                    const StringPiece& filename)
+      : Resource(server_context, type),
+        url_(url.data(), url.size()),
+        filename_(filename.data(), filename.size()),
+        last_modified_time_sec_(0),  // Defined value until loaded.
+        rewrite_options_(options) {}
+
+  virtual ~FileInputResource();
+
+  // Uses default no-op Freshen implementation because file-based resources
+  // are fetched each time they are needed.
+
+  virtual bool IsValidAndCacheable() const;
+
+  // Set OutputPartition's input info used for expiration validation.
+  virtual void FillInPartitionInputInfo(HashHint include_content_hash,
+                                        InputInfo* input);
+
+  virtual GoogleString url() const { return url_; }
+  virtual const RewriteOptions* rewrite_options() const {
+    return rewrite_options_;
+  }
+
+ protected:
+  void SetDefaultHeaders(const ContentType* content_type,
+                         ResponseHeaders* header, MessageHandler* handler);
+
+  virtual bool Load(MessageHandler* message_handler);
+  // Uses default, blocking LoadAndCallback implementation.
+
+ private:
+  GoogleString url_;
+  GoogleString filename_;
+  int64 last_modified_time_sec_;  // Loaded from file mtime.
+
+  const RewriteOptions* rewrite_options_;
+
+  DISALLOW_COPY_AND_ASSIGN(FileInputResource);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_FILE_INPUT_RESOURCE_H_
diff --git a/psol/include/net/instaweb/rewriter/public/file_load_mapping.h b/psol/include/net/instaweb/rewriter/public/file_load_mapping.h
new file mode 100644
index 000000000..b5eeae4d1
--- /dev/null
+++ b/psol/include/net/instaweb/rewriter/public/file_load_mapping.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: jefftk@google.com (Jeff Kaufman)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_MAPPING_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_MAPPING_H_
+
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/manually_ref_counted.h"
+#include "net/instaweb/util/public/re2.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+// Class for storing a mapping from a URL to a filesystem path, for use by
+// FileLoadPolicy.
+class FileLoadMapping : public ManuallyRefCounted {
+ public:
+  virtual ~FileLoadMapping();
+
+  // If this mapping applies to this url, put the mapped path into filename and
+  // return true. Otherwise return false.
+  virtual bool Substitute(const StringPiece& url,
+                          GoogleString* filename) const = 0;
+};
+
+class FileLoadMappingRegexp : public FileLoadMapping {
+ public:
+  FileLoadMappingRegexp(const GoogleString& url_regexp,
+                        const GoogleString& filename_prefix)
+      : url_regexp_(url_regexp),
+        url_regexp_str_(url_regexp),
+        filename_prefix_(filename_prefix) {}
+
+  virtual bool Substitute(const StringPiece& url, GoogleString* filename) const;
+
+ private:
+  const RE2 url_regexp_;  // Built from the url_regexp constructor argument.
+  // RE2s can't be copied, so we need to keep the string around.
+  const GoogleString url_regexp_str_;
+  const GoogleString filename_prefix_;
+
+  DISALLOW_COPY_AND_ASSIGN(FileLoadMappingRegexp);
+};
+
+class FileLoadMappingLiteral : public FileLoadMapping {
+ public:
+  FileLoadMappingLiteral(const GoogleString& url_prefix,
+                         const GoogleString& filename_prefix)
+      : url_prefix_(url_prefix),
+        filename_prefix_(filename_prefix) {}
+
+  virtual bool Substitute(const StringPiece& url, GoogleString* filename) const;
+
+ private:
+  const GoogleString url_prefix_;
+  const GoogleString filename_prefix_;
+
+  DISALLOW_COPY_AND_ASSIGN(FileLoadMappingLiteral);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_MAPPING_H_
diff --git a/psol/include/net/instaweb/rewriter/public/file_load_policy.h b/psol/include/net/instaweb/rewriter/public/file_load_policy.h
new file mode 100644
index 000000000..516cb8c58
--- /dev/null
+++ b/psol/include/net/instaweb/rewriter/public/file_load_policy.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: sligocki@google.com (Shawn Ligocki)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_POLICY_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_POLICY_H_
+
+#include <list>
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/gtest_prod.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+class GoogleUrl;
+class FileLoadMapping;
+class FileLoadRule;
+
+// Class for deciding which URLs get loaded from which files.
+//
+// Currently, you must explicitly set which directories to load directly
+// from filesystem.
+class FileLoadPolicy {
+ public:
+  FileLoadPolicy() {}
+  virtual ~FileLoadPolicy();
+
+  // Note: This is O(N+M) for N calls to Associate and M calls to AddRule.
+  // TODO(sligocki): Set up a more efficient mapper.
+  virtual bool ShouldLoadFromFile(const GoogleUrl& url,
+                                  GoogleString* filename) const;
+
+  // Tells us to load all URLs with this prefix from filename_prefix directory.
+  // Both prefixes must specify directories, if they do not end in slashes,
+  // we add them.
+  //
+  // Tests against youngest association first in case of overlapping prefixes.
+  // Because we support regular expressions, checking for overlapping prefixes
+  // isn't practical.
+  virtual void Associate(const StringPiece& url_prefix,
+                         const StringPiece& filename_prefix);
+
+  // A version of Associate supporting RE2-format regular expressions.
+  // Backreferences are supported, as in:
+  //
+  //   AssociateRegexp("^https?://example.com/~([^/]*)/static/",
+  //                   "/var/static/\\1", &error);
+  //
+  // Which will map urls as:
+  //
+  //   http://example.com/~pat/static/cat.jpg -> /var/static/pat/cat.jpg
+  //   http://example.com/~sam/static/dog.jpg -> /var/static/sam/dog.jpg
+  //   https://example.com/~al/static/css/ie -> /var/static/al/css/ie
+  //
+  // If the regular expression and substitution validate, returns true.
+  // Otherwise it writes a message to error and returns false.
+  virtual bool AssociateRegexp(const StringPiece& url_regexp,
+                               const StringPiece& filename_prefix,
+                               GoogleString* error);
+
+  // By default Associate permits directly loading anything under the specified
+  // filesystem path prefix. So if we were given:
+  //
+  //   Associate("http://example.com/", "/var/www/")
+  //
+  // we would use load-from-file for everything on the site. If some of those
+  // files actually need to be loaded through HTTP, for example because they
+  // need to be interpreted, we might need:
+  //
+  //   AddRule("/var/www/cgi-bin/", false, false, &e);  // literal blacklist.
+  //
+  // or:
+  //
+  //   // blacklist regexp
+  //   AddRule("\\.php$", true, false, &e);  // regexp blacklist.
+  //
+  // In cases where it's easier to list what's allowed than what's prohibited,
+  // you can whitelist:
+  //
+  //   GoogleString e;  // For regexp errors.
+  //   Associate("http://example.com/", "/var/www/")
+  //   AddRule(".*", true, false, &e)  // regexp blacklist.
+  //   AddRule("\\.html$", true, true, &e)  // regexp whitelist.
+  //   AddRule("/var/www/static/", false, true, &e)  // literal whitelist.
+  //   // regexp blacklist.
+  //   AddRule("^/var/www/static/legacy/.*\\.php$", true, false, &e)
+  //
+  // AddRule will fail if RE2 can't compile the regular expression, and will
+  // write an error message to its error string and return false if that
+  // happens.
+  virtual bool AddRule(const GoogleString& rule, bool is_regexp, bool allowed,
+                       GoogleString* error);
+
+  // Merge in other policies (needed for rewrite_options).
+  virtual void Merge(const FileLoadPolicy& other);
+
+ private:
+  typedef std::list<FileLoadMapping*> FileLoadMappings;
+  FileLoadMappings file_load_mappings_;
+  typedef std::list<FileLoadRule*> FileLoadRules;
+  FileLoadRules file_load_rules_;
+
+  DISALLOW_COPY_AND_ASSIGN(FileLoadPolicy);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_POLICY_H_
diff --git a/psol/include/net/instaweb/rewriter/public/file_load_rule.h b/psol/include/net/instaweb/rewriter/public/file_load_rule.h
new file mode 100644
index 000000000..44a4ba645
--- /dev/null
+++ b/psol/include/net/instaweb/rewriter/public/file_load_rule.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: jefftk@google.com (Jeff Kaufman)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_RULE_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_RULE_H_
+
+#include "net/instaweb/util/public/manually_ref_counted.h"
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/re2.h"
+#include "net/instaweb/util/public/string.h"
+
+namespace net_instaweb {
+
+// Class for storing information about what filesystem paths are appropriate for
+// direct access and which need to be fetched through HTTP loopback.
+class FileLoadRule : public ManuallyRefCounted {
+ public:
+  enum Classification {
+    kAllowed,
+    kDisallowed,
+    kUnmatched,
+  };
+
+  virtual ~FileLoadRule();
+  explicit FileLoadRule(bool allowed) : allowed_(allowed) {}
+
+  // What does this rule say about this filename?
+  Classification Classify(const GoogleString& filename) const;
+
+ protected:
+  // Does this rule apply to this filename?
+  virtual bool Match(const GoogleString& filename) const = 0;
+  const bool allowed_;  // Fixed at construction.
+};
+
+class FileLoadRuleRegexp : public FileLoadRule {
+ public:
+  virtual ~FileLoadRuleRegexp();
+
+  // If allowed is true, whitelist filenames matching filename_regexp.
+  // Otherwise blacklist them.
+  FileLoadRuleRegexp(const GoogleString& filename_regexp, bool allowed)
+      : FileLoadRule(allowed),
+        filename_regexp_(filename_regexp),
+        filename_regexp_str_(filename_regexp)
+  {}
+
+  virtual bool Match(const GoogleString& filename) const;
+
+ private:
+  const RE2 filename_regexp_;  // Built from the filename_regexp argument.
+  // RE2s can't be copied, so we need to keep the string around.
+  const GoogleString filename_regexp_str_;
+
+  DISALLOW_COPY_AND_ASSIGN(FileLoadRuleRegexp);
+};
+
+class FileLoadRuleLiteral : public FileLoadRule {
+ public:
+  virtual ~FileLoadRuleLiteral();
+
+  // If allowed is true, whitelist filenames starting with filename_prefix.
+  // Otherwise blacklist them.
+  FileLoadRuleLiteral(const GoogleString& filename_prefix, bool allowed)
+      : FileLoadRule(allowed), filename_prefix_(filename_prefix)
+  {}
+
+  virtual bool Match(const GoogleString& filename) const;
+
+ private:
+  const GoogleString filename_prefix_;
+
+  DISALLOW_COPY_AND_ASSIGN(FileLoadRuleLiteral);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_RULE_H_
diff --git a/psol/include/net/instaweb/rewriter/public/flush_early_content_writer_filter.h b/psol/include/net/instaweb/rewriter/public/flush_early_content_writer_filter.h
new file mode 100644
index 000000000..2c7797369
--- /dev/null
+++ b/psol/include/net/instaweb/rewriter/public/flush_early_content_writer_filter.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Author: nikhilmadan@google.com (Nikhil Madan)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_CONTENT_WRITER_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_CONTENT_WRITER_FILTER_H_
+
+#include <list>
+
+#include "net/instaweb/htmlparse/public/html_writer_filter.h"
+#include "net/instaweb/http/public/semantic_type.h"
+#include "net/instaweb/http/public/user_agent_matcher.h"
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/null_writer.h"
+#include "net/instaweb/util/public/scoped_ptr.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+class GoogleUrl;
+class HtmlElement;
+class RewriteDriver;
+class TimedVariable;
+class Writer;
+
+struct ResourceInfo;
+
+// FlushEarlyContentWriterFilter finds rewritten resources in the DOM and
+// inserts HTML that makes the browser download them. Note that we set a
+// NullWriter as the writer for this driver, and directly write whatever we
+// need to the original writer.
+class FlushEarlyContentWriterFilter : public HtmlWriterFilter {
+ public:
+  static const char kPrefetchLinkRelSubresourceHtml[];
+  static const char kPrefetchImageTagHtml[];
+  static const char kPrefetchStartTimeScript[];
+  static const char kNumResourcesFlushedEarly[];
+  static const char kPrefetchScriptTagHtml[];
+  static const char kPrefetchLinkTagHtml[];
+
+  explicit FlushEarlyContentWriterFilter(RewriteDriver* driver);
+
+  virtual void StartDocument();
+  virtual void EndDocument();
+
+  virtual void StartElement(HtmlElement* element);
+  virtual void EndElement(HtmlElement* element);
+
+ protected:
+  virtual void Clear();
+
+ private:
+  // Writes the string to original_writer_.
+  void WriteToOriginalWriter(const GoogleString& in);
+
+  // Check whether resource can be flushed or not.
+  bool IsFlushable(const GoogleUrl& gurl, bool is_pagespeed_resource);
+
+  // Flush the resource and update time_consumed_ms_ based on time_to_download.
+  void FlushResources(
+      StringPiece url,
+      int64 time_to_download,
+      bool is_pagespeed_resource,
+      semantic_type::Category category);
+
+  RewriteDriver* driver_;
+  TimedVariable* num_resources_flushed_early_;
+  bool in_body_;
+  // Whether we need to insert a close script tag at EndDocument.
+  bool insert_close_script_;
+  int num_resources_flushed_;
+  NullWriter null_writer_;
+  Writer* original_writer_;
+  HtmlElement* current_element_;
+  UserAgentMatcher::PrefetchMechanism prefetch_mechanism_;
+  scoped_ptr<StringSet> private_cacheable_resources_;
+  int64 time_consumed_ms_;
+  int64 max_available_time_ms_;
+  typedef std::list<ResourceInfo*> ResourceInfoList;
+  ResourceInfoList js_resources_info_;
+  bool defer_javascript_enabled_;
+  GoogleString flush_early_content_;
+
+  DISALLOW_COPY_AND_ASSIGN(FlushEarlyContentWriterFilter);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_CONTENT_WRITER_FILTER_H_
diff --git a/psol/include/net/instaweb/rewriter/public/flush_early_info_finder.h b/psol/include/net/instaweb/rewriter/public/flush_early_info_finder.h
new file mode 100644
index 000000000..32e9bbff4
--- /dev/null
+++ b/psol/include/net/instaweb/rewriter/public/flush_early_info_finder.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +// Author: mmohabey@google.com (Megha Mohabey) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_INFO_FINDER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_INFO_FINDER_H_ + +#include "net/instaweb/util/public/basictypes.h" + +namespace net_instaweb { + +class FlushEarlyRenderInfo; +class RewriteDriver; + +// Finds a subset of flush early information which may be used by +// FlushEarlyFlow. This includes information like privately cacheable resources, +// and charset. +class FlushEarlyInfoFinder { + public: + static const char kFlushEarlyRenderPropertyName[]; + + FlushEarlyInfoFinder() {} + virtual ~FlushEarlyInfoFinder(); + + // Checks whether GetCharset will return a meaningful result. The default + // implementation does not, but classes inheriting likely do. Users of + // GetCharset should check this function and supply a default behavior if + // IsMeaningful returns false. + virtual bool IsMeaningful() const { + return false; + } + + // Gets the flush early info and updates the RewriteDriver. + virtual void UpdateFlushEarlyInfoInDriver(RewriteDriver* driver); + + // Computes the flush early info. + virtual void ComputeFlushEarlyInfo(RewriteDriver* driver); + + // Gets the charset of the html document. Users of this function should also + // check IsMeaningful() to see if the implementation of this function returns + // meaningful results and provide a default behavior if it does not. 
+ virtual const char* GetCharset(const RewriteDriver* driver); + + virtual const char* GetCohort() const = 0; + + protected: + void UpdateFlushEarlyInfoCacheEntry( + RewriteDriver* driver, + FlushEarlyRenderInfo* flush_early_render_info); + + private: + DISALLOW_COPY_AND_ASSIGN(FlushEarlyInfoFinder); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_INFO_FINDER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/flush_early_info_finder_test_base.h b/psol/include/net/instaweb/rewriter/public/flush_early_info_finder_test_base.h new file mode 100644 index 000000000..50217c0c0 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/flush_early_info_finder_test_base.h @@ -0,0 +1,61 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: mmohabey@google.com (Megha Mohabey) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_INFO_FINDER_TEST_BASE_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_INFO_FINDER_TEST_BASE_H_ + +#include "net/instaweb/rewriter/public/flush_early_info_finder.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/timer.h" + +namespace net_instaweb { + +class RewriteDriver; + +// By default, FlushEarlyInfoFinder does not return meaningful results. This +// class can be used by tests which manually manage FlushEarlyRenderInfo. 
+class MeaningfulFlushEarlyInfoFinder : public FlushEarlyInfoFinder { + public: + MeaningfulFlushEarlyInfoFinder() : num_compute_calls_(0) {} + virtual ~MeaningfulFlushEarlyInfoFinder() {} + virtual bool IsMeaningful() const { + return true; + } + virtual const char* GetCohort() const { + return "NullCohort"; + } + virtual int64 cache_expiration_time_ms() const { + return Timer::kHourMs; + } + virtual void ComputeFlushEarlyInfo(RewriteDriver* driver) { + num_compute_calls_++; + } + int num_compute_calls() { + return num_compute_calls_; + } + void Clear() { + num_compute_calls_ = 0; + } + + private: + int num_compute_calls_; +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_INFO_FINDER_TEST_BASE_H_ diff --git a/psol/include/net/instaweb/rewriter/public/flush_html_filter.h b/psol/include/net/instaweb/rewriter/public/flush_html_filter.h new file mode 100644 index 000000000..2874b4404 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/flush_html_filter.h @@ -0,0 +1,53 @@ +/* + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_HTML_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_HTML_FILTER_H_ + +#include "net/instaweb/rewriter/public/common_filter.h" +#include "net/instaweb/util/public/basictypes.h" + +namespace net_instaweb { + +class HtmlElement; +class RewriteDriver; + +// This filter is run immediately after lexing when streaming HTML into +// the system. It is used to monitor the HTML and try to figure out good +// times to flush, based on document structure and timing. +class FlushHtmlFilter : public CommonFilter { + public: + explicit FlushHtmlFilter(RewriteDriver* driver); + virtual ~FlushHtmlFilter(); + + virtual void StartDocumentImpl(); + virtual void StartElementImpl(HtmlElement* element); + virtual void EndElementImpl(HtmlElement* element); + virtual void Flush(); + + virtual const char* Name() const { return "FlushHtmlFilter"; } + + private: + int score_; + + DISALLOW_COPY_AND_ASSIGN(FlushHtmlFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_HTML_FILTER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/furious_matcher.h b/psol/include/net/instaweb/rewriter/public/furious_matcher.h new file mode 100644 index 000000000..749befe7c --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/furious_matcher.h @@ -0,0 +1,62 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: mukerjee@google.com (Matt Mukerjee) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FURIOUS_MATCHER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_FURIOUS_MATCHER_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class RequestHeaders; +class ResponseHeaders; +class RewriteOptions; + +// Provides a way to replace the mapping of clients/sessions to furious +// experiments. +// +// Furious is the A/B experiment framework that enables us to track +// page speed statistics and correlate them with different sets of +// rewriters. The default implementation uses cookies to send clients +// to the same experiment consistently. This implementation can be +// overridden to divide clients/sessions into experiments using a +// different mechanism. +class FuriousMatcher { + public: + FuriousMatcher() { } + virtual ~FuriousMatcher(); + + // Decides which experiment to place the current client/session into. + // Returns true if the mapping needs to be stored. + virtual bool ClassifyIntoExperiment(const RequestHeaders& headers, + RewriteOptions* options); + + // Stores the client/session -> experiment mapping for the domain indicated + // by url. The experiment id is indicated by state. The default + // implementation stores this in a cookie in the response headers, setting it + // to expire at expiration_time_ms (specified as ms since the epoch). 
+ virtual void StoreExperimentData(int state, const StringPiece& url, + int64 expiration_time_ms, ResponseHeaders* headers); + + private: + DISALLOW_COPY_AND_ASSIGN(FuriousMatcher); +}; + +} // namespace net_instaweb +#endif // NET_INSTAWEB_REWRITER_PUBLIC_FURIOUS_MATCHER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/furious_util.h b/psol/include/net/instaweb/rewriter/public/furious_util.h new file mode 100644 index 000000000..ce109e943 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/furious_util.h @@ -0,0 +1,82 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: nforman@google.com (Naomi Forman) +// +// Functionality and constants for handling Furious experiments and +// measurement. +// +// Furious is the A/B experiment framework that uses cookies +// and Google Analytics to track page speed statistics and correlate +// them with different sets of rewriters. + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FURIOUS_UTIL_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_FURIOUS_UTIL_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class RequestHeaders; +class ResponseHeaders; +class RewriteOptions; + +namespace furious { + +// kFuriousNoExperiment indicates there is an actual cookie set, but the cookie +// says: don't run experiments on this user. E.g. 
if you're running an A/B +// experiment on 40% of the traffic, 20% is in A, 20% is in B, and +// 60% is in NoExperiment. +enum FuriousState { + kFuriousNotSet = -1, // Indicates no experiment cookie was set. + kFuriousNoExperiment = 0, +}; + +// Name of the Furious cookie we set when running experiments. +const char kFuriousCookie[] = "_GFURIOUS"; +const char kFuriousCookiePrefix[] = "_GFURIOUS="; + +// Populates value with the state indicated by the FuriousCookie, if found. +// Returns true if a cookie was found, false if it was not. +bool GetFuriousCookieState(const RequestHeaders& headers, int* value); + +// Removes the Furious cookie from the request headers so we don't +// send it to the origin. +void RemoveFuriousCookie(RequestHeaders *headers); + +// Add a Set-Cookie header for Furious on the domain of url, +// expiring at expiration_time_ms (specified as ms since the epoch), +// putting it on the side of the experiment indicated by state. +void SetFuriousCookie(ResponseHeaders* headers, int state, + const StringPiece& url, int64 expiration_time_ms); + +// Determines which side of the experiment this request should end up on. +int DetermineFuriousState(const RewriteOptions* options); + +// The string value of a Furious State. We don't want to use "ToString" +// in case we change how we want the cookies to look. +GoogleString FuriousStateToCookieString(int state); + +// Converts a Furious Cookie string, e.g. "2", into a FuriousState. +int CookieStringToState(const StringPiece& cookie_str); + +} // namespace furious + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_FURIOUS_UTIL_H_ diff --git a/psol/include/net/instaweb/rewriter/public/google_analytics_filter.h b/psol/include/net/instaweb/rewriter/public/google_analytics_filter.h new file mode 100644 index 000000000..bddfe6ac0 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/google_analytics_filter.h @@ -0,0 +1,178 @@ +/* + * Copyright 2011 Google Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: slamm@google.com (Stephen Lamm) + +// Search for synchronous loads of Google Analytics similar to the following: +// +// +// +// +// Replace the document.write with a new snippet that loads ga.js +// asynchronously. Also, insert a replacement for _getTracker that +// converts any calls to the synchronous API to the asynchronous API. +// The _getTracker replacement is a new function that returns a mock +// tracker object. Anytime a synchronous API method is called, the +// mock tracker forwards it to a _gaq.push(...) call. +// +// An alternative approach would have been to find all the API calls and +// rewrite them to the asynchronous API. However, to be done properly, +// it would have had the added complication of using a JavaScript +// compiler. +// + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_GOOGLE_ANALYTICS_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_GOOGLE_ANALYTICS_FILTER_H_ + +#include + +#include "net/instaweb/htmlparse/public/empty_html_filter.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { +class HtmlCdataNode; +class HtmlCharactersNode; +class HtmlCommentNode; +class HtmlElement; +class HtmlIEDirectiveNode; +class HtmlParse; +class Statistics; +class Variable; + + +// Edit a substring in a script element. 
+class ScriptEditor { + public: + enum Type { + kGaJsScriptSrcLoad = 0, + kGaJsDocWriteLoad, + kGaJsInit, + }; + ScriptEditor(HtmlElement* script_element_, + HtmlCharactersNode* characters_node, + GoogleString::size_type pos, + GoogleString::size_type len, + Type editor_type); + + HtmlElement* GetScriptElement() const { return script_element_; } + HtmlCharactersNode* GetScriptCharactersNode() const { + return script_characters_node_; + } + Type GetType() const { return editor_type_; } + + void NewContents(const StringPiece &replacement, + GoogleString* contents) const; + + private: + HtmlElement* script_element_; + HtmlCharactersNode* script_characters_node_; + + GoogleString::size_type pos_; + GoogleString::size_type len_; + + Type editor_type_; + DISALLOW_COPY_AND_ASSIGN(ScriptEditor); +}; + + +// Filter +// +// +// +// where $hash stands for using the active Hasher and tweaking the result to +// be a valid identifier continuation. Further, the combined source file +// has the code: +// var mod_pagespeed_${hash("a.js")} = "code of a.js as a string literal"; +// var mod_pagespeed_${hash("b.js")} = "code of b.js as a string literal"; +class JsCombineFilter : public RewriteFilter { + public: + static const char kJsFileCountReduction[]; // statistics variable name + + // rewrite_driver is the context owning us, and filter_id is the ID we + // are registered under. + explicit JsCombineFilter(RewriteDriver* rewrite_driver); + virtual ~JsCombineFilter(); + + // Registers the provided statistics variable names with 'statistics'. + static void InitStats(Statistics* statistics); + virtual const char* id() const { + return RewriteOptions::kJavascriptCombinerId; + } + + protected: + // RewriteFilter overrides --- HTML parsing event handlers. 
+ virtual void StartDocumentImpl(); + virtual void StartElementImpl(HtmlElement* element); + virtual void EndElementImpl(HtmlElement* element); + virtual void Characters(HtmlCharactersNode* characters); + virtual void Flush(); + virtual void IEDirective(HtmlIEDirectiveNode* directive); + virtual const char* Name() const { return "JsCombine"; } + virtual RewriteContext* MakeRewriteContext(); + virtual const UrlSegmentEncoder* encoder() const { + return &encoder_; + } + + private: + class JsCombiner; + class Context; + + friend class JsCombineFilterTest; + + void ConsiderJsForCombination(HtmlElement* element, + HtmlElement::Attribute* src); + + // Returns JS variable name where code for given URL should be stored. + static GoogleString VarName(const ServerContext* server_context, + const GoogleString& url); + + void NextCombination(); + + Context* MakeContext(); + + JsCombiner* combiner() const; + ServerContext* server_context() const { return server_context_; } + + ScriptTagScanner script_scanner_; + int script_depth_; // how many script elements we are inside + // current outermost +// to: +// +// +class JsDisableFilter : public EmptyHtmlFilter { + public: + explicit JsDisableFilter(RewriteDriver* driver); + ~JsDisableFilter(); + + static const char kEnableJsExperimental[]; + static const char kDisableJsExperimental[]; + + virtual void StartDocument(); + + virtual void StartElement(HtmlElement* element); + + virtual void EndElement(HtmlElement* element); + + virtual void EndDocument(); + + virtual void DetermineEnabled(); + + virtual const char* Name() const { + return "JsDisableFilter"; + } + + static GoogleString GetJsDisableScriptSnippet(const RewriteOptions* options); + + private: + // Inserts the experimental js enable/disable code. + void InsertJsDeferExperimentalScript(HtmlElement* element); + + // Insert meta tag with 'X-UA-Compatible'. This will avoid IE going to quirks + // mode. 
More information about this can be found in + // http://webdesign.about.com/od/metataglibraries/p/x-ua-compatible-meta-tag.htm + void InsertMetaTagForIE(HtmlElement* element); + + RewriteDriver* rewrite_driver_; + ScriptTagScanner script_tag_scanner_; + int index_; + bool defer_js_experimental_script_written_; + bool ie_meta_tag_written_; + + DISALLOW_COPY_AND_ASSIGN(JsDisableFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_JS_DISABLE_FILTER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/js_inline_filter.h b/psol/include/net/instaweb/rewriter/public/js_inline_filter.h new file mode 100644 index 000000000..c7ad40683 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/js_inline_filter.h @@ -0,0 +1,71 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: mdsteele@google.com (Matthew D. Steele) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_JS_INLINE_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_JS_INLINE_FILTER_H_ + +#include + +#include "net/instaweb/rewriter/public/common_filter.h" +#include "net/instaweb/rewriter/public/resource.h" +#include "net/instaweb/rewriter/public/script_tag_scanner.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { +class HtmlElement; +class HtmlCharactersNode; +class RewriteDriver; + +// Inline small Javascript files. 
+class JsInlineFilter : public CommonFilter { + public: + explicit JsInlineFilter(RewriteDriver* driver); + virtual ~JsInlineFilter(); + + virtual void StartDocumentImpl(); + virtual void EndDocument(); + virtual void StartElementImpl(HtmlElement* element); + virtual void EndElementImpl(HtmlElement* element); + virtual void Characters(HtmlCharactersNode* characters); + virtual const char* Name() const { return "InlineJs"; } + + private: + class Context; + friend class Context; + + bool ShouldInline(const ResourcePtr& resource) const; + void RenderInline(const ResourcePtr& resource, const StringPiece& text, + HtmlElement* element); + + const size_t size_threshold_bytes_; + ScriptTagScanner script_tag_scanner_; + + // This is set to true during StartElement() for a . + bool should_inline_; + + DISALLOW_COPY_AND_ASSIGN(JsInlineFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_JS_INLINE_FILTER_H_ diff --git a/psol/include/net/instaweb/rewriter/public/js_outline_filter.h b/psol/include/net/instaweb/rewriter/public/js_outline_filter.h new file mode 100644 index 000000000..f0b80f779 --- /dev/null +++ b/psol/include/net/instaweb/rewriter/public/js_outline_filter.h @@ -0,0 +1,75 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: sligocki@google.com (Shawn Ligocki) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_JS_OUTLINE_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_JS_OUTLINE_FILTER_H_ + +#include + +#include "net/instaweb/rewriter/public/common_filter.h" +#include "net/instaweb/rewriter/public/script_tag_scanner.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { +class HtmlCharactersNode; +class HtmlElement; +class MessageHandler; +class OutputResource; +class ServerContext; +class RewriteDriver; + +// Filter to take explicit " + "
Please click here " + "if you are not redirected within a few seconds.
"; +// Link tag to be inserted on noscript redirect so that original URL is +// considered canonical. +const char kLinkRelCanonicalFormatter[] = + ""; +} // namespace + +#endif // NET_INSTAWEB_PUBLIC_GLOBAL_CONSTANTS_H_ diff --git a/psol/include/src/net/instaweb/rewriter/google_analytics_snippet.h b/psol/include/src/net/instaweb/rewriter/google_analytics_snippet.h new file mode 100644 index 000000000..6f86e6ab7 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/google_analytics_snippet.h @@ -0,0 +1,79 @@ +/** + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// DO NOT EDIT. 
Generated by ./google_analytics_snippet_gen.py + +namespace net_instaweb { + +const char kGaSnippetGetTracker[] = "_modpagespeed_getRewriteTracker("; + +const char kGaSnippetPrefix[] = + "var _gaq = _gaq || [];\n" + "(function(){\n" + " function functionName(fn) {\n" + " var name = /\\W*function\\s+([\\w\\$]+)\\(/.exec(fn);\n" + " if (!name)\n" + " return 'No name';\n" + " return name[1];\n" + " }\n" + " var nameSpace = '_gat';\n" + " var existingGat = window[nameSpace];\n" + " if (existingGat && typeof existingGat['_getTracker'] == 'function') {\n" + " return;\n" + " }\n" + "\n" + " var gaqAccounts = [];\n" + " function setAccount(acct, prefix) {\n" + " if (gaqAccounts[prefix] != acct) {\n" + " gaqAccounts[prefix] = acct;\n" + " _gaq.push([prefix + '_setAccount', acct]);\n" + " }\n" + " }\n" + "\n" + " window['_modpagespeed_getRewriteTracker'] = function(tracker_acct,\n" + " tracker_name) {\n" + " var prefix = tracker_name ? tracker_name + '.' : '';\n" + "\n" + " function deferTrackerFunc(fn) {\n" + " return function() {\n" + " setAccount(tracker_acct, prefix);\n" + " var pushArgs = [fn];\n" + " [].push.apply(pushArgs, arguments);\n" + " _gaq.push(pushArgs);\n" + " };\n" + " }\n" + " var pageTrackerMethodNames = [\n"; + +const char kGaSnippetSuffix[] = + " ];\n" + " var pageTracker = {\n" + " initData: function() {},\n" + " };\n" + " for (var i=pageTrackerMethodNames.length; i--;) {\n" + " var n = pageTrackerMethodNames[i];\n" + " pageTracker[n] = deferTrackerFunc(prefix + n);\n" + " }\n" + " return pageTracker;\n" + " };\n" + "\n" + " var ga = document.createElement('script');\n" + " ga.type = 'text/javascript'; ga.async = true;\n" + " ga.src = ('https:' == document.location.protocol ? 
'https://ssl' :\n" + " 'http://www') + '.google-analytics.com/ga.js';\n" + " var s = document.getElementsByTagName('script')[0];\n" + " s.parentNode.insertBefore(ga, s);\n" + "})();\n"; +} // namespace net_instaweb diff --git a/psol/include/src/net/instaweb/rewriter/public/add_head_filter.h b/psol/include/src/net/instaweb/rewriter/public/add_head_filter.h new file mode 100644 index 000000000..7e5116ed2 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/add_head_filter.h @@ -0,0 +1,55 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_ADD_HEAD_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_ADD_HEAD_FILTER_H_ + +#include "net/instaweb/htmlparse/public/empty_html_filter.h" +#include "net/instaweb/util/public/basictypes.h" + +namespace net_instaweb { +class HtmlElement; +class HtmlParse; + +// Adds a 'head' element before the 'body', if none was found +// during parsing. This enables downstream filters to assume +// that there will be a head. 
+class AddHeadFilter : public EmptyHtmlFilter { + public: + explicit AddHeadFilter(HtmlParse* parser, bool combine_multiple_heads); + virtual ~AddHeadFilter(); + + virtual void StartDocument(); + virtual void StartElement(HtmlElement* element); + virtual void EndDocument(); + virtual void EndElement(HtmlElement* element); + virtual void Flush(); + virtual const char* Name() const { return "AddHead"; } + + private: + HtmlParse* html_parse_; + bool combine_multiple_heads_; + bool found_head_; + HtmlElement* head_element_; + + DISALLOW_COPY_AND_ASSIGN(AddHeadFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_ADD_HEAD_FILTER_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/add_instrumentation_filter.h b/psol/include/src/net/instaweb/rewriter/public/add_instrumentation_filter.h new file mode 100644 index 000000000..e4e073ea5 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/add_instrumentation_filter.h @@ -0,0 +1,74 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: abliss@google.com (Adam Bliss) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_ADD_INSTRUMENTATION_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_ADD_INSTRUMENTATION_FILTER_H_ + +#include "net/instaweb/htmlparse/public/empty_html_filter.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { + +class HtmlElement; +class RewriteDriver; +class Statistics; +class Variable; + +// Injects javascript instrumentation for monitoring page-rendering time. +class AddInstrumentationFilter : public EmptyHtmlFilter { + public: + static const char kLoadTag[]; + static const char kUnloadTag[]; + static GoogleString* kUnloadScriptFormatXhtml; + static GoogleString* kTailScriptFormatXhtml; + + // Counters. + static const char kInstrumentationScriptAddedCount[]; + + explicit AddInstrumentationFilter(RewriteDriver* driver); + virtual ~AddInstrumentationFilter(); + + static void InitStats(Statistics* statistics); + + virtual void StartDocument(); + virtual void StartElement(HtmlElement* element); + virtual void EndElement(HtmlElement* element); + virtual const char* Name() const { return "AddInstrumentation"; } + + protected: + // The total number of times instrumentation script is added. + Variable* instrumentation_script_added_count_; + + private: + // Adds a script node to given element using the specified format and + // tag name. 
+ void AddScriptNode(HtmlElement* element, const GoogleString& tag_name); + + RewriteDriver* driver_; + bool found_head_; + bool use_cdata_hack_; + bool added_tail_script_; + bool added_unload_script_; + + DISALLOW_COPY_AND_ASSIGN(AddInstrumentationFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_ADD_INSTRUMENTATION_FILTER_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/ajax_rewrite_context.h b/psol/include/src/net/instaweb/rewriter/public/ajax_rewrite_context.h new file mode 100755 index 000000000..5086bd14a --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/ajax_rewrite_context.h @@ -0,0 +1,175 @@ +/* + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: nikhilmadan@google.com (Nikhil Madan) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_AJAX_REWRITE_CONTEXT_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_AJAX_REWRITE_CONTEXT_H_ + +#include "net/instaweb/http/public/async_fetch.h" +#include "net/instaweb/http/public/content_type.h" +#include "net/instaweb/http/public/http_value.h" +#include "net/instaweb/http/public/http_value_writer.h" +#include "net/instaweb/http/public/response_headers.h" +#include "net/instaweb/rewriter/public/output_resource_kind.h" +#include "net/instaweb/rewriter/public/resource.h" +#include "net/instaweb/rewriter/public/server_context.h" +#include "net/instaweb/rewriter/public/resource_slot.h" +#include "net/instaweb/rewriter/public/rewrite_context.h" +#include "net/instaweb/rewriter/public/rewrite_options.h" +#include "net/instaweb/rewriter/public/single_rewrite_context.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/proto_util.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class InputInfo; +class MessageHandler; +class RewriteDriver; +class RewriteFilter; +class UrlAsyncFetcher; + +// A resource-slot created for an ajax rewrite. This has an empty render method. +// Note that this class is usually used as a RefCountedPtr and gets deleted when +// there are no references remaining. +class AjaxRewriteResourceSlot : public ResourceSlot { + public: + explicit AjaxRewriteResourceSlot(const ResourcePtr& resource); + + // Implements ResourceSlot::Render(). + virtual void Render(); + + // Implements ResourceSlot::LocationString(). + virtual GoogleString LocationString() { return "ajax"; } + + protected: + virtual ~AjaxRewriteResourceSlot(); + + private: + DISALLOW_COPY_AND_ASSIGN(AjaxRewriteResourceSlot); +}; + +// Context that is used for an ajax rewrite. 
+class AjaxRewriteContext : public SingleRewriteContext {
+ public:
+  // Stats variable name to keep track of how often in-place falls back to
+  // stream (due to a large resource) when Options->in_place_wait_for_optimized
+  // is true.
+  static const char kInPlaceOversizedOptStream[];
+
+  AjaxRewriteContext(RewriteDriver* driver, const StringPiece& url);
+  virtual ~AjaxRewriteContext();
+
+  // Implements SingleRewriteContext::RewriteSingle().
+  virtual void RewriteSingle(const ResourcePtr& input,
+                             const OutputResourcePtr& output);
+  // Implements RewriteContext::id().
+  virtual const char* id() const { return RewriteOptions::kAjaxRewriteId; }
+  // Implements RewriteContext::kind(); always kRewrittenResource.
+  virtual OutputResourceKind kind() const { return kRewrittenResource; }
+  // Implements RewriteContext::DecodeFetchUrls().
+  virtual bool DecodeFetchUrls(const OutputResourcePtr& output_resource,
+                               MessageHandler* message_handler,
+                               GoogleUrlStarVector* url_vector);
+  // Implements RewriteContext::StartFetchReconstruction().
+  virtual void StartFetchReconstruction();
+
+  static void InitStats(Statistics* statistics);
+
+ private:
+  friend class RecordingFetch;  // Declared later in this header.
+  // Implements RewriteContext::Harvest().
+  virtual void Harvest();
+  void StartFetchReconstructionParent();
+  // Implements RewriteContext::FixFetchFallbackHeaders().
+  virtual void FixFetchFallbackHeaders(ResponseHeaders* headers);
+  // Implements RewriteContext::FetchTryFallback().
+  virtual void FetchTryFallback(const GoogleString& url,
+                                const StringPiece& hash);
+  // Implements RewriteContext::FetchCallbackDone().
+  virtual void FetchCallbackDone(bool success);
+
+  RewriteFilter* GetRewriteFilter(const ContentType& type);
+
+  // Update the date and expiry time based on the InputInfo's.
+  void UpdateDateAndExpiry(const protobuf::RepeatedPtrField& inputs,  // NOTE(review): no element type here -- looks stripped; confirm.
+                           int64* date_ms, int64* expiry_ms);
+
+  RewriteDriver* driver_;
+  GoogleString url_;
+  // Boolean indicating whether or not the resource was rewritten successfully.
+  bool is_rewritten_;
+  // The hash of the rewritten resource. Note that this should only be used if
+  // is_rewritten_ is true. This may be empty.
+  GoogleString rewritten_hash_;
+
+  // Information needed for nested rewrites.
+  ResourcePtr input_resource_;
+  OutputResourcePtr output_resource_;
+
+  scoped_ptr cache_fetcher_;  // NOTE(review): no template argument here -- looks stripped; confirm.
+
+  DISALLOW_COPY_AND_ASSIGN(AjaxRewriteContext);
+};
+
+// Records the fetch into the provided resource and passes through events to the
+// underlying writer, response headers and callback.
+class RecordingFetch : public SharedAsyncFetch {
+ public:
+  RecordingFetch(AsyncFetch* async_fetch,
+                 const ResourcePtr& resource,
+                 AjaxRewriteContext* context,
+                 MessageHandler* handler);
+
+  virtual ~RecordingFetch();
+
+  // Implements SharedAsyncFetch::HandleHeadersComplete().
+  virtual void HandleHeadersComplete();
+  // Implements SharedAsyncFetch::HandleWrite().
+  virtual bool HandleWrite(const StringPiece& content, MessageHandler* handler);
+  // Implements SharedAsyncFetch::HandleFlush().
+  virtual bool HandleFlush(MessageHandler* handler);
+  // Implements SharedAsyncFetch::HandleDone().
+  virtual void HandleDone(bool success);
+
+ private:
+  void FreeDriver();
+
+  bool CanAjaxRewrite();
+
+  // By default RecordingFetch streams back the original content to the browser.
+  // If this returns false then the RecordingFetch should cache the original
+  // content but not stream it.
+  bool ShouldStream();
+
+  MessageHandler* handler_;
+  ResourcePtr resource_;
+  AjaxRewriteContext* context_;
+  bool can_ajax_rewrite_;
+  bool streaming_;
+  HTTPValue cache_value_;
+  HTTPValueWriter cache_value_writer_;
+  ResponseHeaders saved_headers_;
+  Variable* in_place_oversized_opt_stream_;
+  DISALLOW_COPY_AND_ASSIGN(RecordingFetch);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_AJAX_REWRITE_CONTEXT_H_
diff --git a/psol/include/src/net/instaweb/rewriter/public/association_transformer.h b/psol/include/src/net/instaweb/rewriter/public/association_transformer.h
new file mode 100644
index 000000000..26a04db92
--- /dev/null
+++ b/psol/include/src/net/instaweb/rewriter/public/association_transformer.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: sligocki@google.com (Shawn Ligocki)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_ASSOCIATION_TRANSFORMER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_ASSOCIATION_TRANSFORMER_H_
+
+#include  // NOTE(review): header name missing; the class comment mentions std::map, so presumably the map header -- confirm.
+
+#include "net/instaweb/rewriter/public/css_tag_scanner.h"
+#include "net/instaweb/rewriter/public/resource.h"
+#include "net/instaweb/rewriter/public/resource_slot.h"
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/gtest_prod.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+class GoogleUrl;
+class MessageHandler;
+
+// Transformer that uses a std::map to specify which URLs to rewrite to
+// which other URLs.
+// Used by CssFilter to rewrite subresources in CSS even when it cannot
+// be parsed, by using AssociationSlots to update the map before transforming.
+class AssociationTransformer : public CssTagScanner::Transformer {
+ public:
+  // base_url is the URL all CSS url()s should be absolutified against,
+  // this is generally the URL for the CSS file or HTML file for inline CSS.
+  // backup_transformer is another transformer to be applied if no
+  // association has been set in AssociationTransformer's map_. It may be
+  // set to NULL if no backup is needed.
+  //
+  // base_url, backup_transformer and handler must live longer than
+  // AssociationTransformer.
+  AssociationTransformer(const GoogleUrl* base_url,
+                         CssTagScanner::Transformer* backup_transformer,
+                         MessageHandler* handler)
+      : base_url_(base_url), backup_transformer_(backup_transformer),
+        handler_(handler) {}
+  virtual ~AssociationTransformer();
+
+  // Map is exposed so that you can set associations.
+  // Each key -> value specifies that every instance of the absolute URL
+  // key should be transformed to the absolute URL value.
+  StringStringMap* map() { return &map_; }
+
+  // To do the actual transformation, call CssTagScanner::TransformUrls()
+  // with this AssociationTransformer, which will call Transform() on all URLs.
+  // Transform will look up all (absolutified) URLs in map_ and rewrite them
+  // if present (otherwise it will pass them to the backup_transformer_).
+  virtual TransformStatus Transform(const StringPiece& in, GoogleString* out);
+
+ private:
+  // Mapping of input URLs to output URLs.
+  StringStringMap map_;
+
+  // Base URL for CSS file, needed to absolutify URLs in Transform.
+  const GoogleUrl* base_url_;
+
+  // Transformer to be applied to URLs we don't rewrite. For example, we might
+  // want to make sure we absolutify all URLs, even if we don't rewrite them.
+  CssTagScanner::Transformer* backup_transformer_;
+
+  MessageHandler* handler_;
+
+  FRIEND_TEST(AssociationTransformerTest, TransformsCorrectly);
+
+  DISALLOW_COPY_AND_ASSIGN(AssociationTransformer);
+};
+
+// Extremely simple slot which just sets an association in a std::map when
+// it is Render()ed. It associates the key (input URL) with this slot's
+// resource URL (the output URL).
+// Can be used to set AssociationTransformer::map() so that
+// AssociationTransformer::Transform() will rewrite the rendered URLs.
+class AssociationSlot : public ResourceSlot {
+ public:
+  // Note: map must outlive AssociationSlot.
+  AssociationSlot(ResourcePtr resource,
+                  StringStringMap* map, const StringPiece& key)
+      : ResourceSlot(resource), map_(map) {
+    key.CopyToString(&key_);  // Keep an owned copy of the key.
+  }
+  virtual ~AssociationSlot();
+
+  // All Render() calls are from the same thread, so this doesn't need to be
+  // thread-safe.
+  virtual void Render() {
+    if (!disable_rendering()) {
+      (*map_)[key_] = resource()->url();
+    }
+  }
+  // Stores url for key_ unconditionally (no disable_rendering() check).
+  virtual void DirectSetUrl(const StringPiece& url) {
+    url.CopyToString(&((*map_)[key_]));
+  }
+
+  virtual GoogleString LocationString() {
+    // TODO(sligocki): Improve quality of this diagnostic.
+    // Also improve CssResourceSlot::LocationString() which is identical.
+    return "Inside CSS";
+  }
+
+ private:
+  StringStringMap* map_;  // Not owned; see the constructor note.
+  GoogleString key_;
+
+  DISALLOW_COPY_AND_ASSIGN(AssociationSlot);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_ASSOCIATION_TRANSFORMER_H_
diff --git a/psol/include/src/net/instaweb/rewriter/public/base_tag_filter.h b/psol/include/src/net/instaweb/rewriter/public/base_tag_filter.h
new file mode 100644
index 000000000..acfc02913
--- /dev/null
+++ b/psol/include/src/net/instaweb/rewriter/public/base_tag_filter.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: jmarantz@google.com (Joshua Marantz)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_BASE_TAG_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_BASE_TAG_FILTER_H_
+
+#include "net/instaweb/htmlparse/public/empty_html_filter.h"
+#include "net/instaweb/util/public/basictypes.h"
+
+namespace net_instaweb {
+
+class HtmlElement;
+class RewriteDriver;
+
+// Add this filter into the HtmlParse chain to add a base
+// tag into the head section of an HTML document.
+class BaseTagFilter : public EmptyHtmlFilter {
+ public:
+  explicit BaseTagFilter(RewriteDriver* driver)
+      : added_base_tag_(false),
+        driver_(driver) {}
+
+  virtual ~BaseTagFilter();
+
+  virtual void StartDocument() {
+    added_base_tag_ = false;  // Reset the flag for each new document.
+  }
+  virtual void StartElement(HtmlElement* element);
+  virtual const char* Name() const { return "BaseTag"; }
+
+ private:
+  bool added_base_tag_;  // Cleared by the constructor and StartDocument().
+  RewriteDriver* driver_;
+
+  DISALLOW_COPY_AND_ASSIGN(BaseTagFilter);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_BASE_TAG_FILTER_H_
diff --git a/psol/include/src/net/instaweb/rewriter/public/beacon_critical_images_finder.h b/psol/include/src/net/instaweb/rewriter/public/beacon_critical_images_finder.h
new file mode 100644
index 000000000..86994bd12
--- /dev/null
+++ b/psol/include/src/net/instaweb/rewriter/public/beacon_critical_images_finder.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Author: jud@google.com (Jud Porter)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_BEACON_CRITICAL_IMAGES_FINDER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_BEACON_CRITICAL_IMAGES_FINDER_H_
+
+#include "net/instaweb/rewriter/public/critical_images_finder.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+class RewriteDriver;
+class Statistics;
+
+// Support critical (above the fold) image detection through a javascript beacon
+// on the client.
+// TODO(jud): This class is not yet implemented.
+class BeaconCriticalImagesFinder : public CriticalImagesFinder {
+ public:
+  static const char kBeaconCohort[];  // Returned by GetCriticalImagesCohort().
+
+  explicit BeaconCriticalImagesFinder(Statistics* stats);
+  virtual ~BeaconCriticalImagesFinder();
+
+  virtual bool IsMeaningful() const {
+    // TODO(jud): This class is not implemented yet; change this when
+    // it is functional.
+    return false;
+  }
+
+  virtual void ComputeCriticalImages(StringPiece url,
+                                     RewriteDriver* driver,
+                                     bool must_compute);
+
+  virtual const char* GetCriticalImagesCohort() const {
+    return kBeaconCohort;
+  }
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_BEACON_CRITICAL_IMAGES_FINDER_H_
diff --git a/psol/include/src/net/instaweb/rewriter/public/blink_background_filter.h b/psol/include/src/net/instaweb/rewriter/public/blink_background_filter.h
new file mode 100644
index 000000000..e9642f7d7
--- /dev/null
+++ b/psol/include/src/net/instaweb/rewriter/public/blink_background_filter.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: rahulbansal@google.com (Rahul Bansal)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_BLINK_BACKGROUND_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_BLINK_BACKGROUND_FILTER_H_
+
+#include "net/instaweb/htmlparse/public/empty_html_filter.h"
+#include "net/instaweb/rewriter/public/script_tag_scanner.h"
+#include "net/instaweb/util/public/basictypes.h"
+
+namespace net_instaweb {
+
+class RewriteDriver;
+class RewriteOptions;
+
+// This class does the preprocessing required to apply blink.
+class BlinkBackgroundFilter : public EmptyHtmlFilter {
+ public:
+  explicit BlinkBackgroundFilter(RewriteDriver* rewrite_driver);
+  virtual ~BlinkBackgroundFilter();
+  // NOTE(review): HtmlElement is not forward-declared in this header -- confirm.
+  virtual void StartElement(HtmlElement* element);
+  virtual const char* Name() const { return "ProcessBlinkInBackgroundFilter"; }
+
+ private:
+  RewriteDriver* rewrite_driver_;
+  const RewriteOptions* rewrite_options_;
+  ScriptTagScanner script_tag_scanner_;  // Held by value.
+
+  DISALLOW_COPY_AND_ASSIGN(BlinkBackgroundFilter);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_BLINK_BACKGROUND_FILTER_H_
diff --git a/psol/include/src/net/instaweb/rewriter/public/blink_critical_line_data_finder.h b/psol/include/src/net/instaweb/rewriter/public/blink_critical_line_data_finder.h
new file mode 100644
index 000000000..73704ce48
--- /dev/null
+++ b/psol/include/src/net/instaweb/rewriter/public/blink_critical_line_data_finder.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: pulkitg@google.com (Pulkit Goyal)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_BLINK_CRITICAL_LINE_DATA_FINDER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_BLINK_CRITICAL_LINE_DATA_FINDER_H_
+
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+class BlinkCriticalLineData;
+class PropertyPage;
+class ResponseHeaders;
+class RewriteDriver;
+
+// Finds BlinkCriticalLineData from the given html content. This information
+// will be used by BlinkFlowCriticalLine.
+// TODO(pulkitg): Rethink the naming and structure of this class.
+class BlinkCriticalLineDataFinder {
+ public:
+  static const char kBlinkCohort[];
+  BlinkCriticalLineDataFinder();
+  virtual ~BlinkCriticalLineDataFinder();  // Every operation below is virtual.
+
+  // Gets BlinkCriticalLineData from the given PropertyPage.
+  virtual BlinkCriticalLineData* ExtractBlinkCriticalLineData(
+      int64 cache_time_ms, PropertyPage* page, int64 now_ms, bool diff_enabled,
+      bool propagate_cache_deletes);
+
+  // Computes BlinkCriticalLineData for the given html content.
+  virtual void ComputeBlinkCriticalLineData(
+      const GoogleString& computed_hash,
+      const GoogleString& computed_hash_smart_diff,
+      const StringPiece html_content,
+      const ResponseHeaders* response_headers,
+      RewriteDriver* driver);
+
+  virtual void PropagateCacheDeletes(const GoogleString& key);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(BlinkCriticalLineDataFinder);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_BLINK_CRITICAL_LINE_DATA_FINDER_H_
diff --git a/psol/include/src/net/instaweb/rewriter/public/blink_filter.h b/psol/include/src/net/instaweb/rewriter/public/blink_filter.h
new file mode 100644
index 000000000..d383b4d4a
--- /dev/null
+++ b/psol/include/src/net/instaweb/rewriter/public/blink_filter.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: rahulbansal@google.com (Rahul Bansal)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_BLINK_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_BLINK_FILTER_H_
+
+#include  // NOTE(review): header name missing; std::vector is used below, so presumably the vector header -- confirm.
+
+#include "net/instaweb/htmlparse/public/html_writer_filter.h"
+#include "net/instaweb/rewriter/blink_critical_line_data.pb.h"
+#include "net/instaweb/rewriter/public/blink_util.h"
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/property_cache.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/util/public/string_util.h"
+#include "net/instaweb/util/public/string_writer.h"
+#include "net/instaweb/util/public/json.h"
+
+namespace net_instaweb {
+
+class HtmlElement;
+class RewriteDriver;
+class RewriteOptions;
+
+// This class extracts the non cacheable panels, looks up the non critical
+// content in property cache and sends it to the client.
+class BlinkFilter : public HtmlWriterFilter {
+ public:
+  // TODO(rahulbansal): Move these consts to appropriate file.
+  static const char kBlinkCriticalLineDataPropertyName[];
+  static const char kBlinkCohort[];
+  static const char kRefreshPageJs[];
+
+  explicit BlinkFilter(RewriteDriver* rewrite_driver);
+  virtual ~BlinkFilter();
+  // Event handlers; presumably HtmlWriterFilter overrides (not marked virtual).
+  void StartDocument();
+  void StartElement(HtmlElement* element);
+  void EndElement(HtmlElement* element);
+  void EndDocument();
+  void WriteString(StringPiece str);
+  void Flush();
+  virtual const char* Name() const { return "BlinkFilter"; }
+
+ private:
+  void SendCookies();
+  void SendNonCriticalJson(GoogleString* str);
+  void ServeNonCriticalPanelContents();
+  void SendNonCacheableObject(const Json::Value& json);
+  void ObtainBlinkCriticalLineData();
+  void HandleLastModifiedChange();
+  // Produces a custom xpath relative to the body or relative to the nearest
+  // ancestor with an id (if there is one). The xpath consists of the tag name
+  // and the id (if it exists) or the position of the elements.
+  GoogleString GetXpathOfCurrentElement(HtmlElement* element);
+
+  RewriteDriver* rewrite_driver_;  // We do not own this.
+  const RewriteOptions* rewrite_options_;  // We do not own this.
+  AttributesToNonCacheableValuesMap attribute_non_cacheable_values_map_;
+  std::vector panel_number_num_instances_;  // NOTE(review): no element type here -- looks stripped; confirm.
+  GoogleString buffer_;
+  StringWriter string_writer_;
+  const HtmlElement* current_non_cacheable_element_;  // We do not own this.
+  GoogleString current_panel_id_;
+  const PropertyCache::Cohort* cohort_;  // We do not own this.
+  BlinkCriticalLineData blink_critical_line_data_;
+  bool abort_filter_;
+  std::vector num_children_stack_;  // NOTE(review): no element type here -- looks stripped; confirm.
+
+  DISALLOW_COPY_AND_ASSIGN(BlinkFilter);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_BLINK_FILTER_H_
diff --git a/psol/include/src/net/instaweb/rewriter/public/blink_util.h b/psol/include/src/net/instaweb/rewriter/public/blink_util.h
new file mode 100644
index 000000000..c6bb28fc7
--- /dev/null
+++ b/psol/include/src/net/instaweb/rewriter/public/blink_util.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: gagansingh@google.com (Gagan Singh)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_BLINK_UTIL_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_BLINK_UTIL_H_
+
+#include  // NOTE(review): the three header names here are missing -- looks stripped; confirm against upstream.
+#include
+#include
+
+#include "net/instaweb/util/public/json.h"
+#include "net/instaweb/util/public/string.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+class AsyncFetch;
+class GoogleUrl;
+class HtmlElement;
+class Panel;
+class PanelSet;
+class ServerContext;
+class RewriteOptions;
+class UserAgentMatcher;
+
+typedef std::map PanelIdToSpecMap;  // NOTE(review): template arguments missing -- looks stripped; confirm.
+typedef std::multimap,  // NOTE(review): template arguments missing -- looks stripped; confirm.
+                      StringCompareInsensitive> AttributesToNonCacheableValuesMap;
+
+namespace BlinkUtil {
+
+const char kContiguous[] = "contiguous";
+const char kCritical[] = "critical";
+const char kPanelId[] = "panel-id";
+const char kImages[] = "images";
+const char kInstanceHtml[] = "instance_html";
+const char kStartBodyMarker[] = "";  // NOTE(review): empty here; possibly lost markup -- confirm.
+const char kEndBodyTag[] = "";  // NOTE(review): empty here; possibly lost markup -- confirm.
+const char kLayoutMarker[] = "";  // NOTE(review): empty here; possibly lost markup -- confirm.
+const char kJsonCachePrefix[] = "json:";
+// TODO(mmohabey): Use RewriteDriver::kStatusCodePropertyName here.
+const char kBlinkResponseCodePropertyName[] = "blink_last_response_code";
+const char kXpath[] = "xpath";
+// TODO(rahulbansal): Use these constants everywhere in the code from here.
+const char kBlinkCohort[] = "blink";
+const char kBlinkCriticalLineDataPropertyName[] = "blink_critical_line_data";
+const char kComputeVisibleTextFilterOutputEndMarker[] =
+    "";  // NOTE(review): empty here; possibly lost markup -- confirm.
+
+// Checks whether the user agent is allowed to go into the blink flow.
+bool IsUserAgentAllowedForBlink(AsyncFetch* async_fetch,
+                                const RewriteOptions* options,
+                                const char* user_agent,
+                                const UserAgentMatcher& user_agent_matcher);
+
+// Checks whether the request for 'url' is a valid blink request.
+bool IsBlinkRequest(const GoogleUrl& url,
+                    AsyncFetch* async_fetch,
+                    const RewriteOptions* options,
+                    const char* user_agent,
+                    const UserAgentMatcher& user_agent_matcher_);
+
+// Checks if blink critical line flow can be applied.
+bool ShouldApplyBlinkFlowCriticalLine(
+    const ServerContext* manager,
+    const RewriteOptions* options);
+
+// Returns true if json has only miscellaneous (like 'contiguous')
+// attributes.
+bool IsJsonEmpty(const Json::Value& json);
+
+// Clears the json array if all objects are empty.
+void ClearArrayIfAllEmpty(Json::Value* json);
+
+// Computes panel id to specification map and returns whether any non cacheable
+// panels are present.
+bool ComputePanels(const PanelSet* panel_set_,
+                   PanelIdToSpecMap* panel_id_to_spec);
+
+// Escapes < and > with __psa_lt; and __psa_gt; respectively.
+void EscapeString(GoogleString* str);
+
+// TODO(rahulbansal): Move this function to net/instaweb/util/string_util
+bool StripTrailingNewline(GoogleString* s);
+
+// Populates the attributes to non cacheable values map.
+void PopulateAttributeToNonCacheableValuesMap(
+    const RewriteOptions* rewrite_options,
+    const GoogleUrl& url,
+    AttributesToNonCacheableValuesMap* attribute_non_cacheable_values_map,
+    std::vector* panel_number_num_instances);  // NOTE(review): no element type here -- looks stripped; confirm.
+
+// Returns panel number for non cacheable element. If cacheable returns -1.
+int GetPanelNumberForNonCacheableElement(
+    const AttributesToNonCacheableValuesMap& attribute_non_cacheable_values_map,
+    const HtmlElement* element);
+
+// Gets panel id for the given panel instance.
+GoogleString GetPanelId(int panel_number, int instance_number);
+}  // namespace BlinkUtil
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_BLINK_UTIL_H_
diff --git a/psol/include/src/net/instaweb/rewriter/public/cache_extender.h b/psol/include/src/net/instaweb/rewriter/public/cache_extender.h
new file mode 100644
index 000000000..05ffc4787
--- /dev/null
+++ b/psol/include/src/net/instaweb/rewriter/public/cache_extender.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: jmarantz@google.com (Joshua Marantz)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_CACHE_EXTENDER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_CACHE_EXTENDER_H_
+
+#include "net/instaweb/rewriter/public/resource.h"  // for ResourcePtr
+#include "net/instaweb/rewriter/public/server_context.h"
+#include "net/instaweb/rewriter/public/resource_slot.h"
+#include "net/instaweb/rewriter/public/rewrite_filter.h"
+#include "net/instaweb/rewriter/public/rewrite_options.h"
+#include "net/instaweb/rewriter/public/rewrite_result.h"
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+class HtmlElement;
+class ResponseHeaders;
+class RewriteContext;
+class RewriteDriver;
+class Statistics;
+class Variable;
+
+// Rewrites resources without changing their content -- just their
+// URLs and headers.  The original intent of this filter was limited
+// to cache extension.  However, its scope has been expanded to include
+// domain sharding and moving static resources to cookieless domains or
+// CDNs.
+//
+// TODO(jmarantz): rename this class to something more generic, like
+// RenameUrlFilter or ProxyUrlFilter.
+class CacheExtender : public RewriteFilter {
+ public:
+  static const char kCacheExtensions[];
+  static const char kNotCacheable[];
+
+  explicit CacheExtender(RewriteDriver* driver);
+  virtual ~CacheExtender();
+
+  static void InitStats(Statistics* statistics);
+
+  virtual void StartDocumentImpl() {}  // No-op.
+  virtual void StartElementImpl(HtmlElement* element);
+  virtual void EndElementImpl(HtmlElement* element) {}  // No-op.
+
+  virtual const char* Name() const { return "CacheExtender"; }
+  virtual const char* id() const { return RewriteOptions::kCacheExtenderId; }
+
+  // Creates a nested rewrite for given parent and slot, and returns it.
+  // The result is not registered with the parent.
+  RewriteContext* MakeNestedContext(RewriteContext* parent,
+                                    const ResourceSlotPtr& slot);
+
+ protected:
+  virtual bool ComputeOnTheFly() const;
+  virtual RewriteContext* MakeRewriteContext();
+
+ private:
+  class Context;  // Nested type, declared here only; befriended below.
+  friend class Context;
+
+  RewriteResult RewriteLoadedResource(const ResourcePtr& input_resource,
+                                      const OutputResourcePtr& output_resource);
+
+  bool ShouldRewriteResource(
+      const ResponseHeaders* headers, int64 now_ms,
+      const ResourcePtr& input_resource, const StringPiece& url) const;
+
+  Variable* extension_count_;
+  Variable* not_cacheable_count_;
+
+  DISALLOW_COPY_AND_ASSIGN(CacheExtender);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_CACHE_EXTENDER_H_
diff --git a/psol/include/src/net/instaweb/rewriter/public/collapse_whitespace_filter.h b/psol/include/src/net/instaweb/rewriter/public/collapse_whitespace_filter.h
new file mode 100644
index 000000000..e1e0be76b
--- /dev/null
+++ b/psol/include/src/net/instaweb/rewriter/public/collapse_whitespace_filter.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: mdsteele@google.com (Matthew D. Steele) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_COLLAPSE_WHITESPACE_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_COLLAPSE_WHITESPACE_FILTER_H_ + +#include + +#include "net/instaweb/htmlparse/public/empty_html_filter.h" +#include "net/instaweb/htmlparse/public/html_name.h" +#include "net/instaweb/util/public/basictypes.h" + +namespace net_instaweb { +class HtmlParse; +class HtmlElement; +class HtmlCharactersNode; + +// Reduce the size of the HTML by collapsing whitespace (except within certain +// tags, e.g.
 and 
+//   
+//    
+//   
+//  
+// 
+//
+// The script above, which converts pagespeed_iframe to iframe, is deferred by
+// JsDeferDisabledJavascriptFilter, so the iframe load is deferred as well.
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_DEFER_IFRAME_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_DEFER_IFRAME_FILTER_H_
+
+#include "net/instaweb/htmlparse/public/empty_html_filter.h"
+#include "net/instaweb/util/public/basictypes.h"
+
+namespace net_instaweb {
+
+class HtmlElement;
+class RewriteDriver;
+class StaticJavascriptManager;
+// Html filter implementing the iframe deferral described in the file comment.
+class DeferIframeFilter : public EmptyHtmlFilter {
+ public:
+  static const char kDeferIframeInit[];      // Defined elsewhere.
+  static const char kDeferIframeIframeJs[];  // Defined elsewhere.
+  explicit DeferIframeFilter(RewriteDriver* driver);
+  ~DeferIframeFilter();  // NOTE(review): not spelled `virtual`, unlike sibling filters.
+  // Parse-event handlers (presumably EmptyHtmlFilter overrides -- confirm).
+  virtual void StartDocument();
+  virtual void StartElement(HtmlElement* element);
+  virtual void EndElement(HtmlElement* element);
+
+  virtual const char* Name() const { return "DeferIframe"; }
+
+ private:
+  RewriteDriver* driver_;
+  StaticJavascriptManager* static_js_manager_;
+  bool script_inserted_;   // Presumably set once the script is emitted -- confirm.
+  bool defer_js_enabled_;  // Presumably tracks whether defer-js is on -- confirm.
+
+  DISALLOW_COPY_AND_ASSIGN(DeferIframeFilter);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_DEFER_IFRAME_FILTER_H_
diff --git a/psol/include/src/net/instaweb/rewriter/public/delay_images_filter.h b/psol/include/src/net/instaweb/rewriter/public/delay_images_filter.h
new file mode 100644
index 000000000..ab5bcaf12
--- /dev/null
+++ b/psol/include/src/net/instaweb/rewriter/public/delay_images_filter.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: pulkitg@google.com (Pulkit Goyal)
+//
+// Contains implementation of DelayImagesFilter, which delays all the high
+// quality images whose low quality inlined data url are available within their
+// respective image tag like
+// .
+//
+// This filter extracts such low res data urls and generates a map from them.
+// This map will be embedded inside HTML at the end of body tag with a script
+// whose function is to put low res src into respective image tag. Another
+// script which replaces low quality images with high quality images is also
+// embedded.
+//
+// This filter will work in conjunction with image_rewrite_filter which
+// generates data url for low quality images and embeds them with their
+// respective img tags.
+//
+// To avoid drastic reflows, we also need to switch on insert_image_dimensions.
+//
+// Html input to this filter looks like:
+// 
+//  
+//  
+//  
+//   
+//  
+// 
+//
+// The above input html looks like this because the image_rewrite_filter has
+// already replaced  with
+// .
+//
+// Output for the above html will be:
+// 
+//  
+//   
+//  
+//  
+//   
+//   
+//  
+// 
+//
+// Bottom-of-page script actually includes the image data for the low-resolution
+// images, and those are put in place as soon as control reaches there. High
+// quality images are downloaded after all the low quality images are placed
+// by delay script.
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_DELAY_IMAGES_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_DELAY_IMAGES_FILTER_H_
+
+#include "net/instaweb/htmlparse/public/empty_html_filter.h"
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/util/public/string_util.h"
+
+namespace net_instaweb {
+
+class HtmlElement;
+class RewriteDriver;
+class StaticJavascriptManager;
+class Statistics;
+
+class DelayImagesFilter : public EmptyHtmlFilter {
+ public:
+  static const char kDelayImagesSuffix[];
+  static const char kDelayImagesInlineSuffix[];
+  static const char kOnloadFunction[];
+
+  explicit DelayImagesFilter(RewriteDriver* driver);
+  virtual ~DelayImagesFilter();
+
+  virtual void StartDocument();
+  virtual void EndDocument();
+  virtual void EndElement(HtmlElement* element);
+
+  virtual const char* Name() const { return "DelayImages"; }  // Fixed filter name.
+
+  static void InitStats(Statistics* statistics);
+  static void Terminate();
+
+ private:
+  // Creates a script node containing kDelayImagesSuffix js and appends this
+  // node just after element.
+  void InsertDelayImagesJS(HtmlElement* element);
+
+  // Creates a script node containing kDelayImagesInlineSuffix js and appends
+  // this node just after element.
+  void InsertDelayImagesInlineJS(HtmlElement* element);
+
+  RewriteDriver* driver_;
+  StaticJavascriptManager* static_js_manager_;
+
+  // pagespeed_low_res_src will be added to the low_res_data_map_ while
+  // low_res_map_inserted_ is false. As soon as low_res_map_inserted_ is true,
+  // there is no further addition to low_res_data_map_.
+  bool low_res_map_inserted_;
+  int num_low_res_inlined_images_;  // presumably a count of inlined low-res images -- TODO confirm
+  StringStringMap low_res_data_map_;
+
+  // If true, the image url is replaced in place with its low res base64
+  // encoded url; otherwise low_res_data_map_, containing the low res images,
+  // is inserted at the end of the body tag.
+  bool insert_low_res_images_inplace_;
+
+  // is_experimental_enabled_ is set to true if
+  // enable_inline_preview_images_experimental is true.
+  bool is_experimental_enabled_;
+  DISALLOW_COPY_AND_ASSIGN(DelayImagesFilter);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_DELAY_IMAGES_FILTER_H_
diff --git a/psol/include/src/net/instaweb/rewriter/public/detect_reflow_js_defer_filter.h b/psol/include/src/net/instaweb/rewriter/public/detect_reflow_js_defer_filter.h
new file mode 100644
index 000000000..5b211834e
--- /dev/null
+++ b/psol/include/src/net/instaweb/rewriter/public/detect_reflow_js_defer_filter.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: atulvasu@google.com (Atul Vasu)
+//         sriharis@google.com (Srihari Sukumaran)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_DETECT_REFLOW_JS_DEFER_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_DETECT_REFLOW_JS_DEFER_FILTER_H_
+
+#include "net/instaweb/util/public/basictypes.h"
+#include "net/instaweb/htmlparse/public/empty_html_filter.h"
+
+namespace net_instaweb {
+
+class RewriteDriver;
+class HtmlElement;
+class Statistics;
+
+// Similar to JsDeferDisabledFilter, but adds some extra js to figure out
+// potential page rendering reflows due to deferred script execution.
+class DetectReflowJsDeferFilter : public EmptyHtmlFilter {
+ public:
+  explicit DetectReflowJsDeferFilter(RewriteDriver* driver);
+  virtual ~DetectReflowJsDeferFilter();
+
+  virtual void StartDocument();
+  virtual void StartElement(HtmlElement* element);
+  virtual void EndElement(HtmlElement* element);
+  virtual void EndDocument();
+  virtual const char* Name() const { return "DetectReflowJsDeferFilter"; }
+
+  void InsertDetectReflowCode(HtmlElement* element);  // Public, non-virtual helper.
+
+  static void InitStats(Statistics* statistics);
+  static void Terminate();
+
+ private:
+  RewriteDriver* rewrite_driver_;
+
+  // Presumably: whether the script inlined at the end of BODY was written.
+  bool script_written_;  // TODO(review): confirm meaning against the .cc
+  bool defer_js_enabled_;
+  bool debug_;
+
+  DISALLOW_COPY_AND_ASSIGN(DetectReflowJsDeferFilter);
+};
+
+}  // namespace net_instaweb
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_DETECT_REFLOW_JS_DEFER_FILTER_H_
diff --git a/psol/include/src/net/instaweb/rewriter/public/deterministic_js_filter.h b/psol/include/src/net/instaweb/rewriter/public/deterministic_js_filter.h
new file mode 100644
index 000000000..d78aa9bf5
--- /dev/null
+++ b/psol/include/src/net/instaweb/rewriter/public/deterministic_js_filter.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: mmohabey@google.com (Megha Mohabey)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_DETERMINISTIC_JS_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_DETERMINISTIC_JS_FILTER_H_
+
+#include "net/instaweb/htmlparse/public/empty_html_filter.h"
+#include "net/instaweb/util/public/basictypes.h"
+
+namespace net_instaweb {
+
+class HtmlElement;
+class RewriteDriver;
+
+// Injects javascript at the beginning of the head tag to make it deterministic.
+// The JS redefines functions like Math.random and Date. This filter is useful
+// for testing and measurement but does not provide any latency gains. A head
+// element is added if it is not already present in the html.
+class DeterministicJsFilter : public EmptyHtmlFilter {
+ public:
+  explicit DeterministicJsFilter(RewriteDriver* driver);
+  virtual ~DeterministicJsFilter();
+
+  virtual void StartDocument();
+  virtual void StartElement(HtmlElement* element);
+  virtual const char* Name() const { return "DeterministicJs"; }  // Fixed filter name.
+
+ private:
+  RewriteDriver* driver_;
+  bool found_head_;  // presumably: a head element has been seen in this document -- TODO confirm
+
+  DISALLOW_COPY_AND_ASSIGN(DeterministicJsFilter);
+};
+
+}  // namespace net_instaweb
+
+
+#endif  // NET_INSTAWEB_REWRITER_PUBLIC_DETERMINISTIC_JS_FILTER_H_
diff --git a/psol/include/src/net/instaweb/rewriter/public/div_structure_filter.h b/psol/include/src/net/instaweb/rewriter/public/div_structure_filter.h
new file mode 100644
index 000000000..43f08b5eb
--- /dev/null
+++ b/psol/include/src/net/instaweb/rewriter/public/div_structure_filter.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: jhoch@google.com (Jason R. Hoch)
+
+#ifndef NET_INSTAWEB_REWRITER_PUBLIC_DIV_STRUCTURE_FILTER_H_
+#define NET_INSTAWEB_REWRITER_PUBLIC_DIV_STRUCTURE_FILTER_H_
+
+#include 
+#include "net/instaweb/htmlparse/public/empty_html_filter.h"
+#include "net/instaweb/util/public/string.h"
+
+namespace net_instaweb {
+
+class HtmlElement;
+
+// This filter as it stands adds to all anchor href's a special query parameter,
+// unique for each link, representing vaguely the link's location on a page,
+// based on the div structure of the page.
+//
+// In its current simple/functional form, the query parameters are of the form
+// "0.1.0.3", a sort of series of DOM-coordinates of a DOM restricted to 
+// and elements. This example could be the 4th link in the first div +// of the second div of the first main div, or the 2nd link (following 2 divs) +// in the first div in the first div (following one link) in the first top-level +// div. +// +// TODO(jhoch): Next step is to encode/condense these parameter values (at the +// very least use a base higher than 10). +class DivStructureFilter : public EmptyHtmlFilter { + public: + static const char kParamName[]; + + explicit DivStructureFilter(); + virtual ~DivStructureFilter(); + + virtual void StartDocument(); + virtual void StartElement(HtmlElement* element); + virtual void EndElement(HtmlElement* element); + virtual const char* Name() const { return "DivStructureFilter"; } + + static GoogleString GetDivCountStackEncoding( + const std::vector& div_count_stack); + + private: + std::vector div_count_stack_; +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_DIV_STRUCTURE_FILTER_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/domain_lawyer.h b/psol/include/src/net/instaweb/rewriter/public/domain_lawyer.h new file mode 100644 index 000000000..fc639ff64 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/domain_lawyer.h @@ -0,0 +1,320 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) +// +// This class manages the relationships between domains and resources. 
+// +// The Lawyer keeps track of which domains we are allowed to rewrite, including +// whether multiple resources can be bundled together. +// +// The Lawyer keeps track of domain mappings to move resources onto a CDN or +// onto a cookieless domain. +// +// The Lawyer keeps track of domain sharding, for distributing resources across +// equivalent domains to improve browser download parallelism. +// +// The class here holds state based on the configuration files +// (e.g. Apache .conf). + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_DOMAIN_LAWYER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_DOMAIN_LAWYER_H_ + +#include +#include + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { +class GoogleUrl; +class MessageHandler; + +class DomainLawyer { + public: + DomainLawyer() : can_rewrite_domains_(false) {} + ~DomainLawyer(); + + // Determines whether a resource can be rewritten, and returns the domain + // that it should be written to. The domain and the path of the resolved + // request are considered - first just the domain, then the domain plus the + // root of the path, and so on down the path until a match is found or the + // path is exhausted; this is done because we can map to a domain plus a + // path and we want to retain the previous behavior of 'working' when a + // mapped-to domain was provided. If the resource_url is relative (has no + // domain) then the resource can always be written, and will share the domain + // of the original request. + // + // The resource_url is considered relative to original_request. Generally + // it is always accessible to rewrite resources in the same domain as the + // original. + // + // Note: The mapped domain name will not incorporate any sharding. + // This is handled by ShardDomain(). + // + // The returned mapped_domain_name will always end with a slash on success. 
+ // The returned resolved_request incorporates rewrite-domain mapping and + // the original URL. + // + // Returns false on failure. + // + // This is used both for domain authorization and domain rewriting, + // but not domain sharding. + // + // See also IsDomainAuthorized, which can be used to determine + // domain authorization without performing a mapping. + bool MapRequestToDomain(const GoogleUrl& original_request, + const StringPiece& resource_url, + GoogleString* mapped_domain_name, + GoogleUrl* resolved_request, + MessageHandler* handler) const; + + // Given the context of an HTTP request to 'original_request', + // checks whether 'domain_to_check' is authorized for rewriting. + // + // For example, if we are rewriting http://www.myhost.com/index.html, + // then all resources from www.myhost.com are implicitly authorized + // for rewriting. Additionally, any domains specified via + // AddDomain() are also authorized. + bool IsDomainAuthorized(const GoogleUrl& original_request, + const GoogleUrl& domain_to_check) const; + + + // Returns true if the given origin (domain:port) is one that we were + // explicitly told about in any form --- e.g. as a rewrite domain, origin + // domain, simple domain, or a shard. + // + // Note that this method returning true does not mean that resources from the + // given domain should be rewritten. + bool IsOriginKnown(const GoogleUrl& domain_to_check) const; + + // Maps an origin resource; just prior to fetching it. This fails + // if the input URL is not valid. It succeeds even if there is no + // mapping done. You must compare 'in' to 'out' to determine if + // mapping was done. + // + // *is_proxy is set to true if the origin-domain was established via + // AddProxyDomainMapping. 
+ bool MapOrigin(const StringPiece& in, GoogleString* out, + bool* is_proxy) const; + bool MapOriginUrl(const GoogleUrl& gurl, GoogleString* out, + bool* is_proxy) const; + + // The methods below this comment are intended to be run only + // at configuration time. + + // Adds a simple domain to the set that can be rewritten. No + // mapping or sharding will be performed. Returns false if the + // domain syntax was not acceptable. Wildcards (*, ?) may be used in + // the domain_name. Careless use of wildcards can expose the user to + // XSS attacks. + bool AddDomain(const StringPiece& domain_name, MessageHandler* handler); + + // Adds a domain mapping, to assist with serving resources from + // cookieless domains or CDNs. This implicitly calls AddDomain(to_domain) + // and AddDomain(from_domain) if necessary. If either 'to' or 'from' has + // invalid syntax then this function returns false and has no effect. + // + // Wildcards may not be used in the to_domain, but they can be used + // in the from_domains. + // + // This routine can be called multiple times for the same to_domain. If + // the 'from' domains overlap due to wildcards, this will not be detected. + bool AddRewriteDomainMapping(const StringPiece& to_domain, + const StringPiece& comma_separated_from_domains, + MessageHandler* handler); + + // Adds domain mappings that handle both http and https urls for the given + // from_domain_name. No wildcards may be used in either domain, and both + // must be protocol-free and should not have port numbers. + // + // This routine can be called multiple times for the same to_domain. + bool AddTwoProtocolRewriteDomainMapping(const StringPiece& to_domain_name, + const StringPiece& from_domain_name, + MessageHandler* handler); + + // Adds a domain mapping, to assist with fetching resources from locally + // significant names/ip-addresses. + // + // Wildcards may not be used in the to_domain, but they can be used + // in the from_domains.
+ // + // This routine can be called multiple times for the same to_domain. If + // the 'from' domains overlap due to wildcards, this will not be detected. + // + // It is invalid to use the same origin_domain in AddProxyDomainMapping + // and as the to_domain of AddOriginDomainMapping. The latter requires + // a Host: request-header on fetches, whereas the former will not get one. + bool AddOriginDomainMapping(const StringPiece& to_domain, + const StringPiece& comma_separated_from_domains, + MessageHandler* handler); + + // Adds a mapping to enable proxying & optimizing resources hosted + // on a domain we do not control, going back to the origin to + // fetch them. + // + // Wildcards may not be used in the proxy_domain or origin_domain. + // + // Subdirectories should normally be used in both the proxy_domain and + // origin_domain. This is not a strict requirement. If you fully + // control the entire origin domain and are dedicating a proxy domain + // for the sole use of that origin domain then subdirectories are not needed. + // + // The proxy_domain must be running mod_pagespeed and configured + // consistently. The resources will be referenced from this domain + // in CSS and HTML files. + // + // The origin_domain does not need to run mod_pagespeed; it is used + // to fetch the resources. + // + // It is invalid to use the same origin_domain in AddProxyDomainMapping + // and to_domain of AddOriginDomainMapping. The latter requires + // overriding the Host: request-header on fetches. + bool AddProxyDomainMapping(const StringPiece& proxy_domain, + const StringPiece& origin_domain, + MessageHandler* handler); + + // Adds domain mappings that handle fetches on both http and https for the + // given from_domain. No wildcards may be used in either domain, and both + // must be protocol-free and should not have port numbers. + // + // This routine can be called multiple times for the same to_domain.
+ bool AddTwoProtocolOriginDomainMapping(const StringPiece& to_domain_name, + const StringPiece& from_domain_name, + MessageHandler* handler); + + // Specifies domain-sharding. This implicitly calls AddDomain(to_domain). + // + // Wildcards may not be used in the to_domain or the from_domain. + bool AddShard(const StringPiece& to_domain, + const StringPiece& comma_separated_shards, + MessageHandler* handler); + + // Computes a domain shard based on a passed-in hash, returning true + // if the domain was sharded. Output argument 'sharded_domain' is + // only updated when the return value is true. + // + // The hash is an explicit uint32 so that we get the same shard for a + // resource, whether the server is 32-bit or 64-bit. If we have + // 5 shards and used size_t for hashes, then we'd wind up with different + // shards on 32-bit and 64-bit machines and that would reduce cacheability + // of the sharded resources. + bool ShardDomain(const StringPiece& domain_name, uint32 hash, + GoogleString* sharded_domain) const; + + // Merge the domains declared in src into this. There are no exclusions, so + // this is really just aggregating the mappings and authorizations declared in + // both domains. When the same domain is mapped in 'this' and 'src', 'src' + // wins. + void Merge(const DomainLawyer& src); + + // Determines whether a resource of the given domain name is going + // to change due to RewriteDomain mapping or domain sharding. Note + // that this does not account for the actual domain shard selected. + bool WillDomainChange(const StringPiece& domain_name) const; + + // Determines whether any resources might be domain-mapped, either + // via sharding or rewriting. + bool can_rewrite_domains() const { return can_rewrite_domains_; } + + // Visible for testing.
+ int num_wildcarded_domains() const { return wildcarded_domains_.size(); } + + // Determines whether two domains have been declared as serving the same + // content by the user, via Rewrite or Shard mapping. + bool DoDomainsServeSameContent(const StringPiece& domain1, + const StringPiece& domain2) const; + + // Finds domains rewritten to this domain. Includes only non-wildcarded + // domains. from_domains is empty if no mapping found. + void FindDomainsRewrittenTo( + const GoogleUrl& domain_name, + ConstStringStarVector* from_domains) const; + + // Computes a signature for the DomainLawyer object including containing + // classes (Domain). + GoogleString Signature() const; + + // Computes a string representation meant for debugging purposes only. + // (The format might change in unpredictable ways and is not meant for + // machine consumption). + // Each domain will appear on a separate line, and each line will be prefixed + // with 'line_prefix'. + GoogleString ToString(StringPiece line_prefix) const; + + // Version that's easier to call from debugger.
+ GoogleString ToString() const { return ToString(StringPiece()); } + + private: + class Domain; + + typedef bool (Domain::*SetDomainFn)(Domain* domain, MessageHandler* handler); + + static GoogleString NormalizeDomainName(const StringPiece& domain_name); + + static bool IsSchemeSafeToMapTo(const StringPiece& domain_name, + bool allow_https_scheme); + + bool MapDomainHelper( + const StringPiece& to_domain_name, + const StringPiece& comma_separated_from_domains, + SetDomainFn set_domain_fn, + bool allow_wildcards, + bool allow_map_to_https, + bool authorize, + MessageHandler* handler); + + bool MapUrlHelper(const Domain& from_domain, + const Domain& to_domain, + const GoogleUrl& gurl, + GoogleUrl* mapped_gurl) const; + + bool DomainNameToTwoProtocols(const StringPiece& domain_name, + GoogleString* http_url, + GoogleString* https_url); + + bool TwoProtocolDomainHelper( + const StringPiece& to_domain_name, + const StringPiece& from_domain_name, + SetDomainFn set_domain_fn, + bool authorize, + MessageHandler* handler); + + Domain* AddDomainHelper(const StringPiece& domain_name, + bool warn_on_duplicate, + bool authorize, + bool is_proxy, + MessageHandler* handler); + Domain* CloneAndAdd(const Domain* src); + + Domain* FindDomain(const GoogleUrl& gurl) const; + + // Map-order is important as ordering is taken into consideration while + // constructing the signature of the domain lawyer. + typedef std::map DomainMap; // see AddDomainHelper + DomainMap domain_map_; + typedef std::vector DomainVector; // see AddDomainHelper + DomainVector wildcarded_domains_; + bool can_rewrite_domains_; + // If you add more fields here, please be sure to update Merge(). 
+ + DISALLOW_COPY_AND_ASSIGN(DomainLawyer); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_DOMAIN_LAWYER_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/domain_rewrite_filter.h b/psol/include/src/net/instaweb/rewriter/public/domain_rewrite_filter.h new file mode 100644 index 000000000..d0c251a1b --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/domain_rewrite_filter.h @@ -0,0 +1,76 @@ +/* + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_DOMAIN_REWRITE_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_DOMAIN_REWRITE_FILTER_H_ + +#include "net/instaweb/rewriter/public/common_filter.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class HtmlElement; +class GoogleUrl; +class RewriteDriver; +class Statistics; +class Variable; + +// Filter that rewrites URL domains for resources that are not +// otherwise rewritten. For example, the user may want to +// domain-shard adding a hash to their URL leaves, or domain shard +// resources that are not cacheable. 
+class DomainRewriteFilter : public CommonFilter { + public: + DomainRewriteFilter(RewriteDriver* rewrite_driver, Statistics* stats); + ~DomainRewriteFilter(); + static void InitStats(Statistics* statistics); + virtual void StartDocumentImpl(); + virtual void StartElementImpl(HtmlElement* element); + virtual void EndElementImpl(HtmlElement* element); + + virtual const char* Name() const { return "DomainRewrite"; } + + enum RewriteResult { + kRewroteDomain, + kDomainUnchanged, + kFail, + }; + + // Rewrites the specified URL (which might be relative to the base tag) + // into an absolute sharded url. + // + // Absolute URL output_url will be set if kRewroteDomain or + // kDomainUnchanged returned. + RewriteResult Rewrite(const StringPiece& input_url, + const GoogleUrl& base_url, + bool apply_sharding, + GoogleString* output_url); + + private: + // Stats on how much domain-rewriting we've done. + Variable* rewrite_count_; + bool client_domain_rewriter_script_written_; + + DISALLOW_COPY_AND_ASSIGN(DomainRewriteFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_DOMAIN_REWRITE_FILTER_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/elide_attributes_filter.h b/psol/include/src/net/instaweb/rewriter/public/elide_attributes_filter.h new file mode 100644 index 000000000..4c24cff9e --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/elide_attributes_filter.h @@ -0,0 +1,62 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: mdsteele@google.com (Matthew D. Steele) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_ELIDE_ATTRIBUTES_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_ELIDE_ATTRIBUTES_FILTER_H_ + +#include +#include + +#include "net/instaweb/htmlparse/public/empty_html_filter.h" +#include "net/instaweb/htmlparse/public/html_name.h" +#include "net/instaweb/util/public/basictypes.h" + +namespace net_instaweb { +class HtmlElement; +class HtmlParse; + +// Remove attributes and attribute values that can be safely elided. +class ElideAttributesFilter : public EmptyHtmlFilter { + public: + explicit ElideAttributesFilter(HtmlParse* html_parse); + virtual ~ElideAttributesFilter(); + + virtual void StartElement(HtmlElement* element); + virtual const char* Name() const { return "ElideAttributes"; } + + private: + struct AttrValue { + const char* attr_value; + bool requires_version_5; // Default value only exists in (X)HTML 5. + }; + + typedef std::set KeywordSet; + typedef std::map KeywordSetMap; + typedef std::map ValueMap; + typedef std::map ValueMapMap; + + HtmlParse* html_parse_; + KeywordSetMap one_value_attrs_map_; // tag/attrs with only one possible value + ValueMapMap default_value_map_; // tag/attrs with default values + + DISALLOW_COPY_AND_ASSIGN(ElideAttributesFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_ELIDE_ATTRIBUTES_FILTER_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/file_input_resource.h b/psol/include/src/net/instaweb/rewriter/public/file_input_resource.h new file mode 100644 index 000000000..643ca66a0 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/file_input_resource.h @@ -0,0 +1,86 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: sligocki@google.com (Shawn Ligocki) +// +// Input resource created based on a local file. + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FILE_INPUT_RESOURCE_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_FILE_INPUT_RESOURCE_H_ + +#include "net/instaweb/rewriter/public/resource.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +struct ContentType; +class InputInfo; +class MessageHandler; +class ResponseHeaders; +class RewriteOptions; +class ServerContext; + +class FileInputResource : public Resource { + public: + // Copies url and filename into strings owned by this object and stores the + // options pointer as given. last_modified_time_sec_ starts at 0 so that it + // never holds an indeterminate value. + FileInputResource(ServerContext* server_context, + const RewriteOptions* options, + const ContentType* type, + const StringPiece& url, + const StringPiece& filename) + : Resource(server_context, type), + url_(url.data(), url.size()), + filename_(filename.data(), filename.size()), + last_modified_time_sec_(0), // No mtime loaded yet. + rewrite_options_(options) { + } + + virtual ~FileInputResource(); + + // Uses default no-op Freshen implementation because file-based resources + // are fetched each time they are needed. + + virtual bool IsValidAndCacheable() const; + + // Set OutputPartition's input info used for expiration validation.
+ virtual void FillInPartitionInputInfo(HashHint include_content_hash, + InputInfo* input); + + virtual GoogleString url() const { return url_; } + virtual const RewriteOptions* rewrite_options() const { + return rewrite_options_; + } + + protected: + void SetDefaultHeaders(const ContentType* content_type, + ResponseHeaders* header, MessageHandler* handler); + + virtual bool Load(MessageHandler* message_handler); + // Uses default, blocking LoadAndCallback implementation. + + private: + GoogleString url_; + GoogleString filename_; + int64 last_modified_time_sec_; // Loaded from file mtime; starts at 0. + + const RewriteOptions* rewrite_options_; + + DISALLOW_COPY_AND_ASSIGN(FileInputResource); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_FILE_INPUT_RESOURCE_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/file_load_mapping.h b/psol/include/src/net/instaweb/rewriter/public/file_load_mapping.h new file mode 100644 index 000000000..b5eeae4d1 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/file_load_mapping.h @@ -0,0 +1,79 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +// Author: jefftk@google.com (Jeff Kaufman) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_MAPPING_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_MAPPING_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/manually_ref_counted.h" +#include "net/instaweb/util/public/re2.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +// Class for storing a mapping from a URL to a filesystem path, for use by +// FileLoadPolicy. +class FileLoadMapping : public ManuallyRefCounted { + public: + virtual ~FileLoadMapping(); + + // If this mapping applies to this url, put the mapped path into filename and + // return true. Otherwise return false. + virtual bool Substitute(const StringPiece& url, + GoogleString* filename) const = 0; +}; + +class FileLoadMappingRegexp : public FileLoadMapping { + public: + FileLoadMappingRegexp(const GoogleString& url_regexp, + const GoogleString& filename_prefix) + : url_regexp_(url_regexp), + url_regexp_str_(url_regexp), + filename_prefix_(filename_prefix) {} + + virtual bool Substitute(const StringPiece& url, GoogleString* filename) const; + + private: + const RE2 url_regexp_; + // RE2s can't be copied, so we need to keep the string around. 
+ const GoogleString url_regexp_str_; + const GoogleString filename_prefix_; + + DISALLOW_COPY_AND_ASSIGN(FileLoadMappingRegexp); +}; + +class FileLoadMappingLiteral : public FileLoadMapping { + public: + FileLoadMappingLiteral(const GoogleString& url_prefix, + const GoogleString& filename_prefix) + : url_prefix_(url_prefix), + filename_prefix_(filename_prefix) {} + + virtual bool Substitute(const StringPiece& url, GoogleString* filename) const; + + private: + const GoogleString url_prefix_; + const GoogleString filename_prefix_; + + DISALLOW_COPY_AND_ASSIGN(FileLoadMappingLiteral); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_MAPPING_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/file_load_policy.h b/psol/include/src/net/instaweb/rewriter/public/file_load_policy.h new file mode 100644 index 000000000..516cb8c58 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/file_load_policy.h @@ -0,0 +1,123 @@ +/* + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: sligocki@google.com (Shawn Ligocki) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_POLICY_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_POLICY_H_ + +#include +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/gtest_prod.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class GoogleUrl; +class FileLoadMapping; +class FileLoadRule; + +// Class for deciding which URLs get loaded from which files. +// +// Currently, you must explicitly set which directories to load directly +// from filesystem. +class FileLoadPolicy { + public: + FileLoadPolicy() {} + virtual ~FileLoadPolicy(); + + // Note: This is O(N+M) for N calls to Associate and M calls to AddRule. + // TODO(sligocki): Set up a more efficient mapper. + virtual bool ShouldLoadFromFile(const GoogleUrl& url, + GoogleString* filename) const; + + // Tells us to load all URLs with this prefix from filename_prefix directory. + // Both prefixes must specify directories; if they do not end in slashes, + // we add them. + // + // Tests against youngest association first in case of overlapping prefixes. + // Because we support regular expressions, checking for overlapping prefixes + // isn't practical. + virtual void Associate(const StringPiece& url_prefix, + const StringPiece& filename_prefix); + + // A version of Associate supporting RE2-format regular expressions. + // Backreferences are supported, as in: + // + // AssociateRegexp("^https?://example.com/~([^/]*)/static/", + // "/var/static/\\1", &error); + // + // Which will map urls as: + // + // http://example.com/~pat/static/cat.jpg -> /var/static/pat/cat.jpg + // http://example.com/~sam/static/dog.jpg -> /var/static/sam/dog.jpg + // https://example.com/~al/static/css/ie -> /var/static/al/css/ie + // + // If the regular expression and substitution validate, returns true.
+ // Otherwise it writes a message to error and returns false. + virtual bool AssociateRegexp(const StringPiece& url_regexp, + const StringPiece& filename_prefix, + GoogleString* error); + + // By default Associate permits directly loading anything under the specified + // filesystem path prefix. So if we were given: + // + // Associate("http://example.com/", "/var/www/") + // + // we would use load-from-file for everything on the site. If some of those + // files actually need to be loaded through HTTP, for example because they + // need to be interpreted, we might need: + // + // AddRule("/var/www/cgi-bin/", false, false, &error); // literal blacklist. + // + // or: + // + // // blacklist regexp + // AddRule("\\.php$", true, false, &error); // regexp blacklist. + // + // In cases where it's easier to list what's allowed than what's prohibited, + // you can whitelist: + // + // GoogleString e; // For regexp errors. + // Associate("http://example.com/", "/var/www/") + // AddRule(".*", true, false, &e) // regexp blacklist. + // AddRule("\\.html$", true, true, &e) // regexp whitelist. + // AddRule("/var/www/static/", false, true, &e) // literal whitelist. + // // regexp blacklist. + // AddRule("^/var/www/static/legacy/.*\\.php$", true, false, &e) + // + // AddRule will fail if RE2 can't compile the regular expression, and will + // write an error message to its error string and return false if that + // happens. + virtual bool AddRule(const GoogleString& rule, bool is_regexp, bool allowed, + GoogleString* error); + + // Merge in other policies (needed for rewrite_options).
+ virtual void Merge(const FileLoadPolicy& other); + + private: + typedef std::list FileLoadMappings; + FileLoadMappings file_load_mappings_; + typedef std::list FileLoadRules; + FileLoadRules file_load_rules_; + + DISALLOW_COPY_AND_ASSIGN(FileLoadPolicy); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_POLICY_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/file_load_rule.h b/psol/include/src/net/instaweb/rewriter/public/file_load_rule.h new file mode 100644 index 000000000..44a4ba645 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/file_load_rule.h @@ -0,0 +1,93 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jefftk@google.com (Jeff Kaufman) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_RULE_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_RULE_H_ + +#include "net/instaweb/util/public/manually_ref_counted.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/re2.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { + +// Class for storing information about what filesystem paths are appropriate for +// direct access and which need to be fetched through HTTP loopback.
+class FileLoadRule : public ManuallyRefCounted { + public: + enum Classification { + kAllowed, + kDisallowed, + kUnmatched, + }; + + virtual ~FileLoadRule(); + explicit FileLoadRule(bool allowed) : allowed_(allowed) {} + + // What does this rule say about this filename? + Classification Classify(const GoogleString& filename) const; + + protected: + // Does this rule apply to this filename? + virtual bool Match(const GoogleString& filename) const = 0; + const bool allowed_; // Set once by the constructor. +}; + +class FileLoadRuleRegexp : public FileLoadRule { + public: + virtual ~FileLoadRuleRegexp(); + + // If allowed is true, whitelist filenames matching filename_regexp. + // Otherwise blacklist them. + FileLoadRuleRegexp(const GoogleString& filename_regexp, bool allowed) + : FileLoadRule(allowed), + filename_regexp_(filename_regexp), + filename_regexp_str_(filename_regexp) + {} + + virtual bool Match(const GoogleString& filename) const; + + private: + const RE2 filename_regexp_; + // RE2s can't be copied, so we need to keep the string around. + const GoogleString filename_regexp_str_; + + DISALLOW_COPY_AND_ASSIGN(FileLoadRuleRegexp); +}; + +class FileLoadRuleLiteral : public FileLoadRule { + public: + virtual ~FileLoadRuleLiteral(); + + // If allowed is true, whitelist filenames starting with filename_prefix. + // Otherwise blacklist them.
+ FileLoadRuleLiteral(const GoogleString& filename_prefix, bool allowed) + : FileLoadRule(allowed), filename_prefix_(filename_prefix) + {} + + virtual bool Match(const GoogleString& filename) const; + + private: + const GoogleString filename_prefix_; + + DISALLOW_COPY_AND_ASSIGN(FileLoadRuleLiteral); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_FILE_LOAD_RULE_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/flush_early_content_writer_filter.h b/psol/include/src/net/instaweb/rewriter/public/flush_early_content_writer_filter.h new file mode 100644 index 000000000..2c7797369 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/flush_early_content_writer_filter.h @@ -0,0 +1,103 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +// Author: nikhilmadan@google.com (Nikhil Madan) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_CONTENT_WRITER_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_CONTENT_WRITER_FILTER_H_ + +#include + +#include "net/instaweb/htmlparse/public/html_writer_filter.h" +#include "net/instaweb/http/public/semantic_type.h" +#include "net/instaweb/http/public/user_agent_matcher.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/null_writer.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class GoogleUrl; +class HtmlElement; +class RewriteDriver; +class TimedVariable; +class Writer; + +struct ResourceInfo; + +// FlushEarlyContentWriterFilter finds rewritten resources in the DOM and +// inserts HTML that makes the browser download them. Note that we set a +// NullWriter as the writer for this driver, and directly write whatever we +// need to the original writer. +class FlushEarlyContentWriterFilter : public HtmlWriterFilter { + public: + static const char kPrefetchLinkRelSubresourceHtml[]; + static const char kPrefetchImageTagHtml[]; + static const char kPrefetchStartTimeScript[]; + static const char kNumResourcesFlushedEarly[]; + static const char kPrefetchScriptTagHtml[]; + static const char kPrefetchLinkTagHtml[]; + + explicit FlushEarlyContentWriterFilter(RewriteDriver* driver); + + virtual void StartDocument(); + virtual void EndDocument(); + + virtual void StartElement(HtmlElement* element); + virtual void EndElement(HtmlElement* element); + + protected: + virtual void Clear(); + + private: + // Writes the string to original_writer_. + void WriteToOriginalWriter(const GoogleString& in); + + // Check whether resource can be flushed or not. 
+ bool IsFlushable(const GoogleUrl& gurl, bool is_pagespeed_resource); + + // Flush the resource and update time_consumed_ms_ based on time_to_download. + void FlushResources( + StringPiece url, + int64 time_to_download, + bool is_pagespeed_resource, + semantic_type::Category category); + + RewriteDriver* driver_; + TimedVariable* num_resources_flushed_early_; + // Whether we need to insert a close script tag at EndDocument. + bool in_body_; + bool insert_close_script_; + int num_resources_flushed_; + NullWriter null_writer_; + Writer* original_writer_; + HtmlElement* current_element_; + UserAgentMatcher::PrefetchMechanism prefetch_mechanism_; + scoped_ptr private_cacheable_resources_; + int64 time_consumed_ms_; + int64 max_available_time_ms_; + typedef std::list ResourceInfoList; + ResourceInfoList js_resources_info_; + bool defer_javascript_enabled_; + GoogleString flush_early_content_; + + DISALLOW_COPY_AND_ASSIGN(FlushEarlyContentWriterFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_CONTENT_WRITER_FILTER_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/flush_early_info_finder.h b/psol/include/src/net/instaweb/rewriter/public/flush_early_info_finder.h new file mode 100644 index 000000000..844b091a9 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/flush_early_info_finder.h @@ -0,0 +1,73 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: mmohabey@google.com (Megha Mohabey) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_INFO_FINDER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_INFO_FINDER_H_ + +#include "net/instaweb/util/public/basictypes.h" + +namespace net_instaweb { + +class FlushEarlyRenderInfo; +class RewriteDriver; + +// Finds a subset of flush early information which may be used by +// FlushEarlyFlow. This includes information like privately cacheable resources, +// charset. +class FlushEarlyInfoFinder { + public: + static const char kFlushEarlyRenderPropertyName[]; + + FlushEarlyInfoFinder() {} + virtual ~FlushEarlyInfoFinder(); + + // Checks whether GetCharset will return meaningful result. The default + // implementation does not, but classes inheriting likely do. Users of + // GetCharset should check this function and supply a default behavior if + // IsMeaningful returns false. + virtual bool IsMeaningful() const { + return false; + } + + // Gets the flush early info and update the RewriteDriver. + virtual void UpdateFlushEarlyInfoInDriver(RewriteDriver* driver); + + // Computes the flush early info. + virtual void ComputeFlushEarlyInfo(RewriteDriver* driver); + + // Gets the charset of the html document. Users of this function should also + // check IsMeaningful() to see if the implementation of this function returns + // meaningful results and provide a default behavior if it does not. 
+ virtual const char* GetCharset(const RewriteDriver* driver); + + virtual const char* GetCohort() const = 0; + + virtual int64 cache_expiration_time_ms() const = 0; + + protected: + void UpdateFlushEarlyInfoCacheEntry( + RewriteDriver* driver, + FlushEarlyRenderInfo* flush_early_render_info); + + private: + DISALLOW_COPY_AND_ASSIGN(FlushEarlyInfoFinder); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_INFO_FINDER_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/flush_early_info_finder_test_base.h b/psol/include/src/net/instaweb/rewriter/public/flush_early_info_finder_test_base.h new file mode 100644 index 000000000..50217c0c0 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/flush_early_info_finder_test_base.h @@ -0,0 +1,61 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: mmohabey@google.com (Megha Mohabey) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_INFO_FINDER_TEST_BASE_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_INFO_FINDER_TEST_BASE_H_ + +#include "net/instaweb/rewriter/public/flush_early_info_finder.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/timer.h" + +namespace net_instaweb { + +class RewriteDriver; + +// By default, FlushEarlyInfoFinder does not return meaningful results. This +// class can be used by tests which manually manage FlushEarlyRenderInfo. 
+class MeaningfulFlushEarlyInfoFinder : public FlushEarlyInfoFinder { + public: + MeaningfulFlushEarlyInfoFinder() : num_compute_calls_(0) {} + virtual ~MeaningfulFlushEarlyInfoFinder() {} + virtual bool IsMeaningful() const { + return true; + } + virtual const char* GetCohort() const { + return "NullCohort"; + } + virtual int64 cache_expiration_time_ms() const { + return Timer::kHourMs; + } + // Only counts the call; the driver argument is not used. + virtual void ComputeFlushEarlyInfo(RewriteDriver* /* driver */) { + ++num_compute_calls_; + } + // Number of ComputeFlushEarlyInfo calls since construction or the last + // Clear(). + int num_compute_calls() const { + return num_compute_calls_; + } + // Resets the call counter to zero. + void Clear() { + num_compute_calls_ = 0; + } + + private: + int num_compute_calls_; +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_EARLY_INFO_FINDER_TEST_BASE_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/flush_html_filter.h b/psol/include/src/net/instaweb/rewriter/public/flush_html_filter.h new file mode 100644 index 000000000..2874b4404 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/flush_html_filter.h @@ -0,0 +1,53 @@ +/* + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +// Author: jmarantz@google.com (Joshua Marantz) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_HTML_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_HTML_FILTER_H_ + +#include "net/instaweb/rewriter/public/common_filter.h" +#include "net/instaweb/util/public/basictypes.h" + +namespace net_instaweb { + +class HtmlElement; +class RewriteDriver; + +// This filter is run immediately after lexing when streaming HTML into +// the system. It is used to monitor the HTML and try to figure out good +// times to flush, based on document structure and timing. +class FlushHtmlFilter : public CommonFilter { + public: + explicit FlushHtmlFilter(RewriteDriver* driver); + virtual ~FlushHtmlFilter(); + + virtual void StartDocumentImpl(); + virtual void StartElementImpl(HtmlElement* element); + virtual void EndElementImpl(HtmlElement* element); + virtual void Flush(); + + virtual const char* Name() const { return "FlushHtmlFilter"; } + + private: + int score_; + + DISALLOW_COPY_AND_ASSIGN(FlushHtmlFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_FLUSH_HTML_FILTER_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/furious_matcher.h b/psol/include/src/net/instaweb/rewriter/public/furious_matcher.h new file mode 100644 index 000000000..aba019cf3 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/furious_matcher.h @@ -0,0 +1,62 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: mukerjee@google.com (Matt Mukerjee) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FURIOUS_MATCHER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_FURIOUS_MATCHER_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class RequestHeaders; +class ResponseHeaders; +class RewriteOptions; + +// Provides a way to replace the mapping of clients/sessions to furious +// experiments. +// +// Furious is the A/B experiment framework that enables us to track +// page speed statistics and correlate them with different sets of +// rewriters. The default implementation uses cookies to send clients +// to the same experiment consistently. This implementation can be +// overridden to divide clients/sessions into experiments using a +// different mechanism. +class FuriousMatcher { + public: + FuriousMatcher() { } + virtual ~FuriousMatcher(); + + // Decides which experiment to place the current client/session into. + // Returns true if the mapping needs to be stored. + virtual bool ClassifyIntoExperiment(const RequestHeaders& headers, + RewriteOptions* options); + + // Stores the client/session -> experiment mapping for the domain indicated + // by url. The experiment id is indicated by state. The default + // implementation stores this in a cookie in the response headers, setting it + // to expire one week from now_ms. 
+ virtual void StoreExperimentData(int state, const StringPiece& url, + int64 now_ms, ResponseHeaders* headers); + + private: + DISALLOW_COPY_AND_ASSIGN(FuriousMatcher); +}; + +} // namespace net_instaweb +#endif // NET_INSTAWEB_REWRITER_PUBLIC_FURIOUS_MATCHER_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/furious_util.h b/psol/include/src/net/instaweb/rewriter/public/furious_util.h new file mode 100644 index 000000000..bba5981bc --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/furious_util.h @@ -0,0 +1,82 @@ +/* + * Copyright 2012 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: nforman@google.com (Naomi Forman) +// +// Functionality and constants for handling Furious experiments and +// measurement. +// +// Furious is the A/B experiment framework that uses cookies +// and Google Analytics to track page speed statistics and correlate +// them with different sets of rewriters. + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_FURIOUS_UTIL_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_FURIOUS_UTIL_H_ + +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { + +class RequestHeaders; +class ResponseHeaders; +class RewriteOptions; + +namespace furious { + +// kFuriousNoExperiment indicates there is an actual cookie set, but the cookie +// says: don't run experiments on this user. E.g. 
if you're running an A/B +// experiment on 40% of the traffic, 20% is in A, 20% is in B, and +// 60% is in NoExperiment. +enum FuriousState { + kFuriousNotSet = -1, // Indicates no experiment cookie was set. + kFuriousNoExperiment = 0, +}; + +// Name of the Furious cookie we set when running experiments. +const char kFuriousCookie[] = "_GFURIOUS"; +const char kFuriousCookiePrefix[] = "_GFURIOUS="; + +// Populates value with the state indicated by the FuriousCookie, if found. +// Returns true if a cookie was found, false if it was not. +bool GetFuriousCookieState(const RequestHeaders& headers, int* value); + +// Removes the Furious cookie from the request headers so we don't +// send it to the origin. +void RemoveFuriousCookie(RequestHeaders *headers); + +// Add a Set-Cookie header for Furious on the domain of url, +// one week from now_ms, putting it on the side of the experiment +// indicated by state. +void SetFuriousCookie(ResponseHeaders* headers, int state, + const StringPiece& url, int64 now_ms); + +// Determines which side of the experiment this request should end up on. +int DetermineFuriousState(const RewriteOptions* options); + +// The string value of a Furious State. We don't want to use "ToString" +// in case we change how we want the cookies to look. +GoogleString FuriousStateToCookieString(int state); + +// Converts a Furious Cookie string, e.g. "2", into a FuriousState. +int CookieStringToState(const StringPiece& cookie_str); + +} // namespace furious + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_FURIOUS_UTIL_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/google_analytics_filter.h b/psol/include/src/net/instaweb/rewriter/public/google_analytics_filter.h new file mode 100644 index 000000000..833143104 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/google_analytics_filter.h @@ -0,0 +1,178 @@ +/* + * Copyright 2011 Google Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: slamm@google.com (Stephen Lamm) + +// Search for synchronous loads of Google Analytics similar to the following: +// +// +// +// +// Replace the document.write with a new snippet that loads ga.js +// asynchronously. Also, insert a replacement for _getTracker that +// converts any calls to the synchronous API to the asynchronous API. +// The _getTracker replacement is a new function that returns a mock +// tracker object. Any time a synchronous API method is called, the +// mock tracker forwards it to a _gaq.push(...) call. +// +// An alternative approach would have been to find all the API calls and +// rewrite them to the asynchronous API. However, to be done properly, +// it would have had the added complication of using a JavaScript +// compiler. +// + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_GOOGLE_ANALYTICS_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_GOOGLE_ANALYTICS_FILTER_H_ + +#include + +#include "net/instaweb/htmlparse/public/empty_html_filter.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/scoped_ptr.h" +#include "net/instaweb/util/public/string.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { +class HtmlCdataNode; +class HtmlCharactersNode; +class HtmlCommentNode; +class HtmlElement; +class HtmlIEDirectiveNode; +class HtmlParse; +class Statistics; +class Variable; + + +// Edit a substring in a script element.
+class ScriptEditor { + public: + enum Type { /* tag kept in editor_type_ and returned by GetType() */ + kGaJsScriptSrcLoad = 0, + kGaJsDocWriteLoad, + kGaJsInit, + }; + ScriptEditor(HtmlElement* script_element_, + HtmlCharactersNode* characters_node, + GoogleString::size_type pos, + GoogleString::size_type len, + Type editor_type); + /* NOTE(review): the first constructor parameter is spelled like the script_element_ member; harmless in a declaration, but check that the definition does not shadow the member. The accessors below return the corresponding members. */ + HtmlElement* GetScriptElement() const { return script_element_; } + HtmlCharactersNode* GetScriptCharactersNode() const { + return script_characters_node_; + } + Type GetType() const { return editor_type_; } + /* NOTE(review): presumably fills *contents with the script text in which the pos_/len_ span is replaced by `replacement`; the definition is not visible here -- confirm. */ + void NewContents(const StringPiece &replacement, + GoogleString* contents) const; + + private: + HtmlElement* script_element_; + HtmlCharactersNode* script_characters_node_; + + GoogleString::size_type pos_; + GoogleString::size_type len_; + + Type editor_type_; + DISALLOW_COPY_AND_ASSIGN(ScriptEditor); +}; + + +// Filter +// NOTE(review): text is missing from this copy between "Filter" and "where $hash" (at least the example markup); compare with the mod_pagespeed header this was copied from. +// +// +// where $hash stands for using the active Hasher and tweaking the result to +// be a valid identifier continuation. Further, the combined source file +// has the code: +// var mod_pagespeed_${hash("a.js")} = "code of a.js as a string literal"; +// var mod_pagespeed_${hash("b.js")} = "code of b.js as a string literal"; +class JsCombineFilter : public RewriteFilter { + public: + static const char kJsFileCountReduction[]; // statistics variable name + + // rewrite_driver is the context owning us, and filter_id is the ID we + // are registered under. NOTE(review): there is no filter_id parameter here; presumably the ID is the one id() returns -- confirm. + explicit JsCombineFilter(RewriteDriver* rewrite_driver); + virtual ~JsCombineFilter(); + + // Registers the provided statistics variable names with 'statistics'. + static void InitStats(Statistics* statistics); + virtual const char* id() const { + return RewriteOptions::kJavascriptCombinerId; + } + + protected: + // RewriteFilter overrides --- HTML parsing event handlers. 
+ virtual void StartDocumentImpl(); + virtual void StartElementImpl(HtmlElement* element); + virtual void EndElementImpl(HtmlElement* element); + virtual void Characters(HtmlCharactersNode* characters); + virtual void Flush(); + virtual void IEDirective(HtmlIEDirectiveNode* directive); + virtual const char* Name() const { return "JsCombine"; } + virtual RewriteContext* MakeRewriteContext(); + virtual const UrlSegmentEncoder* encoder() const { + return &encoder_; + } + + private: + class JsCombiner; + class Context; + + friend class JsCombineFilterTest; + /* NOTE(review): presumably decides whether the script `element`, referenced through `src`, joins the current combination; the definition is not visible here -- confirm. */ + void ConsiderJsForCombination(HtmlElement* element, + HtmlElement::Attribute* src); + + // Returns JS variable name where code for given URL should be stored. + static GoogleString VarName(const ServerContext* server_context, + const GoogleString& url); + + void NextCombination(); + + Context* MakeContext(); + + JsCombiner* combiner() const; + ServerContext* server_context() const { return server_context_; } + + ScriptTagScanner script_scanner_; + int script_depth_; // how many script elements we are inside + // current outermost +// to: +// +// NOTE(review): text is missing from this copy around here: JsCombineFilter is never closed and the JsDisableFilter before/after example is gone; compare with the mod_pagespeed headers these were copied from. +class JsDisableFilter : public EmptyHtmlFilter { + public: + explicit JsDisableFilter(RewriteDriver* driver); + ~JsDisableFilter(); + + static const char kEnableJsExperimental[]; + static const char kDisableJsExperimental[]; + + virtual void StartDocument(); + + virtual void StartElement(HtmlElement* element); + + virtual void EndElement(HtmlElement* element); + + virtual void EndDocument(); + + virtual void DetermineEnabled(); + + virtual const char* Name() const { + return "JsDisableFilter"; + } + // NOTE(review): presumably returns the script text chosen according to `options`; the definition is not visible here -- confirm. + static GoogleString GetJsDisableScriptSnippet(const RewriteOptions* options); + + private: + // Inserts the experimental js enable/disable code. + void InsertJsDeferExperimentalScript(HtmlElement* element); + + // Insert meta tag with 'X-UA-Compatible'. This will avoid IE going to quirks + // mode. 
More information about this can be found in + // http://webdesign.about.com/od/metataglibraries/p/x-ua-compatible-meta-tag.htm + void InsertMetaTagForIE(HtmlElement* element); + + RewriteDriver* rewrite_driver_; + ScriptTagScanner script_tag_scanner_; + int index_; + bool defer_js_experimental_script_written_; // NOTE(review): presumably set once InsertJsDeferExperimentalScript() has emitted its script -- confirm. + + DISALLOW_COPY_AND_ASSIGN(JsDisableFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_JS_DISABLE_FILTER_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/js_inline_filter.h b/psol/include/src/net/instaweb/rewriter/public/js_inline_filter.h new file mode 100644 index 000000000..c7ad40683 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/js_inline_filter.h @@ -0,0 +1,71 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: mdsteele@google.com (Matthew D. Steele) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_JS_INLINE_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_JS_INLINE_FILTER_H_ + +#include + +#include "net/instaweb/rewriter/public/common_filter.h" +#include "net/instaweb/rewriter/public/resource.h" +#include "net/instaweb/rewriter/public/script_tag_scanner.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string_util.h" + +namespace net_instaweb { +class HtmlElement; +class HtmlCharactersNode; +class RewriteDriver; + +// Inline small Javascript files. 
+class JsInlineFilter : public CommonFilter { + public: + explicit JsInlineFilter(RewriteDriver* driver); + virtual ~JsInlineFilter(); + + virtual void StartDocumentImpl(); + virtual void EndDocument(); + virtual void StartElementImpl(HtmlElement* element); + virtual void EndElementImpl(HtmlElement* element); + virtual void Characters(HtmlCharactersNode* characters); + virtual const char* Name() const { return "InlineJs"; } + + private: + class Context; + friend class Context; + /* NOTE(review): presumably checks the resource against size_threshold_bytes_; the definition is not visible here -- confirm. */ + bool ShouldInline(const ResourcePtr& resource) const; + void RenderInline(const ResourcePtr& resource, const StringPiece& text, + HtmlElement* element); + /* const, so fixed at construction. NOTE(review): presumably the largest script size, in bytes, that is still inlined -- confirm where it is initialised. */ + const size_t size_threshold_bytes_; + ScriptTagScanner script_tag_scanner_; + + // This is set to true during StartElement() for a [NOTE(review): the tag name and the rest of this comment are missing from this copy; presumably a script element that may be inlined -- restore from the mod_pagespeed header]. + bool should_inline_; + + DISALLOW_COPY_AND_ASSIGN(JsInlineFilter); +}; + +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_JS_INLINE_FILTER_H_ diff --git a/psol/include/src/net/instaweb/rewriter/public/js_outline_filter.h b/psol/include/src/net/instaweb/rewriter/public/js_outline_filter.h new file mode 100644 index 000000000..f0b80f779 --- /dev/null +++ b/psol/include/src/net/instaweb/rewriter/public/js_outline_filter.h @@ -0,0 +1,75 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: sligocki@google.com (Shawn Ligocki) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_JS_OUTLINE_FILTER_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_JS_OUTLINE_FILTER_H_ + +#include + +#include "net/instaweb/rewriter/public/common_filter.h" +#include "net/instaweb/rewriter/public/script_tag_scanner.h" +#include "net/instaweb/util/public/basictypes.h" +#include "net/instaweb/util/public/string.h" + +namespace net_instaweb { +class HtmlCharactersNode; +class HtmlElement; +class MessageHandler; +class OutputResource; +class ServerContext; +class RewriteDriver; + +// Filter to take explicit