0fb90651b2
by always registering cohort, and deciding whether we need it at read time. Also add the corresponding example page and integration test. Fix it on nginx by adding a separate hook for post-property-cache init, as it's actually not ready in StartParse w/ProxyFetch (while it is with Apache). Also remove some needless quoting that was pointed out in review.
1811 lines
78 KiB
C++
1811 lines
78 KiB
C++
/*
|
|
* Copyright 2010 Google Inc.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
// Author: jmarantz@google.com (Joshua Marantz)
|
|
|
|
#ifndef NET_INSTAWEB_REWRITER_PUBLIC_REWRITE_DRIVER_H_
|
|
#define NET_INSTAWEB_REWRITER_PUBLIC_REWRITE_DRIVER_H_
|
|
|
|
#include <map>
|
|
#include <set>
|
|
#include <vector>
|
|
|
|
#include "base/logging.h"
|
|
#include "net/instaweb/http/public/cache_url_async_fetcher.h"
|
|
#include "net/instaweb/http/public/http_cache.h"
|
|
#include "net/instaweb/http/public/request_context.h"
|
|
#include "net/instaweb/http/public/url_async_fetcher.h"
|
|
#include "net/instaweb/rewriter/cached_result.pb.h"
|
|
#include "net/instaweb/rewriter/public/critical_images_finder.h"
|
|
#include "net/instaweb/rewriter/public/critical_selector_finder.h"
|
|
#include "net/instaweb/rewriter/public/downstream_cache_purger.h"
|
|
#include "net/instaweb/rewriter/public/inline_attribute_slot.h"
|
|
#include "net/instaweb/rewriter/public/inline_resource_slot.h"
|
|
#include "net/instaweb/rewriter/public/output_resource.h"
|
|
#include "net/instaweb/rewriter/public/output_resource_kind.h"
|
|
#include "net/instaweb/rewriter/public/resource.h"
|
|
#include "net/instaweb/rewriter/public/resource_namer.h"
|
|
#include "net/instaweb/rewriter/public/resource_slot.h"
|
|
#include "net/instaweb/rewriter/public/rewrite_context.h"
|
|
#include "net/instaweb/rewriter/public/rewrite_options.h"
|
|
#include "net/instaweb/rewriter/public/scan_filter.h"
|
|
#include "net/instaweb/rewriter/public/server_context.h"
|
|
#include "net/instaweb/rewriter/public/srcset_slot.h"
|
|
#include "pagespeed/kernel/base/abstract_mutex.h"
|
|
#include "pagespeed/kernel/base/atomic_bool.h"
|
|
#include "pagespeed/kernel/base/basictypes.h"
|
|
#include "pagespeed/kernel/base/function.h"
|
|
#include "pagespeed/kernel/base/printf_format.h"
|
|
#include "pagespeed/kernel/base/proto_util.h"
|
|
#include "pagespeed/kernel/base/scoped_ptr.h"
|
|
#include "pagespeed/kernel/base/string.h"
|
|
#include "pagespeed/kernel/base/string_util.h"
|
|
#include "pagespeed/kernel/base/thread_annotations.h"
|
|
#include "pagespeed/kernel/base/thread_system.h"
|
|
#include "pagespeed/kernel/base/writer.h"
|
|
#include "pagespeed/kernel/html/html_element.h"
|
|
#include "pagespeed/kernel/html/html_filter.h"
|
|
#include "pagespeed/kernel/html/html_node.h"
|
|
#include "pagespeed/kernel/html/html_parse.h"
|
|
#include "pagespeed/kernel/http/content_type.h"
|
|
#include "pagespeed/kernel/http/google_url.h"
|
|
#include "pagespeed/kernel/http/request_headers.h"
|
|
#include "pagespeed/kernel/http/response_headers.h"
|
|
#include "pagespeed/kernel/http/user_agent_matcher.h"
|
|
#include "pagespeed/kernel/thread/queued_worker_pool.h"
|
|
#include "pagespeed/kernel/thread/scheduler.h"
|
|
#include "pagespeed/kernel/thread/sequence.h"
|
|
#include "pagespeed/kernel/util/categorized_refcount.h"
|
|
#include "pagespeed/kernel/util/url_segment_encoder.h"
|
|
#include "pagespeed/opt/http/property_cache.h"
|
|
|
|
namespace net_instaweb {
|
|
|
|
class AbstractLogRecord;
|
|
class AsyncFetch;
|
|
class CommonFilter;
|
|
class DebugFilter;
|
|
class DependencyTracker;
|
|
class DomStatsFilter;
|
|
class DomainRewriteFilter;
|
|
class FallbackPropertyPage;
|
|
class FileSystem;
|
|
class FlushEarlyInfo;
|
|
class HtmlWriterFilter;
|
|
class MessageHandler;
|
|
class RequestProperties;
|
|
class RequestTrace;
|
|
class RewriteDriverPool;
|
|
class RewriteFilter;
|
|
class Statistics;
|
|
class UrlLeftTrimFilter;
|
|
class UrlNamer;
|
|
|
|
// This extends class HtmlParse (which should be renamed HtmlContext) by providing
|
|
// context for rewriting resources (css, js, images).
|
|
class RewriteDriver : public HtmlParse {
|
|
public:
|
|
// Status return-code for ResolveCssUrls.
//
// The enumerators carry no explicit values, so they are numbered 0, 1, 2 in
// declaration order.
enum CssResolutionStatus {
  kWriteFailed,
  kNoResolutionNeeded,
  kSuccess
};
|
|
|
|
// Mode for BoundedWaitForCompletion.
//
// The enumerators carry no explicit values, so they are numbered 0..3 in
// declaration order.
enum WaitMode {
  kNoWait,               // Used internally.  Do not pass in.
  kWaitForCompletion,    // Wait for everything to complete (up to deadline).
  kWaitForCachedRender,  // Wait for at least cached rewrites to complete,
                         // and anything else that finishes within deadline.
  kWaitForShutDown       // Makes sure that all work, including any that's
                         // being done in background, finishes.
};
|
|
|
|
// Indicates document's mimetype as XHTML, HTML, or is not
// known/something else.  Note that in Apache we might not know the
// correct mimetype because a downstream module might change it.
// It's not clear how likely this is, since mod_rewrite and mod_mime
// run upstream of mod_pagespeed.  However if anyone sets mimetype
// via "Header Add", it would affect the Browser's view of the
// document's mimetype (which is what determines the parsing) but
// mod_pagespeed would not know.
//
// Note that we also have doctype().IsXhtml() but that indicates quirks-mode
// for CSS, and does not control how the parser parses the document.
//
// The enumerators carry no explicit values, so they are numbered 0, 1, 2 in
// declaration order.
enum XhtmlStatus {
  kXhtmlUnknown,
  kIsXhtml,
  kIsNotXhtml
};
|
|
|
|
// Policy argument of CreateInputResource and MayRewriteUrl; see
// CreateInputResource for how it is used.
enum InlineAuthorizationPolicy {
  // Resources from unauthorized domains may be inlined.
  kInlineUnauthorizedResources,
  // Only resources from authorized domains may be inlined.
  kInlineOnlyAuthorizedResources
};
|
|
|
|
// Intended use of a resource, passed to CreateInputResource and
// MayRewriteUrl; see CreateInputResource for how it is used.
enum IntendedFor {
  // The resource will be inlined after fetching.
  kIntendedForInlining,
  // Any other use of the resource.
  kIntendedForGeneral
};
|
|
|
|
// This string identifies, for the PropertyCache, a group of properties
|
|
// that are computed from the DOM, and thus can, if desired, be rewritten
|
|
// on every HTML request.
|
|
static const char kDomCohort[];
|
|
// The cohort for properties that are written by the beacon handler.
|
|
static const char kBeaconCohort[];
|
|
// Cohort for dependency information. This is written at different time than
|
|
// kDomCohort, and might not be in use for some requests, depending on
|
|
// settings.
|
|
static const char kDependenciesCohort[];
|
|
|
|
// Property Names in DomCohort.
|
|
// Tracks the timestamp when we last received a request for this url.
|
|
static const char kLastRequestTimestamp[];
|
|
// Tracks if we exceeded the maximum size limit of html which we should parse.
|
|
static const char kParseSizeLimitExceeded[];
|
|
// Flush Subresources Info associated with the HTML page.
|
|
static const char kSubresourcesPropertyName[];
|
|
// Status codes of previous responses.
|
|
static const char kStatusCodePropertyName[];
|
|
|
|
RewriteDriver(MessageHandler* message_handler,
|
|
FileSystem* file_system,
|
|
UrlAsyncFetcher* url_async_fetcher);
|
|
|
|
// Need explicit destructors to allow destruction of scoped_ptr-controlled
|
|
// instances without propagating the include files.
|
|
virtual ~RewriteDriver();
|
|
|
|
// Returns a fresh instance using the same options we do, using the same log
|
|
// record. Drivers should only be cloned within the same request.
|
|
//
|
|
// Clones share the same request_context, which contains bits derived from the
|
|
// request headers, so request_headers_ is also cloned (or shared if we make
|
|
// them shareable).
|
|
//
|
|
// You must call SetRequestHeaders before calling Clone.
|
|
RewriteDriver* Clone();
|
|
|
|
// Clears the current request cache of resources and base URL. The
|
|
// filter-chain is left intact so that a new request can be issued.
|
|
// Deletes all RewriteContexts.
|
|
//
|
|
// WaitForCompletion must be called prior to Clear().
|
|
void Clear();
|
|
|
|
// Initialize statistics for all filters that need it.
|
|
static void InitStats(Statistics* statistics);
|
|
|
|
// Initialize statics. Initialize/Terminate calls must be paired.
|
|
static void Initialize();
|
|
static void Terminate();
|
|
|
|
// Formats a "deadline exceeded" message for a given filter.
|
|
static GoogleString DeadlineExceededMessage(StringPiece filter_name);
|
|
|
|
// Sets a server context enabling the rewriting of
|
|
// resources. This will replace any previous server context.
|
|
void SetServerContext(ServerContext* server_context);
|
|
|
|
// Returns true if we may cache extend Css, Images, PDFs, or Scripts
|
|
// respectively.
|
|
bool MayCacheExtendCss() const;
|
|
bool MayCacheExtendImages() const;
|
|
bool MayCacheExtendPdfs() const;
|
|
bool MayCacheExtendScripts() const;
|
|
|
|
// Accessor for user_agent_.  The string is handed back by const reference,
// not copied.
const GoogleString& user_agent() const {
  return user_agent_;
}
|
|
|
|
// Returns the RequestProperties object held in request_properties_, as a raw
// const pointer (the result of request_properties_.get()).
const RequestProperties* request_properties() const {
  return request_properties_.get();
}
|
|
|
|
// Reinitializes request_properties_, clearing any cached values.
|
|
void ClearRequestProperties();
|
|
|
|
bool write_property_cache_dom_cohort() const {
|
|
return write_property_cache_dom_cohort_;
|
|
}
|
|
// Overwrites the write_property_cache_dom_cohort_ flag with x.
void set_write_property_cache_dom_cohort(bool x) {
  write_property_cache_dom_cohort_ = x;
}
|
|
|
|
// Returns the list of cohorts that should be read in based on
|
|
// our options.
|
|
static PropertyCache::CohortVector GetCohortList(
|
|
const PropertyCache* pcache, const RewriteOptions* options,
|
|
const ServerContext* server_context);
|
|
|
|
// Should be called once everything in the property cache has been read,
|
|
// and the pages set on the object.
|
|
void PropertyCacheSetupDone();
|
|
|
|
// Accessor for request_context_.  The RequestContextPtr is returned by value.
RequestContextPtr request_context() {
  return request_context_;
}
|
|
void set_request_context(const RequestContextPtr& x);
|
|
|
|
// Convenience method to return the trace context from the request_context()
|
|
// if both are configured and NULL otherwise.
|
|
RequestTrace* trace_context();
|
|
|
|
// Convenience methods to issue a trace annotation if tracing is enabled.
|
|
// If tracing is disabled, these methods are no-ops.
|
|
void TracePrintf(const char* fmt, ...);
|
|
void TraceLiteral(const char* literal);
|
|
void TraceString(const GoogleString& s);
|
|
|
|
// Return a mutable pointer to the response headers that filters can update
|
|
// before the first flush. Returns NULL after Flush has occurred.
|
|
ResponseHeaders* mutable_response_headers() {
|
|
return flush_occurred_ ? NULL : response_headers_;
|
|
}
|
|
|
|
// Returns a const version of the ResponseHeaders*, indepdendent of whether
|
|
// Flush has occurred. Note that ResponseHeaders* may still be NULL if
|
|
// no one has called set_response_headers_ptr.
|
|
//
|
|
// TODO(jmarantz): Change API to require response_headers in StartParse so
|
|
// we can guarantee this is non-null.
|
|
const ResponseHeaders* response_headers() {
|
|
return response_headers_;
|
|
}
|
|
|
|
// Set the pointer to the response headers that filters can update
|
|
// before the first flush. RewriteDriver does NOT take ownership
|
|
// of this memory.
|
|
void set_response_headers_ptr(ResponseHeaders* headers) {
|
|
response_headers_ = headers;
|
|
}
|
|
|
|
// Reinitializes request_headers_ (a scoped ptr) with a copy of the original
|
|
// request headers. Note that the fetches associated with the driver could
|
|
// be using a modified version of the original request headers.
|
|
// There MUST be exactly 1 call to this method after a rewrite driver object
|
|
// has been constructed or recycled, before the RewriteDriver is used for
|
|
// request processing.
|
|
//
|
|
// This method also sets up the user-agent and device properties.
|
|
void SetRequestHeaders(const RequestHeaders& headers);
|
|
|
|
// Returns the RequestHeaders held in request_headers_, as a raw const
// pointer (the result of request_headers_.get()).
const RequestHeaders* request_headers() const {
  return request_headers_.get();
}
|
|
|
|
// Returns the UserAgentMatcher supplied by the server context.  A server
// context must already have been set: this is DCHECKed, and the pointer is
// then dereferenced unconditionally.
UserAgentMatcher* user_agent_matcher() const {
  DCHECK(server_context() != NULL);
  return server_context()->user_agent_matcher();
}
|
|
|
|
// Adds the filters from the options, specified by name in enabled_filters.
|
|
// This must be called explicitly after object construction to provide an
|
|
// opportunity to programmatically add custom filters beyond those defined
|
|
// in RewriteOptions, via AddFilter(HtmlFilter* filter) (below).
|
|
void AddFilters();
|
|
|
|
// Adds a filter to the very beginning of the pre-render chain, taking
|
|
// ownership. This should only be used for filters that must run before any
|
|
// filter added via PrependOwnedPreRenderFilter.
|
|
void AddOwnedEarlyPreRenderFilter(HtmlFilter* filter);
|
|
|
|
// Adds a filter to the beginning of the pre-render chain, taking ownership.
|
|
void PrependOwnedPreRenderFilter(HtmlFilter* filter);
|
|
// Adds a filter to the end of the pre-render chain, taking ownership.
|
|
void AppendOwnedPreRenderFilter(HtmlFilter* filter);
|
|
// Same, without taking ownership.
|
|
void AppendUnownedPreRenderFilter(HtmlFilter* filter);
|
|
|
|
// Adds a filter to the end of the post-render chain, taking ownership.
|
|
void AddOwnedPostRenderFilter(HtmlFilter* filter);
|
|
// Same, without taking ownership.
|
|
void AddUnownedPostRenderFilter(HtmlFilter* filter);
|
|
|
|
// Add a RewriteFilter to the end of the pre-render chain and take ownership
|
|
// of the filter. This differs from AppendOwnedPreRenderFilter in that
|
|
// it adds the filter's ID into a dispatch table for serving
|
|
// rewritten resources. E.g. if your filter->id == "xy" and
|
|
// FetchResource("NAME.pagespeed.xy.HASH.EXT"...) is called, then
|
|
// RewriteDriver will dispatch to filter->Fetch().
|
|
//
|
|
// This is used when the filter being added is not part of the
|
|
// core set built into RewriteDriver and RewriteOptions, such
|
|
// as platform-specific or server-specific filters, or filters
|
|
// invented for unit-testing the framework.
|
|
void AppendRewriteFilter(RewriteFilter* filter);
|
|
|
|
// Like AppendRewriteFilter, but adds the filter to the beginning of the
|
|
// pre-render chain.
|
|
void PrependRewriteFilter(RewriteFilter* filter);
|
|
|
|
// Tells RewriteDriver that a certain portion of URL namespace should not
|
|
// be handled via usual (HTTP proxy semantics) means. It's up to
|
|
// the filters to actually arrange for that to do something.
|
|
// Takes ownership of the claimant object. Note that it's important for the
|
|
// claims to be disjoint, since the RewriteContext framework needs to
|
|
// be able to assign compatible Resource objects for same URLs/slots among
|
|
// all filters that deal with them.
|
|
void AddResourceUrlClaimant(ResourceUrlClaimant* claimant);
|
|
|
|
// Controls how HTML output is written. Be sure to call this last, after
|
|
// all other filters have been established.
|
|
//
|
|
// TODO(jmarantz): fix this in the implementation so that the caller can
|
|
// install filters in any order and the writer will always be last.
|
|
void SetWriter(Writer* writer);
|
|
|
|
// Accessor for writer_.
Writer* writer() const {
  return writer_;
}
|
|
|
|
// Initiates an async fetch for a rewritten resource with the specified name.
|
|
// If url matches the pattern of what the driver is authorized to serve,
|
|
// then true is returned and the caller must listen on the callback for
|
|
// the completion of the request.
|
|
//
|
|
// If the driver is not authorized to serve the resource for any of the
|
|
// following reasons, false is returned and the callback will -not- be
|
|
// called - the request should be passed to another handler.
|
|
// * The URL is invalid or it does not match the general pagespeed pattern.
|
|
// * The filter id in the URL does not map to a known filter.
|
|
// * The filter for the id in the URL doesn't recognize the format of the URL.
|
|
// * The filter for the id in the URL is forbidden.
|
|
//
|
|
// In other words there are three outcomes for this routine:
|
|
// 1. the request was handled immediately and the callback called
|
|
// before the method returns. true is returned.
|
|
// 2. the request looks good but was queued because some other resource
|
|
// fetch is needed to satisfy it. true is returned.
|
|
// 3. the request does not look like it belongs to Instaweb. The callback
|
|
// will not be called, and false will be returned.
|
|
//
|
|
// In even other words, if this routine returns 'false' then the callback
|
|
// will not be called. If the callback -is- called, then this should be the
|
|
// 'final word' on this request, whether it was called with success=true or
|
|
// success=false.
|
|
//
|
|
// Note that if the request headers have not yet been set on the driver then
|
|
// they'll be taken from the fetch.
|
|
bool FetchResource(const StringPiece& url, AsyncFetch* fetch);
|
|
|
|
// Initiates an In-Place Resource Optimization (IPRO) fetch (A resource which
|
|
// is served under the original URL, but is still able to be rewritten).
|
|
//
|
|
// proxy_mode indicates whether we are running as a proxy where users
|
|
// depend on us to send contents. When set true, we will perform HTTP fetches
|
|
// to get contents if not in cache and will ignore kRecentFetchNotCacheable
|
|
// and kRecentFetchFailed since we'll have to fetch the resource for users
|
|
// anyway. Origin implementations (like mod_pagespeed) should set this to
|
|
// false and let the server serve the resource if it's not in cache.
|
|
//
|
|
// If proxy_mode is false and the resource could not be found in HTTP cache,
|
|
// async_fetch->Done(false) will be called and async_fetch->status_code()
|
|
// will be CacheUrlAsyncFetcher::kNotInCacheStatus (to distinguish this
|
|
// from a different reason for failure, like kRecentFetchNotCacheable).
|
|
//
|
|
// Note that if the request headers have not yet been set on the driver then
|
|
// they'll be taken from the fetch.
|
|
void FetchInPlaceResource(const GoogleUrl& gurl, bool proxy_mode,
|
|
AsyncFetch* async_fetch);
|
|
|
|
// See FetchResource. There are three differences:
|
|
// 1. It takes an OutputResource instead of a URL.
|
|
// 2. It returns whether a fetch was queued or not. This is safe
|
|
// to ignore because in either case the callback will be called.
|
|
// 3. If 'filter' is NULL then the request only checks cache and
|
|
// (if enabled) the file system.
|
|
bool FetchOutputResource(const OutputResourcePtr& output_resource,
|
|
RewriteFilter* filter,
|
|
AsyncFetch* async_fetch);
|
|
|
|
// Attempts to decode an output resource based on the URL pattern
|
|
// without actually rewriting it. No permission checks are performed on the
|
|
// url, though it is parsed to see if it looks like the url of a generated
|
|
// resource (which should mean checking the hash to ensure we generated it
|
|
// ourselves).
|
|
// TODO(jmaessen): add url hash & check thereof.
|
|
OutputResourcePtr DecodeOutputResource(const GoogleUrl& url,
|
|
RewriteFilter** filter) const;
|
|
|
|
// As above, but does not actually create a resource object,
|
|
// and instead outputs the decoded information into the various out
|
|
// parameters. Returns whether decoding was successful or not.
|
|
// Uses options_to_use rather than this->options() to determine which
|
|
// drivers are forbidden from applying, etc.
|
|
bool DecodeOutputResourceName(const GoogleUrl& url,
|
|
const RewriteOptions* options_to_use,
|
|
const UrlNamer* url_namer,
|
|
ResourceNamer* name_out,
|
|
OutputResourceKind* kind_out,
|
|
RewriteFilter** filter_out) const;
|
|
|
|
// Attempts to lookup the metadata cache info that would be used for the
|
|
// output resource at url with the RewriteOptions set on this driver.
|
|
//
|
|
// If there is a problem with the URL, returns false, and *error_out
|
|
// will contain an error message.
|
|
//
|
|
// If it can determine the metadata cache key successfully, returns true,
|
|
// and eventually callback will be invoked with the metadata cache key
|
|
// and the decoding results.
|
|
//
|
|
// After calling this method, the driver should not be used for anything else.
|
|
bool LookupMetadataForOutputResource(
|
|
StringPiece url,
|
|
GoogleString* error_out,
|
|
RewriteContext::CacheLookupResultCallback* callback);
|
|
|
|
// Decodes the incoming pagespeed url to original url(s).
|
|
bool DecodeUrl(const GoogleUrl& url,
|
|
StringVector* decoded_urls) const;
|
|
|
|
// As above, but lets one specify the options and URL namer to use.
|
|
// Meant for use with the decoding_driver.
|
|
bool DecodeUrlGivenOptions(const GoogleUrl& url,
|
|
const RewriteOptions* options,
|
|
const UrlNamer* url_namer,
|
|
StringVector* decoded_urls) const;
|
|
|
|
// Accessor for file_system_.
FileSystem* file_system() {
  return file_system_;
}
|
|
// Accessor for url_async_fetcher_.
UrlAsyncFetcher* async_fetcher() {
  return url_async_fetcher_;
}
|
|
|
|
// Set a fetcher that will be used by RewriteDriver for current request
|
|
// only (that is, until Clear()). RewriteDriver will take ownership of this
|
|
// fetcher, and will keep it around until Clear(), even if further calls
|
|
// to this method are made.
|
|
void SetSessionFetcher(UrlAsyncFetcher* f);
|
|
|
|
// Accessor for distributed_async_fetcher_.
UrlAsyncFetcher* distributed_fetcher() {
  return distributed_async_fetcher_;
}
|
|
// Does not take ownership.
|
|
void set_distributed_fetcher(UrlAsyncFetcher* fetcher) {
|
|
distributed_async_fetcher_ = fetcher;
|
|
}
|
|
|
|
// Creates a cache fetcher that uses the driver's fetcher and its options.
|
|
// Note: this means the driver's fetcher must survive as long as this does.
|
|
CacheUrlAsyncFetcher* CreateCacheFetcher();
|
|
// Returns a cache fetcher that does not fall back to an actual fetcher.
|
|
CacheUrlAsyncFetcher* CreateCacheOnlyFetcher();
|
|
|
|
// Accessor for server_context_.
ServerContext* server_context() const {
  return server_context_;
}
|
|
Statistics* statistics() const;
|
|
|
|
// Takes ownership of 'options'.
|
|
void set_custom_options(RewriteOptions* options) {
|
|
set_options_for_pool(NULL, options);
|
|
}
|
|
|
|
// Takes ownership of 'options'. pool denotes the pool of rewrite drivers that
|
|
// use these options. May be NULL if using custom options.
|
|
void set_options_for_pool(RewriteDriverPool* pool, RewriteOptions* options) {
|
|
controlling_pool_ = pool;
|
|
options_.reset(options);
|
|
}
|
|
|
|
// Pool in which this driver can be recycled. May be NULL.
|
|
RewriteDriverPool* controlling_pool() { return controlling_pool_; }
|
|
|
|
// Return the options used for this RewriteDriver.
|
|
const RewriteOptions* options() const { return options_.get(); }
|
|
|
|
// Override HtmlParse's StartParseId to propagate any required options.
|
|
// Note that if this (or other variants) returns true you should use
|
|
// FinishParse(), otherwise Cleanup().
|
|
virtual bool StartParseId(const StringPiece& url, const StringPiece& id,
|
|
const ContentType& content_type);
|
|
|
|
// Override HtmlParse's FinishParse to ensure that the
|
|
// request-scoped cache is cleared immediately.
|
|
//
|
|
// Note that the RewriteDriver can delete itself in this method, if
|
|
// it's not externally managed, and if all RewriteContexts have been
|
|
// completed.
|
|
virtual void FinishParse();
|
|
|
|
// As above, but asynchronous. Note that the RewriteDriver may already be
|
|
// deleted at the point the callback is invoked. The scheduler lock will
|
|
// not be held when the callback is run.
|
|
void FinishParseAsync(Function* callback);
|
|
|
|
// Report error message with description of context's location
|
|
// (such as filenames and line numbers). context may be NULL, in which case
|
|
// the current parse position will be used.
|
|
void InfoAt(const RewriteContext* context,
|
|
const char* msg, ...) INSTAWEB_PRINTF_FORMAT(3, 4);
|
|
|
|
// Constructs name and URL for the specified input resource and encoder.
|
|
bool GenerateOutputResourceNameAndUrl(
|
|
const UrlSegmentEncoder* encoder,
|
|
const ResourceContext* data,
|
|
const ResourcePtr& input_resource,
|
|
GoogleString* name,
|
|
GoogleUrl* mapped_gurl,
|
|
GoogleString* failure_reason);
|
|
|
|
// Creates a reference-counted pointer to a new OutputResource object.
|
|
//
|
|
// The content type is taken from the input_resource, but can be modified
|
|
// with SetType later if that is not correct (e.g. due to image transcoding).
|
|
|
|
// Constructs an output resource corresponding to the specified input resource
|
|
// and encoded using the provided encoder. Assumes permissions checking
|
|
// occurred when the input resource was constructed, and does not do it again.
|
|
// To avoid if-chains, tolerates a NULL input_resource (by returning NULL).
|
|
// TODO(jmaessen, jmarantz): Do we want to permit NULL input_resources here?
|
|
// jmarantz has evinced a distaste.
|
|
OutputResourcePtr CreateOutputResourceFromResource(
|
|
const char* filter_id,
|
|
const UrlSegmentEncoder* encoder,
|
|
const ResourceContext* data,
|
|
const ResourcePtr& input_resource,
|
|
OutputResourceKind kind,
|
|
GoogleString* failure_reason);
|
|
|
|
// Creates an output resource where the name is provided. The intent is to
|
|
// be able to derive the content from the name, for example, by encoding
|
|
// URLs and metadata.
|
|
//
|
|
// This method succeeds unless the filename is too long.
|
|
//
|
|
// This name is prepended with path for writing hrefs, and the resulting url
|
|
// is encoded and stored at file_prefix when working with the file system.
|
|
// So hrefs are:
|
|
// $(PATH)/$(NAME).pagespeed[.$EXPERIMENT].$(FILTER_PREFIX).
|
|
// $(HASH).$(CONTENT_TYPE_EXT)
|
|
//
|
|
// EXPERIMENT is set only when there is an active experiment_spec.
|
|
//
|
|
// Could be private since you should use one of the versions below but put
|
|
// here with the rest like it and for documentation clarity.
|
|
OutputResourcePtr CreateOutputResourceWithPath(
|
|
const StringPiece& mapped_path, const StringPiece& unmapped_path,
|
|
const StringPiece& base_url, const StringPiece& filter_id,
|
|
const StringPiece& name, OutputResourceKind kind,
|
|
GoogleString* failure_reason);
|
|
|
|
// Fills in the resource namer based on the given filter_id, name and options
|
|
// stored in the driver.
|
|
void PopulateResourceNamer(
|
|
const StringPiece& filter_id,
|
|
const StringPiece& name,
|
|
ResourceNamer* full_name);
|
|
|
|
// Version of CreateOutputResourceWithPath which first takes only the
|
|
// unmapped path and finds the mapped path using the DomainLawyer
|
|
// and the base_url is this driver's base_url.
|
|
OutputResourcePtr CreateOutputResourceWithUnmappedUrl(
|
|
const GoogleUrl& unmapped_gurl, const StringPiece& filter_id,
|
|
const StringPiece& name, OutputResourceKind kind,
|
|
GoogleString* failure_reason);
|
|
|
|
// Version of CreateOutputResourceWithPath where the unmapped and mapped
|
|
// paths are different and the base_url is this driver's base_url.
|
|
OutputResourcePtr CreateOutputResourceWithMappedPath(
|
|
const StringPiece& mapped_path, const StringPiece& unmapped_path,
|
|
const StringPiece& filter_id, const StringPiece& name,
|
|
OutputResourceKind kind, GoogleString* failure_reason) {
|
|
return CreateOutputResourceWithPath(mapped_path, unmapped_path,
|
|
decoded_base_url_.AllExceptLeaf(),
|
|
filter_id, name, kind, failure_reason);
|
|
}
|
|
|
|
// Version of CreateOutputResourceWithPath where the unmapped and mapped
|
|
// paths and the base url are all the same. FOR TESTS ONLY.
|
|
OutputResourcePtr CreateOutputResourceWithPath(
|
|
const StringPiece& path, const StringPiece& filter_id,
|
|
const StringPiece& name, OutputResourceKind kind,
|
|
GoogleString* failure_reason) {
|
|
return CreateOutputResourceWithPath(path, path, path, filter_id, name,
|
|
kind, failure_reason);
|
|
}
|
|
|
|
// Creates an input resource based on input_url. Returns NULL if the input
|
|
// resource url isn't valid or is a data url, or can't legally be rewritten
|
|
// in the context of this page, in which case *is_authorized will be false.
|
|
// Assumes that resources from unauthorized domains may not be rewritten and
|
|
// that the resource is not intended exclusively for inlining.
|
|
ResourcePtr CreateInputResource(const GoogleUrl& input_url,
|
|
bool* is_authorized);
|
|
|
|
// Creates an input resource. Returns NULL if the input resource url isn't
|
|
// valid or is a data url, or can't legally be rewritten in the context of
|
|
// this page (which could mean that it was a resource from an unauthorized
|
|
// domain being processed by a filter that does not allow unauthorized
|
|
// resources, in which case *is_authorized will be false).
|
|
//
|
|
// There are two "special" options, and if you don't care about them you
|
|
// should just call CreateInputResource(input_url, is_authorized) to use
|
|
// their defaults:
|
|
// * If resources from unauthorized domains may be inlined, set
|
|
// inline_authorization_policy to kInlineUnauthorizedResources, otherwise
|
|
// set it to kInlineOnlyAuthorizedResources.
|
|
// * If this resource will be inlined after fetching, then set intended_for to
|
|
// kIntendedForInlining, otherwise use kIntendedForGeneral. This is to
|
|
// support AllowWhenInlining.
|
|
ResourcePtr CreateInputResource(
|
|
const GoogleUrl& input_url,
|
|
InlineAuthorizationPolicy inline_authorization_policy,
|
|
IntendedFor intended_for,
|
|
bool* is_authorized);
|
|
|
|
// Creates an input resource from the given absolute url. Requires that the
|
|
// provided url has been checked, and can legally be rewritten in the current
|
|
// page context. Only for use by unit tests.
|
|
ResourcePtr CreateInputResourceAbsoluteUncheckedForTestsOnly(
|
|
const StringPiece& absolute_url);
|
|
|
|
// Returns true if some ResourceUrlClaimant has staked a claim on given URL.
|
|
// If this returns true, CreateInputResource will fail, but it's probably
|
|
// not worth logging any debug filter hints about that.
|
|
bool IsResourceUrlClaimed(const GoogleUrl& url) const;
|
|
|
|
// Checks to see if the input_url has the same origin as the base url, to
|
|
// make sure we're not fetching from another server. Does not consult the
|
|
// domain lawyer, and is not affected by AddDomain().
|
|
// Precondition: input_url.IsWebValid()
|
|
bool MatchesBaseUrl(const GoogleUrl& input_url) const;
|
|
|
|
// Checks to see if we can write the input_url resource in the domain_url
|
|
// taking into account domain authorization, wildcard allow/disallow from
|
|
// RewriteOptions, and the intended use of the url's resource. After the
|
|
// function is executed, is_authorized_domain will indicate whether input_url
|
|
// was found to belong to an authorized domain or not.
|
|
bool MayRewriteUrl(const GoogleUrl& domain_url,
|
|
const GoogleUrl& input_url,
|
|
InlineAuthorizationPolicy inline_authorization_policy,
|
|
IntendedFor intended_for,
|
|
bool* is_authorized_domain) const;
|
|
|
|
// Returns the appropriate base gurl to be used for resolving hrefs
|
|
// in the document. Note that HtmlParse::google_url() is the URL
|
|
// for the HTML file and is used for printing html syntax errors.
|
|
const GoogleUrl& base_url() const { return base_url_; }
|
|
|
|
// The URL that was requested if FetchResource was called.
|
|
StringPiece fetch_url() const { return fetch_url_; }
|
|
|
|
// Returns the decoded version of base_gurl() in case it was encoded by a
|
|
// non-default UrlNamer (for the default UrlNamer this returns the same value
|
|
// as base_url()). Required when fetching a resource by its encoded name.
|
|
const GoogleUrl& decoded_base_url() const { return decoded_base_url_; }
|
|
// Returns decoded_base_url_.Spec(), i.e. the decoded base URL in string form.
StringPiece decoded_base() const {
  return decoded_base_url_.Spec();
}
|
|
|
|
// Quick way to tell if the document url is https (ie was fetched via https).
|
|
bool IsHttps() const {
  // Decided purely by the scheme of the document URL (google_url()).
  const bool https_scheme = google_url().SchemeIs("https");
  return https_scheme;
}
|
|
|
|
const UrlSegmentEncoder* default_encoder() const {
  // Address of the encoder member embedded in this driver.
  return &default_encoder_;
}
|
|
|
|
// Finds a filter with the given ID, or returns NULL if none found.
|
|
RewriteFilter* FindFilter(const StringPiece& id) const;
|
|
|
|
// Returns refs_before_base.
|
|
bool refs_before_base() { return refs_before_base_; }
|
|
|
|
// Sets whether or not there were references to urls before the
|
|
// base tag (if there is a base tag). This variable has document-level
|
|
// scope. It is reset at the beginning of every document by
|
|
// ScanFilter.
|
|
void set_refs_before_base() {
  // One-way latch: this setter can only raise the flag, never clear it.
  refs_before_base_ = true;
}
|
|
|
|
// Get/set the charset of the containing HTML page. See scan_filter.cc for
|
|
// an explanation of how this is determined, but NOTE that the determined
|
|
// charset can change as more of the HTML is seen, in particular after a
|
|
// meta tag.
|
|
StringPiece containing_charset() const {
  // Read-only accessor for the charset most recently stored by
  // set_containing_charset().  Declared const so it can be called through
  // a const RewriteDriver.
  return containing_charset_;
}
|
|
void set_containing_charset(const StringPiece charset) {
  // Copy the bytes into our own string; 'charset' is not retained.
  charset.CopyToString(&containing_charset_);
}
|
|
|
|
// Creates and registers a HtmlElement slot for rewriting.
|
|
// If this is the first time called for this position, a new slot will be
|
|
// returned. On subsequent calls, the original slot will be returned so
|
|
// that rewrites are propagated between filters.
|
|
HtmlResourceSlotPtr GetSlot(const ResourcePtr& resource,
|
|
HtmlElement* elt,
|
|
HtmlElement::Attribute* attr);
|
|
|
|
// Creates and registers an inline resource slot for rewriting.
|
|
// If this is the first time called for this position, a new slot will be
|
|
// returned. On subsequent calls, the original slot will be returned so
|
|
// that rewrites are propagated between filters.
|
|
InlineResourceSlotPtr GetInlineSlot(const ResourcePtr& resource,
|
|
HtmlCharactersNode* char_node);
|
|
|
|
// Creates and registers an inline attribute resource slot for rewriting.
|
|
// If this is the first time called for this position, a new slot will be
|
|
// returned. On subsequent calls, the original slot will be returned so
|
|
// that rewrites are propagated between filters.
|
|
InlineAttributeSlotPtr GetInlineAttributeSlot(
|
|
const ResourcePtr& resource, HtmlElement* element,
|
|
HtmlElement::Attribute* attribute);
|
|
|
|
// Creates and registers a source set slot collection for rewriting
|
|
// all the images in the srcset attribute of an <img>. Also creates the
|
|
// necessary resources using the provided filter's policy.
|
|
//
|
|
// If this is the first time called for this element + attr, a new
|
|
// collection will be returned. On subsequent calls, the original collection
|
|
// will be returned so that rewrites are propagated between filters. All
|
|
// filters using this call are expected to have the same values for
|
|
// AllowUnauthorizedDomain() and IntendedForInlining().
|
|
SrcSetSlotCollectionPtr GetSrcSetSlotCollection(
|
|
CommonFilter* filter, HtmlElement* element, HtmlElement::Attribute* attr);
|
|
|
|
// Method to start a resource rewrite. This is called by a filter during
|
|
// parsing, although the Rewrite might continue after deadlines expire
|
|
// and the rewritten HTML must be flushed. InitiateRewrite returns
|
|
// false if the system is not healthy enough to support resource rewrites.
|
|
bool InitiateRewrite(RewriteContext* rewrite_context)
|
|
LOCKS_EXCLUDED(rewrite_mutex());
|
|
void InitiateFetch(RewriteContext* rewrite_context);
|
|
|
|
// Provides a mechanism for a RewriteContext to notify a
|
|
// RewriteDriver that it is complete, to allow the RewriteDriver
|
|
// to delete itself or return it back to a free pool in the ServerContext.
|
|
//
|
|
// This will also call back into RewriteContext::Propagate, letting it
|
|
// know whether the context is still attached to the HTML DOM
|
|
// (and hence safe to render), and to do other bookkeeping.
|
|
//
|
|
// If 'permit_render' is false, no rendering will be asked for even if
|
|
// the context is still attached.
|
|
void RewriteComplete(RewriteContext* rewrite_context, bool permit_render);
|
|
|
|
// Provides a mechanism for a RewriteContext to notify a
|
|
// RewriteDriver that a certain number of rewrites have been discovered
|
|
// to need to take the slow path.
|
|
void ReportSlowRewrites(int num);
|
|
|
|
// If there are not outstanding references to this RewriteDriver,
|
|
// delete it or recycle it to a free pool in the ServerContext.
|
|
// If this is a fetch, calling this also signals to the system that you
|
|
// are no longer interested in its results.
|
|
void Cleanup();
|
|
|
|
// Adds an extra external reference to the object. You should not
|
|
// normally need to call it (NewRewriteDriver does it initially), unless for
|
|
// some reason you want to pin the object (e.g. in tests). Matches up with
|
|
// Cleanup.
|
|
void AddUserReference();
|
|
|
|
// Debugging routines to print out data about the driver.
|
|
GoogleString ToString(bool show_detached_contexts) const
|
|
LOCKS_EXCLUDED(rewrite_mutex());
|
|
GoogleString ToStringLockHeld(bool show_detached_contexts) const
|
|
EXCLUSIVE_LOCKS_REQUIRED(rewrite_mutex());
|
|
void PrintState(bool show_detached_contexts); // For debugging.
|
|
void PrintStateToErrorLog(bool show_detached_contexts); // For logs.
|
|
|
|
// Wait for outstanding Rewrite to complete. Once the rewrites are
|
|
// complete they can be rendered.
|
|
void WaitForCompletion();
|
|
|
|
// Wait for outstanding rewrite to complete, including any background
|
|
// work that may be ongoing even after results were reported.
|
|
//
|
|
// Note: while this guarantees that the result of the computation is
|
|
// known, the thread that performed it may still be running for a
|
|
// little bit and accessing the driver.
|
|
void WaitForShutDown();
|
|
|
|
// As above, but with a time bound, and taking a mode parameter to decide
|
|
// between WaitForCompletion or WaitForShutDown behavior.
|
|
// If timeout_ms <= 0, no time bound will be used.
|
|
void BoundedWaitFor(WaitMode mode, int64 timeout_ms)
|
|
LOCKS_EXCLUDED(rewrite_mutex());
|
|
|
|
// If this is set to true, during a Flush of HTML the system will
|
|
// wait for results of all rewrites rather than just waiting for
|
|
// cache lookups and a small deadline. Note, however, that in very
|
|
// rare circumstances some rewrites may still be dropped due to
|
|
// excessive load.
|
|
//
|
|
// Note: reset every time the driver is recycled.
|
|
void set_fully_rewrite_on_flush(bool x) { fully_rewrite_on_flush_ = x; }
|
|
|
|
// Returns if this response has a blocking rewrite or not.
|
|
bool fully_rewrite_on_flush() const {
|
|
return fully_rewrite_on_flush_;
|
|
}
|
|
|
|
// This is relevant only when fully_rewrite_on_flush is true.
|
|
// When this is set to true, Flush of HTML will not wait for async events
|
|
// while it does wait when it is set to false.
|
|
void set_fast_blocking_rewrite(bool x) { fast_blocking_rewrite_ = x; }
|
|
|
|
bool fast_blocking_rewrite() const {
|
|
return fast_blocking_rewrite_;
|
|
}
|
|
|
|
// If the value of X-PSA-Blocking-Rewrite request header matches the blocking
|
|
// rewrite key, set fully_rewrite_on_flush flag.
|
|
void EnableBlockingRewrite(RequestHeaders* request_headers);
|
|
|
|
// Indicate that this RewriteDriver will be explicitly deleted, and
|
|
// thus should not be auto-deleted at the end of the parse. This is
|
|
// primarily for tests.
|
|
//
|
|
// TODO(jmarantz): Consider phasing this out to make tests behave
|
|
// more like servers.
|
|
void set_externally_managed(bool x) {
  // Only records the flag; nothing else happens in this setter.
  externally_managed_ = x;
}
|
|
|
|
// Called by RewriteContext to let RewriteDriver know it will be continuing
|
|
// on the fetch in background, and so it should defer doing full cleanup
|
|
// sequences until DetachedFetchComplete() is called.
|
|
void DetachFetch();
|
|
|
|
// Called by RewriteContext when a detached async fetch is complete, allowing
|
|
// the RewriteDriver to be recycled if FetchComplete() got invoked as well.
|
|
void DetachedFetchComplete();
|
|
|
|
// Cleans up the driver and any fetch rewrite contexts, unless the fetch
|
|
// rewrite got detached by a call to DetachFetch(), in which case a call to
|
|
// DetachedFetchComplete() must also be performed.
|
|
void FetchComplete();
|
|
|
|
// Deletes the specified RewriteContext. If this is the last RewriteContext
|
|
// active on this Driver, and there is no other outstanding activity, then
|
|
// the RewriteDriver itself can be recycled, and WaitForCompletion can return.
|
|
//
|
|
// We expect this method to be called on the Rewrite thread.
|
|
void DeleteRewriteContext(RewriteContext* rewrite_context);
|
|
|
|
// Per-flush-window rewrite deadline, in milliseconds.
int rewrite_deadline_ms() {
  // The value lives in the options object, not on the driver itself.
  return options()->rewrite_deadline_ms();
}
|
|
|
|
// Sets a maximum amount of time to process a page across all flush
|
|
// windows; i.e., the entire lifecycle of this driver during a given pageload.
|
|
// A negative value indicates no limit.
|
|
// Setting fully_rewrite_on_flush() overrides this.
|
|
void set_max_page_processing_delay_ms(int x) {
  // Stored verbatim; negative values are kept as-is (they mean "no limit").
  max_page_processing_delay_ms_ = x;
}
|
|
int max_page_processing_delay_ms() { return max_page_processing_delay_ms_; }
|
|
|
|
// Sets the device type chosen for the current property_page.
|
|
void set_device_type(UserAgentMatcher::DeviceType x) {
  device_type_ = x;
}
|
|
// Returns the device type stored by set_device_type().
UserAgentMatcher::DeviceType device_type() const {
  return device_type_;
}
|
|
|
|
// Tries to register the given rewrite context as working on
|
|
// its partition key. If this context is the first one to try to handle it,
|
|
// returns NULL. Otherwise returns the previous such context.
|
|
//
|
|
// Must only be called from rewrite thread.
|
|
RewriteContext* RegisterForPartitionKey(const GoogleString& partition_key,
|
|
RewriteContext* candidate);
|
|
|
|
// Must be called after all other rewrites that are currently relying on this
|
|
// one have had their RepeatedSuccess or RepeatedFailure methods called.
|
|
//
|
|
// Must only be called from rewrite thread.
|
|
void DeregisterForPartitionKey(
|
|
const GoogleString& partition_key, RewriteContext* candidate);
|
|
|
|
// Indicates that a Flush through the HTML parser chain should happen
|
|
// soon, e.g. once the network pauses its incoming byte stream.
|
|
void RequestFlush() {
  // Only marks the request; no flush is performed in this call.
  flush_requested_ = true;
}
|
|
// True once RequestFlush() has set the flag.
bool flush_requested() const {
  return flush_requested_;
}
|
|
|
|
// Executes a Flush() if RequestFlush() was called, e.g. from the
|
|
// Listener Filter (see set_event_listener below). Consider an HTML
|
|
// parse driven by a UrlAsyncFetcher. When the UrlAsyncFetcher
|
|
// temporarily runs out of bytes to read, it calls
|
|
// response_writer->Flush(). When that happens, we may want to
|
|
// consider flushing the outstanding HTML events through the system
|
|
// so that the browser can start fetching subresources and
|
|
// rendering. The event_listener (see set_event_listener below)
|
|
// helps determine whether enough "interesting" events have passed
|
|
// in the current flush window so that we should take this incoming
|
|
// network pause as an opportunity.
|
|
void ExecuteFlushIfRequested();
|
|
|
|
// Asynchronous version of the above. Note that you should not
|
|
// attempt to write out any data until the callback is invoked.
|
|
// (If a flush is not needed, the callback will be invoked immediately).
|
|
void ExecuteFlushIfRequestedAsync(Function* callback);
|
|
|
|
// Overrides HtmlParse::Flush so that it can happen in two phases:
|
|
// 1. Pre-render chain runs, resulting in async rewrite activity
|
|
// 2. async rewrite activity ends, calling callback, and post-render
|
|
// filters run.
|
|
// This API is used for unit-tests & Apache (which lacks a useful event
|
|
// model) and results in blocking behavior.
|
|
//
|
|
// FlushAsync is preferred for event-driven servers.
|
|
virtual void Flush();
|
|
|
|
// Initiates an asynchronous Flush. done->Run() will be called when
|
|
// the flush is complete. Further calls to ParseText should be deferred until
|
|
// the callback is called. Scheduler mutex is not held while done is called.
|
|
void FlushAsync(Function* done);
|
|
|
|
// Queues up a task to run on the (high-priority) rewrite thread.
|
|
void AddRewriteTask(Function* task);
|
|
|
|
// Queues up a task to run on the low-priority rewrite thread.
|
|
// Such tasks are expected to be safely cancelable.
|
|
void AddLowPriorityRewriteTask(Function* task);
|
|
|
|
// Accessor for the html_worker_ sequence pointer.
QueuedWorkerPool::Sequence* html_worker() {
  return html_worker_;
}
|
|
Sequence* rewrite_worker();
|
|
// Raw pointer to the sequence held in scheduler_sequence_.
Scheduler::Sequence* scheduler_sequence() { return scheduler_sequence_.get(); }
|
|
|
|
// Accessor for the low_priority_rewrite_worker_ sequence pointer.
QueuedWorkerPool::Sequence* low_priority_rewrite_worker() {
  return low_priority_rewrite_worker_;
}
|
|
|
|
// Make the rewrite_worker tasks run on the request thread. This
|
|
// must be called immediately after initializing the driver, before
|
|
// it starts processing the request.
|
|
void RunTasksOnRequestThread();
|
|
|
|
// Switches the driver back to running rewrite_worker tasks using
|
|
// the QueuedWorkerPool. This can be called when we are retiring
|
|
// a server-request on behalf of the client (e.g. after a deadline was
|
|
// exceeded), but want background optimization to continue. It can
|
|
// no longer continue on the request thread.
|
|
void SwitchToQueuedWorkerPool() EXCLUSIVE_LOCKS_REQUIRED(rewrite_mutex());
|
|
|
|
// Accessor for the scheduler_ pointer used by this driver.
Scheduler* scheduler() {
  return scheduler_;
}
|
|
|
|
// Used by CacheExtender, CssCombineFilter, etc. for rewriting domains
|
|
// of sub-resources in CSS.
|
|
DomainRewriteFilter* domain_rewriter() {
  // Raw pointer obtained from domain_rewriter_ via get().
  return domain_rewriter_.get();
}
|
|
UrlLeftTrimFilter* url_trim_filter() {
  // Raw pointer obtained from url_trim_filter_ via get().
  return url_trim_filter_.get();
}
|
|
|
|
// Rewrites CSS content to absolutify any relative embedded URLs, streaming
|
|
// the results to the writer. Returns 'false' if the writer returns false
|
|
// or if the content was not rewritten because the domains of the gurl
|
|
// and resolved_base match.
|
|
//
|
|
// input_css_base contains the path where the CSS text came from.
|
|
// output_css_base contains the path where the CSS will be written.
|
|
CssResolutionStatus ResolveCssUrls(const GoogleUrl& input_css_base,
|
|
const StringPiece& output_css_base,
|
|
const StringPiece& contents,
|
|
Writer* writer,
|
|
MessageHandler* handler);
|
|
|
|
// Determines if an URL relative to the given input_base needs to be
|
|
// absolutified given that it will end up under output_base:
|
|
// - If we are proxying and input_base isn't proxy encoded, then yes.
|
|
// - If we aren't proxying and input_base != output_base, then yes.
|
|
// - If we aren't proxying and the domain lawyer will shard or rewrite
|
|
// input_base, then yes.
|
|
// If not NULL also set *proxy_mode to whether proxy mode is active or not.
|
|
bool ShouldAbsolutifyUrl(const GoogleUrl& input_base,
|
|
const GoogleUrl& output_base,
|
|
bool* proxy_mode) const;
|
|
|
|
// Update the PropertyValue named 'property_name' in dom cohort with
|
|
// the value 'property_value'. It is the responsibility of the client to
|
|
// ensure that property cache and dom cohort are enabled when this function is
|
|
// called. It is a programming error to call this function when property
|
|
// cache or dom cohort is not available, more so since the value payload has
|
|
// to be serialised before calling this function. Hence this function will
|
|
// DFATAL if property cache or dom cohort is not available.
|
|
void UpdatePropertyValueInDomCohort(
|
|
AbstractPropertyPage* page,
|
|
StringPiece property_name,
|
|
StringPiece property_value);
|
|
|
|
// Returns the property page which contains the cached properties associated
|
|
// with the current URL.
|
|
PropertyPage* property_page() const;
|
|
|
|
// Returns the property page which contains the cached properties associated
|
|
// with the current URL and fallback URL (i.e. without query params). This
|
|
// should be used where a property is interested in fallback values if
|
|
// actual values are not present.
|
|
FallbackPropertyPage* fallback_property_page() const {
  // Returns the stored pointer as-is; no lookup or lazy creation happens here.
  return fallback_property_page_;
}
|
|
|
|
// Returns property page which contains cached properties associated with
|
|
// the current origin (host/port/protocol). May be NULL.
|
|
PropertyPage* origin_property_page() const;
|
|
|
|
// Takes ownership of page.
|
|
void set_property_page(PropertyPage* page);
|
|
// Takes ownership of page.
|
|
void set_fallback_property_page(FallbackPropertyPage* page);
|
|
// Does not take the ownership of the page.
|
|
void set_unowned_fallback_property_page(FallbackPropertyPage* page);
|
|
// Takes ownership of page.
|
|
void set_origin_property_page(PropertyPage* page);
|
|
|
|
// The JS to detect above-the-fold images should only be enabled if one of the
|
|
// filters that uses critical image information is enabled, the property cache
|
|
// is enabled (since the critical image information is stored in the property
|
|
// cache), and it is not explicitly disabled through options.
|
|
bool is_critical_images_beacon_enabled();
|
|
|
|
// Used by ImageRewriteFilter for identifying critical images.
|
|
CriticalImagesInfo* critical_images_info() const {
  // Raw pointer via get(); whatever set_critical_images_info() installed.
  return critical_images_info_.get();
}
|
|
|
|
// This should only be called by the CriticalSelectorFinder. Normal users
|
|
// should call CriticalSelectorFinder::IsCriticalImage.
|
|
// TODO(jud): Remove when the finders reside in RewriteDriver and manage their
|
|
// own state.
|
|
CriticalSelectorInfo* critical_selector_info() const {
  // Returns the raw pointer held in critical_selector_info_ (whatever
  // set_critical_selector_info() last installed).  Declared const, matching
  // the sibling critical_images_info() accessor, because it only reads the
  // member.
  return critical_selector_info_.get();
}
|
|
|
|
// This should only be called by the CriticalSelectorFinder.
|
|
// TODO(jud): Remove when the finders reside in RewriteDriver and manage their
|
|
// own state.
|
|
void set_critical_selector_info(CriticalSelectorInfo* info) {
  // reset() swaps 'info' in as the object held by critical_selector_info_.
  critical_selector_info_.reset(info);
}
|
|
|
|
// Inserts the critical images present on the requested html page. It takes
|
|
// ownership of critical_images_info. This should only be called by the
|
|
// CriticalImagesFinder, normal users should just be using the automatic
|
|
// management of critical_images_info that CriticalImagesFinder provides.
|
|
void set_critical_images_info(CriticalImagesInfo* critical_images_info) {
  // reset() swaps the argument in as the object held by critical_images_info_.
  critical_images_info_.reset(critical_images_info);
}
|
|
|
|
// Return true if we must flatten css imports, either because the filter is
|
|
// enabled explicitly or because it is enabled by PrioritizeCriticalCss.
|
|
bool FlattenCssImportsEnabled() const {
|
|
return (options()->Enabled(RewriteOptions::kFlattenCssImports) ||
|
|
(!options()->Forbidden(RewriteOptions::kFlattenCssImports) &&
|
|
(options()->Enabled(RewriteOptions::kPrioritizeCriticalCss) ||
|
|
options()->Enabled(RewriteOptions::kComputeCriticalCss))));
|
|
}
|
|
|
|
// We expect this method to be called on the HTML parser thread.
|
|
// Returns the number of images whose low quality images are inlined in the
|
|
// html page.
|
|
int num_inline_preview_images() const {
  // Plain read of the counter; no synchronization is done here.
  return num_inline_preview_images_;
}
|
|
|
|
// We expect this method to be called on the HTML parser thread.
|
|
void increment_num_inline_preview_images();
|
|
|
|
// Increment reference count for misc. async ops that need the RewriteDriver
|
|
// kept alive.
|
|
void IncrementAsyncEventsCount();
|
|
|
|
// Decrements a reference count bumped up by IncrementAsyncEventsCount()
|
|
void DecrementAsyncEventsCount();
|
|
|
|
// Increment reference count for misc async ops that should be waited for
|
|
// before doing rendering for current flush window.
|
|
void IncrementRenderBlockingAsyncEventsCount();
|
|
|
|
// Decrements a reference count bumped up by
|
|
// IncrementRenderBlockingAsyncEventsCount()
|
|
void DecrementRenderBlockingAsyncEventsCount();
|
|
|
|
// Determines whether the document's Content-Type has a mimetype indicating
|
|
// that browsers should parse it as XHTML.
|
|
XhtmlStatus MimeTypeXhtmlStatus();
|
|
|
|
// Records whether the lazyload script has been flushed.
void set_is_lazyload_script_flushed(bool x) {
  is_lazyload_script_flushed_ = x;
}
|
|
bool is_lazyload_script_flushed() const {
|
|
return is_lazyload_script_flushed_; }
|
|
|
|
// This method is not thread-safe. Call it only from the html parser thread.
|
|
FlushEarlyInfo* flush_early_info();
|
|
|
|
// dependency_tracker()->RegisterDependencyCandidate and
|
|
// ReportDependencyCandidate can be called from any thread.
|
|
DependencyTracker* dependency_tracker() const {
  // Raw pointer obtained from dependency_tracker_ via get().
  return dependency_tracker_.get();
}
|
|
|
|
// Determines whether we are currently in Debug mode; meaning that the
|
|
// site owner or user has enabled filter kDebug.
|
|
bool DebugMode() const {
  // Debug mode is exactly "the kDebug filter is enabled in the options".
  const bool debug_filter_on = options()->Enabled(RewriteOptions::kDebug);
  return debug_filter_on;
}
|
|
|
|
// Log the given debug message(s) as HTML comments after the given element,
|
|
// if not NULL, it has not been flushed, and if debug is enabled. The form
|
|
// that takes a repeated field is intended for use by CachedResult, e.g:
|
|
// InsertDebugComment(cached_result.debug_message(), element);
|
|
// Messages are HTML-escaped before being written out to the DOM.
|
|
void InsertDebugComment(StringPiece unescaped_message, HtmlNode* node);
|
|
void InsertDebugComments(
|
|
const protobuf::RepeatedPtrField<GoogleString>& unescaped_messages,
|
|
HtmlElement* element);
|
|
void InsertUnauthorizedDomainDebugComment(StringPiece url,
|
|
HtmlElement* element);
|
|
|
|
// Generates an unauthorized domain debug comment. Public for unit tests.
|
|
static GoogleString GenerateUnauthorizedDomainDebugComment(
|
|
const GoogleUrl& gurl);
|
|
|
|
// log_record() always returns a pointer to a valid AbstractLogRecord, owned
|
|
// by the rewrite_driver's request context.
|
|
AbstractLogRecord* log_record();
|
|
|
|
// Accessor for the dom_stats_filter_ pointer.
DomStatsFilter* dom_stats_filter() const { return dom_stats_filter_; }
|
|
|
|
// Determines whether the system is healthy enough to rewrite resources.
|
|
// Currently, systems get sick based on the health of the metadata cache.
|
|
bool can_rewrite_resources() const {
  // Returns the stored health verdict; nothing is recomputed in this call.
  return can_rewrite_resources_;
}
|
|
|
|
// Determine whether this driver is nested inside another.
|
|
bool is_nested() const {
  return is_nested_;
}
|
|
|
|
// Determines whether metadata was requested in the request headers and
|
|
// verifies that the key in the header is the same as the expected key. An
|
|
// empty expected key returns false.
|
|
bool MetadataRequested(const RequestHeaders& request_headers) const;
|
|
|
|
// Did the driver attempt to distribute the fetch?
|
|
bool tried_to_distribute_fetch() const {
  // Reports an attempt, not its outcome.
  return tried_to_distribute_fetch_;
}
|
|
|
|
// Writes the specified contents into the output resource, and marks it
|
|
// as optimized. 'inputs' describes the input resources that were used
|
|
// to construct the output, and is used to determine whether the
|
|
// result can be safely cache extended and be marked publicly cacheable.
|
|
// 'content_type' and 'charset' specify the mimetype and encoding of
|
|
// the contents, and will help form the Content-Type header.
|
|
// 'charset' may be empty when not specified.
|
|
//
|
|
// Note that this does not escape charset.
|
|
//
|
|
// Callers should take care that dangerous types like 'text/html' do not
|
|
// sneak into content_type.
|
|
bool Write(const ResourceVector& inputs,
|
|
const StringPiece& contents,
|
|
const ContentType* type,
|
|
StringPiece charset,
|
|
OutputResource* output);
|
|
|
|
// Stores the defer-instrumentation-script flag on the driver.
void set_defer_instrumentation_script(bool x) {
  defer_instrumentation_script_ = x;
}
|
|
bool defer_instrumentation_script() const {
|
|
return defer_instrumentation_script_;
|
|
}
|
|
|
|
// Sets the num_initiated_rewrites_. This should only be called from test
|
|
// code.
|
|
void set_num_initiated_rewrites(int64 x) {
  // The counter is written while holding rewrite_mutex().
  ScopedMutex hold(rewrite_mutex());
  num_initiated_rewrites_ = x;
}
|
|
// Returns the current count of initiated rewrites.
int64 num_initiated_rewrites() const {
  // Take the snapshot while holding rewrite_mutex().
  ScopedMutex hold(rewrite_mutex());
  const int64 snapshot = num_initiated_rewrites_;
  return snapshot;
}
|
|
// Sets the num_detached_rewrites_. This should only be called from test code.
|
|
void set_num_detached_rewrites(int64 x) {
  // The counter is written while holding rewrite_mutex().
  ScopedMutex hold(rewrite_mutex());
  num_detached_rewrites_ = x;
}
|
|
// Returns the current count of detached rewrites.
int64 num_detached_rewrites() const {
  // Take the snapshot while holding rewrite_mutex().
  ScopedMutex hold(rewrite_mutex());
  const int64 snapshot = num_detached_rewrites_;
  return snapshot;
}
|
|
|
|
// Stores a private copy of the PageSpeed query-parameter string.
void set_pagespeed_query_params(StringPiece x) {
  // Copy the bytes so the driver does not depend on the caller's buffer.
  x.CopyToString(&pagespeed_query_params_);
}
|
|
// Returns the string stored by set_pagespeed_query_params().
StringPiece pagespeed_query_params() const {
  return pagespeed_query_params_;
}
|
|
|
|
// Stores a private copy of the PageSpeed option-cookies string.
void set_pagespeed_option_cookies(StringPiece x) {
  // Copy the bytes so the driver does not depend on the caller's buffer.
  x.CopyToString(&pagespeed_option_cookies_);
}
|
|
// Returns the string stored by set_pagespeed_option_cookies().
StringPiece pagespeed_option_cookies() const {
  return pagespeed_option_cookies_;
}
|
|
|
|
// We fragment the cache based on the hostname we got from the request, unless
|
|
// that was overridden in the options with a cache_fragment.
|
|
const GoogleString& CacheFragment() const;
|
|
|
|
// Utility function to set/clear cookies for PageSpeed options. gurl is the
|
|
// URL of the request from which the host is extracted for a cookie attribute.
|
|
// TODO(matterbury): Get the URL from 'this' which we can't do now because it
|
|
// isn't set until we've decided that the content of requested URL is HTML.
|
|
// Returns true if any Set-Cookie headers are added, in which case
|
|
// ComputeCaching has been called on response_headers.
|
|
bool SetOrClearPageSpeedOptionCookies(const GoogleUrl& gurl,
|
|
ResponseHeaders* response_headers);
|
|
|
|
// Calls the provided ResourceNamer's Decode() function, passing the hash and
|
|
// signature lengths from this RewriteDriver.
|
|
bool Decode(StringPiece leaf, ResourceNamer* resource_namer) const;
|
|
|
|
// Accessor for the filters_added_ flag.
bool filters_added() const {
  return filters_added_;
}
|
|
bool has_html_writer_filter() const {
|
|
return html_writer_filter_.get() != nullptr;
|
|
}
|
|
|
|
// Declares whether the current document is AMP or not. Prior to calling
|
|
// this, all HTML events are buffered, to avoid waking up filters that
|
|
// inject scripts.
|
|
void SetIsAmpDocument(bool is_amp);
|
|
// Returns the stored is_amp_ flag.
bool is_amp_document() const {
  return is_amp_;
}
|
|
|
|
protected:
|
|
virtual void DetermineFiltersBehaviorImpl();
|
|
|
|
private:
|
|
friend class DistributedRewriteContextTest;
|
|
friend class RewriteContext;
|
|
friend class RewriteDriverTest;
|
|
friend class RewriteTestBase;
|
|
friend class ServerContextTest;
|
|
|
|
typedef std::map<GoogleString, RewriteFilter*> StringFilterMap;
|
|
|
|
// Returns true if the given fetch request should be distributed.
|
|
bool ShouldDistributeFetch(const StringPiece& filter_id);
|
|
|
|
// Distributes the fetch to another task if ShouldDistributeFetch allows it
|
|
// for the provided filter_id and streams the result to the provided fetch
|
|
// object.
|
|
//
|
|
// Returns true if an attempt to distribute was made. If the attempt fails
|
|
// before async_fetch was written to (before ResponseHeaders) it will call
|
|
// RewriteDriver::FetchResource() and skip distribution. If the attempt fails
|
|
// after writing to the ResponseHeaders then the fetch will ultimately fail
|
|
// and the client will get a broken resource.
|
|
//
|
|
// Returns false if ShouldDistributeFetch disallows the distribution.
|
|
bool DistributeFetch(const StringPiece& url, const StringPiece& filter_id,
|
|
AsyncFetch* async_fetch);
|
|
|
|
// Checks whether outstanding rewrites are completed in a satisfactory fashion
|
|
// with respect to given wait_mode and timeout, and invokes done->Run() (with
|
|
// rewrite_mutex released) when either finished or timed out. May relinquish
|
|
// rewrite_mutex() temporarily to invoke done.
|
|
void CheckForCompletionAsync(WaitMode wait_mode, int64 timeout_ms,
|
|
Function* done)
|
|
EXCLUSIVE_LOCKS_REQUIRED(rewrite_mutex());
|
|
|
|
// A single check attempt for the above. Will either invoke callback (with
|
|
// rewrite_mutex released) or ask scheduler to check again. May relinquish
|
|
// rewrite_mutex() temporarily to invoke done.
|
|
void TryCheckForCompletion(WaitMode wait_mode, int64 end_time_ms,
|
|
Function* done)
|
|
EXCLUSIVE_LOCKS_REQUIRED(rewrite_mutex());
|
|
|
|
// Termination predicate for above.
|
|
bool IsDone(WaitMode wait_mode, bool deadline_reached)
|
|
EXCLUSIVE_LOCKS_REQUIRED(rewrite_mutex());
|
|
|
|
// Always wait for pending async events during shutdown or while waiting for
|
|
// the completion of all rewriting (except in fast_blocking_rewrite mode).
|
|
bool WaitForPendingAsyncEvents(WaitMode wait_mode) {
  // Shutdown always waits for pending async events.
  if (wait_mode == kWaitForShutDown) {
    return true;
  }
  // Otherwise wait only for a full (blocking) rewrite that is not in the
  // fast-blocking variant.
  return fully_rewrite_on_flush_ && !fast_blocking_rewrite_;
}
|
|
|
|
// Portion of flush that happens asynchronously off the scheduler
|
|
// once the rendering is complete. Calls back to 'callback' after its
|
|
// processing, but with the lock released.
|
|
void FlushAsyncDone(int num_rewrites, Function* callback);
|
|
|
|
// Returns the amount of time to wait for rewrites to complete for the
|
|
// current flush window. This combines the per-flush window deadline
|
|
// (configured via rewrite_deadline_ms()) and the per-page deadline
|
|
// (configured via max_page_processing_delay_ms()).
|
|
int64 ComputeCurrentFlushWindowRewriteDelayMs();
|
|
|
|
// Queues up invocation of FlushAsyncDone in our html_workers sequence.
|
|
void QueueFlushAsyncDone(int num_rewrites, Function* callback);
|
|
|
|
// Called as part of implementation of FinishParseAsync, after the
|
|
// flush is complete.
|
|
void QueueFinishParseAfterFlush(Function* user_callback);
|
|
void FinishParseAfterFlush(Function* user_callback);
|
|
|
|
bool RewritesComplete() const EXCLUSIVE_LOCKS_REQUIRED(rewrite_mutex());
|
|
|
|
// Sets the base GURL in response to a base-tag being parsed. This
|
|
// should only be called by ScanFilter.
|
|
void SetBaseUrlIfUnset(const StringPiece& new_base);
|
|
|
|
// Sets the base URL for a resource fetch. This should only be called from
|
|
// test code and from FetchResource.
|
|
void SetBaseUrlForFetch(const StringPiece& url);
|
|
|
|
// Saves a decoding of the Base URL in decoded_base_url_. Use this
|
|
// whenever updating base_url_.
|
|
void SetDecodedUrlFromBase();
|
|
|
|
// The rewrite_mutex is owned by the scheduler.
|
|
AbstractMutex* rewrite_mutex() const LOCK_RETURNED(scheduler_->mutex()) {
  // The mutex handed out is the scheduler's own (scheduler_->mutex()).
  AbstractMutex* const shared_mutex = scheduler_->mutex();
  return shared_mutex;
}
|
|
|
|
// Parses an arbitrary block of an html file
|
|
virtual void ParseTextInternal(const char* content, int size);
|
|
|
|
// Indicates whether we should skip parsing for the given request.
|
|
bool ShouldSkipParsing();
|
|
|
|
// Returns the length of the signature on a signed resource URL.
|
|
int SignatureLength() const;
|
|
|
|
friend class ScanFilter;
|
|
|
|
// Registers RewriteFilter in the map, but does not put it in the
|
|
// html parse filter chain. This allows it to serve resource
|
|
// requests.
|
|
void RegisterRewriteFilter(RewriteFilter* filter);
|
|
|
|
// Adds an already-owned rewrite filter to the pre-render chain. This
|
|
// is used for filters that are unconditionally created for handling of
|
|
// resources, but their presence in the html-rewrite chain is conditional
|
|
// on options.
|
|
void EnableRewriteFilter(const char* id);
|
|
|
|
// Internal low-level helper for resource creation.
|
|
// Use only when permission checking has been done explicitly on the
|
|
// caller side. is_authorized_domain is passed along to Resource object
|
|
// creation, in order to decide whether to keep the resource in the usual
|
|
// key space or a separate one meant for unauthorized resources only.
|
|
ResourcePtr CreateInputResourceUnchecked(const GoogleUrl& gurl,
|
|
bool is_authorized_domain);
|
|
|
|
void AddPreRenderFilters();
|
|
void AddPostRenderFilters();
|
|
|
|
// Helper function to decode the pagespeed url.
|
|
bool DecodeOutputResourceNameHelper(const GoogleUrl& url,
|
|
const RewriteOptions* options_to_use,
|
|
const UrlNamer* url_namer,
|
|
ResourceNamer* name_out,
|
|
OutputResourceKind* kind_out,
|
|
RewriteFilter** filter_out,
|
|
GoogleString* url_base,
|
|
StringVector* urls) const;
|
|
|
|
// When HTML parsing is complete, we have learned all we can about the DOM, so
|
|
// immediately write anything required into that Cohort into the page property
|
|
// cache. Writes to this cohort are predicated so that they only occur if a
|
|
// filter that actually makes use of it is enabled. This prevents filling the
|
|
// cache with unnecessary entries. To enable writing, a filter should override
|
|
// DetermineEnabled to call
|
|
// RewriteDriver::set_write_property_cache_dom_cohort(true), or in the case of
|
|
// a RewriteFilter, should override
|
|
// RewriteFilter::UsesPropertyCacheDomCohort() to return true.
|
|
void WriteDomCohortIntoPropertyCache();
|
|
|
|
// Used by CreateCacheFetcher() and CreateCacheOnlyFetcher().
|
|
CacheUrlAsyncFetcher* CreateCustomCacheFetcher(UrlAsyncFetcher* base_fetcher);
|
|
|
|
// Just before releasing the rewrite driver, check if the feature for storing
|
|
// rewritten responses (e.g. html) in cache is enabled. If yes, purge the
|
|
// old response if significant amount of rewriting happened after this
|
|
// response was stored in the cache. If not, release the rewrite driver. If a
|
|
// purge fetch request is issued, the rewrite driver will be released after
|
|
// this async fetch request is completed.
|
|
void PossiblyPurgeCachedResponseAndReleaseDriver();
|
|
|
|
// Log statistics to the AbstractLogRecord.
|
|
void LogStats();
|
|
|
|
// This pair of calls helps determine if code that changes event state
|
|
// should wake up anyone waiting for rewrite driver's completion.
|
|
//
|
|
// The usage pattern is something like this:
|
|
// ScopedMutex lock(rewrite_mutex());
|
|
// bool should_signal_cookie = PrepareShouldSignal();
|
|
//
|
|
// // Change state
|
|
// ...
|
|
//
|
|
// SignalIfRequired(should_signal_cookie);
|
|
//
|
|
// WARNING: SignalIfRequired() drops the lock on rewrite_mutex() temporarily,
|
|
// so 'this' could get deleted after it returns, so it should not be accessed
|
|
// afterwards.
|
|
bool PrepareShouldSignal() EXCLUSIVE_LOCKS_REQUIRED(rewrite_mutex());
|
|
void SignalIfRequired(bool result_of_prepare_should_signal)
|
|
EXCLUSIVE_LOCKS_REQUIRED(rewrite_mutex());
|
|
|
|
// Reverts the driver back to its default state of using a shared scheduler
|
|
// and running on the shared scheduler.
|
|
void CleanupRequestThread();
|
|
|
|
// Only the first base-tag is significant for a document -- any subsequent
|
|
// ones are ignored. There should be no URLs referenced prior to the base
|
|
// tag, if one exists. See
|
|
//
|
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/
|
|
// semantics.html#the-base-element
|
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/
|
|
// urls.html#document-base-url
|
|
//
|
|
// Thus we keep the base-tag in the RewriteDriver, and also keep track of
|
|
// whether it's been reset already within the document.
|
|
bool base_was_set_;
|
|
|
|
// Stores whether or not there were references to urls before the
|
|
// base tag (if there is a base tag) in this document. If there is
|
|
// no base tag, this should be false. If the base tag is before all
|
|
// other url references, this should also be false.
|
|
bool refs_before_base_;
|
|
|
|
// The charset of the containing HTML page.
|
|
GoogleString containing_charset_;
|
|
|
|
// Copies properties from the request headers to the request context,
|
|
// if both are non-null.
|
|
void PopulateRequestContext();
|
|
|
|
bool filters_added_;
|
|
bool externally_managed_;
|
|
|
|
// Memory management stuff. Some of the reference counts we keep track of
// also are used as a count of events, to help determine when we are done.
//
// WARNING: every time you decrement reference counts, you should
// check release_driver_ within the critical section, and call
// PossiblyPurgeCachedResponseAndReleaseDriver() if it is true
// after releasing the lock. The easiest way to get it right is to just call
// DropReference().
//
// The enumerators are unscoped and implicitly numbered from 0, so
// kNumRefCategories (which must stay last) equals the number of real
// categories declared before it.
enum RefCategory {
  kRefUser,     // External refcount from users
  kRefParsing,  // Parser active

  // The number of rewrites (RewriteContext) that have been requested,
  // and not yet completed, and for which we still hope to render
  // them within the flush window. This is waited for.
  kRefPendingRewrites,

  // The number of rewrites (RewriteContext) that have missed the rendering
  // deadline. We don't wait for them, but they still need to keep
  // the RewriteDriver alive.
  kRefDetachedRewrites,

  // Tracks the number of RewriteContexts that have been completed,
  // but not yet deleted. Once RewriteComplete has been called,
  // rewrite_context->Propagate() is called to render slots (if not
  // detached) and to queue up activity that must occur prior to the
  // context being deleted: specifically running any successors.
  // After all that occurs, DeleteRewriteContext must be called and
  // that will decrement this counter.
  kRefDeletingRewrites,

  // Keeps track of fetch-responding work that's user-facing.
  kRefFetchUserFacing,

  // Keeps track of any background continuation of a fetch.
  kRefFetchBackground,

  // Misc async references from outside
  //
  // TODO(morlovich): Split between events people might want to wait for
  // and events which they don't in a follow up.
  kRefAsyncEvents,

  // Async events we always wait for, even if fully_rewrite_on_flush isn't
  // turned on.
  kRefRenderBlockingAsyncEvents,

  // Not a category: count of the enumerators above.
  kNumRefCategories
};
|
|
|
|
friend class CategorizedRefcount<RewriteDriver, RefCategory>;
|
|
|
|
// Protected by rewrite_mutex().
|
|
CategorizedRefcount<RewriteDriver, RefCategory> ref_counts_;
|
|
|
|
// Interface to CategorizedRefcount
|
|
void LastRefRemoved();
|
|
StringPiece RefCategoryName(RefCategory cat);
|
|
|
|
// Drops a reference of given kind, signaling any waiters
|
|
// and potentially even releasing the rewrite driver.
|
|
void DropReference(RefCategory cat);
|
|
|
|
// Set to true when the refcount reaches 0. See comment
|
|
// above RefCategory for how this should be used.
|
|
bool release_driver_;
|
|
|
|
// If not kNoWait, indicates that WaitForCompletion or similar method
|
|
// has been called, and another thread is waiting for us to notify it of
|
|
// everything having been finished in a given mode.
|
|
WaitMode waiting_ GUARDED_BY(rewrite_mutex());
|
|
|
|
// This is set to true if the current wait's deadline has expired.
|
|
bool waiting_deadline_reached_ GUARDED_BY(rewrite_mutex());
|
|
|
|
// If this is true, the usual HTML streaming interface will let rendering
|
|
// of every flush window fully complete before proceeding rather than
|
|
// use a deadline. This means rewriting of HTML may be slow, and hence
|
|
// should not be used for online traffic.
|
|
bool fully_rewrite_on_flush_;
|
|
|
|
// If this is true, we don't wait for async events before flushing bytes to
|
|
// the client during a blocking rewrite; else we do wait for async events.
|
|
bool fast_blocking_rewrite_;
|
|
|
|
bool flush_requested_;
|
|
bool flush_occurred_;
|
|
|
|
// If it is set to true, then lazyload script is flushed with flush early
|
|
// flow.
|
|
bool is_lazyload_script_flushed_;
|
|
|
|
// Tracks whether any filter that uses the dom cohort of the property cache is
|
|
// enabled. Writes to the property cache for this cohort are predicated on
|
|
// this.
|
|
bool write_property_cache_dom_cohort_;
|
|
|
|
// URL of the HTML pages being rewritten in the HTML flow or the
|
|
// URL of the resource being rewritten in the resource flow.
|
|
GoogleUrl base_url_;
|
|
|
|
// In the resource flow, the URL requested may not have the same
|
|
// base as the original resource. decoded_base_url_ stores the base
|
|
// of the original (un-rewritten) resource.
|
|
GoogleUrl decoded_base_url_;
|
|
|
|
// This is the URL that is being fetched in a fetch path (not valid in HTML
|
|
// path).
|
|
GoogleString fetch_url_;
|
|
|
|
GoogleString user_agent_;
|
|
|
|
LazyBool should_skip_parsing_;
|
|
|
|
StringFilterMap resource_filter_map_;
|
|
|
|
ResponseHeaders* response_headers_;
|
|
|
|
// request_headers_ is a copy of the Fetch's request headers, and it
|
|
// stays alive until the rewrite driver is recycled or deleted.
|
|
scoped_ptr<const RequestHeaders> request_headers_;
|
|
|
|
int status_code_; // Status code of response for this request.
|
|
|
|
// This group of rewrite-context-related variables is accessed
|
|
// only in the main thread of RewriteDriver (aka the HTML thread).
|
|
typedef std::vector<RewriteContext*> RewriteContextVector;
|
|
RewriteContextVector rewrites_; // ordered list of rewrites to initiate
|
|
|
|
// The maximum amount of time to wait for page processing across all flush
|
|
// windows. A negative value implies no limit.
|
|
int max_page_processing_delay_ms_;
|
|
|
|
typedef std::set<RewriteContext*> RewriteContextSet;
|
|
|
|
// Contains the RewriteContext* that have been queued into the
|
|
// RewriteThread, but have not gotten to the point where
|
|
// RewriteComplete() has been called. This set is cleared
|
|
// once the rewrite_deadline_ms has passed.
|
|
RewriteContextSet initiated_rewrites_ GUARDED_BY(rewrite_mutex());
|
|
|
|
// Number of total initiated rewrites for the request.
|
|
int64 num_initiated_rewrites_ GUARDED_BY(rewrite_mutex());
|
|
|
|
// Number of total detached rewrites for the request, i.e. rewrites whose
|
|
// results did not make it to the response. This is different from
|
|
// kRefDetachedRewrites (and detached_rewrites_.size(), which is equal to it)
|
|
// since that counter is for the number of rewrites
|
|
// currently in the detached state for the current flush window,
|
|
// while this variable is the total that ever got detached over all of the
|
|
// document.
|
|
int64 num_detached_rewrites_ GUARDED_BY(rewrite_mutex());
|
|
|
|
// Contains the RewriteContext* that were still running at the deadline.
|
|
// They are said to be in a "detached" state although the RewriteContexts
|
|
// themselves don't know that. They will continue performing their
|
|
// Rewrite in the RewriteThread, and caching the results. And until
|
|
// they complete, the RewriteDriver must stay alive and not be Recycled
|
|
// or deleted. WaitForCompletion() blocks until all detached_rewrites
|
|
// have been retired.
|
|
RewriteContextSet detached_rewrites_ GUARDED_BY(rewrite_mutex());
|
|
|
|
// Rewrites that may possibly be satisfied from metadata cache alone.
|
|
int possibly_quick_rewrites_ GUARDED_BY(rewrite_mutex());
|
|
|
|
// List of RewriteContext objects for fetch to delete. We do it in
|
|
// clear as a simplification.
|
|
RewriteContextVector fetch_rewrites_;
|
|
|
|
// These objects are provided on construction or later, and are
|
|
// owned by the caller.
|
|
FileSystem* file_system_;
|
|
ServerContext* server_context_;
|
|
Scheduler* scheduler_;
|
|
UrlAsyncFetcher* default_url_async_fetcher_; // the fetcher we got at ctor
|
|
|
|
// This is the fetcher we use --- it's either the default_url_async_fetcher_,
|
|
// or whatever it was temporarily overridden to by SetSessionFetcher.
|
|
// This is either owned externally or via owned_url_async_fetchers_.
|
|
UrlAsyncFetcher* url_async_fetcher_;
|
|
|
|
// This is the fetcher that is used to distribute rewrites if enabled. This
|
|
// can be NULL if distributed rewriting is not configured. This is owned
|
|
// externally.
|
|
UrlAsyncFetcher* distributed_async_fetcher_;
|
|
|
|
// A list of all the UrlAsyncFetchers that we own, as set with
|
|
// SetSessionFetcher.
|
|
std::vector<UrlAsyncFetcher*> owned_url_async_fetchers_;
|
|
|
|
DomStatsFilter* dom_stats_filter_;
|
|
scoped_ptr<HtmlWriterFilter> html_writer_filter_;
|
|
|
|
ScanFilter scan_filter_;
|
|
scoped_ptr<DomainRewriteFilter> domain_rewriter_;
|
|
scoped_ptr<UrlLeftTrimFilter> url_trim_filter_;
|
|
|
|
// Maps rewrite context partition keys to the context responsible for
|
|
// rewriting them, in case a URL occurs more than once.
|
|
typedef std::map<GoogleString, RewriteContext*> PrimaryRewriteContextMap;
|
|
PrimaryRewriteContextMap primary_rewrite_context_map_;
|
|
|
|
HtmlResourceSlotSet slots_;
|
|
InlineResourceSlotSet inline_slots_;
|
|
InlineAttributeSlotSet inline_attribute_slots_;
|
|
SrcSetSlotCollectionSet srcset_collections_;
|
|
|
|
scoped_ptr<RewriteOptions> options_;
|
|
|
|
RewriteDriverPool* controlling_pool_; // or NULL if this has custom options.
|
|
|
|
// Object which manages CacheUrlAsyncFetcher async operations.
|
|
scoped_ptr<CacheUrlAsyncFetcher::AsyncOpHooks>
|
|
cache_url_async_fetcher_async_op_hooks_;
|
|
|
|
// The default resource encoder
|
|
UrlSegmentEncoder default_encoder_;
|
|
|
|
// The first chain of filters called before waiting for Rewrites to complete.
|
|
FilterList early_pre_render_filters_;
|
|
// The second chain of filters called before waiting for Rewrites to complete.
|
|
FilterList pre_render_filters_;
|
|
|
|
// Owned by us.
|
|
std::vector<ResourceUrlClaimant*> resource_claimants_;
|
|
|
|
// A container of filters to delete when RewriteDriver is deleted. This
|
|
// can include pre_render_filters as well as those added to the post-render
|
|
// chain owned by HtmlParse.
|
|
FilterVector filters_to_delete_;
|
|
|
|
QueuedWorkerPool::Sequence* html_worker_;
|
|
QueuedWorkerPool::Sequence* rewrite_worker_;
|
|
QueuedWorkerPool::Sequence* low_priority_rewrite_worker_;
|
|
scoped_ptr<Scheduler::Sequence> scheduler_sequence_;
|
|
|
|
Writer* writer_;
|
|
|
|
// Stores any cached properties associated with the current URL and fallback
|
|
// URL (i.e. without query params).
|
|
FallbackPropertyPage* fallback_property_page_;
|
|
|
|
// Boolean value which tells whether property page is owned by driver or not.
|
|
bool owns_property_page_;
|
|
|
|
// Per-origin property page, for things which are site-wide.
|
|
scoped_ptr<PropertyPage> origin_property_page_;
|
|
|
|
// Device type for the current property page.
|
|
UserAgentMatcher::DeviceType device_type_;
|
|
|
|
// The critical image finder and critical selector finder will lazy-init these
|
|
// fields.
|
|
scoped_ptr<CriticalImagesInfo> critical_images_info_;
|
|
scoped_ptr<CriticalSelectorInfo> critical_selector_info_;
|
|
|
|
// Memoized computation of whether the current doc has an XHTML mimetype.
|
|
bool xhtml_mimetype_computed_;
|
|
XhtmlStatus xhtml_status_ : 8;
|
|
|
|
// Number of images whose low quality images are inlined in the html page by
|
|
// InlinePreviewFilter.
|
|
int num_inline_preview_images_;
|
|
|
|
// The total number of bytes for which ParseText is called.
|
|
int num_bytes_in_;
|
|
|
|
DebugFilter* debug_filter_;
|
|
|
|
scoped_ptr<FlushEarlyInfo> flush_early_info_;
|
|
scoped_ptr<DependencyTracker> dependency_tracker_;
|
|
|
|
bool can_rewrite_resources_;
|
|
bool is_nested_;
|
|
|
|
// Additional request context that may outlive this RewriteDriver. (Thus,
|
|
// the context is reference counted.)
|
|
RequestContextPtr request_context_;
|
|
|
|
// Start time for HTML requests. Used for statistics reporting.
|
|
int64 start_time_ms_;
|
|
|
|
scoped_ptr<RequestProperties> request_properties_;
|
|
|
|
// Helps make sure RewriteDriver and its children are initialized exactly
|
|
// once, allowing for multiple calls to RewriteDriver::Initialize as long
|
|
// as they are matched to RewriteDriver::Terminate.
|
|
static int initialized_count_;
|
|
|
|
// True if this RewriteDriver attempted to distribute the rewrite. This is
|
|
// used to prevent a second attempt in case the first errored out.
|
|
bool tried_to_distribute_fetch_;
|
|
|
|
// If false, add data-pagespeed-no-defer attribute to the script inserted by
|
|
// add_instrumentation filter.
|
|
bool defer_instrumentation_script_;
|
|
|
|
// Indicates whether this document is determined to be AMP-HTML.
|
|
bool is_amp_;
|
|
|
|
// Indicates that task execution has started.
|
|
AtomicBool executing_rewrite_tasks_;
|
|
|
|
// Downstream cache object used for issuing purges.
|
|
DownstreamCachePurger downstream_cache_purger_;
|
|
|
|
// Any PageSpeed options stripped from the original URL.
|
|
GoogleString pagespeed_query_params_;
|
|
|
|
// Any PageSpeed option cookies from the original request.
|
|
GoogleString pagespeed_option_cookies_;
|
|
|
|
DISALLOW_COPY_AND_ASSIGN(RewriteDriver);
|
|
};
|
|
|
|
// Subclass of HTTPCache::Callback that incorporates a given RewriteOptions'
|
|
// invalidation policy.
|
|
class OptionsAwareHTTPCacheCallback : public HTTPCache::Callback {
|
|
public:
|
|
virtual ~OptionsAwareHTTPCacheCallback();
|
|
virtual bool IsCacheValid(const GoogleString& key,
|
|
const ResponseHeaders& headers);
|
|
virtual int64 OverrideCacheTtlMs(const GoogleString& key);
|
|
virtual ResponseHeaders::VaryOption RespectVaryOnResources() const;
|
|
|
|
// Validates the specified response for the URL, request, given the specified
|
|
// options. This is for checking if cache response can still be used, not for
|
|
// determining whether an entry should be written to an HTTP cache.
|
|
static bool IsCacheValid(const GoogleString& key,
|
|
const RewriteOptions& rewrite_options,
|
|
const RequestContextPtr& request_ctx,
|
|
const ResponseHeaders& headers);
|
|
|
|
protected:
|
|
// Sub-classes need to ensure that rewrite_options remains valid till
|
|
// Callback::Done finishes.
|
|
OptionsAwareHTTPCacheCallback(
|
|
const RewriteOptions* rewrite_options,
|
|
const RequestContextPtr& request_ctx);
|
|
|
|
private:
|
|
const RewriteOptions* rewrite_options_;
|
|
|
|
DISALLOW_COPY_AND_ASSIGN(OptionsAwareHTTPCacheCallback);
|
|
};
|
|
|
|
} // namespace net_instaweb
|
|
|
|
#endif // NET_INSTAWEB_REWRITER_PUBLIC_REWRITE_DRIVER_H_
|