diff --git a/net/instaweb/instaweb.gyp b/net/instaweb/instaweb.gyp index 3d2bc1929..7a4bd2152 100644 --- a/net/instaweb/instaweb.gyp +++ b/net/instaweb/instaweb.gyp @@ -1601,6 +1601,7 @@ 'rewriter/inline_attribute_slot.cc', 'rewriter/inline_resource_slot.cc', 'rewriter/inline_rewrite_context.cc', + 'rewriter/input_info_utils.cc', 'rewriter/insert_amp_link_filter.cc', 'rewriter/insert_dns_prefetch_filter.cc', 'rewriter/insert_ga_filter.cc', diff --git a/net/instaweb/rewriter/input_info_utils.cc b/net/instaweb/rewriter/input_info_utils.cc new file mode 100644 index 000000000..69e96eac5 --- /dev/null +++ b/net/instaweb/rewriter/input_info_utils.cc @@ -0,0 +1,209 @@ +/* + * Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: jmarantz@google.com (Joshua Marantz) +// + +#include "net/instaweb/rewriter/public/input_info_utils.h" + +#include "base/logging.h" +#include "net/instaweb/rewriter/public/rewrite_options.h" +#include "net/instaweb/rewriter/public/server_context.h" +#include "pagespeed/kernel/base/file_system.h" +#include "pagespeed/kernel/base/hasher.h" +#include "pagespeed/kernel/base/proto_util.h" +#include "pagespeed/kernel/base/shared_string.h" +#include "pagespeed/kernel/base/string.h" +#include "pagespeed/kernel/base/string_util.h" +#include "pagespeed/kernel/base/timer.h" +#include "pagespeed/kernel/cache/cache_interface.h" + +namespace net_instaweb { +namespace input_info_utils { + +namespace { + +bool MatchesFileSystemMetadataCacheEntry( + const InputInfo& input_info, + const InputInfo& fsmdc_info, + int64 mtime_ms) { + return (fsmdc_info.has_last_modified_time_ms() && + fsmdc_info.has_input_content_hash() && + fsmdc_info.last_modified_time_ms() == mtime_ms && + fsmdc_info.input_content_hash() == input_info.input_content_hash()); +} + +// Checks if the stat() data about the input_info's file matches that in the +// filesystem metadata cache; it needs to be for the input to be "valid". +bool IsFilesystemMetadataCacheCurrent(CacheInterface* fsmdc, + const GoogleString& file_key, + const InputInfo& input_info, + int64 mtime_ms) { + // Get the filesystem metadata cache (FSMDC) entry for the filename. + // If we found an entry, + // Extract the FSMDC timestamp and contents hash. + // If the FSMDC timestamp == the file's current timestamp, + // (the FSMDC contents hash is valid/current/correct) + // If the FSMDC content hash == the metadata cache's content hash, + // The metadata cache's entry is valid so its input_info is valid. + // Else + // Return false as the metadata cache's entry is not valid as + // someone has changed it on us. + // Else + // Return false as our FSMDC entry is out of date so we can't + // tell if the metadata cache's input_info is valid. + // Else + // Return false as we can't tell if the metadata cache's input_info is + // valid. + CacheInterface::SynchronousCallback callback; + fsmdc->Get(file_key, &callback); + DCHECK(callback.called()); + if (callback.state() == CacheInterface::kAvailable) { + StringPiece val_str = callback.value()->Value(); + ArrayInputStream input(val_str.data(), val_str.size()); + InputInfo fsmdc_info; + if (fsmdc_info.ParseFromZeroCopyStream(&input)) { + // We have a filesystem metadata cache entry: if its timestamp equals + // the file's, and its contents hash equals the metadata caches's, then + // the input is valid. + return MatchesFileSystemMetadataCacheEntry( + input_info, fsmdc_info, mtime_ms); + } + } + return false; +} + +// Update the filesystem metadata cache with the timestamp and contents hash +// of the given input's file (which is read from disk to compute the hash). +// Returns false if the file cannot be read. +bool UpdateFilesystemMetadataCache(ServerContext* server_context, + const GoogleString& file_key, + const InputInfo& input_info, + int64 mtime_ms, + CacheInterface* fsmdc, + InputInfo* fsmdc_info) { + GoogleString contents; + if (!server_context->file_system()->ReadFile( + input_info.filename().c_str(), &contents, + server_context->message_handler())) { + return false; + } + GoogleString contents_hash = + server_context->contents_hasher()->Hash(contents); + fsmdc_info->set_type(InputInfo::FILE_BASED); + DCHECK_LT(0, mtime_ms); + fsmdc_info->set_last_modified_time_ms(mtime_ms); + fsmdc_info->set_input_content_hash(contents_hash); + GoogleString buf; + { + // MUST be in a block so that sstream is destructed to finalize buf. + StringOutputStream sstream(&buf); + fsmdc_info->SerializeToZeroCopyStream(&sstream); + } + fsmdc->PutSwappingString(file_key, &buf); + return true; +} + +} // namespace + +// Checks whether the given input is still unchanged. +bool IsInputValid( + ServerContext* server_context, const RewriteOptions* options, + bool nested_rewrite, const InputInfo& input_info, + int64 now_ms, bool* purged, bool* stale_rewrite) { + switch (input_info.type()) { + case InputInfo::CACHED: { + // It is invalid if cacheable inputs have expired or ... + DCHECK(input_info.has_expiration_time_ms()); + if (input_info.has_url()) { + // We do not search wildcards when validating metadata because + // that would require N wildcard matches (not even a + // FastWildcardGroup) per input dependency. + if (!options->IsUrlCacheValid(input_info.url(), input_info.date_ms(), + false /* search_wildcards */)) { + *purged = true; + return false; + } + } + if (!input_info.has_expiration_time_ms()) { + return false; + } + int64 ttl_ms = input_info.expiration_time_ms() - now_ms; + if (ttl_ms > 0) { + return true; + } else if ( + !nested_rewrite && + ttl_ms + options->metadata_cache_staleness_threshold_ms() > 0) { + *stale_rewrite = true; + return true; + } + return false; + } + case InputInfo::FILE_BASED: { + // ... if file-based inputs have changed. + DCHECK(input_info.has_last_modified_time_ms() && + input_info.has_filename()); + if (!input_info.has_last_modified_time_ms() || + !input_info.has_filename()) { + return false; + } + int64 mtime_sec; + server_context->file_system()->Mtime(input_info.filename(), &mtime_sec, + server_context->message_handler()); + int64 mtime_ms = mtime_sec * Timer::kSecondMs; + + CacheInterface* fsmdc = server_context->filesystem_metadata_cache(); + if (fsmdc != nullptr) { + CHECK(fsmdc->IsBlocking()); + if (!input_info.has_input_content_hash()) { + return false; + } + // Construct a host-specific key. The format is somewhat arbitrary, + // all it needs to do is differentiate the same path on different + // hosts. If the size of the key becomes a concern we can hash it + // and hope. + GoogleString file_key; + StrAppend(&file_key, "file://", server_context->hostname(), + input_info.filename()); + if (IsFilesystemMetadataCacheCurrent(fsmdc, file_key, input_info, + mtime_ms)) { + return true; + } + InputInfo fsmdc_info; + if (!UpdateFilesystemMetadataCache(server_context, file_key, + input_info, mtime_ms, fsmdc, + &fsmdc_info)) { + return false; + } + // Check again now that we KNOW we have the most up-to-date data + // in the filesystem metadata cache. + return MatchesFileSystemMetadataCacheEntry( + input_info, fsmdc_info, mtime_ms); + } else { + DCHECK_LT(0, input_info.last_modified_time_ms()); + return (mtime_ms == input_info.last_modified_time_ms()); + } + } + case InputInfo::ALWAYS_VALID: + return true; + } + + LOG(DFATAL) << "Corrupt InputInfo object !?"; + return false; +} + +} // namespace input_info_utils +} // namespace net_instaweb diff --git a/net/instaweb/rewriter/public/input_info_utils.h b/net/instaweb/rewriter/public/input_info_utils.h new file mode 100644 index 000000000..bc4de017e --- /dev/null +++ b/net/instaweb/rewriter/public/input_info_utils.h @@ -0,0 +1,47 @@ +/* + * Copyright 2016 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: morlovich@google.com (Maks Orlovich) + +#ifndef NET_INSTAWEB_REWRITER_PUBLIC_INPUT_INFO_UTILS_H_ +#define NET_INSTAWEB_REWRITER_PUBLIC_INPUT_INFO_UTILS_H_ + +#include "net/instaweb/rewriter/input_info.pb.h" +#include "net/instaweb/rewriter/public/rewrite_options.h" +#include "net/instaweb/rewriter/public/server_context.h" +#include "pagespeed/kernel/base/basictypes.h" + +namespace net_instaweb { + +namespace input_info_utils { + +// Computes whether the given input_info is valid at now_ms, with the filesystem +// and its metadata cache in server_context, considering invalidation +// information and policy in options. +// +// *purged will be set if the entry was invalidated due to a cache purge. +// *stale_rewrite will be set (and true will be returned) if +// options->metadata_cache_staleness_threshold_ms() permitted reuse past +// expiration at this time, andthe rewrite isn't nested. +bool IsInputValid( + ServerContext* server_context, const RewriteOptions* options, + bool nested_rewrite, const InputInfo& input_info, + int64 now_ms, bool* purged, bool* stale_rewrite); + +} // namespace input_info_utils +} // namespace net_instaweb + +#endif // NET_INSTAWEB_REWRITER_PUBLIC_INPUT_INFO_UTILS_H_ diff --git a/net/instaweb/rewriter/rewrite_context.cc b/net/instaweb/rewriter/rewrite_context.cc index e0e8c237d..42898ad2f 100644 --- a/net/instaweb/rewriter/rewrite_context.cc +++ b/net/instaweb/rewriter/rewrite_context.cc @@ -41,6 +41,7 @@ #include "net/instaweb/http/public/url_async_fetcher.h" #include "net/instaweb/rewriter/cached_result.pb.h" #include "net/instaweb/rewriter/public/inline_output_resource.h" +#include "net/instaweb/rewriter/public/input_info_utils.h" #include "net/instaweb/rewriter/public/output_resource.h" #include "net/instaweb/rewriter/public/resource.h" #include "net/instaweb/rewriter/public/resource_namer.h" @@ -55,7 +56,6 @@ #include "pagespeed/kernel/base/base64_util.h" #include "pagespeed/kernel/base/callback.h" #include "pagespeed/kernel/base/dynamic_annotations.h" // RunningOnValgrind -#include "pagespeed/kernel/base/file_system.h" #include "pagespeed/kernel/base/function.h" #include "pagespeed/kernel/base/hasher.h" #include "pagespeed/kernel/base/message_handler.h" @@ -297,169 +297,12 @@ class RewriteContext::OutputCacheCallback : public CacheInterface::Callback { } private: - bool AreInputInfosEqual(const InputInfo& input_info, - const InputInfo& fsmdc_info, - int64 mtime_ms) { - return (fsmdc_info.has_last_modified_time_ms() && - fsmdc_info.has_input_content_hash() && - fsmdc_info.last_modified_time_ms() == mtime_ms && - fsmdc_info.input_content_hash() == input_info.input_content_hash()); - } - - // Checks if the stat() data about the input_info's file matches that in the - // filesystem metadata cache; it needs to be for the input to be "valid". - bool IsFilesystemMetadataCacheCurrent(CacheInterface* fsmdc, - const GoogleString& file_key, - const InputInfo& input_info, - int64 mtime_ms) { - // Get the filesystem metadata cache (FSMDC) entry for the filename. - // If we found an entry, - // Extract the FSMDC timestamp and contents hash. - // If the FSMDC timestamp == the file's current timestamp, - // (the FSMDC contents hash is valid/current/correct) - // If the FSMDC content hash == the metadata cache's content hash, - // The metadata cache's entry is valid so its input_info is valid. - // Else - // Return false as the metadata cache's entry is not valid as - // someone has changed it on us. - // Else - // Return false as our FSMDC entry is out of date so we can't - // tell if the metadata cache's input_info is valid. - // Else - // Return false as we can't tell if the metadata cache's input_info is - // valid. - CacheInterface::SynchronousCallback callback; - fsmdc->Get(file_key, &callback); - DCHECK(callback.called()); - if (callback.state() == CacheInterface::kAvailable) { - StringPiece val_str = callback.value()->Value(); - ArrayInputStream input(val_str.data(), val_str.size()); - InputInfo fsmdc_info; - if (fsmdc_info.ParseFromZeroCopyStream(&input)) { - // We have a filesystem metadata cache entry: if its timestamp equals - // the file's, and its contents hash equals the metadata caches's, then - // the input is valid. - return AreInputInfosEqual(input_info, fsmdc_info, mtime_ms); - } - } - return false; - } - - // Update the filesystem metadata cache with the timestamp and contents hash - // of the given input's file (which is read from disk to compute the hash). - // Returns false if the file cannot be read. - bool UpdateFilesystemMetadataCache(ServerContext* server_context, - const GoogleString& file_key, - const InputInfo& input_info, - int64 mtime_ms, - CacheInterface* fsmdc, - InputInfo* fsmdc_info) { - GoogleString contents; - if (!server_context->file_system()->ReadFile( - input_info.filename().c_str(), &contents, - server_context->message_handler())) { - return false; - } - GoogleString contents_hash = - server_context->contents_hasher()->Hash(contents); - fsmdc_info->set_type(InputInfo::FILE_BASED); - DCHECK_LT(0, mtime_ms); - fsmdc_info->set_last_modified_time_ms(mtime_ms); - fsmdc_info->set_input_content_hash(contents_hash); - GoogleString buf; - { - // MUST be in a block so that sstream is destructed to finalize buf. - StringOutputStream sstream(&buf); - fsmdc_info->SerializeToZeroCopyStream(&sstream); - } - fsmdc->PutSwappingString(file_key, &buf); - return true; - } - - // Checks whether the given input is still unchanged. bool IsInputValid(const InputInfo& input_info, int64 now_ms, bool* purged, bool* stale_rewrite) { - switch (input_info.type()) { - case InputInfo::CACHED: { - // It is invalid if cacheable inputs have expired or ... - DCHECK(input_info.has_expiration_time_ms()); - const RewriteOptions* options = rewrite_context_->Options(); - if (input_info.has_url()) { - // We do not search wildcards when validating metadata because - // that would require N wildcard matches (not even a - // FastWildcardGroup) per input dependency. - if (!options->IsUrlCacheValid(input_info.url(), input_info.date_ms(), - false /* search_wildcards */)) { - *purged = true; - return false; - } - } - if (!input_info.has_expiration_time_ms()) { - return false; - } - int64 ttl_ms = input_info.expiration_time_ms() - now_ms; - if (ttl_ms > 0) { - return true; - } else if ( - !rewrite_context_->has_parent() && - ttl_ms + options->metadata_cache_staleness_threshold_ms() > 0) { - *stale_rewrite = true; - return true; - } - return false; - } - case InputInfo::FILE_BASED: { - ServerContext* server_context = rewrite_context_->FindServerContext(); - - // ... if file-based inputs have changed. - DCHECK(input_info.has_last_modified_time_ms() && - input_info.has_filename()); - if (!input_info.has_last_modified_time_ms() || - !input_info.has_filename()) { - return false; - } - int64 mtime_sec; - server_context->file_system()->Mtime(input_info.filename(), &mtime_sec, - server_context->message_handler()); - int64 mtime_ms = mtime_sec * Timer::kSecondMs; - - CacheInterface* fsmdc = server_context->filesystem_metadata_cache(); - if (fsmdc != NULL) { - CHECK(fsmdc->IsBlocking()); - if (!input_info.has_input_content_hash()) { - return false; - } - // Construct a host-specific key. The format is somewhat arbitrary, - // all it needs to do is differentiate the same path on different - // hosts. If the size of the key becomes a concern we can hash it - // and hope. - GoogleString file_key; - StrAppend(&file_key, "file://", server_context->hostname(), - input_info.filename()); - if (IsFilesystemMetadataCacheCurrent(fsmdc, file_key, input_info, - mtime_ms)) { - return true; - } - InputInfo fsmdc_info; - if (!UpdateFilesystemMetadataCache(server_context, file_key, - input_info, mtime_ms, fsmdc, - &fsmdc_info)) { - return false; - } - // Check again now that we KNOW we have the most up-to-date data - // in the filesystem metadata cache. - return AreInputInfosEqual(input_info, fsmdc_info, mtime_ms); - } else { - DCHECK_LT(0, input_info.last_modified_time_ms()); - return (mtime_ms == input_info.last_modified_time_ms()); - } - } - case InputInfo::ALWAYS_VALID: - return true; - } - - LOG(DFATAL) << "Corrupt InputInfo object !?"; - return false; + return input_info_utils::IsInputValid( + rewrite_context_->FindServerContext(), rewrite_context_->Options(), + rewrite_context_->has_parent(), input_info, now_ms, purged, + stale_rewrite); } // Check that a CachedResult is valid, specifically, that all the inputs are