Bail early in ipro-recording if the content-type is known in the

kPreliminaryHeaders to be something we cannot optimize.
This commit is contained in:
Joshua Marantz
2016-09-28 17:40:55 -04:00
parent 74059559a1
commit 94008d4205
3 changed files with 125 additions and 29 deletions
+27 -3
View File
@@ -740,10 +740,28 @@ apr_status_t instaweb_in_place_filter(ap_filter_t* filter,
first = false;
ResponseHeaders response_headers(recorder->http_options());
ApacheRequestToResponseHeaders(*request, &response_headers, NULL);
// The content-type is likely to be missing from the Apache response
// headers until AP_FTYPE_PROTOCOL, and this filter is run earlier, at
// AP_FTYPE_CONTENT_SET + 1. However, the content-type may be in
// the request object, and if we populate it into the response headers
// early, we can check for uninteresting content-types in
// ConsiderResponseHeaders and avoid the overhead of collecting the
// content into memory.
if ((request->content_type != nullptr) &&
(response_headers.DetermineContentType() == nullptr)) {
response_headers.Replace(
HttpAttributes::kContentType, request->content_type);
}
recorder->ConsiderResponseHeaders(
InPlaceResourceRecorder::kPreliminaryHeaders, &response_headers);
}
if (recorder->failed()) {
break;
}
// Content bucket.
const char* buf = NULL;
size_t bytes = 0;
@@ -843,7 +861,7 @@ apr_status_t instaweb_in_place_check_headers_filter(ap_filter_t* filter,
return ap_pass_brigade(filter->next, bb);
}
// This should always be set by handle_as_in_place() in instaweb_handler.cc.
// This should always be set by Instaweb::HandleAsInPlace().
InPlaceResourceRecorder* recorder =
static_cast<InPlaceResourceRecorder*>(filter->ctx);
@@ -1147,13 +1165,19 @@ void mod_pagespeed_register_hooks(apr_pool_t* pool) {
static_cast<ap_filter_type>(AP_FTYPE_CONTENT_SET + 1));
// Run after contents are set, but before mod_deflate, which runs at
// AP_FTYPE_CONTENT_SET.
// AP_FTYPE_CONTENT_SET. We use a separate filter rather
// than just adding logic to instaweb_fix_headers_filter because the
// recorder gets passed in as the filter->ctx when it is registered
// in InstawebHandler::HandleAsInPlace.
ap_register_output_filter(
kModPagespeedInPlaceFilterName, instaweb_in_place_filter, NULL,
static_cast<ap_filter_type>(AP_FTYPE_CONTENT_SET - 1));
// Run after headers are set by mod_headers, mod_expires, etc. and
// after Content-Type has been set (which appears to be at
// AP_FTYPE_PROTOCOL).
// AP_FTYPE_PROTOCOL). We cannot simply collect the bytes at
// AP_FTYPE_PROTOCOL+1 because, it appears, at that time the headers
// have been serialized into the content, and it's rather embarassing
// to have to rescan for the end of the headers.
ap_register_output_filter(
kModPagespeedInPlaceCheckHeadersName,
instaweb_in_place_check_headers_filter, NULL,
+50 -24
View File
@@ -22,6 +22,7 @@
#include "net/instaweb/http/public/http_cache.h"
#include "net/instaweb/http/public/http_cache_failure.h"
#include "net/instaweb/http/public/http_value.h"
#include "net/instaweb/http/public/inflating_fetch.h"
#include "pagespeed/kernel/base/statistics.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/http/content_type.h"
@@ -128,7 +129,7 @@ bool InPlaceResourceRecorder::Write(const StringPiece& contents,
void InPlaceResourceRecorder::ConsiderResponseHeaders(
HeadersKind headers_kind,
ResponseHeaders* response_headers) {
CHECK(response_headers != NULL) << "Response headers cannot be NULL";
CHECK(response_headers != nullptr);
DCHECK(!full_response_headers_considered_);
if (!consider_response_headers_called_) {
@@ -141,6 +142,8 @@ void InPlaceResourceRecorder::ConsiderResponseHeaders(
HttpStatus::kOK);
}
status_code_ = response_headers->status_code();
// Shortcut for bailing out early when the response will be too large.
int64 content_length;
if (max_response_bytes_ <= 0 &&
@@ -153,13 +156,46 @@ void InPlaceResourceRecorder::ConsiderResponseHeaders(
return;
}
// Check if IPRO applies considering the content type, if we have one at
// this point. Depending on the server, we may only know the content type
// after the we are called with kFullHeaders.
//
// Note: in a proxy setup it might be desirable to cache HTML and
// non-rewritable Content-Types to avoid re-fetching from the origin server.
//
// If we have the full-headers, then we demand to have a good content type
// now.
if (response_headers->Has(HttpAttributes::kContentType) ||
(headers_kind == kFullHeaders)) {
const ContentType* content_type = response_headers->DetermineContentType();
// Bail if not an image, css, or JS.
if ((content_type == nullptr) ||
!(content_type->IsImage() ||
content_type->IsCss() ||
content_type->IsJsLike())) {
// TODO(jmarantz): I feel that it's better not to write a note
// about uncachable-content-types to the cache when we are failing
// early. However, I cannot prove that in a siege test. It is
// clear that calling DroppedAsUncacheable() here fills the HTTP
// cache with a new entry for every HTML URL, including all
// query-params, and that seems bad, but
// siege_ipro_high_entropy.sh does not benefit from just simply
// setting failure_=true here and skipping the call to
// DroppedAsUncacheable(). If at some point we decide to go this
// way, we must also change the expected cache_inserts count in
// "Blocking rewrite enabled." in apache/system_test.sh from 3 to
// 2.
DroppedAsUncacheable();
return;
}
}
if (headers_kind != kFullHeaders) {
return;
}
full_response_headers_considered_ = true;
status_code_ = response_headers->status_code();
// For 4xx and 5xx we can't IPRO, but we can also cache the failure so we
// don't retry recording for a bit.
if (response_headers->IsErrorStatus()) {
@@ -181,20 +217,6 @@ void InPlaceResourceRecorder::ConsiderResponseHeaders(
return;
}
// First, check if IPRO applies considering the content type.
// Note: in a proxy setup it might be desirable to cache HTML and
// non-rewritable Content-Types to avoid re-fetching from the origin server.
const ContentType* content_type =
response_headers->DetermineContentType();
if (content_type == NULL ||
!(content_type->IsImage() ||
content_type->IsCss() ||
content_type->IsJsLike())) {
// We remember wrong mimetypes as uncacheable. This is slightly goofy,
// and is inconsistent with how they are treated on normal rewrite path...
DroppedAsUncacheable();
return;
}
bool is_cacheable = response_headers->IsProxyCacheable(
request_properties_,
ResponseHeaders::GetVaryOption(http_options_.respect_vary),
@@ -213,12 +235,14 @@ void InPlaceResourceRecorder::DroppedDueToSize() {
}
void InPlaceResourceRecorder::DroppedAsUncacheable() {
cache_->RememberFailure(
url_, fragment_,
status_code_ == 200 ? kFetchStatusUncacheable200
: kFetchStatusUncacheableError,
handler_);
failure_ = true;
if (!failure_) {
cache_->RememberFailure(
url_, fragment_,
status_code_ == 200 ? kFetchStatusUncacheable200
: kFetchStatusUncacheableError,
handler_);
failure_ = true;
}
}
void InPlaceResourceRecorder::SaveCacheControl(const char* cache_control) {
@@ -243,7 +267,9 @@ void InPlaceResourceRecorder::DoneAndSetHeaders(
if (status_code_ == HttpStatus::kOK && resource_value_.contents_size() == 0) {
// Ignore Empty 200 responses.
// https://github.com/pagespeed/mod_pagespeed/issues/1050
cache_->RememberFailure(url_, fragment_, kFetchStatusEmpty, handler_);
if (!failure_) {
cache_->RememberFailure(url_, fragment_, kFetchStatusEmpty, handler_);
}
failure_ = true;
}
@@ -84,7 +84,8 @@ class InPlaceResourceRecorderTest : public RewriteTestBase {
ResponseHeaders final_headers;
SetDefaultLongCacheHeaders(&kContentTypeCss, &final_headers);
if (header_time == kLateGzipHeader) {
if ((header_time == kLateGzipHeader) ||
(header_time == kPrelimGzipHeader)) {
final_headers.Add(HttpAttributes::kContentEncoding,
HttpAttributes::kGzip);
}
@@ -144,7 +145,33 @@ class InPlaceResourceRecorderTest : public RewriteTestBase {
EXPECT_EQ(
HTTPCache::FindResult(HTTPCache::kRecentFailure,
kFetchStatusUncacheable200),
HttpBlockingFind(kTestUrl, http_cache(), &value_out, &headers_out));
HttpBlockingFind(kTestUrl, http_cache(), &value_out, &headers_out))
<< content_type->mime_type();
}
const char* BailsForContentType(const char* mime_type) {
ResponseHeaders headers;
headers.set_status_code(HttpStatus::kOK);
SetDefaultLongCacheHeaders(&kContentTypeCss, &headers);
if (mime_type == nullptr) {
headers.RemoveAll(HttpAttributes::kContentType);
} else {
headers.Replace(HttpAttributes::kContentType, mime_type);
}
headers.ComputeCaching();
scoped_ptr<InPlaceResourceRecorder> recorder(MakeRecorder(kTestUrl));
recorder->ConsiderResponseHeaders(
InPlaceResourceRecorder::kPreliminaryHeaders, &headers);
if (recorder->failed()) {
return "prelim";
}
recorder->ConsiderResponseHeaders(
InPlaceResourceRecorder::kFullHeaders, &headers);
if (recorder->failed()) {
return "full";
}
return "none";
}
};
@@ -314,6 +341,25 @@ TEST_F(InPlaceResourceRecorderTest, SplitOperationWithGzipWithCompressedCache) {
TestWithGzip(kLateGzipHeader);
}
TEST_F(InPlaceResourceRecorderTest, BailEarlyOnUnexpectedContentType) {
EXPECT_STREQ("prelim", BailsForContentType(kContentTypeHtml.mime_type()));
EXPECT_STREQ("prelim", BailsForContentType(kContentTypePdf.mime_type()));
EXPECT_STREQ("prelim", BailsForContentType(kContentTypeText.mime_type()));
EXPECT_STREQ("prelim", BailsForContentType("bogus"));
EXPECT_STREQ("none", BailsForContentType(kContentTypeCss.mime_type()));
EXPECT_STREQ("none", BailsForContentType(kContentTypeJavascript.mime_type()));
EXPECT_STREQ("none", BailsForContentType(kContentTypeGif.mime_type()));
EXPECT_STREQ("none", BailsForContentType(kContentTypePng.mime_type()));
EXPECT_STREQ("none", BailsForContentType(kContentTypeJpeg.mime_type()));
EXPECT_STREQ("none", BailsForContentType(kContentTypeWebp.mime_type()));
// Note that if the content-type is missing in the first round, we
// don't bail early, but we will bail late. This is because in
// the preliminary round, we may not know the correct content type
// yet, so we need to be conservative and let the processing continue.
EXPECT_STREQ("full", BailsForContentType(nullptr));
}
} // namespace
} // namespace net_instaweb