// Copyright (c) 2011 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // // This is a port of ManifestParser.cc from WebKit/WebCore/loader/appcache. /* * Copyright (C) 2008 Apple Inc. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "webkit/appcache/manifest_parser.h" #include "base/i18n/icu_string_conversions.h" #include "base/logging.h" #include "base/utf_string_conversions.h" #include "googleurl/src/gurl.h" namespace appcache { enum Mode { EXPLICIT, INTERCEPT, FALLBACK, ONLINE_WHITELIST, UNKNOWN, }; Manifest::Manifest() : online_whitelist_all(false) {} Manifest::~Manifest() {} bool ParseManifest(const GURL& manifest_url, const char* data, int length, Manifest& manifest) { // This is an implementation of the parsing algorithm specified in // the HTML5 offline web application docs: // http://www.w3.org/TR/html5/offline.html // Do not modify it without consulting those docs. // Though you might be tempted to convert these wstrings to UTF-8 or // string16, this implementation seems simpler given the constraints. const wchar_t kSignature[] = L"CACHE MANIFEST"; const size_t kSignatureLength = arraysize(kSignature) - 1; const wchar_t kChromiumSignature[] = L"CHROMIUM CACHE MANIFEST"; const size_t kChromiumSignatureLength = arraysize(kChromiumSignature) - 1; DCHECK(manifest.explicit_urls.empty()); DCHECK(manifest.fallback_namespaces.empty()); DCHECK(manifest.online_whitelist_namespaces.empty()); DCHECK(!manifest.online_whitelist_all); Mode mode = EXPLICIT; std::wstring data_string; // TODO(jennb): cannot do UTF8ToWide(data, length, &data_string); // until UTF8ToWide uses 0xFFFD Unicode replacement character. base::CodepageToWide(std::string(data, length), base::kCodepageUTF8, base::OnStringConversionError::SUBSTITUTE, &data_string); const wchar_t* p = data_string.c_str(); const wchar_t* end = p + data_string.length(); // Look for the magic signature: "^\xFEFF?CACHE MANIFEST[ \t]?" // Example: "CACHE MANIFEST #comment" is a valid signature. // Example: "CACHE MANIFEST;V2" is not. // When the input data starts with a UTF-8 Byte-Order-Mark // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a // Unicode BOM (U+FEFF). Skip a converted Unicode BOM if it exists. int bom_offset = 0; if (!data_string.empty() && data_string[0] == 0xFEFF) { bom_offset = 1; ++p; } if (p >= end) return false; // Check for a supported signature and skip p past it. if (0 == data_string.compare(bom_offset, kSignatureLength, kSignature)) { p += kSignatureLength; } else if (0 == data_string.compare(bom_offset, kChromiumSignatureLength, kChromiumSignature)) { p += kChromiumSignatureLength; } else { return false; } // Character after "CACHE MANIFEST" must be whitespace. if (p < end && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r') return false; // Skip to the end of the line. while (p < end && *p != '\r' && *p != '\n') ++p; while (1) { // Skip whitespace while (p < end && (*p == '\n' || *p == '\r' || *p == ' ' || *p == '\t')) ++p; if (p == end) break; const wchar_t* line_start = p; // Find the end of the line while (p < end && *p != '\r' && *p != '\n') ++p; // Check if we have a comment if (*line_start == '#') continue; // Get rid of trailing whitespace const wchar_t* tmp = p - 1; while (tmp > line_start && (*tmp == ' ' || *tmp == '\t')) --tmp; std::wstring line(line_start, tmp - line_start + 1); if (line == L"CACHE:") { mode = EXPLICIT; } else if (line == L"FALLBACK:") { mode = FALLBACK; } else if (line == L"NETWORK:") { mode = ONLINE_WHITELIST; } else if (line == L"CHROMIUM-INTERCEPT:") { mode = INTERCEPT; } else if (*(line.end() - 1) == ':') { mode = UNKNOWN; } else if (mode == UNKNOWN) { continue; } else if (line == L"*" && mode == ONLINE_WHITELIST) { manifest.online_whitelist_all = true; continue; } else if (mode == EXPLICIT || mode == ONLINE_WHITELIST) { const wchar_t *line_p = line.c_str(); const wchar_t *line_end = line_p + line.length(); // Look for whitespace separating the URL from subsequent ignored tokens. while (line_p < line_end && *line_p != '\t' && *line_p != ' ') ++line_p; string16 url16; WideToUTF16(line.c_str(), line_p - line.c_str(), &url16); GURL url = manifest_url.Resolve(url16); if (!url.is_valid()) continue; if (url.has_ref()) { GURL::Replacements replacements; replacements.ClearRef(); url = url.ReplaceComponents(replacements); } // Scheme component must be the same as the manifest URL's. if (url.scheme() != manifest_url.scheme()) { continue; } // See http://code.google.com/p/chromium/issues/detail?id=69594 // We willfully violate the HTML5 spec at this point in order // to support the appcaching of cross-origin HTTPS resources. // Per the spec, EXPLICIT cross-origin HTTS resources should be // ignored here. We've opted for a milder constraint and allow // caching unless the resource has a "no-store" header. That // condition is enforced in AppCacheUpdateJob. if (mode == EXPLICIT) { manifest.explicit_urls.insert(url.spec()); } else { manifest.online_whitelist_namespaces.push_back(url); } } else if (mode == INTERCEPT) { // Lines of the form, // const wchar_t* line_p = line.c_str(); const wchar_t* line_end = line_p + line.length(); // Look for first whitespace separating the url namespace from // the intercept type. while (line_p < line_end && *line_p != '\t' && *line_p != ' ') ++line_p; if (line_p == line_end) continue; // There was no whitespace separating the URLs. string16 namespace_url16; WideToUTF16(line.c_str(), line_p - line.c_str(), &namespace_url16); GURL namespace_url = manifest_url.Resolve(namespace_url16); if (!namespace_url.is_valid()) continue; if (namespace_url.has_ref()) { GURL::Replacements replacements; replacements.ClearRef(); namespace_url = namespace_url.ReplaceComponents(replacements); } // The namespace URL must have the same scheme, host and port // as the manifest's URL. if (manifest_url.GetOrigin() != namespace_url.GetOrigin()) continue; // Skip whitespace separating namespace from the type. while (line_p < line_end && (*line_p == '\t' || *line_p == ' ')) ++line_p; // Look for whitespace separating the type from the target url. const wchar_t* type_start = line_p; while (line_p < line_end && *line_p != '\t' && *line_p != ' ') ++line_p; // Look for a type value we understand, otherwise skip the line. std::wstring type(type_start, line_p - type_start); if (type != L"return") continue; // Skip whitespace separating type from the target_url. while (line_p < line_end && (*line_p == '\t' || *line_p == ' ')) ++line_p; // Look for whitespace separating the URL from subsequent ignored tokens. const wchar_t* target_url_start = line_p; while (line_p < line_end && *line_p != '\t' && *line_p != ' ') ++line_p; string16 target_url16; WideToUTF16(target_url_start, line_p - target_url_start, &target_url16); GURL target_url = manifest_url.Resolve(target_url16); if (!target_url.is_valid()) continue; if (target_url.has_ref()) { GURL::Replacements replacements; replacements.ClearRef(); target_url = target_url.ReplaceComponents(replacements); } if (manifest_url.GetOrigin() != target_url.GetOrigin()) continue; manifest.intercept_namespaces.push_back( Namespace(INTERCEPT_NAMESPACE, namespace_url, target_url)); } else if (mode == FALLBACK) { const wchar_t* line_p = line.c_str(); const wchar_t* line_end = line_p + line.length(); // Look for whitespace separating the two URLs while (line_p < line_end && *line_p != '\t' && *line_p != ' ') ++line_p; if (line_p == line_end) { // There was no whitespace separating the URLs. continue; } string16 namespace_url16; WideToUTF16(line.c_str(), line_p - line.c_str(), &namespace_url16); GURL namespace_url = manifest_url.Resolve(namespace_url16); if (!namespace_url.is_valid()) continue; if (namespace_url.has_ref()) { GURL::Replacements replacements; replacements.ClearRef(); namespace_url = namespace_url.ReplaceComponents(replacements); } // Fallback namespace URL must have the same scheme, host and port // as the manifest's URL. if (manifest_url.GetOrigin() != namespace_url.GetOrigin()) { continue; } // Skip whitespace separating fallback namespace from URL. while (line_p < line_end && (*line_p == '\t' || *line_p == ' ')) ++line_p; // Look for whitespace separating the URL from subsequent ignored tokens. const wchar_t* fallback_start = line_p; while (line_p < line_end && *line_p != '\t' && *line_p != ' ') ++line_p; string16 fallback_url16; WideToUTF16(fallback_start, line_p - fallback_start, &fallback_url16); GURL fallback_url = manifest_url.Resolve(fallback_url16); if (!fallback_url.is_valid()) continue; if (fallback_url.has_ref()) { GURL::Replacements replacements; replacements.ClearRef(); fallback_url = fallback_url.ReplaceComponents(replacements); } // Fallback entry URL must have the same scheme, host and port // as the manifest's URL. if (manifest_url.GetOrigin() != fallback_url.GetOrigin()) { continue; } // Store regardless of duplicate namespace URL. Only first match // will ever be used. manifest.fallback_namespaces.push_back( Namespace(FALLBACK_NAMESPACE, namespace_url, fallback_url)); } else { NOTREACHED(); } } return true; } } // namespace appcache