diff options
Diffstat (limited to 'tools/grit/grit/format/html_inline.py')
-rwxr-xr-x | tools/grit/grit/format/html_inline.py | 423 |
1 files changed, 423 insertions, 0 deletions
diff --git a/tools/grit/grit/format/html_inline.py b/tools/grit/grit/format/html_inline.py new file mode 100755 index 0000000..f532496 --- /dev/null +++ b/tools/grit/grit/format/html_inline.py @@ -0,0 +1,423 @@ +#!/usr/bin/env python +# Copyright (c) 2012 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +"""Flattens a HTML file by inlining its external resources. + +This is a small script that takes a HTML file, looks for src attributes +and inlines the specified file, producing one HTML file with no external +dependencies. It recursively inlines the included files. +""" + +import os +import re +import sys +import base64 +import mimetypes + +from grit import lazy_re +from grit import util + +# There is a python bug that makes mimetypes crash if the Windows +# registry contains non-Latin keys ( http://bugs.python.org/issue9291 +# ). Initing manually and blocking external mime-type databases will +# prevent that bug and if we add svg manually, it will still give us +# the data we need. +mimetypes.init([]) +mimetypes.add_type('image/svg+xml', '.svg') + +DIST_DEFAULT = 'chromium' +DIST_ENV_VAR = 'CHROMIUM_BUILD' +DIST_SUBSTR = '%DISTRIBUTION%' + +# Matches beginning of an "if" block with trailing spaces. +_BEGIN_IF_BLOCK = lazy_re.compile( + '<if [^>]*?expr="(?P<expression>[^"]*)"[^>]*?>\s*') + +# Matches ending of an "if" block with preceding spaces. +_END_IF_BLOCK = lazy_re.compile('\s*</if>') + +# Used by DoInline to replace various links with inline content. +_STYLESHEET_RE = lazy_re.compile( + '<link rel="stylesheet"[^>]+?href="(?P<filename>[^"]*)".*?>(\s*</link>)?', + re.DOTALL) +_INCLUDE_RE = lazy_re.compile( + '<include[^>]+?src="(?P<filename>[^"\']*)".*?>(\s*</include>)?', + re.DOTALL) +_SRC_RE = lazy_re.compile( + r'<(?!script)(?:[^>]+?\s)src=(?P<quote>")(?!\[\[|{{)(?P<filename>[^"\']*)\1', + re.MULTILINE) +_ICON_RE = lazy_re.compile( + r'<link rel="icon"\s(?:[^>]+?\s)?' + 'href=(?P<quote>")(?P<filename>[^"\']*)\1', + re.MULTILINE) + + +def GetDistribution(): + """Helper function that gets the distribution we are building. + + Returns: + string + """ + distribution = DIST_DEFAULT + if DIST_ENV_VAR in os.environ.keys(): + distribution = os.environ[DIST_ENV_VAR] + if len(distribution) > 1 and distribution[0] == '_': + distribution = distribution[1:].lower() + return distribution + + +def SrcInlineAsDataURL( + src_match, base_path, distribution, inlined_files, names_only=False, + filename_expansion_function=None): + """regex replace function. + + Takes a regex match for src="filename", attempts to read the file + at 'filename' and returns the src attribute with the file inlined + as a data URI. If it finds DIST_SUBSTR string in file name, replaces + it with distribution. + + Args: + src_match: regex match object with 'filename' and 'quote' named capturing + groups + base_path: path that to look for files in + distribution: string that should replace DIST_SUBSTR + inlined_files: The name of the opened file is appended to this list. + names_only: If true, the function will not read the file but just return "". + It will still add the filename to |inlined_files|. + + Returns: + string + """ + filename = src_match.group('filename') + if filename_expansion_function: + filename = filename_expansion_function(filename) + quote = src_match.group('quote') + + if filename.find(':') != -1: + # filename is probably a URL, which we don't want to bother inlining + return src_match.group(0) + + filename = filename.replace(DIST_SUBSTR , distribution) + filepath = os.path.normpath(os.path.join(base_path, filename)) + inlined_files.add(filepath) + + if names_only: + return "" + + mimetype = mimetypes.guess_type(filename)[0] + if mimetype is None: + raise Exception('%s is of an an unknown type and ' + 'cannot be stored in a data url.' % filename) + inline_data = base64.standard_b64encode(util.ReadFile(filepath, util.BINARY)) + + prefix = src_match.string[src_match.start():src_match.start('filename')] + suffix = src_match.string[src_match.end('filename'):src_match.end()] + return '%sdata:%s;base64,%s%s' % (prefix, mimetype, inline_data, suffix) + + +class InlinedData: + """Helper class holding the results from DoInline(). + + Holds the inlined data and the set of filenames of all the inlined + files. + """ + def __init__(self, inlined_data, inlined_files): + self.inlined_data = inlined_data + self.inlined_files = inlined_files + +def DoInline( + input_filename, grd_node, allow_external_script=False, names_only=False, + rewrite_function=None, filename_expansion_function=None): + """Helper function that inlines the resources in a specified file. + + Reads input_filename, finds all the src attributes and attempts to + inline the files they are referring to, then returns the result and + the set of inlined files. + + Args: + input_filename: name of file to read in + grd_node: html node from the grd file for this include tag + names_only: |nil| will be returned for the inlined contents (faster). + rewrite_function: function(filepath, text, distribution) which will be + called to rewrite html content before inlining images. + filename_expansion_function: function(filename) which will be called to + rewrite filenames before attempting to read them. + Returns: + a tuple of the inlined data as a string and the set of filenames + of all the inlined files + """ + if filename_expansion_function: + input_filename = filename_expansion_function(input_filename) + input_filepath = os.path.dirname(input_filename) + distribution = GetDistribution() + + # Keep track of all the files we inline. + inlined_files = set() + + def SrcReplace(src_match, filepath=input_filepath, + inlined_files=inlined_files): + """Helper function to provide SrcInlineAsDataURL with the base file path""" + return SrcInlineAsDataURL( + src_match, filepath, distribution, inlined_files, names_only=names_only, + filename_expansion_function=filename_expansion_function) + + def GetFilepath(src_match, base_path = input_filepath): + filename = src_match.group('filename') + + if filename.find(':') != -1: + # filename is probably a URL, which we don't want to bother inlining + return None + + filename = filename.replace('%DISTRIBUTION%', distribution) + if filename_expansion_function: + filename = filename_expansion_function(filename) + return os.path.normpath(os.path.join(base_path, filename)) + + def IsConditionSatisfied(src_match): + expression = src_match.group('expression') + return grd_node is None or grd_node.EvaluateCondition(expression) + + def CheckConditionalElements(str): + """Helper function to conditionally inline inner elements""" + while True: + begin_if = _BEGIN_IF_BLOCK.search(str) + if begin_if is None: + if _END_IF_BLOCK.search(str) is not None: + raise Exception('Unmatched </if>') + return str + + condition_satisfied = IsConditionSatisfied(begin_if) + leading = str[0:begin_if.start()] + content_start = begin_if.end() + + # Find matching "if" block end. + count = 1 + pos = begin_if.end() + while True: + end_if = _END_IF_BLOCK.search(str, pos) + if end_if is None: + raise Exception('Unmatched <if>') + + next_if = _BEGIN_IF_BLOCK.search(str, pos) + if next_if is None or next_if.start() >= end_if.end(): + count = count - 1 + if count == 0: + break + pos = end_if.end() + else: + count = count + 1 + pos = next_if.end() + + content = str[content_start:end_if.start()] + trailing = str[end_if.end():] + + if condition_satisfied: + str = leading + CheckConditionalElements(content) + trailing + else: + str = leading + trailing + + def InlineFileContents(src_match, pattern, inlined_files=inlined_files): + """Helper function to inline external files of various types""" + filepath = GetFilepath(src_match) + if filepath is None: + return src_match.group(0) + inlined_files.add(filepath) + + if names_only: + inlined_files.update(GetResourceFilenames( + filepath, + allow_external_script, + rewrite_function, + filename_expansion_function=filename_expansion_function)) + return "" + + return pattern % InlineToString( + filepath, grd_node, allow_external_script, + filename_expansion_function=filename_expansion_function) + + def InlineIncludeFiles(src_match): + """Helper function to directly inline generic external files (without + wrapping them with any kind of tags). + """ + return InlineFileContents(src_match, '%s') + + def InlineScript(match): + """Helper function to inline external script files""" + attrs = (match.group('attrs1') + match.group('attrs2')).strip() + if attrs: + attrs = ' ' + attrs + return InlineFileContents(match, '<script' + attrs + '>%s</script>') + + def InlineCSSText(text, css_filepath): + """Helper function that inlines external resources in CSS text""" + filepath = os.path.dirname(css_filepath) + # Allow custom modifications before inlining images. + if rewrite_function: + text = rewrite_function(filepath, text, distribution) + text = InlineCSSImages(text, filepath) + return InlineCSSImports(text, filepath) + + def InlineCSSFile(src_match, pattern, base_path=input_filepath): + """Helper function to inline external CSS files. + + Args: + src_match: A regular expression match with a named group named "filename". + pattern: The pattern to replace with the contents of the CSS file. + base_path: The base path to use for resolving the CSS file. + + Returns: + The text that should replace the reference to the CSS file. + """ + filepath = GetFilepath(src_match, base_path) + if filepath is None: + return src_match.group(0) + + # Even if names_only is set, the CSS file needs to be opened, because it + # can link to images that need to be added to the file set. + inlined_files.add(filepath) + # When resolving CSS files we need to pass in the path so that relative URLs + # can be resolved. + return pattern % InlineCSSText(util.ReadFile(filepath, util.BINARY), + filepath) + + def InlineCSSImages(text, filepath=input_filepath): + """Helper function that inlines external images in CSS backgrounds.""" + # Replace contents of url() for css attributes: content, background, + # or *-image. + return re.sub('(content|background|[\w-]*-image):[^;]*' + + '(url\((?P<quote1>"|\'|)[^"\'()]*(?P=quote1)\)|' + + 'image-set\(' + + '([ ]*url\((?P<quote2>"|\'|)[^"\'()]*(?P=quote2)\)' + + '[ ]*[0-9.]*x[ ]*(,[ ]*)?)+\))', + lambda m: InlineCSSUrls(m, filepath), + text) + + def InlineCSSUrls(src_match, filepath=input_filepath): + """Helper function that inlines each url on a CSS image rule match.""" + # Replace contents of url() references in matches. + return re.sub('url\((?P<quote>"|\'|)(?P<filename>[^"\'()]*)(?P=quote)\)', + lambda m: SrcReplace(m, filepath), + src_match.group(0)) + + def InlineCSSImports(text, filepath=input_filepath): + """Helper function that inlines CSS files included via the @import + directive. + """ + return re.sub('@import\s+url\((?P<quote>"|\'|)(?P<filename>[^"\'()]*)' + + '(?P=quote)\);', + lambda m: InlineCSSFile(m, '%s', filepath), + text) + + + flat_text = util.ReadFile(input_filename, util.BINARY) + + # Check conditional elements, remove unsatisfied ones from the file. We do + # this twice. The first pass is so that we don't even bother calling + # InlineScript, InlineCSSFile and InlineIncludeFiles on text we're eventually + # going to throw out anyway. + flat_text = CheckConditionalElements(flat_text) + + if not allow_external_script: + # We need to inline css and js before we inline images so that image + # references gets inlined in the css and js + flat_text = re.sub('<script (?P<attrs1>.*?)src="(?P<filename>[^"\']*)"' + + '(?P<attrs2>.*?)></script>', + InlineScript, + flat_text) + + flat_text = _STYLESHEET_RE.sub( + lambda m: InlineCSSFile(m, '<style>%s</style>'), + flat_text) + + flat_text = _INCLUDE_RE.sub(InlineIncludeFiles, flat_text) + + # Check conditional elements, second pass. This catches conditionals in any + # of the text we just inlined. + flat_text = CheckConditionalElements(flat_text) + + # Allow custom modifications before inlining images. + if rewrite_function: + flat_text = rewrite_function(input_filepath, flat_text, distribution) + + flat_text = _SRC_RE.sub(SrcReplace, flat_text) + + # TODO(arv): Only do this inside <style> tags. + flat_text = InlineCSSImages(flat_text) + + flat_text = _ICON_RE.sub(SrcReplace, flat_text) + + if names_only: + flat_text = None # Will contains garbage if the flag is set anyway. + return InlinedData(flat_text, inlined_files) + + +def InlineToString(input_filename, grd_node, allow_external_script=False, + rewrite_function=None, filename_expansion_function=None): + """Inlines the resources in a specified file and returns it as a string. + + Args: + input_filename: name of file to read in + grd_node: html node from the grd file for this include tag + Returns: + the inlined data as a string + """ + try: + return DoInline( + input_filename, + grd_node, + allow_external_script=allow_external_script, + rewrite_function=rewrite_function, + filename_expansion_function=filename_expansion_function).inlined_data + except IOError, e: + raise Exception("Failed to open %s while trying to flatten %s. (%s)" % + (e.filename, input_filename, e.strerror)) + + +def InlineToFile(input_filename, output_filename, grd_node): + """Inlines the resources in a specified file and writes it. + + Reads input_filename, finds all the src attributes and attempts to + inline the files they are referring to, then writes the result + to output_filename. + + Args: + input_filename: name of file to read in + output_filename: name of file to be written to + grd_node: html node from the grd file for this include tag + Returns: + a set of filenames of all the inlined files + """ + inlined_data = InlineToString(input_filename, grd_node) + with open(output_filename, 'wb') as out_file: + out_file.writelines(inlined_data) + + +def GetResourceFilenames(filename, + allow_external_script=False, + rewrite_function=None, + filename_expansion_function=None): + """For a grd file, returns a set of all the files that would be inline.""" + try: + return DoInline( + filename, + None, + names_only=True, + allow_external_script=allow_external_script, + rewrite_function=rewrite_function, + filename_expansion_function=filename_expansion_function).inlined_files + except IOError, e: + raise Exception("Failed to open %s while trying to flatten %s. (%s)" % + (e.filename, filename, e.strerror)) + + +def main(): + if len(sys.argv) <= 2: + print "Flattens a HTML file by inlining its external resources.\n" + print "html_inline.py inputfile outputfile" + else: + InlineToFile(sys.argv[1], sys.argv[2], None) + +if __name__ == '__main__': + main() |