1#!/usr/bin/env python
2# Copyright (c) 2012 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Flattens a HTML file by inlining its external resources.
7
This is a small script that takes an HTML file, looks for src attributes
and inlines the files they refer to, producing one HTML file with no
external dependencies. It recursively inlines the included files.
11"""
12
13import os
14import re
15import sys
16import base64
17import mimetypes
18
19from grit import lazy_re
20from grit import util
21
# Distribution used when the DIST_ENV_VAR environment variable is not set.
DIST_DEFAULT = 'chromium'
# Name of the environment variable that GetDistribution() reads.
DIST_ENV_VAR = 'CHROMIUM_BUILD'
# Placeholder in file names that is replaced with the distribution name.
DIST_SUBSTR = '%DISTRIBUTION%'
25
# All patterns below are raw strings so that regex escapes such as \s and
# the \1 backreference reach the regex engine untouched.

# Matches beginning of an "if" block with trailing spaces.
_BEGIN_IF_BLOCK = lazy_re.compile(
    r'<if [^>]*?expr="(?P<expression>[^"]*)"[^>]*?>\s*')

# Matches ending of an "if" block with preceding spaces.
_END_IF_BLOCK = lazy_re.compile(r'\s*</if>')

# Used by DoInline to replace various links with inline content.
_STYLESHEET_RE = lazy_re.compile(
    r'<link rel="stylesheet"[^>]+?href="(?P<filename>[^"]*)".*?>(\s*</link>)?',
    re.DOTALL)
_INCLUDE_RE = lazy_re.compile(
    r'<include[^>]+?src="(?P<filename>[^"\']*)".*?>(\s*</include>)?',
    re.DOTALL)
# In both patterns below \1 refers to the 'quote' group, which is the first
# capturing group (the other groups are non-capturing).
_SRC_RE = lazy_re.compile(
    r'<(?!script)(?:[^>]+?\s)src=(?P<quote>")(?P<filename>[^"\']*)\1',
    re.MULTILINE)
# The second literal must be raw as well: in a normal string '\1' is the
# control character \x01, which made this pattern never match an icon link.
_ICON_RE = lazy_re.compile(
    r'<link rel="icon"\s(?:[^>]+?\s)?'
    r'href=(?P<quote>")(?P<filename>[^"\']*)\1',
    re.MULTILINE)
47
48
49
def FixupMimeType(mime_type):
  """Normalizes platform differences in mimetypes.guess_type results.

  Args:
    mime_type: mime type as returned by mimetypes.guess_type (may be None).

  Returns:
    The canonical mime type when a mapping for mime_type is known, otherwise
    mime_type unchanged.
  """
  canonical_types = {
    'image/x-png': 'image/png',
  }
  return canonical_types.get(mime_type, mime_type)
58
59
def GetDistribution():
  """Returns the name of the distribution we are building.

  The value of the DIST_ENV_VAR environment variable is used when it is set,
  otherwise DIST_DEFAULT. A value longer than one character that starts with
  an underscore has the underscore removed and is lower-cased.

  Returns:
    string
  """
  env_value = os.environ.get(DIST_ENV_VAR)
  if env_value is None:
    return DIST_DEFAULT
  if len(env_value) > 1 and env_value[0] == '_':
    return env_value[1:].lower()
  return env_value
72
73
def SrcInlineAsDataURL(
    src_match, base_path, distribution, inlined_files, names_only=False,
    filename_expansion_function=None):
  """regex replace function.

  Takes a regex match for src="filename", attempts to read the file
  at 'filename' and returns the src attribute with the file inlined
  as a data URI. If it finds DIST_SUBSTR string in file name, replaces
  it with distribution.

  Args:
    src_match: regex match object with a 'filename' named capturing group
    base_path: path to look for files in
    distribution: string that should replace DIST_SUBSTR
    inlined_files: set to which the path of the inlined file is added.
    names_only: If true, the function will not read the file but just return "".
                It will still add the filename to |inlined_files|.
    filename_expansion_function: optional function(filename) applied to the
                matched filename before anything else is done with it.

  Returns:
    string: the matched text with the filename replaced by a data URI, the
    unmodified matched text when the filename contains a ':', or "" when
    names_only is set.
  """
  filename = src_match.group('filename')
  if filename_expansion_function:
    filename = filename_expansion_function(filename)

  if ':' in filename:
    # filename is probably a URL, which we don't want to bother inlining
    return src_match.group(0)

  filename = filename.replace(DIST_SUBSTR, distribution)
  filepath = os.path.normpath(os.path.join(base_path, filename))
  inlined_files.add(filepath)

  if names_only:
    return ""

  mimetype = FixupMimeType(mimetypes.guess_type(filename)[0]) or 'text/plain'
  inline_data = base64.standard_b64encode(util.ReadFile(filepath, util.BINARY))
  if not isinstance(inline_data, str):
    # On Python 3 the encoder returns bytes; formatting those with %s would
    # embed "b'...'" in the URL. Base64 output is pure ASCII.
    inline_data = inline_data.decode('ascii')

  # Keep everything the pattern matched around the filename (attribute name,
  # quotes, url( ... ) wrapper) and only swap the filename itself.
  prefix = src_match.string[src_match.start():src_match.start('filename')]
  suffix = src_match.string[src_match.end('filename'):src_match.end()]
  return '%sdata:%s;base64,%s%s' % (prefix, mimetype, inline_data, suffix)
118
119
class InlinedData:
  """Result container returned by DoInline().

  Attributes are plain values supplied by the caller: the flattened text and
  the set of filenames of the files that were inlined.
  """
  def __init__(self, inlined_data, inlined_files):
    # Both values are stored as given; no copy is made.
    self.inlined_data, self.inlined_files = inlined_data, inlined_files
129
def DoInline(
    input_filename, grd_node, allow_external_script=False, names_only=False,
    rewrite_function=None, filename_expansion_function=None):
  """Helper function that inlines the resources in a specified file.

  Reads input_filename, finds all the src attributes and attempts to
  inline the files they are referring to, then returns the result and
  the set of inlined files.

  Args:
    input_filename: name of file to read in
    grd_node: html node from the grd file for this include tag. May be None,
        in which case every <if> condition is treated as satisfied.
    allow_external_script: if true, <script src="..."> tags are left alone
        instead of being replaced by the contents of the script file.
    names_only: |nil| will be returned for the inlined contents (faster).
    rewrite_function: function(filepath, text, distribution) which will be
        called to rewrite html content before inlining images.
    filename_expansion_function: function(filename) which will be called to
        rewrite filenames before attempting to read them.
  Returns:
    an InlinedData object holding the inlined data as a string (None when
    names_only is set) and the set of filenames of the inlined files
  Raises:
    Exception: if an <if> element has no matching </if>.
  """
  if filename_expansion_function:
    input_filename = filename_expansion_function(input_filename)
  input_filepath = os.path.dirname(input_filename)
  distribution = GetDistribution()

  # Keep track of all the files we inline.
  inlined_files = set()

  def SrcReplace(src_match, filepath=input_filepath,
                 inlined_files=inlined_files):
    """Helper function to provide SrcInlineAsDataURL with the base file path"""
    return SrcInlineAsDataURL(
        src_match, filepath, distribution, inlined_files, names_only=names_only,
        filename_expansion_function=filename_expansion_function)

  def GetFilepath(src_match, base_path=input_filepath):
    """Resolves the 'filename' group of src_match relative to base_path.

    Returns None when the filename contains a ':'.
    """
    filename = src_match.group('filename')

    if filename.find(':') != -1:
      # filename is probably a URL, which we don't want to bother inlining
      return None

    filename = filename.replace(DIST_SUBSTR, distribution)
    if filename_expansion_function:
      filename = filename_expansion_function(filename)
    return os.path.normpath(os.path.join(base_path, filename))

  def IsConditionSatisfied(src_match):
    """Evaluates the expr attribute of a matched <if> tag against grd_node."""
    expression = src_match.group('expression')
    return grd_node is None or grd_node.EvaluateCondition(expression)

  def CheckConditionalElements(text):
    """Helper function to conditionally inline inner elements"""
    while True:
      begin_if = _BEGIN_IF_BLOCK.search(text)
      if begin_if is None:
        return text

      condition_satisfied = IsConditionSatisfied(begin_if)
      leading = text[0:begin_if.start()]
      content_start = begin_if.end()

      # Find matching "if" block end. |count| is the current nesting depth:
      # it goes up for every nested <if> and down for every </if>.
      count = 1
      pos = begin_if.end()
      while True:
        end_if = _END_IF_BLOCK.search(text, pos)
        if end_if is None:
          raise Exception('Unmatched <if>')

        next_if = _BEGIN_IF_BLOCK.search(text, pos)
        if next_if is None or next_if.start() >= end_if.end():
          count = count - 1
          if count == 0:
            break
          pos = end_if.end()
        else:
          count = count + 1
          pos = next_if.end()

      content = text[content_start:end_if.start()]
      trailing = text[end_if.end():]

      if condition_satisfied:
        text = leading + CheckConditionalElements(content) + trailing
      else:
        text = leading + trailing

  def InlineFileContents(src_match, pattern, inlined_files=inlined_files):
    """Helper function to inline external files of various types"""
    filepath = GetFilepath(src_match)
    if filepath is None:
      return src_match.group(0)
    inlined_files.add(filepath)

    if names_only:
      inlined_files.update(GetResourceFilenames(
          filepath,
          allow_external_script,
          rewrite_function,
          filename_expansion_function=filename_expansion_function))
      return ""

    # NOTE(review): InlineToString only returns the flattened text, so files
    # inlined by this nested call are not added to |inlined_files| here, and
    # rewrite_function is not forwarded (unlike the names_only branch above).
    # Confirm whether that is intended.
    return pattern % InlineToString(
        filepath, grd_node, allow_external_script,
        filename_expansion_function=filename_expansion_function)

  def InlineIncludeFiles(src_match):
    """Helper function to directly inline generic external files (without
       wrapping them with any kind of tags).
    """
    return InlineFileContents(src_match, '%s')

  def InlineScript(match):
    """Helper function to inline external script files"""
    attrs = (match.group('attrs1') + match.group('attrs2')).strip()
    if attrs:
      attrs = ' ' + attrs
    return InlineFileContents(match, '<script' + attrs + '>%s</script>')

  def InlineCSSText(text, css_filepath):
    """Helper function that inlines external resources in CSS text"""
    filepath = os.path.dirname(css_filepath)
    # Allow custom modifications before inlining images.
    if rewrite_function:
      text = rewrite_function(filepath, text, distribution)
    text = InlineCSSImages(text, filepath)
    return InlineCSSImports(text, filepath)

  def InlineCSSFile(src_match, pattern, base_path=input_filepath):
    """Helper function to inline external CSS files.

    Args:
      src_match: A regular expression match with a named group named "filename".
      pattern: The pattern to replace with the contents of the CSS file.
      base_path: The base path to use for resolving the CSS file.

    Returns:
      The text that should replace the reference to the CSS file.
    """
    filepath = GetFilepath(src_match, base_path)
    if filepath is None:
      return src_match.group(0)

    # Even if names_only is set, the CSS file needs to be opened, because it
    # can link to images that need to be added to the file set.
    inlined_files.add(filepath)
    # When resolving CSS files we need to pass in the path so that relative URLs
    # can be resolved.
    return pattern % InlineCSSText(util.ReadFile(filepath, util.BINARY),
                                   filepath)

  def InlineCSSImages(text, filepath=input_filepath):
    """Helper function that inlines external images in CSS backgrounds."""
    # Replace contents of url() for css attributes: content, background,
    # or *-image.
    return re.sub(r'(content|background|[\w-]*-image):[^;]*'
                  r'(url\((?P<quote1>"|\'|)[^"\'()]*(?P=quote1)\)|'
                  r'image-set\('
                  r'([ ]*url\((?P<quote2>"|\'|)[^"\'()]*(?P=quote2)\)'
                  r'[ ]*[0-9.]*x[ ]*(,[ ]*)?)+\))',
                  lambda m: InlineCSSUrls(m, filepath),
                  text)

  def InlineCSSUrls(src_match, filepath=input_filepath):
    """Helper function that inlines each url on a CSS image rule match."""
    # Replace contents of url() references in matches.
    return re.sub(r'url\((?P<quote>"|\'|)(?P<filename>[^"\'()]*)(?P=quote)\)',
                  lambda m: SrcReplace(m, filepath),
                  src_match.group(0))

  def InlineCSSImports(text, filepath=input_filepath):
    """Helper function that inlines CSS files included via the @import
       directive.
    """
    return re.sub(r'@import\s+url\((?P<quote>"|\'|)(?P<filename>[^"\'()]*)'
                  r'(?P=quote)\);',
                  lambda m: InlineCSSFile(m, '%s', filepath),
                  text)


  flat_text = util.ReadFile(input_filename, util.BINARY)

  # Check conditional elements, remove unsatisfied ones from the file. We do
  # this twice. The first pass is so that we don't even bother calling
  # InlineScript, InlineCSSFile and InlineIncludeFiles on text we're eventually
  # going to throw out anyway.
  flat_text = CheckConditionalElements(flat_text)

  if not allow_external_script:
    # We need to inline css and js before we inline images so that image
    # references gets inlined in the css and js
    flat_text = re.sub(r'<script (?P<attrs1>.*?)src="(?P<filename>[^"\']*)"'
                       r'(?P<attrs2>.*?)></script>',
                       InlineScript,
                       flat_text)

  flat_text = _STYLESHEET_RE.sub(
      lambda m: InlineCSSFile(m, '<style>%s</style>'),
      flat_text)

  flat_text = _INCLUDE_RE.sub(InlineIncludeFiles, flat_text)

  # Check conditional elements, second pass. This catches conditionals in any
  # of the text we just inlined.
  flat_text = CheckConditionalElements(flat_text)

  # Allow custom modifications before inlining images.
  if rewrite_function:
    flat_text = rewrite_function(input_filepath, flat_text, distribution)

  flat_text = _SRC_RE.sub(SrcReplace, flat_text)

  # TODO(arv): Only do this inside <style> tags.
  flat_text = InlineCSSImages(flat_text)

  flat_text = _ICON_RE.sub(SrcReplace, flat_text)

  if names_only:
    flat_text = None  # Would contain garbage if the flag is set anyway.
  return InlinedData(flat_text, inlined_files)
352
353
def InlineToString(input_filename, grd_node, allow_external_script=False,
                   rewrite_function=None, filename_expansion_function=None):
  """Inlines the resources in a specified file and returns it as a string.

  Args:
    input_filename: name of file to read in
    grd_node: html node from the grd file for this include tag
    allow_external_script: passed through to DoInline.
    rewrite_function: passed through to DoInline.
    filename_expansion_function: passed through to DoInline.
  Returns:
    the inlined data as a string
  Raises:
    Exception: if an IOError occurs while flattening; the message names both
        the file that failed and input_filename.
  """
  try:
    return DoInline(
        input_filename,
        grd_node,
        allow_external_script=allow_external_script,
        rewrite_function=rewrite_function,
        filename_expansion_function=filename_expansion_function).inlined_data
  # "except ... as" is accepted by Python 2.6+ and Python 3 alike.
  except IOError as e:
    raise Exception("Failed to open %s while trying to flatten %s. (%s)" %
                    (e.filename, input_filename, e.strerror))
374
375
def InlineToFile(input_filename, output_filename, grd_node):
  """Inlines the resources in a specified file and writes it.

  Reads input_filename, finds all the src attributes and attempts to
  inline the files they are referring to, then writes the result
  to output_filename.

  Args:
    input_filename: name of file to read in
    output_filename: name of file to be written to
    grd_node: html node from the grd file for this include tag
  Returns:
    None. The flattened text is only written to output_filename.
  """
  inlined_data = InlineToString(input_filename, grd_node)
  with open(output_filename, 'wb') as out_file:
    # A single write() produces the same file as writelines() on a string,
    # without iterating over it one character at a time.
    out_file.write(inlined_data)
393
394
def GetResourceFilenames(filename,
                         allow_external_script=False,
                         rewrite_function=None,
                         filename_expansion_function=None):
  """Returns the set of all the files that would be inlined into a file.

  Runs DoInline in names_only mode with no grd node.

  Args:
    filename: name of the file to scan
    allow_external_script: passed through to DoInline.
    rewrite_function: passed through to DoInline.
    filename_expansion_function: passed through to DoInline.
  Returns:
    the set of filenames collected by DoInline
  Raises:
    Exception: if an IOError occurs while scanning; the message names both
        the file that failed and filename.
  """
  try:
    return DoInline(
        filename,
        None,
        names_only=True,
        allow_external_script=allow_external_script,
        rewrite_function=rewrite_function,
        filename_expansion_function=filename_expansion_function).inlined_files
  # "except ... as" is accepted by Python 2.6+ and Python 3 alike.
  except IOError as e:
    raise Exception("Failed to open %s while trying to flatten %s. (%s)" %
                    (e.filename, filename, e.strerror))
411
412
def main():
  """Command line entry point.

  Prints a usage message unless both an input and an output file name are
  given in sys.argv; otherwise flattens the input file into the output file.
  """
  if len(sys.argv) <= 2:
    # A parenthesised single string prints identically with the Python 2
    # print statement and the Python 3 print function.
    print("Flattens a HTML file by inlining its external resources.\n")
    print("html_inline.py inputfile outputfile")
  else:
    InlineToFile(sys.argv[1], sys.argv[2], None)

if __name__ == '__main__':
  main()
422