1#!/usr/bin/env python
2# Copyright (c) 2012 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6'''Utilities used by GRIT.
7'''
8
9import codecs
10import htmlentitydefs
11import os
12import re
13import shutil
14import sys
15import tempfile
16import time
17import types
18from xml.sax import saxutils
19
20from grit import lazy_re
21
# Absolute path of the parent of the directory that contains this file.
# PathFromRoot() resolves root-relative paths against it.
_root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
23
24
# Unique constants for use by ReadFile(): BINARY reads the file in binary mode,
# RAW_TEXT reads it in text mode but without decoding to Unicode.
BINARY, RAW_TEXT = range(2)
27
28
# Unique constants representing data pack encodings.
_, UTF8, UTF16 = range(3)


def Encode(message, encoding):
  '''Converts |message| into the byte stream for the given data pack encoding.

  Args:
    message: a python unicode string (or data that is passed through as-is
      when |encoding| is neither UTF8 nor UTF16).
    encoding: UTF8 or UTF16; any other value leaves |message| untouched.

  Return:
    The encoded bytes for UTF8/UTF16, otherwise |message| itself.
  '''
  if encoding == UTF16:
    # The 'utf16' codec emits a 2-byte BOM first; drop it.
    with_bom = message.encode('utf16')
    return with_bom[2:]
  elif encoding == UTF8:
    return message.encode('utf8')
  else:
    # Anything else is treated as binary and returned unchanged.
    return message
44
45
# Matches all different types of linebreaks.  '\r\n' is listed first so that a
# CR+LF pair is matched as one linebreak rather than as two separate ones.
LINEBREAKS = re.compile('\r\n|\n|\r')
48
def MakeRelativePath(base_path, path_to_make_relative):
  """Returns a relative path leading from base_path to path_to_make_relative.

  In other words, os.path.join(base_path,
    MakeRelativePath(base_path, path_to_make_relative))
  is the same location as path_to_make_relative.

  If the two paths share no common prefix at all, path_to_make_relative is
  returned unchanged.

  Args:
    base_path: the root path
    path_to_make_relative: an absolute path that is on the same drive
      as base_path
  """
  sep = os.path.sep

  def _StripPrefix(prefix_path, path_with_prefix):
    """Returns the part of path_with_prefix that follows prefix_path,
    normalized and without leading or trailing path separators.  An empty
    remainder is returned as '' (not '.').

    Args:
      prefix_path: the base path
      path_with_prefix: a path that starts with prefix_path
    """
    assert path_with_prefix.startswith(prefix_path)
    tail = path_with_prefix[len(prefix_path):].strip(sep)
    tail = os.path.normpath(tail)
    return '' if tail == '.' else tail

  def _CommonDirectory(*paths):
    """Returns the common prefix directory of the given paths.

    Args:
      The paths to compare (at least one of which should be a directory)
    """
    # commonprefix() works character by character, so its result may stop in
    # the middle of a path component; the code below repairs that.
    common = os.path.commonprefix(paths)

    # A prefix that ends with the separator is already a directory.
    if common.endswith(sep):
      return common

    # Every path has to be examined: for "c:\a" and "c:\abc" the common
    # prefix "c:\a" looks like a directory when compared with the first path
    # only, but the second path shows that it is not.
    for path in paths:
      assert len(path) >= len(common)
      # The prefix is fine for this path when it is the whole path (one of
      # the arguments should be a directory) or when the path continues with
      # a separator right after it.
      at_directory_bound = path == common or path[len(common)] == sep
      if not at_directory_bound:
        # Back up to the previous separator (inclusive).  rfind() yields -1
        # when there is none, which makes the prefix empty.
        common = common[:common.rfind(sep) + 1]
        # The shortened prefix ends on a directory bound shared by all
        # paths, so no further checking is needed.
        break
    return common

  common_dir = _CommonDirectory(base_path, path_to_make_relative)
  # With no commonality at all, the absolute path is the best that can be
  # done.  If a relative path was really required this will surface later
  # (when a build breaks), and returning it keeps backward compatibility with
  # the earlier absolute-path behavior.
  if not common_dir:
    return path_to_make_relative

  # Climb from base_path up to the common directory: one '..' per
  # non-empty component (handles both "" and "foo\\bar").
  base_tail = _StripPrefix(common_dir, base_path)
  depth = sum(1 for piece in base_tail.split(sep) if piece)

  # Then descend from the common directory to the target.
  target_tail = _StripPrefix(common_dir, path_to_make_relative)
  return ('..' + sep) * depth + target_tail
141
142
# Every identifier passed to SetupSystemIdentifiers() so far.
KNOWN_SYSTEM_IDENTIFIERS = set()

# Regexp matching any identifier in KNOWN_SYSTEM_IDENTIFIERS; rebuilt on every
# call to SetupSystemIdentifiers().
SYSTEM_IDENTIFIERS = None

def SetupSystemIdentifiers(ids):
  '''Registers additional system identifiers and rebuilds the regexp that
  matches them.

  May be called repeatedly; the identifiers accumulate across calls.

  Args:
    ids: an iterable of strings
  '''
  global SYSTEM_IDENTIFIERS
  KNOWN_SYSTEM_IDENTIFIERS.update(ids)
  # Each identifier must match as a whole word.  re.VERBOSE makes the spaces
  # around '|' insignificant.
  alternatives = [r'\b%s\b' % identifier
                  for identifier in KNOWN_SYSTEM_IDENTIFIERS]
  SYSTEM_IDENTIFIERS = lazy_re.compile(' | '.join(alternatives), re.VERBOSE)
160
161
# Registers all of the resource IDs predefined by Windows, so that
# SYSTEM_IDENTIFIERS matches them from import time on.
SetupSystemIdentifiers((
    'IDOK', 'IDCANCEL', 'IDC_STATIC', 'IDYES', 'IDNO',
    'ID_FILE_NEW', 'ID_FILE_OPEN', 'ID_FILE_CLOSE', 'ID_FILE_SAVE',
    'ID_FILE_SAVE_AS', 'ID_FILE_PAGE_SETUP', 'ID_FILE_PRINT_SETUP',
    'ID_FILE_PRINT', 'ID_FILE_PRINT_DIRECT', 'ID_FILE_PRINT_PREVIEW',
    'ID_FILE_UPDATE', 'ID_FILE_SAVE_COPY_AS', 'ID_FILE_SEND_MAIL',
    'ID_FILE_MRU_FIRST', 'ID_FILE_MRU_LAST',
    'ID_EDIT_CLEAR', 'ID_EDIT_CLEAR_ALL', 'ID_EDIT_COPY',
    'ID_EDIT_CUT', 'ID_EDIT_FIND', 'ID_EDIT_PASTE', 'ID_EDIT_PASTE_LINK',
    'ID_EDIT_PASTE_SPECIAL', 'ID_EDIT_REPEAT', 'ID_EDIT_REPLACE',
    'ID_EDIT_SELECT_ALL', 'ID_EDIT_UNDO', 'ID_EDIT_REDO',
    'VS_VERSION_INFO', 'IDRETRY',
    'ID_APP_ABOUT', 'ID_APP_EXIT',
    'ID_NEXT_PANE', 'ID_PREV_PANE',
    'ID_WINDOW_NEW', 'ID_WINDOW_ARRANGE', 'ID_WINDOW_CASCADE',
    'ID_WINDOW_TILE_HORZ', 'ID_WINDOW_TILE_VERT', 'ID_WINDOW_SPLIT',
    'ATL_IDS_SCSIZE', 'ATL_IDS_SCMOVE', 'ATL_IDS_SCMINIMIZE',
    'ATL_IDS_SCMAXIMIZE', 'ATL_IDS_SCNEXTWINDOW', 'ATL_IDS_SCPREVWINDOW',
    'ATL_IDS_SCCLOSE', 'ATL_IDS_SCRESTORE', 'ATL_IDS_SCTASKLIST',
    'ATL_IDS_MDICHILD', 'ATL_IDS_IDLEMESSAGE', 'ATL_IDS_MRU_FILE' ))
183
184
# Matches character entities, whether specified by name, decimal or hex.  The
# value is captured in the named group 'named', 'decimal' or 'hex'
# respectively; matching is case-insensitive.
_HTML_ENTITY = lazy_re.compile(
  '&(#(?P<decimal>[0-9]+)|#x(?P<hex>[a-fA-F0-9]+)|(?P<named>[a-z0-9]+));',
  re.IGNORECASE)
189
# Matches characters that should be HTML-escaped.  This is ", <, > and &, but
# only if the & is not the start of an HTML character entity.  Entity names
# may contain digits (e.g. &frac12; or &sup2;), consistent with the names
# accepted by _HTML_ENTITY, so such entities are not double-escaped.
# NOTE: only the named alternative requires the terminating ';'; an '&' that
# starts a numeric reference is left alone even without it.
_HTML_CHARS_TO_ESCAPE = lazy_re.compile(
    '"|<|>|&(?!#[0-9]+|#x[0-9a-z]+|[a-z0-9]+;)',
    re.IGNORECASE | re.MULTILINE)
195
196
def ReadFile(filename, encoding):
  '''Returns the complete contents of the file at |filename|.

  Args:
    filename: The path to the file.
    encoding: A Python codec name or one of two special values: BINARY to read
              the file in binary mode, or RAW_TEXT to read it with newline
              conversion but without decoding to Unicode.
  '''
  if encoding == BINARY:
    open_mode = 'rb'
  else:
    # Universal-newline text mode.
    open_mode = 'rU'
  with open(filename, open_mode) as infile:
    contents = infile.read()
  if encoding in (BINARY, RAW_TEXT):
    return contents
  # Any other value is a codec name used to decode the text.
  return contents.decode(encoding)
212
213
def WrapOutputStream(stream, encoding = 'utf-8'):
  '''Wraps |stream| in a writer that encodes the characters written to it
  with |encoding| before passing them on to |stream|.'''
  writer_class = codecs.getwriter(encoding)
  return writer_class(stream)
218
219
def ChangeStdoutEncoding(encoding = 'utf-8'):
  '''Replaces sys.stdout with a wrapper that writes characters using
  |encoding|.'''
  encoded_stdout = WrapOutputStream(sys.stdout, encoding)
  sys.stdout = encoded_stdout
223
224
def EscapeHtml(text, escape_quotes = False):
  '''Returns 'text' with <, > and & replaced by named HTML entities; " is
  replaced as well when 'escape_quotes' is true.  An & that already starts a
  character entity (as recognized by _HTML_CHARS_TO_ESCAPE) is left untouched.
  This is appropriate for escaping text for inclusion in HTML, but not for XML.
  '''
  replacements = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    # The pattern always matches quotes; map them to themselves unless the
    # caller asked for them to be escaped.
    '"': '&quot;' if escape_quotes else '"',
  }

  def Replace(match):
    found = match.group()
    assert found in replacements
    return replacements[found]

  return _HTML_CHARS_TO_ESCAPE.sub(Replace, text)
241
242
def UnescapeHtml(text, replace_nbsp=True):
  '''Returns 'text' with all HTML character entities (both named character
  entities and those specified by decimal or hexadecimal Unicode ordinal)
  replaced by their Unicode characters (or latin1 characters if possible).

  The only exception is that &nbsp; will not be unescaped if 'replace_nbsp' is
  False.  Named entities that are not known to htmlentitydefs are left as-is.

  Args:
    text: the string to unescape
    replace_nbsp: whether &nbsp; should be replaced as well

  Return:
    The unescaped string.
  '''
  def Replace(match):
    groups = match.groupdict()
    if groups['hex']:
      return unichr(int(groups['hex'], 16))
    elif groups['decimal']:
      return unichr(int(groups['decimal'], 10))
    else:
      name = groups['named']
      if name == 'nbsp' and not replace_nbsp:
        return match.group()  # Don't replace &nbsp;
      assert name is not None
      # Test membership on the dict itself; .keys() would build and scan a
      # list for every entity.
      if name in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[name])
      else:
        return match.group()  # Unknown HTML character entity - don't replace

  return _HTML_ENTITY.sub(Replace, text)
269
270
def EncodeCdata(cdata):
  '''Returns the provided cdata in either escaped format or <![CDATA[xxx]]>
  format, depending on which is more appropriate for easy editing.  The data
  is escaped for inclusion in an XML element's body.

  The CDATA form is used when the text contains more than one '<' or more than
  one '>'.  It is never used when the text contains ']]>', because that
  sequence would terminate the CDATA section early; such text is escaped
  instead.

  Args:
    cdata: 'If x < y and y < z then x < z'

  Return:
    '<![CDATA[If x < y and y < z then x < z]]>'
  '''
  many_brackets = cdata.count('<') > 1 or cdata.count('>') > 1
  # The ']]>' check must guard both bracket conditions.
  if many_brackets and ']]>' not in cdata:
    return '<![CDATA[%s]]>' % cdata
  return saxutils.escape(cdata)
286
287
def FixupNamedParam(function, param_name, param_value):
  '''Wraps 'function' in a closure that supplies 'param_value' for the keyword
  argument 'param_name' whenever the caller does not pass that keyword
  explicitly.  All other arguments are forwarded unchanged.

  Args:
    function: callable
    param_name: 'bingo'
    param_value: 'bongo' (any type)

  Return:
    callable
  '''
  def FixupClosure(*args, **kw):
    # Only fills in the value when the caller left the keyword out.
    kw.setdefault(param_name, param_value)
    return function(*args, **kw)
  return FixupClosure
306
307
def PathFromRoot(path):
  '''Converts a path that is relative to the GRIT root directory (the one that
  grit.py resides in) into a normalized path that can be used to open the
  file, by joining it onto that root directory.

  Args:
    path: a path relative to the GRIT root, e.g. 'rel_dir/file.ext'

  Return:
    The normalized result of joining the root directory and 'path'.
  '''
  joined = os.path.join(_root_dir, path)
  return os.path.normpath(joined)
320
321
def ParseGrdForUnittest(body, base_dir=None):
  '''Wraps |body| in a skeleton .grd document, parses it and returns the
  result, for use in unit tests.

  Args:
    body: XML that goes inside the <release> element.
    base_dir: The base_dir attribute of the <grit> tag.  Defaults to the GRIT
      root directory.
  '''
  import StringIO
  from grit import grd_reader
  # The skeleton is a byte string, so unicode bodies are encoded first.
  release_xml = body.encode('utf-8') if isinstance(body, unicode) else body
  grd_base_dir = PathFromRoot('.') if base_dir is None else base_dir
  skeleton = '''<?xml version="1.0" encoding="UTF-8"?>
<grit latest_public_release="2" current_release="3" source_lang_id="en" base_dir="%s">
  <outputs>
  </outputs>
  <release seq="3">
    %s
  </release>
</grit>'''
  grd_stream = StringIO.StringIO(skeleton % (grd_base_dir, release_xml))
  return grd_reader.Parse(grd_stream, dir=".")
344
345
def StripBlankLinesAndComments(text):
  '''Removes empty lines and lines that start with // from C source code, for
  unit tests.  The remaining lines are joined with newlines.'''
  kept = []
  for line in text.splitlines():
    # Only completely empty lines and lines with '//' in the first column
    # are dropped; indented comments and whitespace-only lines stay.
    if not line or line.startswith('//'):
      continue
    kept.append(line)
  return '\n'.join(kept)
350
351
def dirname(filename):
  '''Like os.path.dirname(), except that an empty result is reported as '.'
  so the returned path is never empty.
  '''
  parent = os.path.dirname(filename)
  return '.' if parent == '' else parent
360
361
def normpath(path):
  '''Like os.path.normpath(), but backward slashes are first turned into
  forward slashes so that they are treated as separators when not running on
  Windows.
  '''
  # Always safe: the Windows version of os.path.normpath converts forward
  # slashes back to backward slashes.
  with_forward_slashes = path.replace('\\', '/')
  return os.path.normpath(with_forward_slashes)
370
371
_LANGUAGE_SPLIT_RE = lazy_re.compile('-|_|/')


def CanonicalLanguage(code):
  '''Canonicalizes a language code: the parts (separated by '-', '_' or '/')
  are joined with dashes and every part after the first is upper-cased.
  One-part language codes are returned unchanged.

  Args:
    code: 'zh_cn'

  Return:
    code: 'zh-CN'
  '''
  pieces = _LANGUAGE_SPLIT_RE.split(code)
  head, tail = pieces[0], pieces[1:]
  return '-'.join([head] + [piece.upper() for piece in tail])
390
391
# Maps canonical language codes (see CanonicalLanguage()) to the codepage
# number LanguageToCodepage() returns for them.
_LANG_TO_CODEPAGE = {
  'en' : 1252,
  'fr' : 1252,
  'it' : 1252,
  'de' : 1252,
  'es' : 1252,
  'nl' : 1252,
  'sv' : 1252,
  'no' : 1252,
  'da' : 1252,
  'fi' : 1252,
  'pt-BR' : 1252,
  'ru' : 1251,
  'ja' : 932,
  'zh-TW' : 950,
  'zh-CN' : 936,
  'ko' : 949,
}
410
411
def LanguageToCodepage(lang):
  '''Returns the codepage _number_ that can be used to represent 'lang', which
  may be in any of the formats 'en', 'pt_br', 'pt-BR', etc.

  The codepage returned will be one of the 'cpXXXX' codepage numbers.  For a
  language that is not in the table, a notice is printed and 1252 is returned.

  Args:
    lang: 'de'

  Return:
    1252
  '''
  canonical = CanonicalLanguage(lang)
  codepage = _LANG_TO_CODEPAGE.get(canonical)
  if codepage is None:
    print("Not sure which codepage to use for %s, assuming cp1252" % canonical)
    return 1252
  return codepage
430
def NewClassInstance(class_name, class_type):
  '''Imports the module named in class_name, instantiates the class (with no
  arguments) and returns the instance if it is of the expected type.

  Args:
    class_name: the fully qualified, dot separated package + classname,
    i.e. "my.package.name.MyClass". Short class names are not supported.
    class_type: the class or superclass this object must implement

  Return:
    An instance of the class, or None if none was found
  '''
  module_name, dot, short_name = class_name.rpartition('.')
  if not dot or not module_name:
    # No module part, so there is nothing to import.
    return None
  # The non-empty fromlist makes __import__ return the innermost module
  # rather than the top-level package.
  module = __import__(module_name, globals(), locals(), [''])
  if not hasattr(module, short_name):
    return None
  # Note that the class is instantiated before its type is checked.
  instance = getattr(module, short_name)()
  if isinstance(instance, class_type):
    return instance
  return None
455
456
def FixLineEnd(text, line_end):
  '''Returns |text| with every linebreak ('\\r\\n', '\\r' or '\\n') replaced by
  |line_end|.'''
  # Normalize everything to '\n' first, handling '\r\n' before a lone '\r'
  # so that a CR+LF pair counts once, then substitute the requested line end.
  normalized = text.replace('\r\n', '\n').replace('\r', '\n')
  return normalized.replace('\n', line_end)
464
465
def BoolToString(bool):
  '''Returns 'true' if the argument is truthy and 'false' otherwise.'''
  return 'true' if bool else 'false'
471
472
# Module-level verbosity flags, reported by IsVerbose() and IsExtraVerbose().
verbose = False
extra_verbose = False

def IsVerbose():
  '''Returns the current value of the module-level 'verbose' flag.'''
  return verbose

def IsExtraVerbose():
  '''Returns the current value of the module-level 'extra_verbose' flag.'''
  return extra_verbose
481
def ParseDefine(define):
  '''Splits a define argument into its name and value.

  The format is either "NAME=VAL" or "NAME", using True as the default value.
  Values of "1" and "0" are transformed to True and False respectively.
  Whitespace around the name and the value is stripped.

  Args:
    define: a string of the form "NAME=VAL" or "NAME".

  Returns:
    A (name, value) pair. name is a string, value a string or boolean.
  '''
  # Only the first '=' separates name and value; later ones belong to the
  # value.
  raw_name, equals, raw_value = define.partition('=')
  name = raw_name.strip()
  if not equals:
    return (name, True)
  value = raw_value.strip()
  if value == "1":
    value = True
  elif value == "0":
    value = False
  return (name, value)
503
504
class Substituter(object):
  '''Finds and substitutes variable names in text strings.

  Given a dictionary of variable names and values, prepares to
  search for patterns of the form [VAR_NAME] in a text.
  The value will be substituted back efficiently.
  Also applies to tclib.Message objects.
  '''

  def __init__(self):
    '''Create an empty substituter.'''
    # Maps variable name -> replacement value.
    self.substitutions_ = {}
    # True whenever the cached regular expression is out of date.
    self.dirty_ = True
    # Cached regular expression; built lazily by GetExp().
    self.exp = None

  def AddSubstitutions(self, subs):
    '''Add new values to the substitutor.

    Args:
      subs: A dictionary of new substitutions.
    '''
    self.substitutions_.update(subs)
    self.dirty_ = True

  def AddMessages(self, messages, lang):
    '''Adds substitutions extracted from node.Message objects.

    Each message's 'name' attribute becomes the key and its translation into
    |lang| becomes the value.

    Args:
      messages: a list of node.Message objects.
      lang: The translation language to use in substitutions.
    '''
    subs = [(str(msg.attrs['name']), msg.Translate(lang)) for msg in messages]
    # AddSubstitutions() also marks the cached expression as out of date.
    self.AddSubstitutions(dict(subs))

  def GetExp(self):
    '''Obtain a regular expression that will find substitution keys in text.

    Create and cache if the substituter has been updated. Use the cached value
    otherwise. Keys will be enclosed in [square brackets] in text.

    Returns:
      A regular expression object.
    '''
    if self.dirty_:
      # Keys are matched literally, so regexp metacharacters in a key must be
      # escaped; otherwise they could break compilation or add capture groups
      # that corrupt the result of split().
      components = [r'\[%s\]' % re.escape(k) for k in self.substitutions_]
      self.exp = re.compile('(%s)' % '|'.join(components))
      self.dirty_ = False
    return self.exp

  def Substitute(self, text):
    '''Substitute the variable values in the given text.

    Text of the form [message_name] will be replaced by the message's value.

    Args:
      text: A string of text.

    Returns:
      A string of text with substitutions done.
    '''
    # The capture group makes split() return the matched keys as fragments of
    # their own, in between the surrounding text.
    return ''.join(self._SubFragment(f) for f in self.GetExp().split(text))

  def _SubFragment(self, fragment):
    '''Utility function for Substitute.

    Performs a simple substitution if the fragment is exactly of the form
    [message_name].

    Args:
      fragment: A simple string.

    Returns:
      A string with the substitution done, or the fragment itself if it is not
      a known [message_name].
    '''
    if len(fragment) > 2 and fragment[0] == '[' and fragment[-1] == ']':
      sub = self.substitutions_.get(fragment[1:-1], None)
      if sub is not None:
        return sub
    return fragment

  def SubstituteMessage(self, msg):
    '''Apply substitutions to a tclib.Message object.

    Text of the form [message_name] will be replaced by a new placeholder,
    whose presentation will take the form the message_name_{UsageCount}, and
    whose example will be the message's value. Existing placeholders are
    not affected.

    Args:
      msg: A tclib.Message object.

    Returns:
      A tclib.Message object, with substitutions done.  |msg| itself is
      returned when nothing was substituted.
    '''
    from grit import tclib  # avoid circular import
    counts = {}
    placeholders = []
    pieces = []
    for f in self.GetExp().split(msg.GetPresentableContent()):
      sub = self._SubFragment(f)
      if f != sub:
        f = str(f)
        # Number each use of a key so that placeholder names are unique.
        count = counts.get(f, 0) + 1
        counts[f] = count
        name = "%s_%d" % (f[1:-1], count)
        placeholders.append(tclib.Placeholder(name, f, sub))
        pieces.append(name)
      else:
        pieces.append(f)
    if placeholders:
      return tclib.Message(''.join(pieces),
                           msg.GetPlaceholders() + placeholders,
                           msg.GetDescription(), msg.GetMeaning())
    else:
      return msg
620
621
class TempDir(object):
  '''Creates files with the specified contents in a temporary directory,
  for unit testing.

  Can be used as a context manager, in which case the directory is removed
  on exit; otherwise call CleanUp() explicitly.
  '''
  def __init__(self, file_data):
    '''Creates the temporary directory and populates it.

    Args:
      file_data: a dictionary mapping file names (relative to the temporary
        directory, possibly including subdirectories) to file contents.
    '''
    self._tmp_dir_name = tempfile.mkdtemp()
    try:
      assert not os.listdir(self.GetPath())
      for name, contents in file_data.items():
        file_path = self.GetPath(name)
        dir_path = os.path.split(file_path)[0]
        if not os.path.exists(dir_path):
          os.makedirs(dir_path)
        with open(file_path, 'w') as f:
          f.write(contents)
    except BaseException:
      # No caller can clean up a half-constructed TempDir, so remove the
      # directory here before propagating the failure.
      shutil.rmtree(self._tmp_dir_name, ignore_errors=True)
      raise

  def __enter__(self):
    return self

  def __exit__(self, *exc_info):
    self.CleanUp()

  def CleanUp(self):
    '''Removes the temporary directory and everything in it.'''
    shutil.rmtree(self.GetPath())

  def GetPath(self, name=''):
    '''Returns the path of |name| inside the temporary directory, or the
    directory itself when |name| is empty.
    '''
    name = os.path.join(self._tmp_dir_name, name)
    # An absolute |name| would make join() discard the temporary directory.
    assert name.startswith(self._tmp_dir_name)
    return name

  def AsCurrentDir(self):
    '''Returns a context manager that makes the temporary directory the
    current working directory and restores the previous one on exit.
    '''
    return self._AsCurrentDirClass(self.GetPath())

  class _AsCurrentDirClass(object):
    '''Context manager that changes the current directory to |path|.'''
    def __init__(self, path):
      self.path = path
    def __enter__(self):
      self.oldpath = os.getcwd()
      os.chdir(self.path)
    def __exit__(self, *exc_info):
      os.chdir(self.oldpath)
662