# grit/tool/xmb.py

#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""The 'grit xmb' tool.
"""

import getopt
import os

from xml.sax import saxutils

from grit import grd_reader
from grit import lazy_re
from grit import tclib
from grit import util
from grit.tool import interface


# Matches a run of one or more whitespace characters.  Used to collapse
# presentable content to determine if xml:space="preserve" is needed: a
# message whose content changes when it is stripped and has its whitespace
# runs collapsed to a single space needs its whitespace preserved.
_WHITESPACES_REGEX = lazy_re.compile(ur'\s\s*')


# See _XmlEscape below.  Extra replacements handed to saxutils.escape() so
# that both kinds of quote characters are escaped as well, which is what
# allows the escaped text to be placed inside attribute values.
_XML_QUOTE_ESCAPES = {
    u"'":  u'&apos;',
    u'"':  u'&quot;',
}
# Matches any single character that is not tab, line feed, carriage return,
# or in the ranges U+0020-U+D7FF and U+E000-U+FFFD.  _XmlEscape removes every
# character this matches from its output.
_XML_BAD_CHAR_REGEX = lazy_re.compile(u'[^\u0009\u000A\u000D'
                                      u'\u0020-\uD7FF\uE000-\uFFFD]')


def _XmlEscape(s):
  """Escapes a value for inclusion in XML, compatibly with Google's
  internal Translation Console tool.

  The value is converted to unicode if it is not already, the XML special
  characters and both quote characters are replaced by entities, and any
  character matched by _XML_BAD_CHAR_REGEX is dropped.  Because quotes are
  escaped too, the result is usable in attribute values as well as in
  element content.

  Args:
    s: the value to escape

  Returns:
    The escaped text, encoded as UTF-8.
  """
  text = s if type(s) == unicode else unicode(s)
  escaped = saxutils.escape(text, _XML_QUOTE_ESCAPES)
  cleaned = _XML_BAD_CHAR_REGEX.sub(u'', escaped)
  return cleaned.encode('utf-8')


def _WriteAttribute(file, name, value):
  """Writes an XML attribute to the specified file.

    Args:
      file: file to write to
      name: name of the attribute
      value: (unescaped) value of the attribute
    """
  if value:
    file.write(' %s="%s"' % (name, _XmlEscape(value)))


def _WriteMessage(file, message):
  """Writes one message to the file as an XMB <msg> element.

  The element carries the message's description, ID and meaning as
  attributes (each omitted when empty).  If the presentable content has
  leading/trailing whitespace or runs of whitespace that would be altered by
  collapsing, xml:space="preserve" is added and the content is written flush
  against the tags; otherwise the content is placed on its own indented line.

  Args:
    file: file-like object to write to
    message: the message to write.  Must provide GetPresentableContent(),
      GetDescription(), GetId(), GetMeaning(), GetContent() and a 'parts'
      attribute (presumably a grit.tclib.Message -- confirm against callers).
  """
  presentable_content = message.GetPresentableContent()
  # The content must be unicode unless the message consists of exactly one
  # placeholder.  The original check wrapped the whole comparison in type(),
  # i.e. type(parts[0] == Placeholder), which is always truthy and therefore
  # never actually verified that the single part is a placeholder.
  assert (isinstance(presentable_content, unicode) or
          (len(message.parts) == 1 and
           isinstance(message.parts[0], tclib.Placeholder)))
  # Whitespace is significant if normalizing it (strip + collapse runs to a
  # single space) would change the content.
  preserve_space = presentable_content != _WHITESPACES_REGEX.sub(
      u' ', presentable_content.strip())

  file.write('<msg')
  _WriteAttribute(file, 'desc', message.GetDescription())
  _WriteAttribute(file, 'id', message.GetId())
  _WriteAttribute(file, 'meaning', message.GetMeaning())
  if preserve_space:
    _WriteAttribute(file, 'xml:space', 'preserve')
  file.write('>')
  if not preserve_space:
    # Safe to pretty-print: added whitespace is not significant here.
    file.write('\n  ')

  for part in message.GetContent():
    if isinstance(part, tclib.Placeholder):
      # <ph name="..."><ex>example</ex>original</ph>
      file.write('<ph')
      _WriteAttribute(file, 'name', part.GetPresentation())
      file.write('><ex>')
      file.write(_XmlEscape(part.GetExample()))
      file.write('</ex>')
      file.write(_XmlEscape(part.GetOriginal()))
      file.write('</ph>')
    else:
      # Plain text part.
      file.write(_XmlEscape(part))
  if not preserve_space:
    file.write('\n')
  file.write('</msg>\n')


def WriteXmbFile(file, messages):
  """Serializes messages to an open file-like object in the XMB format.

  The output consists of the XML declaration and the inline messagebundle
  DTD, an opening <messagebundle> tag, one <msg> element per message (in the
  order given), and the closing </messagebundle> tag with no trailing
  newline.

  Args:
    file: open file-like object to write to
    messages: iterable of grit.tclib.Message items to write
  """
  prologue = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE messagebundle [
<!ELEMENT messagebundle (msg)*>
<!ATTLIST messagebundle class CDATA #IMPLIED>

<!ELEMENT msg (#PCDATA|ph|source)*>
<!ATTLIST msg id CDATA #IMPLIED>
<!ATTLIST msg seq CDATA #IMPLIED>
<!ATTLIST msg name CDATA #IMPLIED>
<!ATTLIST msg desc CDATA #IMPLIED>
<!ATTLIST msg meaning CDATA #IMPLIED>
<!ATTLIST msg obsolete (obsolete) #IMPLIED>
<!ATTLIST msg xml:space (default|preserve) "default">
<!ATTLIST msg is_hidden CDATA #IMPLIED>

<!ELEMENT source (#PCDATA)>

<!ELEMENT ph (#PCDATA|ex)*>
<!ATTLIST ph name CDATA #REQUIRED>

<!ELEMENT ex (#PCDATA)>
]>
<messagebundle>
"""
  epilogue = '</messagebundle>'

  file.write(prologue)
  for msg in messages:
    _WriteMessage(file, msg)
  file.write(epilogue)


class OutputXmb(interface.Tool):
  """Outputs all translateable messages in the .grd input file to an
.xmb file, which is the format used to give source messages to
Google's internal Translation Console tool.  The format could easily
be used for other systems.

Usage: grit xmb [-i|-h] [-l LIMITFILE] OUTPUTPATH

OUTPUTPATH is the path you want to output the .xmb file to.

The -l option can be used to output only some of the resources to the .xmb file.
LIMITFILE is the path to a file that is used to limit the items output to the
xmb file.  If the filename extension is .grd, the file must be a .grd file
and the tool only output the contents of nodes from the input file that also
exist in the limit file (as compared on the 'name' attribute). Otherwise it must
contain a list of the IDs that output should be limited to, one ID per line, and
the tool will only output nodes with 'name' attributes that match one of the
IDs.

The -i option causes 'grit xmb' to output an "IDs only" file instead of an XMB
file.  The "IDs only" file contains the message ID of each message that would
normally be output to the XMB file, one message ID per line.  It is designed for
use with the 'grit transl2tc' tool's -l option.

Other options:

  -D NAME[=VAL]     Specify a C-preprocessor-like define NAME with optional
                    value VAL (defaults to 1) which will be used to control
                    conditional inclusion of resources.

  -E NAME=VALUE     Set environment variable NAME to VALUE (within grit).

"""
  # The different output formats supported by this tool
  FORMAT_XMB = 0
  FORMAT_IDS_ONLY = 1

  def __init__(self, defines=None):
    """Creates the tool, defaulting to the XMB output format.

    Args:
      defines: optional dict of define name -> value; further -D options
        given to Run() are added to it.  An empty dict is used if omitted.
    """
    super(OutputXmb, self).__init__()
    self.format = self.FORMAT_XMB
    self.defines = defines or {}

  def ShortDescription(self):
    """Returns a one-line description of this tool."""
    return 'Exports all translateable messages into an XMB file.'

  def Run(self, opts, args):
    """Parses this tool's options, then writes the requested output file.

    Args:
      opts: global grit options; opts.input and opts.extra_verbose are read
        here
      args: list of command-line arguments for this tool (options followed
        by the single output path)

    Returns:
      2 if the number of positional arguments is not exactly one; None
      otherwise.

    Raises:
      getopt.GetoptError: if an unrecognized option is given.
    """
    self.SetOptions(opts)

    limit_path = None
    # 'E:' has to be part of the option spec; without it getopt rejects the
    # documented -E option before the handler below can ever run.  Note that
    # -h is accepted by the spec but has no handler in this method.
    own_opts, args = getopt.getopt(args, 'l:D:E:ih')
    for key, val in own_opts:
      if key == '-l':
        limit_path = val
      elif key == '-i':
        self.format = self.FORMAT_IDS_ONLY
      elif key == '-D':
        name, val = util.ParseDefine(val)
        self.defines[name] = val
      elif key == '-E':
        (env_name, env_value) = val.split('=', 1)
        os.environ[env_name] = env_value
    if len(args) != 1:
      print ('grit xmb takes exactly one argument, the path to the XMB file '
             'to output.')
      return 2

    xmb_path = args[0]

    # The limit file is opened only after the arguments have been validated
    # and is closed in the 'finally' below, so it can no longer leak on the
    # early return above or when parsing/processing raises.
    limit_file = None
    limit_is_grd = False
    limit_file_dir = None
    if limit_path is not None:
      limit_file = open(limit_path, 'r')
      limit_file_dir = util.dirname(limit_path) or '.'
      limit_is_grd = os.path.splitext(limit_path)[1] == '.grd'

    try:
      res_tree = grd_reader.Parse(opts.input, debug=opts.extra_verbose)
      res_tree.SetOutputLanguage('en')
      res_tree.SetDefines(self.defines)
      res_tree.OnlyTheseTranslations([])
      res_tree.RunGatherers()

      with open(xmb_path, 'wb') as output_file:
        self.Process(
          res_tree, output_file, limit_file, limit_is_grd, limit_file_dir)
    finally:
      if limit_file:
        limit_file.close()
    print("Wrote %s" % xmb_path)

  def Process(self, res_tree, output_file, limit_file=None, limit_is_grd=False,
              dir=None):
    """Writes a document with the contents of res_tree into output_file,
    limiting output to the IDs specified in limit_file, which is a GRD file if
    limit_is_grd is true, otherwise a file with one ID per line.

    The format of the output document depends on this object's format attribute.
    It can be FORMAT_XMB or FORMAT_IDS_ONLY.

    The FORMAT_IDS_ONLY format causes this function to write just a list
    of the IDs of all messages that would have been added to the XMB file, one
    ID per line.

    The FORMAT_XMB format causes this function to output the (default) XMB
    format.

    Messages are written sorted by ID, and each message ID is written at most
    once.

    Args:
      res_tree: base.Node()
      output_file: file open for writing
      limit_file: None or file open for reading
      limit_is_grd: True | False
      dir: Directory of the limit file
    """
    # Names that output is restricted to, or None when there is no limit.
    # A set keeps the per-node membership test cheap.
    limit_names = None
    if limit_file:
      if limit_is_grd:
        # NOTE(review): self.o is not assigned in this class; presumably it
        # is set by SetOptions() (called from Run) -- confirm before calling
        # Process() directly with a GRD limit file.
        limit_tree = grd_reader.Parse(limit_file,
                                      dir=dir,
                                      debug=self.o.extra_verbose)
        limit_names = set(node.attrs['name'] for node in limit_tree
                          if 'name' in node.attrs)
      else:
        # Not a GRD file, so it's just a file with one ID per line
        limit_names = set(item.strip()
                          for item in limit_file.read().split('\n'))

    ids_already_done = set()
    messages = []
    for node in res_tree:
      if (limit_names is not None and
          not ('name' in node.attrs and node.attrs['name'] in limit_names)):
        continue
      if not node.IsTranslateable():
        continue

      for clique in node.GetCliques():
        if not clique.IsTranslateable():
          continue
        if not clique.GetMessage().GetRealContent():
          continue

        # Some explanation is in order here.  Note that we can have
        # many messages with the same ID.
        #
        # The way we work around this is to maintain a list of cliques
        # per message ID (in the UberClique) and select the "best" one
        # (the first one that has a description, or an arbitrary one
        # if there is no description) for inclusion in the XMB file.
        # The translations are all going to be the same for messages
        # with the same ID, although the way we replace placeholders
        # might be slightly different.
        msg_id = clique.GetMessage().GetId()
        if msg_id in ids_already_done:
          continue
        ids_already_done.add(msg_id)

        messages.append(node.UberClique().BestClique(msg_id).GetMessage())

    # Ensure a stable order of messages, to help regression testing.
    messages.sort(key=lambda x:x.GetId())

    if self.format == self.FORMAT_IDS_ONLY:
      # We just print the list of IDs to the output file.
      for msg in messages:
        output_file.write(msg.GetId())
        output_file.write('\n')
    else:
      assert self.format == self.FORMAT_XMB
      WriteXmbFile(output_file, messages)