#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

'''Pseudo RTL (aka Fake Bidi) support. It simply wraps each word with
Unicode RTL overrides.
More info at https://sites.google.com/a/chromium.org/dev/Home/fake-bidi
'''
1001b3bc768461bd303bff39f8cd1663682254e407joi@chromium.org
1101b3bc768461bd303bff39f8cd1663682254e407joi@chromium.orgimport re
1201b3bc768461bd303bff39f8cd1663682254e407joi@chromium.org
1301fadb72b6e94e6511eaffd1874a8cc095f098a7joi@chromium.orgfrom grit import lazy_re
1401b3bc768461bd303bff39f8cd1663682254e407joi@chromium.orgfrom grit import tclib
1501b3bc768461bd303bff39f8cd1663682254e407joi@chromium.org
# Maps a plain ASCII character to the accented or look-alike Unicode character
# that stands in for it. The keys of this table also feed the TO_ACCENT
# character class.
ACCENTED_STRINGS = {
  # vowels
  'a': u"\u00e5", 'e': u"\u00e9", 'i': u"\u00ee", 'o': u"\u00f6",
  'u': u"\u00fb", 'A': u"\u00c5", 'E': u"\u00c9", 'I': u"\u00ce",
  'O': u"\u00d6", 'U': u"\u00db",
  # consonants
  'c': u"\u00e7", 'd': u"\u00f0", 'n': u"\u00f1", 'p': u"\u00fe",
  'y': u"\u00fd", 'C': u"\u00c7", 'D': u"\u00d0", 'N': u"\u00d1",
  'P': u"\u00de", 'Y': u"\u00dd", 'f': u"\u0192", 's': u"\u0161",
  'S': u"\u0160", 'z': u"\u017e", 'Z': u"\u017d", 'g': u"\u011d",
  'G': u"\u011c", 'h': u"\u0125", 'H': u"\u0124", 'j': u"\u0135",
  'J': u"\u0134", 'k': u"\u0137", 'K': u"\u0136", 'l': u"\u013c",
  'L': u"\u013b", 't': u"\u0163", 'T': u"\u0162", 'w': u"\u0175",
  'W': u"\u0174",
  # symbols
  '$': u"\u20ac", '?': u"\u00bf", 'R': u"\u00ae", '!': u"\u00a1",
}
2901b3bc768461bd303bff39f8cd1663682254e407joi@chromium.org
# Matches either one character that is a key in ACCENTED_STRINGS, or a
# backslash followed by a lowercase letter or another backslash.
# We should not accent characters in an escape sequence such as "\n", so the
# second alternative lets such a sequence be recognised as a unit. In a case
# like "\\n", which means a backslash and a character "n", the two backslashes
# are matched together and the "n" is then matched on its own as an
# accentable character.
# The keys are escaped so that a key with a special meaning inside a regex
# character class could never change what the class matches.
TO_ACCENT = lazy_re.compile(
    r'[%s]|\\[a-z\\]' % ''.join(re.escape(key) for key in ACCENTED_STRINGS))
3701b3bc768461bd303bff39f8cd1663682254e407joi@chromium.org
# Lex text so that we don't interfere with html tokens and entities.
# This lexing scheme will handle all well formed tags and entities, html or
# xhtml.  It will not handle comments, CDATA sections, or the unescaping tags:
# script, style, xmp or listing.  If any of those appear in messages,
# something is wrong.
#
# The patterns are tried in the order listed. Each one is anchored with '^'
# so that it can only match at the beginning of the input; html tokens are
# case-insensitive, hence re.I, and re.S makes '.' match a newline as well.
TOKENS = [
    lazy_re.compile('^%s' % pattern, re.I | re.S)
    for pattern in (
        # a run of non html special characters
        r'[^<&]+',
        # a tag
        (r'</?[a-z]\w*'  # beginning of tag
         r'(?:\s+\w+(?:\s*=\s*'  # attribute start
         r'(?:[^\s"\'>]+|"[^\"]*"|\'[^\']*\'))?'  # attribute value
         r')*\s*/?>'),
        # an entity
        r'&(?:[a-z]\w+|#\d+|#x[\da-f]+);',
        # an html special character not part of a special sequence
        r'.',
    )
]
6101b3bc768461bd303bff39f8cd1663682254e407joi@chromium.org
# One or more consecutive word characters that are neither digits nor
# underscores, i.e. a run of letters. The run is captured as group 1.
ALPHABETIC_RUN = lazy_re.compile(r'([^\W0-9_]+)')
6301b3bc768461bd303bff39f8cd1663682254e407joi@chromium.org
# U+202E RIGHT-TO-LEFT OVERRIDE: placed before each alphabetic run.
RLO = u'\u202e'
# U+202C POP DIRECTIONAL FORMATTING: placed after each run to end the override.
PDF = u'\u202c'
6601b3bc768461bd303bff39f8cd1663682254e407joi@chromium.org
def PseudoRTLString(text):
  '''Returns a fake bidirectional version of the source string.

  The text is consumed from the front one token at a time, trying the patterns
  in TOKENS in order and taking the first that matches. A token that starts
  with '<' or '&' (a tag, an entity, or one of those characters on its own) is
  copied through unchanged. In any other token, every run matched by
  ALPHABETIC_RUN is wrapped as RLO + run + PDF.

  Args:
    text: the string to convert

  Return:
    The converted string; an empty input yields an empty string.
  '''
  pieces = []
  while text:
    for token in TOKENS:
      found = token.search(text)
      if not found:
        continue
      piece = found.group(0)
      # Drop the matched prefix so the next round starts right after it.
      text = text[len(piece):]
      if piece[0] not in ('<', '&'):
        # Plain text rather than markup, so wrap each alphabetic run in
        # the right-to-left override.
        piece = ALPHABETIC_RUN.sub(
            lambda run: RLO + run.group() + PDF, piece)
      pieces.append(piece)
      break
  return ''.join(pieces)
8501b3bc768461bd303bff39f8cd1663682254e407joi@chromium.org
8601b3bc768461bd303bff39f8cd1663682254e407joi@chromium.org
def PseudoRTLMessage(message):
  '''Returns a pseudo-RTL (aka Fake-Bidi) translation of the provided message.

  The content parts of the message are walked in order. Placeholders are
  appended to the translation as they are; every other part is passed through
  PseudoRTLString() and appended as text.

  Args:
    message: tclib.Message()

  Return:
    tclib.Translation()
  '''
  translation = tclib.Translation()
  for item in message.GetContent():
    if isinstance(item, tclib.Placeholder):
      # Placeholders must survive untouched.
      translation.AppendPlaceholder(item)
      continue
    translation.AppendText(PseudoRTLString(item))

  return translation
104