1#!/usr/bin/env python
2# Copyright (c) 2012 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
'''Pseudo RTL (aka Fake Bidi) support. It simply wraps each word with
Unicode RTL overrides.
More info at https://sites.google.com/a/chromium.org/dev/Home/fake-bidi
'''
10
11import re
12
13from grit import lazy_re
14from grit import tclib
15
# Accented (visually similar) replacements for plain ASCII characters.
# NOTE: TO_ACCENT below builds a regex character class out of these keys,
# so the mapping doubles as the set of characters eligible for accenting.
ACCENTED_STRINGS = {
  'a': u'\u00e5', 'e': u'\u00e9', 'i': u'\u00ee', 'o': u'\u00f6', 'u': u'\u00fb',
  'A': u'\u00c5', 'E': u'\u00c9', 'I': u'\u00ce', 'O': u'\u00d6', 'U': u'\u00db',
  'c': u'\u00e7', 'd': u'\u00f0', 'n': u'\u00f1', 'p': u'\u00fe', 'y': u'\u00fd',
  'C': u'\u00c7', 'D': u'\u00d0', 'N': u'\u00d1', 'P': u'\u00de', 'Y': u'\u00dd',
  'f': u'\u0192', 's': u'\u0161', 'S': u'\u0160', 'z': u'\u017e', 'Z': u'\u017d',
  'g': u'\u011d', 'G': u'\u011c', 'h': u'\u0125', 'H': u'\u0124', 'j': u'\u0135',
  'J': u'\u0134', 'k': u'\u0137', 'K': u'\u0136', 'l': u'\u013c', 'L': u'\u013b',
  't': u'\u0163', 'T': u'\u0162', 'w': u'\u0175', 'W': u'\u0174', '$': u'\u20ac',
  '?': u'\u00bf', 'R': u'\u00ae', '!': u'\u00a1',
}
29
# A pattern matching either a single character that has an accented
# replacement in ACCENTED_STRINGS, or a backslash escape sequence.
# We should not accent characters in an escape sequence such as "\n".
# To be safe, we assume every character following a backslash is an escaped
# character. We also need to consider the case like "\\n", which means
# a backslash and a character "n"; there we will accent the character "n".
TO_ACCENT = lazy_re.compile(
    # re.escape keeps the character class well formed even if a key that is
    # special inside [...] (e.g. ']', '-', '^') is ever added to
    # ACCENTED_STRINGS; escaped punctuation still matches literally.
    r'[%s]|\\[a-z\\]' % re.escape(''.join(ACCENTED_STRINGS.keys())))
37
# Lex text so that we don't interfere with html tokens and entities.
# This lexing scheme will handle all well formed tags and entities, html or
# xhtml.  It will not handle comments, CDATA sections, or the unescaping tags:
# script, style, xmp or listing.  If any of those appear in messages,
# something is wrong.
_TOKEN_PATTERNS = (
    # A run of characters with no special meaning to html.
    r'[^<&]+',
    # A complete opening or closing tag, including any attributes.
    (r'</?[a-z]\w*' # beginning of tag
     r'(?:\s+\w+(?:\s*=\s*' # attribute start
     r'(?:[^\s"\'>]+|"[^\"]*"|\'[^\']*\'))?' # attribute value
     r')*\s*/?>'),
    # A named, decimal, or hexadecimal character entity.
    r'&(?:[a-z]\w+|#\d+|#x[\da-f]+);',
    # Any lone html special character not forming a tag or entity.
    r'.',
)

# One lexer per token kind, each anchored to the beginning of the input.
# re.I: html tokens are case-insensitive; re.S: '.' must consume newlines.
TOKENS = [lazy_re.compile('^' + pattern, re.I | re.S)
          for pattern in _TOKEN_PATTERNS]
61
# Matches a maximal run of letter characters: \w minus digits and underscore.
# These runs are the spans that get wrapped with the RTL override below.
ALPHABETIC_RUN = lazy_re.compile(r'([^\W0-9_]+)')

# Unicode bidi control characters used to bracket each alphabetic run:
# U+202E RIGHT-TO-LEFT OVERRIDE and U+202C POP DIRECTIONAL FORMATTING.
RLO = u'\u202e'
PDF = u'\u202c'
66
def PseudoRTLString(text):
  '''Returns a fake bidirectional version of the source string: every
  alphabetic run is wrapped in Unicode RTL override characters, while html
  tags and entities pass through untouched.

  Args:
    text: the source string.

  Return:
    the pseudo-RTL string.
  '''
  out = []
  remaining = text
  while remaining:
    # TOKENS always consumes at least one character (its last pattern
    # matches any single character), so this loop terminates.
    for lexer in TOKENS:
      match = lexer.search(remaining)
      if not match:
        continue
      token = match.group(0)
      remaining = remaining[len(token):]
      if token[0] in ('<', '&'):
        # A tag or entity: keep it verbatim.
        out.append(token)
      else:
        # Plain text: wrap each alphabetic run with the RTL override.
        out.append(ALPHABETIC_RUN.sub(
            lambda run: RLO + run.group() + PDF, token))
      break
  return ''.join(out)
85
86
def PseudoRTLMessage(message):
  '''Returns a pseudo-RTL (aka Fake-Bidi) translation of the provided message.

  Args:
    message: tclib.Message()

  Return:
    tclib.Translation()
  '''
  translation = tclib.Translation()
  for piece in message.GetContent():
    if isinstance(piece, tclib.Placeholder):
      # Placeholders must survive verbatim so substitution still works.
      translation.AppendPlaceholder(piece)
    else:
      translation.AppendText(PseudoRTLString(piece))
  return translation
104