1#!/usr/bin/env python 2# Copyright (c) 2012 The Chromium Authors. All rights reserved. 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5 6'''Pseudo RTL, (aka Fake Bidi) support. It simply wraps each word with 7Unicode RTL overrides. 8More info at https://sites.google.com/a/chromium.org/dev/Home/fake-bidi 9''' 10 11import re 12 13from grit import lazy_re 14from grit import tclib 15 16ACCENTED_STRINGS = { 17 'a': u"\u00e5", 'e': u"\u00e9", 'i': u"\u00ee", 'o': u"\u00f6", 18 'u': u"\u00fb", 'A': u"\u00c5", 'E': u"\u00c9", 'I': u"\u00ce", 19 'O': u"\u00d6", 'U': u"\u00db", 'c': u"\u00e7", 'd': u"\u00f0", 20 'n': u"\u00f1", 'p': u"\u00fe", 'y': u"\u00fd", 'C': u"\u00c7", 21 'D': u"\u00d0", 'N': u"\u00d1", 'P': u"\u00de", 'Y': u"\u00dd", 22 'f': u"\u0192", 's': u"\u0161", 'S': u"\u0160", 'z': u"\u017e", 23 'Z': u"\u017d", 'g': u"\u011d", 'G': u"\u011c", 'h': u"\u0125", 24 'H': u"\u0124", 'j': u"\u0135", 'J': u"\u0134", 'k': u"\u0137", 25 'K': u"\u0136", 'l': u"\u013c", 'L': u"\u013b", 't': u"\u0163", 26 'T': u"\u0162", 'w': u"\u0175", 'W': u"\u0174", 27 '$': u"\u20ac", '?': u"\u00bf", 'R': u"\u00ae", r'!': u"\u00a1", 28} 29 30# a character set containing the keys in ACCENTED_STRINGS 31# We should not accent characters in an escape sequence such as "\n". 32# To be safe, we assume every character following a backslash is an escaped 33# character. We also need to consider the case like "\\n", which means 34# a blackslash and a character "n", we will accent the character "n". 35TO_ACCENT = lazy_re.compile( 36 r'[%s]|\\[a-z\\]' % ''.join(ACCENTED_STRINGS.keys())) 37 38# Lex text so that we don't interfere with html tokens and entities. 39# This lexing scheme will handle all well formed tags and entities, html or 40# xhtml. It will not handle comments, CDATA sections, or the unescaping tags: 41# script, style, xmp or listing. If any of those appear in messages, 42# something is wrong. 
# Raw patterns for the message lexer, tried in order.  Each is anchored at
# the start of the remaining input when compiled into TOKENS below.
_TOKEN_PATTERNS = (
    # a run of non html special characters
    r'[^<&]+',
    # a tag
    (r'</?[a-z]\w*'                          # beginning of tag
     r'(?:\s+\w+(?:\s*=\s*'                  # attribute start
     r'(?:[^\s"\'>]+|"[^\"]*"|\'[^\']*\'))?' # attribute value
     r')*\s*/?>'),
    # an entity
    r'&(?:[a-z]\w+|#\d+|#x[\da-f]+);',
    # an html special character not part of a special sequence
    r'.',
)

# Compiled lexer tokens.  html tokens are case-insensitive, hence re.I.
TOKENS = [lazy_re.compile('^%s' % raw, re.I | re.S)
          for raw in _TOKEN_PATTERNS]

# A maximal run of letters (no digits, underscores or punctuation).
ALPHABETIC_RUN = lazy_re.compile(r'([^\W0-9_]+)')

# Unicode bidi controls: RIGHT-TO-LEFT OVERRIDE and POP DIRECTIONAL
# FORMATTING, used to bracket each alphabetic run.
RLO = u'\u202e'
PDF = u'\u202c'

def PseudoRTLString(text):
  '''Returns a fake bidirectional version of the source string.

  Each alphabetic run in text is wrapped in RLO...PDF; html/xhtml tags and
  entities are passed through untouched so markup keeps working.
  '''
  pieces = []
  remaining = text
  while remaining:
    for token in TOKENS:
      found = token.search(remaining)
      if not found:
        continue
      chunk = found.group(0)
      remaining = remaining[len(chunk):]
      # Tags start with '<' and entities with '&'; everything else is
      # plain text and gets its alphabetic runs wrapped.
      if chunk[0] != '<' and chunk[0] != '&':
        chunk = ALPHABETIC_RUN.sub(
            lambda run: RLO + run.group() + PDF, chunk)
      pieces.append(chunk)
      break
  return ''.join(pieces)


def PseudoRTLMessage(message):
  '''Returns a pseudo-RTL (aka Fake-Bidi) translation of the provided message.

  Args:
    message: tclib.Message()

  Return:
    tclib.Translation()
  '''
  translation = tclib.Translation()
  for piece in message.GetContent():
    # Placeholders must survive verbatim; only literal text is transformed.
    if isinstance(piece, tclib.Placeholder):
      translation.AppendPlaceholder(piece)
    else:
      translation.AppendText(PseudoRTLString(piece))
  return translation