#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

'''Pseudotranslation support.  Our pseudotranslations are based on the
P-language, which is a simple vowel-extending language.  Examples of P:
  - "hello" becomes "hepellopo"
  - "howdie" becomes "hopowdiepie"
  - "because" becomes "bepecaupause" (but in our implementation we don't
    handle the silent e at the end so it actually would return
    "bepecaupausepe")

The P-language has the excellent quality of increasing the length of text
by around 30-50% which is great for pseudotranslations, to stress test any
GUI layouts etc.

To make the pseudotranslations more obviously "not a translation" and to make
them exercise any code that deals with encodings, we also transform all English
vowels into equivalent vowels with diacriticals on them (rings, acutes,
diaeresis, and circumflex), and we write the "p" in the P-language as a Hebrew
character Qof.  It looks sort of like a latin character "p" but it is outside
the latin-1 character set which will stress character encoding bugs.  (For
now the inserted character is actually a plain capital 'P'; see _QOF below.)
'''

from grit import lazy_re
from grit import tclib


# An RFC language code for the P pseudolanguage.
PSEUDO_LANG = 'x-P-pseudo'

# Hebrew character Qof.  It looks kind of like a 'p' but is outside
# the latin-1 character set which is good for our purposes.
# TODO(joi) For now using P instead of Qof, because of some bugs it caused.
# Find a better solution, i.e. one that introduces a non-latin1 character into
# the pseudotranslation.
#_QOF = u'\u05e7'
_QOF = u'P'

# How we map each vowel.
_VOWELS = {
  u'a' : u'\u00e5',  # a with ring
  u'e' : u'\u00e9',  # e acute
  u'i' : u'\u00ef',  # i diaeresis
  u'o' : u'\u00f4',  # o circumflex
  u'u' : u'\u00fc',  # u diaeresis
  u'y' : u'\u00fd',  # y acute
  u'A' : u'\u00c5',  # A with ring
  u'E' : u'\u00c9',  # E acute
  u'I' : u'\u00cf',  # I diaeresis
  u'O' : u'\u00d4',  # O circumflex
  u'U' : u'\u00dc',  # U diaeresis
  u'Y' : u'\u00dd',  # Y acute
}

# Matches vowels and P.  list(_VOWELS) is used rather than _VOWELS.keys()
# because a Python 3 dict view cannot be concatenated with a list.
_PSUB_RE = lazy_re.compile("(%s)" % '|'.join(list(_VOWELS) + ['P']))


# Pseudotranslations previously created.  This is important for performance
# reasons, especially since we routinely pseudotranslate the whole project
# several or many different times for each build.  Maps source string to its
# pseudotranslation.
_existing_translations = {}


def MapVowels(str, also_p = False):
  '''Returns a copy of 'str' where characters that exist as keys in _VOWELS
  have been replaced with the corresponding value.  If also_p is true, this
  function will also change capital P characters into _QOF (intended to be the
  Hebrew character Qof; currently a plain 'P').

  Args:
    str: The text to transform.
    also_p: Whether capital 'P' characters are replaced by _QOF as well.

  Return:
    The transformed text.
  '''
  def Repl(match):
    char = match.group()
    # _PSUB_RE matches the vowels plus capital 'P'.  'P' is not a key of
    # _VOWELS, so it must be handled here rather than looked up.
    if char == 'P':
      if also_p:
        return _QOF
      return char
    return _VOWELS[char]
  return _PSUB_RE.sub(Repl, str)


def PseudoString(str):
  '''Returns a pseudotranslation of the provided string, in our enhanced
  P-language.

  Each run of consecutive vowels is emitted as the mapped vowels, then _QOF,
  then the mapped vowels again; every other character is copied unchanged.
  Results are memoized in _existing_translations.

  Args:
    str: The text to pseudotranslate.

  Return:
    The pseudotranslated text.
  '''
  if str in _existing_translations:
    return _existing_translations[str]

  pieces = []
  length = len(str)
  ix = 0
  while ix < length:
    if str[ix] not in _VOWELS:
      pieces.append(str[ix])
      ix += 1
      continue

    # We want to treat consecutive vowels as one composite vowel.  This is not
    # always accurate e.g. in composite words but good enough.
    start = ix
    while ix < length and str[ix] in _VOWELS:
      ix += 1
    changed_vowels = MapVowels(str[start:ix])
    pieces.append(changed_vowels)
    pieces.append(_QOF)
    pieces.append(changed_vowels)

  outstr = u''.join(pieces)
  _existing_translations[str] = outstr
  return outstr


def PseudoMessage(message):
  '''Returns a pseudotranslation of the provided message.

  Placeholder parts are appended to the translation as they are; all other
  parts are run through PseudoString().

  Args:
    message: tclib.Message()

  Return:
    tclib.Translation()
  '''
  transl = tclib.Translation()

  for part in message.GetContent():
    if isinstance(part, tclib.Placeholder):
      transl.AppendPlaceholder(part)
    else:
      transl.AppendText(PseudoString(part))

  return transl