1"""Internationalization and localization support.
2
3This module provides internationalization (I18N) and localization (L10N)
4support for your Python programs by providing an interface to the GNU gettext
5message catalog library.
6
7I18N refers to the operation by which a program is made aware of multiple
8languages.  L10N refers to the adaptation of your program, once
9internationalized, to the local language and cultural habits.
10
11"""
12
13# This module represents the integration of work, contributions, feedback, and
14# suggestions from the following people:
15#
16# Martin von Loewis, who wrote the initial implementation of the underlying
17# C-based libintlmodule (later renamed _gettext), along with a skeletal
18# gettext.py implementation.
19#
20# Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
21# which also included a pure-Python implementation to read .mo files if
22# intlmodule wasn't available.
23#
24# James Henstridge, who also wrote a gettext.py module, which has some
25# interesting, but currently unsupported experimental features: the notion of
26# a Catalog class and instances, and the ability to add to a catalog file via
27# a Python API.
28#
29# Barry Warsaw integrated these modules, wrote the .install() API and code,
30# and conformed all C and Python code to Python's coding standards.
31#
32# Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
33# module.
34#
35# J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
36#
37# TODO:
38# - Lazy loading of .mo files.  Currently the entire catalog is loaded into
39#   memory, but that's probably bad for large translated programs.  Instead,
40#   the lexical sort of original strings in GNU .mo files should be exploited
41#   to do binary searches and lazy initializations.  Or you might want to use
42#   the undocumented double-hash algorithm for .mo files with hash tables, but
43#   you'll need to study the GNU gettext code to do this.
44#
45# - Support Solaris .mo file formats.  Unfortunately, we've been unable to
46#   find this format documented anywhere.
47
48
49import locale, copy, os, re, struct, sys
50from errno import ENOENT
51
52
# Names exported by ``from gettext import *``.  Every public function the
# module defines is listed, including the locale-encoding ("l"-prefixed)
# variants and bind_textdomain_codeset().
__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
           'find', 'translation', 'install', 'textdomain', 'bindtextdomain',
           'bind_textdomain_codeset',
           'dgettext', 'dngettext', 'gettext', 'ngettext',
           'ldgettext', 'ldngettext', 'lgettext', 'lngettext',
           ]

# Directory searched for message catalogs when no localedir is supplied.
_default_localedir = os.path.join(sys.prefix, 'share', 'locale')
59
60
61def test(condition, true, false):
62    """
63    Implements the C expression:
64
65      condition ? true : false
66
67    Required to correctly interpret plural forms.
68    """
69    if condition:
70        return true
71    else:
72        return false
73
74
def c2py(plural):
    """Gets a C expression as used in PO files for plural forms and returns a
    Python lambda function that implements an equivalent expression.

    The expression is first tokenized and rejected if it contains any
    identifier other than ``n``.  C operators ``&&``, ``||`` and ``!`` are
    then replaced by ``and``, ``or`` and ``not``, and every ``a ? b : c`` is
    rewritten as ``test(a, b, c)``; ``test`` is resolved in this module's
    globals when the returned function runs.

    Raises ValueError if the expression cannot be tokenized or contains a
    forbidden identifier.
    """
    # Security check, allow only the "n" identifier.  Use whichever StringIO
    # implementation this interpreter provides.
    try:
        from cStringIO import StringIO
    except ImportError:
        try:
            from StringIO import StringIO
        except ImportError:
            from io import StringIO
    import token, tokenize
    tokens = tokenize.generate_tokens(StringIO(plural).readline)
    try:
        danger = [x for x in tokens if x[0] == token.NAME and x[1] != 'n']
    except tokenize.TokenError:
        raise ValueError(
            'plural forms expression error, maybe unbalanced parenthesis')
    if danger:
        raise ValueError('plural forms expression could be dangerous')

    # Replace some C operators by their Python equivalents
    plural = plural.replace('&&', ' and ')
    plural = plural.replace('||', ' or ')

    # "!x" becomes " not x"; the "!=" operator is left untouched.
    expr = re.compile(r'\!([^=])')
    plural = expr.sub(' not \\1', plural)

    # Regular expression and replacement function used to transform
    # "a?b:c" to "test(a,b,c)".  The else-branch is processed recursively so
    # that chained conditionals nest correctly.
    expr = re.compile(r'(.*?)\?(.*?):(.*)')
    def repl(x):
        return "test(%s, %s, %s)" % (x.group(1), x.group(2),
                                     expr.sub(repl, x.group(3)))

    # Code to transform the plural expression, taking care of parentheses:
    # each '(' opens a new buffer, each ')' converts the finished buffer and
    # appends it (re-parenthesized) to the enclosing one.
    stack = ['']
    for c in plural:
        if c == '(':
            stack.append('')
        elif c == ')':
            if len(stack) == 1:
                # Actually, we never reach this code, because unbalanced
                # parentheses get caught in the security check at the
                # beginning.
                raise ValueError('unbalanced parenthesis in plural form')
            s = expr.sub(repl, stack.pop())
            stack[-1] += '(%s)' % s
        else:
            stack[-1] += c
    plural = expr.sub(repl, stack.pop())

    # NOTE(security): the expression comes from catalog data and is passed
    # to eval().  The identifier check above is the only guard; it does not
    # bound the cost of evaluating the expression (e.g. huge exponents).
    return eval('lambda n: int(%s)' % plural)
127
128
129
130def _expand_lang(locale):
131    from locale import normalize
132    locale = normalize(locale)
133    COMPONENT_CODESET   = 1 << 0
134    COMPONENT_TERRITORY = 1 << 1
135    COMPONENT_MODIFIER  = 1 << 2
136    # split up the locale into its base components
137    mask = 0
138    pos = locale.find('@')
139    if pos >= 0:
140        modifier = locale[pos:]
141        locale = locale[:pos]
142        mask |= COMPONENT_MODIFIER
143    else:
144        modifier = ''
145    pos = locale.find('.')
146    if pos >= 0:
147        codeset = locale[pos:]
148        locale = locale[:pos]
149        mask |= COMPONENT_CODESET
150    else:
151        codeset = ''
152    pos = locale.find('_')
153    if pos >= 0:
154        territory = locale[pos:]
155        locale = locale[:pos]
156        mask |= COMPONENT_TERRITORY
157    else:
158        territory = ''
159    language = locale
160    ret = []
161    for i in range(mask+1):
162        if not (i & ~mask):  # if all components for this combo exist ...
163            val = language
164            if i & COMPONENT_TERRITORY: val += territory
165            if i & COMPONENT_CODESET:   val += codeset
166            if i & COMPONENT_MODIFIER:  val += modifier
167            ret.append(val)
168    ret.reverse()
169    return ret
170
171
172
class NullTranslations:
    """Translation object that returns messages untranslated.

    Every lookup is delegated to the fallback translation when one has been
    attached with add_fallback(); otherwise the original message (or the
    singular/plural msgid chosen by ``n == 1``) is returned.  Subclasses
    override _parse() to load a real catalog from the file object given to
    the constructor.
    """

    def __init__(self, fp=None):
        """Initialize empty state and parse *fp* if one is given."""
        self._info = {}
        self._charset = None
        self._output_charset = None
        self._fallback = None
        if fp is not None:
            self._parse(fp)

    def _parse(self, fp):
        """Hook for subclasses; the base class reads nothing from *fp*."""
        pass

    def add_fallback(self, fallback):
        """Append *fallback* to the end of the fallback chain."""
        if not self._fallback:
            self._fallback = fallback
        else:
            self._fallback.add_fallback(fallback)

    def gettext(self, message):
        """Return the fallback's translation, or *message* itself."""
        if not self._fallback:
            return message
        return self._fallback.gettext(message)

    def lgettext(self, message):
        """Return the fallback's lgettext() result, or *message* itself."""
        if not self._fallback:
            return message
        return self._fallback.lgettext(message)

    def ngettext(self, msgid1, msgid2, n):
        """Return the fallback's plural lookup, else msgid1 when n == 1 and
        msgid2 otherwise."""
        if self._fallback:
            return self._fallback.ngettext(msgid1, msgid2, n)
        return msgid1 if n == 1 else msgid2

    def lngettext(self, msgid1, msgid2, n):
        """Same as ngettext(), delegating to the fallback's lngettext()."""
        if self._fallback:
            return self._fallback.lngettext(msgid1, msgid2, n)
        return msgid1 if n == 1 else msgid2

    def ugettext(self, message):
        """Return the fallback's ugettext() result, or unicode(message)."""
        if not self._fallback:
            return unicode(message)
        return self._fallback.ugettext(message)

    def ungettext(self, msgid1, msgid2, n):
        """Unicode counterpart of ngettext()."""
        if self._fallback:
            return self._fallback.ungettext(msgid1, msgid2, n)
        return unicode(msgid1) if n == 1 else unicode(msgid2)

    def info(self):
        """Return the catalog metadata dictionary."""
        return self._info

    def charset(self):
        """Return the catalog's character set, or None if unknown."""
        return self._charset

    def output_charset(self):
        """Return the charset set via set_output_charset(), or None."""
        return self._output_charset

    def set_output_charset(self, charset):
        """Record *charset* as the output charset."""
        self._output_charset = charset

    def install(self, unicode=False, names=None):
        """Bind ``_`` in the builtin namespace to this object's lookup.

        ``_`` becomes ugettext() when *unicode* is true and gettext()
        otherwise.  If *names* supports ``in``, each of "gettext",
        "ngettext", "lgettext" and "lngettext" it contains is installed as
        a builtin as well.
        """
        import __builtin__
        namespace = __builtin__.__dict__
        namespace['_'] = self.ugettext if unicode else self.gettext
        if not hasattr(names, "__contains__"):
            return
        if "gettext" in names:
            namespace['gettext'] = namespace['_']
        if "ngettext" in names:
            namespace['ngettext'] = (self.ungettext if unicode
                                     else self.ngettext)
        if "lgettext" in names:
            namespace['lgettext'] = self.lgettext
        if "lngettext" in names:
            namespace['lngettext'] = self.lngettext
255
256
class GNUTranslations(NullTranslations):
    """Translations loaded from a GNU gettext binary ``.mo`` catalog.

    _parse() reads the whole file and fills ``self._catalog``: singular
    messages are keyed by msgid, plural messages by ``(msgid1, index)``.
    When the catalog header declares a charset, msgids and translations are
    stored as unicode objects decoded with it.
    """
    # Magic number of .mo files, as seen when the file was written with
    # little-endian and big-endian byte order respectively.
    LE_MAGIC = 0x950412de
    BE_MAGIC = 0xde120495

    def _parse(self, fp):
        """Override this method to support alternative .mo formats.

        Reads all of *fp*, populates ``self._catalog`` and ``self._info``,
        and sets ``self._charset`` and ``self.plural`` from the header entry
        (the message with an empty msgid) when it provides them.

        Raises IOError if the magic number is not recognized or an index
        entry points outside the file.
        """
        unpack = struct.unpack
        filename = getattr(fp, 'name', '')
        # Parse the .mo file header, which consists of 5 little endian 32
        # bit words.
        self._catalog = catalog = {}
        self.plural = lambda n: int(n != 1) # germanic plural by default
        buf = fp.read()
        buflen = len(buf)
        # Are we big endian or little endian?
        magic = unpack('<I', buf[:4])[0]
        if magic == self.LE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
            ii = '<II'
        elif magic == self.BE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
            ii = '>II'
        else:
            raise IOError(0, 'Bad magic number', filename)
        # Now put all messages from the .mo file buffer into the catalog
        # dictionary.
        for i in xrange(0, msgcount):
            mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
            mend = moff + mlen
            tlen, toff = unpack(ii, buf[transidx:transidx+8])
            tend = toff + tlen
            if mend < buflen and tend < buflen:
                msg = buf[moff:mend]
                tmsg = buf[toff:tend]
            else:
                raise IOError(0, 'File is corrupt', filename)
            # See if we're looking at GNU .mo conventions for metadata
            if mlen == 0:
                # Catalog description
                lastk = None
                for item in tmsg.splitlines():
                    item = item.strip()
                    if not item:
                        continue
                    # Reset per line: a continuation line has no key of its
                    # own and must not re-trigger the charset/plural
                    # handling below with values left over from the
                    # previous header line.
                    k = v = None
                    if ':' in item:
                        k, v = item.split(':', 1)
                        k = k.strip().lower()
                        v = v.strip()
                        self._info[k] = v
                        lastk = k
                    elif lastk:
                        self._info[lastk] += '\n' + item
                    if k == 'content-type':
                        self._charset = v.split('charset=')[1]
                    elif k == 'plural-forms':
                        v = v.split(';')
                        plural = v[1].split('plural=')[1]
                        self.plural = c2py(plural)
            # Note: we unconditionally convert both msgids and msgstrs to
            # Unicode using the character encoding specified in the charset
            # parameter of the Content-Type header.  The gettext documentation
            # strongly encourages msgids to be us-ascii, but some applications
            # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
            # traditional gettext applications, the msgid conversion will
            # cause no problems since us-ascii should always be a subset of
            # the charset encoding.  We may want to fall back to 8-bit msgids
            # if the Unicode conversion fails.
            if '\x00' in msg:
                # Plural forms: msgid is "singular\0plural" and the
                # translation holds one NUL-separated string per form.
                msgid1, msgid2 = msg.split('\x00')
                tmsg = tmsg.split('\x00')
                if self._charset:
                    msgid1 = unicode(msgid1, self._charset)
                    tmsg = [unicode(x, self._charset) for x in tmsg]
                for index, form in enumerate(tmsg):
                    catalog[(msgid1, index)] = form
            else:
                if self._charset:
                    msg = unicode(msg, self._charset)
                    tmsg = unicode(tmsg, self._charset)
                catalog[msg] = tmsg
            # advance to next entry in the seek tables
            masteridx += 8
            transidx += 8

    def gettext(self, message):
        """Return the translation of *message* as an encoded string.

        The result is encoded with the output charset if one is set, else
        with the catalog charset if known.  A message missing from the
        catalog is passed to the fallback, or returned unchanged.
        """
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.gettext(message)
            return message
        # Encode the Unicode tmsg back to an 8-bit string, if possible
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        elif self._charset:
            return tmsg.encode(self._charset)
        return tmsg

    def lgettext(self, message):
        """Like gettext(), but encode with the output charset if set and
        with locale.getpreferredencoding() otherwise."""
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.lgettext(message)
            return message
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        return tmsg.encode(locale.getpreferredencoding())

    def ngettext(self, msgid1, msgid2, n):
        """Return the plural form of *msgid1* selected by ``self.plural(n)``.

        Encoding follows gettext().  If the form is not in the catalog the
        fallback is consulted; without one, msgid1 is returned when n == 1
        and msgid2 otherwise.
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
            if self._output_charset:
                return tmsg.encode(self._output_charset)
            elif self._charset:
                return tmsg.encode(self._charset)
            return tmsg
        except KeyError:
            if self._fallback:
                return self._fallback.ngettext(msgid1, msgid2, n)
            if n == 1:
                return msgid1
            else:
                return msgid2

    def lngettext(self, msgid1, msgid2, n):
        """Like ngettext(), with the encoding rules of lgettext()."""
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
            if self._output_charset:
                return tmsg.encode(self._output_charset)
            return tmsg.encode(locale.getpreferredencoding())
        except KeyError:
            if self._fallback:
                return self._fallback.lngettext(msgid1, msgid2, n)
            if n == 1:
                return msgid1
            else:
                return msgid2

    def ugettext(self, message):
        """Return the catalog entry for *message* without re-encoding it;
        on a miss use the fallback or return unicode(message)."""
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.ugettext(message)
            return unicode(message)
        return tmsg

    def ungettext(self, msgid1, msgid2, n):
        """Plural counterpart of ugettext()."""
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.ungettext(msgid1, msgid2, n)
            if n == 1:
                tmsg = unicode(msgid1)
            else:
                tmsg = unicode(msgid2)
        return tmsg
418
419
420# Locate a .mo file using the gettext strategy
def find(domain, localedir=None, languages=None, all=0):
    """Search for the ``.mo`` file(s) of *domain*.

    *localedir* defaults to the module's default locale directory.  When
    *languages* is None it is taken from the first non-empty variable among
    LANGUAGE, LC_ALL, LC_MESSAGES and LANG (colon-separated), with 'C'
    appended if absent.  Each language is expanded with _expand_lang() and
    ``<localedir>/<lang>/LC_MESSAGES/<domain>.mo`` is probed in order; the
    search stops when 'C' is reached.

    Returns the first existing path (or None) when *all* is false, and the
    list of all existing paths when *all* is true.
    """
    # Get some reasonable defaults for arguments that were not supplied
    if localedir is None:
        localedir = _default_localedir
    if languages is None:
        languages = []
        for name in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
            value = os.environ.get(name)
            if value:
                languages = value.split(':')
                break
        if 'C' not in languages:
            languages.append('C')

    # Normalize and expand the languages, dropping duplicates but keeping
    # the order of first appearance.
    candidates = []
    for lang in languages:
        for expanded in _expand_lang(lang):
            if expanded not in candidates:
                candidates.append(expanded)

    # Probe the candidates in order; 'C' means "untranslated" and ends the
    # search.
    found = []
    for lang in candidates:
        if lang == 'C':
            break
        mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
        if not os.path.exists(mofile):
            continue
        if not all:
            return mofile
        found.append(mofile)
    if all:
        return found
    return None
455
456
457
# Cache of parsed catalogs, keyed by (translation class, absolute .mo path).
_translations = {}

def translation(domain, localedir=None, languages=None,
                class_=None, fallback=False, codeset=None):
    """Return a translation object for *domain*.

    All catalogs located by find() are loaded with *class_* (GNUTranslations
    by default); the first becomes the result and the others are chained
    onto it as fallbacks.  If *codeset* is true it is set as the output
    charset of every object in the chain.

    When no catalog is found, a NullTranslations instance is returned if
    *fallback* is true; otherwise IOError(ENOENT, ...) is raised.
    """
    if class_ is None:
        class_ = GNUTranslations
    mofiles = find(domain, localedir, languages, all=1)
    if not mofiles:
        if not fallback:
            raise IOError(ENOENT, 'No translation file found for domain',
                          domain)
        return NullTranslations()
    head = None
    for mofile in mofiles:
        # Parse each .mo file at most once per class; later calls reuse the
        # cached object.
        cache_key = (class_, os.path.abspath(mofile))
        cached = _translations.get(cache_key)
        if cached is None:
            with open(mofile, 'rb') as fp:
                cached = _translations.setdefault(cache_key, class_(fp))
        # Work on a shallow copy so that the fallback chain and output
        # charset set here do not leak into the cached object; all other
        # instance data stays shared with it.
        current = copy.copy(cached)
        if codeset:
            current.set_output_charset(codeset)
        if head is None:
            head = current
        else:
            head.add_fallback(current)
    return head
490
491
def install(domain, localedir=None, unicode=False, codeset=None, names=None):
    """Look up the translation for *domain* and install it as builtins.

    The lookup uses fallback=True, so a missing catalog does not raise;
    *unicode* and *names* are passed to the translation object's install().
    """
    catalog = translation(domain, localedir, fallback=True, codeset=codeset)
    catalog.install(unicode, names)
495
496
497
# Mapping from domain name to the locale directory bound to it with
# bindtextdomain().
_localedirs = {}
# Mapping from domain name to the output codeset bound to it with
# bind_textdomain_codeset().
_localecodesets = {}
# Current global domain, changed with textdomain().  The default `messages'
# is used for compatibility with GNU gettext.
_current_domain = 'messages'
504
505
def textdomain(domain=None):
    """Set the global domain to *domain* (unless None) and return the
    global domain now in effect."""
    global _current_domain
    if domain is None:
        return _current_domain
    _current_domain = domain
    return _current_domain
511
512
def bindtextdomain(domain, localedir=None):
    """Bind *domain* to *localedir* (unless None) and return the directory
    bound to *domain*, or the default locale directory if there is none."""
    global _localedirs
    if localedir is None:
        return _localedirs.get(domain, _default_localedir)
    _localedirs[domain] = localedir
    return _localedirs.get(domain, _default_localedir)
518
519
def bind_textdomain_codeset(domain, codeset=None):
    """Bind *domain* to *codeset* (unless None) and return the codeset
    bound to *domain*, or None if there is none."""
    global _localecodesets
    if codeset is None:
        return _localecodesets.get(domain)
    _localecodesets[domain] = codeset
    return _localecodesets.get(domain)
525
526
def dgettext(domain, message):
    """Translate *message* using the catalog of *domain*.

    The catalog is looked up in the directory and with the codeset bound to
    *domain*, if any.  *message* is returned unchanged when the lookup
    raises IOError.
    """
    try:
        catalog = translation(domain, _localedirs.get(domain, None),
                              codeset=_localecodesets.get(domain))
    except IOError:
        return message
    else:
        return catalog.gettext(message)
534
def ldgettext(domain, message):
    """Like dgettext(), but call the catalog's lgettext() method.

    *message* is returned unchanged when the catalog lookup raises IOError.
    """
    try:
        catalog = translation(domain, _localedirs.get(domain, None),
                              codeset=_localecodesets.get(domain))
    except IOError:
        return message
    else:
        return catalog.lgettext(message)
542
def dngettext(domain, msgid1, msgid2, n):
    """Plural-aware translation using the catalog of *domain*.

    When the catalog lookup raises IOError, msgid1 is returned if n == 1
    and msgid2 otherwise.
    """
    try:
        catalog = translation(domain, _localedirs.get(domain, None),
                              codeset=_localecodesets.get(domain))
    except IOError:
        return msgid1 if n == 1 else msgid2
    else:
        return catalog.ngettext(msgid1, msgid2, n)
553
def ldngettext(domain, msgid1, msgid2, n):
    """Like dngettext(), but call the catalog's lngettext() method.

    When the catalog lookup raises IOError, msgid1 is returned if n == 1
    and msgid2 otherwise.
    """
    try:
        catalog = translation(domain, _localedirs.get(domain, None),
                              codeset=_localecodesets.get(domain))
    except IOError:
        return msgid1 if n == 1 else msgid2
    else:
        return catalog.lngettext(msgid1, msgid2, n)
564
def gettext(message):
    """Translate *message* in the current global domain via dgettext()."""
    domain = _current_domain
    return dgettext(domain, message)
567
def lgettext(message):
    """Translate *message* in the current global domain via ldgettext()."""
    domain = _current_domain
    return ldgettext(domain, message)
570
def ngettext(msgid1, msgid2, n):
    """Plural-aware translation in the current global domain, via
    dngettext()."""
    domain = _current_domain
    return dngettext(domain, msgid1, msgid2, n)
573
def lngettext(msgid1, msgid2, n):
    """Plural-aware translation in the current global domain, via
    ldngettext()."""
    domain = _current_domain
    return ldngettext(domain, msgid1, msgid2, n)
576
577# dcgettext() has been deemed unnecessary and is not implemented.
578
# James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
# was:
#
#    import gettext
#    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
#    _ = cat.gettext
#    print _('Hello World')

# The resulting catalog object does not currently support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.

# Catalog is kept as an alias of translation() so the usage above still works.
Catalog = translation
592