1"""Internationalization and localization support. 2 3This module provides internationalization (I18N) and localization (L10N) 4support for your Python programs by providing an interface to the GNU gettext 5message catalog library. 6 7I18N refers to the operation by which a program is made aware of multiple 8languages. L10N refers to the adaptation of your program, once 9internationalized, to the local language and cultural habits. 10 11""" 12 13# This module represents the integration of work, contributions, feedback, and 14# suggestions from the following people: 15# 16# Martin von Loewis, who wrote the initial implementation of the underlying 17# C-based libintlmodule (later renamed _gettext), along with a skeletal 18# gettext.py implementation. 19# 20# Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule, 21# which also included a pure-Python implementation to read .mo files if 22# intlmodule wasn't available. 23# 24# James Henstridge, who also wrote a gettext.py module, which has some 25# interesting, but currently unsupported experimental features: the notion of 26# a Catalog class and instances, and the ability to add to a catalog file via 27# a Python API. 28# 29# Barry Warsaw integrated these modules, wrote the .install() API and code, 30# and conformed all C and Python code to Python's coding standards. 31# 32# Francois Pinard and Marc-Andre Lemburg also contributed valuably to this 33# module. 34# 35# J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs. 36# 37# TODO: 38# - Lazy loading of .mo files. Currently the entire catalog is loaded into 39# memory, but that's probably bad for large translated programs. Instead, 40# the lexical sort of original strings in GNU .mo files should be exploited 41# to do binary searches and lazy initializations. Or you might want to use 42# the undocumented double-hash algorithm for .mo files with hash tables, but 43# you'll need to study the GNU gettext code to do this. 44# 45# - Support Solaris .mo file formats. Unfortunately, we've been unable to 46# find this format documented anywhere. 47 48 49import locale, copy, os, re, struct, sys 50from errno import ENOENT 51 52 53__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog', 54 'find', 'translation', 'install', 'textdomain', 'bindtextdomain', 55 'bind_textdomain_codeset', 56 'dgettext', 'dngettext', 'gettext', 'lgettext', 'ldgettext', 57 'ldngettext', 'lngettext', 'ngettext', 58 ] 59 60_default_localedir = os.path.join(sys.prefix, 'share', 'locale') 61 62 63def test(condition, true, false): 64 """ 65 Implements the C expression: 66 67 condition ? true : false 68 69 Required to correctly interpret plural forms. 70 """ 71 if condition: 72 return true 73 else: 74 return false 75 76 77def c2py(plural): 78 """Gets a C expression as used in PO files for plural forms and returns a 79 Python lambda function that implements an equivalent expression. 80 """ 81 # Security check, allow only the "n" identifier 82 try: 83 from cStringIO import StringIO 84 except ImportError: 85 from StringIO import StringIO 86 import token, tokenize 87 tokens = tokenize.generate_tokens(StringIO(plural).readline) 88 try: 89 danger = [x for x in tokens if x[0] == token.NAME and x[1] != 'n'] 90 except tokenize.TokenError: 91 raise ValueError, \ 92 'plural forms expression error, maybe unbalanced parenthesis' 93 else: 94 if danger: 95 raise ValueError, 'plural forms expression could be dangerous' 96 97 # Replace some C operators by their Python equivalents 98 plural = plural.replace('&&', ' and ') 99 plural = plural.replace('||', ' or ') 100 101 expr = re.compile(r'\!([^=])') 102 plural = expr.sub(' not \\1', plural) 103 104 # Regular expression and replacement function used to transform 105 # "a?b:c" to "test(a,b,c)". 106 expr = re.compile(r'(.*?)\?(.*?):(.*)') 107 def repl(x): 108 return "test(%s, %s, %s)" % (x.group(1), x.group(2), 109 expr.sub(repl, x.group(3))) 110 111 # Code to transform the plural expression, taking care of parentheses 112 stack = [''] 113 for c in plural: 114 if c == '(': 115 stack.append('') 116 elif c == ')': 117 if len(stack) == 1: 118 # Actually, we never reach this code, because unbalanced 119 # parentheses get caught in the security check at the 120 # beginning. 121 raise ValueError, 'unbalanced parenthesis in plural form' 122 s = expr.sub(repl, stack.pop()) 123 stack[-1] += '(%s)' % s 124 else: 125 stack[-1] += c 126 plural = expr.sub(repl, stack.pop()) 127 128 return eval('lambda n: int(%s)' % plural) 129 130 131 132def _expand_lang(locale): 133 from locale import normalize 134 locale = normalize(locale) 135 COMPONENT_CODESET = 1 << 0 136 COMPONENT_TERRITORY = 1 << 1 137 COMPONENT_MODIFIER = 1 << 2 138 # split up the locale into its base components 139 mask = 0 140 pos = locale.find('@') 141 if pos >= 0: 142 modifier = locale[pos:] 143 locale = locale[:pos] 144 mask |= COMPONENT_MODIFIER 145 else: 146 modifier = '' 147 pos = locale.find('.') 148 if pos >= 0: 149 codeset = locale[pos:] 150 locale = locale[:pos] 151 mask |= COMPONENT_CODESET 152 else: 153 codeset = '' 154 pos = locale.find('_') 155 if pos >= 0: 156 territory = locale[pos:] 157 locale = locale[:pos] 158 mask |= COMPONENT_TERRITORY 159 else: 160 territory = '' 161 language = locale 162 ret = [] 163 for i in range(mask+1): 164 if not (i & ~mask): # if all components for this combo exist ... 165 val = language 166 if i & COMPONENT_TERRITORY: val += territory 167 if i & COMPONENT_CODESET: val += codeset 168 if i & COMPONENT_MODIFIER: val += modifier 169 ret.append(val) 170 ret.reverse() 171 return ret 172 173 174 175class NullTranslations: 176 def __init__(self, fp=None): 177 self._info = {} 178 self._charset = None 179 self._output_charset = None 180 self._fallback = None 181 if fp is not None: 182 self._parse(fp) 183 184 def _parse(self, fp): 185 pass 186 187 def add_fallback(self, fallback): 188 if self._fallback: 189 self._fallback.add_fallback(fallback) 190 else: 191 self._fallback = fallback 192 193 def gettext(self, message): 194 if self._fallback: 195 return self._fallback.gettext(message) 196 return message 197 198 def lgettext(self, message): 199 if self._fallback: 200 return self._fallback.lgettext(message) 201 return message 202 203 def ngettext(self, msgid1, msgid2, n): 204 if self._fallback: 205 return self._fallback.ngettext(msgid1, msgid2, n) 206 if n == 1: 207 return msgid1 208 else: 209 return msgid2 210 211 def lngettext(self, msgid1, msgid2, n): 212 if self._fallback: 213 return self._fallback.lngettext(msgid1, msgid2, n) 214 if n == 1: 215 return msgid1 216 else: 217 return msgid2 218 219 def ugettext(self, message): 220 if self._fallback: 221 return self._fallback.ugettext(message) 222 return unicode(message) 223 224 def ungettext(self, msgid1, msgid2, n): 225 if self._fallback: 226 return self._fallback.ungettext(msgid1, msgid2, n) 227 if n == 1: 228 return unicode(msgid1) 229 else: 230 return unicode(msgid2) 231 232 def info(self): 233 return self._info 234 235 def charset(self): 236 return self._charset 237 238 def output_charset(self): 239 return self._output_charset 240 241 def set_output_charset(self, charset): 242 self._output_charset = charset 243 244 def install(self, unicode=False, names=None): 245 import __builtin__ 246 __builtin__.__dict__['_'] = unicode and self.ugettext or self.gettext 247 if hasattr(names, "__contains__"): 248 if "gettext" in names: 249 __builtin__.__dict__['gettext'] = __builtin__.__dict__['_'] 250 if "ngettext" in names: 251 __builtin__.__dict__['ngettext'] = (unicode and self.ungettext 252 or self.ngettext) 253 if "lgettext" in names: 254 __builtin__.__dict__['lgettext'] = self.lgettext 255 if "lngettext" in names: 256 __builtin__.__dict__['lngettext'] = self.lngettext 257 258 259class GNUTranslations(NullTranslations): 260 # Magic number of .mo files 261 LE_MAGIC = 0x950412deL 262 BE_MAGIC = 0xde120495L 263 264 def _parse(self, fp): 265 """Override this method to support alternative .mo formats.""" 266 unpack = struct.unpack 267 filename = getattr(fp, 'name', '') 268 # Parse the .mo file header, which consists of 5 little endian 32 269 # bit words. 270 self._catalog = catalog = {} 271 self.plural = lambda n: int(n != 1) # germanic plural by default 272 buf = fp.read() 273 buflen = len(buf) 274 # Are we big endian or little endian? 275 magic = unpack('<I', buf[:4])[0] 276 if magic == self.LE_MAGIC: 277 version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20]) 278 ii = '<II' 279 elif magic == self.BE_MAGIC: 280 version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20]) 281 ii = '>II' 282 else: 283 raise IOError(0, 'Bad magic number', filename) 284 # Now put all messages from the .mo file buffer into the catalog 285 # dictionary. 286 for i in xrange(0, msgcount): 287 mlen, moff = unpack(ii, buf[masteridx:masteridx+8]) 288 mend = moff + mlen 289 tlen, toff = unpack(ii, buf[transidx:transidx+8]) 290 tend = toff + tlen 291 if mend < buflen and tend < buflen: 292 msg = buf[moff:mend] 293 tmsg = buf[toff:tend] 294 else: 295 raise IOError(0, 'File is corrupt', filename) 296 # See if we're looking at GNU .mo conventions for metadata 297 if mlen == 0: 298 # Catalog description 299 lastk = None 300 for item in tmsg.splitlines(): 301 item = item.strip() 302 if not item: 303 continue 304 k = v = None 305 if ':' in item: 306 k, v = item.split(':', 1) 307 k = k.strip().lower() 308 v = v.strip() 309 self._info[k] = v 310 lastk = k 311 elif lastk: 312 self._info[lastk] += '\n' + item 313 if k == 'content-type': 314 self._charset = v.split('charset=')[1] 315 elif k == 'plural-forms': 316 v = v.split(';') 317 plural = v[1].split('plural=')[1] 318 self.plural = c2py(plural) 319 # Note: we unconditionally convert both msgids and msgstrs to 320 # Unicode using the character encoding specified in the charset 321 # parameter of the Content-Type header. The gettext documentation 322 # strongly encourages msgids to be us-ascii, but some applications 323 # require alternative encodings (e.g. Zope's ZCML and ZPT). For 324 # traditional gettext applications, the msgid conversion will 325 # cause no problems since us-ascii should always be a subset of 326 # the charset encoding. We may want to fall back to 8-bit msgids 327 # if the Unicode conversion fails. 328 if '\x00' in msg: 329 # Plural forms 330 msgid1, msgid2 = msg.split('\x00') 331 tmsg = tmsg.split('\x00') 332 if self._charset: 333 msgid1 = unicode(msgid1, self._charset) 334 tmsg = [unicode(x, self._charset) for x in tmsg] 335 for i in range(len(tmsg)): 336 catalog[(msgid1, i)] = tmsg[i] 337 else: 338 if self._charset: 339 msg = unicode(msg, self._charset) 340 tmsg = unicode(tmsg, self._charset) 341 catalog[msg] = tmsg 342 # advance to next entry in the seek tables 343 masteridx += 8 344 transidx += 8 345 346 def gettext(self, message): 347 missing = object() 348 tmsg = self._catalog.get(message, missing) 349 if tmsg is missing: 350 if self._fallback: 351 return self._fallback.gettext(message) 352 return message 353 # Encode the Unicode tmsg back to an 8-bit string, if possible 354 if self._output_charset: 355 return tmsg.encode(self._output_charset) 356 elif self._charset: 357 return tmsg.encode(self._charset) 358 return tmsg 359 360 def lgettext(self, message): 361 missing = object() 362 tmsg = self._catalog.get(message, missing) 363 if tmsg is missing: 364 if self._fallback: 365 return self._fallback.lgettext(message) 366 return message 367 if self._output_charset: 368 return tmsg.encode(self._output_charset) 369 return tmsg.encode(locale.getpreferredencoding()) 370 371 def ngettext(self, msgid1, msgid2, n): 372 try: 373 tmsg = self._catalog[(msgid1, self.plural(n))] 374 if self._output_charset: 375 return tmsg.encode(self._output_charset) 376 elif self._charset: 377 return tmsg.encode(self._charset) 378 return tmsg 379 except KeyError: 380 if self._fallback: 381 return self._fallback.ngettext(msgid1, msgid2, n) 382 if n == 1: 383 return msgid1 384 else: 385 return msgid2 386 387 def lngettext(self, msgid1, msgid2, n): 388 try: 389 tmsg = self._catalog[(msgid1, self.plural(n))] 390 if self._output_charset: 391 return tmsg.encode(self._output_charset) 392 return tmsg.encode(locale.getpreferredencoding()) 393 except KeyError: 394 if self._fallback: 395 return self._fallback.lngettext(msgid1, msgid2, n) 396 if n == 1: 397 return msgid1 398 else: 399 return msgid2 400 401 def ugettext(self, message): 402 missing = object() 403 tmsg = self._catalog.get(message, missing) 404 if tmsg is missing: 405 if self._fallback: 406 return self._fallback.ugettext(message) 407 return unicode(message) 408 return tmsg 409 410 def ungettext(self, msgid1, msgid2, n): 411 try: 412 tmsg = self._catalog[(msgid1, self.plural(n))] 413 except KeyError: 414 if self._fallback: 415 return self._fallback.ungettext(msgid1, msgid2, n) 416 if n == 1: 417 tmsg = unicode(msgid1) 418 else: 419 tmsg = unicode(msgid2) 420 return tmsg 421 422 423# Locate a .mo file using the gettext strategy 424def find(domain, localedir=None, languages=None, all=0): 425 # Get some reasonable defaults for arguments that were not supplied 426 if localedir is None: 427 localedir = _default_localedir 428 if languages is None: 429 languages = [] 430 for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'): 431 val = os.environ.get(envar) 432 if val: 433 languages = val.split(':') 434 break 435 if 'C' not in languages: 436 languages.append('C') 437 # now normalize and expand the languages 438 nelangs = [] 439 for lang in languages: 440 for nelang in _expand_lang(lang): 441 if nelang not in nelangs: 442 nelangs.append(nelang) 443 # select a language 444 if all: 445 result = [] 446 else: 447 result = None 448 for lang in nelangs: 449 if lang == 'C': 450 break 451 mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain) 452 if os.path.exists(mofile): 453 if all: 454 result.append(mofile) 455 else: 456 return mofile 457 return result 458 459 460 461# a mapping between absolute .mo file path and Translation object 462_translations = {} 463 464def translation(domain, localedir=None, languages=None, 465 class_=None, fallback=False, codeset=None): 466 if class_ is None: 467 class_ = GNUTranslations 468 mofiles = find(domain, localedir, languages, all=1) 469 if not mofiles: 470 if fallback: 471 return NullTranslations() 472 raise IOError(ENOENT, 'No translation file found for domain', domain) 473 # Avoid opening, reading, and parsing the .mo file after it's been done 474 # once. 475 result = None 476 for mofile in mofiles: 477 key = (class_, os.path.abspath(mofile)) 478 t = _translations.get(key) 479 if t is None: 480 with open(mofile, 'rb') as fp: 481 t = _translations.setdefault(key, class_(fp)) 482 # Copy the translation object to allow setting fallbacks and 483 # output charset. All other instance data is shared with the 484 # cached object. 485 t = copy.copy(t) 486 if codeset: 487 t.set_output_charset(codeset) 488 if result is None: 489 result = t 490 else: 491 result.add_fallback(t) 492 return result 493 494 495def install(domain, localedir=None, unicode=False, codeset=None, names=None): 496 t = translation(domain, localedir, fallback=True, codeset=codeset) 497 t.install(unicode, names) 498 499 500 501# a mapping b/w domains and locale directories 502_localedirs = {} 503# a mapping b/w domains and codesets 504_localecodesets = {} 505# current global domain, `messages' used for compatibility w/ GNU gettext 506_current_domain = 'messages' 507 508 509def textdomain(domain=None): 510 global _current_domain 511 if domain is not None: 512 _current_domain = domain 513 return _current_domain 514 515 516def bindtextdomain(domain, localedir=None): 517 global _localedirs 518 if localedir is not None: 519 _localedirs[domain] = localedir 520 return _localedirs.get(domain, _default_localedir) 521 522 523def bind_textdomain_codeset(domain, codeset=None): 524 global _localecodesets 525 if codeset is not None: 526 _localecodesets[domain] = codeset 527 return _localecodesets.get(domain) 528 529 530def dgettext(domain, message): 531 try: 532 t = translation(domain, _localedirs.get(domain, None), 533 codeset=_localecodesets.get(domain)) 534 except IOError: 535 return message 536 return t.gettext(message) 537 538def ldgettext(domain, message): 539 try: 540 t = translation(domain, _localedirs.get(domain, None), 541 codeset=_localecodesets.get(domain)) 542 except IOError: 543 return message 544 return t.lgettext(message) 545 546def dngettext(domain, msgid1, msgid2, n): 547 try: 548 t = translation(domain, _localedirs.get(domain, None), 549 codeset=_localecodesets.get(domain)) 550 except IOError: 551 if n == 1: 552 return msgid1 553 else: 554 return msgid2 555 return t.ngettext(msgid1, msgid2, n) 556 557def ldngettext(domain, msgid1, msgid2, n): 558 try: 559 t = translation(domain, _localedirs.get(domain, None), 560 codeset=_localecodesets.get(domain)) 561 except IOError: 562 if n == 1: 563 return msgid1 564 else: 565 return msgid2 566 return t.lngettext(msgid1, msgid2, n) 567 568def gettext(message): 569 return dgettext(_current_domain, message) 570 571def lgettext(message): 572 return ldgettext(_current_domain, message) 573 574def ngettext(msgid1, msgid2, n): 575 return dngettext(_current_domain, msgid1, msgid2, n) 576 577def lngettext(msgid1, msgid2, n): 578 return ldngettext(_current_domain, msgid1, msgid2, n) 579 580# dcgettext() has been deemed unnecessary and is not implemented. 581 582# James Henstridge's Catalog constructor from GNOME gettext. Documented usage 583# was: 584# 585# import gettext 586# cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR) 587# _ = cat.gettext 588# print _('Hello World') 589 590# The resulting catalog object currently don't support access through a 591# dictionary API, which was supported (but apparently unused) in GNOME 592# gettext. 593 594Catalog = translation 595