#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# Originally written by Barry Warsaw <barry@python.org>
#
# Minimally patched to make it even more xgettext compatible
# by Peter Funk <pf@artcom-gmbh.de>
#
# 2002-11-22 Jürgen Hermann <jh@web.de>
# Added checks that _() only contains string literals, and
# command line args are resolved to module lists, i.e. you
# can now pass a filename, a module or package name, or a
# directory (including globbing chars, important for Win32).
# Made docstring fit in 80 chars wide displays using pydoc.
#

# for selftesting
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    # No gettext available: fall back to the identity function so the
    # script still runs (and its own marked strings remain extractable).
    _ = lambda s: s

__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well. Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
 [2] http://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext where ever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings. These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above). Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source. These lines appear before
        each msgid. The style of comments is controlled by the -S/--style
        option. This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename. If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments. Two styles are
        supported:

        Solaris  # File: filename, line: line-number
        GNU      #: filename:line

        The style name is case insensitive. GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not to be
        extracted from the input files. Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted. This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")

import os
import importlib.machinery
import importlib.util
import sys
import glob
import time
import getopt
import token
import tokenize

__version__ = '1.5'

default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

EMPTYSTRING = ''



# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')


def usage(code, msg=''):
    """Print the module docstring (the help text) plus an optional message
    to stderr, then exit with the given status code."""
    print(__doc__ % globals(), file=sys.stderr)
    if msg:
        print(msg, file=sys.stderr)
    sys.exit(code)



def make_escapes(pass_nonascii):
    """Build the module-level `escapes` table and select the `escape`
    function used by normalize().

    If pass_nonascii is true, non-ASCII characters pass through unchanged
    so that e.g. 'msgid "Höhe"' would not result in 'msgid "H\\366he"'.
    Otherwise every character outside the 32..126 range is replaced by an
    octal escape sequence.
    """
    global escapes, escape
    if pass_nonascii:
        mod = 128
        escape = escape_ascii
    else:
        mod = 256
        escape = escape_nonascii
    # Default every code point to its octal escape, then overwrite the
    # printable ASCII range with the characters themselves.
    escapes = [r"\%03o" % i for i in range(mod)]
    for i in range(32, 127):
        escapes[i] = chr(i)
    escapes[ord('\\')] = r'\\'
    escapes[ord('\t')] = r'\t'
    escapes[ord('\r')] = r'\r'
    escapes[ord('\n')] = r'\n'
    escapes[ord('\"')] = r'\"'


def escape_ascii(s, encoding):
    """Escape only characters below 128; pass non-ASCII through unchanged."""
    return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)

def escape_nonascii(s, encoding):
    """Escape every byte of `s` encoded with `encoding` via the table."""
    return ''.join(escapes[b] for b in s.encode(encoding))


def safe_eval(s):
    # Unwrap a string-literal token into its value with no builtins
    # available, so arbitrary expressions cannot do anything harmful.
    return eval(s, {'__builtins__': {}}, {})


def normalize(s, encoding):
    """Convert a Python string into the representation used in .po files,
    i.e. a double-quoted, C-style-escaped (possibly multi-line) literal."""
    lines = s.split('\n')
    if len(lines) == 1:
        s = '"' + escape(s, encoding) + '"'
    else:
        # Multi-line msgid: emit an empty first line, then one quoted
        # source line per output line, each ending in a literal \n.
        if not lines[-1]:
            del lines[-1]
            lines[-1] = lines[-1] + '\n'
        for i in range(len(lines)):
            lines[i] = escape(lines[i], encoding)
        lineterm = '\\n"\n"'
        s = '""\n"' + lineterm.join(lines) + '"'
    return s


def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'"""
    return 1 in [c in str for c in set]


def _visit_pyfiles(list, dirname, names):
    """Helper for getFilesForName(): append to `list` every entry of
    `names` under `dirname` that has the Python source suffix."""
    # get extension for python source files, lazily cached at module level
    if '_py_ext' not in globals():
        global _py_ext
        _py_ext = importlib.machinery.SOURCE_SUFFIXES[0]

    # don't recurse into CVS directories
    if 'CVS' in names:
        names.remove('CVS')

    # add all *.py files to list
    list.extend(
        [os.path.join(dirname, file) for file in names
         if os.path.splitext(file)[1] == _py_ext]
    )


def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            files = glob.glob(name)
            list = []
            for file in files:
                list.extend(getFilesForName(file))
            return list

        # try to find module or package.  find_spec() returns None for a
        # missing top-level module and only raises for missing parents, so
        # both cases must be handled; spec.origin can also be None (e.g.
        # namespace packages).
        try:
            spec = importlib.util.find_spec(name)
        except ImportError:
            spec = None
        if spec is None or spec.origin is None:
            return []
        name = spec.origin

    if os.path.isdir(name):
        # Find all python files in the directory tree.  (The previous code
        # passed a callback to os.walk() in the style of the long-removed
        # os.path.walk(); os.walk() is a generator and the callback was
        # never invoked, so directories produced no files at all.)
        list = []
        for dirpath, dirnames, filenames in os.walk(name):
            # prune CVS directories from the traversal
            if 'CVS' in dirnames:
                dirnames.remove('CVS')
            _visit_pyfiles(list, dirpath, filenames)
        return list
    elif os.path.exists(name):
        # a single file
        return [name]

    return []


class TokenEater:
    """Token-stream consumer implementing a small state machine that
    collects translatable strings (and optionally docstrings) and can
    write them out as a .pot file."""

    def __init__(self, options):
        self.__options = options
        # msg -> {(filename, lineno): isdocstring}
        self.__messages = {}
        # current state handler; each handler may switch to the next state
        self.__state = self.__waiting
        # string fragments of the _() call currently being parsed
        self.__data = []
        self.__lineno = -1
        # true until the first non-comment token of a fresh module is seen
        self.__freshmodule = 1
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state handler
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                elif ttype not in (tokenize.COMMENT, tokenize.NL):
                    self.__freshmodule = 0
                return
            # class or function docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        # ignore anything until we see the colon ending the def/class header
        if ttype == tokenize.OP and tstring == ':':
            self.__state = self.__suitedocstring

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class or function docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        # a keyword must be immediately followed by '(' to count as a call
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings. Record the
            # line number of the first line of the strings and update the
            # list of messages seen. Reset state for the next batch. If
            # there were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }, file=sys.stderr)
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        """Record one occurrence of `msg` unless it is excluded."""
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start scanning a new input file."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write all collected messages to `fp` in .pot format."""
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M%z')
        # file-likes such as io.StringIO have no .encoding attribute
        encoding = getattr(fp, 'encoding', None) or 'UTF-8'
        print(pot_header % {'time': timestamp, 'version': __version__,
                            'charset': encoding,
                            'encoding': '8bit'}, file=fp)
        # Sort the entries. First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = sorted(reverse.keys())
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so. This is to aid translators who may
                # wish to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of
                # (filename, lineno) tuples. We want to sort the entries in
                # v first by file name and then by line number.
                v = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d, file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)



def main():
    """Command-line entry point: parse options, resolve the input args to
    files, tokenize them through a TokenEater and write the .pot output."""
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             'docstrings', 'no-docstrings',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            with open(arg) as fp:
                for line in fp:
                    options.nodocstrings[line[:-1]] = 1

    # calculate escapes
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            with open(options.excludefilename) as fp:
                options.toexclude = fp.readlines()
        except IOError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename, file=sys.stderr)
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin.buffer
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename, 'rb')
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokens = tokenize.tokenize(fp.readline)
                for _token in tokens:
                    eater(*_token)
            except tokenize.TokenError as e:
                print('%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr)
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()


if __name__ == '__main__':
    main()
    # some more test strings
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')