1#! /usr/bin/env python
2# -*- coding: iso-8859-1 -*-
3# Originally written by Barry Warsaw <barry@python.org>
4#
5# Minimally patched to make it even more xgettext compatible
6# by Peter Funk <pf@artcom-gmbh.de>
7#
# 2002-11-22 Jürgen Hermann <jh@web.de>
9# Added checks that _() only contains string literals, and
10# command line args are resolved to module lists, i.e. you
11# can now pass a filename, a module or package name, or a
12# directory (including globbing chars, important for Win32).
13# Made docstring fit in 80 chars wide displays using pydoc.
14#
15
16# for selftesting
17try:
18    import fintl
19    _ = fintl.gettext
20except ImportError:
21    _ = lambda s: s
22
23__doc__ = _("""pygettext -- Python equivalent of xgettext(1)
24
25Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
26internationalization of C programs. Most of these tools are independent of
27the programming language and can be used from within Python programs.
28Martin von Loewis' work[1] helps considerably in this regard.
29
30There's one problem though; xgettext is the program that scans source code
31looking for message strings, but it groks only C (or C++). Python
32introduces a few wrinkles, such as dual quoting characters, triple quoted
33strings, and raw strings. xgettext understands none of this.
34
35Enter pygettext, which uses Python's standard tokenize module to scan
36Python source code, generating .pot files identical to what GNU xgettext[2]
37generates for C and C++ code. From there, the standard GNU tools can be
38used.
39
40A word about marking Python strings as candidates for translation. GNU
41xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
42and gettext_noop. But those can be a lot of text to include all over your
43code. C and C++ have a trick: they use the C preprocessor. Most
44internationalized C source includes a #define for gettext() to _() so that
45what has to be written in the source is much less. Thus these are both
46translatable strings:
47
48    gettext("Translatable String")
49    _("Translatable String")
50
51Python of course has no preprocessor so this doesn't work so well.  Thus,
52pygettext searches only for _() by default, but see the -k/--keyword flag
53below for how to augment this.
54
55 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
56 [2] http://www.gnu.org/software/gettext/gettext.html
57
58NOTE: pygettext attempts to be option and feature compatible with GNU
59xgettext where ever possible. However some options are still missing or are
60not fully implemented. Also, xgettext's use of command line switches with
61option arguments is broken, and in these cases, pygettext just defines
62additional switches.
63
64Usage: pygettext [options] inputfile ...
65
66Options:
67
68    -a
69    --extract-all
70        Extract all strings.
71
72    -d name
73    --default-domain=name
74        Rename the default output file from messages.pot to name.pot.
75
76    -E
77    --escape
78        Replace non-ASCII characters with octal escape sequences.
79
80    -D
81    --docstrings
82        Extract module, class, method, and function docstrings.  These do
83        not need to be wrapped in _() markers, and in fact cannot be for
84        Python to consider them docstrings. (See also the -X option).
85
86    -h
87    --help
88        Print this help message and exit.
89
90    -k word
91    --keyword=word
92        Keywords to look for in addition to the default set, which are:
93        %(DEFAULTKEYWORDS)s
94
95        You can have multiple -k flags on the command line.
96
97    -K
98    --no-default-keywords
99        Disable the default set of keywords (see above).  Any keywords
100        explicitly added with the -k/--keyword option are still recognized.
101
102    --no-location
103        Do not write filename/lineno location comments.
104
105    -n
106    --add-location
107        Write filename/lineno location comments indicating where each
108        extracted string is found in the source.  These lines appear before
109        each msgid.  The style of comments is controlled by the -S/--style
110        option.  This is the default.
111
112    -o filename
113    --output=filename
114        Rename the default output file from messages.pot to filename.  If
115        filename is `-' then the output is sent to standard out.
116
117    -p dir
118    --output-dir=dir
119        Output files will be placed in directory dir.
120
121    -S stylename
122    --style stylename
123        Specify which style to use for location comments.  Two styles are
124        supported:
125
126        Solaris  # File: filename, line: line-number
127        GNU      #: filename:line
128
129        The style name is case insensitive.  GNU style is the default.
130
131    -v
132    --verbose
133        Print the names of the files being processed.
134
135    -V
136    --version
137        Print the version of pygettext and exit.
138
139    -w columns
140    --width=columns
141        Set width of output to columns.
142
143    -x filename
144    --exclude-file=filename
145        Specify a file that contains a list of strings that are not be
146        extracted from the input files.  Each string to be excluded must
147        appear on a line by itself in the file.
148
149    -X filename
150    --no-docstrings=filename
151        Specify a file that contains a list of files (one per line) that
152        should not have their docstrings extracted.  This is only useful in
153        conjunction with the -D option above.
154
155If `inputfile' is -, standard input is read.
156""")
157
158import os
159import imp
160import sys
161import glob
162import time
163import getopt
164import token
165import tokenize
166import operator
167
# Version of this script; it is printed by -V/--version and written into the
# "Generated-By" field of the .pot header.
__version__ = '1.5'

# Keywords looked for when no -k option is given.  main() rebinds this name
# to an empty list when -K/--no-default-keywords is used.
default_keywords = ['_']
# Printable form of the default keywords, substituted into the usage text
# through the %(DEFAULTKEYWORDS)s placeholder.  It is computed once, here, so
# it does not follow later changes to default_keywords.
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Joiner used to concatenate string fragments with nothing in between.
EMPTYSTRING = ''



# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
# %(time)s and %(version)s are filled in when the header is written.  The
# doubled backslashes produce a literal backslash-n inside each quoted header
# line of the output.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
197
198
def usage(code, msg=''):
    """Print the usage text, and optionally a message, to stderr, then exit.

    code is passed to sys.exit() as the exit status.  msg, when non-empty,
    is printed on its own line after the usage text.
    """
    # The module docstring carries %(NAME)s placeholders that are filled in
    # from the module globals.
    help_text = __doc__ % globals()
    print >> sys.stderr, help_text
    if msg:
        print >> sys.stderr, msg
    sys.exit(code)
204
205
206
escapes = []

def make_escapes(pass_iso8859):
    """Rebuild the global 256-entry table that escape() looks characters up in.

    Entry i holds the text written to the .po file for the character with
    ordinal i.  When pass_iso8859 is true only ordinals below 128 are
    candidates for octal escaping, so iso-8859 characters pass through
    unchanged and e.g. 'msgid "Höhe"' does not become 'msgid "H\\366he"'.
    Otherwise every character outside the 32..126 range is escaped.
    """
    global escapes
    if pass_iso8859:
        limit = 128
    else:
        limit = 256
    table = []
    for code in range(256):
        if code < limit and (code < 32 or code > 126):
            table.append("\\%03o" % code)
        else:
            table.append(chr(code))
    # These characters get their short C-style escape instead of an octal
    # one; backslash and double quote must always be escaped.
    for char, replacement in (('\\', '\\\\'),
                              ('\t', '\\t'),
                              ('\r', '\\r'),
                              ('\n', '\\n'),
                              ('"', '\\"')):
        table[ord(char)] = replacement
    escapes = table
227
228
def escape(s):
    """Return s with every character replaced by its entry in `escapes`.

    The table is indexed by character ordinal, so make_escapes() must have
    filled it before this is called.
    """
    return EMPTYSTRING.join([escapes[ord(char)] for char in s])
235
236
def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    The expression is evaluated with an empty __builtins__ mapping and an
    empty local namespace, so it cannot reach any builtin name.
    """
    # NOTE(review): this is still eval(); within this file it is only given
    # the text of STRING tokens produced by tokenize.  Do not feed it
    # arbitrary input.
    no_builtins = {'__builtins__': {}}
    no_locals = {}
    return eval(s, no_builtins, no_locals)
240
241
def normalize(s):
    """Convert a Python string into the quoted form used in .po files.

    A string without newlines becomes a single double-quoted, escaped
    string.  A multi-line string becomes an empty "" followed by one quoted
    string per source line, each one except possibly the last ending in an
    escaped newline.
    """
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # s ended with a newline: drop the empty tail and give the newline
        # back to the real last line so that it is escaped along with it.
        pieces.pop()
        pieces[-1] = pieces[-1] + '\n'
    quoted = [escape(piece) for piece in pieces]
    separator = '\\n"\n"'
    return '""\n"' + separator.join(quoted) + '"'
257
258
def containsAny(str, set):
    """Tell whether 'str' contains at least one of the items in 'set'."""
    hits = [item for item in set if item in str]
    return len(hits) > 0
262
263
264def _visit_pyfiles(list, dirname, names):
265    """Helper for getFilesForName()."""
266    # get extension for python source files
267    if not globals().has_key('_py_ext'):
268        global _py_ext
269        _py_ext = [triple[0] for triple in imp.get_suffixes()
270                   if triple[2] == imp.PY_SOURCE][0]
271
272    # don't recurse into CVS directories
273    if 'CVS' in names:
274        names.remove('CVS')
275
276    # add all *.py files to list
277    list.extend(
278        [os.path.join(dirname, file) for file in names
279         if os.path.splitext(file)[1] == _py_ext]
280        )
281
282
def _get_modpkg_path(dotted_name, pathlist=None):
    """Get the filesystem path for a module or a package.

    Return the file system path to a file for a module, and to a directory for
    a package. Return None if the name is not found, or is a builtin or
    extension module.
    """
    # Only the left-most component is looked up here; the remainder, if
    # any, is resolved recursively inside the package that was found.
    pieces = dotted_name.split('.', 1)
    try:
        handle, location, info = imp.find_module(pieces[0], pathlist)
    except ImportError:
        return None
    if handle:
        handle.close()
    kind = info[2]

    if len(pieces) == 1:
        # plain name: only source modules and packages have a useful path
        if kind in [imp.PY_SOURCE, imp.PKG_DIRECTORY]:
            return location
        return None

    # dotted name: the left-most component must be a package
    if kind != imp.PKG_DIRECTORY:
        return None
    return _get_modpkg_path(pieces[1], [location])
320
321
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.

    A name that does not exist on disk is first tried as a glob pattern (if
    it contains any of *?[]) and then as a module or package name.  An
    empty list is returned when nothing matches.
    """
    if os.path.exists(name):
        target = name
    elif containsAny(name, "*?[]"):
        # expand the pattern and resolve every match on its own
        collected = []
        for match in glob.glob(name):
            collected.extend(getFilesForName(match))
        return collected
    else:
        # try to find module or package
        target = _get_modpkg_path(name)
        if not target:
            return []

    if not os.path.isdir(target):
        # a single file, if it is really there
        if os.path.exists(target):
            return [target]
        return []

    # find all python files in directory
    collected = []
    os.path.walk(target, _visit_pyfiles, collected)
    return collected
350
351
class TokenEater:
    """Token-driven state machine that collects translatable strings.

    An instance is callable with the (type, string, start, end, line)
    signature of a tokenize token consumer.  String literals found inside a
    call to one of options.keywords and, when enabled, docstrings are
    recorded together with the (filename, lineno) pairs where they occur.
    write() dumps everything collected in .pot format.

    The current state is a bound method stored in self.__state; each state
    method takes (ttype, tstring, lineno) and may switch to another state.
    """

    def __init__(self, options):
        # Option holder; the attributes read by this class are docstrings,
        # nodocstrings, keywords, toexclude, writelocations, locationstyle,
        # GNU, SOLARIS and width.
        self.__options = options
        # message -> {(filename, lineno): isdocstring}
        self.__messages = {}
        self.__state = self.__waiting
        # string fragments collected inside the current keyword call
        self.__data = []
        # line of the opening parenthesis of the current keyword call
        self.__lineno = -1
        # true until the first significant token of the current file
        self.__freshmodule = 1
        self.__curfile = None
        # bracket nesting depth while scanning a class/def header
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state; only the start row is needed
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        """Idle state: look for a docstring position or a keyword."""
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                if ttype in (tokenize.COMMENT, tokenize.NL):
                    return
                # The first significant token is not a docstring.  Fall
                # through so that it is still examined below: it may be
                # 'class', 'def' or a keyword.
                self.__freshmodule = 0
            # class or function docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__enclosurecount = 0
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        """Inside a class/def header: skip to the colon that ends it."""
        if ttype != tokenize.OP:
            return
        if tstring == ':' and self.__enclosurecount == 0:
            # a colon outside any bracket pair ends the header
            self.__state = self.__suitedocstring
        elif tstring in ('(', '[', '{'):
            # colons inside brackets (dict displays, slices, lambdas in
            # default values) belong to the header and must be skipped
            self.__enclosurecount = self.__enclosurecount + 1
        elif tstring in (')', ']', '}'):
            self.__enclosurecount = self.__enclosurecount - 1

    def __suitedocstring(self, ttype, tstring, lineno):
        """Right after a class/def header: a string here is the docstring."""
        # ignore any intervening noise
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT, tokenize.NL):
            # NL is skipped as well: blank lines and comment-only lines
            # produce it, and they may precede the docstring.
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        """Just saw a keyword: it only counts if '(' follows directly."""
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        """Inside keyword(...): collect string literals up to ')'."""
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace, and
            # drop whatever was collected for this call
            print >> sys.stderr, _(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        """Record msg at (current file, lineno) unless it is excluded.

        lineno defaults to the line of the current keyword call.
        """
        if lineno is None:
            lineno = self.__lineno
        if not msg in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start a new input file named filename."""
        self.__curfile = filename
        self.__freshmodule = 1
        # A previous file may have ended in the middle of a construct (for
        # example after a tokenize error); never carry that state over.
        self.__state = self.__waiting

    def write(self, fp):
        """Write the header and all collected messages to fp in .pot format."""
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
        # The time stamp in the header doesn't have the same format as that
        # generated by xgettext...
        print >> fp, pot_header % {'time': timestamp, 'version': __version__}
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = v.keys()
            keys.sort()
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = reverse.keys()
        rkeys.sort()
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                isdocstring = 0
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                if reduce(operator.__add__, v.values()):
                    isdocstring = 1
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                v = v.keys()
                v.sort()
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print >>fp, _(
                            '# File: %(filename)s, line: %(lineno)d') % d
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            # Flush what has been built so far, but never
                            # emit a bare "#:" when a single location is
                            # wider than the limit on its own.
                            if len(locline) > 2:
                                print >> fp, locline
                            locline = "#:" + s
                    if len(locline) > 2:
                        print >> fp, locline
                if isdocstring:
                    print >> fp, '#, docstring'
                print >> fp, 'msgid', normalize(k)
                print >> fp, 'msgstr ""\n'
501
502
503
504def main():
505    global default_keywords
506    try:
507        opts, args = getopt.getopt(
508            sys.argv[1:],
509            'ad:DEhk:Kno:p:S:Vvw:x:X:',
510            ['extract-all', 'default-domain=', 'escape', 'help',
511             'keyword=', 'no-default-keywords',
512             'add-location', 'no-location', 'output=', 'output-dir=',
513             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
514             'docstrings', 'no-docstrings',
515             ])
516    except getopt.error, msg:
517        usage(1, msg)
518
519    # for holding option values
520    class Options:
521        # constants
522        GNU = 1
523        SOLARIS = 2
524        # defaults
525        extractall = 0 # FIXME: currently this option has no effect at all.
526        escape = 0
527        keywords = []
528        outpath = ''
529        outfile = 'messages.pot'
530        writelocations = 1
531        locationstyle = GNU
532        verbose = 0
533        width = 78
534        excludefilename = ''
535        docstrings = 0
536        nodocstrings = {}
537
538    options = Options()
539    locations = {'gnu' : options.GNU,
540                 'solaris' : options.SOLARIS,
541                 }
542
543    # parse options
544    for opt, arg in opts:
545        if opt in ('-h', '--help'):
546            usage(0)
547        elif opt in ('-a', '--extract-all'):
548            options.extractall = 1
549        elif opt in ('-d', '--default-domain'):
550            options.outfile = arg + '.pot'
551        elif opt in ('-E', '--escape'):
552            options.escape = 1
553        elif opt in ('-D', '--docstrings'):
554            options.docstrings = 1
555        elif opt in ('-k', '--keyword'):
556            options.keywords.append(arg)
557        elif opt in ('-K', '--no-default-keywords'):
558            default_keywords = []
559        elif opt in ('-n', '--add-location'):
560            options.writelocations = 1
561        elif opt in ('--no-location',):
562            options.writelocations = 0
563        elif opt in ('-S', '--style'):
564            options.locationstyle = locations.get(arg.lower())
565            if options.locationstyle is None:
566                usage(1, _('Invalid value for --style: %s') % arg)
567        elif opt in ('-o', '--output'):
568            options.outfile = arg
569        elif opt in ('-p', '--output-dir'):
570            options.outpath = arg
571        elif opt in ('-v', '--verbose'):
572            options.verbose = 1
573        elif opt in ('-V', '--version'):
574            print _('pygettext.py (xgettext for Python) %s') % __version__
575            sys.exit(0)
576        elif opt in ('-w', '--width'):
577            try:
578                options.width = int(arg)
579            except ValueError:
580                usage(1, _('--width argument must be an integer: %s') % arg)
581        elif opt in ('-x', '--exclude-file'):
582            options.excludefilename = arg
583        elif opt in ('-X', '--no-docstrings'):
584            fp = open(arg)
585            try:
586                while 1:
587                    line = fp.readline()
588                    if not line:
589                        break
590                    options.nodocstrings[line[:-1]] = 1
591            finally:
592                fp.close()
593
594    # calculate escapes
595    make_escapes(not options.escape)
596
597    # calculate all keywords
598    options.keywords.extend(default_keywords)
599
600    # initialize list of strings to exclude
601    if options.excludefilename:
602        try:
603            fp = open(options.excludefilename)
604            options.toexclude = fp.readlines()
605            fp.close()
606        except IOError:
607            print >> sys.stderr, _(
608                "Can't read --exclude-file: %s") % options.excludefilename
609            sys.exit(1)
610    else:
611        options.toexclude = []
612
613    # resolve args to module lists
614    expanded = []
615    for arg in args:
616        if arg == '-':
617            expanded.append(arg)
618        else:
619            expanded.extend(getFilesForName(arg))
620    args = expanded
621
622    # slurp through all the files
623    eater = TokenEater(options)
624    for filename in args:
625        if filename == '-':
626            if options.verbose:
627                print _('Reading standard input')
628            fp = sys.stdin
629            closep = 0
630        else:
631            if options.verbose:
632                print _('Working on %s') % filename
633            fp = open(filename)
634            closep = 1
635        try:
636            eater.set_filename(filename)
637            try:
638                tokenize.tokenize(fp.readline, eater)
639            except tokenize.TokenError, e:
640                print >> sys.stderr, '%s: %s, line %d, column %d' % (
641                    e[0], filename, e[1][0], e[1][1])
642        finally:
643            if closep:
644                fp.close()
645
646    # write the output
647    if options.outfile == '-':
648        fp = sys.stdout
649        closep = 0
650    else:
651        if options.outpath:
652            options.outfile = os.path.join(options.outpath, options.outfile)
653        fp = open(options.outfile, 'w')
654        closep = 1
655    try:
656        eater.write(fp)
657    finally:
658        if closep:
659            fp.close()
660
661
# Script entry point: run the extractor when executed directly.
if __name__ == '__main__':
    main()
    # The results of the _() calls below are discarded.
    # NOTE(review): presumably they exist only as sample input for running
    # pygettext over its own source -- confirm before changing their
    # literal form.
    # some more test strings
    _(u'a unicode string')
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')
669