1#! /usr/bin/env python3
2# -*- coding: iso-8859-1 -*-
3# Originally written by Barry Warsaw <barry@python.org>
4#
5# Minimally patched to make it even more xgettext compatible
6# by Peter Funk <pf@artcom-gmbh.de>
7#
# 2002-11-22 Jürgen Hermann <jh@web.de>
9# Added checks that _() only contains string literals, and
10# command line args are resolved to module lists, i.e. you
11# can now pass a filename, a module or package name, or a
12# directory (including globbing chars, important for Win32).
13# Made docstring fit in 80 chars wide displays using pydoc.
14#
15
# For self-testing: use the gettext function of the optional fintl module
# when it can be imported, otherwise fall back to a marker that returns
# its argument unchanged.
try:
    import fintl
except ImportError:
    def _(s):
        return s
else:
    _ = fintl.gettext
22
23__doc__ = _("""pygettext -- Python equivalent of xgettext(1)
24
25Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
26internationalization of C programs. Most of these tools are independent of
27the programming language and can be used from within Python programs.
28Martin von Loewis' work[1] helps considerably in this regard.
29
30There's one problem though; xgettext is the program that scans source code
31looking for message strings, but it groks only C (or C++). Python
32introduces a few wrinkles, such as dual quoting characters, triple quoted
33strings, and raw strings. xgettext understands none of this.
34
35Enter pygettext, which uses Python's standard tokenize module to scan
36Python source code, generating .pot files identical to what GNU xgettext[2]
37generates for C and C++ code. From there, the standard GNU tools can be
38used.
39
40A word about marking Python strings as candidates for translation. GNU
41xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
42and gettext_noop. But those can be a lot of text to include all over your
43code. C and C++ have a trick: they use the C preprocessor. Most
44internationalized C source includes a #define for gettext() to _() so that
45what has to be written in the source is much less. Thus these are both
46translatable strings:
47
48    gettext("Translatable String")
49    _("Translatable String")
50
51Python of course has no preprocessor so this doesn't work so well.  Thus,
52pygettext searches only for _() by default, but see the -k/--keyword flag
53below for how to augment this.
54
55 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
56 [2] http://www.gnu.org/software/gettext/gettext.html
57
58NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext wherever possible. However, some options are still missing or are
60not fully implemented. Also, xgettext's use of command line switches with
61option arguments is broken, and in these cases, pygettext just defines
62additional switches.
63
64Usage: pygettext [options] inputfile ...
65
66Options:
67
68    -a
69    --extract-all
70        Extract all strings.
71
72    -d name
73    --default-domain=name
74        Rename the default output file from messages.pot to name.pot.
75
76    -E
77    --escape
78        Replace non-ASCII characters with octal escape sequences.
79
80    -D
81    --docstrings
82        Extract module, class, method, and function docstrings.  These do
83        not need to be wrapped in _() markers, and in fact cannot be for
84        Python to consider them docstrings. (See also the -X option).
85
86    -h
87    --help
88        Print this help message and exit.
89
90    -k word
91    --keyword=word
92        Keywords to look for in addition to the default set, which are:
93        %(DEFAULTKEYWORDS)s
94
95        You can have multiple -k flags on the command line.
96
97    -K
98    --no-default-keywords
99        Disable the default set of keywords (see above).  Any keywords
100        explicitly added with the -k/--keyword option are still recognized.
101
102    --no-location
103        Do not write filename/lineno location comments.
104
105    -n
106    --add-location
107        Write filename/lineno location comments indicating where each
108        extracted string is found in the source.  These lines appear before
109        each msgid.  The style of comments is controlled by the -S/--style
110        option.  This is the default.
111
112    -o filename
113    --output=filename
114        Rename the default output file from messages.pot to filename.  If
115        filename is `-' then the output is sent to standard out.
116
117    -p dir
118    --output-dir=dir
119        Output files will be placed in directory dir.
120
121    -S stylename
122    --style stylename
123        Specify which style to use for location comments.  Two styles are
124        supported:
125
126        Solaris  # File: filename, line: line-number
127        GNU      #: filename:line
128
129        The style name is case insensitive.  GNU style is the default.
130
131    -v
132    --verbose
133        Print the names of the files being processed.
134
135    -V
136    --version
137        Print the version of pygettext and exit.
138
139    -w columns
140    --width=columns
141        Set width of output to columns.
142
143    -x filename
144    --exclude-file=filename
        Specify a file that contains a list of strings that are not to be
146        extracted from the input files.  Each string to be excluded must
147        appear on a line by itself in the file.
148
149    -X filename
150    --no-docstrings=filename
151        Specify a file that contains a list of files (one per line) that
152        should not have their docstrings extracted.  This is only useful in
153        conjunction with the -D option above.
154
155If `inputfile' is -, standard input is read.
156""")
157
158import os
159import importlib.machinery
160import importlib.util
161import sys
162import glob
163import time
164import getopt
165import token
166import tokenize
167
# Version string printed by -V/--version and written into the
# "Generated-By" line of the .pot header.
__version__ = '1.5'

# Function names whose call arguments are scanned for translatable strings;
# main() clears this list when -K/--no-default-keywords is given.
default_keywords = ['_']
# Comma-separated form of the defaults, interpolated into the help text
# through the %(DEFAULTKEYWORDS)s placeholder.
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator used to join adjacent string literals found inside one
# keyword call into a single message.
EMPTYSTRING = ''
174
175
176
# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
#
# The template is filled in with the %-operator by TokenEater.write(), which
# supplies the 'time', 'version', 'charset' and 'encoding' keys.  The doubled
# backslashes keep a literal "\n" sequence in the emitted header lines.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
197
198
def usage(code, msg=''):
    """Print the help text to stderr and terminate the process.

    The module help text is interpolated with the module globals before
    printing.  When msg is non-empty it is printed to stderr after the
    help text.  code is passed to sys.exit() as the exit status.
    """
    help_text = __doc__ % globals()
    err = sys.stderr
    print(help_text, file=err)
    if msg:
        print(msg, file=err)
    sys.exit(code)
204
205
206
def make_escapes(pass_nonascii):
    """Build the global escape table and pick the global escape function.

    Sets two module globals: 'escapes', a list indexed by character (or
    byte) value that holds the .po-file spelling of that value, and
    'escape', the function used to apply the table.

    With a true pass_nonascii the table covers codes 0..127 and
    escape_ascii is selected; otherwise it covers 0..255 and
    escape_nonascii is selected.
    """
    global escapes, escape
    if pass_nonascii:
        # Allow non-ascii characters to pass through so that e.g. 'msgid
        # "Höhe"' would not result in 'msgid "H\366he"'.  Otherwise we
        # escape any character outside the 32..126 range.
        table_size, escape = 128, escape_ascii
    else:
        table_size, escape = 256, escape_nonascii
    # Printable ASCII maps to itself, everything else to a 3-digit octal
    # escape.
    table = [chr(code) if 32 <= code < 127 else r"\%03o" % code
             for code in range(table_size)]
    # A few characters have conventional C-style spellings instead.
    for char, spelling in (('\\', r'\\'),
                           ('\t', r'\t'),
                           ('\r', r'\r'),
                           ('\n', r'\n'),
                           ('\"', r'\"')):
        table[ord(char)] = spelling
    escapes = table
226
227
def escape_ascii(s, encoding):
    """Escape only the ASCII characters of s; pass everything else through.

    Characters with a code below 128 are replaced by their entry in the
    global 'escapes' table.  The encoding argument is not used; it is
    accepted so both escape functions share one signature.
    """
    pieces = []
    for char in s:
        code = ord(char)
        pieces.append(escapes[code] if code < 128 else char)
    return ''.join(pieces)
230
def escape_nonascii(s, encoding):
    """Encode s with the given encoding and escape it byte by byte.

    Every byte of the encoded text is replaced by its entry in the global
    'escapes' table.
    """
    pieces = []
    for byte in s.encode(encoding):
        pieces.append(escapes[byte])
    return ''.join(pieces)
233
234
def safe_eval(s):
    """Return the value of the string-literal source text s.

    s is evaluated as a Python expression with an empty builtins mapping
    and no local names.
    """
    # unwrap quotes, safely
    # NOTE(review): this is still eval().  An empty __builtins__ blocks
    # name lookups but is not a real sandbox, and callers pass token text
    # taken from the files being scanned -- consider ast.literal_eval().
    restricted_globals = {'__builtins__': {}}
    return eval(s, restricted_globals, {})
238
239
def normalize(s, encoding):
    """Render the message s as a quoted .po-file string.

    A message without newlines becomes one double-quoted, escaped string.
    A multi-line message becomes an empty "" line followed by one quoted
    string per source line, each but the last ending in an escaped
    newline.  Escaping is done by the global 'escape' function.
    """
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    parts = s.split('\n')
    if len(parts) == 1:
        return '"' + escape(s, encoding) + '"'
    if not parts[-1]:
        # The message ends with a newline: drop the empty tail and put the
        # newline back on the real last line so it gets escaped with it.
        parts.pop()
        parts[-1] += '\n'
    escaped = [escape(part, encoding) for part in parts]
    return '""\n"' + '\\n"\n"'.join(escaped) + '"'
255
256
def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'"""
    return any(char in str for char in set)
260
261
def _visit_pyfiles(list, dirname, names):
    """Helper for getFilesForName().

    Appends to 'list' the path (dirname joined with the entry) of every
    entry in 'names' that has the Python source suffix.  A 'CVS' entry is
    removed from 'names' in place.
    """
    # get extension for python source files, caching it in a module global
    global _py_ext
    try:
        suffix = _py_ext
    except NameError:
        suffix = _py_ext = importlib.machinery.SOURCE_SUFFIXES[0]

    # don't recurse into CVS directories
    try:
        names.remove('CVS')
    except ValueError:
        pass

    # add all *.py files to list
    for entry in names:
        if os.path.splitext(entry)[1] == suffix:
            list.append(os.path.join(dirname, entry))
278
279
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.

    name is resolved in this order:

    * a path that does not exist but contains glob characters is expanded
      with glob.glob() and each match is resolved recursively;
    * a path that does not exist otherwise is looked up as an importable
      module or package name, and the origin of its spec is used;
    * a directory is walked recursively and every file with the Python
      source suffix is returned ('CVS' directories are skipped);
    * any other existing path is returned as a one-element list.

    Returns a list of file names; the list is empty when nothing matches.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            matches = []
            for path in glob.glob(name):
                matches.extend(getFilesForName(path))
            return matches

        # try to find module or package
        try:
            spec = importlib.util.find_spec(name)
        except (ImportError, ValueError):
            # ValueError is raised for names find_spec() cannot handle
            # (for example an empty name).
            spec = None
        # find_spec() returns None, rather than raising, for an unknown
        # top-level module.
        name = spec.origin if spec is not None else None
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        py_ext = importlib.machinery.SOURCE_SUFFIXES[0]
        found = []
        for dirpath, dirnames, filenames in os.walk(name):
            # don't recurse into CVS directories; pruning dirnames in place
            # is how os.walk() is told to skip a subtree
            if 'CVS' in dirnames:
                dirnames.remove('CVS')
            found.extend(
                os.path.join(dirpath, filename) for filename in filenames
                if os.path.splitext(filename)[1] == py_ext)
        return found
    if os.path.exists(name):
        # a single file
        return [name]

    return []
312
313
class TokenEater:
    """State machine that collects translatable strings from tokens.

    An instance is called once per token, with the five fields of a
    tokenize token.  It records the string arguments of calls to any name
    in options.keywords and, when options.docstrings is set, module, class
    and function docstrings.  write() emits everything collected as a .pot
    file.

    The options object must provide the attributes read here: docstrings,
    nodocstrings, keywords, toexclude, writelocations, locationstyle,
    width, and the GNU and SOLARIS constants.
    """

    def __init__(self, options):
        self.__options = options
        # Maps message -> {(filename, lineno): isdocstring}.
        self.__messages = {}
        # Bound method that handles the next token.
        self.__state = self.__waiting
        # String pieces collected inside the current keyword call.
        self.__data = []
        # Line of the opening parenthesis of the current keyword call.
        self.__lineno = -1
        # True until the first significant token of the current file.
        self.__freshmodule = 1
        self.__curfile = None
        # Nesting depth of ([{ while scanning a class/def header.
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        """Feed one token; only its type, text and start line are used."""
        # dispatch
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        """Idle state: look for docstring positions and keyword names."""
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                # tokenize.tokenize() always emits an ENCODING token first;
                # like comments and blank lines it must not end the search
                # for the module docstring.
                if ttype in (tokenize.COMMENT, tokenize.NL,
                             tokenize.ENCODING):
                    return
                # Any other token means there is no module docstring.  Fall
                # through so this token is still examined below instead of
                # being dropped.
                self.__freshmodule = 0
            # class or function docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                self.__enclosurecount = 0
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        """After 'class'/'def': skip the header up to its closing colon."""
        if ttype != tokenize.OP:
            return
        # Colons inside brackets (annotations, dict defaults, slices) do
        # not end the header, so track the bracket nesting depth.
        if tstring == ':' and self.__enclosurecount == 0:
            self.__state = self.__suitedocstring
        elif tstring in ('(', '[', '{'):
            self.__enclosurecount += 1
        elif tstring in (')', ']', '}'):
            self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        """After the header colon: record a docstring if one comes next."""
        # ignore any intervening noise
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT, tokenize.NL):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        """After a keyword name: expect the opening parenthesis."""
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        """Inside a keyword call: gather string literals up to ')'."""
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }, file=sys.stderr)
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        """Record msg at (current file, lineno) unless it is excluded.

        lineno defaults to the line of the current keyword call.
        """
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start a new input file; its first string may be a docstring."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write the header and all collected messages to the file fp.

        The charset named in the header is fp.encoding, or 'UTF-8' when
        that is empty.
        """
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M%z')
        encoding = fp.encoding if fp.encoding else 'UTF-8'
        print(pot_header % {'time': timestamp, 'version': __version__,
                            'charset': encoding,
                            'encoding': '8bit'}, file=fp)
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = sorted(reverse.keys())
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                v = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d, file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)
459
460
461
def main():
    """Command line entry point: parse options, scan inputs, write the .pot.

    Reads sys.argv, resolves every positional argument to a list of source
    files (or standard input for '-'), feeds their tokens to a TokenEater
    and writes the collected messages to the selected output.  Exits
    through usage() or sys.exit() on bad options, --help, --version, or an
    unreadable exclude file.
    """
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             # --no-docstrings takes a file name, like its short form -X.
             'docstrings', 'no-docstrings=',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            with open(arg) as fp:
                for line in fp:
                    # Strip only the line terminator, so a last line
                    # without one keeps its final character.
                    options.nodocstrings[line.rstrip('\n')] = 1

    # calculate escapes
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            with open(options.excludefilename) as fp:
                raw_lines = fp.readlines()
        except IOError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename, file=sys.stderr)
            sys.exit(1)
        # Messages are compared without a line terminator, so each line is
        # listed with its newline removed.  The raw line is kept as well so
        # a message that really ends in a newline is still matched.
        options.toexclude = []
        for line in raw_lines:
            options.toexclude.append(line)
            stripped = line.rstrip('\n')
            if stripped != line:
                options.toexclude.append(stripped)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin.buffer
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename, 'rb')
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokens = tokenize.tokenize(fp.readline)
                for _token in tokens:
                    eater(*_token)
            except tokenize.TokenError as e:
                # Report the broken file and carry on with the next one.
                print('%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr)
        finally:
            # standard input is not ours to close
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()
621
622
# Run the extractor only when this file is executed as a script.
if __name__ == '__main__':
    main()
    # some more test strings
    # NOTE(review): the two expressions below are evaluated and discarded;
    # presumably they serve as sample input when pygettext is run on its
    # own source file -- confirm before removing.
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')
629