1""" TeXcheck.py -- rough syntax checking on Python style LaTeX documents.
2
3   Written by Raymond D. Hettinger <python at rcn.com>
4   Copyright (c) 2003 Python Software Foundation.  All rights reserved.
5
6Designed to catch common markup errors including:
* Unbalanced or mismatched parentheses, brackets, and braces.
8* Unbalanced or mismatched \\begin and \\end blocks.
9* Misspelled or invalid LaTeX commands.
10* Use of forward slashes instead of backslashes for commands.
11* Table line size mismatches.
12
13Sample command line usage:
14    python texcheck.py -k chapterheading -m lib/librandomtex *.tex
15
16Options:
    -m          Munge parentheses and brackets. [0,n) would normally mismatch.
18    -k keyword: Keyword is a valid LaTeX command. Do not include the backslash.
19    -d:         Delimiter check only (useful for non-LaTeX files).
20    -h:         Help
21    -s lineno:  Start at lineno (useful for skipping complex sections).
    -v:         Verbose.  Trace the matching of \\begin and \\end blocks.
23"""
24
25import re
26import sys
27import getopt
28from itertools import izip, count, islice
29import glob
30
# Whitespace-separated table of the LaTeX commands, each written with its
# leading backslash, that checkit() treats as valid.  checkit() splits this
# string into a set, so repeated entries and line breaks are harmless.
cmdstr = r"""
    \section \module \declaremodule \modulesynopsis \moduleauthor
    \sectionauthor \versionadded \code \class \method \begin
    \optional \var \ref \end \subsection \lineiii \hline \label
    \indexii \textrm \ldots \keyword \stindex \index \item \note
    \withsubitem \ttindex \footnote \citetitle \samp \opindex
    \noindent \exception \strong \dfn \ctype \obindex \character
    \indexiii \function \bifuncindex \refmodule \refbimodindex
    \subsubsection \nodename \member \chapter \emph \ASCII \UNIX
    \regexp \program \production \token \productioncont \term
    \grammartoken \lineii \seemodule \file \EOF \documentclass
    \usepackage \title \input \maketitle \ifhtml \fi \url \Cpp
    \tableofcontents \kbd \programopt \envvar \refstmodindex
    \cfunction \constant \NULL \moreargs \cfuncline \cdata
    \textasciicircum \n \ABC \setindexsubitem \versionchanged
    \deprecated \seetext \newcommand \POSIX \pep \warning \rfc
    \verbatiminput \methodline \textgreater \seetitle \lineiv
    \funclineni \ulink \manpage \funcline \dataline \unspecified
    \textbackslash \mimetype \mailheader \seepep \textunderscore
    \longprogramopt \infinity \plusminus \shortversion \version
    \refmodindex \seerfc \makeindex \makemodindex \renewcommand
    \indexname \appendix \protect \indexiv \mbox \textasciitilde
    \platform \seeurl \leftmargin \labelwidth \localmoduletable
    \LaTeX \copyright \memberline \backslash \pi \centerline
    \caption \vspace \textwidth \menuselection \textless
    \makevar \csimplemacro \menuselection \bfcode \sub \release
    \email \kwindex \refexmodindex \filenq \e \menuselection
    \exindex \linev \newsgroup \verbatim \setshortversion
    \author \authoraddress \paragraph \subparagraph \cmemberline
    \textbar \C \seelink
"""
62
def matchclose(c_lineno, c_symbol, openers, pairmap):
    """Pop the most recent opener and report it if it does not fit the closer.

    c_lineno and c_symbol describe the closing delimiter just encountered.
    openers is the stack (a list) of pending (lineno, symbol) openers; its
    last entry is removed.  pairmap maps a closing symbol to the opening
    symbols it may pair with; a closer that is not in pairmap (such as the
    name of an \\end block) must match an opener with the identical symbol.

    Problems are printed to standard output.  Nothing is returned.
    """
    # A closer with nothing pending has no possible partner.
    if not openers:
        print("\nDelimiter mismatch.  On line %d, encountered closing '%s' without corresponding open" % (c_lineno, c_symbol))
        return

    o_lineno, o_symbol = openers.pop()
    acceptable = pairmap.get(c_symbol, [c_symbol])
    if o_symbol not in acceptable:
        print("\nOpener '%s' on line %d was not closed before encountering '%s' on line %d" % (o_symbol, o_lineno, c_symbol, c_lineno))
73
def checkit(source, opts, morecmds=()):
    """Check the LaTeX formatting in a sequence of lines.

    Source is an iterable of text lines, for example an open file.

    Opts is a mapping of options to option values if any:
        -m          munge parentheses and brackets
        -d          delimiters only checking
        -v          verbose trace of delimiter matching
        -s lineno:  linenumber to start scan (default is 1).

    Morecmds is a sequence of LaTeX commands (without backslashes) that
    are to be considered valid in the scan.

    Every finding is printed to standard output.  The function always
    returns 0.
    """

    texcmd = re.compile(r'\\[A-Za-z]+')
    falsetexcmd = re.compile(r'\/([A-Za-z]+)') # Mismarked with forward slash

    delimiters_only = '-d' in opts
    verbose = '-v' in opts

    # The table of valid commands is consulted only by the LaTeX specific
    # checks, so it is not built for a delimiters-only scan.
    validcmds = set()
    if not delimiters_only:
        validcmds.update(cmdstr.split())
        for cmd in morecmds:
            validcmds.add('\\' + cmd)

    if '-m' in opts:
        pairmap = {']':'[(', ')':'(['}      # Munged openers
    else:
        pairmap = {']':'[', ')':'('}        # Normal opener for a given closer
    openpunct = set('([')                   # Set of valid openers

    delimiters = re.compile(r'\\(begin|end){([_a-zA-Z]+)}|([()\[\]])')
    braces = re.compile(r'({)|(})')
    # Letters only: a range written as A-z would also admit [ \ ] ^ _ and `.
    doubledwords = re.compile(r'(\b[A-Za-z]+\b) \b\1\b')
    spacingmarkup = re.compile(r'\\(ABC|ASCII|C|Cpp|EOF|infinity|NULL|plusminus|POSIX|UNIX)\s')

    openers = []                            # Stack of pending open delimiters
    bracestack = []                         # Stack of pending open braces

    tablestart = re.compile(r'\\begin{(?:long)?table([iv]+)}')
    tableline = re.compile(r'\\line([iv]+){')
    tableend = re.compile(r'\\end{(?:long)?table([iv]+)}')
    tablelevel = ''
    tablestartline = 0

    startline = int(opts.get('-s', '1'))
    lineno = 0                              # Stays 0 when no line is scanned

    for offset, line in enumerate(islice(source, startline-1, None)):
        lineno = startline + offset
        line = line.rstrip()

        # Check balancing of open/close parenthesis, brackets, and begin/end blocks
        for begend, name, punct in delimiters.findall(line):
            if verbose:
                # No newline here: the stack state printed below continues
                # the same output line.
                sys.stdout.write('%s | %s %s %s ' % (lineno, begend, name, punct))
            if begend == 'begin' and not delimiters_only:
                openers.append((lineno, name))
            elif punct in openpunct:
                openers.append((lineno, punct))
            elif begend == 'end' and not delimiters_only:
                matchclose(lineno, name, openers, pairmap)
            elif punct in pairmap:
                matchclose(lineno, punct, openers, pairmap)
            if verbose:
                print('   -->  %s' % (openers,))

        # Balance opening and closing braces
        for lbrace, rbrace in braces.findall(line):
            if lbrace == '{':
                bracestack.append(lineno)
            if rbrace == '}':
                try:
                    bracestack.pop()
                except IndexError:
                    print(r'Warning, unmatched } on line %s.' % (lineno,))

        # Optionally, skip LaTeX specific checks
        if delimiters_only:
            continue

        # Warn whenever forward slashes encountered with a LaTeX command.
        # Lines mentioning urls or /rfc822 are skipped to avoid false positives.
        if not ('822' in line or '.html' in line):
            for cmd in falsetexcmd.findall(line):
                if '\\' + cmd in validcmds:
                    print('Warning, forward slash used on line %d with cmd: /%s' % (lineno, cmd))

        # Check for markup requiring {} for correct spacing
        for cmd in spacingmarkup.findall(line):
            print(r'Warning, \%s should be written as \%s{} on line %d' % (cmd, cmd, lineno))

        # Validate commands.  A \newcommand{\name} on the line makes \name
        # valid from this line onward.
        nc = line.find(r'\newcommand')
        if nc != -1:
            start = line.find('{', nc)
            end = line.find('}', start)
            # Register the name only when both braces are present; otherwise
            # the slice would pick up an unrelated piece of the line.
            if start != -1 and end != -1:
                validcmds.add(line[start+1:end])
        for cmd in texcmd.findall(line):
            if cmd not in validcmds:
                # cmd already carries its leading backslash.
                print('Warning, unknown tex cmd on line %d: %s' % (lineno, cmd))

        # Check table levels (make sure lineii only inside tableii)
        m = tablestart.search(line)
        if m:
            tablelevel = m.group(1)
            tablestartline = lineno
        m = tableline.search(line)
        if m and m.group(1) != tablelevel:
            print(r'Warning, \line%s on line %d does not match \table%s on line %d' % (m.group(1), lineno, tablelevel, tablestartline))
        if tableend.search(line):
            tablelevel = ''

        # Style guide warnings
        if 'e.g.' in line or 'i.e.' in line:
            print(r'Style warning, avoid use of i.e or e.g. on line %d' % (lineno,))

        for dw in doubledwords.findall(line):
            print(r'Doubled word warning.  "%s" on line %d' % (dw, lineno))

    # Anything still on a stack was opened but never closed.
    lastline = lineno
    for lineno, symbol in openers:
        print("Unmatched open delimiter '%s' on line %d" % (symbol, lineno))
    for lineno in bracestack:
        print("Unmatched { on line %d" % (lineno,))
    print('Done checking %d lines.' % (lastline,))
    return 0
195
def main(args=None):
    """Command line entry point: parse the options and check each file.

    Args is the list of command line arguments without the program name;
    when it is None, sys.argv[1:] is used.

    Returns 0 after printing the help text (-h given, or no arguments at
    all), 1 when no file was named or no file matched the given wildcard
    patterns, 2 as soon as a file cannot be opened, and otherwise the
    largest value returned by checkit() over the files checked.

    Invalid options raise getopt.GetoptError, as raised by getopt.getopt().
    """
    if args is None:
        args = sys.argv[1:]
    optitems, arglist = getopt.getopt(args, "k:mdhs:v")
    opts = dict(optitems)
    if '-h' in opts or not args:
        print(__doc__)
        return 0

    if len(arglist) < 1:
        print('Please specify a file to be checked')
        return 1

    # Expand wildcards into a separate list.  Splicing the matches into
    # arglist while iterating over it would skip the argument that follows
    # a pattern with no matches.
    filenames = []
    for filespec in arglist:
        if '*' in filespec or '?' in filespec:
            filenames.extend(glob.glob(filespec))
        else:
            filenames.append(filespec)

    if not filenames:
        print('No files match the given file specifications')
        return 1

    # -k may be given several times, so collect from the option pairs
    # rather than from the opts dict, which keeps only the last value.
    morecmds = [v for k,v in optitems if k=='-k']
    err = []

    for filename in filenames:
        print('=' * 30)
        print('Checking %s' % (filename,))
        try:
            f = open(filename)
        except IOError:
            # Name the file that actually failed, not the first argument.
            print('Cannot open file %s.' % (filename,))
            return 2

        try:
            err.append(checkit(f, opts, morecmds))
        finally:
            f.close()

    return max(err)
231
# Run as a script: the value returned by main() becomes the exit status.
if __name__ == '__main__':
    status = main()
    sys.exit(status)
234