1324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruverimport antlr3
2324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruverimport testbase
3324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruverimport unittest
4324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruverimport os
5324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruverimport sys
6324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruverfrom cStringIO import StringIO
7324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruverimport difflib
8324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruverimport textwrap
9324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver
class t012lexerXML(testbase.ANTLRTest):
    """Lexer tests driven by the grammar file ``t012lexerXMLLexer.g``.

    ``setUp`` compiles the grammar; each test then obtains a lexer through
    ``self.getLexer`` and pulls tokens from it until EOF.  ``testValid``
    checks the text the lexer accumulates in ``lexer.outbuf`` against a
    recorded ``.output`` file; the ``testMalformedInput*`` tests check the
    exception raised for a broken document.

    Test methods deliberately carry ``#`` comments instead of docstrings so
    that unittest keeps reporting them by method name.
    """

    def setUp(self):
        self.compileGrammar('t012lexerXMLLexer.g')


    def lexerClass(self, base):
        """Return a subclass of *base* with error reporting turned strict.

        NOTE(review): presumably called by ``testbase.ANTLRTest`` with the
        generated lexer class when a lexer is built -- confirm in testbase.
        """
        class TLexer(base):
            def emitErrorMessage(self, msg):
                # report errors to /dev/null
                pass

            def reportError(self, re):
                # no error recovery yet, just crash!
                raise re

        return TLexer


    def _readUtf8(self, path):
        """Return the content of *path* decoded as UTF-8.

        The file handle is closed explicitly (try/finally rather than
        ``with`` so no assumption is made about the interpreter version).
        """
        handle = open(path)
        try:
            return unicode(handle.read(), 'utf-8')
        finally:
            handle.close()


    def _assertLexerFails(self, text, excClass, unexpectedType,
                          charPositionInLine, line):
        """Lex *text* and verify that it fails the expected way.

        Tokens are consumed until ``antlr3.EOF``.  The test fails if no
        exception of type *excClass* is raised, or if the raised exception's
        ``unexpectedType``, ``charPositionInLine`` or ``line`` attribute
        differs from the expected value.  Any other exception type
        propagates unchanged.
        """
        lexer = self.getLexer(antlr3.StringStream(text))

        try:
            while lexer.nextToken().type != antlr3.EOF:
                pass
        except excClass:
            # sys.exc_info() yields (type, value, traceback); fetching the
            # value this way avoids version-specific except-binding syntax.
            exc = sys.exc_info()[1]
        else:
            self.fail(
                "%s was not raised for malformed input" % excClass.__name__
                )

        # TestCase assertions instead of bare ``assert`` statements, which
        # would be stripped (and the test silently pass) under ``python -O``.
        self.assertEqual(exc.unexpectedType, unexpectedType)
        self.assertEqual(exc.charPositionInLine, charPositionInLine)
        self.assertEqual(exc.line, line)


    def testValid(self):
        # The fixture files sit next to this module and share its base name.
        basePath = os.path.splitext(__file__)[0]

        stream = antlr3.StringStream(self._readUtf8(basePath + '.input'))
        lexer = self.getLexer(stream)

        # Drain the lexer; only the side effect on lexer.outbuf matters.
        while lexer.nextToken().type != self.lexerModule.EOF:
            pass

        output = unicode(lexer.outbuf.getvalue(), 'utf-8')
        testOutput = self._readUtf8(basePath + '.output')

        if output != testOutput:
            # Show a line-by-line diff, escaped so the failure message is
            # plain ASCII regardless of what the fixture contains.
            differ = difflib.Differ()
            delta = differ.compare(
                output.splitlines(True), testOutput.splitlines(True)
                )
            self.fail(
                ''.join([l.encode('ascii', 'backslashreplace') for l in delta])
                )


    def testMalformedInput1(self):
        # Stray attribute name without a value: the '>' closing the start
        # tag on line 2 (column 11) is the offending character.
        text = textwrap.dedent("""\
        <?xml version='1.0'?>
        <document d>
        </document>
        """)

        self._assertLexerFails(
            text, antlr3.NoViableAltException,
            unexpectedType='>', charPositionInLine=11, line=2
            )


    def testMalformedInput2(self):
        # Misspelled declaration target: the 't' of '<?tml' on line 1
        # (column 2) is the offending character.
        text = textwrap.dedent("""\
        <?tml version='1.0'?>
        <document>
        </document>
        """)

        self._assertLexerFails(
            text, antlr3.MismatchedSetException,
            unexpectedType='t', charPositionInLine=2, line=1
            )


    def testMalformedInput3(self):
        # Space inside the tag name: the 'a' of 'attr' on line 2
        # (column 11) is the offending character.
        text = textwrap.dedent("""\
        <?xml version='1.0'?>
        <docu ment attr="foo">
        </document>
        """)

        self._assertLexerFails(
            text, antlr3.NoViableAltException,
            unexpectedType='a', charPositionInLine=11, line=2
            )
124324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver
125324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
128324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver
129324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver
130324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## # run an infinite loop with randomly mangled input
131324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## while True:
132324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##     print "ping"
133324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver
134324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##     input = """\
135324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## <?xml version='1.0'?>
136324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## <!DOCTYPE component [
137324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## <!ELEMENT component (PCDATA|sub)*>
138324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## <!ATTLIST component
139324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##           attr CDATA #IMPLIED
140324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##           attr2 CDATA #IMPLIED
141324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## >
142324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## <!ELMENT sub EMPTY>
143324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver
144324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## ]>
145324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## <component attr="val'ue" attr2='val"ue'>
146324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## <!-- This is a comment -->
147324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## Text
148324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## <![CDATA[huhu]]>
149324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## &amp;
150324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## &lt;
151324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## <?xtal cursor='11'?>
152324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## <sub/>
153324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## <sub></sub>
154324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## </component>
155324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver## """
156324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver
157324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##     import random
158324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##     input = list(input) # make it mutable
159324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##     for _ in range(3):
160324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##         p1 = random.randrange(len(input))
161324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##         p2 = random.randrange(len(input))
162324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver
163324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##         c1 = input[p1]
164324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##         input[p1] = input[p2]
165324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##         input[p2] = c1
166324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##     input = ''.join(input) # back to string
167324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver
168324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##     stream = antlr3.StringStream(input)
169324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##     lexer = Lexer(stream)
170324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver
171324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##     try:
172324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##         while True:
173324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##             token = lexer.nextToken()
174324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##             if token.type == EOF:
175324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##                 break
176324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver
177324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##     except antlr3.RecognitionException, exc:
178324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##         print exc
179324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##         for l in input.splitlines()[0:exc.line]:
180324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##             print l
181324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##         print ' '*exc.charPositionInLine + '^'
182324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver
183324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##     except BaseException, exc:
184324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##         print '\n'.join(['%02d: %s' % (idx+1, l) for idx, l in enumerate(input.splitlines())])
185324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##         print "%s at %d:%d" % (exc, stream.line, stream.charPositionInLine)
186324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##         print
187324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver
188324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver##         raise
189324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver
190