var assert = require('assert'),
    fs = require('fs'),
    path = require('path'),
    Tokenizer = require('../../lib/tokenization/tokenizer');

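//Tokenizes the given HTML and collects the emitted tokens in the html5lib
//test suite format, e.g. ['StartTag', 'div', {id: 'test'}].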
function tokenize(html, initialState, lastStartTag) {
    var tokenizer = new Tokenizer(html),
        nextToken = null,
        out = [];

    tokenizer.state = initialState;

    if (lastStartTag)
        tokenizer.lastStartTagName = lastStartTag;

    do {
        nextToken = tokenizer.getNextToken();

        //NOTE: append the current token to the output sequence in the html5lib test suite format.
        switch (nextToken.type) {
            case Tokenizer.CHARACTER_TOKEN:
            case Tokenizer.NULL_CHARACTER_TOKEN:
            case Tokenizer.WHITESPACE_CHARACTER_TOKEN:
                out.push(['Character', nextToken.chars]);
                break;

            case Tokenizer.START_TAG_TOKEN:
                var reformattedAttrs = {};

                nextToken.attrs.forEach(function (attr) {
                    reformattedAttrs[attr.name] = attr.value;
                });

                var startTagEntry = [
                    'StartTag',
                    nextToken.tagName,
                    reformattedAttrs
                ];

                if (nextToken.selfClosing)
                    startTagEntry.push(true);

                out.push(startTagEntry);
                break;

            case Tokenizer.END_TAG_TOKEN:
                out.push(['EndTag', nextToken.tagName]);
                break;

            case Tokenizer.COMMENT_TOKEN:
                out.push(['Comment', nextToken.data]);
                break;

            case Tokenizer.DOCTYPE_TOKEN:
                out.push([
                    'DOCTYPE',
                    nextToken.name,
                    nextToken.publicId,
                    nextToken.systemId,
                    !nextToken.forceQuirks
                ]);
                break;
        }
    } while (nextToken.type !== Tokenizer.EOF_TOKEN);

    return concatCharacterTokens(out);
}

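//Replaces '\uXXXX' escape sequences in a string with the characters they encode.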
function unicodeUnescape(str) {
    return str.replace(/\\u([0-9a-f]{4})/gi, function (match, chCodeStr) {
        return String.fromCharCode(parseInt(chCodeStr, 16));
    });
}

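//Tests marked as 'doubleEscaped' store their input and expected output with
//'\uXXXX' escapes. Unescape both before the test is run.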
function unescapeDescrIO(testDescr) {
    testDescr.input = unicodeUnescape(testDescr.input);

    testDescr.output.forEach(function (tokenEntry) {
        if (tokenEntry === 'ParseError')
            return;

        //NOTE: unescape the token tagName (for StartTag and EndTag tokens), comment data
        //(for Comment tokens) and character data (for Character tokens).
        tokenEntry[1] = unicodeUnescape(tokenEntry[1]);

        //NOTE: unescape StartTag token attributes (if we have them).
        if (tokenEntry[0] === 'StartTag' && tokenEntry.length > 2) {
            var attrs = tokenEntry[2];

            Object.keys(attrs).forEach(function (attrName) {
                var attrVal = attrs[attrName];

                delete attrs[attrName];
                attrs[unicodeUnescape(attrName)] = unicodeUnescape(attrVal);
            });
        }
    });
}

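//The tokenizer may emit a run of adjacent character tokens where the test
//data expects a single entry, so merge consecutive 'Character' entries.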
function concatCharacterTokens(tokenEntries) {
    var result = [];

    tokenEntries.forEach(function (tokenEntry) {
        if (tokenEntry[0] === 'Character') {
            var lastEntry = result[result.length - 1];

            if (lastEntry && lastEntry[0] === 'Character') {
                lastEntry[1] += tokenEntry[1];
                return;
            }
        }

        result.push(tokenEntry);
    });

    return result;
}

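//Maps an html5lib state name (e.g. 'Data state') to the corresponding
//tokenizer state constant name (e.g. 'DATA_STATE').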
function getTokenizerSuitableStateName(testDataStateName) {
    return testDataStateName.toUpperCase().replace(/\s/g, '_');
}

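//Loads every '.test' file from the tokenization data directory and expands
//each test description into one entry per requested initial state.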
function loadTests() {
    var dataDirPath = path.join(__dirname, '../data/tokenization'),
        testSetFileNames = fs.readdirSync(dataDirPath),
        testIdx = 0,
        tests = [];

    testSetFileNames.forEach(function (fileName) {
        var filePath = path.join(dataDirPath, fileName),
            testSetJson = fs.readFileSync(filePath).toString(),
            testSet = JSON.parse(testSetJson),
            testDescrs = testSet.tests,
            setName = fileName.replace('.test', '');

        testDescrs.forEach(function (descr) {
            if (!descr.initialStates)
                descr.initialStates = ['Data state'];

            if (descr.doubleEscaped)
                unescapeDescrIO(descr);

            var expected = [];

            descr.output.forEach(function (tokenEntry) {
                if (tokenEntry !== 'ParseError')
                    expected.push(tokenEntry);
            });

            descr.initialStates.forEach(function (initialState) {
                tests.push({
                    idx: ++testIdx,
                    setName: setName,
                    name: descr.description,
                    input: descr.input,
                    expected: concatCharacterTokens(expected),
                    initialState: getTokenizerSuitableStateName(initialState),
                    lastStartTag: descr.lastStartTag
                });
            });
        });
    });

    return tests;
}

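//Builds a unique, human-readable name for the exported test.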
function getFullTestName(test) {
    return ['Tokenizer - ', test.idx, '.', test.setName, ' - ', test.name,
        ' - Initial state: ', test.initialState].join('');
}

//Here we go...
loadTests().forEach(function (test) {
    exports[getFullTestName(test)] = function () {
        var out = tokenize(test.input, test.initialState, test.lastStartTag);

        assert.deepEqual(out, test.expected);
    };
});

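//With the 'locationInfo' option enabled, every token should carry the start
//and end offsets of the markup that produced it. Each test case below is
//assembled from chunks, one token per chunk, so a token's location should
//map back exactly to the corresponding chunk.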
exports['Options - locationInfo'] = function () {
    var testCases = [
        {
            initialMode: Tokenizer.MODE.DATA,
            lastStartTagName: '',
            htmlChunks: [
                '\r\n', '<!DOCTYPE html>', '\n',
                '<!-- Test -->', '\n',
                '<head>',
                '\n   ', '<meta charset="utf-8">', '<title>', '   ', 'node.js', '\u0000', '</title>', '\n',
                '</head>', '\n',
                '<body id="front">', '\n',
                '<div id="intro">',
                '\n   ', '<p>',
                '\n       ', 'Node.js', ' ', 'is', ' ', 'a',
                '\n       ', 'platform', ' ', 'built', ' ', 'on',
                '\n       ', '<a href="http://code.google.com/p/v8/">',
                '\n       ', 'Chrome\'s', ' ', 'JavaScript', ' ', 'runtime',
                '\n       ', '</a>', '\n',
                '</div>',
                '<body>'
            ]
        },
        {
            initialMode: Tokenizer.MODE.RCDATA,
            lastStartTagName: 'title',
            htmlChunks: [
                '<div>Test',
                ' \n   ', 'hey', ' ', 'ya!', '</title>', '<!--Yo-->'
            ]
        },
        {
            initialMode: Tokenizer.MODE.RAWTEXT,
            lastStartTagName: 'style',
            htmlChunks: [
                '.header{', ' \n   ', 'color:red;', '\n', '}', '</style>', 'Some', ' ', 'text'
            ]
        },
        {
            initialMode: Tokenizer.MODE.SCRIPT_DATA,
            lastStartTagName: 'script',
            htmlChunks: [
                'var', ' ', 'a=c', ' ', '-', ' ', 'd;', '\n', 'a<--d;', '</script>', '<div>'
            ]
        },
        {
            initialMode: Tokenizer.MODE.PLAINTEXT,
            lastStartTagName: 'plaintext',
            htmlChunks: [
                'Text', ' \n', 'Test</plaintext><div>'
            ]
        }
    ];

    testCases.forEach(function (testCase) {
        var html = testCase.htmlChunks.join(''),
            tokenizer = new Tokenizer(html, {locationInfo: true});

        tokenizer.state = testCase.initialMode;
        tokenizer.lastStartTagName = testCase.lastStartTagName;

        //NOTE: tokens should come out in chunk order, so the location of the
        //i-th token should delimit exactly the i-th chunk of the source HTML.
        for (var token = tokenizer.getNextToken(), i = 0;
             token.type !== Tokenizer.EOF_TOKEN;
             token = tokenizer.getNextToken(), i++) {
            var chunk = html.substring(token.location.start, token.location.end);

            assert.strictEqual(chunk, testCase.htmlChunks[i]);
        }
    });
};