var assert = require('assert'),
    fs = require('fs'),
    path = require('path'),
    Tokenizer = require('../../lib/tokenization/tokenizer');

//Runs the tokenizer over `html` (starting in `initialState`, optionally with a
//`lastStartTag` for RCDATA/RAWTEXT/script modes) and returns the emitted tokens
//in the html5lib test suite output format, with adjacent character tokens merged.
function tokenize(html, initialState, lastStartTag) {
    var tokenizer = new Tokenizer(html),
        nextToken = null,
        out = [];

    tokenizer.state = initialState;

    if (lastStartTag)
        tokenizer.lastStartTagName = lastStartTag;

    do {
        nextToken = tokenizer.getNextToken();

        //NOTE: append current token to the output sequence in html5lib test suite compatible format
        switch (nextToken.type) {
            case Tokenizer.CHARACTER_TOKEN:
            case Tokenizer.NULL_CHARACTER_TOKEN:
            case Tokenizer.WHITESPACE_CHARACTER_TOKEN:
                out.push(['Character', nextToken.chars]);
                break;

            case Tokenizer.START_TAG_TOKEN:
                //NOTE: the suite format represents attributes as a name->value
                //dictionary rather than the tokenizer's list of {name, value} pairs.
                var reformatedAttrs = {};

                nextToken.attrs.forEach(function (attr) {
                    reformatedAttrs[attr.name] = attr.value;
                });

                var startTagEntry = [
                    'StartTag',
                    nextToken.tagName,
                    reformatedAttrs
                ];

                if (nextToken.selfClosing)
                    startTagEntry.push(true);

                out.push(startTagEntry);
                break;

            case Tokenizer.END_TAG_TOKEN:
                out.push(['EndTag', nextToken.tagName]);
                break;

            case Tokenizer.COMMENT_TOKEN:
                out.push(['Comment', nextToken.data]);
                break;

            case Tokenizer.DOCTYPE_TOKEN:
                //NOTE: the suite stores "correctness" (i.e. NOT force-quirks), hence the negation.
                out.push([
                    'DOCTYPE',
                    nextToken.name,
                    nextToken.publicId,
                    nextToken.systemId,
                    !nextToken.forceQuirks
                ]);
                break;
        }
    } while (nextToken.type !== Tokenizer.EOF_TOKEN);

    return concatCharacterTokens(out);
}

//Replaces literal '\uXXXX' escape sequences in `str` with the characters they denote.
//Used for "doubleEscaped" test descriptors, whose JSON payload is escaped twice.
function unicodeUnescape(str) {
    return str.replace(/\\u(\w{4})/gi, function (match, chCodeStr) {
        return String.fromCharCode(parseInt(chCodeStr, 16));
    });
}

//Unescapes a double-escaped test descriptor in place: its input string and
//every expected output token entry (including attribute names/values).
function unescapeDescrIO(testDescr) {
    testDescr.input = unicodeUnescape(testDescr.input);

    testDescr.output.forEach(function (tokenEntry) {
        if (tokenEntry === 'ParseError')
            return;

        //NOTE: unescape token tagName (for StartTag and EndTag tokens), comment data (for Comment token),
        //character token data (for Character token).
        tokenEntry[1] = unicodeUnescape(tokenEntry[1]);

        //NOTE: unescape token attributes (if we have them).
        //FIX: iterate the attribute dictionary at tokenEntry[2], not the token entry
        //array itself - iterating Object.keys(tokenEntry) walked array indices
        //('0','1','2'), deleted the entry's elements and corrupted the token.
        if (tokenEntry.length > 2) {
            Object.keys(tokenEntry[2]).forEach(function (attrName) {
                var attrVal = tokenEntry[2][attrName];

                delete tokenEntry[2][attrName];
                tokenEntry[2][unicodeUnescape(attrName)] = unicodeUnescape(attrVal);
            });
        }
    });
}

//Merges runs of consecutive 'Character' token entries into single entries,
//as required by the html5lib expected-output format.
function concatCharacterTokens(tokenEntries) {
    var result = [];

    tokenEntries.forEach(function (tokenEntry) {
        if (tokenEntry[0] === 'Character') {
            var lastEntry = result[result.length - 1];

            if (lastEntry && lastEntry[0] === 'Character') {
                lastEntry[1] += tokenEntry[1];
                return;
            }
        }

        result.push(tokenEntry);
    });

    return result;
}

//Maps an html5lib state name (e.g. 'RCDATA state') to the tokenizer's
//constant name (e.g. 'RCDATA_STATE').
function getTokenizerSuitableStateName(testDataStateName) {
    return testDataStateName.toUpperCase().replace(/\s/g, '_');
}

//Loads every *.test JSON file from ../data/tokenization and flattens the
//descriptors into one test record per (descriptor, initial state) pair.
function loadTests() {
    var dataDirPath = path.join(__dirname, '../data/tokenization'),
        testSetFileNames = fs.readdirSync(dataDirPath),
        testIdx = 0,
        tests = [];

    testSetFileNames.forEach(function (fileName) {
        var filePath = path.join(dataDirPath, fileName),
            testSetJson = fs.readFileSync(filePath).toString(),
            testSet = JSON.parse(testSetJson),
            testDescrs = testSet.tests,
            setName = fileName.replace('.test', '');

        testDescrs.forEach(function (descr) {
            if (!descr.initialStates)
                descr.initialStates = ['Data state'];

            if (descr.doubleEscaped)
                unescapeDescrIO(descr);

            //NOTE: parse errors are reported separately by html5lib; this harness
            //only checks the token stream, so 'ParseError' entries are dropped.
            var expected = [];

            descr.output.forEach(function (tokenEntry) {
                if (tokenEntry !== 'ParseError')
                    expected.push(tokenEntry);
            });

            descr.initialStates.forEach(function (initialState) {
                tests.push({
                    idx: ++testIdx,
                    setName: setName,
                    name: descr.description,
                    input: descr.input,
                    expected: concatCharacterTokens(expected),
                    initialState: getTokenizerSuitableStateName(initialState),
                    lastStartTag: descr.lastStartTag
                });
            });
        });
    });

    return tests;
}

//Builds a unique, human-readable export name for a test record.
function getFullTestName(test) {
    return ['Tokenizer - ' +
            test.idx, '.', test.setName, ' - ', test.name, ' - Initial state: ', test.initialState].join('');
}

//Here we go..
loadTests().forEach(function (test) {
    exports[getFullTestName(test)] = function () {
        var out = tokenize(test.input, test.initialState, test.lastStartTag);

        assert.deepEqual(out, test.expected);
    };
});


exports['Options - locationInfo'] = function () {
    var testCases = [
        {
            initialMode: Tokenizer.MODE.DATA,
            lastStartTagName: '',
            htmlChunks: [
                '\r\n', '<!DOCTYPE html>', '\n',
                '<!-- Test -->', '\n',
                '<head>',
                '\n   ', '<meta charset="utf-8">', '<title>', '   ', 'node.js', '\u0000', '</title>', '\n',
                '</head>', '\n',
                '<body id="front">', '\n',
                '<div id="intro">',
                '\n   ', '<p>',
                '\n       ', 'Node.js', ' ', 'is', ' ', 'a',
                '\n       ', 'platform', ' ', 'built', ' ', 'on',
                '\n       ', '<a href="http://code.google.com/p/v8/">',
                '\n       ', 'Chrome\'s', ' ', 'JavaScript', ' ', 'runtime',
                '\n       ', '</a>', '\n',
                '</div>',
                '<body>'
            ]
        },
        {
            initialMode: Tokenizer.MODE.RCDATA,
            lastStartTagName: 'title',
            htmlChunks: [
                '<div>Test', ' \n   ', 'hey', ' ', 'ya!', '</title>', '<!--Yo-->'
            ]
        },
        {
            initialMode: Tokenizer.MODE.RAWTEXT,
            lastStartTagName: 'style',
            htmlChunks: [
                '.header{', ' \n   ', 'color:red;', '\n', '}', '</style>', 'Some', ' ', 'text'
            ]
        },
        {
            initialMode: Tokenizer.MODE.SCRIPT_DATA,
            lastStartTagName: 'script',
            htmlChunks: [
                'var', ' ', 'a=c', ' ', '-', ' ', 'd;', '\n', 'a<--d;', '</script>', '<div>'
            ]
        },
        {
            initialMode: Tokenizer.MODE.PLAINTEXT,
            lastStartTagName: 'plaintext',
            htmlChunks: [
                'Text', ' \n', 'Test</plaintext><div>'
            ]
        }
    ];

    testCases.forEach(function (testCase) {
        var html = testCase.htmlChunks.join(''),
            tokenizer = new Tokenizer(html, {locationInfo: true});

        tokenizer.state = testCase.initialMode;
        tokenizer.lastStartTagName = testCase.lastStartTagName;

        //NOTE: each emitted token's [start, end) location should slice the source
        //back into exactly the chunk it was built from.
        for (var token = tokenizer.getNextToken(), i = 0; token.type !== Tokenizer.EOF_TOKEN;) {
            var chunk = html.substring(token.location.start, token.location.end);

            assert.strictEqual(chunk, testCase.htmlChunks[i]);

            token = tokenizer.getNextToken();
            i++;
        }
    });
};