'use strict';

var UNICODE = require('../common/unicode');

//Aliases
var $ = UNICODE.CODE_POINTS;

//Utils

//OPTIMIZATION: keep these helpers inside this module. V8 Crankshaft will not inline them
//if they live in another module, because of the context switch.
//Always run an inlining check before touching them ('node --trace-inlining').

//NOTE: true for code points in the surrogate range (U+D800..U+DFFF) and for values above U+10FFFF.
function isReservedCodePoint(cp) {
    return cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF);
}

//NOTE: true when cp1 is in U+D800..U+DBFF and cp2 is in U+DC00..U+DFFF.
function isSurrogatePair(cp1, cp2) {
    var isLead = cp1 >= 0xD800 && cp1 <= 0xDBFF;

    return isLead && cp2 >= 0xDC00 && cp2 <= 0xDFFF;
}

//NOTE: combines a surrogate pair into a single code point (0x2400 === 0x10000 - 0xDC00).
function getSurrogatePairCodePoint(cp1, cp2) {
    return 0x2400 + cp2 + (cp1 - 0xD800) * 0x400;
}

//Preprocessor
//NOTE: HTML input preprocessing
//(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream)

/**
 * Cursor over an HTML string that hands out preprocessed code points one at a time.
 *
 * @param {String} html - initial markup to read.
 */
var Preprocessor = module.exports = function (html) {
    this.write(html);

    //NOTE: a single leading U+FEFF BYTE ORDER MARK must be ignored when present, so in that case
    //the cursor starts on the mark itself and the first advance steps past it.
    var startsWithBom = this.html.charCodeAt(0) === $.BOM;

    this.pos = startsWithBom ? 0 : -1;

    this.gapStack = [];
    this.lastGapPos = -1;
    this.skipNextNewLine = false;
};

/**
 * Adds markup to the buffer. The first chunk becomes the buffer; any later chunk is
 * inserted right after the current position, ahead of the not-yet-consumed remainder.
 *
 * @param {String} html - markup to add.
 */
Preprocessor.prototype.write = function (html) {
    if (!this.html)
        this.html = html;

    else {
        var insertionIdx = this.pos + 1,
            consumed = this.html.substring(0, insertionIdx),
            pending = this.html.substring(insertionIdx, this.html.length);

        this.html = consumed + html + pending;
    }

    this.lastCharPos = this.html.length - 1;
};

/**
 * Moves the cursor one step forward and returns the code point found there.
 *
 * @returns {Number} $.EOF past the end of the buffer, $.LINE_FEED for a carriage return,
 * otherwise the (possibly surrogate-combined or replaced) code point.
 */
Preprocessor.prototype.advanceAndPeekCodePoint = function () {
    this.pos++;

    if (this.pos > this.lastCharPos)
        return $.EOF;

    var cp = this.html.charCodeAt(this.pos);

    //NOTE: a U+000A LINE FEED (LF) that immediately follows a U+000D CARRIAGE RETURN (CR) must be ignored:
    //record it as a gap and hand out whatever comes after it.
    if (cp === $.LINE_FEED && this.skipNextNewLine) {
        this.skipNextNewLine = false;
        this._addGap();

        return this.advanceAndPeekCodePoint();
    }

    //NOTE: every U+000D CARRIAGE RETURN (CR) must be converted to U+000A LINE FEED (LF).
    //The flag is raised only for CR, so any other character clears it.
    this.skipNextNewLine = cp === $.CARRIAGE_RETURN;

    if (this.skipNextNewLine)
        return $.LINE_FEED;

    //OPTIMIZATION: code units below 0xD800 cover the most common HTML input (e.g. ASCII),
    //so only the high range pays for the surrogate/reserved checks.
    if (cp >= 0xD800)
        return this._processHighRangeCodePoint(cp);

    return cp;
};

/**
 * Handles a code unit at or above 0xD800: consumes the following code unit when the two
 * form a surrogate pair, and substitutes $.REPLACEMENT_CHARACTER for reserved values.
 *
 * @param {Number} cp - code unit at the current position.
 * @returns {Number} resulting code point.
 */
Preprocessor.prototype._processHighRangeCodePoint = function (cp) {
    var result = cp,
        hasNextChar = this.pos !== this.lastCharPos;

    //NOTE: try to peek a surrogate pair
    if (hasNextChar) {
        var trailCp = this.html.charCodeAt(this.pos + 1);

        if (isSurrogatePair(cp, trailCp)) {
            //NOTE: it is a pair. Step onto its second half and recalculate the code point.
            this.pos++;
            result = getSurrogatePairCodePoint(cp, trailCp);

            //NOTE: the second half is a gap that retreat has to jump over
            this._addGap();
        }
    }

    return isReservedCodePoint(result) ? $.REPLACEMENT_CHARACTER : result;
};

/**
 * Marks the current position as a gap, saving the previously marked gap on the stack.
 */
Preprocessor.prototype._addGap = function () {
    var previousGapPos = this.lastGapPos;

    this.lastGapPos = this.pos;
    this.gapStack.push(previousGapPos);
};

/**
 * Moves the cursor one step back. When the cursor sits on the most recent gap, that gap
 * is dropped (the previous one is restored from the stack) and the cursor moves back two
 * positions instead of one.
 */
Preprocessor.prototype.retreat = function () {
    var step = 1;

    if (this.pos === this.lastGapPos) {
        this.lastGapPos = this.gapStack.pop();
        step = 2;
    }

    this.pos -= step;
};