1'use strict';
2
3var UNICODE = require('../common/unicode');
4
5//Aliases
6var $ = UNICODE.CODE_POINTS;
7
8//Utils
9
//OPTIMIZATION: these utility functions should not be moved out of this module. V8 Crankshaft will not inline
//these functions if they are located in another module, due to the context switch.
//Always perform an inlining check before modifying these functions ('node --trace-inlining').
//Returns true if `cp` is a UTF-16 surrogate (U+D800..U+DFFF) or lies above
//the last valid Unicode code point (U+10FFFF); returns false otherwise.
function isReservedCodePoint(cp) {
    if (cp > 0x10FFFF)
        return true;

    return 0xD800 <= cp && cp <= 0xDFFF;
}
16
//Returns true if `cp1` is a high (lead) surrogate, U+D800..U+DBFF, and `cp2` is
//a low (trail) surrogate, U+DC00..U+DFFF; returns false otherwise.
function isSurrogatePair(cp1, cp2) {
    var isLead = 0xD800 <= cp1 && cp1 <= 0xDBFF,
        isTrail = 0xDC00 <= cp2 && cp2 <= 0xDFFF;

    return isLead && isTrail;
}
20
//Combines a high surrogate `cp1` and a low surrogate `cp2` into the single
//supplementary code point they encode. The arguments are not validated here.
function getSurrogatePairCodePoint(cp1, cp2) {
    //NOTE: 0x2400 === 0x10000 - 0xDC00, i.e. the base of the supplementary planes and
    //the base of the low surrogate range are folded into a single constant.
    var leadBits = (cp1 - 0xD800) * 0x400;

    return leadBits + 0x2400 + cp2;
}
24
25//Preprocessor
26//NOTE: HTML input preprocessing
27//(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream)
//Creates a preprocessor for the `html` string. The string is stored via `write()`;
//the read position, the gap bookkeeping and the CR/LF flag are then initialized.
var Preprocessor = module.exports = function (html) {
    this.write(html);

    //NOTE: one leading U+FEFF BYTE ORDER MARK character must be ignored if any are present in the input stream.
    //Starting at position 0 (instead of -1) makes the first advance step over the BOM.
    if (this.html.charCodeAt(0) === $.BOM)
        this.pos = 0;
    else
        this.pos = -1;

    //NOTE: positions recorded by `_addGap()`; `retreat()` steps over them in one move.
    this.gapStack = [];
    this.lastGapPos = -1;

    //NOTE: set after a CR was read, so that an immediately following LF is dropped.
    this.skipNextNewLine = false;
};
38
//Adds `html` to the stored input. If there is already non-empty input, `html` is
//inserted right after the current position (at index `pos + 1`); otherwise it
//becomes the whole input. `lastCharPos` is then refreshed.
Preprocessor.prototype.write = function (html) {
    if (this.html) {
        var insertionPos = this.pos + 1,
            head = this.html.substring(0, insertionPos),
            tail = this.html.substring(insertionPos, this.html.length);

        this.html = head + html + tail;
    }
    else
        this.html = html;

    this.lastCharPos = this.html.length - 1;
};
52
//Moves one step forward and returns the code point at the new position, or `$.EOF`
//if the position is past the last character. CR is reported as LF, and an LF that
//immediately follows a CR is skipped.
Preprocessor.prototype.advanceAndPeekCodePoint = function () {
    this.pos++;

    if (this.pos > this.lastCharPos)
        return $.EOF;

    var cp = this.html.charCodeAt(this.pos),
        followsCarriageReturn = this.skipNextNewLine;

    //NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters.
    if (cp === $.CARRIAGE_RETURN) {
        this.skipNextNewLine = true;
        return $.LINE_FEED;
    }

    //NOTE: anything other than CR resets the flag, including the LF that is being skipped below.
    this.skipNextNewLine = false;

    //NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
    //must be ignored. The skipped position is recorded as a gap, so retreat() steps over it.
    if (followsCarriageReturn && cp === $.LINE_FEED) {
        this._addGap();
        return this.advanceAndPeekCodePoint();
    }

    //OPTIMIZATION: code points below 0xD800 cover the most common HTML input (e.g. ASCII codes) and are
    //returned as is; only the high range goes through the more expensive surrogate processing.
    if (cp >= 0xD800)
        return this._processHighRangeCodePoint(cp);

    return cp;
};
81
//Resolves a code unit `cp` that is >= 0xD800. If it forms a surrogate pair with the next
//code unit, both are consumed and the combined code point is produced. A reserved
//result (see isReservedCodePoint) is replaced with `$.REPLACEMENT_CHARACTER`.
Preprocessor.prototype._processHighRangeCodePoint = function (cp) {
    var result = cp;

    //NOTE: a pair is only possible if there is at least one more code unit to peek at.
    if (this.pos !== this.lastCharPos) {
        var trailCp = this.html.charCodeAt(this.pos + 1);

        if (isSurrogatePair(result, trailCp)) {
            result = getSurrogatePairCodePoint(result, trailCp);

            //NOTE: consume the second half of the pair and mark its position as a gap
            //that should be avoided during retreat.
            this.pos++;
            this._addGap();
        }
    }

    return isReservedCodePoint(result) ? $.REPLACEMENT_CHARACTER : result;
};
102
//Records the current position as the latest gap and saves the previous gap
//position on `gapStack`, so that retreat() can restore it.
Preprocessor.prototype._addGap = function () {
    var previousGapPos = this.lastGapPos;

    this.lastGapPos = this.pos;
    this.gapStack.push(previousGapPos);
};
107
//Moves the position one step back. If the current position is the latest gap, the
//previous gap position is restored from `gapStack` and the position moves back by
//two, so the gap is stepped over as well.
Preprocessor.prototype.retreat = function () {
    var step = 1;

    if (this.pos === this.lastGapPos) {
        this.lastGapPos = this.gapStack.pop();
        step = 2;
    }

    this.pos -= step;
};
116