1// Copyright 2005 Google Inc.
2// All Rights Reserved.
3//
4// msamuel@google.com
5
6// Usage:
7// 1) include this source file in an html page via
8// <script type=text/javascript src=prettify.js></script>
9// 2) define style rules.  See the example page for examples.
10// 3) mark the <pre> and <code> tags in your source with class=prettyprint.
11//    You can also use the (html deprecated) <xmp> tag, but the pretty printer
12//    needs to do more substantial DOM manipulations to support that, so some
13//    css styles may not be preserved.
14
15// Change log:
16// cbeust, 2006/08/22
17//   Java annotations (start with "@") are now captured as literals ("lit")
18//
19
/**
 * Set of words highlighted as keywords.  Every keyword of every supported
 * language is a property of this object whose value is true; the lists of all
 * languages are merged into this single set.
 */
var PR_keywords = {};
/** initialize the keyword list for our target languages. */
(function () {
  // NOTE: the original C++ list contained the phrase
  // "using declaration, using directive" pasted from a reference table,
  // which registered the bogus keywords "declaration," and "directive".
  // Only "using" itself is a keyword.
  var CPP_KEYWORDS = (
    "bool break case catch char class const const_cast continue default " +
    "delete deprecated dllexport dllimport do double dynamic_cast else enum " +
    "explicit extern false float for friend goto if inline int long mutable " +
    "naked namespace new noinline noreturn nothrow novtable operator private " +
    "property protected public register reinterpret_cast return selectany " +
    "short signed sizeof static static_cast struct switch template this " +
    "thread throw true try typedef typeid typename union unsigned using " +
    "uuid virtual void volatile while typeof");
  var JAVA_KEYWORDS = (
    "abstract default goto package synchronized boolean do if private this " +
    "break double implements protected throw byte else import public throws " +
    "case enum instanceof return transient catch extends int short try char " +
    "final interface static void class finally long strictfp volatile const " +
    "float native super while continue for new switch");
  var PYTHON_KEYWORDS = (
    "and assert break class continue def del elif else except exec finally " +
    "for from global if import in is lambda not or pass print raise return " +
    "try while yield");
  var JSCRIPT_KEYWORDS = (
    "abstract boolean break byte case catch char class const continue " +
    "debugger default delete do double else enum export extends false final " +
    "finally float for function goto if implements import in instanceof int " +
    "interface long native new null package private protected public return " +
    "short static super switch synchronized this throw throws transient " +
    "true try typeof var void volatile while with NaN Infinity");
  var PERL_KEYWORDS = (
    "foreach require sub unless until use elsif BEGIN END");
  var SH_KEYWORDS = (
    "if then do else fi end");
  var KEYWORDS = [CPP_KEYWORDS, JAVA_KEYWORDS, PYTHON_KEYWORDS,
                  JSCRIPT_KEYWORDS, PERL_KEYWORDS, SH_KEYWORDS];
  for (var k = 0; k < KEYWORDS.length; k++) {
    var kw = KEYWORDS[k].split(' ');
    for (var i = 0; i < kw.length; i++) {
      // skip the empty strings that split() yields for doubled spaces
      if (kw[i]) { PR_keywords[kw[i]] = true; }
    }
  }
}).call(this);
62
// Token style names.  Each value is used as a css class name.
/** style of string literal tokens */
var PR_STRING = 'str';
/** style of keyword tokens */
var PR_KEYWORD = 'kwd';
/** style of comment tokens */
var PR_COMMENT = 'com';
/** style of type name tokens */
var PR_TYPE = 'typ';
/** style of literal value tokens, such as 1, null or true */
var PR_LITERAL = 'lit';
/** style of punctuation tokens */
var PR_PUNCTUATION = 'pun';
/** style of plain text tokens, i.e. anything not classified otherwise */
var PR_PLAIN = 'pln';

/** style of sgml tag tokens */
var PR_TAG = 'tag';
/** style of markup declaration tokens, such as a DOCTYPE */
var PR_DECLARATION = 'dec';
/** style of embedded source tokens */
var PR_SOURCE = 'src';
/** style of sgml attribute name tokens */
var PR_ATTRIB_NAME = 'atn';
/** style of sgml attribute value tokens */
var PR_ATTRIB_VALUE = 'atv';
89
/** the position at which a token ends, together with the style of that token.
  * A division of a string into n tokens can be represented as a series of
  * token ends, as long as runs of whitespace warrant their own token.
  * @constructor
  * @param end the index just past the last character of the token.  Must be
  *   a number.
  * @param style the style of the token that ends there.  May be null but not
  *   undefined.
  * @private
  */
function PR_TokenEnd(end, style) {
  var styleMissing = (undefined === style);
  var endIsNumber = ('number' == typeof(end));
  if (styleMissing || !endIsNumber) { throw new Error('BAD'); }
  this.end = end;
  this.style = style;
}
/** debugging representation: the end position, then the style if there is
  * one.
  */
PR_TokenEnd.prototype.toString = function () {
  var suffix = '';
  if (this.style) { suffix = ':' + this.style; }
  return '[PR_TokenEnd ' + this.end + suffix + ']';
};
105
106
/** a piece of text paired with a style.  Instances are produced by the lexing
  * functions and are also used for their intermediate results.
  * @constructor
  * @param token the text of the token
  * @param style one of the token styles, or null for a styleless token such
  *   as an embedded html tag.  Must not be undefined.
  * @private
  */
function PR_Token(token, style) {
  var styleMissing = (undefined === style);
  if (styleMissing) { throw new Error('BAD'); }
  this.token = token;
  this.style = style;
}

/** debugging representation: the text, then the style if there is one. */
PR_Token.prototype.toString = function () {
  var suffix = '';
  if (this.style) { suffix = ':' + this.style; }
  return '[PR_Token ' + this.token + suffix + ']';
};
124
125
/** a helper class that decodes common html entities used to escape source and
  * markup punctuation characters in html.
  * After a call to decode, the field ch holds the decoded character and the
  * field next holds the index of the first character following it.
  * @constructor
  * @private
  */
function PR_DecodeHelper() {
  this.next = 0;
  this.ch = '\0';
}

/** decodes the character that starts at index i of s.
  * The entities &lt; &gt; &quot; &apos; and &amp; (matched case
  * insensitively) decode to the character they stand for and consume the
  * whole entity.  Anything else, including an unrecognized entity, decodes to
  * the single character at i.
  * @param s the string to read from.
  * @param i the index in s of the character to decode.
  * @return the decoded character, which is also stored in this.ch.
  */
PR_DecodeHelper.prototype.decode = function (s, i) {
  var next = i + 1;
  var ch = s.charAt(i);
  if ('&' == ch) {
    var semi = s.indexOf(';', next);
    // The longest recognized names, "quot" and "apos", are four characters
    // long, so the ';' may be up to four characters past the '&'.  The
    // bound used to be exclusive, which made those two entities unreachable.
    if (semi >= 0 && semi <= next + 4) {
      var decoded = null;
      switch (s.substring(next, semi).toLowerCase()) {
        case 'lt': decoded = '<'; break;
        case 'gt': decoded = '>'; break;
        case 'quot': decoded = '"'; break;
        case 'apos': decoded = '\''; break;
        case 'amp': decoded = '&'; break;
      }
      if (null !== decoded) {
        ch = decoded;
        next = semi + 1;
      }
    }
  }
  this.next = next;
  this.ch = ch;
  return this.ch;
};
163
164
165// some string utilities
/** true iff ch is an ASCII letter, a-z or A-Z. */
function PR_isWordChar(ch) {
  var isLower = ch >= 'a' && ch <= 'z';
  var isUpper = ch >= 'A' && ch <= 'Z';
  return isLower || isUpper;
}
169
/** true iff ch can begin an identifier: an ASCII letter, '_', '$' or '@'. */
function PR_isIdentifierStart(ch) {
  if (PR_isWordChar(ch)) { return true; }
  return ch == '_' || ch == '$' || ch == '@';
}
173
/** true iff ch can appear inside an identifier: anything that can start one,
  * or a digit.
  */
function PR_isIdentifierPart(ch) {
  if (PR_isIdentifierStart(ch)) { return true; }
  return PR_isDigitChar(ch);
}
177
/** true iff ch is a single tab, space, carriage return or newline.
  * The empty string, which charAt returns for an out of range index, is not
  * whitespace.  A bare indexOf test would accept it, since the empty string
  * is found at index 0 of every string.
  */
function PR_isSpaceChar(ch) {
  var str = String(ch);
  return str.length === 1 && "\t \r\n".indexOf(str) >= 0;
}
181
/** true iff ch is an ASCII decimal digit, 0-9. */
function PR_isDigitChar(ch) {
  var atLeastZero = ch >= '0';
  var atMostNine = ch <= '9';
  return atLeastZero && atMostNine;
}
185
/** returns s without its leading and trailing whitespace, where whitespace
  * is whatever PR_isSpaceChar accepts.
  */
function PR_trim(s) {
  var first = 0;
  var last = s.length - 1;
  // skip forward over leading whitespace
  for (; first <= last; ++first) {
    if (!PR_isSpaceChar(s.charAt(first))) { break; }
  }
  // skip backward over trailing whitespace
  for (; last > first; --last) {
    if (!PR_isSpaceChar(s.charAt(last))) { break; }
  }
  return s.substring(first, last + 1);
}
192
/** true iff the string s begins with prefix. */
function PR_startsWith(s, prefix) {
  var n = prefix.length;
  if (s.length < n) { return false; }
  return s.substring(0, n) == prefix;
}
196
/** true iff the string s ends with suffix. */
function PR_endsWith(s, suffix) {
  var offset = s.length - suffix.length;
  if (offset < 0) { return false; }
  return s.substring(offset, s.length) == suffix;
}
201
/** true iff the first prefix.length elements of chars spell out prefix.
  * @param chars an array of single characters.
  * @param len the number of elements of chars that are in use.  The match
  *   fails when fewer than prefix.length are.
  * @param prefix the string to look for.
  * @private
  */
function PR_prefixMatch(chars, len, prefix) {
  var need = prefix.length;
  if (need > len) { return false; }
  var matched = 0;
  while (matched < need && prefix.charAt(matched) == chars[matched]) {
    ++matched;
  }
  return matched == need;
}
212
/** escapes the characters & < and > in str as html entities.  Used to turn
  * the raw text found in XMP tags into html.
  */
function PR_textToHtml(str) {
  // '&' has to go first so that the entities added below are not re-escaped
  var escaped = str.replace(/&/g, '&amp;');
  escaped = escaped.replace(/</g, '&lt;');
  return escaped.replace(/>/g, '&gt;');
}
217
218
/** cuts a string of html into html tags and the text between them.
  * A tag is a '<', an optional '/', a letter, and everything up to and
  * including the next '>'.
  *
  * @param s a String of html.
  * @return an Array of PR_Tokens: tags have a null style and text has style
  *   {@link #PR_PLAIN}.
  * @private
  */
function PR_chunkify(s) {
  var pieces = [];
  // 0: in text, 1: just after '<', 2: just after '</', 3: inside a tag
  var mode = 0;
  var textStart = 0;  // start of the text that has not been emitted yet
  var tagStart = -1;  // index of the '<' of the tag being scanned
  var len = s.length;
  for (var idx = 0; idx < len; ++idx) {
    var c = s.charAt(idx);
    if (0 == mode) {
      if ('<' == c) { mode = 1; }
    } else if (3 == mode) {
      if ('>' == c) {
        if (tagStart > textStart) {
          pieces.push(new PR_Token(s.substring(textStart, tagStart), PR_PLAIN));
        }
        pieces.push(new PR_Token(s.substring(tagStart, idx + 1), null));
        textStart = idx + 1;
        tagStart = -1;
        mode = 0;
      }
    } else {
      // modes 1 and 2 differ only in that mode 1 records where the tag
      // starts and accepts a '/'
      var sawSlash = false;
      if (1 == mode) {
        tagStart = idx - 1;
        sawSlash = ('/' == c);
      }
      if (sawSlash) {
        mode = 2;
      } else if (PR_isWordChar(c)) {
        mode = 3;
      } else if ('<' == c) {
        mode = 1;
      } else {
        mode = 0;
      }
    }
  }
  if (s.length > textStart) {
    pieces.push(new PR_Token(s.substring(textStart, s.length), PR_PLAIN));
  }
  return pieces;
}
267
/** splits plain chunks around html entities such as &amp;lt; or &amp;#38; so
  * that each entity becomes a token of its own with a null style.
  * An entity is an '&', then a '#' or a letter, then any number of letters
  * and digits, then a ';'.  Entities never span chunks.
  *
  * @param chunks an Array of PR_Tokens.  Chunks whose style is not PR_PLAIN
  *   are passed through untouched.
  * @return a new Array of PR_Tokens.
  * @private
  */
function PR_splitEntities(chunks) {
  var chunksOut = [];
  for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
    var chunk = chunks[ci];
    if (PR_PLAIN != chunk.style) {
      chunksOut.push(chunk);
      continue;
    }
    var s = chunk.token;
    var pos = 0;     // start of the text that has not been emitted yet
    var start = -1;  // index of the '&' of the entity being scanned
    // 0: in text, 1: just after '&', 2: inside an entity name.
    // The state is reset for every chunk.  It used to leak from one chunk to
    // the next, so that an unfinished "&x" in one chunk turned unrelated
    // text in a later chunk into an entity.
    var state = 0;
    for (var i = 0; i < s.length; ++i) {
      var ch = s.charAt(i);
      switch (state) {
        case 0:
          if ('&' == ch) { state = 1; }
          break;
        case 1:
          if ('#' == ch || PR_isWordChar(ch)) {
            start = i - 1;
            state = 2;
          } else {
            // a second '&' may itself start an entity
            state = ('&' == ch) ? 1 : 0;
          }
          break;
        case 2:
          if (';' == ch) {
            if (start > pos) {
              chunksOut.push(
                  new PR_Token(s.substring(pos, start), chunk.style));
            }
            chunksOut.push(new PR_Token(s.substring(start, i + 1), null));
            pos = i + 1;
            state = 0;
          } else if (!(PR_isWordChar(ch) || PR_isDigitChar(ch))) {
            // not an entity after all, e.g. the '&' in "AT&T rocks;"
            state = ('&' == ch) ? 1 : 0;
          }
          break;
      }
    }
    if (s.length > pos) {
      // reuse the original chunk when nothing was split off it
      chunksOut.push(pos ?
                     new PR_Token(s.substring(pos, s.length), chunk.style) :
                     chunk);
    }
  }
  return chunksOut;
}
318
/** walk the tokenEnds list and the chunk list in parallel to generate a list
  * of split tokens.
  * Token ends are positions in "absolute space", i.e. in the string obtained
  * by appending the text of all chunks.  The text of every chunk is cut at
  * each token end that falls inside it.  A piece takes its style from the
  * token end that closes it, except that a piece which runs to the end of a
  * chunk with a null style keeps the null style.
  *
  * @param chunks an Array of PR_Tokens.
  * @param tokenEnds an Array of PR_TokenEnds in ascending order of end.
  * @return an Array of PR_Tokens.
  * @private
  */
function PR_splitChunks(chunks, tokenEnds) {
  var tokens = [];  // the output

  var ci = 0;  // index into chunks
  // position of beginning of amount written so far in absolute space.
  var posAbs = 0;
  // position of amount written so far in chunk space
  var posChunk = 0;

  // current chunk.  Starts out as an empty placeholder so that the first
  // real chunk is loaded by the loop below.
  var chunk = new PR_Token('', null);

  for (var ei = 0, ne = tokenEnds.length; ei < ne; ++ei) {
    var tokenEnd = tokenEnds[ei];
    var end = tokenEnd.end;

    var tokLen = end - posAbs;
    var remainingInChunk = chunk.token.length - posChunk;
    // emit the rest of every chunk that ends at or before this token end
    while (remainingInChunk <= tokLen) {
      if (remainingInChunk > 0) {
        tokens.push(
            new PR_Token(chunk.token.substring(posChunk, chunk.token.length),
                         null == chunk.style ? null : tokenEnd.style));
      }
      posAbs += remainingInChunk;
      posChunk = 0;
      // All text has been emitted.  Stopping here matters: without it an
      // empty chunk list, or an empty last chunk, kept this loop spinning
      // forever on a chunk with nothing remaining.
      if (ci >= chunks.length) { return tokens; }
      chunk = chunks[ci++];

      tokLen = end - posAbs;
      remainingInChunk = chunk.token.length - posChunk;
    }

    // the token ends inside the current chunk.  A token end that lies before
    // the current position is ignored rather than emitted as duplicate text.
    if (tokLen > 0) {
      tokens.push(
          new PR_Token(chunk.token.substring(posChunk, posChunk + tokLen),
                       tokenEnd.style));
      posAbs += tokLen;
      posChunk += tokLen;
    }
  }

  return tokens;
}
366
/** splits markup tokens into declarations, tags, and source chunks.
  *
  * @param chunks an Array of PR_Tokens.  Only chunks of style PR_PLAIN are
  *   scanned; the others only advance the absolute position.
  * @return an Array of PR_TokenEnds.  Every recognized construct contributes
  *   two ends: a PR_PLAIN end where it starts and an end carrying its own
  *   style (PR_DECLARATION, PR_COMMENT, PR_TAG or PR_SOURCE) where it stops.
  *   A final PR_PLAIN end marks the end of the text.
  * @private
  */
function PR_splitMarkup(chunks) {
  // A state machine to split out declarations, tags, etc.
  // This state machine deals with absolute space in the text, indexed by k,
  // and position in the current chunk, indexed by i, to generate a list of
  // the ends of tokens.
  // Absolute space is calculated by considering the chunks as appended into
  // one big string, as they were before being split.

  // Known failure cases
  // Server side scripting sections such as <?...?> in attributes.
  // i.e. <span class="<? foo ?>">
  // Handling this would require a stack, and we don't use PHP.

  // The output: a list of pairs of PR_TokenEnd instances
  var tokenEnds = [];

  // FSM state variable:
  //   0 text                    1 after '<'
  //   2 after '<!'              3 inside a declaration such as a DOCTYPE
  //   4 inside a comment, no trailing dash
  //   5 inside a comment, one trailing dash
  //   6 inside a comment, two or more trailing dashes
  //   7 after '</'              8 inside a tag
  //   9 inside <? ... ?>       10 after a '?' inside <? ... ?>
  //  11 inside <% ... %>       12 after a '%' inside <% ... %>
  var state = 0;
  var k = 0;  // position in absolute space of the start of the current chunk
  var tokenStart = -1;  // the start of the current token

  // Try to find a closing tag for any open <style> or <script> tags
  // We can't do this at a later stage because then the following case
  // would fail:
  // <script>document.writeln('<!--');</script>

  // We use tokenChars[:tokenCharsI] to accumulate the tag name so that we
  // can check whether to enter into a no scripting section when the tag ends.
  var tokenChars = new Array(12);
  var tokenCharsI = 0;
  // if non null, the tag prefix that we need to see to break out.
  var endScriptTag = null;
  var decodeHelper = new PR_DecodeHelper();

  for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
    var chunk = chunks[ci];
    if (PR_PLAIN != chunk.style) {
      k += chunk.token.length;
      continue;
    }

    var s = chunk.token;

    for (var i = 0, n = s.length; i < n; /* i = next at bottom */) {
      decodeHelper.decode(s, i);
      var ch = decodeHelper.ch;
      var next = decodeHelper.next;

      // set to the style of a construct when ch is its last character
      var tokenStyle = null;
      switch (state) {
        case 0:
          if ('<' == ch) { state = 1; }
          break;
        case 1:
          tokenCharsI = 0;
          if ('/' == ch) {  // only consider close tags if we're in script/style
            state = 7;
          } else if (null == endScriptTag) {
            if ('!' == ch) {
              state = 2;
            } else if (PR_isWordChar(ch)) {
              state = 8;
            } else if ('?' == ch) {
              state = 9;
            } else if ('%' == ch) {
              state = 11;
            } else if ('<' != ch) {
              state = 0;
            }
          } else if ('<' != ch) {
            state = 0;
          }
          break;
        case 2:
          if ('-' == ch) {
            state = 4;
          } else if (PR_isWordChar(ch)) {
            state = 3;
          } else if ('<' == ch) {
            state = 1;
          } else {
            state = 0;
          }
          break;
        case 3:
          if ('>' == ch) {
            state = 0;
            tokenStyle = PR_DECLARATION;
          }
          break;
        case 4:
          if ('-' == ch) { state = 5; }
          break;
        case 5:
          if ('-' == ch) {
            state = 6;
          } else {
            // A lone dash does not start the end of a comment.  Without this
            // transition "->" inside a comment was taken for its end.
            state = 4;
          }
          break;
        case 6:
          if ('>' == ch) {
            state = 0;
            tokenStyle = PR_COMMENT;
          } else if ('-' == ch) {
            state = 6;
          } else {
            state = 4;
          }
          break;
        case 7:
          if (PR_isWordChar(ch)) {
            state = 8;
          } else if ('<' == ch) {
            state = 1;
          } else {
            state = 0;
          }
          break;
        case 8:
          if ('>' == ch) {
            state = 0;
            tokenStyle = PR_TAG;
          }
          break;
        case 9:
          if ('?' == ch) { state = 10; }
          break;
        case 10:
          if ('>' == ch) {
            state = 0;
            tokenStyle = PR_SOURCE;
          } else if ('?' != ch) {
            state = 9;
          }
          break;
        case 11:
          if ('%' == ch) { state = 12; }
          break;
        case 12:
          if ('>' == ch) {
            state = 0;
            tokenStyle = PR_SOURCE;
          } else if ('%' != ch) {
            state = 11;
          }
          break;
      }

      // tokenCharsI is reset in state 1, so this collects the first
      // characters that follow the most recent '<'
      if (tokenCharsI < tokenChars.length) {
        tokenChars[tokenCharsI++] = ch.toLowerCase();
      }
      // state 1 means that ch is a '<' which may start a token
      if (1 == state) { tokenStart = k + i; }
      i = next;
      if (null != tokenStyle) {
        if (endScriptTag) {
          if (PR_prefixMatch(tokenChars, tokenCharsI, endScriptTag)) {
            endScriptTag = null;
          }
        } else {
          if (PR_prefixMatch(tokenChars, tokenCharsI, 'script')) {
            endScriptTag = '/script';
          } else if (PR_prefixMatch(tokenChars, tokenCharsI, 'style')) {
            endScriptTag = '/style';
          } else if (PR_prefixMatch(tokenChars, tokenCharsI, 'xmp')) {
            endScriptTag = '/xmp';
          }
        }
        // disallow the tag if endScriptTag is set and this was not an open
        // tag.
        if (endScriptTag && tokenCharsI && '/' == tokenChars[0]) {
          tokenStyle = null;
        }
        if (null != tokenStyle) {
          tokenEnds.push(new PR_TokenEnd(tokenStart, PR_PLAIN));
          tokenEnds.push(new PR_TokenEnd(k + next, tokenStyle));
        }
      }
    }
    k += chunk.token.length;
  }
  tokenEnds.push(new PR_TokenEnd(k, PR_PLAIN));

  return tokenEnds;
}
553
/** splits the given chunks into comment, string, and "other" tokens.
  * Recognized are strings delimited by " ' or ` with backslash escapes, line
  * comments introduced by // or #, and block comments delimited by slash-star
  * and star-slash.
  *
  * @param chunks an Array of PR_Tokens.  Only chunks of style PR_PLAIN are
  *   scanned.
  * @return an array of PR_Tokens with style in
  *   (PR_STRING, PR_COMMENT, PR_PLAIN, null)
  *   The result array may contain spurious zero length tokens.  Ignore them.
  *
  * @private
  */
function PR_splitStringAndCommentTokens(chunks) {
  // a state machine to split out comments, strings, and other stuff
  var tokenEnds = [];  // positions of ends of tokens in absolute space
  // FSM state variable:
  //   0 other           1 in a string       2 after a backslash in a string
  //   3 after a '/'     4 in a line comment
  //   5 in a block comment                  6 after a '*' in a block comment
  var state = 0;
  var delim = null;  // string delimiter
  var k = 0;  // absolute position of beginning of current chunk
  for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
    var chunk = chunks[ci];
    var s = chunk.token;
    if (PR_PLAIN == chunk.style) {
      for (var i = 0, n = s.length; i < n; ++i) {
        var ch = s.charAt(i);
        if (0 == state) {
          if (ch == '"' || ch == '\'' || ch == '`') {
            tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN));
            state = 1;
            delim = ch;
          } else if (ch == '/') {
            state = 3;
          } else if (ch == '#') {
            tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN));
            state = 4;
          }
        } else if (1 == state) {
          if (ch == delim) {
            state = 0;
            tokenEnds.push(new PR_TokenEnd(k + i + 1, PR_STRING));
          } else if (ch == '\\') {
            state = 2;
          }
        } else if (2 == state) {
          state = 1;
        } else if (3 == state) {
          if (ch == '/') {
            state = 4;
            tokenEnds.push(new PR_TokenEnd(k + i - 1, PR_PLAIN));
          } else if (ch == '*') {
            state = 5;
            tokenEnds.push(new PR_TokenEnd(k + i - 1, PR_PLAIN));
          } else {
            state = 0;
            // next loop will reenter state 0 with the same value of i, so
            // ch will be reconsidered as start of new token.
            --i;
          }
        } else if (4 == state) {
          if (ch == '\r' || ch == '\n') {
            state = 0;
            tokenEnds.push(new PR_TokenEnd(k + i, PR_COMMENT));
          }
        } else if (5 == state) {
          if (ch == '*') {
            state = 6;
          }
        } else if (6 == state) {
          if (ch == '/') {
            state = 0;
            tokenEnds.push(new PR_TokenEnd(k + i + 1, PR_COMMENT));
          } else if (ch != '*') {
            state = 5;
          }
        }
      }
    }
    k += s.length;
  }
  // A token ends at the end.  A line comment is terminated by the end of the
  // input just as it is by a newline, so it keeps its comment style.  An
  // unterminated string or block comment is malformed and is left plain.
  tokenEnds.push(new PR_TokenEnd(k, 4 == state ? PR_COMMENT : PR_PLAIN));

  return PR_splitChunks(chunks, tokenEnds);
}
631
/** picks the style of a token produced by PR_splitNonStringNonCommentToken
  * from its text.
  * @param t the non empty text of the token.
  * @return one of PR_KEYWORD, PR_LITERAL, PR_TYPE, PR_PUNCTUATION, PR_PLAIN.
  * @private
  */
function PR_classifyToken_(t) {
  var ch0 = t.charAt(0);
  if (PR_isIdentifierStart(ch0)) {
    // Only consult the keywords that were registered.  A bare PR_keywords[t]
    // also finds inherited members such as "constructor" or "toString".
    if (Object.prototype.hasOwnProperty.call(PR_keywords, t) &&
        PR_keywords[t]) {
      return PR_KEYWORD;
    }
    if (ch0 == '@') {
      return PR_LITERAL;
    }
    // Treat any word that starts with an uppercase character and
    // contains at least one lowercase character as a type, or
    // ends with _t.
    // This works perfectly for Java, pretty well for C++, and
    // passably for Python.  The _t catches C typedefs such as size_t.
    var isType = false;
    if (ch0 >= 'A' && ch0 <= 'Z') {
      for (var j = 1; j < t.length; j++) {
        var ch1 = t.charAt(j);
        if (ch1 >= 'a' && ch1 <= 'z') {
          isType = true;
          break;
        }
      }
    }
    // This test used to be nested in the uppercase test above, where it
    // could never make a difference, so size_t was not seen as a type.
    if (!isType && t.length > 2 && t.substring(t.length - 2) == '_t') {
      isType = true;
    }
    return isType ? PR_TYPE : PR_PLAIN;
  }
  if (PR_isDigitChar(ch0)) {
    return PR_LITERAL;
  }
  if (!PR_isSpaceChar(ch0)) {
    return PR_PUNCTUATION;
  }
  return PR_PLAIN;
}

/** used by lexSource to split a non string, non comment token into runs of
  * whitespace, identifiers, number literals and punctuation.
  * @param s the text to split.
  * @param outlist an Array to which one PR_Token per run is appended.
  * @private
  */
function PR_splitNonStringNonCommentToken(s, outlist) {
  var pos = 0;  // start of the text that has not been emitted yet
  var state = 0;
  for (var i = 0; i <= s.length; i++) {
    var ch = s.charAt(i);
    // the next state.
    // if set to -1 then it will cause a reentry to state 0 without consuming
    // another character.
    var nstate = state;

    if (i == s.length) {
      // nstate will not be equal to state, so it will append the token
      nstate = -2;
    } else {
      switch (state) {
      case 0:  // whitespace state
        if (PR_isIdentifierStart(ch)) {
          nstate = 1;
        } else if (PR_isDigitChar(ch)) {
          nstate = 2;
        } else if (!PR_isSpaceChar(ch)) {
          nstate = 3;
        }
        // leaving whitespace: emit the run of whitespace seen so far
        if (nstate && pos < i) {
          outlist.push(new PR_Token(s.substring(pos, i), PR_PLAIN));
          pos = i;
        }
        break;
      case 1:  // identifier state
        if (!PR_isIdentifierPart(ch)) {
          nstate = -1;
        }
        break;
      case 2:  // number literal state
        // handle numeric literals like
        // 0x7f 300UL 100_000

        // this does not treat floating point values as a single literal
        //   0.1 and 3e-6
        // are each split into multiple tokens
        if (!(PR_isDigitChar(ch) || PR_isWordChar(ch) || ch == '_')) {
          nstate = -1;
        }
        break;
      case 3:  // punctuation state
        if (PR_isIdentifierStart(ch) || PR_isDigitChar(ch) ||
            PR_isSpaceChar(ch)) {
          nstate = -1;
        }
        break;
      }
    }

    if (nstate != state) {
      if (nstate < 0) {
        // a token ended just before i
        if (i > pos) {
          var t = s.substring(pos, i);
          pos = i;
          outlist.push(new PR_Token(t, PR_classifyToken_(t)));
        }

        state = 0;
        if (nstate == -1) {
          // don't increment.  This allows us to use state 0 to redispatch based
          // on the current character.
          i--;
          continue;
        }
      }
      state = nstate;
    }
  }
}
746
/** tokenizes a group of chunks of markup: computes the token ends with
  * PR_splitMarkup and cuts the chunks at them with PR_splitChunks.
  * @param chunks an Array of PR_Tokens.
  * @return an Array of PR_Tokens, or chunks itself when it is empty or not
  *   given.
  * @private
  */
function PR_tokenizeMarkup(chunks) {
  var hasChunks = chunks && chunks.length;
  if (!hasChunks) { return chunks; }

  return PR_splitChunks(chunks, PR_splitMarkup(chunks));
}
756
/** splits the attribute names and attribute values found in tag tokens off
  * the rest of the tag.
  *
  * @param tokens an Array of PR_Tokens.  Only tokens of style PR_TAG are
  *   split; the others are passed through.
  * @return a new Array of PR_Tokens in which the text of each tag is divided
  *   into pieces of style PR_TAG, PR_ATTRIB_NAME and PR_ATTRIB_VALUE.
  * @private
  */
function PR_splitTagAttributes(tokens) {
  var tokensOut = [];
  // FSM state variable:
  //   0 before the '<'               1 in the tag name
  //   2 in whitespace inside the tag 3 in an attribute name
  //   4 in whitespace after a name   5 after the '='
  //   6 in a quoted value            7 in an unquoted value
  var state = 0;
  var stateStyle = PR_TAG;  // the style of the piece being scanned
  var delim = null;  // attribute delimiter for quoted value state.
  var decodeHelper = new PR_DecodeHelper();
  for (var ci = 0; ci < tokens.length; ++ci) {
    var tok = tokens[ci];
    if (PR_TAG == tok.style) {
      var s = tok.token;
      var start = 0;  // start of the text that has not been emitted yet
      for (var i = 0; i < s.length; /* i = next at bottom */) {
        decodeHelper.decode(s, i);
        var ch = decodeHelper.ch;
        var next = decodeHelper.next;

        var emitEnd = null;  // null or position of end of chunk to emit.
        var nextStyle = null;  // null or next value of stateStyle
        if (ch == '>') {
          if (PR_TAG != stateStyle) {
            emitEnd = i;
            nextStyle = PR_TAG;
          }
          // the tag is over, so a tag that follows immediately must not be
          // scanned in the state this one ended in
          state = 0;
        } else {
          switch (state) {
            case 0:
              if ('<' == ch) { state = 1; }
              break;
            case 1:
              if (PR_isSpaceChar(ch)) { state = 2; }
              break;
            case 2:
              if (!PR_isSpaceChar(ch)) {
                nextStyle = PR_ATTRIB_NAME;
                emitEnd = i;
                state = 3;
              }
              break;
            case 3:
              if ('=' == ch) {
                emitEnd = i;
                nextStyle = PR_TAG;
                state = 5;
              } else if (PR_isSpaceChar(ch)) {
                emitEnd = i;
                nextStyle = PR_TAG;
                state = 4;
              }
              break;
            case 4:
              if ('=' == ch) {
                state = 5;
              } else if (!PR_isSpaceChar(ch)) {
                emitEnd = i;
                nextStyle = PR_ATTRIB_NAME;
                state = 3;
              }
              break;
            case 5:
              if ('"' == ch || '\'' == ch) {
                emitEnd = i;
                nextStyle = PR_ATTRIB_VALUE;
                state = 6;
                delim = ch;
              } else if (!PR_isSpaceChar(ch)) {
                emitEnd = i;
                nextStyle = PR_ATTRIB_VALUE;
                state = 7;
              }
              break;
            case 6:
              if (ch == delim) {
                // the closing quote belongs to the value
                emitEnd = next;
                nextStyle = PR_TAG;
                state = 2;
              }
              break;
            case 7:
              if (PR_isSpaceChar(ch)) {
                emitEnd = i;
                nextStyle = PR_TAG;
                state = 2;
              }
              break;
          }
        }
        // Compare against null: 0 is a valid position.  It occurs when a tag
        // continues after an embedded styleless token, and the style change
        // must still take effect then.
        if (null !== emitEnd) {
          if (emitEnd > start) {
            tokensOut.push(
                new PR_Token(s.substring(start, emitEnd), stateStyle));
            start = emitEnd;
          }
          stateStyle = nextStyle;
        }
        i = next;
      }
      if (s.length > start) {
        tokensOut.push(new PR_Token(s.substring(start, s.length), stateStyle));
      }
    } else {
      // any styled token that is not a tag ends the current tag.  Styleless
      // tokens may be embedded in a tag and leave the state alone.
      if (tok.style) {
        state = 0;
        stateStyle = PR_TAG;
      }
      tokensOut.push(tok);
    }
  }
  return tokensOut;
}
870
/** identify regions of markup that are really source code, and recursively
  * lex them.
  * Source code is what lies between the open and close tags of script, style
  * and xmp elements, and inside <? ... ?> and <% ... %> sections.  The lexed
  * source is emitted between two styleless tokens that wrap it in a
  * <span class=embsrc> element.
  *
  * @param tokens an Array of PR_Tokens.  It is not modified.
  * @return a new Array of PR_Tokens.
  * @private
  */
function PR_splitSourceNodes(tokens) {
  var tokensOut = [];
  // when we see a <script> tag, store '/' here so that we know to end the
  // source processing.  Inside <? or <% sections this is '?' or '%'.
  var endScriptTag = null;
  var decodeHelper = new PR_DecodeHelper();

  // the tokens of the source section being collected, or null
  var sourceChunks = null;

  // lexes the collected source, if any, and appends it to the output
  function flushSource() {
    if (!sourceChunks) { return; }
    var sourceTokens = PR_lexSource(sourceChunks);
    tokensOut.push(new PR_Token('<span class=embsrc>', null));
    for (var si = 0, ns = sourceTokens.length; si < ns; ++si) {
      tokensOut.push(sourceTokens[si]);
    }
    tokensOut.push(new PR_Token('</span>', null));
    sourceChunks = null;
  }

  for (var ci = 0, nc = tokens.length; ci < nc; ++ci) {
    var tok = tokens[ci];
    if (null == tok.style) {
      // Keep styleless tokens (embedded html) in place: with the source that
      // is being collected, or else in the output.  They used to be appended
      // to the input array, which lost them from the result.
      (sourceChunks || tokensOut).push(tok);
      continue;
    }

    var s = tok.token;

    if (null == endScriptTag) {
      if (PR_SOURCE == tok.style) {
        // split off any starting and trailing <?, <%
        if ('<' == decodeHelper.decode(s, 0)) {
          decodeHelper.decode(s, decodeHelper.next);
          if ('%' == decodeHelper.ch || '?' == decodeHelper.ch) {
            endScriptTag = decodeHelper.ch;
            tokensOut.push(new PR_Token(s.substring(0, decodeHelper.next),
                                        PR_TAG));
            s = s.substring(decodeHelper.next, s.length);
          }
        }
      } else if (PR_TAG == tok.style) {
        if ('<' == decodeHelper.decode(s, 0) &&
            '/' != s.charAt(decodeHelper.next)) {
          var tagContent = s.substring(decodeHelper.next).toLowerCase();
          // FIXME(msamuel): this does not mirror exactly the code in
          // in PR_splitMarkup that defers splitting tags inside script and
          // style blocks.
          if (PR_startsWith(tagContent, 'script') ||
              PR_startsWith(tagContent, 'style') ||
              PR_startsWith(tagContent, 'xmp')) {
            endScriptTag = '/';
          }
        }
      }
    }

    if (null != endScriptTag) {
      var endTok = null;  // the token that closes the source section
      if (PR_SOURCE == tok.style) {
        if (endScriptTag == '%' || endScriptTag == '?') {
          var pos = s.lastIndexOf(endScriptTag);
          if (pos >= 0 && '>' == decodeHelper.decode(s, pos + 1) &&
              s.length == decodeHelper.next) {
            endTok = new PR_Token(s.substring(pos, s.length), PR_TAG);
            s = s.substring(0, pos);
          }
        }
        if (null == sourceChunks) { sourceChunks = []; }
        sourceChunks.push(new PR_Token(s, PR_PLAIN));
      } else if (PR_PLAIN == tok.style) {
        if (null == sourceChunks) { sourceChunks = []; }
        sourceChunks.push(tok);
      } else if (PR_TAG == tok.style) {
        // if it starts with </ then it must be the end tag.
        if ('<' == decodeHelper.decode(tok.token, 0) &&
            tok.token.length > decodeHelper.next &&
            '/' == decodeHelper.decode(tok.token, decodeHelper.next)) {
          endTok = tok;
        } else {
          tokensOut.push(tok);
        }
      } else {
        if (sourceChunks) {
          sourceChunks.push(tok);
        } else {
          // push remaining tag and attribute tokens from the opening tag
          tokensOut.push(tok);
        }
      }
      if (endTok) {
        flushSource();
        tokensOut.push(endTok);
        endScriptTag = null;
      }
    } else {
      tokensOut.push(tok);
    }
  }
  // a source section that is never closed is still emitted, not dropped
  flushSource();
  return tokensOut;
}
973
/** splits the quote delimiters off an attribute value so that only the text
  * between them is handed on as plain source.
  * ['"foo"'] -> ['"', 'foo', '"']
  *
  * The opening quote is looked for at the start of the first PR_PLAIN token
  * and the matching closing quote at the very end of the last PR_PLAIN token.
  * Characters are read through PR_DecodeHelper (presumably so that quotes
  * written as html entities are recognized too -- confirm against
  * PR_DecodeHelper).  Both quote tokens are emitted with style
  * PR_ATTRIB_VALUE; everything between them keeps its original style.
  *
  * @param tokens a list of PR_Tokens.  The list is not modified.
  * @return a new list with the quotes split out, or the input list itself
  *     when it contains no PR_PLAIN token or does not start with ' or ".
  * @private
  */
function PR_splitAttributeQuotes(tokens) {
  var i;
  var firstPlain = null, lastPlain = null;
  for (i = 0; i < tokens.length; ++i) {
    // comparison, not assignment: PR_PLAIN is a shared global constant
    if (PR_PLAIN == tokens[i].style) {
      firstPlain = i;
      break;
    }
  }
  if (null == firstPlain) { return tokens; }
  for (i = tokens.length; --i >= 0;) {
    if (PR_PLAIN == tokens[i].style) {
      lastPlain = i;
      break;
    }
  }

  var decodeHelper = new PR_DecodeHelper();
  var fs = tokens[firstPlain].token;
  var fc = decodeHelper.decode(fs, 0);
  if ('"' != fc && '\'' != fc) {
    return tokens;
  }
  // index just past the opening quote
  var fpos = decodeHelper.next;

  // Look for the closing quote.  It must decode to the same character as the
  // opening quote and must be the very last thing in the last plain token.
  // Try the last character first (a literal quote), then the last '&' (a quote
  // written as an entity), so that an entity earlier in the value does not
  // hide a literal closing quote.
  var ls = tokens[lastPlain].token;
  var lc = null;
  var lpos = ls.length;
  var candidates = [ls.length - 1, ls.lastIndexOf('&')];
  for (i = 0; i < candidates.length; ++i) {
    var pos = candidates[i];
    if (pos < 0) { continue; }
    // when the value is a single token, the closing quote can not overlap the
    // opening one, e.g. a value consisting of nothing but one quote character.
    if (lastPlain == firstPlain && pos < fpos) { continue; }
    if (fc == decodeHelper.decode(ls, pos) && decodeHelper.next == ls.length) {
      lc = fc;
      lpos = pos;
      break;
    }
  }

  var tokensOut = [];
  for (i = 0; i < firstPlain; ++i) {
    tokensOut.push(tokens[i]);
  }
  tokensOut.push(new PR_Token(fs.substring(0, fpos), PR_ATTRIB_VALUE));
  if (lastPlain == firstPlain) {
    tokensOut.push(new PR_Token(fs.substring(fpos, lpos), PR_PLAIN));
  } else {
    tokensOut.push(new PR_Token(fs.substring(fpos, fs.length), PR_PLAIN));
    for (i = firstPlain + 1; i < lastPlain; ++i) {
      tokensOut.push(tokens[i]);
    }
    // these go to the output list; pushing onto the input would both mutate
    // the caller's list and emit the chunk after the closing quote.
    if (lc) {
      tokensOut.push(new PR_Token(ls.substring(0, lpos), PR_PLAIN));
    } else {
      tokensOut.push(tokens[lastPlain]);
    }
  }
  if (lc) {
    // styled like the opening quote so that the two delimiters match
    tokensOut.push(
        new PR_Token(ls.substring(lpos, ls.length), PR_ATTRIB_VALUE));
  }
  for (i = lastPlain + 1; i < tokens.length; ++i) {
    tokensOut.push(tokens[i]);
  }
  return tokensOut;
}
1037
/** identify attribute values that really contain source code and recursively
  * lex them.
  *
  * An attribute is treated as source when its name, lower-cased, starts with
  * "on" or is exactly "style", and a PR_TAG token containing '=' follows the
  * name.  The PR_ATTRIB_VALUE tokens of such an attribute (and any other
  * non-tag, non-name tokens that follow them) are collected, restyled as
  * PR_PLAIN, passed through PR_splitAttributeQuotes and PR_lexSource, and
  * emitted wrapped in <span class=embsrc> ... </span> marker tokens.
  * All other tokens are copied through unchanged.
  *
  * @param tokens a list of PR_Tokens.
  * @return a new list of PR_Tokens.
  * @private
  */
function PR_splitSourceAttributes(tokens) {
  var tokensOut = [];

  // value tokens of the source attribute currently being collected, or null
  var sourceChunks = null;
  // true between the '=' of a source attribute and the next tag token
  var inSource = false;
  // the attribute name seen since the last tag token
  var name = '';

  // lexes whatever has been collected in sourceChunks and appends the result
  // to tokensOut, wrapped in embsrc marker tokens.
  function flushSource() {
    if (!sourceChunks) { return; }
    tokensOut.push(new PR_Token('<span class=embsrc>', null));
    var sourceTokens = PR_lexSource(PR_splitAttributeQuotes(sourceChunks));
    for (var si = 0, ns = sourceTokens.length; si < ns; ++si) {
      tokensOut.push(sourceTokens[si]);
    }
    tokensOut.push(new PR_Token('</span>', null));
    sourceChunks = null;
  }

  for (var ci = 0, nc = tokens.length; ci < nc; ++ci) {
    var tok = tokens[ci];
    var outList = tokensOut;
    if (PR_TAG == tok.style) {
      if (inSource) {
        // a tag token ends the attribute value
        inSource = false;
        name = '';
        flushSource();
      } else if (name && tok.token.indexOf('=') >= 0) {
        var nameLower = name.toLowerCase();
        if (PR_startsWith(nameLower, 'on') || 'style' == nameLower) {
          inSource = true;
        }
      } else {
        name = '';
      }
    } else if (PR_ATTRIB_NAME == tok.style) {
      name += tok.token;
    } else if (PR_ATTRIB_VALUE == tok.style) {
      if (inSource) {
        if (null == sourceChunks) { sourceChunks = []; }
        outList = sourceChunks;
        // restyled so that the source lexer will look inside it
        tok = new PR_Token(tok.token, PR_PLAIN);
      }
    } else {
      if (sourceChunks) {
        outList = sourceChunks;
      }
    }
    outList.push(tok);
  }
  // the input may end while a value is still being collected (e.g. a
  // truncated tag); emit it instead of silently dropping those tokens.
  flushSource();
  return tokensOut;
}
1091
/** lexes chunks of source code into a list of PR_Token objects.
  *
  * This code assumes that < tokens are html escaped, but " are not.
  * It will do a reasonable job with <, but will not recognize an &quot;
  * as starting a string.
  *
  * This code treats ", ', and ` as string delimiters, and \ as a string escape.
  * It does not recognize double delimiter escapes, or perl's qq() style
  * strings.
  *
  * It recognizes C, C++, and shell style comments.
  *
  * The work is done in three passes: strings and comments are split out by
  * PR_splitStringAndCommentTokens, entities by PR_splitEntities, and whatever
  * is still PR_PLAIN afterwards is broken up by
  * PR_splitNonStringNonCommentToken.
  *
  * @param chunks PR_Tokens with style in (null, PR_PLAIN)
  * @return a new list of PR_Tokens
  */
function PR_lexSource(chunks) {
  // passes one and two: strings and comments first, then entities, so that
  // each of those is kept together as a single token.
  var pending = PR_splitEntities(PR_splitStringAndCommentTokens(chunks));

  // pass three: only plain text is split any further; every other token is
  // copied across as is.
  var lexed = [];
  var idx = 0;
  while (idx < pending.length) {
    var current = pending[idx++];
    if (PR_PLAIN != current.style) {
      lexed.push(current);
    } else {
      PR_splitNonStringNonCommentToken(current.token, lexed);
    }
  }
  return lexed;
}
1129
/** lexes chunks of markup into a list of PR_Token objects.
  *
  * This code assumes that < tokens are html escaped, but " are not.
  * It will do a reasonable job with <, but will not recognize an &quot;
  * as starting a string.
  *
  * This code recognizes a number of constructs.
  * <!-- ... --> comment
  * <!\w ... >   declaration
  * <\w ... >    tag
  * </\w ... >   tag
  * <?...?>      embedded source
  * &[#\w]...;   entity
  *
  * It does not recognize %foo; entities.
  *
  * It will recurse into any <style>, <script>, and on* attributes using
  * PR_lexSource.
  *
  * @param chunks the chunks to lex, as accepted by PR_tokenizeMarkup
  * @return a list of PR_Tokens
  */
function PR_lexMarkup(chunks) {
  // The markup goes through four passes, each consuming the token list
  // produced by the one before it:
  // 1) PR_tokenizeMarkup splits the chunks into tags, text, comments,
  //    declarations and embedded source sections.
  // 2) PR_splitTagAttributes splits attribute names and values out of the tag
  //    tokens.
  // 3) PR_splitSourceNodes finds embedded source sections, such as the body of
  //    a <script> element, and lexes them as source.
  // 4) PR_splitSourceAttributes does the same for attribute values that hold
  //    source, such as on* handlers and style.
  var markupTokens = PR_tokenizeMarkup(chunks);
  var withAttributes = PR_splitTagAttributes(markupTokens);
  var withSourceNodes = PR_splitSourceNodes(withAttributes);
  return PR_splitSourceAttributes(withSourceNodes);
}
1171
/** classify the string as either source or markup and lex appropriately.
  *
  * The string counts as markup when, ignoring surrounding whitespace, its
  * first PR_PLAIN chunk starts with an escaped < and its last PR_PLAIN chunk
  * ends with an escaped >.  Anything else, including input without any
  * PR_PLAIN chunk, is lexed as source.
  *
  * @param s code as html
  * @return a list of PR_Tokens from PR_lexMarkup or PR_lexSource
  */
function PR_lexOne(s) {
  var chunks = PR_chunkify(s);

  // locate the first plain chunk, if there is one
  var firstPlain = 0;
  while (firstPlain < chunks.length && PR_PLAIN != chunks[firstPlain].style) {
    ++firstPlain;
  }

  var isMarkup = false;
  if (firstPlain < chunks.length &&
      PR_startsWith(PR_trim(chunks[firstPlain].token), '&lt;')) {
    // scan backwards for the last plain chunk.  This stops at firstPlain at
    // the latest, so it can not run off the front of the list.
    var lastPlain = chunks.length - 1;
    while (PR_PLAIN != chunks[lastPlain].style) {
      --lastPlain;
    }
    isMarkup = PR_endsWith(PR_trim(chunks[lastPlain].token), '&gt;');
  }

  if (isMarkup) {
    return PR_lexMarkup(chunks);
  }
  return PR_lexSource(chunks);
}
1193
/** pretty print a chunk of code.
  *
  * The code is lexed with PR_lexOne and each run of consecutive tokens that
  * share a style is wrapped in a <span class=STYLE>.  Tokens without a style
  * are emitted verbatim and outside of any such span.  If anything goes wrong
  * the input is returned unchanged.
  *
  * @param s code as html
  * @return code as html, but prettier
  */
function prettyPrintOne(s) {
  try {
    var tokens = PR_lexOne(s);
    var html = '';
    // style of the span that is currently open, or null when none is
    var openStyle = null;
    var idx = 0;
    while (idx < tokens.length) {
      var tok = tokens[idx++];
      var style = tok.style;
      if (style != openStyle) {
        // close the previous run and open the next one
        html += (openStyle != null ? '</span>' : '') +
            (style != null ? '<span class=' + style + '>' : '');
        openStyle = style;
      }
      // Turning newlines into <br> and runs of spaces into &nbsp; interacts
      // badly with the wiki, which introduces paragraph tags into pre blocks
      // for some strange reason.  It's necessary for IE though, which seems to
      // lose the preformattedness of <pre> tags when their innerHTML is
      // assigned.
      html += (null != style
          ? tok.token.replace(/(?:\r\n?)|\n/g, '<br>').replace(/  /g, '&nbsp; ')
          : tok.token);
    }
    return openStyle != null ? html + '</span>' : html;
  } catch (e) {
    //alert(e.stack);  // DISABLE in production
    return s;
  }
}
1234
/** find all the < pre >, < code > and < xmp > tags in the DOM with
  * class=prettyprint and prettify them.
  *
  * Elements nested inside another prettyprint pre, code or xmp element are
  * skipped, since they are covered when that ancestor is rewritten.  xmp
  * elements are replaced by a pre element carrying the same attributes; all
  * other elements have their innerHTML replaced in place.
  *
  * The work is done in slices of about 250ms, continued from a timer, so that
  * a large page does not make the browser unresponsive.  This means that the
  * function may return before all elements have been rewritten.
  */
function prettyPrint() {
  // true iff node is a pre, code or xmp element whose class mentions
  // prettyprint.  tagName is upper case in html documents but keeps the
  // source case in xml ones, and is missing on the document node, so it is
  // normalized before comparing.
  function isPrettyPrintContainer(node) {
    var tagName = node.tagName;
    if (!tagName) { return false; }
    tagName = tagName.toUpperCase();
    if ('PRE' != tagName && 'CODE' != tagName && 'XMP' != tagName) {
      return false;
    }
    return !!node.className && node.className.indexOf('prettyprint') >= 0;
  }

  // fetch a list of nodes to rewrite.  The node lists are copied into a plain
  // array up front since the rewriting below changes the document.
  var codeSegments = [
      document.getElementsByTagName('pre'),
      document.getElementsByTagName('code'),
      document.getElementsByTagName('xmp') ];
  var elements = [];
  for (var i = 0; i < codeSegments.length; ++i) {
    for (var j = 0; j < codeSegments[i].length; ++j) {
      elements.push(codeSegments[i][j]);
    }
  }
  codeSegments = null;

  // the loop is broken into a series of continuations to make sure that we
  // don't make the browser unresponsive when rewriting a large page.
  // k is the index of the next element to look at and survives across slices.
  var k = 0;

  function doWork() {
    var endTime = new Date().getTime() + 250;
    for (; k < elements.length && new Date().getTime() < endTime; k++) {
      var cs = elements[k];
      if (cs.className && cs.className.indexOf('prettyprint') >= 0) {

        // make sure this is not nested in an already prettified element
        var nested = false;
        for (var p = cs.parentNode; p != null; p = p.parentNode) {
          if (isPrettyPrintContainer(p)) {
            nested = true;
            break;
          }
        }
        if (!nested) {
          // XMP tags contain unescaped entities so require special handling.
          var isRawContent = 'XMP' == cs.tagName.toUpperCase();

          // fetch the content as a snippet of properly escaped HTML
          var content = cs.innerHTML;
          if (isRawContent) {
            content = PR_textToHtml(content);
          }

          // do the pretty printing
          var newContent = prettyPrintOne(content);

          // push the prettified html back into the tag.
          if (!isRawContent) {
            // just replace the old html with the new
            cs.innerHTML = newContent;
          } else {
            // we need to change the tag to a <pre> since <xmp>s do not allow
            // embedded tags such as the span tags used to attach styles to
            // sections of source code.
            var pre = document.createElement('PRE');
            for (var ai = 0; ai < cs.attributes.length; ++ai) {
              var a = cs.attributes[ai];
              if (a.specified) {
                pre.setAttribute(a.name, a.value);
              }
            }
            pre.innerHTML = newContent;
            // remove the old
            cs.parentNode.replaceChild(pre, cs);
          }
        }
      }
    }
    if (k < elements.length) {
      // finish up in a continuation
      setTimeout(doWork, 250);
    }
  }

  doWork();
}
1315