// NewHdfParser.java revision 712c262ddf6790c16a8567053f616c71da7e1856
/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.clearsilver.jsilver.data;

import com.google.clearsilver.jsilver.resourceloader.ResourceLoader;

import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Stack;

28/**
29 * Parser for HDF based on the following grammar by Brandon Long.
30 *
31 * COMMAND := (INCLUDE | COMMENT | HDF_SET | HDF_DESCEND | HDF_ASCEND ) INCLUDE := #include
32 * "FILENAME" EOL COMMENT := # .* EOL HDF_DESCEND := HDF_NAME_ATTRS { EOL HDF_ASCEND := } EOL
33 * HDF_SET := (HDF_ASSIGN | HDF_MULTILINE_ASSIGN | HDF_COPY | HDF_LINK) HDF_ASSIGN := HDF_NAME_ATTRS
34 * = .* EOL HDF_MULTILINE_ASSIGN := HDF_NAME_ATTRS << EOM_MARKER EOL (.* EOL)* EOM_MARKER EOL
35 * HDF_COPY := HDF_NAME_ATTRS := HDF_NAME EOL HDF_LINK := HDF_NAME_ATTRS : HDF_NAME EOL
36 * HDF_NAME_ATTRS := (HDF_NAME | HDF_NAME [HDF_ATTRS]) HDF_ATTRS := (HDF_ATTR | HDF_ATTR, HDF_ATTRS)
37 * HDF_ATTR := (HDF_ATTR_KEY | HDF_ATTR_KEY = [^\s,\]]+ | HDF_ATTR_KEY = DQUOTED_STRING)
38 * HDF_ATTR_KEY := [0-9a-zA-Z]+ DQUOTED_STRING := "([^\\"]|\\[ntr]|\\.)*" HDF_NAME := (HDF_SUB_NAME
39 * | HDF_SUB_NAME\.HDF_NAME) HDF_SUB_NAME := [0-9a-zA-Z_]+ EOM_MARKER := \S.*\S EOL := \n
40 */
41public class NewHdfParser implements Parser {
42
43  private final StringInternStrategy internStrategy;
44
45  /**
46   * Special exception used to detect when we unexpectedly run out of characters on the line.
47   */
48  private static class OutOfCharsException extends Exception {}
49
50  /**
51   * Object used to hold the name and attributes of an HDF node before we are ready to commit it to
52   * the Data object.
53   */
54  private static class HdfNameAttrs {
55    String name;
56    ArrayList<String> attrs = null;
57    int endOfSequence;
58
59    void reset(String newname) {
60      // TODO: think about moving interning here instead of parser code
61      this.name = newname;
62      if (attrs != null) {
63        attrs.clear();
64      }
65      endOfSequence = 0;
66    }
67
68    void addAttribute(String key, String value) {
69      if (attrs == null) {
70        attrs = new ArrayList<String>(10);
71      }
72      attrs.ensureCapacity(attrs.size() + 2);
73      // TODO: think about moving interning here instead of parser code
74      attrs.add(key);
75      attrs.add(value);
76    }
77
78    Data toData(Data data) {
79      Data child = data.createChild(name);
80      if (attrs != null) {
81        Iterator<String> it = attrs.iterator();
82        while (it.hasNext()) {
83          String key = it.next();
84          String value = it.next();
85          child.setAttribute(key, value);
86        }
87      }
88      return child;
89    }
90  }
91
92  static final String UNNAMED_INPUT = "[UNNAMED_INPUT]";
93
94  /**
95   * State information that we pass through the parse methods. Allows parser to be reentrant as all
96   * the state is passed through method calls.
97   */
98  static class ParseState {
99    final Stack<Data> context = new Stack<Data>();
100    final Data output;
101    final LineNumberReader lineReader;
102    final ErrorHandler errorHandler;
103    final ResourceLoader resourceLoader;
104    final NewHdfParser hdfParser;
105    final boolean ignoreAttributes;
106    final HdfNameAttrs hdfNameAttrs;
107    final UniqueStack<String> includeStack;
108    final String parsedFileName;
109
110    String line;
111    Data currentNode;
112
113    private ParseState(Data output, LineNumberReader lineReader, ErrorHandler errorHandler,
114        ResourceLoader resourceLoader, NewHdfParser hdfParser, String parsedFileName,
115        boolean ignoreAttributes, HdfNameAttrs hdfNameAttrs, UniqueStack<String> includeStack) {
116      this.lineReader = lineReader;
117      this.errorHandler = errorHandler;
118      this.output = output;
119      currentNode = output;
120      this.resourceLoader = resourceLoader;
121      this.hdfParser = hdfParser;
122      this.parsedFileName = parsedFileName;
123      this.ignoreAttributes = ignoreAttributes;
124      this.hdfNameAttrs = hdfNameAttrs;
125      this.includeStack = includeStack;
126    }
127
128    public static ParseState createNewParseState(Data output, Reader reader,
129        ErrorHandler errorHandler, ResourceLoader resourceLoader, NewHdfParser hdfParser,
130        String parsedFileName, boolean ignoreAttributes) {
131
132      if (parsedFileName == null) {
133        parsedFileName = UNNAMED_INPUT;
134      }
135      UniqueStack<String> includeStack = new UniqueStack<String>();
136      includeStack.push(parsedFileName);
137
138      return new ParseState(output, new LineNumberReader(reader), errorHandler, resourceLoader,
139          hdfParser, parsedFileName, ignoreAttributes, new HdfNameAttrs(), includeStack);
140    }
141
142    public static ParseState createParseStateForIncludedFile(ParseState originalState,
143        String includeFileName, Reader includeFileReader) {
144      return new ParseState(originalState.output, new LineNumberReader(includeFileReader),
145          originalState.errorHandler, originalState.resourceLoader, originalState.hdfParser,
146          originalState.parsedFileName, originalState.ignoreAttributes, new HdfNameAttrs(),
147          originalState.includeStack);
148    }
149  }
150
151
152  /**
153   * Constructor for {@link NewHdfParser}.
154   *
155   * @param internPool - {@link StringInternStrategy} instance used to optimize the HDF parsing.
156   */
157  public NewHdfParser(StringInternStrategy internPool) {
158    this.internStrategy = internPool;
159  }
160
161  private static class NewHdfParserFactory implements ParserFactory {
162    private final StringInternStrategy stringInternStrategy;
163
164    public NewHdfParserFactory(StringInternStrategy stringInternStrategy) {
165      this.stringInternStrategy = stringInternStrategy;
166    }
167
168    @Override
169    public Parser newInstance() {
170      return new NewHdfParser(stringInternStrategy);
171    }
172  }
173
174  /**
175   * Creates a {@link ParserFactory} instance.
176   *
177   * <p>
178   * Provided {@code stringInternStrategy} instance will be used by shared all {@link Parser}
179   * objects created by the factory and used to optimize the HDF parsing process by reusing the
180   * String for keys and values.
181   *
182   * @param stringInternStrategy - {@link StringInternStrategy} instance used to optimize the HDF
183   *        parsing.
184   * @return an instance of {@link ParserFactory} implementation.
185   */
186  public static ParserFactory newFactory(StringInternStrategy stringInternStrategy) {
187    return new NewHdfParserFactory(stringInternStrategy);
188  }
189
190  public void parse(Reader reader, Data output, Parser.ErrorHandler errorHandler,
191      ResourceLoader resourceLoader, String dataFileName, boolean ignoreAttributes)
192      throws IOException {
193
194    parse(ParseState.createNewParseState(output, reader, errorHandler, resourceLoader, this,
195        dataFileName, ignoreAttributes));
196  }
197
198  private void parse(ParseState state) throws IOException {
199    while ((state.line = state.lineReader.readLine()) != null) {
200      String seq = stripWhitespace(state.line);
201      try {
202        parseCommand(seq, state);
203      } catch (OutOfCharsException e) {
204        reportError(state, "End of line was prematurely reached. Parse error.");
205      }
206    }
207  }
208
209  private static final String INCLUDE_WS = "#include ";
210
211  private void parseCommand(String seq, ParseState state) throws IOException, OutOfCharsException {
212    if (seq.length() == 0) {
213      // Empty line.
214      return;
215    }
216    if (charAt(seq, 0) == '#') {
217      // If there isn't a match on include then this is a comment and we do nothing.
218      if (matches(seq, 0, INCLUDE_WS)) {
219        // This is an include command
220        int start = skipLeadingWhitespace(seq, INCLUDE_WS.length());
221        parseInclude(seq, start, state);
222      }
223      return;
224    } else if (charAt(seq, 0) == '}') {
225      if (skipLeadingWhitespace(seq, 1) != seq.length()) {
226        reportError(state, "Extra chars after '}'");
227        return;
228      }
229      handleAscend(state);
230    } else {
231      parseHdfElement(seq, state);
232    }
233  }
234
235  private void parseInclude(String seq, int start, ParseState state) throws IOException,
236      OutOfCharsException {
237    int end = seq.length();
238    if (charAt(seq, start) == '"') {
239      if (charAt(seq, end - 1) == '"') {
240        start++;
241        end--;
242      } else {
243        reportError(state, "Missing '\"' at end of include");
244        return;
245      }
246    }
247    handleInclude(seq.substring(start, end), state);
248  }
249
250  private static final int NO_MATCH = -1;
251
252  private void parseHdfElement(String seq, ParseState state) throws IOException,
253      OutOfCharsException {
254    // Re-use a single element to avoid repeated allocations/trashing (serious
255    // performance impact, 5% of real service performance)
256    HdfNameAttrs element = state.hdfNameAttrs;
257    if (!parseHdfNameAttrs(element, seq, 0, state)) {
258      return;
259    }
260    int index = skipLeadingWhitespace(seq, element.endOfSequence);
261    switch (charAt(seq, index)) {
262      case '{':
263        // Descend
264        if (index + 1 != seq.length()) {
265          reportError(state, "No characters expected after '{'");
266          return;
267        }
268        handleDescend(state, element);
269        return;
270      case '=':
271        // Assignment
272        index = skipLeadingWhitespace(seq, index + 1);
273        String value = internStrategy.intern(seq.substring(index, seq.length()));
274        handleAssign(state, element, value);
275        return;
276      case ':':
277        if (charAt(seq, index + 1) == '=') {
278          // Copy
279          index = skipLeadingWhitespace(seq, index + 2);
280          String src = parseHdfName(seq, index);
281          if (src == null) {
282            reportError(state, "Invalid HDF name");
283            return;
284          }
285          if (index + src.length() != seq.length()) {
286            reportError(state, "No characters expected after '{'");
287            return;
288          }
289          handleCopy(state, element, src);
290        } else {
291          // Link
292          index = skipLeadingWhitespace(seq, index + 1);
293          String src = parseHdfName(seq, index);
294          if (src == null) {
295            reportError(state, "Invalid HDF name");
296            return;
297          }
298          if (index + src.length() != seq.length()) {
299            reportError(state, "No characters expected after '{'");
300            return;
301          }
302          handleLink(state, element, src);
303        }
304        return;
305      case '<':
306        if (charAt(seq, index + 1) != '<') {
307          reportError(state, "Expected '<<'");
308        }
309        index = skipLeadingWhitespace(seq, index + 2);
310        String eomMarker = seq.substring(index, seq.length());
311        // TODO: think about moving interning to handleAssign()
312        String multilineValue = internStrategy.intern(parseMultilineValue(state, eomMarker));
313        if (multilineValue == null) {
314          return;
315        }
316        handleAssign(state, element, multilineValue);
317        return;
318      default:
319        reportError(state, "No valid operator");
320        return;
321    }
322  }
323
324  /**
325   * This method parses out an HDF element name and any optional attributes into a caller-supplied
326   * HdfNameAttrs object. It returns a {@code boolean} with whether it succeeded to parse.
327   */
328  private boolean parseHdfNameAttrs(HdfNameAttrs destination, String seq, int index,
329      ParseState state) throws OutOfCharsException {
330    String hdfName = parseHdfName(seq, index);
331    if (hdfName == null) {
332      reportError(state, "Invalid HDF name");
333      return false;
334    }
335    destination.reset(hdfName);
336    index = skipLeadingWhitespace(seq, index + hdfName.length());
337    int end = parseAttributes(seq, index, state, destination);
338    if (end == NO_MATCH) {
339      // Error already reported below.
340      return false;
341    } else {
342      destination.endOfSequence = end;
343      return true;
344    }
345  }
346
347  /**
348   * Parses a valid hdf path name.
349   */
350  private String parseHdfName(String seq, int index) throws OutOfCharsException {
351    int end = index;
352    while (end < seq.length() && isHdfNameChar(charAt(seq, end))) {
353      end++;
354    }
355    if (end == index) {
356      return null;
357    }
358    return internStrategy.intern(seq.substring(index, end));
359  }
360
361  /**
362   * Looks for optional attributes and adds them to the HdfNameAttrs object passed into the method.
363   */
364  private int parseAttributes(String seq, int index, ParseState state, HdfNameAttrs element)
365      throws OutOfCharsException {
366    if (charAt(seq, index) != '[') {
367      // No attributes to parse
368      return index;
369    }
370    index = skipLeadingWhitespace(seq, index + 1);
371
372    // If we don't care about attributes, just skip over them.
373    if (state.ignoreAttributes) {
374      while (charAt(seq, index) != ']') {
375        index++;
376      }
377      return index + 1;
378    }
379
380    boolean first = true;
381    do {
382      if (first) {
383        first = false;
384      } else if (charAt(seq, index) == ',') {
385        index = skipLeadingWhitespace(seq, index + 1);
386      } else {
387        reportError(state, "Error parsing attribute list");
388      }
389      index = parseAttribute(seq, index, state, element);
390      if (index == NO_MATCH) {
391        // reportError called by parseAttribute already.
392        return NO_MATCH;
393      }
394      index = skipLeadingWhitespace(seq, index);
395    } while (charAt(seq, index) != ']');
396    return index + 1;
397  }
398
399  private static final String DEFAULT_ATTR_VALUE = "1";
400
401  /**
402   * Parse out a single HDF attribute. If there is no explicit value, use default value of "1" like
403   * in C clearsilver. Returns NO_MATCH if it fails to parse an attribute.
404   */
405  private int parseAttribute(String seq, int index, ParseState state, HdfNameAttrs element)
406      throws OutOfCharsException {
407    int end = parseAttributeKey(seq, index);
408    if (index == end) {
409      reportError(state, "No valid attribute key");
410      return NO_MATCH;
411    }
412    String attrKey = internStrategy.intern(seq.substring(index, end));
413    index = skipLeadingWhitespace(seq, end);
414    if (charAt(seq, index) != '=') {
415      // No value for this attribute key. Use default value of "1"
416      element.addAttribute(attrKey, DEFAULT_ATTR_VALUE);
417      return index;
418    }
419    // We need to parse out the attribute value.
420    index = skipLeadingWhitespace(seq, index + 1);
421    if (charAt(seq, index) == '"') {
422      index++;
423      StringBuilder sb = new StringBuilder();
424      end = parseQuotedAttributeValue(seq, index, sb);
425      if (end == NO_MATCH) {
426        reportError(state, "Unable to parse quoted attribute value");
427        return NO_MATCH;
428      }
429      String attrValue = internStrategy.intern(sb.toString());
430      element.addAttribute(attrKey, attrValue);
431      end++;
432    } else {
433      // Simple attribute that has no whitespace.
434      String attrValue = parseAttributeValue(seq, index, state);
435      if (attrValue == null || attrValue.length() == 0) {
436        reportError(state, "No attribute for key " + attrKey);
437        return NO_MATCH;
438      }
439
440      attrValue = internStrategy.intern(attrValue);
441      element.addAttribute(attrKey, attrValue);
442      end = index + attrValue.length();
443    }
444    return end;
445  }
446
447  /**
448   * Returns the range in the sequence starting at start that corresponds to a valid attribute key.
449   */
450  private int parseAttributeKey(String seq, int index) throws OutOfCharsException {
451    while (isAlphaNumericChar(charAt(seq, index))) {
452      index++;
453    }
454    return index;
455  }
456
457  /**
458   * Parses a quoted attribute value. Unescapes octal characters and \n, \r, \t, \", etc.
459   */
460  private int parseQuotedAttributeValue(String seq, int index, StringBuilder sb)
461      throws OutOfCharsException {
462    char c;
463    while ((c = charAt(seq, index)) != '"') {
464      if (c == '\\') {
465        // Escaped character. Look for 1 to 3 digits in a row as octal or n,t,r.
466        index++;
467        char next = charAt(seq, index);
468        if (isNumericChar(next)) {
469          // Parse the next 1 to 3 characters if they are digits. Treat it as an octal code.
470          int val = next - '0';
471          if (isNumericChar(charAt(seq, index + 1))) {
472            index++;
473            val = val * 8 + (charAt(seq, index) - '0');
474            if (isNumericChar(charAt(seq, index + 1))) {
475              index++;
476              val = val * 8 + (charAt(seq, index) - '0');
477            }
478          }
479          c = (char) val;
480        } else if (next == 'n') {
481          c = '\n';
482        } else if (next == 't') {
483          c = '\t';
484        } else if (next == 'r') {
485          c = '\r';
486        } else {
487          // Regular escaped char like " or /
488          c = next;
489        }
490      }
491      sb.append(c);
492      index++;
493    }
494    return index;
495  }
496
497  /**
498   * Parses a simple attribute value that cannot have any whitespace or specific punctuation
499   * reserved by the HDF grammar.
500   */
501  private String parseAttributeValue(String seq, int index, ParseState state)
502      throws OutOfCharsException {
503    int end = index;
504    char c = charAt(seq, end);
505    while (c != ',' && c != ']' && c != '"' && !Character.isWhitespace(c)) {
506      end++;
507      c = charAt(seq, end);
508    }
509    return seq.substring(index, end);
510  }
511
512  private String parseMultilineValue(ParseState state, String eomMarker) throws IOException {
513    StringBuilder sb = new StringBuilder(256);
514    String line;
515    while ((line = state.lineReader.readLine()) != null) {
516      if (line.startsWith(eomMarker)
517          && skipLeadingWhitespace(line, eomMarker.length()) == line.length()) {
518        return sb.toString();
519      } else {
520        sb.append(line).append('\n');
521      }
522    }
523    reportError(state, "EOM " + eomMarker + " never found");
524    return null;
525  }
526
527  // //////////////////////////////////////////////////////////////////////////
528  //
529  // Handlers
530
531  private void handleDescend(ParseState state, HdfNameAttrs element) {
532    Data child = handleNodeCreation(state.currentNode, element);
533    state.context.push(state.currentNode);
534    state.currentNode = child;
535  }
536
537  private Data handleNodeCreation(Data node, HdfNameAttrs element) {
538    return element.toData(node);
539  }
540
541  private void handleAssign(ParseState state, HdfNameAttrs element, String value) {
542    // TODO: think about moving interning here
543    Data child = handleNodeCreation(state.currentNode, element);
544    child.setValue(value);
545  }
546
547  private void handleCopy(ParseState state, HdfNameAttrs element, String srcName) {
548    Data child = handleNodeCreation(state.currentNode, element);
549    Data src = state.output.getChild(srcName);
550    if (src != null) {
551      child.setValue(src.getValue());
552    } else {
553      child.setValue("");
554    }
555  }
556
557  private void handleLink(ParseState state, HdfNameAttrs element, String srcName) {
558    Data child = handleNodeCreation(state.currentNode, element);
559    child.setSymlink(state.output.createChild(srcName));
560  }
561
562  private void handleAscend(ParseState state) {
563    if (state.context.isEmpty()) {
564      reportError(state, "Too many '}'");
565      return;
566    }
567    state.currentNode = state.context.pop();
568  }
569
570  private void handleInclude(String seq, ParseState state) throws IOException {
571    String includeFileName = internStrategy.intern(seq);
572
573    // Load the file
574    Reader reader = state.resourceLoader.open(includeFileName);
575    if (reader == null) {
576      reportError(state, "Unable to find file " + includeFileName);
577      return;
578    }
579
580    // Check whether we are in include loop
581    if (!state.includeStack.push(includeFileName)) {
582      reportError(state, createIncludeStackTraceMessage(state.includeStack, includeFileName));
583      return;
584    }
585
586    // Parse the file
587    state.hdfParser.parse(ParseState
588        .createParseStateForIncludedFile(state, includeFileName, reader));
589
590    if (!includeFileName.equals(state.includeStack.pop())) {
591      // Include stack trace is corrupted
592      throw new IllegalStateException("Unable to find on include stack: " + includeFileName);
593    }
594  }
595
596  private String createIncludeStackTraceMessage(UniqueStack<String> includeStack,
597      String includeFileName) {
598    StringBuilder message = new StringBuilder();
599    message.append("File included twice: ");
600    message.append(includeFileName);
601
602    message.append(" Include stack: ");
603    for (String fileName : includeStack) {
604      message.append(fileName);
605      message.append(" -> ");
606    }
607    message.append(includeFileName);
608    return message.toString();
609  }
610
611  // /////////////////////////////////////////////////////////////////////////
612  //
613  // Character values
614
615  private static boolean isNumericChar(char c) {
616    if ('0' <= c && c <= '9') {
617      return true;
618    } else {
619      return false;
620    }
621  }
622
623  private static boolean isAlphaNumericChar(char c) {
624    if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9')) {
625      return true;
626    } else {
627      return false;
628    }
629  }
630
631  private static boolean isHdfNameChar(char c) {
632    if (isAlphaNumericChar(c) || c == '_' || c == '.') {
633      return true;
634    } else {
635      return false;
636    }
637  }
638
639  private static String stripWhitespace(String seq) {
640    int start = skipLeadingWhitespace(seq, 0);
641    int end = seq.length() - 1;
642    while (end > start && Character.isWhitespace(seq.charAt(end))) {
643      --end;
644    }
645    if (start == 0 && end == seq.length() - 1) {
646      return seq;
647    } else {
648      return seq.substring(start, end + 1);
649    }
650  }
651
652  private static int skipLeadingWhitespace(String seq, int index) {
653    while (index < seq.length() && Character.isWhitespace(seq.charAt(index))) {
654      index++;
655    }
656    return index;
657  }
658
659  /**
660   * Determines if a character sequence appears in the given sequence starting at a specified index.
661   *
662   * @param seq the sequence that we want to see if it contains the string match.
663   * @param start the index into seq where we want to check for match
664   * @param match the String we want to look for in the sequence.
665   * @return {@code true} if the string match appears in seq starting at the index start, {@code
666   *         false} otherwise.
667   */
668  private static boolean matches(String seq, int start, String match) {
669    if (seq.length() - start < match.length()) {
670      return false;
671    }
672    for (int i = 0; i < match.length(); i++) {
673      if (match.charAt(i) != seq.charAt(start + i)) {
674        return false;
675      }
676    }
677    return true;
678  }
679
680  /**
681   * Reads the character at the specified index in the given String. Throws an exception to be
682   * caught above if the index is out of range.
683   */
684  private static char charAt(String seq, int index) throws OutOfCharsException {
685    if (0 <= index && index < seq.length()) {
686      return seq.charAt(index);
687    } else {
688      throw new OutOfCharsException();
689    }
690  }
691
692
693  private static void reportError(ParseState state, String errorMessage) {
694    if (state.errorHandler != null) {
695      state.errorHandler.error(state.lineReader.getLineNumber(), state.line, state.parsedFileName,
696          errorMessage);
697    } else {
698      throw new RuntimeException("Parse Error on line " + state.lineReader.getLineNumber() + ": "
699          + errorMessage + " : " + state.line);
700    }
701  }
702}
703