1// Protocol Buffers - Google's data interchange format
2// Copyright 2008 Google Inc.  All rights reserved.
3// https://developers.google.com/protocol-buffers/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are
7// met:
8//
9//     * Redistributions of source code must retain the above copyright
10// notice, this list of conditions and the following disclaimer.
11//     * Redistributions in binary form must reproduce the above
12// copyright notice, this list of conditions and the following disclaimer
13// in the documentation and/or other materials provided with the
14// distribution.
15//     * Neither the name of Google Inc. nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31package com.google.protobuf;
32
33import com.google.protobuf.Descriptors.Descriptor;
34import com.google.protobuf.Descriptors.EnumDescriptor;
35import com.google.protobuf.Descriptors.EnumValueDescriptor;
36import com.google.protobuf.Descriptors.FieldDescriptor;
37
38import java.io.IOException;
39import java.math.BigInteger;
40import java.nio.CharBuffer;
41import java.util.ArrayList;
42import java.util.List;
43import java.util.Locale;
44import java.util.Map;
45import java.util.logging.Logger;
46import java.util.regex.Matcher;
47import java.util.regex.Pattern;
48
49/**
50 * Provide text parsing and formatting support for proto2 instances.
51 * The implementation largely follows google/protobuf/text_format.cc.
52 *
53 * @author wenboz@google.com Wenbo Zhu
54 * @author kenton@google.com Kenton Varda
55 */
56public final class TextFormat {
57  private TextFormat() {}
58
59  private static final Logger logger =
60      Logger.getLogger(TextFormat.class.getName());
61
62  private static final Printer DEFAULT_PRINTER = new Printer();
63  private static final Printer SINGLE_LINE_PRINTER =
64      (new Printer()).setSingleLineMode(true);
65  private static final Printer UNICODE_PRINTER =
66      (new Printer()).setEscapeNonAscii(false);
67
68  /**
69   * Outputs a textual representation of the Protocol Message supplied into
70   * the parameter output. (This representation is the new version of the
71   * classic "ProtocolPrinter" output from the original Protocol Buffer system)
72   */
73  public static void print(
74      final MessageOrBuilder message, final Appendable output)
75      throws IOException {
76    DEFAULT_PRINTER.print(message, new TextGenerator(output));
77  }
78
79  /** Outputs a textual representation of {@code fields} to {@code output}. */
80  public static void print(final UnknownFieldSet fields,
81                           final Appendable output)
82                           throws IOException {
83    DEFAULT_PRINTER.printUnknownFields(fields, new TextGenerator(output));
84  }
85
86  /**
87   * Same as {@code print()}, except that non-ASCII characters are not
88   * escaped.
89   */
90  public static void printUnicode(
91      final MessageOrBuilder message, final Appendable output)
92      throws IOException {
93    UNICODE_PRINTER.print(message, new TextGenerator(output));
94  }
95
96  /**
97   * Same as {@code print()}, except that non-ASCII characters are not
98   * escaped.
99   */
100  public static void printUnicode(final UnknownFieldSet fields,
101                                  final Appendable output)
102                                  throws IOException {
103    UNICODE_PRINTER.printUnknownFields(fields, new TextGenerator(output));
104  }
105
106  /**
107   * Generates a human readable form of this message, useful for debugging and
108   * other purposes, with no newline characters.
109   */
110  public static String shortDebugString(final MessageOrBuilder message) {
111    try {
112      final StringBuilder sb = new StringBuilder();
113      SINGLE_LINE_PRINTER.print(message, new TextGenerator(sb));
114      // Single line mode currently might have an extra space at the end.
115      return sb.toString().trim();
116    } catch (IOException e) {
117      throw new IllegalStateException(e);
118    }
119  }
120
121  /**
122   * Generates a human readable form of the unknown fields, useful for debugging
123   * and other purposes, with no newline characters.
124   */
125  public static String shortDebugString(final UnknownFieldSet fields) {
126    try {
127      final StringBuilder sb = new StringBuilder();
128      SINGLE_LINE_PRINTER.printUnknownFields(fields, new TextGenerator(sb));
129      // Single line mode currently might have an extra space at the end.
130      return sb.toString().trim();
131    } catch (IOException e) {
132      throw new IllegalStateException(e);
133    }
134  }
135
136  /**
137   * Like {@code print()}, but writes directly to a {@code String} and
138   * returns it.
139   */
140  public static String printToString(final MessageOrBuilder message) {
141    try {
142      final StringBuilder text = new StringBuilder();
143      print(message, text);
144      return text.toString();
145    } catch (IOException e) {
146      throw new IllegalStateException(e);
147    }
148  }
149
150  /**
151   * Like {@code print()}, but writes directly to a {@code String} and
152   * returns it.
153   */
154  public static String printToString(final UnknownFieldSet fields) {
155    try {
156      final StringBuilder text = new StringBuilder();
157      print(fields, text);
158      return text.toString();
159    } catch (IOException e) {
160      throw new IllegalStateException(e);
161    }
162  }
163
164  /**
165   * Same as {@code printToString()}, except that non-ASCII characters
166   * in string type fields are not escaped in backslash+octals.
167   */
168  public static String printToUnicodeString(final MessageOrBuilder message) {
169    try {
170      final StringBuilder text = new StringBuilder();
171      UNICODE_PRINTER.print(message, new TextGenerator(text));
172      return text.toString();
173    } catch (IOException e) {
174      throw new IllegalStateException(e);
175    }
176  }
177
178  /**
179   * Same as {@code printToString()}, except that non-ASCII characters
180   * in string type fields are not escaped in backslash+octals.
181   */
182  public static String printToUnicodeString(final UnknownFieldSet fields) {
183    try {
184      final StringBuilder text = new StringBuilder();
185      UNICODE_PRINTER.printUnknownFields(fields, new TextGenerator(text));
186      return text.toString();
187    } catch (IOException e) {
188      throw new IllegalStateException(e);
189    }
190  }
191
192  public static void printField(final FieldDescriptor field,
193                                final Object value,
194                                final Appendable output)
195                                throws IOException {
196    DEFAULT_PRINTER.printField(field, value, new TextGenerator(output));
197  }
198
199  public static String printFieldToString(final FieldDescriptor field,
200                                          final Object value) {
201    try {
202      final StringBuilder text = new StringBuilder();
203      printField(field, value, text);
204      return text.toString();
205    } catch (IOException e) {
206      throw new IllegalStateException(e);
207    }
208  }
209
210  /**
211   * Outputs a textual representation of the value of given field value.
212   *
213   * @param field the descriptor of the field
214   * @param value the value of the field
215   * @param output the output to which to append the formatted value
216   * @throws ClassCastException if the value is not appropriate for the
217   *     given field descriptor
218   * @throws IOException if there is an exception writing to the output
219   */
220  public static void printFieldValue(final FieldDescriptor field,
221                                     final Object value,
222                                     final Appendable output)
223                                     throws IOException {
224    DEFAULT_PRINTER.printFieldValue(field, value, new TextGenerator(output));
225  }
226
227  /**
228   * Outputs a textual representation of the value of an unknown field.
229   *
230   * @param tag the field's tag number
231   * @param value the value of the field
232   * @param output the output to which to append the formatted value
233   * @throws ClassCastException if the value is not appropriate for the
234   *     given field descriptor
235   * @throws IOException if there is an exception writing to the output
236   */
237  public static void printUnknownFieldValue(final int tag,
238                                            final Object value,
239                                            final Appendable output)
240                                            throws IOException {
241    printUnknownFieldValue(tag, value, new TextGenerator(output));
242  }
243
244  private static void printUnknownFieldValue(final int tag,
245                                             final Object value,
246                                             final TextGenerator generator)
247                                             throws IOException {
248    switch (WireFormat.getTagWireType(tag)) {
249      case WireFormat.WIRETYPE_VARINT:
250        generator.print(unsignedToString((Long) value));
251        break;
252      case WireFormat.WIRETYPE_FIXED32:
253        generator.print(
254            String.format((Locale) null, "0x%08x", (Integer) value));
255        break;
256      case WireFormat.WIRETYPE_FIXED64:
257        generator.print(String.format((Locale) null, "0x%016x", (Long) value));
258        break;
259      case WireFormat.WIRETYPE_LENGTH_DELIMITED:
260        generator.print("\"");
261        generator.print(escapeBytes((ByteString) value));
262        generator.print("\"");
263        break;
264      case WireFormat.WIRETYPE_START_GROUP:
265        DEFAULT_PRINTER.printUnknownFields((UnknownFieldSet) value, generator);
266        break;
267      default:
268        throw new IllegalArgumentException("Bad tag: " + tag);
269    }
270  }
271
272  /** Helper class for converting protobufs to text. */
273  private static final class Printer {
274    /** Whether to omit newlines from the output. */
275    boolean singleLineMode = false;
276
277    /** Whether to escape non ASCII characters with backslash and octal. */
278    boolean escapeNonAscii = true;
279
280    private Printer() {}
281
282    /** Setter of singleLineMode */
283    private Printer setSingleLineMode(boolean singleLineMode) {
284      this.singleLineMode = singleLineMode;
285      return this;
286    }
287
288    /** Setter of escapeNonAscii */
289    private Printer setEscapeNonAscii(boolean escapeNonAscii) {
290      this.escapeNonAscii = escapeNonAscii;
291      return this;
292    }
293
294    private void print(
295        final MessageOrBuilder message, final TextGenerator generator)
296        throws IOException {
297      for (Map.Entry<FieldDescriptor, Object> field
298          : message.getAllFields().entrySet()) {
299        printField(field.getKey(), field.getValue(), generator);
300      }
301      printUnknownFields(message.getUnknownFields(), generator);
302    }
303
304    private void printField(final FieldDescriptor field, final Object value,
305        final TextGenerator generator) throws IOException {
306      if (field.isRepeated()) {
307        // Repeated field.  Print each element.
308        for (Object element : (List<?>) value) {
309          printSingleField(field, element, generator);
310        }
311      } else {
312        printSingleField(field, value, generator);
313      }
314    }
315
316    private void printSingleField(final FieldDescriptor field,
317                                  final Object value,
318                                  final TextGenerator generator)
319                                  throws IOException {
320      if (field.isExtension()) {
321        generator.print("[");
322        // We special-case MessageSet elements for compatibility with proto1.
323        if (field.getContainingType().getOptions().getMessageSetWireFormat()
324            && (field.getType() == FieldDescriptor.Type.MESSAGE)
325            && (field.isOptional())
326            // object equality
327            && (field.getExtensionScope() == field.getMessageType())) {
328          generator.print(field.getMessageType().getFullName());
329        } else {
330          generator.print(field.getFullName());
331        }
332        generator.print("]");
333      } else {
334        if (field.getType() == FieldDescriptor.Type.GROUP) {
335          // Groups must be serialized with their original capitalization.
336          generator.print(field.getMessageType().getName());
337        } else {
338          generator.print(field.getName());
339        }
340      }
341
342      if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
343        if (singleLineMode) {
344          generator.print(" { ");
345        } else {
346          generator.print(" {\n");
347          generator.indent();
348        }
349      } else {
350        generator.print(": ");
351      }
352
353      printFieldValue(field, value, generator);
354
355      if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
356        if (singleLineMode) {
357          generator.print("} ");
358        } else {
359          generator.outdent();
360          generator.print("}\n");
361        }
362      } else {
363        if (singleLineMode) {
364          generator.print(" ");
365        } else {
366          generator.print("\n");
367        }
368      }
369    }
370
371    private void printFieldValue(final FieldDescriptor field,
372                                 final Object value,
373                                 final TextGenerator generator)
374                                 throws IOException {
375      switch (field.getType()) {
376        case INT32:
377        case SINT32:
378        case SFIXED32:
379          generator.print(((Integer) value).toString());
380          break;
381
382        case INT64:
383        case SINT64:
384        case SFIXED64:
385          generator.print(((Long) value).toString());
386          break;
387
388        case BOOL:
389          generator.print(((Boolean) value).toString());
390          break;
391
392        case FLOAT:
393          generator.print(((Float) value).toString());
394          break;
395
396        case DOUBLE:
397          generator.print(((Double) value).toString());
398          break;
399
400        case UINT32:
401        case FIXED32:
402          generator.print(unsignedToString((Integer) value));
403          break;
404
405        case UINT64:
406        case FIXED64:
407          generator.print(unsignedToString((Long) value));
408          break;
409
410        case STRING:
411          generator.print("\"");
412          generator.print(escapeNonAscii ?
413              escapeText((String) value) :
414              escapeDoubleQuotesAndBackslashes((String) value));
415          generator.print("\"");
416          break;
417
418        case BYTES:
419          generator.print("\"");
420          if (value instanceof ByteString) {
421            generator.print(escapeBytes((ByteString) value));
422          } else {
423            generator.print(escapeBytes((byte[]) value));
424          }
425          generator.print("\"");
426          break;
427
428        case ENUM:
429          generator.print(((EnumValueDescriptor) value).getName());
430          break;
431
432        case MESSAGE:
433        case GROUP:
434          print((Message) value, generator);
435          break;
436      }
437    }
438
439    private void printUnknownFields(final UnknownFieldSet unknownFields,
440                                    final TextGenerator generator)
441                                    throws IOException {
442      for (Map.Entry<Integer, UnknownFieldSet.Field> entry :
443               unknownFields.asMap().entrySet()) {
444        final int number = entry.getKey();
445        final UnknownFieldSet.Field field = entry.getValue();
446        printUnknownField(number, WireFormat.WIRETYPE_VARINT,
447            field.getVarintList(), generator);
448        printUnknownField(number, WireFormat.WIRETYPE_FIXED32,
449            field.getFixed32List(), generator);
450        printUnknownField(number, WireFormat.WIRETYPE_FIXED64,
451            field.getFixed64List(), generator);
452        printUnknownField(number, WireFormat.WIRETYPE_LENGTH_DELIMITED,
453            field.getLengthDelimitedList(), generator);
454        for (final UnknownFieldSet value : field.getGroupList()) {
455          generator.print(entry.getKey().toString());
456          if (singleLineMode) {
457            generator.print(" { ");
458          } else {
459            generator.print(" {\n");
460            generator.indent();
461          }
462          printUnknownFields(value, generator);
463          if (singleLineMode) {
464            generator.print("} ");
465          } else {
466            generator.outdent();
467            generator.print("}\n");
468          }
469        }
470      }
471    }
472
473    private void printUnknownField(final int number,
474                                   final int wireType,
475                                   final List<?> values,
476                                   final TextGenerator generator)
477                                   throws IOException {
478      for (final Object value : values) {
479        generator.print(String.valueOf(number));
480        generator.print(": ");
481        printUnknownFieldValue(wireType, value, generator);
482        generator.print(singleLineMode ? " " : "\n");
483      }
484    }
485  }
486
487  /** Convert an unsigned 32-bit integer to a string. */
488  public static String unsignedToString(final int value) {
489    if (value >= 0) {
490      return Integer.toString(value);
491    } else {
492      return Long.toString(value & 0x00000000FFFFFFFFL);
493    }
494  }
495
496  /** Convert an unsigned 64-bit integer to a string. */
497  public static String unsignedToString(final long value) {
498    if (value >= 0) {
499      return Long.toString(value);
500    } else {
501      // Pull off the most-significant bit so that BigInteger doesn't think
502      // the number is negative, then set it again using setBit().
503      return BigInteger.valueOf(value & 0x7FFFFFFFFFFFFFFFL)
504                       .setBit(63).toString();
505    }
506  }
507
508  /**
509   * An inner class for writing text to the output stream.
510   */
511  private static final class TextGenerator {
512    private final Appendable output;
513    private final StringBuilder indent = new StringBuilder();
514    private boolean atStartOfLine = true;
515
516    private TextGenerator(final Appendable output) {
517      this.output = output;
518    }
519
520    /**
521     * Indent text by two spaces.  After calling Indent(), two spaces will be
522     * inserted at the beginning of each line of text.  Indent() may be called
523     * multiple times to produce deeper indents.
524     */
525    public void indent() {
526      indent.append("  ");
527    }
528
529    /**
530     * Reduces the current indent level by two spaces, or crashes if the indent
531     * level is zero.
532     */
533    public void outdent() {
534      final int length = indent.length();
535      if (length == 0) {
536        throw new IllegalArgumentException(
537            " Outdent() without matching Indent().");
538      }
539      indent.delete(length - 2, length);
540    }
541
542    /**
543     * Print text to the output stream.
544     */
545    public void print(final CharSequence text) throws IOException {
546      final int size = text.length();
547      int pos = 0;
548
549      for (int i = 0; i < size; i++) {
550        if (text.charAt(i) == '\n') {
551          write(text.subSequence(pos, i + 1));
552          pos = i + 1;
553          atStartOfLine = true;
554        }
555      }
556      write(text.subSequence(pos, size));
557    }
558
559    private void write(final CharSequence data) throws IOException {
560      if (data.length() == 0) {
561        return;
562      }
563      if (atStartOfLine) {
564        atStartOfLine = false;
565        output.append(indent);
566      }
567      output.append(data);
568    }
569  }
570
571  // =================================================================
572  // Parsing
573
574  /**
575   * Represents a stream of tokens parsed from a {@code String}.
576   *
577   * <p>The Java standard library provides many classes that you might think
578   * would be useful for implementing this, but aren't.  For example:
579   *
580   * <ul>
581   * <li>{@code java.io.StreamTokenizer}:  This almost does what we want -- or,
582   *   at least, something that would get us close to what we want -- except
583   *   for one fatal flaw:  It automatically un-escapes strings using Java
584   *   escape sequences, which do not include all the escape sequences we
585   *   need to support (e.g. '\x').
586   * <li>{@code java.util.Scanner}:  This seems like a great way at least to
587   *   parse regular expressions out of a stream (so we wouldn't have to load
588   *   the entire input into a single string before parsing).  Sadly,
589   *   {@code Scanner} requires that tokens be delimited with some delimiter.
590   *   Thus, although the text "foo:" should parse to two tokens ("foo" and
591   *   ":"), {@code Scanner} would recognize it only as a single token.
592   *   Furthermore, {@code Scanner} provides no way to inspect the contents
593   *   of delimiters, making it impossible to keep track of line and column
594   *   numbers.
595   * </ul>
596   *
597   * <p>Luckily, Java's regular expression support does manage to be useful to
598   * us.  (Barely:  We need {@code Matcher.usePattern()}, which is new in
599   * Java 1.5.)  So, we can use that, at least.  Unfortunately, this implies
600   * that we need to have the entire input in one contiguous string.
601   */
602  private static final class Tokenizer {
603    private final CharSequence text;
604    private final Matcher matcher;
605    private String currentToken;
606
607    // The character index within this.text at which the current token begins.
608    private int pos = 0;
609
610    // The line and column numbers of the current token.
611    private int line = 0;
612    private int column = 0;
613
614    // The line and column numbers of the previous token (allows throwing
615    // errors *after* consuming).
616    private int previousLine = 0;
617    private int previousColumn = 0;
618
619    // We use possessive quantifiers (*+ and ++) because otherwise the Java
620    // regex matcher has stack overflows on large inputs.
621    private static final Pattern WHITESPACE =
622      Pattern.compile("(\\s|(#.*$))++", Pattern.MULTILINE);
623    private static final Pattern TOKEN = Pattern.compile(
624      "[a-zA-Z_][0-9a-zA-Z_+-]*+|" +                // an identifier
625      "[.]?[0-9+-][0-9a-zA-Z_.+-]*+|" +             // a number
626      "\"([^\"\n\\\\]|\\\\.)*+(\"|\\\\?$)|" +       // a double-quoted string
627      "\'([^\'\n\\\\]|\\\\.)*+(\'|\\\\?$)",         // a single-quoted string
628      Pattern.MULTILINE);
629
630    private static final Pattern DOUBLE_INFINITY = Pattern.compile(
631      "-?inf(inity)?",
632      Pattern.CASE_INSENSITIVE);
633    private static final Pattern FLOAT_INFINITY = Pattern.compile(
634      "-?inf(inity)?f?",
635      Pattern.CASE_INSENSITIVE);
636    private static final Pattern FLOAT_NAN = Pattern.compile(
637      "nanf?",
638      Pattern.CASE_INSENSITIVE);
639
640    /** Construct a tokenizer that parses tokens from the given text. */
641    private Tokenizer(final CharSequence text) {
642      this.text = text;
643      this.matcher = WHITESPACE.matcher(text);
644      skipWhitespace();
645      nextToken();
646    }
647
648    /** Are we at the end of the input? */
649    public boolean atEnd() {
650      return currentToken.length() == 0;
651    }
652
653    /** Advance to the next token. */
654    public void nextToken() {
655      previousLine = line;
656      previousColumn = column;
657
658      // Advance the line counter to the current position.
659      while (pos < matcher.regionStart()) {
660        if (text.charAt(pos) == '\n') {
661          ++line;
662          column = 0;
663        } else {
664          ++column;
665        }
666        ++pos;
667      }
668
669      // Match the next token.
670      if (matcher.regionStart() == matcher.regionEnd()) {
671        // EOF
672        currentToken = "";
673      } else {
674        matcher.usePattern(TOKEN);
675        if (matcher.lookingAt()) {
676          currentToken = matcher.group();
677          matcher.region(matcher.end(), matcher.regionEnd());
678        } else {
679          // Take one character.
680          currentToken = String.valueOf(text.charAt(pos));
681          matcher.region(pos + 1, matcher.regionEnd());
682        }
683
684        skipWhitespace();
685      }
686    }
687
688    /**
689     * Skip over any whitespace so that the matcher region starts at the next
690     * token.
691     */
692    private void skipWhitespace() {
693      matcher.usePattern(WHITESPACE);
694      if (matcher.lookingAt()) {
695        matcher.region(matcher.end(), matcher.regionEnd());
696      }
697    }
698
699    /**
700     * If the next token exactly matches {@code token}, consume it and return
701     * {@code true}.  Otherwise, return {@code false} without doing anything.
702     */
703    public boolean tryConsume(final String token) {
704      if (currentToken.equals(token)) {
705        nextToken();
706        return true;
707      } else {
708        return false;
709      }
710    }
711
712    /**
713     * If the next token exactly matches {@code token}, consume it.  Otherwise,
714     * throw a {@link ParseException}.
715     */
716    public void consume(final String token) throws ParseException {
717      if (!tryConsume(token)) {
718        throw parseException("Expected \"" + token + "\".");
719      }
720    }
721
722    /**
723     * Returns {@code true} if the next token is an integer, but does
724     * not consume it.
725     */
726    public boolean lookingAtInteger() {
727      if (currentToken.length() == 0) {
728        return false;
729      }
730
731      final char c = currentToken.charAt(0);
732      return ('0' <= c && c <= '9') ||
733             c == '-' || c == '+';
734    }
735
736    /**
737     * Returns {@code true} if the current token's text is equal to that
738     * specified.
739     */
740    public boolean lookingAt(String text) {
741      return currentToken.equals(text);
742    }
743
744    /**
745     * If the next token is an identifier, consume it and return its value.
746     * Otherwise, throw a {@link ParseException}.
747     */
748    public String consumeIdentifier() throws ParseException {
749      for (int i = 0; i < currentToken.length(); i++) {
750        final char c = currentToken.charAt(i);
751        if (('a' <= c && c <= 'z') ||
752            ('A' <= c && c <= 'Z') ||
753            ('0' <= c && c <= '9') ||
754            (c == '_') || (c == '.')) {
755          // OK
756        } else {
757          throw parseException(
758              "Expected identifier. Found '" + currentToken + "'");
759        }
760      }
761
762      final String result = currentToken;
763      nextToken();
764      return result;
765    }
766
767    /**
768     * If the next token is an identifier, consume it and return {@code true}.
769     * Otherwise, return {@code false} without doing anything.
770     */
771    public boolean tryConsumeIdentifier() {
772      try {
773        consumeIdentifier();
774        return true;
775      } catch (ParseException e) {
776        return false;
777      }
778    }
779
780    /**
781     * If the next token is a 32-bit signed integer, consume it and return its
782     * value.  Otherwise, throw a {@link ParseException}.
783     */
784    public int consumeInt32() throws ParseException {
785      try {
786        final int result = parseInt32(currentToken);
787        nextToken();
788        return result;
789      } catch (NumberFormatException e) {
790        throw integerParseException(e);
791      }
792    }
793
794    /**
795     * If the next token is a 32-bit unsigned integer, consume it and return its
796     * value.  Otherwise, throw a {@link ParseException}.
797     */
798    public int consumeUInt32() throws ParseException {
799      try {
800        final int result = parseUInt32(currentToken);
801        nextToken();
802        return result;
803      } catch (NumberFormatException e) {
804        throw integerParseException(e);
805      }
806    }
807
808    /**
809     * If the next token is a 64-bit signed integer, consume it and return its
810     * value.  Otherwise, throw a {@link ParseException}.
811     */
812    public long consumeInt64() throws ParseException {
813      try {
814        final long result = parseInt64(currentToken);
815        nextToken();
816        return result;
817      } catch (NumberFormatException e) {
818        throw integerParseException(e);
819      }
820    }
821
822    /**
823     * If the next token is a 64-bit signed integer, consume it and return
824     * {@code true}.  Otherwise, return {@code false} without doing anything.
825     */
826    public boolean tryConsumeInt64() {
827      try {
828        consumeInt64();
829        return true;
830      } catch (ParseException e) {
831        return false;
832      }
833    }
834
835    /**
836     * If the next token is a 64-bit unsigned integer, consume it and return its
837     * value.  Otherwise, throw a {@link ParseException}.
838     */
839    public long consumeUInt64() throws ParseException {
840      try {
841        final long result = parseUInt64(currentToken);
842        nextToken();
843        return result;
844      } catch (NumberFormatException e) {
845        throw integerParseException(e);
846      }
847    }
848
849    /**
850     * If the next token is a 64-bit unsigned integer, consume it and return
851     * {@code true}.  Otherwise, return {@code false} without doing anything.
852     */
853    public boolean tryConsumeUInt64() {
854      try {
855        consumeUInt64();
856        return true;
857      } catch (ParseException e) {
858        return false;
859      }
860    }
861
862    /**
863     * If the next token is a double, consume it and return its value.
864     * Otherwise, throw a {@link ParseException}.
865     */
866    public double consumeDouble() throws ParseException {
867      // We need to parse infinity and nan separately because
868      // Double.parseDouble() does not accept "inf", "infinity", or "nan".
869      if (DOUBLE_INFINITY.matcher(currentToken).matches()) {
870        final boolean negative = currentToken.startsWith("-");
871        nextToken();
872        return negative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
873      }
874      if (currentToken.equalsIgnoreCase("nan")) {
875        nextToken();
876        return Double.NaN;
877      }
878      try {
879        final double result = Double.parseDouble(currentToken);
880        nextToken();
881        return result;
882      } catch (NumberFormatException e) {
883        throw floatParseException(e);
884      }
885    }
886
887    /**
888     * If the next token is a double, consume it and return {@code true}.
889     * Otherwise, return {@code false} without doing anything.
890     */
891    public boolean tryConsumeDouble() {
892      try {
893        consumeDouble();
894        return true;
895      } catch (ParseException e) {
896        return false;
897      }
898    }
899
900    /**
901     * If the next token is a float, consume it and return its value.
902     * Otherwise, throw a {@link ParseException}.
903     */
904    public float consumeFloat() throws ParseException {
905      // We need to parse infinity and nan separately because
906      // Float.parseFloat() does not accept "inf", "infinity", or "nan".
907      if (FLOAT_INFINITY.matcher(currentToken).matches()) {
908        final boolean negative = currentToken.startsWith("-");
909        nextToken();
910        return negative ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
911      }
912      if (FLOAT_NAN.matcher(currentToken).matches()) {
913        nextToken();
914        return Float.NaN;
915      }
916      try {
917        final float result = Float.parseFloat(currentToken);
918        nextToken();
919        return result;
920      } catch (NumberFormatException e) {
921        throw floatParseException(e);
922      }
923    }
924
925    /**
926     * If the next token is a float, consume it and return {@code true}.
927     * Otherwise, return {@code false} without doing anything.
928     */
929    public boolean tryConsumeFloat() {
930      try {
931        consumeFloat();
932        return true;
933      } catch (ParseException e) {
934        return false;
935      }
936    }
937
938    /**
939     * If the next token is a boolean, consume it and return its value.
940     * Otherwise, throw a {@link ParseException}.
941     */
942    public boolean consumeBoolean() throws ParseException {
943      if (currentToken.equals("true") ||
944          currentToken.equals("t") ||
945          currentToken.equals("1")) {
946        nextToken();
947        return true;
948      } else if (currentToken.equals("false") ||
949                 currentToken.equals("f") ||
950                 currentToken.equals("0")) {
951        nextToken();
952        return false;
953      } else {
954        throw parseException("Expected \"true\" or \"false\".");
955      }
956    }
957
958    /**
959     * If the next token is a string, consume it and return its (unescaped)
960     * value.  Otherwise, throw a {@link ParseException}.
961     */
962    public String consumeString() throws ParseException {
963      return consumeByteString().toStringUtf8();
964    }
965
966    /**
967     * If the next token is a string, consume it and return true.  Otherwise,
968     * return false.
969     */
970    public boolean tryConsumeString() {
971      try {
972        consumeString();
973        return true;
974      } catch (ParseException e) {
975        return false;
976      }
977    }
978
979    /**
980     * If the next token is a string, consume it, unescape it as a
981     * {@link ByteString}, and return it.  Otherwise, throw a
982     * {@link ParseException}.
983     */
984    public ByteString consumeByteString() throws ParseException {
985      List<ByteString> list = new ArrayList<ByteString>();
986      consumeByteString(list);
987      while (currentToken.startsWith("'") || currentToken.startsWith("\"")) {
988        consumeByteString(list);
989      }
990      return ByteString.copyFrom(list);
991    }
992
993    /**
994     * Like {@link #consumeByteString()} but adds each token of the string to
995     * the given list.  String literals (whether bytes or text) may come in
996     * multiple adjacent tokens which are automatically concatenated, like in
997     * C or Python.
998     */
999    private void consumeByteString(List<ByteString> list)
1000        throws ParseException {
1001      final char quote = currentToken.length() > 0 ? currentToken.charAt(0)
1002                                                   : '\0';
1003      if (quote != '\"' && quote != '\'') {
1004        throw parseException("Expected string.");
1005      }
1006
1007      if (currentToken.length() < 2 ||
1008          currentToken.charAt(currentToken.length() - 1) != quote) {
1009        throw parseException("String missing ending quote.");
1010      }
1011
1012      try {
1013        final String escaped =
1014            currentToken.substring(1, currentToken.length() - 1);
1015        final ByteString result = unescapeBytes(escaped);
1016        nextToken();
1017        list.add(result);
1018      } catch (InvalidEscapeSequenceException e) {
1019        throw parseException(e.getMessage());
1020      }
1021    }
1022
1023    /**
1024     * Returns a {@link ParseException} with the current line and column
1025     * numbers in the description, suitable for throwing.
1026     */
1027    public ParseException parseException(final String description) {
1028      // Note:  People generally prefer one-based line and column numbers.
1029      return new ParseException(
1030        line + 1, column + 1, description);
1031    }
1032
1033    /**
1034     * Returns a {@link ParseException} with the line and column numbers of
1035     * the previous token in the description, suitable for throwing.
1036     */
1037    public ParseException parseExceptionPreviousToken(
1038        final String description) {
1039      // Note:  People generally prefer one-based line and column numbers.
1040      return new ParseException(
1041        previousLine + 1, previousColumn + 1, description);
1042    }
1043
1044    /**
1045     * Constructs an appropriate {@link ParseException} for the given
1046     * {@code NumberFormatException} when trying to parse an integer.
1047     */
1048    private ParseException integerParseException(
1049        final NumberFormatException e) {
1050      return parseException("Couldn't parse integer: " + e.getMessage());
1051    }
1052
1053    /**
1054     * Constructs an appropriate {@link ParseException} for the given
1055     * {@code NumberFormatException} when trying to parse a float or double.
1056     */
1057    private ParseException floatParseException(final NumberFormatException e) {
1058      return parseException("Couldn't parse number: " + e.getMessage());
1059    }
1060  }
1061
1062  /** Thrown when parsing an invalid text format message. */
1063  public static class ParseException extends IOException {
1064    private static final long serialVersionUID = 3196188060225107702L;
1065
1066    private final int line;
1067    private final int column;
1068
1069    /** Create a new instance, with -1 as the line and column numbers. */
1070    public ParseException(final String message) {
1071      this(-1, -1, message);
1072    }
1073
1074    /**
1075     * Create a new instance
1076     *
1077     * @param line the line number where the parse error occurred,
1078     * using 1-offset.
1079     * @param column the column number where the parser error occurred,
1080     * using 1-offset.
1081     */
1082    public ParseException(final int line, final int column,
1083        final String message) {
1084      super(Integer.toString(line) + ":" + column + ": " + message);
1085      this.line = line;
1086      this.column = column;
1087    }
1088
1089    /**
1090     * Return the line where the parse exception occurred, or -1 when
1091     * none is provided. The value is specified as 1-offset, so the first
1092     * line is line 1.
1093     */
1094    public int getLine() {
1095      return line;
1096    }
1097
1098    /**
1099     * Return the column where the parse exception occurred, or -1 when
1100     * none is provided. The value is specified as 1-offset, so the first
1101     * line is line 1.
1102     */
1103    public int getColumn() {
1104      return column;
1105    }
1106  }
1107
  // Shared default parser, built once from a Parser.Builder left at its
  // default settings.
  private static final Parser PARSER = Parser.newBuilder().build();

  /**
   * Return a {@link Parser} instance which can parse text-format
   * messages. The returned instance is thread-safe.
   *
   * @return the shared default parser; every call returns the same instance
   */
  public static Parser getParser() {
    return PARSER;
  }
1117
1118  /**
1119   * Parse a text-format message from {@code input} and merge the contents
1120   * into {@code builder}.
1121   */
1122  public static void merge(final Readable input,
1123                           final Message.Builder builder)
1124                           throws IOException {
1125    PARSER.merge(input, builder);
1126  }
1127
1128  /**
1129   * Parse a text-format message from {@code input} and merge the contents
1130   * into {@code builder}.
1131   */
1132  public static void merge(final CharSequence input,
1133                           final Message.Builder builder)
1134                           throws ParseException {
1135    PARSER.merge(input, builder);
1136  }
1137
1138  /**
1139   * Parse a text-format message from {@code input} and merge the contents
1140   * into {@code builder}.  Extensions will be recognized if they are
1141   * registered in {@code extensionRegistry}.
1142   */
1143  public static void merge(final Readable input,
1144                           final ExtensionRegistry extensionRegistry,
1145                           final Message.Builder builder)
1146                           throws IOException {
1147    PARSER.merge(input, extensionRegistry, builder);
1148  }
1149
1150
1151  /**
1152   * Parse a text-format message from {@code input} and merge the contents
1153   * into {@code builder}.  Extensions will be recognized if they are
1154   * registered in {@code extensionRegistry}.
1155   */
1156  public static void merge(final CharSequence input,
1157                           final ExtensionRegistry extensionRegistry,
1158                           final Message.Builder builder)
1159                           throws ParseException {
1160    PARSER.merge(input, extensionRegistry, builder);
1161  }
1162
1163
1164  /**
1165   * Parser for text-format proto2 instances. This class is thread-safe.
1166   * The implementation largely follows google/protobuf/text_format.cc.
1167   *
1168   * <p>Use {@link TextFormat#getParser()} to obtain the default parser, or
1169   * {@link Builder} to control the parser behavior.
1170   */
1171  public static class Parser {
    /**
     * Determines if repeated values for non-repeated fields and
     * oneofs are permitted. For example, given required/optional field "foo"
     * and a oneof containing "baz" and "qux", the policy governs each of the
     * following situations:
     * <ul>
     * <li>"foo: 1 foo: 2"
     * <li>"baz: 1 qux: 2"
     * <li>merging "foo: 2" into a proto in which foo is already set, or
     * <li>merging "qux: 2" into a proto in which baz is already set.
     * </ul>
     */
    public enum SingularOverwritePolicy {
      /** The last value is retained. */
      ALLOW_SINGULAR_OVERWRITES,
      /** An error is issued: parsing fails with a {@link ParseException}. */
      FORBID_SINGULAR_OVERWRITES
    }
1189
    // When true, fields and extensions that cannot be resolved are logged as
    // warnings and skipped; when false they cause a ParseException.
    private final boolean allowUnknownFields;
    // Governs what happens when a singular field or oneof is set twice.
    private final SingularOverwritePolicy singularOverwritePolicy;

    /**
     * Creates a parser with the given settings. Private: instances are
     * obtained through {@link Builder#build()}.
     *
     * @param allowUnknownFields whether unresolved fields and extensions are
     *     skipped with a warning instead of failing the parse
     * @param singularOverwritePolicy how repeated assignments to singular
     *     fields and oneofs are handled
     */
    private Parser(boolean allowUnknownFields,
        SingularOverwritePolicy singularOverwritePolicy) {
      this.allowUnknownFields = allowUnknownFields;
      this.singularOverwritePolicy = singularOverwritePolicy;
    }
1198
    /**
     * Returns a new instance of {@link Builder}.
     *
     * @return a fresh builder holding the default parser settings
     */
    public static Builder newBuilder() {
      return new Builder();
    }
1205
1206    /**
1207     * Builder that can be used to obtain new instances of {@link Parser}.
1208     */
1209    public static class Builder {
1210      private boolean allowUnknownFields = false;
1211      private SingularOverwritePolicy singularOverwritePolicy =
1212          SingularOverwritePolicy.ALLOW_SINGULAR_OVERWRITES;
1213
1214      /**
1215       * Sets parser behavior when a non-repeated field appears more than once.
1216       */
1217      public Builder setSingularOverwritePolicy(SingularOverwritePolicy p) {
1218        this.singularOverwritePolicy = p;
1219        return this;
1220      }
1221
1222      public Parser build() {
1223        return new Parser(allowUnknownFields, singularOverwritePolicy);
1224      }
1225    }
1226
1227    /**
1228     * Parse a text-format message from {@code input} and merge the contents
1229     * into {@code builder}.
1230     */
1231    public void merge(final Readable input,
1232                      final Message.Builder builder)
1233                      throws IOException {
1234      merge(input, ExtensionRegistry.getEmptyRegistry(), builder);
1235    }
1236
1237    /**
1238     * Parse a text-format message from {@code input} and merge the contents
1239     * into {@code builder}.
1240     */
1241    public void merge(final CharSequence input,
1242                      final Message.Builder builder)
1243                      throws ParseException {
1244      merge(input, ExtensionRegistry.getEmptyRegistry(), builder);
1245    }
1246
1247    /**
1248     * Parse a text-format message from {@code input} and merge the contents
1249     * into {@code builder}.  Extensions will be recognized if they are
1250     * registered in {@code extensionRegistry}.
1251     */
1252    public void merge(final Readable input,
1253                      final ExtensionRegistry extensionRegistry,
1254                      final Message.Builder builder)
1255                      throws IOException {
1256      // Read the entire input to a String then parse that.
1257
1258      // If StreamTokenizer were not quite so crippled, or if there were a kind
1259      // of Reader that could read in chunks that match some particular regex,
1260      // or if we wanted to write a custom Reader to tokenize our stream, then
1261      // we would not have to read to one big String.  Alas, none of these is
1262      // the case.  Oh well.
1263
1264      merge(toStringBuilder(input), extensionRegistry, builder);
1265    }
1266
1267
1268    private static final int BUFFER_SIZE = 4096;
1269
1270    // TODO(chrisn): See if working around java.io.Reader#read(CharBuffer)
1271    // overhead is worthwhile
1272    private static StringBuilder toStringBuilder(final Readable input)
1273        throws IOException {
1274      final StringBuilder text = new StringBuilder();
1275      final CharBuffer buffer = CharBuffer.allocate(BUFFER_SIZE);
1276      while (true) {
1277        final int n = input.read(buffer);
1278        if (n == -1) {
1279          break;
1280        }
1281        buffer.flip();
1282        text.append(buffer, 0, n);
1283      }
1284      return text;
1285    }
1286
1287    /**
1288     * Parse a text-format message from {@code input} and merge the contents
1289     * into {@code builder}.  Extensions will be recognized if they are
1290     * registered in {@code extensionRegistry}.
1291     */
1292    public void merge(final CharSequence input,
1293                      final ExtensionRegistry extensionRegistry,
1294                      final Message.Builder builder)
1295                      throws ParseException {
1296      final Tokenizer tokenizer = new Tokenizer(input);
1297      MessageReflection.BuilderAdapter target =
1298          new MessageReflection.BuilderAdapter(builder);
1299
1300      while (!tokenizer.atEnd()) {
1301        mergeField(tokenizer, extensionRegistry, target);
1302      }
1303    }
1304
1305
1306    /**
1307     * Parse a single field from {@code tokenizer} and merge it into
1308     * {@code builder}.
1309     */
1310    private void mergeField(final Tokenizer tokenizer,
1311                            final ExtensionRegistry extensionRegistry,
1312                            final MessageReflection.MergeTarget target)
1313                            throws ParseException {
1314      FieldDescriptor field = null;
1315      final Descriptor type = target.getDescriptorForType();
1316      ExtensionRegistry.ExtensionInfo extension = null;
1317
1318      if (tokenizer.tryConsume("[")) {
1319        // An extension.
1320        final StringBuilder name =
1321            new StringBuilder(tokenizer.consumeIdentifier());
1322        while (tokenizer.tryConsume(".")) {
1323          name.append('.');
1324          name.append(tokenizer.consumeIdentifier());
1325        }
1326
1327        extension = target.findExtensionByName(
1328            extensionRegistry, name.toString());
1329
1330        if (extension == null) {
1331          if (!allowUnknownFields) {
1332            throw tokenizer.parseExceptionPreviousToken(
1333              "Extension \"" + name + "\" not found in the ExtensionRegistry.");
1334          } else {
1335            logger.warning(
1336              "Extension \"" + name + "\" not found in the ExtensionRegistry.");
1337          }
1338        } else {
1339          if (extension.descriptor.getContainingType() != type) {
1340            throw tokenizer.parseExceptionPreviousToken(
1341              "Extension \"" + name + "\" does not extend message type \"" +
1342              type.getFullName() + "\".");
1343          }
1344          field = extension.descriptor;
1345        }
1346
1347        tokenizer.consume("]");
1348      } else {
1349        final String name = tokenizer.consumeIdentifier();
1350        field = type.findFieldByName(name);
1351
1352        // Group names are expected to be capitalized as they appear in the
1353        // .proto file, which actually matches their type names, not their field
1354        // names.
1355        if (field == null) {
1356          // Explicitly specify US locale so that this code does not break when
1357          // executing in Turkey.
1358          final String lowerName = name.toLowerCase(Locale.US);
1359          field = type.findFieldByName(lowerName);
1360          // If the case-insensitive match worked but the field is NOT a group,
1361          if (field != null && field.getType() != FieldDescriptor.Type.GROUP) {
1362            field = null;
1363          }
1364        }
1365        // Again, special-case group names as described above.
1366        if (field != null && field.getType() == FieldDescriptor.Type.GROUP &&
1367            !field.getMessageType().getName().equals(name)) {
1368          field = null;
1369        }
1370
1371        if (field == null) {
1372          if (!allowUnknownFields) {
1373            throw tokenizer.parseExceptionPreviousToken(
1374              "Message type \"" + type.getFullName() +
1375              "\" has no field named \"" + name + "\".");
1376          } else {
1377            logger.warning(
1378              "Message type \"" + type.getFullName() +
1379              "\" has no field named \"" + name + "\".");
1380          }
1381        }
1382      }
1383
1384      // Skips unknown fields.
1385      if (field == null) {
1386        // Try to guess the type of this field.
1387        // If this field is not a message, there should be a ":" between the
1388        // field name and the field value and also the field value should not
1389        // start with "{" or "<" which indicates the begining of a message body.
1390        // If there is no ":" or there is a "{" or "<" after ":", this field has
1391        // to be a message or the input is ill-formed.
1392        if (tokenizer.tryConsume(":") && !tokenizer.lookingAt("{") &&
1393            !tokenizer.lookingAt("<")) {
1394          skipFieldValue(tokenizer);
1395        } else {
1396          skipFieldMessage(tokenizer);
1397        }
1398        return;
1399      }
1400
1401      // Handle potential ':'.
1402      if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
1403        tokenizer.tryConsume(":");  // optional
1404      } else {
1405        tokenizer.consume(":");  // required
1406      }
1407      // Support specifying repeated field values as a comma-separated list.
1408      // Ex."foo: [1, 2, 3]"
1409      if (field.isRepeated() && tokenizer.tryConsume("[")) {
1410        while (true) {
1411          consumeFieldValue(tokenizer, extensionRegistry, target, field, extension);
1412          if (tokenizer.tryConsume("]")) {
1413            // End of list.
1414            break;
1415          }
1416          tokenizer.consume(",");
1417        }
1418      } else {
1419        consumeFieldValue(tokenizer, extensionRegistry, target, field, extension);
1420      }
1421    }
1422
1423    /**
1424     * Parse a single field value from {@code tokenizer} and merge it into
1425     * {@code builder}.
1426     */
1427    private void consumeFieldValue(
1428        final Tokenizer tokenizer,
1429        final ExtensionRegistry extensionRegistry,
1430        final MessageReflection.MergeTarget target,
1431        final FieldDescriptor field,
1432        final ExtensionRegistry.ExtensionInfo extension)
1433        throws ParseException {
1434      Object value = null;
1435
1436      if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
1437        final String endToken;
1438        if (tokenizer.tryConsume("<")) {
1439          endToken = ">";
1440        } else {
1441          tokenizer.consume("{");
1442          endToken = "}";
1443        }
1444
1445        final MessageReflection.MergeTarget subField;
1446        subField = target.newMergeTargetForField(field,
1447            (extension == null) ? null : extension.defaultInstance);
1448
1449        while (!tokenizer.tryConsume(endToken)) {
1450          if (tokenizer.atEnd()) {
1451            throw tokenizer.parseException(
1452              "Expected \"" + endToken + "\".");
1453          }
1454          mergeField(tokenizer, extensionRegistry, subField);
1455        }
1456
1457        value = subField.finish();
1458
1459      } else {
1460        switch (field.getType()) {
1461          case INT32:
1462          case SINT32:
1463          case SFIXED32:
1464            value = tokenizer.consumeInt32();
1465            break;
1466
1467          case INT64:
1468          case SINT64:
1469          case SFIXED64:
1470            value = tokenizer.consumeInt64();
1471            break;
1472
1473          case UINT32:
1474          case FIXED32:
1475            value = tokenizer.consumeUInt32();
1476            break;
1477
1478          case UINT64:
1479          case FIXED64:
1480            value = tokenizer.consumeUInt64();
1481            break;
1482
1483          case FLOAT:
1484            value = tokenizer.consumeFloat();
1485            break;
1486
1487          case DOUBLE:
1488            value = tokenizer.consumeDouble();
1489            break;
1490
1491          case BOOL:
1492            value = tokenizer.consumeBoolean();
1493            break;
1494
1495          case STRING:
1496            value = tokenizer.consumeString();
1497            break;
1498
1499          case BYTES:
1500            value = tokenizer.consumeByteString();
1501            break;
1502
1503          case ENUM:
1504            final EnumDescriptor enumType = field.getEnumType();
1505
1506            if (tokenizer.lookingAtInteger()) {
1507              final int number = tokenizer.consumeInt32();
1508              value = enumType.findValueByNumber(number);
1509              if (value == null) {
1510                throw tokenizer.parseExceptionPreviousToken(
1511                  "Enum type \"" + enumType.getFullName() +
1512                  "\" has no value with number " + number + '.');
1513              }
1514            } else {
1515              final String id = tokenizer.consumeIdentifier();
1516              value = enumType.findValueByName(id);
1517              if (value == null) {
1518                throw tokenizer.parseExceptionPreviousToken(
1519                  "Enum type \"" + enumType.getFullName() +
1520                  "\" has no value named \"" + id + "\".");
1521              }
1522            }
1523
1524            break;
1525
1526          case MESSAGE:
1527          case GROUP:
1528            throw new RuntimeException("Can't get here.");
1529        }
1530      }
1531
1532      if (field.isRepeated()) {
1533        target.addRepeatedField(field, value);
1534      } else if ((singularOverwritePolicy
1535              == SingularOverwritePolicy.FORBID_SINGULAR_OVERWRITES)
1536          && target.hasField(field)) {
1537        throw tokenizer.parseExceptionPreviousToken("Non-repeated field \""
1538            + field.getFullName() + "\" cannot be overwritten.");
1539      } else if ((singularOverwritePolicy
1540              == SingularOverwritePolicy.FORBID_SINGULAR_OVERWRITES)
1541          && field.getContainingOneof() != null
1542          && target.hasOneof(field.getContainingOneof())) {
1543        Descriptors.OneofDescriptor oneof = field.getContainingOneof();
1544        throw tokenizer.parseExceptionPreviousToken("Field \""
1545            + field.getFullName() + "\" is specified along with field \""
1546            + target.getOneofFieldDescriptor(oneof).getFullName()
1547            + "\", another member of oneof \"" + oneof.getName() + "\".");
1548      } else {
1549        target.setField(field, value);
1550      }
1551    }
1552
1553    /**
1554     * Skips the next field including the field's name and value.
1555     */
1556    private void skipField(Tokenizer tokenizer) throws ParseException {
1557      if (tokenizer.tryConsume("[")) {
1558        // Extension name.
1559        do {
1560          tokenizer.consumeIdentifier();
1561        } while (tokenizer.tryConsume("."));
1562        tokenizer.consume("]");
1563      } else {
1564        tokenizer.consumeIdentifier();
1565      }
1566
1567      // Try to guess the type of this field.
1568      // If this field is not a message, there should be a ":" between the
1569      // field name and the field value and also the field value should not
1570      // start with "{" or "<" which indicates the begining of a message body.
1571      // If there is no ":" or there is a "{" or "<" after ":", this field has
1572      // to be a message or the input is ill-formed.
1573      if (tokenizer.tryConsume(":") && !tokenizer.lookingAt("<") &&
1574          !tokenizer.lookingAt("{")) {
1575        skipFieldValue(tokenizer);
1576      } else {
1577        skipFieldMessage(tokenizer);
1578      }
1579      // For historical reasons, fields may optionally be separated by commas or
1580      // semicolons.
1581      if (!tokenizer.tryConsume(";")) {
1582        tokenizer.tryConsume(",");
1583      }
1584    }
1585
1586    /**
1587     * Skips the whole body of a message including the beginning delimeter and
1588     * the ending delimeter.
1589     */
1590    private void skipFieldMessage(Tokenizer tokenizer) throws ParseException {
1591      final String delimiter;
1592      if (tokenizer.tryConsume("<")) {
1593        delimiter = ">";
1594      } else {
1595        tokenizer.consume("{");
1596        delimiter = "}";
1597      }
1598      while (!tokenizer.lookingAt(">") && !tokenizer.lookingAt("}")) {
1599        skipField(tokenizer);
1600      }
1601      tokenizer.consume(delimiter);
1602    }
1603
1604    /**
1605     * Skips a field value.
1606     */
1607    private void skipFieldValue(Tokenizer tokenizer) throws ParseException {
1608      if (tokenizer.tryConsumeString()) {
1609        while (tokenizer.tryConsumeString()) {}
1610        return;
1611      }
1612      if (!tokenizer.tryConsumeIdentifier() &&  // includes enum & boolean
1613          !tokenizer.tryConsumeInt64() &&       // includes int32
1614          !tokenizer.tryConsumeUInt64() &&      // includes uint32
1615          !tokenizer.tryConsumeDouble() &&
1616          !tokenizer.tryConsumeFloat()) {
1617        throw tokenizer.parseException(
1618            "Invalid field value: " + tokenizer.currentToken);
1619      }
1620    }
1621  }
1622
1623  // =================================================================
1624  // Utility functions
1625  //
1626  // Some of these methods are package-private because Descriptors.java uses
1627  // them.
1628
  /**
   * Minimal read-only view of a sequence of bytes. It lets the escaping code
   * treat {@link ByteString} and {@code byte[]} inputs uniformly.
   */
  private interface ByteSequence {
    /** Returns the number of bytes in the sequence. */
    int size();
    /** Returns the byte at the given zero-based {@code offset}. */
    byte byteAt(int offset);
  }
1633
1634  /**
1635   * Escapes bytes in the format used in protocol buffer text format, which
1636   * is the same as the format used for C string literals.  All bytes
1637   * that are not printable 7-bit ASCII characters are escaped, as well as
1638   * backslash, single-quote, and double-quote characters.  Characters for
1639   * which no defined short-hand escape sequence is defined will be escaped
1640   * using 3-digit octal sequences.
1641   */
1642  private static String escapeBytes(final ByteSequence input) {
1643    final StringBuilder builder = new StringBuilder(input.size());
1644    for (int i = 0; i < input.size(); i++) {
1645      final byte b = input.byteAt(i);
1646      switch (b) {
1647        // Java does not recognize \a or \v, apparently.
1648        case 0x07: builder.append("\\a" ); break;
1649        case '\b': builder.append("\\b" ); break;
1650        case '\f': builder.append("\\f" ); break;
1651        case '\n': builder.append("\\n" ); break;
1652        case '\r': builder.append("\\r" ); break;
1653        case '\t': builder.append("\\t" ); break;
1654        case 0x0b: builder.append("\\v" ); break;
1655        case '\\': builder.append("\\\\"); break;
1656        case '\'': builder.append("\\\'"); break;
1657        case '"' : builder.append("\\\""); break;
1658        default:
1659          // Note:  Bytes with the high-order bit set should be escaped.  Since
1660          //   bytes are signed, such bytes will compare less than 0x20, hence
1661          //   the following line is correct.
1662          if (b >= 0x20) {
1663            builder.append((char) b);
1664          } else {
1665            builder.append('\\');
1666            builder.append((char) ('0' + ((b >>> 6) & 3)));
1667            builder.append((char) ('0' + ((b >>> 3) & 7)));
1668            builder.append((char) ('0' + (b & 7)));
1669          }
1670          break;
1671      }
1672    }
1673    return builder.toString();
1674  }
1675
1676  /**
1677   * Escapes bytes in the format used in protocol buffer text format, which
1678   * is the same as the format used for C string literals.  All bytes
1679   * that are not printable 7-bit ASCII characters are escaped, as well as
1680   * backslash, single-quote, and double-quote characters.  Characters for
1681   * which no defined short-hand escape sequence is defined will be escaped
1682   * using 3-digit octal sequences.
1683   */
1684  static String escapeBytes(final ByteString input) {
1685    return escapeBytes(new ByteSequence() {
1686      public int size() {
1687        return input.size();
1688      }
1689      public byte byteAt(int offset) {
1690        return input.byteAt(offset);
1691      }
1692    });
1693  }
1694
1695  /**
1696   * Like {@link #escapeBytes(ByteString)}, but used for byte array.
1697   */
1698  static String escapeBytes(final byte[] input) {
1699    return escapeBytes(new ByteSequence() {
1700      public int size() {
1701        return input.length;
1702      }
1703      public byte byteAt(int offset) {
1704        return input[offset];
1705      }
1706    });
1707  }
1708
1709  /**
1710   * Un-escape a byte sequence as escaped using
1711   * {@link #escapeBytes(ByteString)}.  Two-digit hex escapes (starting with
1712   * "\x") are also recognized.
1713   */
1714  static ByteString unescapeBytes(final CharSequence charString)
1715      throws InvalidEscapeSequenceException {
1716    // First convert the Java character sequence to UTF-8 bytes.
1717    ByteString input = ByteString.copyFromUtf8(charString.toString());
1718    // Then unescape certain byte sequences introduced by ASCII '\\'.  The valid
1719    // escapes can all be expressed with ASCII characters, so it is safe to
1720    // operate on bytes here.
1721    //
1722    // Unescaping the input byte array will result in a byte sequence that's no
1723    // longer than the input.  That's because each escape sequence is between
1724    // two and four bytes long and stands for a single byte.
1725    final byte[] result = new byte[input.size()];
1726    int pos = 0;
1727    for (int i = 0; i < input.size(); i++) {
1728      byte c = input.byteAt(i);
1729      if (c == '\\') {
1730        if (i + 1 < input.size()) {
1731          ++i;
1732          c = input.byteAt(i);
1733          if (isOctal(c)) {
1734            // Octal escape.
1735            int code = digitValue(c);
1736            if (i + 1 < input.size() && isOctal(input.byteAt(i + 1))) {
1737              ++i;
1738              code = code * 8 + digitValue(input.byteAt(i));
1739            }
1740            if (i + 1 < input.size() && isOctal(input.byteAt(i + 1))) {
1741              ++i;
1742              code = code * 8 + digitValue(input.byteAt(i));
1743            }
1744            // TODO: Check that 0 <= code && code <= 0xFF.
1745            result[pos++] = (byte)code;
1746          } else {
1747            switch (c) {
1748              case 'a' : result[pos++] = 0x07; break;
1749              case 'b' : result[pos++] = '\b'; break;
1750              case 'f' : result[pos++] = '\f'; break;
1751              case 'n' : result[pos++] = '\n'; break;
1752              case 'r' : result[pos++] = '\r'; break;
1753              case 't' : result[pos++] = '\t'; break;
1754              case 'v' : result[pos++] = 0x0b; break;
1755              case '\\': result[pos++] = '\\'; break;
1756              case '\'': result[pos++] = '\''; break;
1757              case '"' : result[pos++] = '\"'; break;
1758
1759              case 'x':
1760                // hex escape
1761                int code = 0;
1762                if (i + 1 < input.size() && isHex(input.byteAt(i + 1))) {
1763                  ++i;
1764                  code = digitValue(input.byteAt(i));
1765                } else {
1766                  throw new InvalidEscapeSequenceException(
1767                      "Invalid escape sequence: '\\x' with no digits");
1768                }
1769                if (i + 1 < input.size() && isHex(input.byteAt(i + 1))) {
1770                  ++i;
1771                  code = code * 16 + digitValue(input.byteAt(i));
1772                }
1773                result[pos++] = (byte)code;
1774                break;
1775
1776              default:
1777                throw new InvalidEscapeSequenceException(
1778                    "Invalid escape sequence: '\\" + (char)c + '\'');
1779            }
1780          }
1781        } else {
1782          throw new InvalidEscapeSequenceException(
1783              "Invalid escape sequence: '\\' at end of string.");
1784        }
1785      } else {
1786        result[pos++] = c;
1787      }
1788    }
1789
1790    return ByteString.copyFrom(result, 0, pos);
1791  }
1792
1793  /**
1794   * Thrown by {@link TextFormat#unescapeBytes} and
1795   * {@link TextFormat#unescapeText} when an invalid escape sequence is seen.
1796   */
1797  static class InvalidEscapeSequenceException extends IOException {
1798    private static final long serialVersionUID = -8164033650142593304L;
1799
1800    InvalidEscapeSequenceException(final String description) {
1801      super(description);
1802    }
1803  }
1804
1805  /**
1806   * Like {@link #escapeBytes(ByteString)}, but escapes a text string.
1807   * Non-ASCII characters are first encoded as UTF-8, then each byte is escaped
1808   * individually as a 3-digit octal escape.  Yes, it's weird.
1809   */
1810  static String escapeText(final String input) {
1811    return escapeBytes(ByteString.copyFromUtf8(input));
1812  }
1813
1814  /**
1815   * Escape double quotes and backslashes in a String for unicode output of a message.
1816   */
1817  public static String escapeDoubleQuotesAndBackslashes(final String input) {
1818    return input.replace("\\", "\\\\").replace("\"", "\\\"");
1819  }
1820
1821  /**
1822   * Un-escape a text string as escaped using {@link #escapeText(String)}.
1823   * Two-digit hex escapes (starting with "\x") are also recognized.
1824   */
1825  static String unescapeText(final String input)
1826                             throws InvalidEscapeSequenceException {
1827    return unescapeBytes(input).toStringUtf8();
1828  }
1829
1830  /** Is this an octal digit? */
1831  private static boolean isOctal(final byte c) {
1832    return '0' <= c && c <= '7';
1833  }
1834
1835  /** Is this a hex digit? */
1836  private static boolean isHex(final byte c) {
1837    return ('0' <= c && c <= '9') ||
1838           ('a' <= c && c <= 'f') ||
1839           ('A' <= c && c <= 'F');
1840  }
1841
1842  /**
1843   * Interpret a character as a digit (in any base up to 36) and return the
1844   * numeric value.  This is like {@code Character.digit()} but we don't accept
1845   * non-ASCII digits.
1846   */
1847  private static int digitValue(final byte c) {
1848    if ('0' <= c && c <= '9') {
1849      return c - '0';
1850    } else if ('a' <= c && c <= 'z') {
1851      return c - 'a' + 10;
1852    } else {
1853      return c - 'A' + 10;
1854    }
1855  }
1856
1857  /**
1858   * Parse a 32-bit signed integer from the text.  Unlike the Java standard
1859   * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1860   * and "0" to signify hexadecimal and octal numbers, respectively.
1861   */
1862  static int parseInt32(final String text) throws NumberFormatException {
1863    return (int) parseInteger(text, true, false);
1864  }
1865
1866  /**
1867   * Parse a 32-bit unsigned integer from the text.  Unlike the Java standard
1868   * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1869   * and "0" to signify hexadecimal and octal numbers, respectively.  The
1870   * result is coerced to a (signed) {@code int} when returned since Java has
1871   * no unsigned integer type.
1872   */
1873  static int parseUInt32(final String text) throws NumberFormatException {
1874    return (int) parseInteger(text, false, false);
1875  }
1876
1877  /**
1878   * Parse a 64-bit signed integer from the text.  Unlike the Java standard
1879   * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1880   * and "0" to signify hexadecimal and octal numbers, respectively.
1881   */
1882  static long parseInt64(final String text) throws NumberFormatException {
1883    return parseInteger(text, true, true);
1884  }
1885
1886  /**
1887   * Parse a 64-bit unsigned integer from the text.  Unlike the Java standard
1888   * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1889   * and "0" to signify hexadecimal and octal numbers, respectively.  The
1890   * result is coerced to a (signed) {@code long} when returned since Java has
1891   * no unsigned long type.
1892   */
1893  static long parseUInt64(final String text) throws NumberFormatException {
1894    return parseInteger(text, false, true);
1895  }
1896
1897  private static long parseInteger(final String text,
1898                                   final boolean isSigned,
1899                                   final boolean isLong)
1900                                   throws NumberFormatException {
1901    int pos = 0;
1902
1903    boolean negative = false;
1904    if (text.startsWith("-", pos)) {
1905      if (!isSigned) {
1906        throw new NumberFormatException("Number must be positive: " + text);
1907      }
1908      ++pos;
1909      negative = true;
1910    }
1911
1912    int radix = 10;
1913    if (text.startsWith("0x", pos)) {
1914      pos += 2;
1915      radix = 16;
1916    } else if (text.startsWith("0", pos)) {
1917      radix = 8;
1918    }
1919
1920    final String numberText = text.substring(pos);
1921
1922    long result = 0;
1923    if (numberText.length() < 16) {
1924      // Can safely assume no overflow.
1925      result = Long.parseLong(numberText, radix);
1926      if (negative) {
1927        result = -result;
1928      }
1929
1930      // Check bounds.
1931      // No need to check for 64-bit numbers since they'd have to be 16 chars
1932      // or longer to overflow.
1933      if (!isLong) {
1934        if (isSigned) {
1935          if (result > Integer.MAX_VALUE || result < Integer.MIN_VALUE) {
1936            throw new NumberFormatException(
1937              "Number out of range for 32-bit signed integer: " + text);
1938          }
1939        } else {
1940          if (result >= (1L << 32) || result < 0) {
1941            throw new NumberFormatException(
1942              "Number out of range for 32-bit unsigned integer: " + text);
1943          }
1944        }
1945      }
1946    } else {
1947      BigInteger bigValue = new BigInteger(numberText, radix);
1948      if (negative) {
1949        bigValue = bigValue.negate();
1950      }
1951
1952      // Check bounds.
1953      if (!isLong) {
1954        if (isSigned) {
1955          if (bigValue.bitLength() > 31) {
1956            throw new NumberFormatException(
1957              "Number out of range for 32-bit signed integer: " + text);
1958          }
1959        } else {
1960          if (bigValue.bitLength() > 32) {
1961            throw new NumberFormatException(
1962              "Number out of range for 32-bit unsigned integer: " + text);
1963          }
1964        }
1965      } else {
1966        if (isSigned) {
1967          if (bigValue.bitLength() > 63) {
1968            throw new NumberFormatException(
1969              "Number out of range for 64-bit signed integer: " + text);
1970          }
1971        } else {
1972          if (bigValue.bitLength() > 64) {
1973            throw new NumberFormatException(
1974              "Number out of range for 64-bit unsigned integer: " + text);
1975          }
1976        }
1977      }
1978
1979      result = bigValue.longValue();
1980    }
1981
1982    return result;
1983  }
1984}
1985