1# Protocol Buffers - Google's data interchange format
2# Copyright 2008 Google Inc.  All rights reserved.
3# http://code.google.com/p/protobuf/
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met:
8#
9#     * Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11#     * Redistributions in binary form must reproduce the above
12# copyright notice, this list of conditions and the following disclaimer
13# in the documentation and/or other materials provided with the
14# distribution.
15#     * Neither the name of Google Inc. nor the names of its
16# contributors may be used to endorse or promote products derived from
17# this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31"""Contains routines for printing protocol messages in text format."""
32
33__author__ = 'kenton@google.com (Kenton Varda)'
34
35import cStringIO
36import re
37
38from collections import deque
39from google.protobuf.internal import type_checkers
40from google.protobuf import descriptor
41
__all__ = [ 'MessageToString', 'PrintMessage', 'PrintField',
            'PrintFieldValue', 'Merge' ]


# Integer range checkers, indexed as 2 * int(is_long) + int(is_signed)
# (see ParseInteger): [uint32, int32, uint64, int64].
_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Alternative spellings accepted for special float values: optionally signed
# 'inf'/'infinity' and 'nan', optionally with a C-style 'f' suffix,
# case-insensitive (see ParseFloat).
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?', re.IGNORECASE)
_FLOAT_NAN = re.compile('nanf?', re.IGNORECASE)
52
53
class ParseError(Exception):
  """Thrown in case of ASCII parsing error.

  Raised by Merge() and the tokenizer when the text representation of a
  message cannot be parsed.
  """
56
57
def MessageToString(message, as_utf8=False, as_one_line=False):
  """Return the ASCII text-format representation of a message.

  Args:
    message: The protocol buffer message to render.
    as_utf8: If True, leave non-ASCII string bytes unescaped in the output.
    as_one_line: If True, render the whole message on a single line
        (trailing whitespace stripped).

  Returns:
    The text-format string.
  """
  buf = cStringIO.StringIO()
  PrintMessage(message, buf, as_utf8=as_utf8, as_one_line=as_one_line)
  text = buf.getvalue()
  buf.close()
  if as_one_line:
    return text.rstrip()
  return text
66
67
def PrintMessage(message, out, indent=0, as_utf8=False, as_one_line=False):
  """Write every set field of message to the out stream in text format."""
  for field, value in message.ListFields():
    if field.label != descriptor.FieldDescriptor.LABEL_REPEATED:
      PrintField(field, value, out, indent, as_utf8, as_one_line)
    else:
      # Repeated fields print one name/value pair per element.
      for item in value:
        PrintField(field, item, out, indent, as_utf8, as_one_line)
75
76
def PrintField(field, value, out, indent=0, as_utf8=False, as_one_line=False):
  """Print a single field name/value pair to the out stream.

  For repeated fields, value must be a single element, not the whole list.
  """
  out.write(' ' * indent)
  if field.is_extension:
    out.write('[')
    options = field.containing_type.GetOptions()
    # MessageSet extensions are printed by their message type name instead
    # of the extension field's own name.
    if (options.message_set_wire_format and
        field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
        field.message_type == field.extension_scope and
        field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
      out.write(field.message_type.full_name)
    else:
      out.write(field.full_name)
    out.write(']')
  elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
    # Groups print the capitalized type name, not the lowercased field name.
    out.write(field.message_type.name)
  else:
    out.write(field.name)

  if field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    # The colon is optional for message fields, but our cross-language
    # golden files omit it there and require it for scalars.
    out.write(': ')

  PrintFieldValue(field, value, out, indent, as_utf8, as_one_line)
  if as_one_line:
    out.write(' ')
  else:
    out.write('\n')
108
109
def PrintFieldValue(field, value, out, indent=0,
                    as_utf8=False, as_one_line=False):
  """Print a single field value (not including the field name) to out.

  For repeated fields, value must be a single element.
  """
  fd = descriptor.FieldDescriptor
  cpp_type = field.cpp_type
  if cpp_type == fd.CPPTYPE_MESSAGE:
    if as_one_line:
      out.write(' { ')
      PrintMessage(value, out, indent, as_utf8, as_one_line)
      out.write('}')
    else:
      out.write(' {\n')
      PrintMessage(value, out, indent + 2, as_utf8, as_one_line)
      out.write(' ' * indent + '}')
  elif cpp_type == fd.CPPTYPE_ENUM:
    enum_value = field.enum_type.values_by_number.get(value, None)
    if enum_value is None:
      # Unknown enum number: fall back to printing the raw value.
      out.write(str(value))
    else:
      out.write(enum_value.name)
  elif cpp_type == fd.CPPTYPE_STRING:
    # Unicode values are encoded to UTF-8 bytes before escaping.
    if type(value) is unicode:
      escaped = _CEscape(value.encode('utf-8'), as_utf8)
    else:
      escaped = _CEscape(value, as_utf8)
    out.write('"')
    out.write(escaped)
    out.write('"')
  elif cpp_type == fd.CPPTYPE_BOOL:
    if value:
      out.write("true")
    else:
      out.write("false")
  else:
    out.write(str(value))
144
145
def Merge(text, message):
  """Merges an ASCII representation of a protocol message into a message.

  Args:
    text: Message ASCII representation.
    message: A protocol buffer message to merge into.

  Raises:
    ParseError: On ASCII parsing problems.
  """
  tok = _Tokenizer(text)
  # Consume top-level fields until the whole input is exhausted.
  while not tok.AtEnd():
    _MergeField(tok, message)
159
160
def _MergeField(tokenizer, message):
  """Merges a single protocol message field into a message.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
    message: A protocol message to record the data.

  Raises:
    ParseError: In case of ASCII parsing problems.
  """
  message_descriptor = message.DESCRIPTOR
  if tokenizer.TryConsume('['):
    # Extension field, written as '[qualified.extension.name]'.
    name = [tokenizer.ConsumeIdentifier()]
    while tokenizer.TryConsume('.'):
      name.append(tokenizer.ConsumeIdentifier())
    name = '.'.join(name)

    if not message_descriptor.is_extendable:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" does not have extensions.' %
          message_descriptor.full_name)
    # NOTE(review): relies on the internal _FindExtensionByName accessor of
    # the generated Extensions object.
    field = message.Extensions._FindExtensionByName(name)
    if not field:
      raise tokenizer.ParseErrorPreviousToken(
          'Extension "%s" not registered.' % name)
    elif message_descriptor != field.containing_type:
      raise tokenizer.ParseErrorPreviousToken(
          'Extension "%s" does not extend message type "%s".' % (
              name, message_descriptor.full_name))
    tokenizer.Consume(']')
  else:
    name = tokenizer.ConsumeIdentifier()
    field = message_descriptor.fields_by_name.get(name, None)

    # Group names are expected to be capitalized as they appear in the
    # .proto file, which actually matches their type names, not their field
    # names.
    if not field:
      # Retry with the lowercased name, but only accept it for groups.
      field = message_descriptor.fields_by_name.get(name.lower(), None)
      if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
        field = None

    # A group must be spelled exactly as its capitalized type name.
    if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
        field.message_type.name != name):
      field = None

    if not field:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" has no field named "%s".' % (
              message_descriptor.full_name, name))

  if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    # Composite (message or group) value: the colon is optional and the
    # body is delimited by either '<...>' or '{...}'.
    tokenizer.TryConsume(':')

    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      # Each occurrence of a repeated message field appends a new element.
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        sub_message = message.Extensions[field]
      else:
        sub_message = getattr(message, field.name)
      # Ensure the singular message is marked present even if it stays empty.
      sub_message.SetInParent()

    # Recursively merge nested fields until the matching close delimiter.
    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token))
      _MergeField(tokenizer, sub_message)
  else:
    _MergeScalarField(tokenizer, message, field)
239
240
def _MergeScalarField(tokenizer, message, field):
  """Merges a single scalar field value into a message.

  Args:
    tokenizer: A tokenizer positioned at the field's value.
    message: A protocol message to record the data.
    field: The descriptor of the field to be merged.

  Raises:
    ParseError: In case of ASCII parsing problems.
    RuntimeError: If the field type is not a known scalar type.
  """
  tokenizer.Consume(':')
  fd = descriptor.FieldDescriptor
  ftype = field.type

  # Dispatch on the declared field type to the matching tokenizer consumer.
  if ftype in (fd.TYPE_INT32, fd.TYPE_SINT32, fd.TYPE_SFIXED32):
    value = tokenizer.ConsumeInt32()
  elif ftype in (fd.TYPE_INT64, fd.TYPE_SINT64, fd.TYPE_SFIXED64):
    value = tokenizer.ConsumeInt64()
  elif ftype in (fd.TYPE_UINT32, fd.TYPE_FIXED32):
    value = tokenizer.ConsumeUint32()
  elif ftype in (fd.TYPE_UINT64, fd.TYPE_FIXED64):
    value = tokenizer.ConsumeUint64()
  elif ftype in (fd.TYPE_FLOAT, fd.TYPE_DOUBLE):
    value = tokenizer.ConsumeFloat()
  elif ftype == fd.TYPE_BOOL:
    value = tokenizer.ConsumeBool()
  elif ftype == fd.TYPE_STRING:
    value = tokenizer.ConsumeString()
  elif ftype == fd.TYPE_BYTES:
    value = tokenizer.ConsumeByteString()
  elif ftype == fd.TYPE_ENUM:
    value = tokenizer.ConsumeEnum(field)
  else:
    raise RuntimeError('Unknown field type %d' % field.type)

  # Repeated fields append each parsed value; singular fields assign it.
  if field.label == fd.LABEL_REPEATED:
    if field.is_extension:
      message.Extensions[field].append(value)
    else:
      getattr(message, field.name).append(value)
  elif field.is_extension:
    message.Extensions[field] = value
  else:
    setattr(message, field.name, value)
294
295
class _Tokenizer(object):
  """Protocol buffer ASCII representation tokenizer.

  This class handles the lower level string parsing by splitting it into
  meaningful tokens.

  It was directly ported from the Java protocol buffer API.
  """

  # Whitespace and '#' line comments are skipped between tokens.
  _WHITESPACE = re.compile('(\\s|(#.*$))+', re.MULTILINE)
  _TOKEN = re.compile(
      '[a-zA-Z_][0-9a-zA-Z_+-]*|'           # an identifier
      '[0-9+-][0-9a-zA-Z_.+-]*|'            # a number
      '\"([^\"\n\\\\]|\\\\.)*(\"|\\\\?$)|'  # a double-quoted string
      '\'([^\'\n\\\\]|\\\\.)*(\'|\\\\?$)')  # a single-quoted string
  _IDENTIFIER = re.compile('\w+')

  def __init__(self, text_message):
    self._text_message = text_message

    self._position = 0
    self._line = -1            # index of the current line; -1 until _PopLine
    self._column = 0           # column of the current token within the line
    self._token_start = None
    self.token = ''            # current token text; '' means end of input
    self._lines = deque(text_message.split('\n'))
    self._current_line = ''
    self._previous_line = 0    # line/column of the previously read token,
    self._previous_column = 0  # used by ParseErrorPreviousToken
    self._SkipWhitespace()
    self.NextToken()

  def AtEnd(self):
    """Checks the end of the text was reached.

    Returns:
      True iff the end was reached.
    """
    return self.token == ''

  def _PopLine(self):
    # Advance to the next non-exhausted line, resetting the column; leaves
    # _current_line empty when the input is fully consumed.
    while len(self._current_line) <= self._column:
      if not self._lines:
        self._current_line = ''
        return
      self._line += 1
      self._column = 0
      self._current_line = self._lines.popleft()

  def _SkipWhitespace(self):
    # Repeatedly skip whitespace and '#' comments, popping lines as needed.
    while True:
      self._PopLine()
      match = self._WHITESPACE.match(self._current_line, self._column)
      if not match:
        break
      length = len(match.group(0))
      self._column += length

  def TryConsume(self, token):
    """Tries to consume a given piece of text.

    Args:
      token: Text to consume.

    Returns:
      True iff the text was consumed.
    """
    if self.token == token:
      self.NextToken()
      return True
    return False

  def Consume(self, token):
    """Consumes a piece of text.

    Args:
      token: Text to consume.

    Raises:
      ParseError: If the text couldn't be consumed.
    """
    if not self.TryConsume(token):
      raise self._ParseError('Expected "%s".' % token)

  def ConsumeIdentifier(self):
    """Consumes protocol message field identifier.

    Returns:
      Identifier string.

    Raises:
      ParseError: If an identifier couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER.match(result):
      raise self._ParseError('Expected identifier.')
    self.NextToken()
    return result

  def ConsumeInt32(self):
    """Consumes a signed 32bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If a signed 32bit integer couldn't be consumed.
    """
    try:
      result = ParseInteger(self.token, is_signed=True, is_long=False)
    except ValueError, e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeUint32(self):
    """Consumes an unsigned 32bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If an unsigned 32bit integer couldn't be consumed.
    """
    try:
      result = ParseInteger(self.token, is_signed=False, is_long=False)
    except ValueError, e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeInt64(self):
    """Consumes a signed 64bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If a signed 64bit integer couldn't be consumed.
    """
    try:
      result = ParseInteger(self.token, is_signed=True, is_long=True)
    except ValueError, e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeUint64(self):
    """Consumes an unsigned 64bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If an unsigned 64bit integer couldn't be consumed.
    """
    try:
      result = ParseInteger(self.token, is_signed=False, is_long=True)
    except ValueError, e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeFloat(self):
    """Consumes a floating point number.

    Returns:
      The number parsed.

    Raises:
      ParseError: If a floating point number couldn't be consumed.
    """
    try:
      result = ParseFloat(self.token)
    except ValueError, e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeBool(self):
    """Consumes a boolean value.

    Returns:
      The bool parsed.

    Raises:
      ParseError: If a boolean value couldn't be consumed.
    """
    try:
      result = ParseBool(self.token)
    except ValueError, e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeString(self):
    """Consumes a string value.

    Returns:
      The string parsed (as a unicode object decoded from UTF-8).

    Raises:
      ParseError: If a string value couldn't be consumed.
    """
    # NOTE(review): 'bytes' shadows the builtin name of the same spelling.
    bytes = self.ConsumeByteString()
    try:
      return unicode(bytes, 'utf-8')
    except UnicodeDecodeError, e:
      raise self._StringParseError(e)

  def ConsumeByteString(self):
    """Consumes a byte array value.

    Returns:
      The array parsed (as a string).

    Raises:
      ParseError: If a byte array value couldn't be consumed.
    """
    # Adjacent string literals are concatenated, as in C and Python.
    # NOTE(review): 'list' shadows the builtin name of the same spelling.
    list = [self._ConsumeSingleByteString()]
    while len(self.token) > 0 and self.token[0] in ('\'', '"'):
      list.append(self._ConsumeSingleByteString())
    return "".join(list)

  def _ConsumeSingleByteString(self):
    """Consume one token of a string literal.

    String literals (whether bytes or text) can come in multiple adjacent
    tokens which are automatically concatenated, like in C or Python.  This
    method only consumes one token.
    """
    text = self.token
    if len(text) < 1 or text[0] not in ('\'', '"'):
      raise self._ParseError('Expected string.')

    if len(text) < 2 or text[-1] != text[0]:
      raise self._ParseError('String missing ending quote.')

    try:
      # Unescape the literal body (without the surrounding quotes).
      result = _CUnescape(text[1:-1])
    except ValueError, e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeEnum(self, field):
    """Consumes an enum value (either a name or a number) for field.

    Args:
      field: The descriptor of the enum field being parsed.

    Returns:
      The enum value's number.

    Raises:
      ParseError: If the current token is not a valid value of the enum.
    """
    try:
      result = ParseEnum(field, self.token)
    except ValueError, e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ParseErrorPreviousToken(self, message):
    """Creates and *returns* a ParseError for the previously read token.

    Args:
      message: A message to set for the exception.

    Returns:
      A ParseError instance.
    """
    # Positions are reported 1-based.
    return ParseError('%d:%d : %s' % (
        self._previous_line + 1, self._previous_column + 1, message))

  def _ParseError(self, message):
    """Creates and *returns* a ParseError for the current token."""
    return ParseError('%d:%d : %s' % (
        self._line + 1, self._column + 1, message))

  def _StringParseError(self, e):
    # Wraps a UnicodeDecodeError in a positioned ParseError.
    return self._ParseError('Couldn\'t parse string: ' + str(e))

  def NextToken(self):
    """Reads the next meaningful token."""
    self._previous_line = self._line
    self._previous_column = self._column

    # Step past the token just consumed, then past whitespace and comments.
    self._column += len(self.token)
    self._SkipWhitespace()

    # End of input: no lines left and the current line is exhausted.
    if not self._lines and len(self._current_line) <= self._column:
      self.token = ''
      return

    match = self._TOKEN.match(self._current_line, self._column)
    if match:
      token = match.group(0)
      self.token = token
    else:
      # No token pattern matched: the single character itself is the token
      # (punctuation such as ':', '{', '[', '<').
      self.token = self._current_line[self._column]
587
588
589# text.encode('string_escape') does not seem to satisfy our needs as it
590# encodes unprintable characters using two-digit hex escapes whereas our
591# C++ unescaping function allows hex escapes to be any length.  So,
592# "\0011".encode('string_escape') ends up being "\\x011", which will be
593# decoded in C++ as a single-character string with char code 0x11.
594def _CEscape(text, as_utf8):
595  def escape(c):
596    o = ord(c)
597    if o == 10: return r"\n"   # optional escape
598    if o == 13: return r"\r"   # optional escape
599    if o ==  9: return r"\t"   # optional escape
600    if o == 39: return r"\'"   # optional escape
601
602    if o == 34: return r'\"'   # necessary escape
603    if o == 92: return r"\\"   # necessary escape
604
605    # necessary escapes
606    if not as_utf8 and (o >= 127 or o < 32): return "\\%03o" % o
607    return c
608  return "".join([escape(c) for c in text])
609
610
# Matches a run of backslashes followed by 'x' and exactly one hex digit
# (i.e. a single-digit hex escape such as '\xF'); see _CUnescape.
_CUNESCAPE_HEX = re.compile(r'(\\+)x([0-9a-fA-F])(?![0-9a-fA-F])')
612
613
def _CUnescape(text):
  """Unescape the body of a text-format string literal into raw bytes.

  The 'string_escape' codec only understands two-digit hex escapes, while
  C-style unescaping (and the text format) also allows single-digit ones
  such as '\\xf'; those are padded with a leading zero before decoding.
  """
  def _PadHex(m):
    slashes = m.group(1)
    # Pad only when the backslash run has odd length, i.e. the 'x' escape
    # itself is not escaped.
    if len(slashes) & 1:
      return slashes + 'x0' + m.group(2)
    return m.group(0)

  return _CUNESCAPE_HEX.sub(_PadHex, text).decode('string_escape')
626
627
def ParseInteger(text, is_signed=False, is_long=False):
  """Parses an integer token.

  Args:
    text: The text to parse.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a 64-bit (rather than 32-bit) integer must be parsed.

  Returns:
    The integer value.

  Raises:
    ValueError: If the text is not a valid integer or is out of range.
  """
  try:
    # Base 0 accepts decimal, hex ('0x...') and octal ('0...') spellings.
    result = int(text, 0)
  except ValueError:
    raise ValueError('Couldn\'t parse integer: %s' % text)

  # Range-check against the checker matching the requested width and
  # signedness; the checker raises on out-of-range values.
  _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)].CheckValue(result)
  return result
652
653
def ParseFloat(text):
  """Parse a floating point number.

  Accepts Python float syntax plus the alternative spellings used by other
  protobuf implementations: 'inf'/'infinity'/'nan' (case-insensitive,
  optionally signed) and a trailing C-style 'f' suffix as in '1.0f'.

  Args:
    text: Text to parse.

  Returns:
    The number parsed.

  Raises:
    ValueError: If a floating point number couldn't be parsed.
  """
  try:
    # Fast path: Python-compatible syntax.
    return float(text)
  except ValueError:
    pass

  # Alternative spellings for the special values.
  if _FLOAT_INFINITY.match(text):
    if text[0] == '-':
      return float('-inf')
    return float('inf')
  if _FLOAT_NAN.match(text):
    return float('nan')

  # Last resort: assume the '1.0f' suffixed format.
  try:
    return float(text.rstrip('f'))
  except ValueError:
    raise ValueError('Couldn\'t parse float: %s' % text)
684
685
def ParseBool(text):
  """Parse a boolean value.

  Args:
    text: Text to parse; one of 'true'/'t'/'1' or 'false'/'f'/'0'.

  Returns:
    The boolean value parsed.

  Raises:
    ValueError: If text is not a valid boolean.
  """
  true_spellings = ('true', 't', '1')
  false_spellings = ('false', 'f', '0')
  if text in true_spellings:
    return True
  if text in false_spellings:
    return False
  raise ValueError('Expected "true" or "false".')
704
705
def ParseEnum(field, value):
  """Parse an enum value.

  The value can be specified by a number (the enum value), or by
  a string literal (the enum name).

  Args:
    field: Enum field descriptor.
    value: String value.

  Returns:
    Enum value number.

  Raises:
    ValueError: If the enum value could not be parsed.
  """
  enum_descriptor = field.enum_type
  try:
    number = int(value, 0)
  except ValueError:
    # Not numeric: look the value up by name.
    enum_value = enum_descriptor.values_by_name.get(value, None)
    if enum_value is None:
      raise ValueError(
          'Enum type "%s" has no value named %s.' % (
              enum_descriptor.full_name, value))
    return enum_value.number

  # Numeric: look the value up by number.
  enum_value = enum_descriptor.values_by_number.get(number, None)
  if enum_value is None:
    raise ValueError(
        'Enum type "%s" has no value with number %d.' % (
            enum_descriptor.full_name, number))
  return enum_value.number
740