1# Protocol Buffers - Google's data interchange format
2# Copyright 2008 Google Inc.  All rights reserved.
3# http://code.google.com/p/protobuf/
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met:
8#
9#     * Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11#     * Redistributions in binary form must reproduce the above
12# copyright notice, this list of conditions and the following disclaimer
13# in the documentation and/or other materials provided with the
14# distribution.
15#     * Neither the name of Google Inc. nor the names of its
16# contributors may be used to endorse or promote products derived from
17# this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31"""Contains routines for printing protocol messages in text format."""
32
33__author__ = 'kenton@google.com (Kenton Varda)'
34
35import cStringIO
36import re
37
38from collections import deque
39from google.protobuf.internal import type_checkers
40from google.protobuf import descriptor
41
42__all__ = [ 'MessageToString', 'PrintMessage', 'PrintField',
43            'PrintFieldValue', 'Merge' ]
44
45
# Infinity and NaN are not explicitly supported by Python pre-2.6, and
# float('inf') does not work on Windows (pre-2.6).
_INFINITY = 1e10000    # overflows, thus will actually be infinity.
_NAN = _INFINITY * 0   # inf * 0 yields NaN under IEEE 754.
50
51
class ParseError(Exception):
  """Raised when the ASCII (text-format) representation cannot be parsed."""
54
55
def MessageToString(message):
  """Returns the ASCII text-format representation of a protocol message.

  Args:
    message: The protocol buffer message to render.

  Returns:
    A string containing the text-format representation of the message.
  """
  out = cStringIO.StringIO()
  # Close the buffer even if PrintMessage raises, so we never leak it.
  try:
    PrintMessage(message, out)
    return out.getvalue()
  finally:
    out.close()
62
63
def PrintMessage(message, out, indent = 0):
  """Writes every set field of |message| to |out|, one per line.

  Repeated fields are emitted as one name/value line per element.
  """
  for field, value in message.ListFields():
    if field.label != descriptor.FieldDescriptor.LABEL_REPEATED:
      PrintField(field, value, out, indent)
    else:
      for item in value:
        PrintField(field, item, out, indent)
71
72
def PrintField(field, value, out, indent = 0):
  """Prints a single field name/value pair to |out|.

  For repeated fields, |value| should be a single element, not the whole
  container.

  Args:
    field: The FieldDescriptor of the field being printed.
    value: The field's value (one element, for repeated fields).
    out: A file-like object to write to.
    indent: Number of spaces of leading indentation.
  """
  out.write(' ' * indent)
  if field.is_extension:
    out.write('[')
    # Message-set extensions print the contained message type's full name
    # instead of the extension's own full name.
    if (field.containing_type.GetOptions().message_set_wire_format and
        field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
        field.message_type == field.extension_scope and
        field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
      out.write(field.message_type.full_name)
    else:
      out.write(field.full_name)
    out.write(']')
  elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
    # For groups, use the capitalized name.
    out.write(field.message_type.name)
  else:
    out.write(field.name)

  if field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    # The colon is optional in this case, but our cross-language golden files
    # don't include it.
    out.write(': ')

  PrintFieldValue(field, value, out, indent)
  out.write('\n')
101
102
def PrintFieldValue(field, value, out, indent = 0):
  """Writes the text-format rendering of one field value (no name) to |out|.

  For repeated fields, |value| must be a single element, not the container.
  """
  fd = descriptor.FieldDescriptor
  cpp_type = field.cpp_type
  if cpp_type == fd.CPPTYPE_MESSAGE:
    # Nested messages are wrapped in braces and indented two extra spaces.
    out.write(' {\n')
    PrintMessage(value, out, indent + 2)
    out.write(' ' * indent + '}')
  elif cpp_type == fd.CPPTYPE_ENUM:
    out.write(field.enum_type.values_by_number[value].name)
  elif cpp_type == fd.CPPTYPE_STRING:
    out.write('"')
    out.write(_CEscape(value))
    out.write('"')
  elif cpp_type == fd.CPPTYPE_BOOL:
    if not value:
      out.write("false")
    else:
      out.write("true")
  else:
    out.write(str(value))
124
125
def Merge(text, message):
  """Parses an ASCII (text-format) representation and merges it into a message.

  Args:
    text: Message ASCII representation.
    message: A protocol buffer message to merge into.

  Raises:
    ParseError: On ASCII parsing problems.
  """
  tok = _Tokenizer(text)
  # Each iteration consumes exactly one top-level field.
  while not tok.AtEnd():
    _MergeField(tok, message)
139
140
def _MergeField(tokenizer, message):
  """Merges a single protocol message field into a message.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
    message: A protocol message to record the data.

  Raises:
    ParseError: In case of ASCII parsing problems.
  """
  message_descriptor = message.DESCRIPTOR
  # A leading '[' introduces an extension field, written as
  # [qualified.extension.name].
  if tokenizer.TryConsume('['):
    # Consume the dotted extension name, one identifier at a time.
    name = [tokenizer.ConsumeIdentifier()]
    while tokenizer.TryConsume('.'):
      name.append(tokenizer.ConsumeIdentifier())
    name = '.'.join(name)

    if not message_descriptor.is_extendable:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" does not have extensions.' %
          message_descriptor.full_name)
    # NOTE: relies on the internal _FindExtensionByName lookup of the
    # Extensions collection.
    field = message.Extensions._FindExtensionByName(name)
    if not field:
      raise tokenizer.ParseErrorPreviousToken(
          'Extension "%s" not registered.' % name)
    elif message_descriptor != field.containing_type:
      raise tokenizer.ParseErrorPreviousToken(
          'Extension "%s" does not extend message type "%s".' % (
              name, message_descriptor.full_name))
    tokenizer.Consume(']')
  else:
    name = tokenizer.ConsumeIdentifier()
    field = message_descriptor.fields_by_name.get(name, None)

    # Group names are expected to be capitalized as they appear in the
    # .proto file, which actually matches their type names, not their field
    # names.
    if not field:
      # Fall back to the lowercased name, but only accept it for groups.
      field = message_descriptor.fields_by_name.get(name.lower(), None)
      if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
        field = None

    # A group must be spelled exactly as its (capitalized) type name.
    if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
        field.message_type.name != name):
      field = None

    if not field:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" has no field named "%s".' % (
              message_descriptor.full_name, name))

  if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    # Sub-messages allow an optional ':' and are delimited by either
    # '<' ... '>' or '{' ... '}'.
    tokenizer.TryConsume(':')

    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    # Repeated fields get a fresh element; singular fields are merged into.
    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        sub_message = message.Extensions[field]
      else:
        sub_message = getattr(message, field.name)
        # Mark the singular sub-message as present even if it stays empty.
        sub_message.SetInParent()

    # Recurse until the matching close delimiter is consumed.
    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token))
      _MergeField(tokenizer, sub_message)
  else:
    _MergeScalarField(tokenizer, message, field)
219
220
def _MergeScalarField(tokenizer, message, field):
  """Merges a single protocol message scalar field into a message.

  Args:
    tokenizer: A tokenizer to parse the field value.
    message: A protocol message to record the data.
    field: The descriptor of the field to be merged.

  Raises:
    ParseError: In case of ASCII parsing problems.
    RuntimeError: On runtime errors.
  """
  tokenizer.Consume(':')
  fd = descriptor.FieldDescriptor
  ftype = field.type

  # Pick the token consumer matching the declared field type.
  if ftype in (fd.TYPE_INT32, fd.TYPE_SINT32, fd.TYPE_SFIXED32):
    value = tokenizer.ConsumeInt32()
  elif ftype in (fd.TYPE_INT64, fd.TYPE_SINT64, fd.TYPE_SFIXED64):
    value = tokenizer.ConsumeInt64()
  elif ftype in (fd.TYPE_UINT32, fd.TYPE_FIXED32):
    value = tokenizer.ConsumeUint32()
  elif ftype in (fd.TYPE_UINT64, fd.TYPE_FIXED64):
    value = tokenizer.ConsumeUint64()
  elif ftype in (fd.TYPE_FLOAT, fd.TYPE_DOUBLE):
    value = tokenizer.ConsumeFloat()
  elif ftype == fd.TYPE_BOOL:
    value = tokenizer.ConsumeBool()
  elif ftype == fd.TYPE_STRING:
    value = tokenizer.ConsumeString()
  elif ftype == fd.TYPE_BYTES:
    value = tokenizer.ConsumeByteString()
  elif ftype == fd.TYPE_ENUM:
    # Enum can be specified by a number (the enum value), or by
    # a string literal (the enum name).
    enum_descriptor = field.enum_type
    if tokenizer.LookingAtInteger():
      number = tokenizer.ConsumeInt32()
      enum_value = enum_descriptor.values_by_number.get(number, None)
      if enum_value is None:
        raise tokenizer.ParseErrorPreviousToken(
            'Enum type "%s" has no value with number %d.' % (
                enum_descriptor.full_name, number))
    else:
      identifier = tokenizer.ConsumeIdentifier()
      enum_value = enum_descriptor.values_by_name.get(identifier, None)
      if enum_value is None:
        raise tokenizer.ParseErrorPreviousToken(
            'Enum type "%s" has no value named %s.' % (
                enum_descriptor.full_name, identifier))
    value = enum_value.number
  else:
    raise RuntimeError('Unknown field type %d' % ftype)

  # Repeated fields accumulate; singular fields are assigned.
  if field.label == fd.LABEL_REPEATED:
    if field.is_extension:
      message.Extensions[field].append(value)
    else:
      getattr(message, field.name).append(value)
  elif field.is_extension:
    message.Extensions[field] = value
  else:
    setattr(message, field.name, value)
291
292
293class _Tokenizer(object):
294  """Protocol buffer ASCII representation tokenizer.
295
296  This class handles the lower level string parsing by splitting it into
297  meaningful tokens.
298
299  It was directly ported from the Java protocol buffer API.
300  """
301
302  _WHITESPACE = re.compile('(\\s|(#.*$))+', re.MULTILINE)
303  _TOKEN = re.compile(
304      '[a-zA-Z_][0-9a-zA-Z_+-]*|'           # an identifier
305      '[0-9+-][0-9a-zA-Z_.+-]*|'            # a number
306      '\"([^\"\n\\\\]|\\\\.)*(\"|\\\\?$)|'  # a double-quoted string
307      '\'([^\'\n\\\\]|\\\\.)*(\'|\\\\?$)')  # a single-quoted string
308  _IDENTIFIER = re.compile('\w+')
309  _INTEGER_CHECKERS = [type_checkers.Uint32ValueChecker(),
310                       type_checkers.Int32ValueChecker(),
311                       type_checkers.Uint64ValueChecker(),
312                       type_checkers.Int64ValueChecker()]
313  _FLOAT_INFINITY = re.compile('-?inf(inity)?f?', re.IGNORECASE)
314  _FLOAT_NAN = re.compile("nanf?", re.IGNORECASE)
315
316  def __init__(self, text_message):
317    self._text_message = text_message
318
319    self._position = 0
320    self._line = -1
321    self._column = 0
322    self._token_start = None
323    self.token = ''
324    self._lines = deque(text_message.split('\n'))
325    self._current_line = ''
326    self._previous_line = 0
327    self._previous_column = 0
328    self._SkipWhitespace()
329    self.NextToken()
330
331  def AtEnd(self):
332    """Checks the end of the text was reached.
333
334    Returns:
335      True iff the end was reached.
336    """
337    return not self._lines and not self._current_line
338
339  def _PopLine(self):
340    while not self._current_line:
341      if not self._lines:
342        self._current_line = ''
343        return
344      self._line += 1
345      self._column = 0
346      self._current_line = self._lines.popleft()
347
348  def _SkipWhitespace(self):
349    while True:
350      self._PopLine()
351      match = re.match(self._WHITESPACE, self._current_line)
352      if not match:
353        break
354      length = len(match.group(0))
355      self._current_line = self._current_line[length:]
356      self._column += length
357
358  def TryConsume(self, token):
359    """Tries to consume a given piece of text.
360
361    Args:
362      token: Text to consume.
363
364    Returns:
365      True iff the text was consumed.
366    """
367    if self.token == token:
368      self.NextToken()
369      return True
370    return False
371
372  def Consume(self, token):
373    """Consumes a piece of text.
374
375    Args:
376      token: Text to consume.
377
378    Raises:
379      ParseError: If the text couldn't be consumed.
380    """
381    if not self.TryConsume(token):
382      raise self._ParseError('Expected "%s".' % token)
383
384  def LookingAtInteger(self):
385    """Checks if the current token is an integer.
386
387    Returns:
388      True iff the current token is an integer.
389    """
390    if not self.token:
391      return False
392    c = self.token[0]
393    return (c >= '0' and c <= '9') or c == '-' or c == '+'
394
395  def ConsumeIdentifier(self):
396    """Consumes protocol message field identifier.
397
398    Returns:
399      Identifier string.
400
401    Raises:
402      ParseError: If an identifier couldn't be consumed.
403    """
404    result = self.token
405    if not re.match(self._IDENTIFIER, result):
406      raise self._ParseError('Expected identifier.')
407    self.NextToken()
408    return result
409
410  def ConsumeInt32(self):
411    """Consumes a signed 32bit integer number.
412
413    Returns:
414      The integer parsed.
415
416    Raises:
417      ParseError: If a signed 32bit integer couldn't be consumed.
418    """
419    try:
420      result = self._ParseInteger(self.token, is_signed=True, is_long=False)
421    except ValueError, e:
422      raise self._IntegerParseError(e)
423    self.NextToken()
424    return result
425
426  def ConsumeUint32(self):
427    """Consumes an unsigned 32bit integer number.
428
429    Returns:
430      The integer parsed.
431
432    Raises:
433      ParseError: If an unsigned 32bit integer couldn't be consumed.
434    """
435    try:
436      result = self._ParseInteger(self.token, is_signed=False, is_long=False)
437    except ValueError, e:
438      raise self._IntegerParseError(e)
439    self.NextToken()
440    return result
441
442  def ConsumeInt64(self):
443    """Consumes a signed 64bit integer number.
444
445    Returns:
446      The integer parsed.
447
448    Raises:
449      ParseError: If a signed 64bit integer couldn't be consumed.
450    """
451    try:
452      result = self._ParseInteger(self.token, is_signed=True, is_long=True)
453    except ValueError, e:
454      raise self._IntegerParseError(e)
455    self.NextToken()
456    return result
457
458  def ConsumeUint64(self):
459    """Consumes an unsigned 64bit integer number.
460
461    Returns:
462      The integer parsed.
463
464    Raises:
465      ParseError: If an unsigned 64bit integer couldn't be consumed.
466    """
467    try:
468      result = self._ParseInteger(self.token, is_signed=False, is_long=True)
469    except ValueError, e:
470      raise self._IntegerParseError(e)
471    self.NextToken()
472    return result
473
474  def ConsumeFloat(self):
475    """Consumes an floating point number.
476
477    Returns:
478      The number parsed.
479
480    Raises:
481      ParseError: If a floating point number couldn't be consumed.
482    """
483    text = self.token
484    if re.match(self._FLOAT_INFINITY, text):
485      self.NextToken()
486      if text.startswith('-'):
487        return -_INFINITY
488      return _INFINITY
489
490    if re.match(self._FLOAT_NAN, text):
491      self.NextToken()
492      return _NAN
493
494    try:
495      result = float(text)
496    except ValueError, e:
497      raise self._FloatParseError(e)
498    self.NextToken()
499    return result
500
501  def ConsumeBool(self):
502    """Consumes a boolean value.
503
504    Returns:
505      The bool parsed.
506
507    Raises:
508      ParseError: If a boolean value couldn't be consumed.
509    """
510    if self.token == 'true':
511      self.NextToken()
512      return True
513    elif self.token == 'false':
514      self.NextToken()
515      return False
516    else:
517      raise self._ParseError('Expected "true" or "false".')
518
519  def ConsumeString(self):
520    """Consumes a string value.
521
522    Returns:
523      The string parsed.
524
525    Raises:
526      ParseError: If a string value couldn't be consumed.
527    """
528    return unicode(self.ConsumeByteString(), 'utf-8')
529
530  def ConsumeByteString(self):
531    """Consumes a byte array value.
532
533    Returns:
534      The array parsed (as a string).
535
536    Raises:
537      ParseError: If a byte array value couldn't be consumed.
538    """
539    list = [self._ConsumeSingleByteString()]
540    while len(self.token) > 0 and self.token[0] in ('\'', '"'):
541      list.append(self._ConsumeSingleByteString())
542    return "".join(list)
543
544  def _ConsumeSingleByteString(self):
545    """Consume one token of a string literal.
546
547    String literals (whether bytes or text) can come in multiple adjacent
548    tokens which are automatically concatenated, like in C or Python.  This
549    method only consumes one token.
550    """
551    text = self.token
552    if len(text) < 1 or text[0] not in ('\'', '"'):
553      raise self._ParseError('Exptected string.')
554
555    if len(text) < 2 or text[-1] != text[0]:
556      raise self._ParseError('String missing ending quote.')
557
558    try:
559      result = _CUnescape(text[1:-1])
560    except ValueError, e:
561      raise self._ParseError(str(e))
562    self.NextToken()
563    return result
564
565  def _ParseInteger(self, text, is_signed=False, is_long=False):
566    """Parses an integer.
567
568    Args:
569      text: The text to parse.
570      is_signed: True if a signed integer must be parsed.
571      is_long: True if a long integer must be parsed.
572
573    Returns:
574      The integer value.
575
576    Raises:
577      ValueError: Thrown Iff the text is not a valid integer.
578    """
579    pos = 0
580    if text.startswith('-'):
581      pos += 1
582
583    base = 10
584    if text.startswith('0x', pos) or text.startswith('0X', pos):
585      base = 16
586    elif text.startswith('0', pos):
587      base = 8
588
589    # Do the actual parsing. Exception handling is propagated to caller.
590    result = int(text, base)
591
592    # Check if the integer is sane. Exceptions handled by callers.
593    checker = self._INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
594    checker.CheckValue(result)
595    return result
596
597  def ParseErrorPreviousToken(self, message):
598    """Creates and *returns* a ParseError for the previously read token.
599
600    Args:
601      message: A message to set for the exception.
602
603    Returns:
604      A ParseError instance.
605    """
606    return ParseError('%d:%d : %s' % (
607        self._previous_line + 1, self._previous_column + 1, message))
608
609  def _ParseError(self, message):
610    """Creates and *returns* a ParseError for the current token."""
611    return ParseError('%d:%d : %s' % (
612        self._line + 1, self._column + 1, message))
613
614  def _IntegerParseError(self, e):
615    return self._ParseError('Couldn\'t parse integer: ' + str(e))
616
617  def _FloatParseError(self, e):
618    return self._ParseError('Couldn\'t parse number: ' + str(e))
619
620  def NextToken(self):
621    """Reads the next meaningful token."""
622    self._previous_line = self._line
623    self._previous_column = self._column
624    if self.AtEnd():
625      self.token = ''
626      return
627    self._column += len(self.token)
628
629    # Make sure there is data to work on.
630    self._PopLine()
631
632    match = re.match(self._TOKEN, self._current_line)
633    if match:
634      token = match.group(0)
635      self._current_line = self._current_line[len(token):]
636      self.token = token
637    else:
638      self.token = self._current_line[0]
639      self._current_line = self._current_line[1:]
640    self._SkipWhitespace()
641
642
643# text.encode('string_escape') does not seem to satisfy our needs as it
644# encodes unprintable characters using two-digit hex escapes whereas our
645# C++ unescaping function allows hex escapes to be any length.  So,
646# "\0011".encode('string_escape') ends up being "\\x011", which will be
647# decoded in C++ as a single-character string with char code 0x11.
648def _CEscape(text):
649  def escape(c):
650    o = ord(c)
651    if o == 10: return r"\n"   # optional escape
652    if o == 13: return r"\r"   # optional escape
653    if o ==  9: return r"\t"   # optional escape
654    if o == 39: return r"\'"   # optional escape
655
656    if o == 34: return r'\"'   # necessary escape
657    if o == 92: return r"\\"   # necessary escape
658
659    if o >= 127 or o < 32: return "\\%03o" % o # necessary escapes
660    return c
661  return "".join([escape(c) for c in text])
662
663
664_CUNESCAPE_HEX = re.compile('\\\\x([0-9a-fA-F]{2}|[0-9a-f-A-F])')
665
666
667def _CUnescape(text):
668  def ReplaceHex(m):
669    return chr(int(m.group(0)[2:], 16))
670  # This is required because the 'string_escape' encoding doesn't
671  # allow single-digit hex escapes (like '\xf').
672  result = _CUNESCAPE_HEX.sub(ReplaceHex, text)
673  return result.decode('string_escape')
674