# Protocol Buffers - Google's data interchange format
# Copyright 2008 Google Inc. All rights reserved.
# http://code.google.com/p/protobuf/
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 31"""Contains routines for printing protocol messages in text format.""" 32 33__author__ = 'kenton@google.com (Kenton Varda)' 34 35import cStringIO 36import re 37 38from collections import deque 39from google.protobuf.internal import type_checkers 40from google.protobuf import descriptor 41 42__all__ = [ 'MessageToString', 'PrintMessage', 'PrintField', 43 'PrintFieldValue', 'Merge' ] 44 45 46_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(), 47 type_checkers.Int32ValueChecker(), 48 type_checkers.Uint64ValueChecker(), 49 type_checkers.Int64ValueChecker()) 50_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?', re.IGNORECASE) 51_FLOAT_NAN = re.compile('nanf?', re.IGNORECASE) 52 53 54class ParseError(Exception): 55 """Thrown in case of ASCII parsing error.""" 56 57 58def MessageToString(message, as_utf8=False, as_one_line=False): 59 out = cStringIO.StringIO() 60 PrintMessage(message, out, as_utf8=as_utf8, as_one_line=as_one_line) 61 result = out.getvalue() 62 out.close() 63 if as_one_line: 64 return result.rstrip() 65 return result 66 67 68def PrintMessage(message, out, indent=0, as_utf8=False, as_one_line=False): 69 for field, value in message.ListFields(): 70 if field.label == descriptor.FieldDescriptor.LABEL_REPEATED: 71 for element in value: 72 PrintField(field, element, out, indent, as_utf8, as_one_line) 73 else: 74 PrintField(field, value, out, indent, as_utf8, as_one_line) 75 76 77def PrintField(field, value, out, indent=0, as_utf8=False, as_one_line=False): 78 """Print a single field name/value pair. 
For repeated fields, the value 79 should be a single element.""" 80 81 out.write(' ' * indent); 82 if field.is_extension: 83 out.write('[') 84 if (field.containing_type.GetOptions().message_set_wire_format and 85 field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and 86 field.message_type == field.extension_scope and 87 field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL): 88 out.write(field.message_type.full_name) 89 else: 90 out.write(field.full_name) 91 out.write(']') 92 elif field.type == descriptor.FieldDescriptor.TYPE_GROUP: 93 # For groups, use the capitalized name. 94 out.write(field.message_type.name) 95 else: 96 out.write(field.name) 97 98 if field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE: 99 # The colon is optional in this case, but our cross-language golden files 100 # don't include it. 101 out.write(': ') 102 103 PrintFieldValue(field, value, out, indent, as_utf8, as_one_line) 104 if as_one_line: 105 out.write(' ') 106 else: 107 out.write('\n') 108 109 110def PrintFieldValue(field, value, out, indent=0, 111 as_utf8=False, as_one_line=False): 112 """Print a single field value (not including name). 
For repeated fields, 113 the value should be a single element.""" 114 115 if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE: 116 if as_one_line: 117 out.write(' { ') 118 PrintMessage(value, out, indent, as_utf8, as_one_line) 119 out.write('}') 120 else: 121 out.write(' {\n') 122 PrintMessage(value, out, indent + 2, as_utf8, as_one_line) 123 out.write(' ' * indent + '}') 124 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM: 125 enum_value = field.enum_type.values_by_number.get(value, None) 126 if enum_value is not None: 127 out.write(enum_value.name) 128 else: 129 out.write(str(value)) 130 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING: 131 out.write('\"') 132 if type(value) is unicode: 133 out.write(_CEscape(value.encode('utf-8'), as_utf8)) 134 else: 135 out.write(_CEscape(value, as_utf8)) 136 out.write('\"') 137 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL: 138 if value: 139 out.write("true") 140 else: 141 out.write("false") 142 else: 143 out.write(str(value)) 144 145 146def Merge(text, message): 147 """Merges an ASCII representation of a protocol message into a message. 148 149 Args: 150 text: Message ASCII representation. 151 message: A protocol buffer message to merge into. 152 153 Raises: 154 ParseError: On ASCII parsing problems. 155 """ 156 tokenizer = _Tokenizer(text) 157 while not tokenizer.AtEnd(): 158 _MergeField(tokenizer, message) 159 160 161def _MergeField(tokenizer, message): 162 """Merges a single protocol message field into a message. 163 164 Args: 165 tokenizer: A tokenizer to parse the field name and values. 166 message: A protocol message to record the data. 167 168 Raises: 169 ParseError: In case of ASCII parsing problems. 
170 """ 171 message_descriptor = message.DESCRIPTOR 172 if tokenizer.TryConsume('['): 173 name = [tokenizer.ConsumeIdentifier()] 174 while tokenizer.TryConsume('.'): 175 name.append(tokenizer.ConsumeIdentifier()) 176 name = '.'.join(name) 177 178 if not message_descriptor.is_extendable: 179 raise tokenizer.ParseErrorPreviousToken( 180 'Message type "%s" does not have extensions.' % 181 message_descriptor.full_name) 182 field = message.Extensions._FindExtensionByName(name) 183 if not field: 184 raise tokenizer.ParseErrorPreviousToken( 185 'Extension "%s" not registered.' % name) 186 elif message_descriptor != field.containing_type: 187 raise tokenizer.ParseErrorPreviousToken( 188 'Extension "%s" does not extend message type "%s".' % ( 189 name, message_descriptor.full_name)) 190 tokenizer.Consume(']') 191 else: 192 name = tokenizer.ConsumeIdentifier() 193 field = message_descriptor.fields_by_name.get(name, None) 194 195 # Group names are expected to be capitalized as they appear in the 196 # .proto file, which actually matches their type names, not their field 197 # names. 198 if not field: 199 field = message_descriptor.fields_by_name.get(name.lower(), None) 200 if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP: 201 field = None 202 203 if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and 204 field.message_type.name != name): 205 field = None 206 207 if not field: 208 raise tokenizer.ParseErrorPreviousToken( 209 'Message type "%s" has no field named "%s".' 
% ( 210 message_descriptor.full_name, name)) 211 212 if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE: 213 tokenizer.TryConsume(':') 214 215 if tokenizer.TryConsume('<'): 216 end_token = '>' 217 else: 218 tokenizer.Consume('{') 219 end_token = '}' 220 221 if field.label == descriptor.FieldDescriptor.LABEL_REPEATED: 222 if field.is_extension: 223 sub_message = message.Extensions[field].add() 224 else: 225 sub_message = getattr(message, field.name).add() 226 else: 227 if field.is_extension: 228 sub_message = message.Extensions[field] 229 else: 230 sub_message = getattr(message, field.name) 231 sub_message.SetInParent() 232 233 while not tokenizer.TryConsume(end_token): 234 if tokenizer.AtEnd(): 235 raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token)) 236 _MergeField(tokenizer, sub_message) 237 else: 238 _MergeScalarField(tokenizer, message, field) 239 240 241def _MergeScalarField(tokenizer, message, field): 242 """Merges a single protocol message scalar field into a message. 243 244 Args: 245 tokenizer: A tokenizer to parse the field value. 246 message: A protocol message to record the data. 247 field: The descriptor of the field to be merged. 248 249 Raises: 250 ParseError: In case of ASCII parsing problems. 251 RuntimeError: On runtime errors. 
252 """ 253 tokenizer.Consume(':') 254 value = None 255 256 if field.type in (descriptor.FieldDescriptor.TYPE_INT32, 257 descriptor.FieldDescriptor.TYPE_SINT32, 258 descriptor.FieldDescriptor.TYPE_SFIXED32): 259 value = tokenizer.ConsumeInt32() 260 elif field.type in (descriptor.FieldDescriptor.TYPE_INT64, 261 descriptor.FieldDescriptor.TYPE_SINT64, 262 descriptor.FieldDescriptor.TYPE_SFIXED64): 263 value = tokenizer.ConsumeInt64() 264 elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32, 265 descriptor.FieldDescriptor.TYPE_FIXED32): 266 value = tokenizer.ConsumeUint32() 267 elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64, 268 descriptor.FieldDescriptor.TYPE_FIXED64): 269 value = tokenizer.ConsumeUint64() 270 elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT, 271 descriptor.FieldDescriptor.TYPE_DOUBLE): 272 value = tokenizer.ConsumeFloat() 273 elif field.type == descriptor.FieldDescriptor.TYPE_BOOL: 274 value = tokenizer.ConsumeBool() 275 elif field.type == descriptor.FieldDescriptor.TYPE_STRING: 276 value = tokenizer.ConsumeString() 277 elif field.type == descriptor.FieldDescriptor.TYPE_BYTES: 278 value = tokenizer.ConsumeByteString() 279 elif field.type == descriptor.FieldDescriptor.TYPE_ENUM: 280 value = tokenizer.ConsumeEnum(field) 281 else: 282 raise RuntimeError('Unknown field type %d' % field.type) 283 284 if field.label == descriptor.FieldDescriptor.LABEL_REPEATED: 285 if field.is_extension: 286 message.Extensions[field].append(value) 287 else: 288 getattr(message, field.name).append(value) 289 else: 290 if field.is_extension: 291 message.Extensions[field] = value 292 else: 293 setattr(message, field.name, value) 294 295 296class _Tokenizer(object): 297 """Protocol buffer ASCII representation tokenizer. 298 299 This class handles the lower level string parsing by splitting it into 300 meaningful tokens. 301 302 It was directly ported from the Java protocol buffer API. 
303 """ 304 305 _WHITESPACE = re.compile('(\\s|(#.*$))+', re.MULTILINE) 306 _TOKEN = re.compile( 307 '[a-zA-Z_][0-9a-zA-Z_+-]*|' # an identifier 308 '[0-9+-][0-9a-zA-Z_.+-]*|' # a number 309 '\"([^\"\n\\\\]|\\\\.)*(\"|\\\\?$)|' # a double-quoted string 310 '\'([^\'\n\\\\]|\\\\.)*(\'|\\\\?$)') # a single-quoted string 311 _IDENTIFIER = re.compile('\w+') 312 313 def __init__(self, text_message): 314 self._text_message = text_message 315 316 self._position = 0 317 self._line = -1 318 self._column = 0 319 self._token_start = None 320 self.token = '' 321 self._lines = deque(text_message.split('\n')) 322 self._current_line = '' 323 self._previous_line = 0 324 self._previous_column = 0 325 self._SkipWhitespace() 326 self.NextToken() 327 328 def AtEnd(self): 329 """Checks the end of the text was reached. 330 331 Returns: 332 True iff the end was reached. 333 """ 334 return self.token == '' 335 336 def _PopLine(self): 337 while len(self._current_line) <= self._column: 338 if not self._lines: 339 self._current_line = '' 340 return 341 self._line += 1 342 self._column = 0 343 self._current_line = self._lines.popleft() 344 345 def _SkipWhitespace(self): 346 while True: 347 self._PopLine() 348 match = self._WHITESPACE.match(self._current_line, self._column) 349 if not match: 350 break 351 length = len(match.group(0)) 352 self._column += length 353 354 def TryConsume(self, token): 355 """Tries to consume a given piece of text. 356 357 Args: 358 token: Text to consume. 359 360 Returns: 361 True iff the text was consumed. 362 """ 363 if self.token == token: 364 self.NextToken() 365 return True 366 return False 367 368 def Consume(self, token): 369 """Consumes a piece of text. 370 371 Args: 372 token: Text to consume. 373 374 Raises: 375 ParseError: If the text couldn't be consumed. 376 """ 377 if not self.TryConsume(token): 378 raise self._ParseError('Expected "%s".' % token) 379 380 def ConsumeIdentifier(self): 381 """Consumes protocol message field identifier. 
382 383 Returns: 384 Identifier string. 385 386 Raises: 387 ParseError: If an identifier couldn't be consumed. 388 """ 389 result = self.token 390 if not self._IDENTIFIER.match(result): 391 raise self._ParseError('Expected identifier.') 392 self.NextToken() 393 return result 394 395 def ConsumeInt32(self): 396 """Consumes a signed 32bit integer number. 397 398 Returns: 399 The integer parsed. 400 401 Raises: 402 ParseError: If a signed 32bit integer couldn't be consumed. 403 """ 404 try: 405 result = ParseInteger(self.token, is_signed=True, is_long=False) 406 except ValueError, e: 407 raise self._ParseError(str(e)) 408 self.NextToken() 409 return result 410 411 def ConsumeUint32(self): 412 """Consumes an unsigned 32bit integer number. 413 414 Returns: 415 The integer parsed. 416 417 Raises: 418 ParseError: If an unsigned 32bit integer couldn't be consumed. 419 """ 420 try: 421 result = ParseInteger(self.token, is_signed=False, is_long=False) 422 except ValueError, e: 423 raise self._ParseError(str(e)) 424 self.NextToken() 425 return result 426 427 def ConsumeInt64(self): 428 """Consumes a signed 64bit integer number. 429 430 Returns: 431 The integer parsed. 432 433 Raises: 434 ParseError: If a signed 64bit integer couldn't be consumed. 435 """ 436 try: 437 result = ParseInteger(self.token, is_signed=True, is_long=True) 438 except ValueError, e: 439 raise self._ParseError(str(e)) 440 self.NextToken() 441 return result 442 443 def ConsumeUint64(self): 444 """Consumes an unsigned 64bit integer number. 445 446 Returns: 447 The integer parsed. 448 449 Raises: 450 ParseError: If an unsigned 64bit integer couldn't be consumed. 451 """ 452 try: 453 result = ParseInteger(self.token, is_signed=False, is_long=True) 454 except ValueError, e: 455 raise self._ParseError(str(e)) 456 self.NextToken() 457 return result 458 459 def ConsumeFloat(self): 460 """Consumes an floating point number. 461 462 Returns: 463 The number parsed. 
464 465 Raises: 466 ParseError: If a floating point number couldn't be consumed. 467 """ 468 try: 469 result = ParseFloat(self.token) 470 except ValueError, e: 471 raise self._ParseError(str(e)) 472 self.NextToken() 473 return result 474 475 def ConsumeBool(self): 476 """Consumes a boolean value. 477 478 Returns: 479 The bool parsed. 480 481 Raises: 482 ParseError: If a boolean value couldn't be consumed. 483 """ 484 try: 485 result = ParseBool(self.token) 486 except ValueError, e: 487 raise self._ParseError(str(e)) 488 self.NextToken() 489 return result 490 491 def ConsumeString(self): 492 """Consumes a string value. 493 494 Returns: 495 The string parsed. 496 497 Raises: 498 ParseError: If a string value couldn't be consumed. 499 """ 500 bytes = self.ConsumeByteString() 501 try: 502 return unicode(bytes, 'utf-8') 503 except UnicodeDecodeError, e: 504 raise self._StringParseError(e) 505 506 def ConsumeByteString(self): 507 """Consumes a byte array value. 508 509 Returns: 510 The array parsed (as a string). 511 512 Raises: 513 ParseError: If a byte array value couldn't be consumed. 514 """ 515 list = [self._ConsumeSingleByteString()] 516 while len(self.token) > 0 and self.token[0] in ('\'', '"'): 517 list.append(self._ConsumeSingleByteString()) 518 return "".join(list) 519 520 def _ConsumeSingleByteString(self): 521 """Consume one token of a string literal. 522 523 String literals (whether bytes or text) can come in multiple adjacent 524 tokens which are automatically concatenated, like in C or Python. This 525 method only consumes one token. 
526 """ 527 text = self.token 528 if len(text) < 1 or text[0] not in ('\'', '"'): 529 raise self._ParseError('Expected string.') 530 531 if len(text) < 2 or text[-1] != text[0]: 532 raise self._ParseError('String missing ending quote.') 533 534 try: 535 result = _CUnescape(text[1:-1]) 536 except ValueError, e: 537 raise self._ParseError(str(e)) 538 self.NextToken() 539 return result 540 541 def ConsumeEnum(self, field): 542 try: 543 result = ParseEnum(field, self.token) 544 except ValueError, e: 545 raise self._ParseError(str(e)) 546 self.NextToken() 547 return result 548 549 def ParseErrorPreviousToken(self, message): 550 """Creates and *returns* a ParseError for the previously read token. 551 552 Args: 553 message: A message to set for the exception. 554 555 Returns: 556 A ParseError instance. 557 """ 558 return ParseError('%d:%d : %s' % ( 559 self._previous_line + 1, self._previous_column + 1, message)) 560 561 def _ParseError(self, message): 562 """Creates and *returns* a ParseError for the current token.""" 563 return ParseError('%d:%d : %s' % ( 564 self._line + 1, self._column + 1, message)) 565 566 def _StringParseError(self, e): 567 return self._ParseError('Couldn\'t parse string: ' + str(e)) 568 569 def NextToken(self): 570 """Reads the next meaningful token.""" 571 self._previous_line = self._line 572 self._previous_column = self._column 573 574 self._column += len(self.token) 575 self._SkipWhitespace() 576 577 if not self._lines and len(self._current_line) <= self._column: 578 self.token = '' 579 return 580 581 match = self._TOKEN.match(self._current_line, self._column) 582 if match: 583 token = match.group(0) 584 self.token = token 585 else: 586 self.token = self._current_line[self._column] 587 588 589# text.encode('string_escape') does not seem to satisfy our needs as it 590# encodes unprintable characters using two-digit hex escapes whereas our 591# C++ unescaping function allows hex escapes to be any length. 
So, 592# "\0011".encode('string_escape') ends up being "\\x011", which will be 593# decoded in C++ as a single-character string with char code 0x11. 594def _CEscape(text, as_utf8): 595 def escape(c): 596 o = ord(c) 597 if o == 10: return r"\n" # optional escape 598 if o == 13: return r"\r" # optional escape 599 if o == 9: return r"\t" # optional escape 600 if o == 39: return r"\'" # optional escape 601 602 if o == 34: return r'\"' # necessary escape 603 if o == 92: return r"\\" # necessary escape 604 605 # necessary escapes 606 if not as_utf8 and (o >= 127 or o < 32): return "\\%03o" % o 607 return c 608 return "".join([escape(c) for c in text]) 609 610 611_CUNESCAPE_HEX = re.compile(r'(\\+)x([0-9a-fA-F])(?![0-9a-fA-F])') 612 613 614def _CUnescape(text): 615 def ReplaceHex(m): 616 # Only replace the match if the number of leading back slashes is odd. i.e. 617 # the slash itself is not escaped. 618 if len(m.group(1)) & 1: 619 return m.group(1) + 'x0' + m.group(2) 620 return m.group(0) 621 622 # This is required because the 'string_escape' encoding doesn't 623 # allow single-digit hex escapes (like '\xf'). 624 result = _CUNESCAPE_HEX.sub(ReplaceHex, text) 625 return result.decode('string_escape') 626 627 628def ParseInteger(text, is_signed=False, is_long=False): 629 """Parses an integer. 630 631 Args: 632 text: The text to parse. 633 is_signed: True if a signed integer must be parsed. 634 is_long: True if a long integer must be parsed. 635 636 Returns: 637 The integer value. 638 639 Raises: 640 ValueError: Thrown Iff the text is not a valid integer. 641 """ 642 # Do the actual parsing. Exception handling is propagated to caller. 643 try: 644 result = int(text, 0) 645 except ValueError: 646 raise ValueError('Couldn\'t parse integer: %s' % text) 647 648 # Check if the integer is sane. Exceptions handled by callers. 
649 checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)] 650 checker.CheckValue(result) 651 return result 652 653 654def ParseFloat(text): 655 """Parse a floating point number. 656 657 Args: 658 text: Text to parse. 659 660 Returns: 661 The number parsed. 662 663 Raises: 664 ValueError: If a floating point number couldn't be parsed. 665 """ 666 try: 667 # Assume Python compatible syntax. 668 return float(text) 669 except ValueError: 670 # Check alternative spellings. 671 if _FLOAT_INFINITY.match(text): 672 if text[0] == '-': 673 return float('-inf') 674 else: 675 return float('inf') 676 elif _FLOAT_NAN.match(text): 677 return float('nan') 678 else: 679 # assume '1.0f' format 680 try: 681 return float(text.rstrip('f')) 682 except ValueError: 683 raise ValueError('Couldn\'t parse float: %s' % text) 684 685 686def ParseBool(text): 687 """Parse a boolean value. 688 689 Args: 690 text: Text to parse. 691 692 Returns: 693 Boolean values parsed 694 695 Raises: 696 ValueError: If text is not a valid boolean. 697 """ 698 if text in ('true', 't', '1'): 699 return True 700 elif text in ('false', 'f', '0'): 701 return False 702 else: 703 raise ValueError('Expected "true" or "false".') 704 705 706def ParseEnum(field, value): 707 """Parse an enum value. 708 709 The value can be specified by a number (the enum value), or by 710 a string literal (the enum name). 711 712 Args: 713 field: Enum field descriptor. 714 value: String value. 715 716 Returns: 717 Enum value number. 718 719 Raises: 720 ValueError: If the enum value could not be parsed. 721 """ 722 enum_descriptor = field.enum_type 723 try: 724 number = int(value, 0) 725 except ValueError: 726 # Identifier. 727 enum_value = enum_descriptor.values_by_name.get(value, None) 728 if enum_value is None: 729 raise ValueError( 730 'Enum type "%s" has no value named %s.' % ( 731 enum_descriptor.full_name, value)) 732 else: 733 # Numeric value. 
734 enum_value = enum_descriptor.values_by_number.get(number, None) 735 if enum_value is None: 736 raise ValueError( 737 'Enum type "%s" has no value with number %d.' % ( 738 enum_descriptor.full_name, number)) 739 return enum_value.number 740