# Protocol Buffers - Google's data interchange format
# Copyright 2008 Google Inc.  All rights reserved.
# http://code.google.com/p/protobuf/
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""Contains routines for printing protocol messages in text format."""

__author__ = 'kenton@google.com (Kenton Varda)'

import cStringIO
import re

from collections import deque
from google.protobuf.internal import type_checkers
from google.protobuf import descriptor

__all__ = [ 'MessageToString', 'PrintMessage', 'PrintField',
            'PrintFieldValue', 'Merge' ]


# Infinity and NaN are not explicitly supported by Python pre-2.6, and
# float('inf') does not work on Windows (pre-2.6).
_INFINITY = 1e10000    # overflows, thus will actually be infinity.
_NAN = _INFINITY * 0


class ParseError(Exception):
  """Thrown in case of ASCII parsing error."""


def MessageToString(message):
  """Renders a protocol message in text format.

  Args:
    message: The protocol message to print.

  Returns:
    The text produced by PrintMessage() for the message, as a string.
  """
  out = cStringIO.StringIO()
  try:
    PrintMessage(message, out)
    return out.getvalue()
  finally:
    # Release the buffer even if printing raised.
    out.close()


def PrintMessage(message, out, indent = 0):
  """Writes every field reported by message.ListFields() to out.

  Args:
    message: The protocol message to print.
    out: A file-like object with a write() method.
    indent: Number of spaces to put in front of each field line.
  """
  for field, value in message.ListFields():
    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      # Repeated fields are printed as one name/value pair per element.
      for element in value:
        PrintField(field, element, out, indent)
    else:
      PrintField(field, value, out, indent)


def PrintField(field, value, out, indent = 0):
  """Print a single field name/value pair.  For repeated fields, the value
  should be a single element.

  Args:
    field: The descriptor of the field being printed.
    value: The field value (one element for a repeated field).
    out: A file-like object with a write() method.
    indent: Number of spaces to put in front of the line.
  """

  out.write(' ' * indent)
  if field.is_extension:
    out.write('[')
    if (field.containing_type.GetOptions().message_set_wire_format and
        field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
        field.message_type == field.extension_scope and
        field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
      # In this case the extension is written under its message type's name.
      out.write(field.message_type.full_name)
    else:
      out.write(field.full_name)
    out.write(']')
  elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
    # For groups, use the capitalized name.
    out.write(field.message_type.name)
  else:
    out.write(field.name)

  if field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    # The colon is optional in this case, but our cross-language golden files
    # don't include it.
    out.write(': ')

  PrintFieldValue(field, value, out, indent)
  out.write('\n')


def PrintFieldValue(field, value, out, indent = 0):
  """Print a single field value (not including name).  For repeated fields,
  the value should be a single element.

  Args:
    field: The descriptor of the field being printed.
    value: The field value (one element for a repeated field).
    out: A file-like object with a write() method.
    indent: Indentation of the enclosing field line; nested messages are
      printed two spaces deeper.
  """

  if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    out.write(' {\n')
    PrintMessage(value, out, indent + 2)
    out.write(' ' * indent + '}')
  elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
    out.write(field.enum_type.values_by_number[value].name)
  elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
    # _Tokenizer.ConsumeString() decodes the unescaped bytes as UTF-8, so a
    # unicode value must be escaped byte-by-byte from its UTF-8 encoding.
    # Escaping the code points directly would emit octal escapes that do not
    # round-trip for characters above 255.
    if isinstance(value, unicode):
      value = value.encode('utf-8')
    out.write('\"')
    out.write(_CEscape(value))
    out.write('\"')
  elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
    if value:
      out.write("true")
    else:
      out.write("false")
  else:
    out.write(str(value))


def Merge(text, message):
  """Merges an ASCII representation of a protocol message into a message.

  Args:
    text: Message ASCII representation.
    message: A protocol buffer message to merge into.

  Raises:
    ParseError: On ASCII parsing problems.
  """
  tokenizer = _Tokenizer(text)
  while not tokenizer.AtEnd():
    _MergeField(tokenizer, message)


def _MergeField(tokenizer, message):
  """Merges a single protocol message field into a message.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
    message: A protocol message to record the data.

  Raises:
    ParseError: In case of ASCII parsing problems.
  """
  message_descriptor = message.DESCRIPTOR
  if tokenizer.TryConsume('['):
    # Extension: a dotted full name enclosed in square brackets.
    name = [tokenizer.ConsumeIdentifier()]
    while tokenizer.TryConsume('.'):
      name.append(tokenizer.ConsumeIdentifier())
    name = '.'.join(name)

    if not message_descriptor.is_extendable:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" does not have extensions.' %
          message_descriptor.full_name)
    field = message.Extensions._FindExtensionByName(name)
    if not field:
      raise tokenizer.ParseErrorPreviousToken(
          'Extension "%s" not registered.' % name)
    elif message_descriptor != field.containing_type:
      raise tokenizer.ParseErrorPreviousToken(
          'Extension "%s" does not extend message type "%s".' % (
              name, message_descriptor.full_name))
    tokenizer.Consume(']')
  else:
    name = tokenizer.ConsumeIdentifier()
    field = message_descriptor.fields_by_name.get(name, None)

    # Group names are expected to be capitalized as they appear in the
    # .proto file, which actually matches their type names, not their field
    # names.
    if not field:
      field = message_descriptor.fields_by_name.get(name.lower(), None)
      if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
        field = None

    # A group must be spelled exactly like its type name.
    if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
        field.message_type.name != name):
      field = None

    if not field:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" has no field named "%s".' % (
              message_descriptor.full_name, name))

  if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    # The colon is optional before a nested message.
    tokenizer.TryConsume(':')

    # A nested message may be delimited by <...> or by {...}.
    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        sub_message = message.Extensions[field]
      else:
        sub_message = getattr(message, field.name)
      sub_message.SetInParent()

    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token))
      _MergeField(tokenizer, sub_message)
  else:
    _MergeScalarField(tokenizer, message, field)


def _MergeScalarField(tokenizer, message, field):
  """Merges a single protocol message scalar field into a message.

  Args:
    tokenizer: A tokenizer to parse the field value.
    message: A protocol message to record the data.
    field: The descriptor of the field to be merged.

  Raises:
    ParseError: In case of ASCII parsing problems.
    RuntimeError: On runtime errors.
  """
  tokenizer.Consume(':')
  value = None

  if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
                    descriptor.FieldDescriptor.TYPE_SINT32,
                    descriptor.FieldDescriptor.TYPE_SFIXED32):
    value = tokenizer.ConsumeInt32()
  elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
                      descriptor.FieldDescriptor.TYPE_SINT64,
                      descriptor.FieldDescriptor.TYPE_SFIXED64):
    value = tokenizer.ConsumeInt64()
  elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
                      descriptor.FieldDescriptor.TYPE_FIXED32):
    value = tokenizer.ConsumeUint32()
  elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
                      descriptor.FieldDescriptor.TYPE_FIXED64):
    value = tokenizer.ConsumeUint64()
  elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
                      descriptor.FieldDescriptor.TYPE_DOUBLE):
    value = tokenizer.ConsumeFloat()
  elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
    value = tokenizer.ConsumeBool()
  elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
    value = tokenizer.ConsumeString()
  elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
    value = tokenizer.ConsumeByteString()
  elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
    # Enum can be specified by a number (the enum value), or by
    # a string literal (the enum name).
    enum_descriptor = field.enum_type
    if tokenizer.LookingAtInteger():
      number = tokenizer.ConsumeInt32()
      enum_value = enum_descriptor.values_by_number.get(number, None)
      if enum_value is None:
        raise tokenizer.ParseErrorPreviousToken(
            'Enum type "%s" has no value with number %d.' % (
                enum_descriptor.full_name, number))
    else:
      identifier = tokenizer.ConsumeIdentifier()
      enum_value = enum_descriptor.values_by_name.get(identifier, None)
      if enum_value is None:
        raise tokenizer.ParseErrorPreviousToken(
            'Enum type "%s" has no value named %s.' % (
                enum_descriptor.full_name, identifier))
    value = enum_value.number
  else:
    raise RuntimeError('Unknown field type %d' % field.type)

  if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
    if field.is_extension:
      message.Extensions[field].append(value)
    else:
      getattr(message, field.name).append(value)
  else:
    if field.is_extension:
      message.Extensions[field] = value
    else:
      setattr(message, field.name, value)


class _Tokenizer(object):
  """Protocol buffer ASCII representation tokenizer.

  This class handles the lower level string parsing by splitting it into
  meaningful tokens.

  It was directly ported from the Java protocol buffer API.

  The current token is exposed as the public attribute `token`; it is the
  empty string once all input has been consumed.
  """

  _WHITESPACE = re.compile('(\\s|(#.*$))+', re.MULTILINE)
  _TOKEN = re.compile(
      '[a-zA-Z_][0-9a-zA-Z_+-]*|'           # an identifier
      '[0-9+-][0-9a-zA-Z_.+-]*|'            # a number
      '\"([^\"\n\\\\]|\\\\.)*(\"|\\\\?$)|'  # a double-quoted string
      '\'([^\'\n\\\\]|\\\\.)*(\'|\\\\?$)')  # a single-quoted string
  _IDENTIFIER = re.compile(r'\w+')
  # Indexed by 2 * is_long + is_signed; see _ParseInteger().
  _INTEGER_CHECKERS = [type_checkers.Uint32ValueChecker(),
                       type_checkers.Int32ValueChecker(),
                       type_checkers.Uint64ValueChecker(),
                       type_checkers.Int64ValueChecker()]
  # Anchored with '$' so that only the whole token counts; without the anchor
  # a token such as "information" would be read as infinity.
  _FLOAT_INFINITY = re.compile('-?inf(inity)?f?$', re.IGNORECASE)
  _FLOAT_NAN = re.compile("nanf?$", re.IGNORECASE)

  def __init__(self, text_message):
    """Splits text_message into lines and reads the first token.

    Args:
      text_message: The complete text to tokenize.
    """
    self._text_message = text_message

    self._position = 0
    self._line = -1
    self._column = 0
    self._token_start = None
    self.token = ''
    self._lines = deque(text_message.split('\n'))
    self._current_line = ''
    self._previous_line = 0
    self._previous_column = 0
    self._SkipWhitespace()
    self.NextToken()

  def AtEnd(self):
    """Checks the end of the text was reached.

    The check is made on the current token rather than on the remaining
    input: the last token is read into `token` at the moment the input
    becomes empty, and it must still be offered to the parser instead of
    being silently dropped.

    Returns:
      True iff the end was reached.
    """
    return not self.token

  def _PopLine(self):
    """Advances to the next non-empty line, if the current one is used up."""
    while not self._current_line:
      if not self._lines:
        self._current_line = ''
        return
      self._line += 1
      self._column = 0
      self._current_line = self._lines.popleft()

  def _SkipWhitespace(self):
    """Drops leading whitespace and '#' comments, across lines if needed."""
    while True:
      self._PopLine()
      match = self._WHITESPACE.match(self._current_line)
      if not match:
        break
      length = len(match.group(0))
      self._current_line = self._current_line[length:]
      self._column += length

  def TryConsume(self, token):
    """Tries to consume a given piece of text.

    Args:
      token: Text to consume.

    Returns:
      True iff the text was consumed.
    """
    if self.token == token:
      self.NextToken()
      return True
    return False

  def Consume(self, token):
    """Consumes a piece of text.

    Args:
      token: Text to consume.

    Raises:
      ParseError: If the text couldn't be consumed.
    """
    if not self.TryConsume(token):
      raise self._ParseError('Expected "%s".' % token)

  def LookingAtInteger(self):
    """Checks if the current token is an integer.

    Only the first character is inspected: a digit, '-' or '+'.

    Returns:
      True iff the current token is an integer.
    """
    if not self.token:
      return False
    c = self.token[0]
    return (c >= '0' and c <= '9') or c == '-' or c == '+'

  def ConsumeIdentifier(self):
    """Consumes protocol message field identifier.

    Returns:
      Identifier string.

    Raises:
      ParseError: If an identifier couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER.match(result):
      raise self._ParseError('Expected identifier.')
    self.NextToken()
    return result

  def _ConsumeInteger(self, is_signed, is_long):
    """Parses the current token as an integer and advances past it.

    Args:
      is_signed: True if a signed integer must be parsed.
      is_long: True if a 64bit integer must be parsed.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If the token is not a valid integer of the requested kind.
    """
    try:
      result = self._ParseInteger(
          self.token, is_signed=is_signed, is_long=is_long)
    except ValueError as e:
      raise self._IntegerParseError(e)
    self.NextToken()
    return result

  def ConsumeInt32(self):
    """Consumes a signed 32bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If a signed 32bit integer couldn't be consumed.
    """
    return self._ConsumeInteger(is_signed=True, is_long=False)

  def ConsumeUint32(self):
    """Consumes an unsigned 32bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If an unsigned 32bit integer couldn't be consumed.
    """
    return self._ConsumeInteger(is_signed=False, is_long=False)

  def ConsumeInt64(self):
    """Consumes a signed 64bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If a signed 64bit integer couldn't be consumed.
    """
    return self._ConsumeInteger(is_signed=True, is_long=True)

  def ConsumeUint64(self):
    """Consumes an unsigned 64bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If an unsigned 64bit integer couldn't be consumed.
    """
    return self._ConsumeInteger(is_signed=False, is_long=True)

  def ConsumeFloat(self):
    """Consumes a floating point number.

    Besides anything float() accepts, the case-insensitive spellings
    "inf", "infinity" (optionally negated and/or with an "f" suffix) and
    "nan" (optionally with an "f" suffix) are recognized.

    Returns:
      The number parsed.

    Raises:
      ParseError: If a floating point number couldn't be consumed.
    """
    text = self.token
    if self._FLOAT_INFINITY.match(text):
      self.NextToken()
      if text.startswith('-'):
        return -_INFINITY
      return _INFINITY

    if self._FLOAT_NAN.match(text):
      self.NextToken()
      return _NAN

    try:
      result = float(text)
    except ValueError as e:
      raise self._FloatParseError(e)
    self.NextToken()
    return result

  def ConsumeBool(self):
    """Consumes a boolean value.

    Returns:
      The bool parsed.

    Raises:
      ParseError: If a boolean value couldn't be consumed.
    """
    if self.token == 'true':
      self.NextToken()
      return True
    elif self.token == 'false':
      self.NextToken()
      return False
    else:
      raise self._ParseError('Expected "true" or "false".')

  def ConsumeString(self):
    """Consumes a string value.

    Returns:
      The string parsed, decoded from UTF-8 into a unicode object.

    Raises:
      ParseError: If a string value couldn't be consumed.
    """
    return unicode(self.ConsumeByteString(), 'utf-8')

  def ConsumeByteString(self):
    """Consumes a byte array value.

    Adjacent string literals are concatenated.

    Returns:
      The array parsed (as a string).

    Raises:
      ParseError: If a byte array value couldn't be consumed.
    """
    pieces = [self._ConsumeSingleByteString()]
    while len(self.token) > 0 and self.token[0] in ('\'', '"'):
      pieces.append(self._ConsumeSingleByteString())
    return "".join(pieces)

  def _ConsumeSingleByteString(self):
    """Consume one token of a string literal.

    String literals (whether bytes or text) can come in multiple adjacent
    tokens which are automatically concatenated, like in C or Python.  This
    method only consumes one token.

    Returns:
      The unescaped contents of the literal, without the quotes.

    Raises:
      ParseError: If the current token is not a properly terminated string
        literal, or if it contains an escape _CUnescape() rejects.
    """
    text = self.token
    if len(text) < 1 or text[0] not in ('\'', '"'):
      raise self._ParseError('Expected string.')

    if len(text) < 2 or text[-1] != text[0]:
      raise self._ParseError('String missing ending quote.')

    try:
      result = _CUnescape(text[1:-1])
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def _ParseInteger(self, text, is_signed=False, is_long=False):
    """Parses an integer.

    A "0x"/"0X" prefix (after an optional '-') selects base 16, any other
    leading "0" selects base 8, everything else is base 10.

    Args:
      text: The text to parse.
      is_signed: True if a signed integer must be parsed.
      is_long: True if a long integer must be parsed.

    Returns:
      The integer value.

    Raises:
      ValueError: Thrown Iff the text is not a valid integer.
    """
    pos = 0
    if text.startswith('-'):
      pos += 1

    base = 10
    if text.startswith('0x', pos) or text.startswith('0X', pos):
      base = 16
    elif text.startswith('0', pos):
      base = 8

    # Do the actual parsing. Exception handling is propagated to caller.
    result = int(text, base)

    # Check if the integer is sane. Exceptions handled by callers.
    checker = self._INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
    checker.CheckValue(result)
    return result

  def ParseErrorPreviousToken(self, message):
    """Creates and *returns* a ParseError for the previously read token.

    Args:
      message: A message to set for the exception.

    Returns:
      A ParseError instance.
    """
    return ParseError('%d:%d : %s' % (
        self._previous_line + 1, self._previous_column + 1, message))

  def _ParseError(self, message):
    """Creates and *returns* a ParseError for the current token."""
    return ParseError('%d:%d : %s' % (
        self._line + 1, self._column + 1, message))

  def _IntegerParseError(self, e):
    """Creates and *returns* a ParseError describing a bad integer."""
    return self._ParseError('Couldn\'t parse integer: ' + str(e))

  def _FloatParseError(self, e):
    """Creates and *returns* a ParseError describing a bad number."""
    return self._ParseError('Couldn\'t parse number: ' + str(e))

  def NextToken(self):
    """Reads the next meaningful token.

    Sets `token` to the empty string when no input is left.
    """
    self._previous_line = self._line
    self._previous_column = self._column
    # Test the remaining input here, not AtEnd(): AtEnd() looks at the
    # current token, which is still the one being replaced.
    if not self._lines and not self._current_line:
      self.token = ''
      return
    # NOTE(review): the column bookkeeping is approximate -- whitespace
    # skipped after a token and line changes in _PopLine() also move
    # self._column, so reported columns can be off.
    self._column += len(self.token)

    # Make sure there is data to work on.
    self._PopLine()

    match = self._TOKEN.match(self._current_line)
    if match:
      token = match.group(0)
      self._current_line = self._current_line[len(token):]
      self.token = token
    else:
      # Anything else (punctuation such as ':', '{', '[') is a one-character
      # token.
      self.token = self._current_line[0]
      self._current_line = self._current_line[1:]
    self._SkipWhitespace()


# text.encode('string_escape') does not seem to satisfy our needs as it
# encodes unprintable characters using two-digit hex escapes whereas our
# C++ unescaping function allows hex escapes to be any length.  So,
# "\0011".encode('string_escape') ends up being "\\x011", which will be
# decoded in C++ as a single-character string with char code 0x11.
def _CEscape(text):
  """Escapes text so that it can be written inside a quoted string literal.

  Newline, carriage return, tab, both quote characters and the backslash get
  their symbolic escape; every other character with a code below 32 or at
  least 127 is written as a backslash followed by its octal code (at least
  three digits).

  Args:
    text: The string to escape.

  Returns:
    The escaped string.
  """
  def escape(c):
    o = ord(c)
    if o == 10: return r"\n"   # optional escape
    if o == 13: return r"\r"   # optional escape
    if o ==  9: return r"\t"   # optional escape
    if o == 39: return r"\'"   # optional escape

    if o == 34: return r'\"'   # necessary escape
    if o == 92: return r"\\"   # necessary escape

    if o >= 127 or o < 32: return "\\%03o" % o # necessary escapes
    return c
  return "".join([escape(c) for c in text])


# A run of backslashes followed by 'x' and exactly one hex digit, i.e. a
# candidate single-digit hex escape such as '\xf'.
_CUNESCAPE_HEX = re.compile(r'(\\+)x([0-9a-fA-F])(?![0-9a-fA-F])')


def _PadSingleDigitHexEscapes(text):
  """Rewrites single-digit hex escapes ('\\xf') as two-digit ones ('\\x0f').

  Args:
    text: The still-escaped string.

  Returns:
    The string with every genuine single-digit hex escape zero-padded.
  """
  def ReplaceHex(m):
    # Only an odd number of backslashes leaves one that actually introduces
    # the escape; with an even number they are all escaped backslashes and
    # the 'x' is ordinary text.
    if len(m.group(1)) & 1:
      return m.group(1) + 'x0' + m.group(2)
    return m.group(0)
  return _CUNESCAPE_HEX.sub(ReplaceHex, text)


def _CUnescape(text):
  """Reverses C-style escaping of a string literal's contents.

  Args:
    text: The escaped string (without the surrounding quotes).

  Returns:
    The unescaped string.

  Raises:
    ValueError: If the 'string_escape' codec rejects an escape sequence.
  """
  # This is required because the 'string_escape' encoding doesn't
  # allow single-digit hex escapes (like '\xf').  The escapes are only
  # padded here, never decoded: decoding them first would let a produced
  # backslash (e.g. from '\x5c') be interpreted a second time below.
  result = _PadSingleDigitHexEscapes(text)
  return result.decode('string_escape')