codecs.py revision e2713becd8cb0c3b2db4d33832dd57a1d619f0f3
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import __builtin__, sys

### Registry and builtin stateless codec functions

# The C implementation provides register(), lookup(), the builtin
# codecs and the error-handler registry (register_error/lookup_error).
try:
    from _codecs import *
except ImportError, why:
    raise SystemError,\
          'Failed to load the builtin codecs: %s' % why

__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE


### Codec base classes (defining the API)

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs on
                     decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

#
# The StreamWriter and StreamReader class provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        # .encode() returns (encoded data, length consumed); only the
        # data is written, the consumed count is discarded here.
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        # getattr is bound as a default argument so the builtin is
        # still reachable even during interpreter shutdown.
        return getattr(self.stream, name)

###

class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Bytes read from the stream but not yet decoded (e.g. a
        # partial multi-byte sequence left over by the decoder).
        self.bytebuffer = ""
        # Characters decoded but not yet returned to the caller.
        self.charbuffer = u""
        # True when the last data returned ended in u"\r"; used by
        # readline() to drop a following u"\n" of a split "\r\n" pair.
        self.atcr = False

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

        """
        # read until we get the required number of characters (if available)
        while True:
            # can the request can be satisfied from the character buffer?
            if chars < 0:
                if self.charbuffer:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            newchars, decodedbytes = self.decode(data, self.errors)
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = u""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        readsize = size or 72
        line = u""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize)
            # A "\r\n" pair may have been split across two read()
            # calls; self.atcr remembers the trailing "\r" so the
            # leading "\n" of the next chunk can be dropped.
            if self.atcr and data.startswith(u"\n"):
                data = data[1:]
            if data:
                self.atcr = data.endswith(u"\r")
            line += data
            lines = line.splitlines(True)
            if lines:
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = u"".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # grow the read size exponentially (capped) to find the
            # line end in as few read() calls as possible
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        pass

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then return encoded data to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        # sizehint is ignored (see StreamReader.readlines)
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually by
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    (e, d, sr, sw) = lookup(encoding)
    srw = StreamReaderWriter(file, sr, sw, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # lookup() returns (encoder, decoder, StreamReader, StreamWriter)
    encode, decode = lookup(data_encoding)[:2]
    Reader, Writer = lookup(file_encoding)[2:]
    sr = StreamRecoder(file,
                       encode, decode, Reader, Writer,
                       errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

### Helpers for codec lookup

def getencoder(encoding):

    """ Lookup up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[0]

def getdecoder(encoding):

    """ Lookup up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[1]

def getreader(encoding):

    """ Lookup up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[2]

def getwriter(encoding):

    """ Lookup up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[3]

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i]=i
    return res

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \u001a.

    """
    m = {}
    for k,v in decoding_map.items():
        if not v in m:
            m[v] = k
        else:
            # duplicate target: mark the mapping as undefined
            m[v] = None
    return m

### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handler are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')