codecs.py revision d594849c42b6141622f8e442e26b49e2df6ef4ff
1""" codecs -- Python Codec Registry, API and helpers. 2 3 4Written by Marc-Andre Lemburg (mal@lemburg.com). 5 6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 7 8"""#" 9 10import __builtin__, sys 11 12### Registry and builtin stateless codec functions 13 14try: 15 from _codecs import * 16except ImportError, why: 17 raise SystemError,\ 18 'Failed to load the builtin codecs: %s' % why 19 20__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", 21 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", 22 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", 23 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", 24 "strict_errors", "ignore_errors", "replace_errors", 25 "xmlcharrefreplace_errors", 26 "register_error", "lookup_error"] 27 28### Constants 29 30# 31# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) 32# and its possible byte string values 33# for UTF8/UTF16/UTF32 output and little/big endian machines 34# 35 36# UTF-8 37BOM_UTF8 = '\xef\xbb\xbf' 38 39# UTF-16, little endian 40BOM_LE = BOM_UTF16_LE = '\xff\xfe' 41 42# UTF-16, big endian 43BOM_BE = BOM_UTF16_BE = '\xfe\xff' 44 45# UTF-32, little endian 46BOM_UTF32_LE = '\xff\xfe\x00\x00' 47 48# UTF-32, big endian 49BOM_UTF32_BE = '\x00\x00\xfe\xff' 50 51if sys.byteorder == 'little': 52 53 # UTF-16, native endianness 54 BOM = BOM_UTF16 = BOM_UTF16_LE 55 56 # UTF-32, native endianness 57 BOM_UTF32 = BOM_UTF32_LE 58 59else: 60 61 # UTF-16, native endianness 62 BOM = BOM_UTF16 = BOM_UTF16_BE 63 64 # UTF-32, native endianness 65 BOM_UTF32 = BOM_UTF32_BE 66 67# Old broken names (don't use in new code) 68BOM32_LE = BOM_UTF16_LE 69BOM32_BE = BOM_UTF16_BE 70BOM64_LE = BOM_UTF32_LE 71BOM64_BE = BOM_UTF32_BE 72 73 74### Codec base classes (defining the API) 75 76class Codec: 77 78 """ Defines the interface for stateless encoders/decoders. 79 80 The .encode()/.decode() methods may use different error 81 handling schemes by providing the errors argument. These 82 string values are predefined: 83 84 'strict' - raise a ValueError error (or a subclass) 85 'ignore' - ignore the character and continue with the next 86 'replace' - replace with a suitable replacement character; 87 Python will use the official U+FFFD REPLACEMENT 88 CHARACTER for the builtin Unicode codecs on 89 decoding and '?' on encoding. 90 'xmlcharrefreplace' - Replace with the appropriate XML 91 character reference (only for encoding). 92 'backslashreplace' - Replace with backslashed escape sequences 93 (only for encoding). 94 95 The set of allowed values can be extended via register_error. 96 97 """ 98 def encode(self, input, errors='strict'): 99 100 """ Encodes the object input and returns a tuple (output 101 object, length consumed). 102 103 errors defines the error handling to apply. It defaults to 104 'strict' handling. 105 106 The method may not store state in the Codec instance. Use 107 StreamCodec for codecs which have to keep state in order to 108 make encoding/decoding efficient. 109 110 The encoder must be able to handle zero length input and 111 return an empty object of the output object type in this 112 situation. 113 114 """ 115 raise NotImplementedError 116 117 def decode(self, input, errors='strict'): 118 119 """ Decodes the object input and returns a tuple (output 120 object, length consumed). 121 122 input must be an object which provides the bf_getreadbuf 123 buffer slot. Python strings, buffer objects and memory 124 mapped files are examples of objects providing this slot. 125 126 errors defines the error handling to apply. It defaults to 127 'strict' handling. 128 129 The method may not store state in the Codec instance. Use 130 StreamCodec for codecs which have to keep state in order to 131 make encoding/decoding efficient. 132 133 The decoder must be able to handle zero length input and 134 return an empty object of the output object type in this 135 situation. 136 137 """ 138 raise NotImplementedError 139 140# 141# The StreamWriter and StreamReader class provide generic working 142# interfaces which can be used to implement new encoding submodules 143# very easily. See encodings/utf_8.py for an example on how this is 144# done. 145# 146 147class StreamWriter(Codec): 148 149 def __init__(self, stream, errors='strict'): 150 151 """ Creates a StreamWriter instance. 152 153 stream must be a file-like object open for writing 154 (binary) data. 155 156 The StreamWriter may use different error handling 157 schemes by providing the errors keyword argument. These 158 parameters are predefined: 159 160 'strict' - raise a ValueError (or a subclass) 161 'ignore' - ignore the character and continue with the next 162 'replace'- replace with a suitable replacement character 163 'xmlcharrefreplace' - Replace with the appropriate XML 164 character reference. 165 'backslashreplace' - Replace with backslashed escape 166 sequences (only for encoding). 167 168 The set of allowed parameter values can be extended via 169 register_error. 170 """ 171 self.stream = stream 172 self.errors = errors 173 174 def write(self, object): 175 176 """ Writes the object's contents encoded to self.stream. 177 """ 178 data, consumed = self.encode(object, self.errors) 179 self.stream.write(data) 180 181 def writelines(self, list): 182 183 """ Writes the concatenated list of strings to the stream 184 using .write(). 185 """ 186 self.write(''.join(list)) 187 188 def reset(self): 189 190 """ Flushes and resets the codec buffers used for keeping state. 191 192 Calling this method should ensure that the data on the 193 output is put into a clean state, that allows appending 194 of new fresh data without having to rescan the whole 195 stream to recover state. 196 197 """ 198 pass 199 200 def __getattr__(self, name, 201 getattr=getattr): 202 203 """ Inherit all other methods from the underlying stream. 204 """ 205 return getattr(self.stream, name) 206 207### 208 209class StreamReader(Codec): 210 211 def __init__(self, stream, errors='strict'): 212 213 """ Creates a StreamReader instance. 214 215 stream must be a file-like object open for reading 216 (binary) data. 217 218 The StreamReader may use different error handling 219 schemes by providing the errors keyword argument. These 220 parameters are predefined: 221 222 'strict' - raise a ValueError (or a subclass) 223 'ignore' - ignore the character and continue with the next 224 'replace'- replace with a suitable replacement character; 225 226 The set of allowed parameter values can be extended via 227 register_error. 228 """ 229 self.stream = stream 230 self.errors = errors 231 232 def read(self, size=-1): 233 234 """ Decodes data from the stream self.stream and returns the 235 resulting object. 236 237 size indicates the approximate maximum number of bytes to 238 read from the stream for decoding purposes. The decoder 239 can modify this setting as appropriate. The default value 240 -1 indicates to read and decode as much as possible. size 241 is intended to prevent having to decode huge files in one 242 step. 243 244 The method should use a greedy read strategy meaning that 245 it should read as much data as is allowed within the 246 definition of the encoding and the given size, e.g. if 247 optional encoding endings or state markers are available 248 on the stream, these should be read too. 249 250 """ 251 # Unsliced reading: 252 if size < 0: 253 return self.decode(self.stream.read(), self.errors)[0] 254 255 # Sliced reading: 256 read = self.stream.read 257 decode = self.decode 258 data = read(size) 259 i = 0 260 while 1: 261 try: 262 object, decodedbytes = decode(data, self.errors) 263 except ValueError, why: 264 # This method is slow but should work under pretty much 265 # all conditions; at most 10 tries are made 266 i = i + 1 267 newdata = read(1) 268 if not newdata or i > 10: 269 raise 270 data = data + newdata 271 else: 272 return object 273 274 def readline(self, size=None): 275 276 """ Read one line from the input stream and return the 277 decoded data. 278 279 Note: Unlike the .readlines() method, this method inherits 280 the line breaking knowledge from the underlying stream's 281 .readline() method -- there is currently no support for 282 line breaking using the codec decoder due to lack of line 283 buffering. Subclasses should however, if possible, try to 284 implement this method using their own knowledge of line 285 breaking. 286 287 size, if given, is passed as size argument to the stream's 288 .readline() method. 289 290 """ 291 if size is None: 292 line = self.stream.readline() 293 else: 294 line = self.stream.readline(size) 295 return self.decode(line, self.errors)[0] 296 297 298 def readlines(self, sizehint=None): 299 300 """ Read all lines available on the input stream 301 and return them as list of lines. 302 303 Line breaks are implemented using the codec's decoder 304 method and are included in the list entries. 305 306 sizehint, if given, is ignored since there is no efficient 307 way to finding the true end-of-line. 308 309 """ 310 data = self.stream.read() 311 return self.decode(data, self.errors)[0].splitlines(1) 312 313 def reset(self): 314 315 """ Resets the codec buffers used for keeping state. 316 317 Note that no stream repositioning should take place. 318 This method is primarily intended to be able to recover 319 from decoding errors. 320 321 """ 322 pass 323 324 def next(self): 325 326 """ Return the next decoded line from the input stream.""" 327 line = self.readline() 328 if line: 329 return line 330 raise StopIteration 331 332 def __iter__(self): 333 return self 334 335 def __getattr__(self, name, 336 getattr=getattr): 337 338 """ Inherit all other methods from the underlying stream. 339 """ 340 return getattr(self.stream, name) 341 342### 343 344class StreamReaderWriter: 345 346 """ StreamReaderWriter instances allow wrapping streams which 347 work in both read and write modes. 348 349 The design is such that one can use the factory functions 350 returned by the codec.lookup() function to construct the 351 instance. 352 353 """ 354 # Optional attributes set by the file wrappers below 355 encoding = 'unknown' 356 357 def __init__(self, stream, Reader, Writer, errors='strict'): 358 359 """ Creates a StreamReaderWriter instance. 360 361 stream must be a Stream-like object. 362 363 Reader, Writer must be factory functions or classes 364 providing the StreamReader, StreamWriter interface resp. 365 366 Error handling is done in the same way as defined for the 367 StreamWriter/Readers. 368 369 """ 370 self.stream = stream 371 self.reader = Reader(stream, errors) 372 self.writer = Writer(stream, errors) 373 self.errors = errors 374 375 def read(self, size=-1): 376 377 return self.reader.read(size) 378 379 def readline(self, size=None): 380 381 return self.reader.readline(size) 382 383 def readlines(self, sizehint=None): 384 385 return self.reader.readlines(sizehint) 386 387 def next(self): 388 389 """ Return the next decoded line from the input stream.""" 390 return self.reader.next() 391 392 def __iter__(self): 393 return self 394 395 def write(self, data): 396 397 return self.writer.write(data) 398 399 def writelines(self, list): 400 401 return self.writer.writelines(list) 402 403 def reset(self): 404 405 self.reader.reset() 406 self.writer.reset() 407 408 def __getattr__(self, name, 409 getattr=getattr): 410 411 """ Inherit all other methods from the underlying stream. 412 """ 413 return getattr(self.stream, name) 414 415### 416 417class StreamRecoder: 418 419 """ StreamRecoder instances provide a frontend - backend 420 view of encoding data. 421 422 They use the complete set of APIs returned by the 423 codecs.lookup() function to implement their task. 424 425 Data written to the stream is first decoded into an 426 intermediate format (which is dependent on the given codec 427 combination) and then written to the stream using an instance 428 of the provided Writer class. 429 430 In the other direction, data is read from the stream using a 431 Reader instance and then return encoded data to the caller. 432 433 """ 434 # Optional attributes set by the file wrappers below 435 data_encoding = 'unknown' 436 file_encoding = 'unknown' 437 438 def __init__(self, stream, encode, decode, Reader, Writer, 439 errors='strict'): 440 441 """ Creates a StreamRecoder instance which implements a two-way 442 conversion: encode and decode work on the frontend (the 443 input to .read() and output of .write()) while 444 Reader and Writer work on the backend (reading and 445 writing to the stream). 446 447 You can use these objects to do transparent direct 448 recodings from e.g. latin-1 to utf-8 and back. 449 450 stream must be a file-like object. 451 452 encode, decode must adhere to the Codec interface, Reader, 453 Writer must be factory functions or classes providing the 454 StreamReader, StreamWriter interface resp. 455 456 encode and decode are needed for the frontend translation, 457 Reader and Writer for the backend translation. Unicode is 458 used as intermediate encoding. 459 460 Error handling is done in the same way as defined for the 461 StreamWriter/Readers. 462 463 """ 464 self.stream = stream 465 self.encode = encode 466 self.decode = decode 467 self.reader = Reader(stream, errors) 468 self.writer = Writer(stream, errors) 469 self.errors = errors 470 471 def read(self, size=-1): 472 473 data = self.reader.read(size) 474 data, bytesencoded = self.encode(data, self.errors) 475 return data 476 477 def readline(self, size=None): 478 479 if size is None: 480 data = self.reader.readline() 481 else: 482 data = self.reader.readline(size) 483 data, bytesencoded = self.encode(data, self.errors) 484 return data 485 486 def readlines(self, sizehint=None): 487 488 data = self.reader.read() 489 data, bytesencoded = self.encode(data, self.errors) 490 return data.splitlines(1) 491 492 def next(self): 493 494 """ Return the next decoded line from the input stream.""" 495 return self.reader.next() 496 497 def __iter__(self): 498 return self 499 500 def write(self, data): 501 502 data, bytesdecoded = self.decode(data, self.errors) 503 return self.writer.write(data) 504 505 def writelines(self, list): 506 507 data = ''.join(list) 508 data, bytesdecoded = self.decode(data, self.errors) 509 return self.writer.write(data) 510 511 def reset(self): 512 513 self.reader.reset() 514 self.writer.reset() 515 516 def __getattr__(self, name, 517 getattr=getattr): 518 519 """ Inherit all other methods from the underlying stream. 520 """ 521 return getattr(self.stream, name) 522 523### Shortcuts 524 525def open(filename, mode='rb', encoding=None, errors='strict', buffering=1): 526 527 """ Open an encoded file using the given mode and return 528 a wrapped version providing transparent encoding/decoding. 529 530 Note: The wrapped version will only accept the object format 531 defined by the codecs, i.e. Unicode objects for most builtin 532 codecs. Output is also codec dependent and will usually by 533 Unicode as well. 534 535 Files are always opened in binary mode, even if no binary mode 536 was specified. This is done to avoid data loss due to encodings 537 using 8-bit values. The default file mode is 'rb' meaning to 538 open the file in binary read mode. 539 540 encoding specifies the encoding which is to be used for the 541 file. 542 543 errors may be given to define the error handling. It defaults 544 to 'strict' which causes ValueErrors to be raised in case an 545 encoding error occurs. 546 547 buffering has the same meaning as for the builtin open() API. 548 It defaults to line buffered. 549 550 The returned wrapped file object provides an extra attribute 551 .encoding which allows querying the used encoding. This 552 attribute is only available if an encoding was specified as 553 parameter. 554 555 """ 556 if encoding is not None and \ 557 'b' not in mode: 558 # Force opening of the file in binary mode 559 mode = mode + 'b' 560 file = __builtin__.open(filename, mode, buffering) 561 if encoding is None: 562 return file 563 (e, d, sr, sw) = lookup(encoding) 564 srw = StreamReaderWriter(file, sr, sw, errors) 565 # Add attributes to simplify introspection 566 srw.encoding = encoding 567 return srw 568 569def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'): 570 571 """ Return a wrapped version of file which provides transparent 572 encoding translation. 573 574 Strings written to the wrapped file are interpreted according 575 to the given data_encoding and then written to the original 576 file as string using file_encoding. The intermediate encoding 577 will usually be Unicode but depends on the specified codecs. 578 579 Strings are read from the file using file_encoding and then 580 passed back to the caller as string using data_encoding. 581 582 If file_encoding is not given, it defaults to data_encoding. 583 584 errors may be given to define the error handling. It defaults 585 to 'strict' which causes ValueErrors to be raised in case an 586 encoding error occurs. 587 588 The returned wrapped file object provides two extra attributes 589 .data_encoding and .file_encoding which reflect the given 590 parameters of the same name. The attributes can be used for 591 introspection by Python programs. 592 593 """ 594 if file_encoding is None: 595 file_encoding = data_encoding 596 encode, decode = lookup(data_encoding)[:2] 597 Reader, Writer = lookup(file_encoding)[2:] 598 sr = StreamRecoder(file, 599 encode, decode, Reader, Writer, 600 errors) 601 # Add attributes to simplify introspection 602 sr.data_encoding = data_encoding 603 sr.file_encoding = file_encoding 604 return sr 605 606### Helpers for codec lookup 607 608def getencoder(encoding): 609 610 """ Lookup up the codec for the given encoding and return 611 its encoder function. 612 613 Raises a LookupError in case the encoding cannot be found. 614 615 """ 616 return lookup(encoding)[0] 617 618def getdecoder(encoding): 619 620 """ Lookup up the codec for the given encoding and return 621 its decoder function. 622 623 Raises a LookupError in case the encoding cannot be found. 624 625 """ 626 return lookup(encoding)[1] 627 628def getreader(encoding): 629 630 """ Lookup up the codec for the given encoding and return 631 its StreamReader class or factory function. 632 633 Raises a LookupError in case the encoding cannot be found. 634 635 """ 636 return lookup(encoding)[2] 637 638def getwriter(encoding): 639 640 """ Lookup up the codec for the given encoding and return 641 its StreamWriter class or factory function. 642 643 Raises a LookupError in case the encoding cannot be found. 644 645 """ 646 return lookup(encoding)[3] 647 648### Helpers for charmap-based codecs 649 650def make_identity_dict(rng): 651 652 """ make_identity_dict(rng) -> dict 653 654 Return a dictionary where elements of the rng sequence are 655 mapped to themselves. 656 657 """ 658 res = {} 659 for i in rng: 660 res[i]=i 661 return res 662 663def make_encoding_map(decoding_map): 664 665 """ Creates an encoding map from a decoding map. 666 667 If a target mapping in the decoding map occurs multiple 668 times, then that target is mapped to None (undefined mapping), 669 causing an exception when encountered by the charmap codec 670 during translation. 671 672 One example where this happens is cp875.py which decodes 673 multiple character to \u001a. 674 675 """ 676 m = {} 677 for k,v in decoding_map.items(): 678 if not v in m: 679 m[v] = k 680 else: 681 m[v] = None 682 return m 683 684### error handlers 685 686strict_errors = lookup_error("strict") 687ignore_errors = lookup_error("ignore") 688replace_errors = lookup_error("replace") 689xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace") 690backslashreplace_errors = lookup_error("backslashreplace") 691 692# Tell modulefinder that using codecs probably needs the encodings 693# package 694_false = 0 695if _false: 696 import encodings 697 698### Tests 699 700if __name__ == '__main__': 701 702 # Make stdout translate Latin-1 output into UTF-8 output 703 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8') 704 705 # Have stdin translate Latin-1 input into UTF-8 input 706 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1') 707