# codecs.py revision 7f82f7955efb5ad32e142a3164341c53565c7df0
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""

import struct
import sys

try:
    import __builtin__
except ImportError:
    # Python 3 names this module 'builtins'.
    import builtins as __builtin__

### Registry and builtin stateless codec functions

try:
    from _codecs import *
except ImportError:
    # sys.exc_info() is used instead of binding the exception in the
    # except clause so that no version-specific syntax is needed.
    raise SystemError(
        'Failed to load the builtin codecs: %s' % sys.exc_info()[1])

__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# UTF-16, native endianness
BOM = BOM_UTF16 = struct.pack('=H', 0xFEFF)

# UTF-32, native endianness
BOM_UTF32 = struct.pack('=L', 0x0000FEFF)

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE


### Codec base classes (defining the API)

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data = self.encode(object, self.errors)[0]
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

###

class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            If decoding the chunk raises a ValueError, one more byte
            is read and decoding is retried; the error is re-raised
            once the stream is exhausted or after 10 extra bytes.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        retries = 0
        while 1:
            try:
                decoded = decode(data, self.errors)[0]
            except ValueError:
                # This method is slow but should work under pretty much
                # all conditions; at most 10 tries are made
                retries = retries + 1
                newdata = read(1)
                if not newdata or retries > 10:
                    raise
                data = data + newdata
            else:
                return decoded

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method inherits
            the line breaking knowledge from the underlying stream's
            .readline() method -- there is currently no support for
            line breaking using the codec decoder due to lack of line
            buffering. Subclasses should however, if possible, try to
            implement this method using their own knowledge of line
            breaking.

            size, if given, is passed as size argument to the stream's
            .readline() method.

        """
        if size is None:
            line = self.stream.readline()
        else:
            line = self.stream.readline(size)
        return self.decode(line, self.errors)[0]

    def readlines(self, sizehint=None):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is passed as size argument to the
            stream's .read() method.

        """
        if sizehint is None:
            data = self.stream.read()
        else:
            data = self.stream.read(sizehint)
        return self.decode(data, self.errors)[0].splitlines(1)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        pass

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Delegate to the reader's .read() method."""
        return self.reader.read(size)

    def readline(self, size=None):

        """ Delegate to the reader's .readline() method."""
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Delegate to the reader's .readlines() method."""
        return self.reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        """ Delegate to the writer's .write() method."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Delegate to the writer's .writelines() method."""
        return self.writer.writelines(list)

    def reset(self):

        """ Reset both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller as encoded
        data.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data passed
            through the frontend encoder.
        """
        data = self.reader.read(size)
        return self.encode(data, self.errors)[0]

    def readline(self, size=None):

        """ Read one line via the backend reader and return it passed
            through the frontend encoder.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        return self.encode(data, self.errors)[0]

    def readlines(self, sizehint=None):

        """ Read via the backend reader, pass the data through the
            frontend encoder and return it split into lines (line
            breaks are kept).
        """
        if sizehint is None:
            data = self.reader.read()
        else:
            data = self.reader.read(sizehint)
        return self.encode(data, self.errors)[0].splitlines(1)

    def next(self):

        """ Return the next line from the input stream, passed through
            the frontend encoder like the other read methods.
        """
        data = self.reader.next()
        # Without this step iteration would hand back the reader's
        # intermediate data while .readline() returns encoded data.
        return self.encode(data, self.errors)[0]

    def __iter__(self):
        return self

    def write(self, data):

        """ Pass data through the frontend decoder and write the
            result using the backend writer.
        """
        data = self.decode(data, self.errors)[0]
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the given strings, pass them through the
            frontend decoder and write the result using the backend
            writer.
        """
        data = ''.join(list)
        data = self.decode(data, self.errors)[0]
        return self.writer.write(data)

    def reset(self):

        """ Reset both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode when an encoding is
        given, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object is returned.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed before the error propagates.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return stream
    try:
        reader_factory, writer_factory = lookup(encoding)[2:]
        srw = StreamReaderWriter(stream, reader_factory, writer_factory,
                                 errors)
    except:
        # Don't leak the open file; the error is always re-raised.
        stream.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    encode, decode = lookup(data_encoding)[:2]
    Reader, Writer = lookup(file_encoding)[2:]
    sr = StreamRecoder(file,
                       encode, decode, Reader, Writer,
                       errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[0]

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[1]

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[2]

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[3]

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i] = i
    return res

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    m = {}
    for k, v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            # Ambiguous reverse mapping: mark the target as undefined.
            m[v] = None
    return m

### error handlers

strict_errors = lookup_error("strict")
ignore_errors = lookup_error("ignore")
replace_errors = lookup_error("replace")
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
backslashreplace_errors = lookup_error("backslashreplace")

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')