| """ codecs -- Python Codec Registry, API and helpers. |
| |
| |
| Written by Marc-Andre Lemburg ([email protected]). |
| |
| (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. |
| |
| """#" |
| |
| import __builtin__, sys |
| |
| ### Registry and builtin stateless codec functions |
| |
try:
    # Pull the registry functions and the stateless builtin codec
    # functions implemented by the _codecs extension module into this
    # namespace.
    from _codecs import *
except ImportError as why:
    # "except ... as ..." is accepted from Python 2.6 on, unlike the older
    # comma form.  Without _codecs this module cannot work, so fail loudly.
    raise SystemError('Failed to load the builtin codecs: %s' % why)
| |
# Names exported by "from codecs import *".  Besides the registry functions
# and BOM constants this lists the codec base classes, the lookup helpers
# and every predefined error handler bound at the end of this module
# (including backslashreplace_errors).
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
| |
| ### Constants |
| |
| # |
| # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) |
| # and its possible byte string values |
| # for UTF8/UTF16/UTF32 output and little/big endian machines |
| # |
| |
| # UTF-8 |
# UTF-8 encoded BOM
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16 BOMs for both byte orders; BOM_LE / BOM_BE are short aliases
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32 BOMs for both byte orders
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Native-endian variants, selected from the byte order of this machine
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE

# BOM is the native-endian UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code): the "32" names hold the
# UTF-16 marks and the "64" names hold the UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
| |
| |
| ### Codec base classes (defining the API) |
| |
class CodecInfo(tuple):

    """ Codec description record.

        Behaves as the 4-tuple (encode, decode, streamreader,
        streamwriter) and additionally exposes every entry point,
        the incremental codec factories and the codec name as
        attributes.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only the four classic entry points are part of the tuple value;
        # everything else is reachable by attribute only.
        entries = (encode, decode, streamreader, streamwriter)
        info = tuple.__new__(cls, entries)
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
| |
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme.  Predefined values:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling; the default is
            'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to encode efficiently should be
            built on StreamWriter instead.

            Zero length input must be accepted and must produce an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot.  Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling; the default is
            'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to decode efficiently should be
            built on StreamReader instead.

            Zero length input must be accepted and must produce an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError
| |
class IncrementalEncoder(object):
    """
    Base class for encoders that process their input in several steps.

    The input is handed piece by piece to encode(); the encoder keeps
    whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme and is stored on the
        instance; see the Codec class docstring for the predefined values.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be overridden; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation does nothing.
        """
| |
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always consume
    all of their input: whatever _buffer_encode() leaves unconsumed is
    kept in self.buffer and prepended to the input of the next encode()
    call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input handed to encode() that has not been encoded yet
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        # leftovers from the previous call come first
        pending = self.buffer + input
        encoded, used = self._buffer_encode(pending, self.errors, final)
        # remember what the hook did not consume
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        if self.buffer:
            return self.buffer
        return 0

    def setstate(self, state):
        # any false state (such as the 0 from getstate()) means "empty"
        if state:
            self.buffer = state
        else:
            self.buffer = ""
| |
class IncrementalDecoder(object):
    """
    Base class for decoders that process their input in several steps.

    The input is handed piece by piece to decode(); the decoder keeps
    whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme and is stored on the
        instance; see the Codec class docstring for the predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be overridden; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object holding the bytes that were
        passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input having
        been processed.  In the initial state and after reset() the
        result must be (b"", 0), which is what this base implementation
        always returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  Calling
        setstate((b"", 0)) must have the same effect as reset().
        The base implementation does nothing.
        """
| |
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences: whatever _buffer_decode() leaves unconsumed
    is kept in self.buffer and prepended to the input of the next decode()
    call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input handed to decode() that has not been decoded yet
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        # leftovers from the previous call come first
        pending = self.buffer + input
        decoded, used = self._buffer_decode(pending, self.errors, final)
        # remember what the hook did not consume
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # the additional state info slot is always 0 for this class
        buffered = self.buffer
        return (buffered, 0)

    def setstate(self, state):
        # only the buffered input is restored; state[1] is ignored
        buffered = state[0]
        self.buffer = buffered
| |
| # |
| # The StreamWriter and StreamReader class provide generic working |
| # interfaces which can be used to implement new encoding submodules |
| # very easily. See encodings/utf_8.py for an example on how this is |
| # done. |
| # |
| |
class StreamWriter(Codec):

    """ Generic writer that encodes objects with self.encode() and
        writes the result to an underlying stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used by
            write().  Predefined values:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() and write the encoded
            data to self.stream.
        """
        # the consumed length reported by the encoder is not needed here
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and pass the result to .write()
            in a single call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream; the writer is reset only
            when seeking to the absolute start (offset 0, whence 0).
        """
        self.stream.seek(offset, whence)
        rewound = (whence == 0 and offset == 0)
        if rewound:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the writer to the
            underlying stream.
        """
        target = self.stream
        return getattr(target, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
| |
| ### |
| |
class StreamReader(Codec):

    """ Generic reader that reads data from an underlying stream and
        decodes it with self.decode().

        Three buffers are kept between calls: bytebuffer (bytes read
        but not decoded yet), charbuffer (decoded but not yet returned
        characters) and linebuffer (lines already split off by
        readline(); while it is in use charbuffer is None).

    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        """ Decode input; must be overridden by subclasses and return
            an (output object, length consumed) tuple.
        """
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # Can the request be satisfied from the character buffer?
            # With neither chars nor size given the caller wants everything
            # that is left, so buffered characters alone (e.g. the tail
            # kept by an earlier readline()) never satisfy the request;
            # the stream is read until it reports no more data.
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry with only the bytes in front of the error and
                    # accept the result if it holds more than one line.
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method; in that case read() is tried only once.

            If keepends is false the line terminator is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # no complete line yet: read bigger chunks, up to a limit
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries unless
            keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        # Same empty value as in __init__: a plain "" stays a str for
        # str->str codecs and is promoted by the first unicode read,
        # whereas u"" would force every later result to unicode.
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration when readline() returns an empty
            result.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
| |
| ### |
| |
class StreamReaderWriter:

    """ Wrapper for streams that are used for both reading and
        writing.

        Read calls are forwarded to a reader object and write calls
        to a writer object, both created on the same stream.  The
        factory functions returned by the codec.lookup() function
        can be used to construct the instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Create a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.;
            each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        # the reader is created before the writer
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Forward to the reader's read(). """
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        """ Forward to the reader's readline(). """
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        """ Forward to the reader's readlines(). """
        reader = self.reader
        return reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        """ Forward to the writer's write(). """
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        """ Forward to the writer's writelines(). """
        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        """ Reset the reader, then the writer. """
        for codec in (self.reader, self.writer):
            codec.reset()

    def seek(self, offset, whence=0):

        """ Reposition the stream.  The reader is always reset; the
            writer only when seeking to the absolute start
            (offset 0, whence 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        rewound = (whence == 0 and offset == 0)
        if rewound:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the wrapper to the
            underlying stream.
        """
        target = self.stream
        return getattr(target, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
| |
| ### |
| |
class StreamRecoder:

    """ Frontend/backend view of encoding data.

        StreamRecoder instances use the complete set of APIs returned
        by the codecs.lookup() function.

        Data written to the recoder is first passed through the
        frontend decode function into an intermediate format (which
        depends on the given codec combination) and then written to
        the stream by an instance of the provided Writer class.

        In the other direction data is read from the stream through
        a Reader instance and passed through the frontend encode
        function before it is returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Create a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.; each factory
            is called as factory(stream, errors).

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        self.encode = encode
        self.decode = decode
        # the reader is created before the writer
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Read through the backend reader and return the data
            re-encoded with the frontend encode function.
        """
        decoded = self.reader.read(size)
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded

    def readline(self, size=None):

        """ Read one line through the backend reader and return it
            re-encoded; size is passed on only when it was given.
        """
        if size is not None:
            decoded = self.reader.readline(size)
        else:
            decoded = self.reader.readline()
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded

    def readlines(self, sizehint=None):

        """ Read everything through the backend reader, re-encode it
            and return it split into lines with the line ends kept.
            sizehint is not used.
        """
        decoded = self.reader.read()
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded.splitlines(True)

    def next(self):

        """ Return the next line from the backend reader, re-encoded
            with the frontend encode function.
        """
        decoded = self.reader.next()
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded

    def __iter__(self):
        return self

    def write(self, data):

        """ Pass data through the frontend decode function and write
            the result with the backend writer.
        """
        decoded, _consumed = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):

        """ Join the strings in list, pass them through the frontend
            decode function and write the result with one call to the
            backend writer's write().
        """
        joined = ''.join(list)
        decoded, _consumed = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):

        """ Reset the reader, then the writer. """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the recoder to the
            underlying stream.
        """
        target = self.stream
        return getattr(target, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
| |
| ### Shortcuts |
| |
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given the file is always opened in binary
        mode, even if no binary mode was specified, and a 'U' in mode
        is removed. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped and mode is passed on
        unchanged.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed before the exception is
        propagated.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
    except:
        # Deliberately catches everything: whatever went wrong, the file
        # handle opened above must not leak.  The exception is re-raised
        # unchanged.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
| |
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapper carries two extra attributes,
        .data_encoding and .file_encoding, which reflect the given
        parameters of the same name and can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # frontend codec (caller side) and backend codec (file side)
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(file,
                            frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
| |
| ### Helpers for codec lookup |
| |
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
| |
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
| |
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # the codec exists but has no incremental encoder
    raise LookupError(encoding)
| |
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # the codec exists but has no incremental decoder
    raise LookupError(encoding)
| |
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
| |
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
| |
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds each string produced by iterator to an IncrementalEncoder for
    encoding and yields every non-empty result.  After the input is
    exhausted the encoder is called one last time with final=True and
    that result is yielded too if it is non-empty.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    factory = getincrementalencoder(encoding)
    encoder = factory(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # flush whatever the encoder still holds back
    tail = encoder.encode("", True)
    if tail:
        yield tail
| |
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds each string produced by iterator to an IncrementalDecoder for
    encoding and yields every non-empty result.  After the input is
    exhausted the decoder is called one last time with final=True and
    that result is yielded too if it is non-empty.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    factory = getincrementaldecoder(encoding)
    decoder = factory(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # flush whatever the decoder still holds back
    tail = decoder.decode("", True)
    if tail:
        yield tail
| |
| ### Helpers for charmap-based codecs |
| |
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is a key mapped to itself.

    """
    return dict((item, item) for item in rng)
| |
def make_encoding_map(decoding_map):

    """ Create an encoding map by inverting a decoding map.

        A target that occurs more than once in the decoding map
        cannot be inverted unambiguously; it is mapped to None
        (undefined mapping), causing an exception when encountered
        by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \u001a.

    """
    encoding_map = {}
    for code, target in decoding_map.items():
        if target in encoding_map:
            # seen before: mark the target as ambiguous
            encoding_map[target] = None
        else:
            encoding_map[target] = code
    return encoding_map
| |
| ### error handlers |
| |
try:
    # Bind the predefined error handling callbacks to module-level names.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = map(lookup_error, (
        "strict",
        "ignore",
        "replace",
        "xmlcharrefreplace",
        "backslashreplace"))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # every name is then bound to None.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
| |
# Tell modulefinder that using codecs probably needs the encodings
# package.  _false is 0, so the import below is never executed; the
# statement is only there to be present in the source.
_false = 0
if _false:
    import encodings
| |
| ### Tests |
| |
if __name__ == '__main__':

    # Only runs when this file is executed as a script: both standard
    # streams are replaced by EncodedFile wrappers.

    # Make stdout translate Latin-1 output into UTF-8 output
    # (data_encoding 'latin-1', file_encoding 'utf-8')
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    # (data_encoding 'utf-8', file_encoding 'latin-1')
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')