summaryrefslogtreecommitdiff
path: root/Lib/codecs.py
diff options
context:
space:
mode:
authorThomas Wouters <thomas@python.org>2006-04-21 09:43:23 +0000
committerThomas Wouters <thomas@python.org>2006-04-21 09:43:23 +0000
commita977329b6fb0e4c95cabb9043794de69b27a1099 (patch)
treeb91552a0578639bd10181ab612039c1bed9bec27 /Lib/codecs.py
parentd858f70617a9df8456e89a898ad8f97bd57c09f9 (diff)
downloadcpython-git-a977329b6fb0e4c95cabb9043794de69b27a1099.tar.gz
Merge part of the trunk changes into the p3yk branch. This merges from 43030
(branch-creation time) up to 43067. 43068 and 43069 contain a little swapping action between re.py and sre.py, and this mightily confuses svn merge, so later changes are going in separately. This merge should break no additional tests. The last-merged revision is going in a 'last_merge' property on '.' (the branch directory.) Arbitrarily chosen, really; if there's a BCP for this, I couldn't find it, but we can easily change it afterwards ;)
Diffstat (limited to 'Lib/codecs.py')
-rw-r--r--Lib/codecs.py183
1 files changed, 172 insertions, 11 deletions
diff --git a/Lib/codecs.py b/Lib/codecs.py
index 6895a228fe..28856c714e 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -73,6 +73,23 @@ BOM64_BE = BOM_UTF32_BE
### Codec base classes (defining the API)
+class CodecInfo(tuple):
+
+ def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
+ incrementalencoder=None, incrementaldecoder=None, name=None):
+ self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
+ self.name = name
+ self.encode = encode
+ self.decode = decode
+ self.incrementalencoder = incrementalencoder
+ self.incrementaldecoder = incrementaldecoder
+ self.streamwriter = streamwriter
+ self.streamreader = streamreader
+ return self
+
+ def __repr__(self):
+ return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
+
class Codec:
""" Defines the interface for stateless encoders/decoders.
@@ -137,6 +154,88 @@ class Codec:
"""
raise NotImplementedError
+class IncrementalEncoder(object):
+ """
+ A IncrementalEncoder encodes an input in multiple steps. The input can be
+ passed piece by piece to the encode() method. The IncrementalEncoder remembers
+ the state of the Encoding process between calls to encode().
+ """
+ def __init__(self, errors='strict'):
+ """
+ Creates a IncrementalEncoder instance.
+
+ The IncrementalEncoder may use different error handling schemes by
+ providing the errors keyword argument. See the module docstring
+ for a list of possible values.
+ """
+ self.errors = errors
+ self.buffer = ""
+
+ def encode(self, input, final=False):
+ """
+ Encodes input and returns the resulting object.
+ """
+ raise NotImplementedError
+
+ def reset(self):
+ """
+ Resets the encoder to the initial state.
+ """
+
+class IncrementalDecoder(object):
+ """
+ An IncrementalDecoder decodes an input in multiple steps. The input can be
+ passed piece by piece to the decode() method. The IncrementalDecoder
+ remembers the state of the decoding process between calls to decode().
+ """
+ def __init__(self, errors='strict'):
+ """
+ Creates a IncrementalDecoder instance.
+
+ The IncrementalDecoder may use different error handling schemes by
+ providing the errors keyword argument. See the module docstring
+ for a list of possible values.
+ """
+ self.errors = errors
+
+ def decode(self, input, final=False):
+ """
+ Decodes input and returns the resulting object.
+ """
+ raise NotImplementedError
+
+ def reset(self):
+ """
+ Resets the decoder to the initial state.
+ """
+
+class BufferedIncrementalDecoder(IncrementalDecoder):
+ """
+ This subclass of IncrementalDecoder can be used as the baseclass for an
+ incremental decoder if the decoder must be able to handle incomplete byte
+ sequences.
+ """
+ def __init__(self, errors='strict'):
+ IncrementalDecoder.__init__(self, errors)
+ self.buffer = "" # undecoded input that is kept between calls to decode()
+
+ def _buffer_decode(self, input, errors, final):
+ # Overwrite this method in subclasses: It must decode input
+ # and return an (output, length consumed) tuple
+ raise NotImplementedError
+
+ def decode(self, input, final=False):
+ # decode input (taking the buffer into account)
+ data = self.buffer + input
+ (result, consumed) = self._buffer_decode(data, self.errors, final)
+ # keep undecoded input until the next call
+ self.buffer = data[consumed:]
+ return result
+
+ def reset(self):
+ IncrementalDecoder.reset(self)
+ self.bytebuffer = ""
+
#
# The StreamWriter and StreamReader class provide generic working
# interfaces which can be used to implement new encoding submodules
@@ -666,8 +765,8 @@ def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
file = __builtin__.open(filename, mode, buffering)
if encoding is None:
return file
- (e, d, sr, sw) = lookup(encoding)
- srw = StreamReaderWriter(file, sr, sw, errors)
+ info = lookup(encoding)
+ srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
# Add attributes to simplify introspection
srw.encoding = encoding
return srw
@@ -699,11 +798,9 @@ def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
"""
if file_encoding is None:
file_encoding = data_encoding
- encode, decode = lookup(data_encoding)[:2]
- Reader, Writer = lookup(file_encoding)[2:]
- sr = StreamRecoder(file,
- encode, decode, Reader, Writer,
- errors)
+ info = lookup(data_encoding)
+ sr = StreamRecoder(file, info.encode, info.decode,
+ info.streamreader, info.streamwriter, errors)
# Add attributes to simplify introspection
sr.data_encoding = data_encoding
sr.file_encoding = file_encoding
@@ -719,7 +816,7 @@ def getencoder(encoding):
Raises a LookupError in case the encoding cannot be found.
"""
- return lookup(encoding)[0]
+ return lookup(encoding).encode
def getdecoder(encoding):
@@ -729,7 +826,35 @@ def getdecoder(encoding):
Raises a LookupError in case the encoding cannot be found.
"""
- return lookup(encoding)[1]
+ return lookup(encoding).decode
+
+def getincrementalencoder(encoding):
+
+ """ Lookup up the codec for the given encoding and return
+ its IncrementalEncoder class or factory function.
+
+ Raises a LookupError in case the encoding cannot be found
+ or the codecs doesn't provide an incremental encoder.
+
+ """
+ encoder = lookup(encoding).incrementalencoder
+ if encoder is None:
+ raise LookupError(encoding)
+ return encoder
+
+def getincrementaldecoder(encoding):
+
+ """ Lookup up the codec for the given encoding and return
+ its IncrementalDecoder class or factory function.
+
+ Raises a LookupError in case the encoding cannot be found
+ or the codecs doesn't provide an incremental decoder.
+
+ """
+ decoder = lookup(encoding).incrementaldecoder
+ if decoder is None:
+ raise LookupError(encoding)
+ return decoder
def getreader(encoding):
@@ -739,7 +864,7 @@ def getreader(encoding):
Raises a LookupError in case the encoding cannot be found.
"""
- return lookup(encoding)[2]
+ return lookup(encoding).streamreader
def getwriter(encoding):
@@ -749,7 +874,43 @@ def getwriter(encoding):
Raises a LookupError in case the encoding cannot be found.
"""
- return lookup(encoding)[3]
+ return lookup(encoding).streamwriter
+
+def iterencode(iterator, encoding, errors='strict', **kwargs):
+ """
+ Encoding iterator.
+
+ Encodes the input strings from the iterator using a IncrementalEncoder.
+
+ errors and kwargs are passed through to the IncrementalEncoder
+ constructor.
+ """
+ encoder = getincrementalencoder(encoding)(errors, **kwargs)
+ for input in iterator:
+ output = encoder.encode(input)
+ if output:
+ yield output
+ output = encoder.encode("", True)
+ if output:
+ yield output
+
+def iterdecode(iterator, encoding, errors='strict', **kwargs):
+ """
+ Decoding iterator.
+
+ Decodes the input strings from the iterator using a IncrementalDecoder.
+
+ errors and kwargs are passed through to the IncrementalDecoder
+ constructor.
+ """
+ decoder = getincrementaldecoder(encoding)(errors, **kwargs)
+ for input in iterator:
+ output = decoder.decode(input)
+ if output:
+ yield output
+ output = decoder.decode("", True)
+ if output:
+ yield output
### Helpers for charmap-based codecs