Source code for naz.codec

# The code in this file is copied from https://github.com/praekelt/vumi/blob/master/vumi/codecs/vumi_codecs.py
# which is in turn largely copied from http://stackoverflow.com/questions/13130935/decode-7-bit-gsm
# Vumi's license is included below:

# Copyright (c) Praekelt Foundation and individual contributors.
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:

#     1.  Redistributions of source code must retain the above copyright notice,
#         this list of conditions and the following disclaimer.

#     2.  Redistributions in binary form must reproduce the above copyright
#         notice, this list of conditions and the following disclaimer in the
#         documentation and/or other materials provided with the distribution.

#     3.  Neither the name of the Praekelt Foundation nor the names of its
#         contributors may be used to endorse or promote products derived from
#         this software without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.


import codecs
import typing


# An alternative to using this codec module is to use: https://github.com/dsch/gsm0338
# however, I'm guessing that vumi has been in use longer and we should thus go with it.


class NazCodecException(Exception):
    pass


[docs]class GSM7BitCodec(codecs.Codec): """ SMPP uses a 7-bit GSM character set. This class implements that encoding/decoding scheme. Users should never have to use this directly, instead; use `naz.protocol.SubmitSM(encoding="gsm0338")` Example Usage: .. highlight:: python .. code-block:: python import naz codec = naz.codec.GSM7BitCodec() codec.encode("foo €") """ gsm_basic_charset = ( "@£$¥èéùìòÇ\nØø\rÅåΔ_ΦΓΛΩΠΨΣΘΞ\x1bÆæßÉ !\"#¤%&'()*+,-./0123456789:;" "<=>?¡ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑÜ`¿abcdefghijklmnopqrstuvwxyzäö" "ñüà" ) gsm_basic_charset_map = dict((l, i) for i, l in enumerate(gsm_basic_charset)) gsm_extension = ( "````````````````````^```````````````````{}`````\\````````````[~]`" "|````````````````````````````````````€``````````````````````````" ) gsm_extension_map = dict((l, i) for i, l in enumerate(gsm_extension)) # All the methods have to be staticmethods because they are passed to `codecs.CodecInfo`
[docs] @staticmethod def encode(input: str, errors: str = "strict") -> typing.Tuple[bytes, int]: """ return an encoded version of the string as a bytes object and its length. Parameters: input: the string to encode errors: same meaning as the errors argument to pythons' `encode <https://docs.python.org/3/library/codecs.html#codecs.encode>`_ method """ # for the types of this method, # see: https://github.com/python/typeshed/blob/f7d240f06e5608a20b2daac4e96fe085c0577239/stdlib/2and3/codecs.pyi#L21-L22 result = [] for position, c in enumerate(input): idx = GSM7BitCodec.gsm_basic_charset_map.get(c) if idx is not None: result.append(chr(idx)) continue idx = GSM7BitCodec.gsm_extension_map.get(c) if idx is not None: result.append(chr(27) + chr(idx)) else: result.append(GSM7BitCodec._handle_encode_error(c, errors, position, input)) obj = "".join(result) # this is equivalent to; # import six; six.b('someString') # see: # https://github.com/benjaminp/six/blob/68112f3193c7d4bef5ad86ed1b6ed528edd9093d/six.py#L625 obj_bytes = obj.encode("latin-1") return (obj_bytes, len(obj_bytes))
[docs] @staticmethod def decode(input: bytes, errors: str = "strict") -> typing.Tuple[str, int]: """ return a string decoded from the given bytes and its length. Parameters: input: the bytes to decode errors: same meaning as the errors argument to pythons' `encode <https://docs.python.org/3/library/codecs.html#codecs.encode>`_ method """ res = iter(input) result = [] for position, c in enumerate(res): try: if c == 27: c = next(res) result.append(GSM7BitCodec.gsm_extension[c]) else: result.append(GSM7BitCodec.gsm_basic_charset[c]) except IndexError as indexErrorException: result.append( GSM7BitCodec._handle_decode_error( c, errors, position, input, indexErrorException ) ) obj = "".join(result) return (obj, len(obj))
@staticmethod def _handle_encode_error(char, handler_type, position, obj): handler = None if handler_type == "strict": handler = GSM7BitCodec._handle_encode_strict_error elif handler_type == "ignore": handler = GSM7BitCodec._handle_encode_ignore_error elif handler_type == "replace": handler = GSM7BitCodec._handle_encode_replace_error if handler is None: raise NazCodecException("Invalid errors type {0} for GSM7BitCodec".format(handler_type)) return handler(char, position, obj) @staticmethod def _handle_encode_strict_error(char, position, obj): raise UnicodeEncodeError("gsm0338", char, position, position + 1, repr(obj)) @staticmethod def _handle_encode_ignore_error(char, position, obj): return "" @staticmethod def _handle_encode_replace_error(char, position, obj): return chr(GSM7BitCodec.gsm_basic_charset_map.get("?")) @staticmethod def _handle_decode_error(char, handler_type, position, obj, indexErrorException): handler = None if handler_type == "strict": handler = GSM7BitCodec._handle_decode_strict_error elif handler_type == "ignore": handler = GSM7BitCodec._handle_decode_ignore_error elif handler_type == "replace": handler = GSM7BitCodec._handle_decode_replace_error if handler is None: raise NazCodecException("Invalid errors type {0} for GSM7BitCodec".format(handler_type)) return handler(char, position, obj, indexErrorException) @staticmethod def _handle_decode_strict_error(char, position, obj, indexErrorException): # https://github.com/google/pytype/issues/349 raise UnicodeDecodeError( "gsm0338", chr(char).encode("latin-1"), position, position + 1, repr(obj), ) from indexErrorException @staticmethod def _handle_decode_ignore_error(char, position, obj, indexErrorException): return "" @staticmethod def _handle_decode_replace_error(char, position, obj, indexErrorException): return "?"
[docs]class UCS2Codec(codecs.Codec): """ This class implements the UCS2 encoding/decoding scheme. Users should never have to use this directly, instead; use `naz.protocol.SubmitSM(encoding="ucs2")` UCS2 is for all intents & purposes assumed to be the same as big endian UTF16. """ # All the methods have to be staticmethods because they are passed to `codecs.CodecInfo`
[docs] @staticmethod def encode(input: str, errors: str = "strict") -> typing.Tuple[bytes, int]: """ return an encoded version of the string as a bytes object and its length. Parameters: input: the string to encode errors: same meaning as the errors argument to pythons' `encode <https://docs.python.org/3/library/codecs.html#codecs.encode>`_ method """ # https://github.com/google/pytype/issues/348 return codecs.utf_16_be_encode(input, errors)
[docs] @staticmethod def decode(input: bytes, errors: str = "strict") -> typing.Tuple[str, int]: """ return a string decoded from the given bytes and its length. Parameters: input: the bytes to decode errors: same meaning as the errors argument to pythons' `encode <https://docs.python.org/3/library/codecs.html#codecs.encode>`_ method """ return codecs.utf_16_be_decode(input, errors)
_INBUILT_CODECS: typing.Dict[str, codecs.CodecInfo] = { # pytype issue; https://github.com/google/pytype/issues/574 "ucs2": codecs.CodecInfo( name="ucs2", encode=UCS2Codec.encode, decode=UCS2Codec.decode, # pytype: disable=wrong-arg-types ), "gsm0338": codecs.CodecInfo( name="gsm0338", encode=GSM7BitCodec.encode, decode=GSM7BitCodec.decode, # pytype: disable=wrong-arg-types ), }
[docs]def register_codecs(custom_codecs: typing.Union[None, typing.Dict[str, codecs.CodecInfo]] = None): """ Register codecs, both custom and naz inbuilt ones. Custom codecs that have same encoding as inbuilt ones will take precedence. Users should never have to use this directly, instead; use `naz.Client(custom_codecs={"my_encoding": codecs.CodecInfo(name="my_encoding", encode=..., decode=...)})` Parameters: custom_codecs: a list of custom codecs to register. """ if custom_codecs is None: custom_codecs = {} # Note: Search function registration is not currently reversible, # which may cause problems in some cases, such as unit testing or module reloading. # https://docs.python.org/3.7/library/codecs.html#codecs.register # # Note: Encodings are first looked up in the registry's cache. # thus if you call `register_codecs` and then call it again with different # codecs, the second codecs may not take effect. # ie; codecs.lookup(encoding) will return the first codecs since they were stored # in the cache. # There doesn't appear to be away to clear codec cache at runtime. # see: https://docs.python.org/3/library/codecs.html#codecs.lookup def _codec_search_function(_encoding): """ We should try and get codecs from the custom_codecs first. This way, if someone had overridden an inbuilt codec, their implementation is chosen first and cached. """ if custom_codecs.get(_encoding): return custom_codecs.get(_encoding) else: return _INBUILT_CODECS.get(_encoding) codecs.register(_codec_search_function)