Python codecs.html() Examples

The following are 21 code examples of codecs.html(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module codecs, or try the search function.
Example #1
Source File: main.py    From cvt2utf with MIT License 6 votes vote down vote up
def normalize_codec_name(chardet_name):
    """
    Translate a codec name reported by chardet into Python's canonical codec name.

    :param chardet_name: codec name as reported by chardet
    :return: canonical Python codec name, with 'gb2312' widened to 'gb18030'.
        See: https://docs.python.org/3.7/library/codecs.html#standard-encodings
    :raises LookupError: if ``codecs.lookup`` knows no codec under the rewritten name
    """
    # Rewrite the name into the spelling used for lookup,
    # e.g. 'ISO-8859-1' becomes 'iso8859_1'.
    alias = chardet_name.lower()
    alias = alias.replace('iso-', 'iso')
    alias = alias.replace('-', '_')
    canonical = codecs.lookup(alias).name

    # chardet reports every GB-based encoding as 'gb2312', so decoding fails when the text
    # contains certain special characters. 'gb18030' is a larger character set than gb2312,
    # which makes decoding more tolerant of those characters.
    return 'gb18030' if canonical == 'gb2312' else canonical
Example #2
Source File: shell.py    From supersqlite with MIT License 6 votes vote down vote up
def pop_output(self):
        """Reinstates the output settings that were pushed most recently.

        Output is controlled by many parameters such as nullvalue,
        mode (list/tcl/html/insert etc), column widths, header etc.
        To change some of them temporarily call :meth:`push_output`,
        alter the settings, and then call this method to get the old
        ones back.

        For example a command like .dump can push the current output,
        switch the mode to insert so that SQL inserts are printed, and
        pop afterwards to return to whatever was in effect before.

        The bottom entry of the stack is never removed: when it is the
        only entry it is re-applied and left in place.
        """
        stack=self._output_stack
        # the bottom entry should always be present
        assert len(stack)
        saved=stack[0] if len(stack)==1 else stack.pop()
        for name,value in saved.items():
            setattr(self,name,value)
Example #3
Source File: shell.py    From supersqlite with MIT License 6 votes vote down vote up
def process_command(self, cmd):
        """Runs a single dot command.

        The text is broken into words with `shlex.split
        <http://docs.python.org/library/shlex.html#shlex.split>`__,
        which is roughly the same method used by Unix/POSIX shells.
        The first word, minus its leading dot, selects the
        ``command_<name>`` method, which is called with the list of
        remaining words.

        Raises ``self.Error`` if there is no method for the command.
        """
        if self.echo:
            self.write(self.stderr, cmd+"\n")
        if sys.version_info>=(3,0):
            words=shlex.split(cmd)
        else:
            # shlex is broken with unicode on Python 2, so split the
            # utf8 bytes and decode each word afterwards
            words=[w.decode("utf8") for w in shlex.split(cmd.encode("utf8"))]
        assert words[0][0]=="."
        name=words[0][1:]
        handler=getattr(self, "command_"+name, None)
        if not handler:
            raise self.Error("Unknown command \"%s\".  Enter \".help\" for help" % (name,))
        handler(words[1:])

    ###
    ### Commands start here
    ### 
Example #4
Source File: surrogateescape.py    From caterpillar with Apache License 2.0 6 votes vote down vote up
def error_handler(error):
  """Error handler for surrogateescape decoding.

  Should be used with an ASCII-compatible encoding (e.g., 'latin-1' or 'utf-8').
  Replaces any invalid byte sequences with surrogate code points.

  As specified in
  https://docs.python.org/2/library/codecs.html#codecs.register_error.

  Works on both Python 2 and Python 3.

  Args:
    error: The exception instance passed in by the codec machinery.

  Returns:
    A (replacement, resume_position) tuple: each byte of the offending range
    mapped to the code point 0xdc00 + byte, and error.end.

  Raises:
    The given error itself, when it is not a UnicodeDecodeError or when the
    offending range contains a byte below 128.
  """
  # We can't use this with UnicodeEncodeError; the UTF-8 encoder doesn't raise
  # an error for surrogates. Instead, use encode.
  if not isinstance(error, UnicodeDecodeError):
    raise error

  # unichr only exists on Python 2; on Python 3 chr does the same job.
  try:
    to_char = unichr
  except NameError:
    to_char = chr

  result = []
  # Indexing a byte string gives a 1-char str on Python 2 but an int on
  # Python 3; a bytearray gives ints on both.
  for byte in bytearray(error.object[error.start:error.end]):
    if byte < 128:
      raise error
    result.append(to_char(0xdc00 + byte))

  return ''.join(result), error.end
Example #5
Source File: shell.py    From magnitude with MIT License 6 votes vote down vote up
def pop_output(self):
        """Puts back the output settings saved by the latest push.

        Many parameters control output, such as nullvalue, mode
        (list/tcl/html/insert etc), column widths, header etc.  When
        some of them need changing for a short while, call
        :meth:`push_output` first, change them, and call this method
        to restore the previous values.

        A command like .dump is a simple example: it pushes the
        current output, sets the mode to insert so SQL inserts are
        printed, and pops to return to the earlier settings.

        When only one entry is on the stack it is applied but kept,
        so the stack never becomes empty.
        """
        # there should always be a first item
        assert len(self._output_stack)
        if len(self._output_stack)!=1:
            settings=self._output_stack.pop()
        else:
            settings=self._output_stack[0]
        for attr,val in settings.items():
            setattr(self,attr,val)
Example #6
Source File: shell.py    From magnitude with MIT License 6 votes vote down vote up
def process_command(self, cmd):
        """Dispatches a dot command to its handler method.

        The command line is split into parts by `shlex.split
        <http://docs.python.org/library/shlex.html#shlex.split>`__,
        which is roughly the same method used by Unix/POSIX shells.
        The leading dot is removed from the first part and the result
        names the ``command_<name>`` method to call; that method
        receives the remaining parts as a list.

        Raises ``self.Error`` when no such method exists.
        """
        if self.echo:
            self.write(self.stderr, cmd+"\n")
        # shlex is broken with unicode on Python 2!!!  Feed it utf8
        # bytes there and decode the pieces it hands back.
        on_python2=sys.version_info<(3,0)
        if on_python2:
            parts=[piece.decode("utf8") for piece in shlex.split(cmd.encode("utf8"))]
        else:
            parts=shlex.split(cmd)
        assert parts[0][0]=="."
        command,args=parts[0][1:],parts[1:]
        fn=getattr(self, "command_"+command, None)
        if not fn:
            raise self.Error("Unknown command \"%s\".  Enter \".help\" for help" % (command,))
        fn(args)

    ###
    ### Commands start here
    ### 
Example #7
Source File: unit_tests.py    From roberteldersoftwarediff with Apache License 2.0 6 votes vote down vote up
def get_special_case_params():
    """Return one randomly selected argument list from a fixed set of special cases.

    Each case is a list of command line arguments: two file paths, optionally
    followed by flags and their values.
    """
    #  The windows and unix specific tests should be tested on both unix and Windows to detect crashes.
    cases = [
        [u"noexist", u"noexist"],
        [u"tests/ascii/ex1", u"noexist"],
        [u"noexist", u"tests/ascii/ex1"],
        [u"tests/ascii/ex1", u"tests/ascii/ex1", "--outfile", "/dev/null"],
        [u"tests/ascii/ex1", u"tests/ascii/ex2"],
        [u"tests/utf_8/ex3", u"tests/utf_8/ex4"],
        [
            u"tests/utf_8/ex3", u"tests/utf_8/ex4",
            u"--oldfile-encoding", u"\"utf-8\"",
            u"--newfile-encoding", u"\"utf-8\"",
        ],
        [
            u"tests/utf_8/ex3", u"tests/utf_8/ex4",
            u"--oldfile-encoding", u"\"utf-8\"",
            u"--newfile-encoding", u"\"utf-8\"",
            u"--output-encoding", u"\"utf-8\"",
        ],
        [u"tests/ascii/ex5", u"tests/ascii/ex6"],
        [u"tests/ascii/ex7", u"tests/ascii/ex8"],
        [u"tests/ascii/a.json", u"tests/ascii/b.json"],
        [
            u"tests/ascii/a.json", u"tests/ascii/b.json",
            u"--push-delimiters", u"\"{\"", u"\"[\"",
            u"--pop-delimiters", u"\"}\"", u"\"]\"",
            u"--include-delimiters",
        ],
        [
            u"tests/utf_8/fancy1", u"tests/utf_8/fancy2",
            u"--delimiters", u"日本国",
            u"--include-delimiters",
            u"--parameters-encoding", u"\"utf-8\"",
            u"--output-encoding", u"\"utf-8\"",
            u"--newfile-encoding", u"\"utf-8\"",
            u"--oldfile-encoding", u"\"utf-8\"",
        ],
        [
            u"tests/utf_8/fancy1", u"tests/utf_8/fancy2",
            u"--delimiters", u"\"\\u65e5\\u672c\\u56fd\"",
            u"--include-delimiters",
            u"--parameters-encoding", u"\"utf-8\"",
            u"--output-encoding", u"\"utf-8\"",
            u"--newfile-encoding", u"\"utf-8\"",
            u"--oldfile-encoding", u"\"utf-8\"",
        ],
        [
            u"tests/utf_8/this-is-encoded-in-utf-8", u"tests/utf_16/this-is-encoded-in-utf-16",
            u"--output-encoding", u"\"utf-8\"",
            u"--newfile-encoding", u"\"utf-16\"",
            u"--oldfile-encoding", u"\"utf-8\"",
            u"--enable-mark",
        ],
        [u"tests/ascii/a.html", u"tests/ascii/b.html", u"-m", u"html"],
    ]
    # Draw an index uniformly from 0 .. len(cases) - 1.
    chosen = random.randrange(len(cases))
    return cases[chosen]
Example #8
Source File: codec.py    From naz with MIT License 5 votes vote down vote up
def decode(input: bytes, errors: str = "strict") -> typing.Tuple[str, int]:
        """
        decode big-endian UTF-16 bytes and return the text together with the number of bytes consumed.

        Parameters:
            input: the bytes to decode
            errors: same meaning as the errors argument to pythons' `encode <https://docs.python.org/3/library/codecs.html#codecs.encode>`_ method
        """
        decoded = codecs.utf_16_be_decode(input, errors)
        return decoded
Example #9
Source File: codec.py    From naz with MIT License 5 votes vote down vote up
def register_codecs(custom_codecs: typing.Union[None, typing.Dict[str, codecs.CodecInfo]] = None):
    """
    Register a codec search function covering both custom codecs and naz inbuilt ones.
    A custom codec with the same encoding name as an inbuilt one takes precedence.
    Users should never have to use this directly,
    instead; use `naz.Client(custom_codecs={"my_encoding": codecs.CodecInfo(name="my_encoding", encode=..., decode=...)})`

    Parameters:
        custom_codecs: a mapping of encoding name to the custom codec to register for it.
    """
    overrides = {} if custom_codecs is None else custom_codecs

    # Note: Search function registration is not currently reversible,
    # which may cause problems in some cases, such as unit testing or module reloading.
    # https://docs.python.org/3.7/library/codecs.html#codecs.register
    #
    # Note: Encodings are first looked up in the registry's cache.
    # So if `register_codecs` is called a second time with different codecs,
    # the second set may not take effect; ie, codecs.lookup(encoding) will keep
    # returning the first codecs because they were stored in the cache.
    # There doesn't appear to be a way to clear the codec cache at runtime.
    # see: https://docs.python.org/3/library/codecs.html#codecs.lookup

    def _codec_search_function(_encoding):
        """
        Consult the custom codecs before the inbuilt ones.
        That way an overridden inbuilt codec is the implementation
        that gets chosen first and cached.
        """
        custom = overrides.get(_encoding)
        if custom:
            return custom
        return _INBUILT_CODECS.get(_encoding)

    codecs.register(_codec_search_function)
Example #10
Source File: shell.py    From magnitude with MIT License 5 votes vote down vote up
def usage(self):
        """Returns the command line usage text.

        The text must stay newline terminated.
        """
        usage_text="""
Usage: program [OPTIONS] FILENAME [SQL|CMD] [SQL|CMD]...
FILENAME is the name of a SQLite database. A new database is
created if the file does not exist.
OPTIONS include:
   -init filename       read/process named file
   -echo                print commands before execution
   -[no]header          turn headers on or off
   -bail                stop after hitting an error
   -interactive         force interactive I/O
   -batch               force batch I/O
   -column              set output mode to 'column'
   -csv                 set output mode to 'csv'
   -html                set output mode to 'html'
   -line                set output mode to 'line'
   -list                set output mode to 'list'
   -python              set output mode to 'python'
   -separator 'x'       set output field separator (|)
   -nullvalue 'text'    set text string for NULL values
   -version             show SQLite version
   -encoding 'name'     the encoding to use for files
                        opened via .import, .read & .output
   -nocolour            disables colour output to screen
"""
        # the literal starts with a newline for layout only; strip it
        return usage_text.lstrip()

    ###
    ### Value formatting routines.  They take a value and return a
    ### text formatting of them.  Mostly used by the various output's
    ### but also by random other pieces of code.
    ### 
Example #11
Source File: codec.py    From naz with MIT License 5 votes vote down vote up
def encode(input: str, errors: str = "strict") -> typing.Tuple[bytes, int]:
        """
        encode the string as big-endian UTF-16 and return the bytes together with the length of the input consumed.

        Parameters:
            input: the string to encode
            errors: same meaning as the errors argument to pythons' `encode <https://docs.python.org/3/library/codecs.html#codecs.encode>`_ method
        """
        # see: https://github.com/google/pytype/issues/348
        encoded = codecs.utf_16_be_encode(input, errors)
        return encoded
Example #12
Source File: codec.py    From naz with MIT License 5 votes vote down vote up
def decode(input: bytes, errors: str = "strict") -> typing.Tuple[str, int]:
        """
        decode the given bytes using the GSM7BitCodec tables; return the text and the length of that text.

        A byte equal to 27 is an escape: the byte that follows it is looked up in
        `GSM7BitCodec.gsm_extension`. Every other byte is looked up in
        `GSM7BitCodec.gsm_basic_charset`. A lookup that raises IndexError is
        passed to `GSM7BitCodec._handle_decode_error` and whatever that returns is
        used in place of the character.

        Parameters:
            input: the bytes to decode
            errors: same meaning as the errors argument to pythons' `encode <https://docs.python.org/3/library/codecs.html#codecs.encode>`_ method
        """
        byte_stream = iter(input)
        pieces = []
        # enumerate() and next() below share one iterator, so an escape
        # sequence uses up two bytes in a single pass of the loop.
        # NOTE(review): `position` therefore counts loop passes, not byte
        # offsets, once an escape has been seen - confirm the error handler
        # tolerates that.
        # NOTE(review): an escape byte at the very end makes next() raise
        # StopIteration, which the IndexError handler does not catch.
        for position, c in enumerate(byte_stream):
            try:
                if c != 27:
                    pieces.append(GSM7BitCodec.gsm_basic_charset[c])
                else:
                    c = next(byte_stream)
                    pieces.append(GSM7BitCodec.gsm_extension[c])
            except IndexError as indexErrorException:
                replacement = GSM7BitCodec._handle_decode_error(
                    c, errors, position, input, indexErrorException
                )
                pieces.append(replacement)

        text = "".join(pieces)
        return (text, len(text))
Example #13
Source File: _util.py    From shadowsocks with Apache License 2.0 5 votes vote down vote up
def find_encodings(enc=None, system=False):
    """Find functions for encoding translations for a specific codec.

    :param str enc: The codec to find translation functions for. It will be
                    normalized by converting to lowercase and passing it
                    through :func:`encodings.normalize_encoding`. Defaults
                    to utf-8 when empty or ``None``.

    :param bool system: If True, ignore ``enc`` and use the encoding of the
                        system's stdin, falling back to ascii when stdin
                        reports no encoding.

    :returns: The result of :func:`codecs.lookup` for the normalized name.

    :raises: :exc:LookupError if the normalized codec, ``enc``, cannot be
             found in Python's encoding translation map.
    """
    if not enc:
        enc = 'utf-8'

    if system:
        # Only trust stdin when it actually reports an encoding. The check
        # used to be inverted, which picked up a missing encoding (None) and
        # then crashed on enc.lower() below.
        stdin_encoding = getattr(sys.stdin, 'encoding', None)
        if stdin_encoding is not None:
            enc = stdin_encoding
            log.debug("Obtained encoding from stdin: %s" % enc)
        else:
            enc = 'ascii'

    ## have to have lowercase to work, see
    ## http://docs.python.org/dev/library/codecs.html#standard-encodings
    enc = enc.lower()
    codec_alias = encodings.normalize_encoding(enc)

    # encodings.search_function is registered when the encodings package is
    # imported, so it is not registered again here; doing so on every call
    # only kept adding duplicates to the codec search path.
    return codecs.lookup(codec_alias)
Example #14
Source File: _util.py    From shadowsocks with Apache License 2.0 5 votes vote down vote up
def b(x):
        """Encode x with the codec returned by find_encodings().

        A bytes argument is first decoded using that codec's name and
        then encoded again; anything else is encoded directly.

        See http://python3porting.com/problems.html#nicer-solutions
        """
        coder = find_encodings()
        text = x.decode(coder.name) if isinstance(x, bytes) else x
        encoded = coder.encode(text)
        # coder.encode returns a tuple; only its first item is handed back
        return encoded[0]
Example #15
Source File: _util.py    From shadowsocks with Apache License 2.0 5 votes vote down vote up
def b(x):
        """Return x unchanged.

        See http://python3porting.com/problems.html#nicer-solutions
        """
        return x
Example #16
Source File: codec.py    From naz with MIT License 5 votes vote down vote up
def encode(input: str, errors: str = "strict") -> typing.Tuple[bytes, int]:
        """
        encode the string using the GSM7BitCodec tables; return the bytes object and the length of those bytes.

        A character found in `GSM7BitCodec.gsm_basic_charset_map` becomes the single
        byte it maps to. A character found only in `GSM7BitCodec.gsm_extension_map`
        becomes byte 27 followed by the byte it maps to. Any other character is
        passed to `GSM7BitCodec._handle_encode_error` and whatever that returns
        is used instead.

        Parameters:
            input: the string to encode
            errors: same meaning as the errors argument to pythons' `encode <https://docs.python.org/3/library/codecs.html#codecs.encode>`_ method
        """
        # for the types of this method,
        # see: https://github.com/python/typeshed/blob/f7d240f06e5608a20b2daac4e96fe085c0577239/stdlib/2and3/codecs.pyi#L21-L22
        pieces = []
        for position, c in enumerate(input):
            basic = GSM7BitCodec.gsm_basic_charset_map.get(c)
            if basic is not None:
                pieces.append(chr(basic))
            else:
                extended = GSM7BitCodec.gsm_extension_map.get(c)
                if extended is None:
                    pieces.append(GSM7BitCodec._handle_encode_error(c, errors, position, input))
                else:
                    pieces.append(chr(27) + chr(extended))

        # Turning the text into bytes through latin-1 is equivalent to;
        # import six; six.b('someString')
        # see:
        # https://github.com/benjaminp/six/blob/68112f3193c7d4bef5ad86ed1b6ed528edd9093d/six.py#L625
        encoded = "".join(pieces).encode("latin-1")
        return (encoded, len(encoded))
Example #17
Source File: shell.py    From supersqlite with MIT License 5 votes vote down vote up
def usage(self):
        """Gives the usage message shown for the command line.

        Keep the message newline terminated.
        """
        # the leading newline of the literal is only there for layout
        # and is removed by lstrip
        return """
Usage: program [OPTIONS] FILENAME [SQL|CMD] [SQL|CMD]...
FILENAME is the name of a SQLite database. A new database is
created if the file does not exist.
OPTIONS include:
   -init filename       read/process named file
   -echo                print commands before execution
   -[no]header          turn headers on or off
   -bail                stop after hitting an error
   -interactive         force interactive I/O
   -batch               force batch I/O
   -column              set output mode to 'column'
   -csv                 set output mode to 'csv'
   -html                set output mode to 'html'
   -line                set output mode to 'line'
   -list                set output mode to 'list'
   -python              set output mode to 'python'
   -separator 'x'       set output field separator (|)
   -nullvalue 'text'    set text string for NULL values
   -version             show SQLite version
   -encoding 'name'     the encoding to use for files
                        opened via .import, .read & .output
   -nocolour            disables colour output to screen
""".lstrip()

    ###
    ### Value formatting routines.  They take a value and return a
    ### text formatting of them.  Mostly used by the various output's
    ### but also by random other pieces of code.
    ### 
Example #18
Source File: jproperties.py    From community-edition-setup with MIT License 5 votes vote down vote up
def _jbackslashreplace_error_handler(err):
    """
    Encoding error handler which replaces invalid characters with Java-compliant Unicode escape sequences.

    Anything other than a `:exc:UnicodeEncodeError` is raised back to the caller.

    :param err: An `:exc:UnicodeEncodeError` instance.
    :return: A tuple of the escaped replacement for the offending slice and ``err.end``.
        See https://docs.python.org/2/library/codecs.html?highlight=codecs#codecs.register_error
    """
    if isinstance(err, UnicodeEncodeError):
        offending = err.object[err.start:err.end]
        return _escape_non_ascii(offending), err.end

    raise err
Example #19
Source File: formats.py    From fuel with MIT License 5 votes vote down vote up
def open_(filename, mode='r', encoding=None):
    """Open a text file with encoding and optional gzip compression.

    Note that on legacy Python any encoding other than ``None`` or opening
    GZipped files will return an unpicklable file-like object.

    Files whose name ends in ``.gz`` are opened through :mod:`gzip` and
    wrapped in a buffered reader. Without an encoding that reader is
    returned and yields bytes; with an encoding it is wrapped in the
    matching :mod:`codecs` stream reader, which yields text.

    Parameters
    ----------
    filename : str
        The filename to read.
    mode : str, optional
        The mode with which to open the file. Defaults to `r`.
    encoding : str, optional
        The encoding to use (see the codecs documentation_ for supported
        values). Defaults to ``None``.

    .. _documentation:
    https://docs.python.org/3/library/codecs.html#standard-encodings

    """
    if filename.endswith('.gz'):
        # gzip.open() only accepts `encoding` in text mode and raises
        # ValueError for it otherwise, so on Python 3 every gzipped file
        # opened with an encoding used to fail. Decoding through a codecs
        # stream reader works the same way on Python 2 and Python 3.
        zf = io.BufferedReader(gzip.open(filename, mode))
        if encoding:
            return codecs.getreader(encoding)(zf)
        return zf
    if six.PY2:
        if encoding:
            return codecs.open(filename, mode, encoding=encoding)
        return open(filename, mode)
    return open(filename, mode, encoding=encoding)
Example #20
Source File: codecs.py    From Emoji-Tools with GNU General Public License v3.0 5 votes vote down vote up
def encode(self, input, errors='strict'):
		"""Encode input with self.base_encoding, using self.error for the rest.

		Returns a tuple of the encoded bytes and the length of the
		original input.  Only errors='strict' is accepted.
		"""
		assert errors == 'strict'

		# A plain
		#   codecs.encode(input, self.base_encoding, self.name), len(input)
		# could have been all we needed, leaving it to the error handler to
		# swap the unencodable Unicode characters for our extended byte
		# sequences.
		#
		# However, there seems to be a design bug in Python (probably
		# intentional): an encoding error handler is supposed to return a
		# **Unicode** character, which then has to be encodable itself...  Ugh.
		#
		# So this does what codecs.encode() should have been doing, which is
		# to expect the error handler to return bytes() that are added to the
		# output.
		#
		# This seems to have been fixed in Python 3.3.  We should try using
		# that and use this fallback only if that failed.
		# https://docs.python.org/3.3/library/codecs.html#codecs.register_error

		total = len(input)
		encoded = b''
		remaining = input
		while remaining:
			try:
				chunk = codecs.encode(remaining, self.base_encoding)
			except UnicodeEncodeError as exc:
				# Keep the part in front of the failure, then let the
				# handler supply the bytes for the failing piece and
				# say where to carry on from.
				encoded += codecs.encode(remaining[:exc.start], self.base_encoding)
				substitute, resume_at = self.error(exc)
				encoded += substitute
				remaining = remaining[resume_at:]
			else:
				encoded += chunk
				remaining = '' # Everything has been converted
		return encoded, total
Example #21
Source File: basic.py    From EasY_HaCk with Apache License 2.0 4 votes vote down vote up
def decodePage(page, contentEncoding, contentType):
    """
    Decode compressed/charset HTTP response
    """

    if not page or (conf.nullConnection and len(page) < 2):
        return getUnicode(page)

    if isinstance(contentEncoding, basestring) and contentEncoding:
        contentEncoding = contentEncoding.lower()
    else:
        contentEncoding = ""

    if isinstance(contentType, basestring) and contentType:
        contentType = contentType.lower()
    else:
        contentType = ""

    if contentEncoding in ("gzip", "x-gzip", "deflate"):
        if not kb.pageCompress:
            return None

        try:
            if contentEncoding == "deflate":
                data = StringIO.StringIO(zlib.decompress(page, -15))  # Reference: http://stackoverflow.com/questions/1089662/python-inflate-and-deflate-implementations
            else:
                data = gzip.GzipFile("", "rb", 9, StringIO.StringIO(page))
                size = struct.unpack("<l", page[-4:])[0]  # Reference: http://pydoc.org/get.cgi/usr/local/lib/python2.5/gzip.py
                if size > MAX_CONNECTION_TOTAL_SIZE:
                    raise Exception("size too large")

            page = data.read()
        except Exception, msg:
            if "<html" not in page:  # in some cases, invalid "Content-Encoding" appears for plain HTML (should be ignored)
                errMsg = "detected invalid data for declared content "
                errMsg += "encoding '%s' ('%s')" % (contentEncoding, msg)
                singleTimeLogMessage(errMsg, logging.ERROR)

                warnMsg = "turning off page compression"
                singleTimeWarnMessage(warnMsg)

                kb.pageCompress = False
                raise SqlmapCompressionException