Python remove non ascii

15 Python code examples were found related to "remove non ascii". You can vote up the ones you like or vote down the ones you don't, and go to the original project or source file by following the links above each example.

Example 1

Source File: document_preprocessing.py From textlytics with MIT License

6 votes

def remove_non_ascii_chars(self, doc):
        """
        Remove non-ASCII characters.

        Parameters
        ----------
        doc : str
            Document that will be cleaned; every character whose code
            point is 128 or above is dropped.

        Returns
        ----------
        doc : str
            New string containing only the ASCII characters of the input,
            in their original order.
        """
        # Strings are immutable, so item assignment (doc[i] = "") cannot work;
        # build a fresh string and return it so the caller receives the result.
        return "".join(char for char in doc if ord(char) < 128)

Example 2

Source File: cnn_dm_downloader.py From TransferRL with MIT License

5 votes

def remove_non_ascii(text):
    """Return ``text`` after passing it through ``unidecode``.

    The input is first decoded as UTF-8 with ``unicode``. If that step (or
    the ``unidecode`` call on its result) raises, ``str(text)`` is handed to
    ``unidecode`` instead.

    :param text: value to convert
    :return: whatever ``unidecode`` returns for the converted input
    """
    try:
        return unidecode(unicode(text, encoding="utf-8"))
    except Exception:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not silently swallowed.
        return unidecode(str(text))

Example 3

Source File: cnn_dm_downloader.py From RLSeq2Seq with MIT License

5 votes

def remove_non_ascii(text):
    """Return ``text`` after passing it through ``unidecode``.

    The input is first decoded as UTF-8 with ``unicode``. If that step (or
    the ``unidecode`` call on its result) raises, ``text`` itself is handed
    to ``unidecode`` unchanged.

    :param text: value to convert
    :return: whatever ``unidecode`` returns for the converted input
    """
    try:
        return unidecode(unicode(text, encoding="utf-8"))
    except Exception:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not silently swallowed.
        return unidecode(text)

Example 4

Source File: helper.py From Verum with Apache License 2.0

5 votes

def remove_non_ascii_from_graph(g):
    """ networkx graph -> networkx graph

    Runs every node attribute value, and every edge attribute value whose
    type is exactly ``str``, through ``removeNonAscii`` and writes the
    result back onto the graph.

    :param g: A networkx graph; it is updated in place
    :return: the same graph object with the cleaned attributes
    """
    # Nodes: every attribute value is cleaned, regardless of its type.
    for node_id, node_attrs in g.nodes(data=True):
        for key in node_attrs.keys():
            node_attrs[key] = removeNonAscii(node_attrs[key])
        g.node[node_id] = node_attrs

    multigraph_types = [nx.classes.multidigraph.MultiDiGraph, nx.classes.multigraph.MultiGraph]
    if type(g) not in multigraph_types:
        # Plain graphs yield (source, target, attributes) triples.
        for source, target, edge_attrs in g.edges(data=True):
            for key in edge_attrs:
                value = edge_attrs[key]
                if type(value) is str:
                    edge_attrs[key] = removeNonAscii(value)
            g.edge[source][target] = edge_attrs
    else:
        # Multigraphs additionally carry an edge key in third position.
        for source, target, edge_key, edge_attrs in g.edges(data=True, keys=True):
            for key in edge_attrs:
                value = edge_attrs[key]
                if type(value) is str:
                    edge_attrs[key] = removeNonAscii(value)
            g.edge[source][target][edge_key] = edge_attrs

    # Hand the updated graph back to the caller.
    return g

## MAIN LOOP EXECUTION

Example 5

Source File: munin_stdout.py From munin with Apache License 2.0

5 votes

def removeNonAsciiDrop(string):
    """Drop every character that is not printable ASCII, LF or CR.

    Characters with a code point from 32 to 126, plus line feed (10) and
    carriage return (13), are kept; all others are removed.

    :param string: iterable of single-character strings, normally a ``str``
    :return: the filtered string. If the input cannot be processed this way
        (for example a Python 3 ``bytes`` object, whose items are ints that
        ``ord`` rejects), the input is returned unchanged.
    """
    try:
        return "".join(
            char for char in string
            if 31 < ord(char) < 127 or ord(char) in (10, 13)
        )
    except Exception:
        # Best-effort fallback: hand the input back untouched. The former
        # outer try/except and the "error" placeholder could never be
        # reached, because this handler already catches everything.
        return string

Example 6

Source File: blockchain.py From Trusty-cogs-archive with MIT License

5 votes

def remove_non_ascii(self, data):
        """Return ``data`` as ``bytes`` with every non-ASCII byte removed.

        :param data: ``bytes``/``bytearray`` (or any iterable of ints), or a
            ``str``, which is UTF-8 encoded first so that its non-ASCII
            characters become bytes >= 128 and are dropped as well
        :return: ``bytes`` holding only the values 0..127, in input order
        """
        if isinstance(data, str):
            data = data.encode("utf8")
        # ASCII covers 0..127 inclusive; the earlier ``range(0, 127)`` test
        # wrongly discarded byte 0x7F. Building the result in one pass also
        # avoids the quadratic ``+=`` on an immutable bytes object.
        return bytes(byte for byte in data if byte < 128)

Example 7

Source File: formatter.py From cltk with MIT License

5 votes

def remove_non_ascii(input_string):
    """Return ``input_string`` without its non-ASCII characters.

    A character is kept only when its code point is at most 127.
    Source: http://stackoverflow.com/a/1342373
    """
    kept_chars = []
    for char in input_string:
        if ord(char) <= 127:
            kept_chars.append(char)
    return "".join(kept_chars)

Example 8

Source File: utilities.py From HiCExplorer with GNU General Public License v3.0

5 votes

def remove_non_ascii(pText):
    """
    Convert every non-ascii character of ``pText`` to its most similar
    representation by delegating to ``unidecode``.
    Approach taken from:
    https://stackoverflow.com/questions/20078816/replace-non-ascii-characters-with-a-single-space/20079244
    """
    converted_text = unidecode(pText)
    return converted_text

Example 9

Source File: __init__.py From codecov-python with Apache License 2.0

5 votes

def remove_non_ascii(data):
    """Return ``data`` as text, dropping non-ASCII content when decoding fails.

    If ``data`` can be decoded as UTF-8 the decoded text is returned as is.
    Otherwise (no ``decode`` method, or invalid UTF-8) the items of ``data``
    are filtered one by one and only those below code point 128 are kept.

    :param data: ``bytes`` or ``str``
    :return: the decoded text, or the ASCII-only text from the fallback
    """
    try:
        # ``+ ""`` raises TypeError (triggering the fallback) if ``decode``
        # returned something that cannot be concatenated with a string.
        return data.decode("utf8") + ""
    except Exception:
        # Exception rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still propagate.
        kept = []
        for item in data:
            # Iterating bytes on Python 3 yields ints, which ``ord`` rejects;
            # handle both ints and single-character strings.
            if isinstance(item, int):
                if item < 128:
                    kept.append(chr(item))
            elif ord(item) < 128:
                kept.append(item)
        return "".join(kept)

Example 10

Source File: pyTweetCleaner.py From SEDTWik-Event-Detection-from-Tweets with MIT License

5 votes

def remove_non_ascii_chars(self, text):
        """
        return text in which every non-ascii character, i.e. every character
        with a code point >= 128, has been replaced by a single space
        """
        pieces = []
        for char in text:
            pieces.append(' ' if ord(char) >= 128 else char)
        return ''.join(pieces)

Example 11

Source File: yarAnalyzer.py From yarAnalyzer with MIT License

5 votes

def remove_non_ascii(s):
    """Map an iterable of integer byte values to a printable ASCII string.

    Values from 32 to 126 become their character; every other value becomes
    a space. If processing raises, the traceback is printed and the
    placeholder string "error" is returned.
    """
    result = "error"
    try:
        result = ''.join(
            chr(value) if 31 < value < 127 else ' '
            for value in s
        )
    except Exception:
        traceback.print_exc()
    return result

Example 12

Source File: utils.py From py_stringmatching with BSD 3-Clause "New" or "Revised" License

5 votes

def remove_non_ascii_chars(input_string):
    """Normalise the input by type and pass it to ``_remove_non_ascii_chars``.

    An exact ``str`` is passed through directly; an exact ``unicode`` is
    first encoded to ASCII with unencodable characters ignored; anything
    else is converted with ``unicode`` and processed again.
    """
    input_type = type(input_string)
    if input_type is str:
        return _remove_non_ascii_chars(input_string)
    if input_type is not unicode:
        # Convert to unicode, then re-enter to take the unicode path.
        return remove_non_ascii_chars(unicode(input_string))
    ascii_bytes = input_string.encode('ascii', 'ignore')
    return _remove_non_ascii_chars(ascii_bytes)

Example 13

Source File: helpers.py From Loki with GNU General Public License v3.0

5 votes

def removeNonAscii(string, stripit=False):
    """Return an escaped or filtered representation of ``string``.

    Conversion is attempted in this order:

    1. ``unicode`` input with ``stripit`` false: ``unicode-escape`` encoding.
    2. ``str`` input with ``stripit`` false: decode as UTF-8 (invalid bytes
       replaced), then ``unicode-escape`` encoding.
    3. Otherwise: ``raw_unicode_escape`` encoding, or ``str("%s" % string)``
       if that encoding raises.
    4. If any of the above raises: keep only the characters whose code
       point lies in 32..126. This is also the path taken on interpreters
       without the ``unicode`` builtin, since the first type check then
       raises NameError.

    :param string: value to convert
    :param stripit: when true, skip the two ``unicode-escape`` branches
    :return: the converted value, or the placeholder "error" if even the
        character filter fails (the traceback is printed in that case)
    """
    nonascii = "error"

    try:
        try:
            # Handle according to the type
            if isinstance(string, unicode) and not stripit:
                nonascii = string.encode('unicode-escape')
            elif isinstance(string, str) and not stripit:
                nonascii = string.decode('utf-8', 'replace').encode('unicode-escape')
            else:
                try:
                    nonascii = string.encode('raw_unicode_escape')
                # ``except Exception:`` parses on Python 2 and 3 alike; the
                # former ``except Exception, e`` form is Python 2 only.
                except Exception:
                    nonascii = str("%s" % string)

        except Exception:
            # All conversions failed: generate a new string without the
            # disturbing characters.
            nonascii = "".join(i for i in string if 31 < ord(i) < 127)

    except Exception:
        traceback.print_exc()

    return nonascii

Example 14

Source File: helpers.py From Loki with GNU General Public License v3.0

5 votes

def removeNonAsciiDrop(string):
    """Return ``string`` with every character outside 32..126 removed.

    :param string: iterable of single-character strings, normally a ``str``
    :return: the filtered string, or the placeholder "error" if filtering
        raises (the traceback is printed in that case)
    """
    nonascii = "error"
    try:
        # Generate a new string without disturbing characters
        nonascii = "".join(i for i in string if 31 < ord(i) < 127)
    # ``except Exception:`` parses on Python 2 and 3 alike; the former
    # ``except Exception, e`` form is Python 2 only.
    except Exception:
        traceback.print_exc()
    # Without this return the function always yielded None.
    return nonascii

Example 15

Source File: PipelineGSEnrichment.py From CGATPipelines with MIT License

5 votes

def removeNonAscii(s):
    '''
    Strips non-ascii characters from database terms, because some of the
    downloaded information contains special characters that cause errors.
    Only characters with a code point of at most 127 are kept.
    '''
    ascii_chars = []
    for char in s:
        if ord(char) <= 127:
            ascii_chars.append(char)
    return "".join(ascii_chars)