Python remove non ascii
15 Python code examples related to "remove non ascii" were found.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: document_preprocessing.py From textlytics with MIT License | 6 votes |
def remove_non_ascii_chars(self, doc):
    """
    Remove non-ASCII characters.

    Parameters
    ----------
    doc : str
        Document that will be cleaned; every character outside the
        ASCII range is dropped.

    Returns
    ----------
    doc : str
        Document containing only ASCII characters.
    """
    # Strings are immutable, so a new string is built instead of assigning
    # into ``doc[i]``. A character encodes as ASCII exactly when its code
    # point is below 128, so no per-character try/except is needed.
    return "".join(char for char in doc if ord(char) < 128)
Example 2
Source File: cnn_dm_downloader.py From TransferRL with MIT License | 5 votes |
def remove_non_ascii(text):
    """Transliterate ``text`` to ASCII using ``unidecode``.

    UTF-8 byte strings are decoded to text first. Input that cannot be
    decoded that way (already text, not a string, or invalid UTF-8) is
    converted with ``str`` before being handed to ``unidecode``.

    :param text: byte string, text string, or any object convertible by ``str``
    :return: the value returned by ``unidecode`` for the prepared text
    """
    try:
        # Python 2 path: decode a UTF-8 byte string into unicode text.
        decoded = unicode(text, encoding="utf-8")  # noqa: F821
    except NameError:
        # Python 3 has no ``unicode`` builtin. Decode bytes explicitly so
        # they are not turned into their "b'...'" repr by ``str``.
        if isinstance(text, bytes):
            decoded = text.decode("utf-8", "replace")
        else:
            decoded = str(text)
    except (TypeError, UnicodeDecodeError):
        # Already unicode / not a string at all, or not valid UTF-8.
        decoded = str(text)
    return unidecode(decoded)
Example 3
Source File: cnn_dm_downloader.py From RLSeq2Seq with MIT License | 5 votes |
def remove_non_ascii(text):
    """Transliterate ``text`` to ASCII using ``unidecode``.

    UTF-8 byte strings are decoded to text first. Input that cannot be
    decoded that way (already text, or invalid UTF-8) is handed to
    ``unidecode`` unchanged.

    :param text: byte string or text string
    :return: the value returned by ``unidecode`` for the prepared text
    """
    try:
        # Python 2 path: decode a UTF-8 byte string into unicode text.
        decoded = unicode(text, encoding="utf-8")  # noqa: F821
    except NameError:
        # Python 3 has no ``unicode`` builtin; decode bytes explicitly and
        # pass anything else through untouched.
        if isinstance(text, bytes):
            decoded = text.decode("utf-8", "replace")
        else:
            decoded = text
    except (TypeError, UnicodeDecodeError):
        # Already unicode, or not valid UTF-8: use the input as it is.
        decoded = text
    return unidecode(decoded)
Example 4
Source File: helper.py From Verum with Apache License 2.0 | 5 votes |
def remove_non_ascii_from_graph(g):
    """
    networkx graph -> networkx graph

    :param g: A networkx graph (simple or multi, directed or undirected)
    :return: the same graph object, with removeNonAscii applied to every
             node attribute value and to every string-valued edge attribute
    """
    # The attribute dicts yielded with data=True are the graph's own dicts,
    # so updating them in place is sufficient. No write-back through
    # ``g.node`` / ``g.edge`` is needed; those accessors no longer exist in
    # newer networkx releases.
    for _node, data in g.nodes(data=True):
        # Snapshot the keys so the dict is never iterated while being written.
        for attr in list(data):
            data[attr] = removeNonAscii(data[attr])

    # Without ``keys=True`` every edge tuple ends with its attribute dict for
    # both simple graphs and multigraphs, so one loop covers all graph types
    # without comparing against concrete classes.
    for edge in g.edges(data=True):
        edge_attr = edge[-1]
        for attr in list(edge_attr):
            # Only string values are cleaned on edges; other types are kept.
            if isinstance(edge_attr[attr], str):
                edge_attr[attr] = removeNonAscii(edge_attr[attr])

    # return the cleaned graph
    return g


## MAIN LOOP EXECUTION
Example 5
Source File: munin_stdout.py From munin with Apache License 2.0 | 5 votes |
def removeNonAsciiDrop(string):
    """Drop every character that is not printable ASCII, LF or CR.

    Characters with code points 32..126 are kept, together with line feed
    (10) and carriage return (13); everything else is removed.

    :param string: text to clean
    :return: the cleaned text, or ``string`` itself, unchanged, when it
             cannot be processed character by character (for example
             Python 3 ``bytes``, whose items are ints that ``ord`` rejects)
    """
    try:
        return "".join(
            char for char in string
            if 31 < ord(char) < 127 or ord(char) in (10, 13)
        )
    except Exception:
        # Deliberate best effort: hand the input back rather than fail.
        return string
Example 6
Source File: blockchain.py From Trusty-cogs-archive with MIT License | 5 votes |
def remove_non_ascii(self, data):
    """Return the items of ``data`` whose value is in 0..126 as ``bytes``.

    :param data: iterable of integer byte values (e.g. ``bytes``)
    :return: ``bytes`` holding the kept values in their original order;
             values of 127 and above are dropped, as are items that are
             not integers
    """
    # NOTE: the upper bound excludes 127 (DEL), matching the original range.
    allowed = range(0, 127)
    # Build the result in one pass instead of repeated bytes concatenation.
    return bytes(value for value in data if value in allowed)
Example 7
Source File: formatter.py From cltk with MIT License | 5 votes |
def remove_non_ascii(input_string):
    """Return ``input_string`` without characters of code point 128 or above.

    Source: http://stackoverflow.com/a/1342373
    """
    ascii_only = []
    for char in input_string:
        if ord(char) <= 127:
            ascii_only.append(char)
    return "".join(ascii_only)
Example 8
Source File: utilities.py From HiCExplorer with GNU General Public License v3.0 | 5 votes |
def remove_non_ascii(pText):
    """
    Replace each non-ascii character in pText with its closest ascii
    representation, as produced by unidecode.

    Code from:
    https://stackoverflow.com/questions/20078816/replace-non-ascii-characters-with-a-single-space/20079244
    """
    transliterated = unidecode(pText)
    return transliterated
Example 9
Source File: __init__.py From codecov-python with Apache License 2.0 | 5 votes |
def remove_non_ascii(data):
    """Return ``data`` as text, dropping non-ASCII items if it cannot be decoded.

    Input that decodes as UTF-8 is returned decoded (and may therefore
    still contain non-ASCII characters). For anything else, including text
    strings without a ``decode`` method and bytes that are not valid UTF-8,
    only the items with a code point below 128 are kept.

    :param data: bytes or text
    :return: text string
    """
    try:
        # ``+ ""`` makes a non-text result of ``decode`` fail into the fallback.
        return data.decode("utf8") + ""
    except (AttributeError, TypeError, UnicodeError):
        pass

    kept = []
    for item in data:
        # Iterating bytes yields ints on Python 3, where ``ord`` would raise.
        code = item if isinstance(item, int) else ord(item)
        if code < 128:
            kept.append(chr(code))
    return "".join(kept)
Example 10
Source File: pyTweetCleaner.py From SEDTWik-Event-Detection-from-Tweets with MIT License | 5 votes |
def remove_non_ascii_chars(self, text):
    """
    return text in which every non-ascii character,
    i.e. every character with value >= 128, is replaced by a single space
    """
    pieces = []
    for char in text:
        pieces.append(' ' if ord(char) >= 128 else char)
    return ''.join(pieces)
Example 11
Source File: yarAnalyzer.py From yarAnalyzer with MIT License | 5 votes |
def remove_non_ascii(s):
    """Replace everything that is not printable ASCII with a space.

    Items with a code point in 32..126 are kept; every other item is
    replaced by a single space, so the result has one character per item.

    :param s: ``bytes`` (items are ints) or a text string
    :return: the cleaned text, or the string "error" when ``s`` cannot be
             processed (the traceback is printed in that case)
    """
    try:
        cleaned = []
        for item in s:
            # bytes yield ints; text yields one-character strings.
            code = item if isinstance(item, int) else ord(item)
            cleaned.append(chr(code) if 31 < code < 127 else " ")
        return "".join(cleaned)
    except Exception:
        # Best effort: report the problem and return the sentinel value.
        traceback.print_exc()
        return "error"
Example 12
Source File: utils.py From py_stringmatching with BSD 3-Clause "New" or "Revised" License | 5 votes |
def remove_non_ascii_chars(input_string):
    """Normalise ``input_string`` and pass it to ``_remove_non_ascii_chars``.

    A ``str`` is passed through as is. Python 2 ``unicode`` text is first
    encoded to ASCII with unencodable characters ignored. Any other object
    is converted to text before being handled the same way.

    :param input_string: string, or any object convertible to text
    :return: whatever ``_remove_non_ascii_chars`` returns for the input
    """
    try:
        text_type = unicode  # noqa: F821 -- Python 2 text type
    except NameError:
        text_type = str  # Python 3: ``str`` already is the text type

    if not isinstance(input_string, (str, text_type)):
        input_string = text_type(input_string)
    if not isinstance(input_string, str):
        # Only reachable on Python 2, for ``unicode`` values.
        input_string = input_string.encode('ascii', 'ignore')
    return _remove_non_ascii_chars(input_string)
Example 13
Source File: helpers.py From Loki with GNU General Public License v3.0 | 5 votes |
def removeNonAscii(string, stripit=False):
    """Return an ASCII-safe representation of ``string``.

    Unless ``stripit`` is set, unicode text is escaped with the
    'unicode-escape' codec and byte strings are decoded as UTF-8 (with
    replacement) and then escaped. Otherwise 'raw_unicode_escape' is tried,
    falling back to plain string formatting. If any of that raises, the
    characters with code points outside 32..126 are dropped instead. Where
    the ``unicode`` builtin does not exist (Python 3), that lookup raises
    and the input is therefore handled by this dropping fallback.

    :param string: value to clean
    :param stripit: skip the escaping branches when true
    :return: the cleaned value, or the string "error" when every strategy
             failed (the traceback is printed in that case)
    """
    nonascii = "error"
    try:
        try:
            # Handle according to the type
            if isinstance(string, unicode) and not stripit:
                nonascii = string.encode('unicode-escape')
            elif isinstance(string, str) and not stripit:
                nonascii = string.decode('utf-8', 'replace').encode('unicode-escape')
            else:
                try:
                    nonascii = string.encode('raw_unicode_escape')
                except Exception:
                    nonascii = str("%s" % string)
        except Exception:
            # All methods failed - generate a new string without the
            # disturbing characters
            nonascii = "".join(i for i in string if 31 < ord(i) < 127)
    except Exception:
        traceback.print_exc()
    return nonascii
Example 14
Source File: helpers.py From Loki with GNU General Public License v3.0 | 5 votes |
def removeNonAsciiDrop(string):
    """Drop every character that is not printable ASCII (code points 32..126).

    :param string: text to clean
    :return: the cleaned text, or the string "error" when ``string`` cannot
             be processed (the traceback is printed in that case)
    """
    nonascii = "error"
    try:
        # Generate a new string without disturbing characters
        nonascii = "".join(i for i in string if 31 < ord(i) < 127)
    except Exception:
        traceback.print_exc()
    return nonascii
Example 15
Source File: PipelineGSEnrichment.py From CGATPipelines with MIT License | 5 votes |
def removeNonAscii(s):
    '''
    Strips the non-ascii characters out of database terms
    (some downloaded information contains special characters
    which cause errors)
    '''
    kept = [char for char in s if ord(char) <= 127]
    return "".join(kept)