Python regex.sub() Examples
The following are 30 code examples of regex.sub().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module regex, or try the search function.
Example #1
Source File: utils.py From MnemonicReader with BSD 3-Clause "New" or "Revised" License | 8 votes |
def normalize_answer(s):
    """Normalize an answer string for comparison.

    Lowercases, strips punctuation, removes the articles a/an/the, and
    collapses runs of whitespace into single spaces.
    """
    lowered = s.lower()
    # Drop every ASCII punctuation character.
    no_punct = "".join(ch for ch in lowered if ch not in set(string.punctuation))
    # Replace standalone articles with a space, then squeeze whitespace.
    no_articles = re.sub(r'\b(a|an|the)\b', ' ', no_punct)
    return ' '.join(no_articles.split())
Example #2
Source File: codetidy.py From chepy with GNU General Public License v3.0 | 6 votes |
def to_snake_case(self):
    """Convert string to snake case

    Converts the input string to snake case. Snake case is all lower case
    with underscores as word boundaries. e.g. this_is_snake_case.

    Returns:
        Chepy: The Chepy object.

    Examples:
        >>> Chepy("helloWorld").to_snake_case().o
        "hello_world"
    """
    text = self._convert_to_str()
    # Pass 1: break before an uppercase letter followed by a lowercase run.
    partially_split = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", text)
    # Pass 2: break between a lower/digit and an uppercase, then lowercase all.
    snake = re.sub("([a-z0-9])([A-Z])", r"\1_\2", partially_split)
    self.state = snake.lower()
    return self
Example #3
Source File: prepro.py From transformer with Apache License 2.0 | 6 votes |
def make_vocab(fpath, fname):
    '''Constructs vocabulary.

    Args:
      fpath: A string. Input file path (utf-8 encoded text).
      fname: A string. Output file name.

    Writes vocabulary line by line to `preprocessed/fname`, most frequent
    words first, preceded by the four special tokens.
    '''
    # Read the corpus with a context manager — the original leaked the file
    # object returned by codecs.open.
    with codecs.open(fpath, 'r', 'utf-8') as fin:
        text = fin.read()
    # Keep only whitespace, Latin letters and apostrophes. The third-party
    # `regex` module is required here for the \p{Latin} property class.
    text = regex.sub(r"[^\s\p{Latin}']", "", text)
    word2cnt = Counter(text.split())
    if not os.path.exists('preprocessed'):
        os.mkdir('preprocessed')
    with codecs.open('preprocessed/{}'.format(fname), 'w', 'utf-8') as fout:
        # Special tokens get a huge count so they sort to the top downstream.
        fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("<PAD>", "<UNK>", "<S>", "</S>"))
        for word, cnt in word2cnt.most_common(len(word2cnt)):
            fout.write(u"{}\t{}\n".format(word, cnt))
Example #4
Source File: utils.py From OpenQA with MIT License | 6 votes |
def normalize_answer(s):
    """Normalize *s*: lowercase, drop punctuation and articles, squeeze spaces."""
    def _strip_articles(text):
        # Standalone a/an/the become a space (later collapsed).
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    # Single C-level pass removes all punctuation characters.
    punct_table = str.maketrans('', '', string.punctuation)
    cleaned = s.lower().translate(punct_table)
    return ' '.join(_strip_articles(cleaned).split())
Example #5
Source File: pygrok.py From pygrok with MIT License | 6 votes |
def _load_search_pattern(self):
    """Expand every %{...} grok reference in self.pattern into a plain
    Python regex and compile it into self.regex_obj.

    Also records %{name:custom:type} type annotations in self.type_mapper
    (custom name -> type). Loops until no %{...} references remain, so
    predefined patterns may themselves contain grok references.
    """
    self.type_mapper = {}
    py_regex_pattern = self.pattern
    while True:
        # Finding all types specified in the groks
        m = re.findall(r'%{(\w+):(\w+):(\w+)}', py_regex_pattern)
        for n in m:
            self.type_mapper[n[1]] = n[2]
        #replace %{pattern_name:custom_name} (or %{pattern_name:custom_name:type}
        # with regex and regex group name
        py_regex_pattern = re.sub(r'%{(\w+):(\w+)(?::\w+)?}',
            lambda m: "(?P<" + m.group(2) + ">" + self.predefined_patterns[m.group(1)].regex_str + ")",
            py_regex_pattern)
        #replace %{pattern_name} with regex
        py_regex_pattern = re.sub(r'%{(\w+)}',
            lambda m: "(" + self.predefined_patterns[m.group(1)].regex_str + ")",
            py_regex_pattern)
        # Stop once no unexpanded grok reference is left.
        if re.search('%{\w+(:\w+)?}', py_regex_pattern) is None:
            break
    self.regex_obj = re.compile(py_regex_pattern)
Example #6
Source File: normalize.py From sacremoses with MIT License | 6 votes |
def normalize(self, text):
    """
    Returns a string with normalized punctuation.
    """
    # Optionally, replace unicode punctuation BEFORE normalization.
    if self.pre_replace_unicode_punct:
        text = self.replace_unicode_punct(text)
    # Apply each (pattern, replacement) pair in declared order.
    for pattern, replacement in self.substitutions:
        text = re.sub(pattern, replacement, text_type(text))
    # Optionally, strip control characters AFTER normalization.
    if self.post_remove_control_chars:
        text = self.remove_control_chars(text)
    return text
Example #7
Source File: iocextract.py From python-iocextract with GNU General Public License v2.0 | 6 votes |
def refang_email(email):
    """Refang an email address.

    :param email: String email address.
    :rtype: str
    """
    # Check for ' at ' and ' dot ' first (any case, flanked by non-word chars).
    email = re.sub(r'\W[aA][tT]\W', '@', email.lower())
    email = re.sub(r'\W*[dD][oO][tT]\W*', '.', email)
    # Then strip bracket-style defanging. The original chained five .replace
    # calls and repeated '{' twice (dead code); one loop covers all four.
    email = _refang_common(email)
    for bracket in '[]{}':
        email = email.replace(bracket, '')
    return email
Example #8
Source File: utils.py From justcopy-backend with MIT License | 6 votes |
def normalize_answer(s):
    """Canonicalize an answer: lowercase, no punctuation, no articles,
    single-spaced."""
    exclude = set(string.punctuation)
    text = s.lower()
    text = ''.join(c for c in text if c not in exclude)  # strip punctuation
    text = re.sub(r'\b(a|an|the)\b', ' ', text)          # drop articles
    return ' '.join(text.split())                        # squeeze whitespace
Example #9
Source File: models.py From casepro with BSD 3-Clause "New" or "Revised" License | 6 votes |
def normalize_phone(cls, number):
    """
    Normalizes the passed in phone number to E164 format when possible.
    """
    # Remove any invalid characters. BUG FIX: the original passed regex.V0 as
    # the 4th positional argument of regex.sub, which is `count`, not `flags`;
    # it must be given as the flags keyword.
    number = regex.sub(r"[^0-9a-z\+]", "", number.lower(), flags=regex.V0)

    # Add on a plus if it looks like it could be a fully qualified number.
    if len(number) >= 11 and number[0] not in ["+", "0"]:
        number = "+" + number

    try:
        normalized = phonenumbers.parse(number)
        if phonenumbers.is_possible_number(normalized):
            return phonenumbers.format_number(normalized, phonenumbers.PhoneNumberFormat.E164)
    except Exception:
        # Unparseable input falls through to the cleaned raw number.
        pass

    return number
Example #10
Source File: feature_engineering.py From coling2018_fake-news-challenge with Apache License 2.0 | 6 votes |
def sdm_sim(headlines, bodies):
    """For each (headline, body) pair, return a one-element feature list
    holding the cortical.io semantic cosine similarity of the two texts.

    NOTE(review): requires network access to api.cortical.io and embeds a
    hardcoded API key — move the credential to configuration.
    """
    def similarity(headline, body):
        clean_headline = clean(headline)
        clean_body = clean(body)
        # A new API client is built per pair; the key is hardcoded (see note).
        fullClient = retinasdk.FullClient("e8bf8de0-fe52-11e6-b22d-93a4ae922ff1", apiServer="http://api.cortical.io/rest", retinaName="en_associative")
        # Strip CJK characters, which the English retina cannot handle.
        RE = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)
        clean_body = RE.sub(u'', clean_body)
        # clean_body = clean_body.encode('ascii', 'ignore')
        # Round-trip through utf8 to drop undecodable bytes.
        clean_body = clean_body.encode('utf8', 'ignore')
        clean_body = clean_body.decode('utf8', 'ignore')
        # print(clean_body)
        # NOTE(review): str.replace returns a new string; this result is
        # discarded, so the call below is a no-op — confirm intent.
        clean_body.replace("0x6e", " ")
        # newdata = clean_body[:start] + clean_body[end:]
        # clean_body = clean_body.translate(None, '0x6e')
        # The API takes a JSON array of {"text": ...} objects.
        comp_with_stop_words = fullClient.compare('[{"text": "'+clean_headline+'"}, {"text": "'+clean_body +'"}]')
        sim = comp_with_stop_words.cosineSimilarity
        features = []
        features.append(sim)
        return features
    x = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        x.append(similarity(headline, body))
    return x
Example #11
Source File: cli.py From chepy with GNU General Public License v3.0 | 6 votes |
def cli_highlight(fire: object, highlight: str):
    """Highlight regex match for cli

    Args:
        fire (object): The fire object.
        highlight (str): Regex to highlight
    """
    # BUG FIX: the original indexed fire.states before checking
    # `fire is not None`, raising AttributeError when fire was None.
    if fire is not None and isinstance(fire, Chepy):
        current_state = fire.states[fire._current_index]
        try:
            print(
                re.sub(
                    "({})".format(highlight),
                    yellow_background(r"\1"),
                    str(current_state),
                )
            )
        except:  # noqa: E722 — deliberate best-effort; state may not be a string
            red("Could not highlight because state is not a string")
    # elif type(current_state) == bytes or type(current_state) == bytearray:
    #     print(re.sub('({})'.format(highlight).encode(), red(r'\1').encode(), current_state).decode())
    else:
        print(type(fire))
Example #12
Source File: parse_tlg_indices.py From cltk with MIT License | 6 votes |
def _handle_splits(_str):
    """Check if incoming date has a '-" or '/', if so do stuff."""
    # Treat '/' exactly like '-' as a range separator.
    _str = _str.replace('/', '-')
    _tmp_dict = {}
    if '-' in _str:
        start, stop = _str.split('-')
        if _check_number(start):
            # NOTE(review): regex.sub(pattern, repl, string) — here `start`
            # is used as the *replacement* and `stop` as the subject string.
            # This looks like the arguments may be swapped; confirm the
            # intended semantics before changing.
            start = regex.sub(r'[0-9]+\?*', start, stop)
        elif _check_number(stop):
            # Same suspected repl/string swap as above.
            stop = regex.sub(r'[0-9]+\?*', stop, start)
    else:
        # No range: start and stop are the same raw date.
        start = _str
        stop = _str
    _tmp_dict['start_raw'] = start
    _tmp_dict['stop_raw'] = stop
    _tmp_dict['start_epoch'] = _get_epoch(start)
    _tmp_dict['stop_epoch'] = _get_epoch(stop)
    return _tmp_dict
Example #13
Source File: dataformat.py From chepy with GNU General Public License v3.0 | 6 votes |
def from_charcode(self, prefix: str = ""):
    """Convert array of unicode chars to string

    Args:
        prefix (str, optional): Any prefix for the charcode. Ex: \\u or u. Defaults to "".

    Returns:
        Chepy: The Chepy object.

    Examples:
        >>> Chepy(["314e", "61", "20", "41"]).from_charcode().o
        ["ㅎ", "a", " ", "A"]
    """
    # Strip the optional prefix from each code, then decode it as hex.
    self.state = [
        chr(int(re.sub(prefix, "", code), 16)) for code in self.state
    ]
    return self
Example #14
Source File: dataset.py From talon with Apache License 2.0 | 6 votes |
def build_detection_class(folder, dataset_filename, label, sender_known=True):
    """Builds signature detection class.

    Signature detection dataset includes patterns for two classes:

    * class for positive patterns (goes with label 1)
    * class for negative patterns (goes with label -1)

    The patterns are built from emails in `folder` and appended to the
    dataset file.

    >>> build_signature_detection_class('emails/P', 'train.data', 1)
    """
    # Hoist the annotation-stripping pattern out of the loop.
    annotations_pattern = '|'.join(ANNOTATIONS)
    with open(dataset_filename, 'a') as dataset:
        for entry in os.listdir(folder):
            path = os.path.join(folder, entry)
            sender, msg = parse_msg_sender(path, sender_known)
            if sender is None or msg is None:
                continue
            cleaned = re.sub(annotations_pattern, '', msg)
            row = build_pattern(cleaned, features(sender))
            row.append(label)
            dataset.write(','.join(str(value) for value in row) + '\n')
Example #15
Source File: client.py From ibis with Apache License 2.0 | 6 votes |
def sql(self, query: str):
    """
    Convert a SQL query to an Ibis table expression.

    Parameters
    ----------
    query : string

    Returns
    -------
    table : TableExpr
    """
    # Turn `; --comment` into a newline so the comment survives the
    # trailing-semicolon removal below.
    cleaned = re.sub(r'\s*;\s*--', '\n--', query.strip())
    # Remove trailing ;
    cleaned = re.sub(r'\s*;\s*$', '', cleaned.strip())
    schema = self._get_schema_using_validator(cleaned)
    return ops.SQLQueryResult(cleaned, schema, self).to_expr()
Example #16
Source File: links.py From chepy with GNU General Public License v3.0 | 6 votes |
def github_to_raw(self):
    """Convert a github link to raw github link

    Returns:
        Chepy: The Chepy object.

    Examples:
        >>> Chepy("https://github.com/securisec/chepy/blob/master/README.md").github_to_raw()
        'https://raw.githubusercontent.com/securisec/chepy/master/README.md'
    """
    url = self._convert_to_str()
    # Swap the host (keeping the trailing slash), then drop the "/blob" segment.
    rehosted = re.sub("(github\.com)(/)", r"raw.githubusercontent.com\2", url)
    self.state = re.sub("/blob", "", rehosted)
    return self
Example #17
Source File: Auto_NLP.py From Auto_ViML with Apache License 2.0 | 6 votes |
def process_text(text):
    """Clean raw text: strip HTML, tokenize, lowercase, expand contractions,
    POS-tag and lemmatize, then drop punctuation tokens and stopwords."""
    soup_text = BeautifulSoup(text, "lxml").get_text()
    # Remove any residual markup the parser left behind.
    no_html = re.sub('<[^>]*>', '', soup_text)
    tokens = [token.lower() for token in casual_tokenizer(no_html)]
    expanded = [expandContractions(token, c_re=c_re) for token in tokens]
    lemmas = lemma_wordnet(nltk.pos_tag(expanded))
    # Single pass replaces the original's two list filters (punct, stopwords).
    return [w for w in lemmas if w not in punc and w not in stop_words]
################################################################################################################################################################
#### THE ABOVE Process_Text secion Re-used with Permission from:
####  R O B   S A L G A D O  robert.salgado@gmail.com Thank YOU!
################################################################
Example #18
Source File: my_utils.py From ICDAR-2019-SROIE with MIT License | 6 votes |
def pred_to_dict(text, pred, prob):
    """Collapse per-character class predictions into a field dict, keeping the
    highest-probability contiguous span for each field."""
    res = {"company": ("", 0), "date": ("", 0), "address": ("", 0), "total": ("", 0)}
    keys = list(res.keys())

    # Segment boundaries: positions where the predicted class changes.
    boundaries = [0] + (numpy.nonzero(numpy.diff(pred))[0] + 1).tolist() + [len(pred)]

    for begin, end in zip(boundaries, boundaries[1:]):
        cls = pred[begin] - 1
        if cls == -1:
            continue  # background class — not a field

        key = keys[cls]
        span_prob = prob[begin:end].max()
        if span_prob > res[key][1]:
            res[key] = (text[begin:end], span_prob)

    # Flatten tabs/newlines in the winning spans before returning.
    return {k: regex.sub(r"[\t\n]", " ", v[0].strip()) for k, v in res.items()}
Example #19
Source File: networking.py From chepy with GNU General Public License v3.0 | 6 votes |
def defang_url(self):
    """Make a URL harmless

    Takes a Universal Resource Locator (URL) and 'Defangs' it; meaning the
    URL becomes invalid, neutralising the risk of accidentally clicking on
    a malicious link. Often used when handling malicious links or IOCs.

    Returns:
        Chepy: The Chepy object.

    Examples:
        >>> Chepy("https://app.google.com/?lol=some data&a=1").defang_url().o
        "hxxps://app[.]google[.]com/?lol=some data&a=1"
    """
    # Neutralise the scheme (htt -> hxx), then bracket every dot.
    self.state = re.sub(r"(^htt)", "hxx", self._convert_to_str())
    self.state = re.sub(r"\.", "[.]", self._convert_to_str())
    return self
Example #20
Source File: tokenizer.py From nmt-chatbot with GNU General Public License v3.0 | 6 votes |
def sentence_split(sentence):
    """Split a sentence into tokens using the ' ▁' subword-marker convention."""
    # Without an embedded detokenizer, plain whitespace split suffices.
    if not preprocessing['embedded_detokenizer']:
        return sentence.split()

    global re_split

    # Double the markers and prime the line for splitting on ' ▁'.
    line = ' ▁▁' + sentence[1:].replace('▁', '▁▁')
    line = re_split.sub(r' ▁\1\2 ▁', line)

    # Split, strip and drop empty/bare-marker tokens.
    stripped = [piece.strip() for piece in line.split(' ▁')]
    return [token for token in stripped if token and token != '▁']
Example #21
Source File: networking.py From chepy with GNU General Public License v3.0 | 6 votes |
def defang_ip(self):
    """Make an IP address harmless

    Takes an IPv4 or IPv6 address and 'Defangs' it, meaning the IP becomes
    invalid, removing the risk of accidentally utilising it as an IP address.

    Returns:
        Chepy: The Chepy object.

    Examples:
        >>> Chepy("2001:4860:4860::8844").defang_ip().o
        "2001[:]4860[:]4860[:][:]8844"

        >>> Chepy("127.0.0.1").defang_ip().o
        "127[.]0[.]0[.]1"
    """
    # IPv6 addresses contain colons; everything else is treated as IPv4.
    if ":" not in self._convert_to_str():
        self.state = re.sub(r"\.|:", "[.]", self._convert_to_str())
    else:
        self.state = re.sub(r":", "[:]", self._convert_to_str())
    return self
Example #22
Source File: quotations.py From talon with Apache License 2.0 | 6 votes |
def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
    """
    Splits line in two if splitter pattern preceded by some text on the same
    line (done only for 'On <date> <person> wrote:' pattern).
    """
    def _prefix_if_inline(match):
        """Prepend the delimiter unless the splitter already starts a line."""
        mid_line = match.start() and msg_body[match.start() - 1] != '\n'
        if mid_line:
            return '%s%s' % (delimiter, match.group())
        return match.group()

    if content_type == 'text/plain':
        msg_body = re.sub(RE_ON_DATE_SMB_WROTE, _prefix_if_inline, msg_body)
    return msg_body
Example #23
Source File: quotations.py From talon with Apache License 2.0 | 6 votes |
def _replace_link_brackets(msg_body):
    """
    Normalize links i.e. replace '<', '>' wrapping the link with some symbols
    so that '>' closing the link couldn't be mistakenly taken for quotation
    marker.

    Converts msg_body into a unicode.
    """
    if isinstance(msg_body, bytes):
        msg_body = msg_body.decode('utf8')

    def _wrap(match):
        # Leave links on quoted lines (starting with '>') untouched.
        line_start = msg_body[:match.start()].rfind("\n") + 1
        if msg_body[line_start] == ">":
            return match.group()
        return "@@%s@@" % match.group(1)

    return re.sub(RE_LINK, _wrap, msg_body)
Example #24
Source File: roman.py From indic_transliteration with MIT License | 6 votes |
def simplify_accent_notation(cls, text):
    """Rewrite accented vowels into a canonical base-letter + combining-mark
    form and reorder combining marks so length/accent marks precede tone
    marks.

    NOTE(review): each .replace pair below is visually identical but differs
    in Unicode form (precomposed character vs. base letter plus combining
    diacritic) — do not retype these literals.
    """
    # References: https://en.wikipedia.org/wiki/Combining_Diacritical_Marks
    text = text.replace("á", "á")
    text = text.replace("í", "í")
    text = text.replace("ú", "ú")
    text = text.replace("ŕ", "ŕ")
    text = text.replace("é", "é")
    text = text.replace("ó", "ó")
    text = text.replace("à", "à")
    text = text.replace("ì", "ì")
    text = text.replace("ù", "ù")
    text = text.replace("è", "è")
    text = text.replace("ò", "ò")
    # Swap a grave/acute mark that precedes ring-below/dot/macron marks.
    text = regex.sub("([̀́])([̥̇¯̄]+)", "\\2\\1", text)
    return text
Example #25
Source File: links.py From chepy with GNU General Public License v3.0 | 5 votes |
def pastebin_to_raw(self):
    """Convert a pastebin link to raw pastebin link

    Returns:
        Chepy: The Chepy object.

    Examples:
        >>> Chepy("https://pastebin.com/abCD").pastebin_to_raw()
        'https://pastebin.com/raw/abCD'
    """
    # Inject "/raw" between the host and the paste id.
    url = self._convert_to_str()
    self.state = re.sub(r"(pastebin\.com)(/)", r"\1/raw\2", url)
    return self
Example #26
Source File: sp_encoder.py From ru_transformers with Apache License 2.0 | 5 votes |
def encode(self, text):
    """Encode text to sentencepiece ids, splitting on the newline token and
    filtering out blank-line ids."""
    # Ensure a leading space so the first word gets a word-boundary marker.
    if text and text[0] != ' ':
        text = ' ' + text
    # Insert a space between a non-word char and the word char that follows.
    text = re.sub(r'(?=[^ ])([\W])([\w])', r'\g<1> \g<2>', text)
    text = text.replace('\n', NEW_LINE)

    ids = []
    for chunk in re.split('(<\|n\|>)', text):
        encoded = self.sp.EncodeAsIds(chunk)
        if chunk:
            ids.extend(encoded)
    return [token_id for token_id in ids if token_id != self.blank_line_id]
Example #27
Source File: yt_encoder.py From ru_transformers with Apache License 2.0 | 5 votes |
def decode(self, tokens):
    """Decode BPE token ids back into text and undo the tokenizer's spacing.

    The substitutions below are order-sensitive: the newline token must be
    restored first, then the spurious spaces it and the encoder introduced
    are removed.
    """
    # I hate regexps
    if not isinstance(tokens,list):
        # Accept numpy/torch arrays by converting to a plain list.
        tokens = tokens.tolist()
    result = self.bpe.decode(tokens)[0]
    # Restore '<|n|>' (with optional surrounding spaces) to a real newline.
    result = re.sub(r'( )?(<\|n\|>)( )?', r'\n', result)
    # Drop the space after a newline or '(' that the encoder inserted.
    result = re.sub(r'([\n(]) (\w)',r'\g<1>\g<2>', result)
    # Same, after opening quotes/brackets preceded by a non-word char.
    result = re.sub(r'(\W)([«"''\n(]|^) (\w)',r'\g<1>\g<2>\g<3>', result)
    # Rejoin hyphenated words split as 'foo- bar'.
    result = re.sub(r'(\w)- (\w)',r'\g<1>-\g<2>', result)
    return result
Example #28
Source File: networking.py From chepy with GNU General Public License v3.0 | 5 votes |
def refang_url(self):
    """Refangs a URL so that it is clickable

    Returns:
        Chepy: The Chepy object.

    Examples:
        >>> Chepy("hxxps://app[.]google[.]com/?lol=some data&a=1").refang_url().o
        "https://app.google.com/?lol=some data&a=1"
    """
    # Restore the scheme (hxx -> htt), then un-bracket the dots.
    self.state = re.sub(r"(^hxx)", "htt", self._convert_to_str())
    self.state = re.sub(r"\[\.\]", ".", self._convert_to_str())
    return self
Example #29
Source File: codetidy.py From chepy with GNU General Public License v3.0 | 5 votes |
def to_camel_case(self, ignore_space: bool = False):
    """Convert string to camel case

    Converts the input string to camel case. Camel case is all lower case
    except letters after word boundaries which are uppercase.
    e.g. thisIsCamelCase

    Args:
        ignore_space (bool, optional): Ignore space boundaries. Defaults to False.

    Returns:
        Chepy: The Chepy object.

    Examples:
        >>> Chepy("some Data_test").to_camel_case().o
        "someDataTest"

        To ignore space, we can set the `ignore_space` to True
        >>> Chepy("some Data_test").to_camel_case(ignore_space=True).o
        "some DataTest"
    """
    # Word boundaries are '_' and '-'; include whitespace unless ignored.
    boundary = r"_.|\-." if ignore_space else r"_.|\-.|\s."
    matcher = re.compile(boundary)
    # Replace boundary + letter with the uppercased letter.
    self.state = matcher.sub(lambda m: m.group()[1].upper(), self._convert_to_str())
    return self
Example #30
Source File: sentence_extracting.py From exbert with Apache License 2.0 | 5 votes |
def replace_newlines(s: str) -> str:
    """Replace each maximal run of newline characters in *s* with one space."""
    newline_run = re.compile(r"\n+")
    return newline_run.sub(" ", s)