Python regex.sub() Examples

The following are 30 code examples of regex.sub(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module regex, or try the search function.
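For orientation, here is a minimal sketch of the call itself; the third-party regex module mirrors re's signature here, with count and flags as optional keyword arguments.

import regex

# Replace every run of digits with '#'.
print(regex.sub(r'\d+', '#', 'order 66, aisle 3'))  # prints: order #, aisle #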
Example #1
Source File: utils.py    From MnemonicReader with BSD 3-Clause "New" or "Revised" License
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s)))) 
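Assuming re and string are imported at the top of the module, the whole pipeline behaves like this:

>>> normalize_answer("The Quick, Brown Fox!")
'quick brown fox'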
Example #2
Source File: codetidy.py    From chepy with GNU General Public License v3.0
def to_snake_case(self):
        """Convert string to snake case

        Converts the input string to snake case. Snake case is all lower case 
        with underscores as word boundaries. e.g. this_is_snake_case.

        Returns:
            Chepy: The Chepy object.

        Examples:
            >>> Chepy("helloWorld").to_snake_case().o
            "hello_world"
        """
        s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", self._convert_to_str())
        self.state = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
        return self 
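The two-pass substitution can be exercised outside of Chepy; a minimal sketch with a hypothetical input:

import re

s1 = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", "HTTPResponseCode")
print(re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", s1).lower())  # prints: http_response_code

The first pass inserts an underscore before each capitalized word; the second handles a lowercase letter or digit followed by a capital.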
Example #3
Source File: prepro.py    From transformer with Apache License 2.0
def make_vocab(fpath, fname):
    '''Constructs vocabulary.
    
    Args:
      fpath: A string. Input file path.
      fname: A string. Output file name.
    
    Writes vocabulary line by line to `preprocessed/fname`
    '''  
    text = codecs.open(fpath, 'r', 'utf-8').read()
    text = regex.sub(r"[^\s\p{Latin}']", "", text)
    words = text.split()
    word2cnt = Counter(words)
    if not os.path.exists('preprocessed'): os.mkdir('preprocessed')
    with codecs.open('preprocessed/{}'.format(fname), 'w', 'utf-8') as fout:
        fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("<PAD>", "<UNK>", "<S>", "</S>"))
        for word, cnt in word2cnt.most_common(len(word2cnt)):
            fout.write(u"{}\t{}\n".format(word, cnt)) 
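Note that \p{Latin} is a Unicode script property, which is why this file uses the third-party regex module; re does not support \p{...}. A minimal sketch of the same cleanup:

import regex

print(regex.sub(r"[^\s\p{Latin}']", "", "Héllo, wörld! 123"))  # accented Latin letters survive; digits and punctuation are stripped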
Example #4
Source File: utils.py    From OpenQA with MIT License
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s)))) 
Example #5
Source File: pygrok.py    From pygrok with MIT License
def _load_search_pattern(self):
        self.type_mapper = {}
        py_regex_pattern = self.pattern
        while True:
            # Finding all types specified in the groks
            m = re.findall(r'%{(\w+):(\w+):(\w+)}', py_regex_pattern)
            for n in m:
                self.type_mapper[n[1]] = n[2]
            # replace %{pattern_name:custom_name} (or %{pattern_name:custom_name:type})
            # with regex and regex group name

            py_regex_pattern = re.sub(r'%{(\w+):(\w+)(?::\w+)?}',
                lambda m: "(?P<" + m.group(2) + ">" + self.predefined_patterns[m.group(1)].regex_str + ")",
                py_regex_pattern)

            #replace %{pattern_name} with regex
            py_regex_pattern = re.sub(r'%{(\w+)}',
                lambda m: "(" + self.predefined_patterns[m.group(1)].regex_str + ")",
                py_regex_pattern)

            if re.search(r'%{\w+(:\w+)?}', py_regex_pattern) is None:
                break

        self.regex_obj = re.compile(py_regex_pattern) 
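To see what one expansion pass produces, a hedged sketch with a hypothetical one-entry pattern table (the real class stores pattern objects carrying a regex_str attribute):

import re

predefined = {"WORD": r"\w+"}  # hypothetical predefined pattern table
expanded = re.sub(r'%{(\w+):(\w+)(?::\w+)?}',
                  lambda m: "(?P<" + m.group(2) + ">" + predefined[m.group(1)] + ")",
                  "%{WORD:user} logged in")
print(expanded)  # prints: (?P<user>\w+) logged in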
Example #6
Source File: normalize.py    From sacremoses with MIT License
def normalize(self, text):
        """
        Returns a string with normalized punctuation.
        """
        # Optionally, replace unicode puncts BEFORE normalization.
        if self.pre_replace_unicode_punct:
            text = self.replace_unicode_punct(text)

        # Actual normalization.
        for regexp, substitution in self.substitutions:
            text = re.sub(regexp, substitution, text_type(text))

        # Optionally, remove control characters AFTER normalization.
        if self.post_remove_control_chars:
            text = self.remove_control_chars(text)

        return text 
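self.substitutions is a list of (regexp, replacement) pairs applied in order; a standalone sketch of the same loop with two hypothetical pairs:

import re

substitutions = [(r'\s+', ' '), (r' ([.,;:!?])', r'\1')]  # hypothetical pairs
text = 'Hello ,  world !'
for regexp, substitution in substitutions:
    text = re.sub(regexp, substitution, text)
print(text)  # prints: Hello, world!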
Example #7
Source File: iocextract.py    From python-iocextract with GNU General Public License v2.0
def refang_email(email):
    """Refang an email address.

    :param email: String email address.
    :rtype: str
    """
    # Check for ' at ' and ' dot ' first.
    email = re.sub(r'\W[aA][tT]\W', '@', email.lower())
    email = re.sub(r'\W*[dD][oO][tT]\W*', '.', email)

    # Then do other char replaces.
    return _refang_common(email).replace('[', '').\
                                 replace(']', '').\
                                 replace('{', '').\
                                 replace('}', '')
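Setting aside the internal _refang_common helper, the two substitutions on their own behave like this:

import re

email = 'alice AT example DOT com'
email = re.sub(r'\W[aA][tT]\W', '@', email.lower())
email = re.sub(r'\W*[dD][oO][tT]\W*', '.', email)
print(email)  # prints: alice@example.com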
Example #8
Source File: utils.py    From justcopy-backend with MIT License
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s)))) 
Example #9
Source File: models.py    From casepro with BSD 3-Clause "New" or "Revised" License
def normalize_phone(cls, number):
        """
        Normalizes the passed in phone number
        """
        # remove any invalid characters
        number = regex.sub(r"[^0-9a-z\+]", "", number.lower(), flags=regex.V0)

        # add on a plus if it looks like it could be a fully qualified number
        if len(number) >= 11 and number[0] not in ["+", "0"]:
            number = "+" + number

        try:
            normalized = phonenumbers.parse(number)

            if phonenumbers.is_possible_number(normalized):
                return phonenumbers.format_number(normalized, phonenumbers.PhoneNumberFormat.E164)
        except Exception:
            pass

        return number 
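Note that regex.sub's fourth positional argument is count, so the V0 flag must be passed by keyword (as done above). The cleanup step on its own:

import regex

print(regex.sub(r"[^0-9a-z\+]", "", "+1 (800) 555-0199".lower(), flags=regex.V0))  # prints: +18005550199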
Example #10
Source File: feature_engineering.py    From coling2018_fake-news-challenge with Apache License 2.0
def sdm_sim(headlines, bodies):
    def similarity(headline, body):
        clean_headline = clean(headline)
        clean_body = clean(body)
        fullClient = retinasdk.FullClient("e8bf8de0-fe52-11e6-b22d-93a4ae922ff1", apiServer="http://api.cortical.io/rest", retinaName="en_associative")

        RE = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)
        clean_body = RE.sub(u'', clean_body)
        clean_body = clean_body.encode('utf8', 'ignore')
        clean_body = clean_body.decode('utf8', 'ignore')
        clean_body = clean_body.replace("0x6e", " ")
        comp_with_stop_words = fullClient.compare('[{"text": "'+clean_headline+'"}, {"text": "'+clean_body +'"}]')
        sim = comp_with_stop_words.cosineSimilarity

        features = []
        features.append(sim)
        return features
    x = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        x.append(similarity(headline, body))
    return x 
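The compiled character class strips CJK code points before the text is sent to the API; on its own:

import re

RE = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)
print(RE.sub(u'', 'headline 你好 text'))  # prints: 'headline  text' (the CJK characters are removed)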
Example #11
Source File: cli.py    From chepy with GNU General Public License v3.0
def cli_highlight(fire: object, highlight: str):
    """Highlight regex match for cli
    
    Args:
        fire (object): The fire object.
        highlight (str): Regex to highlight
    """
    if fire is not None and isinstance(fire, Chepy):
        current_state = fire.states[fire._current_index]
        try:
            print(
                re.sub(
                    "({})".format(highlight),
                    yellow_background(r"\1"),
                    str(current_state),
                )
            )
        except Exception:
            red("Could not highlight because state is not a string")
    else:
        print(type(fire)) 
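The highlight itself is an ordinary group-capture substitution; a standalone sketch with a hypothetical ANSI wrapper in place of Chepy's helper:

import re

def yellow_background(s):  # hypothetical stand-in for the cli helper
    return '\x1b[43m' + s + '\x1b[0m'

print(re.sub('(err)', yellow_background(r'\1'), 'error: bad input'))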
Example #12
Source File: parse_tlg_indices.py    From cltk with MIT License
def _handle_splits(_str):
    """Check if incoming date has a '-" or '/', if so do stuff."""
    _str = _str.replace('/', '-')
    _tmp_dict = {}

    if '-' in _str:
        start, stop = _str.split('-')
        if _check_number(start):
            start = regex.sub(r'[0-9]+\?*', start, stop)
        elif _check_number(stop):
            stop = regex.sub(r'[0-9]+\?*', stop, start)
    else:
        start = _str
        stop = _str
    _tmp_dict['start_raw'] = start
    _tmp_dict['stop_raw'] = stop

    _tmp_dict['start_epoch'] = _get_epoch(start)
    _tmp_dict['stop_epoch'] = _get_epoch(stop)

    return _tmp_dict 
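The regex.sub argument order is easy to misread here: regex.sub(pattern, repl, string) replaces the digit run in one half of the date with the other half's digits, keeping the suffix. A worked trace for an input like '4-3 B.C.':

import regex

start, stop = '4', '3 B.C.'
print(regex.sub(r'[0-9]+\?*', start, stop))  # prints: 4 B.C. -- this becomes the new start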
Example #13
Source File: dataformat.py    From chepy with GNU General Public License v3.0
def from_charcode(self, prefix: str = ""):
        """Convert array of unicode chars to string
        
        Args:
            prefix (str, optional): Any prefix for the charcode. Ex: \\u or u. Defaults to "".
        
        Returns:
            Chepy: The Chepy object. 

        Examples:
            >>> Chepy(["314e", "61", "20", "41"]).from_charcode().o
            ["ㅎ", "a", " ", "A"]
        """
        out = []
        for c in self.state:
            c = re.sub(prefix, "", c)
            out.append(chr(int(c, 16)))
        self.state = out
        return self 
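With a prefix of 'u', for example, the loop reduces to this minimal sketch:

import re

codes = ['u314e', 'u61']
print([chr(int(re.sub('u', '', c), 16)) for c in codes])  # prints: ['ㅎ', 'a']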
Example #14
Source File: dataset.py    From talon with Apache License 2.0
def build_detection_class(folder, dataset_filename,
                          label, sender_known=True):
    """Builds signature detection class.

    Signature detection dataset includes patterns for two classes:
    * class for positive patterns (goes with label 1)
    * class for negative patterns (goes with label -1)

    The patterns are build of emails from `folder` and appended to
    dataset file.

    >>> build_detection_class('emails/P', 'train.data', 1)
    """
    with open(dataset_filename, 'a') as dataset:
        for filename in os.listdir(folder):
            filename = os.path.join(folder, filename)
            sender, msg = parse_msg_sender(filename, sender_known)
            if sender is None or msg is None:
                continue
            msg = re.sub('|'.join(ANNOTATIONS), '', msg)
            X = build_pattern(msg, features(sender))
            X.append(label)
            labeled_pattern = ','.join([str(e) for e in X])
            dataset.write(labeled_pattern + '\n') 
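Joining ANNOTATIONS with '|' builds a single alternation pattern, so every marker is removed in one pass; a sketch with hypothetical markers:

import re

ANNOTATIONS = ['#sig#', '#body#']  # hypothetical marker strings
print(re.sub('|'.join(ANNOTATIONS), '', '#sig#Thanks,#body# Bob'))  # prints: Thanks, Bob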
Example #15
Source File: client.py    From ibis with Apache License 2.0
def sql(self, query: str):
        """
        Convert a SQL query to an Ibis table expression.

        Parameters
        ----------
        query : string

        Returns
        -------
        table : TableExpr
        """
        # Remove `;` + `--` (comment)
        query = re.sub(r'\s*;\s*--', '\n--', query.strip())
        # Remove trailing ;
        query = re.sub(r'\s*;\s*$', '', query.strip())
        schema = self._get_schema_using_validator(query)
        return ops.SQLQueryResult(query, schema, self).to_expr() 
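The two substitutions keep a trailing comment but drop the semicolon that would confuse the validator; a minimal sketch:

import re

query = 'SELECT 1 ; -- trailing comment'
query = re.sub(r'\s*;\s*--', '\n--', query.strip())
query = re.sub(r'\s*;\s*$', '', query.strip())
print(query)  # prints 'SELECT 1' with the comment on its own line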
Example #16
Source File: links.py    From chepy with GNU General Public License v3.0
def github_to_raw(self):
        """Convert a github link to raw github link
        
        Returns:
            Chepy: The Chepy object. 

        Examples:
            >>> Chepy("https://github.com/securisec/chepy/blob/master/README.md").github_to_raw()
            'https://raw.githubusercontent.com/securisec/chepy/master/README.md'
        """
        self.state = re.sub(
            "/blob",
            "",
            re.sub(
                "(github\.com)(/)",
                r"raw.githubusercontent.com\2",
                self._convert_to_str(),
            ),
        )
        return self 
Example #17
Source File: Auto_NLP.py    From Auto_ViML with Apache License 2.0
def process_text(text):
    soup = BeautifulSoup(text, "lxml")
    tags_del = soup.get_text()
    no_html = re.sub('<[^>]*>', '', tags_del)
    tokenized = casual_tokenizer(no_html)
    lower = [item.lower() for item in tokenized]
    decontract = [expandContractions(item, c_re=c_re) for item in lower]
    tagged = nltk.pos_tag(decontract)
    lemma = lemma_wordnet(tagged)
    #no_num = [re.sub('[0-9]+', '', each) for each in lemma]
    no_punc = [w for w in lemma if w not in punc]
    no_stop = [w for w in no_punc if w not in stop_words]
    return no_stop
################################################################################################################################################################
####   THE ABOVE Process_Text section Re-used with Permission from:
####  R O B   S A L G A D O    robert.salgado@gmail.com Thank YOU!
################################################################################ 
Example #18
Source File: my_utils.py    From ICDAR-2019-SROIE with MIT License
def pred_to_dict(text, pred, prob):
    res = {"company": ("", 0), "date": ("", 0), "address": ("", 0), "total": ("", 0)}
    keys = list(res.keys())

    seps = [0] + (numpy.nonzero(numpy.diff(pred))[0] + 1).tolist() + [len(pred)]
    for i in range(len(seps) - 1):
        pred_class = pred[seps[i]] - 1
        if pred_class == -1:
            continue

        new_key = keys[pred_class]
        new_prob = prob[seps[i] : seps[i + 1]].max()
        if new_prob > res[new_key][1]:
            res[new_key] = (text[seps[i] : seps[i + 1]], new_prob)

    return {k: regex.sub(r"[\t\n]", " ", v[0].strip()) for k, v in res.items()} 
Example #19
Source File: networking.py    From chepy with GNU General Public License v3.0
def defang_url(self):
        """Make a URL harmless
        
        Takes a Universal Resource Locator (URL) and 'Defangs' it; 
        meaning the URL becomes invalid, neutralising the risk of accidentally 
        clicking on a malicious link. This is often used when dealing with 
        malicious links or IOCs.
        
        Returns:
            Chepy: The Chepy object. 

        Examples:
            >>> Chepy("https://app.google.com/?lol=some data&a=1").defang_url().o
            "hxxps://app[.]google[.]com/?lol=some data&a=1"
        """
        self.state = re.sub(r"(^htt)", "hxx", self._convert_to_str())
        self.state = re.sub(r"\.", "[.]", self._convert_to_str())
        return self 
Example #20
Source File: tokenizer.py    From nmt-chatbot with GNU General Public License v3.0
def sentence_split(sentence):

    # If not an embedded detokenizer - split by spaces
    if not preprocessing['embedded_detokenizer']:
        return sentence.split()

    global re_split

    # Prepare for split sentence into a words by ' ▁'
    line = ' ▁▁' + sentence[1:].replace('▁', '▁▁')
    line = re_split.sub(r' ▁\1\2 ▁', line)

    # split, filter and return
    tokens = [token.strip() for token in line.split(' ▁')]
    return [token for token in tokens if token and token != '▁']

# Load json file with BPE join pairs 
Example #21
Source File: networking.py    From chepy with GNU General Public License v3.0
def defang_ip(self):
        """Make an IP address harmless
        
        Takes a IPv4 or IPv6 address and 'Defangs' it, meaning the 
        IP becomes invalid, removing the risk of accidentally utilising 
        it as an IP address.
        
        Returns:
            Chepy: The Chepy object. 

        Examples:
            >>> Chepy("2001:4860:4860::8844").defang_ip().o
            "2001[:]4860[:]4860[:][:]8844"
            
            >>> Chepy("127.0.0.1").defang_ip().o
            "127[.]0[.]0[.]1"
        """
        if ":" in self._convert_to_str():
            self.state = re.sub(r":", "[:]", self._convert_to_str())
        else:
            self.state = re.sub(r"\.|:", "[.]", self._convert_to_str())
        return self 
Example #22
Source File: quotations.py    From talon with Apache License 2.0
def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
    """
    Splits line in two if splitter pattern preceded by some text on the same
    line (done only for 'On <date> <person> wrote:' pattern.
    """
    def splitter_wrapper(splitter):
        """Wraps splitter with new line"""
        if splitter.start() and msg_body[splitter.start() - 1] != '\n':
            return '%s%s' % (delimiter, splitter.group())
        else:
            return splitter.group()

    if content_type == 'text/plain':
        msg_body = re.sub(RE_ON_DATE_SMB_WROTE, splitter_wrapper, msg_body)

    return msg_body 
Example #23
Source File: quotations.py    From talon with Apache License 2.0
def _replace_link_brackets(msg_body):
    """
    Normalize links i.e. replace '<', '>' wrapping the link with some symbols
    so that '>' closing the link couldn't be mistakenly taken for quotation
    marker.

    Converts msg_body into a unicode
    """
    if isinstance(msg_body, bytes):
        msg_body = msg_body.decode('utf8')

    def link_wrapper(link):
        newline_index = msg_body[:link.start()].rfind("\n")
        if msg_body[newline_index + 1] == ">":
            return link.group()
        else:
            return "@@%s@@" % link.group(1)

    msg_body = re.sub(RE_LINK, link_wrapper, msg_body)
    return msg_body 
Example #24
Source File: roman.py    From indic_transliteration with MIT License
def simplify_accent_notation(cls, text):
        # References: https://en.wikipedia.org/wiki/Combining_Diacritical_Marks
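        # NB: each pair below can render identically; the left-hand side is the
        # precomposed character and the replacement is base letter + combining mark.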
        text = text.replace("á", "á")
        text = text.replace("í", "í")
        text = text.replace("ú", "ú")
        text = text.replace("ŕ", "ŕ")
        text = text.replace("é", "é")
        text = text.replace("ó", "ó")

        text = text.replace("à", "à")
        text = text.replace("ì", "ì")
        text = text.replace("ù", "ù")
        text = text.replace("è", "è")
        text = text.replace("ò", "ò")
        
        text = regex.sub("([̀́])([̥̇¯̄]+)", "\\2\\1", text)
        return text 
Example #25
Source File: links.py    From chepy with GNU General Public License v3.0
def pastebin_to_raw(self):
        """Convert a pastebin link to raw pastebin link
        
        Returns:
            Chepy: The Chepy object. 

        Examples:
            >>> Chepy("https://pastebin.com/abCD").pastebin_to_raw()
            'https://pastebin.com/raw/abCD'
        """
        self.state = re.sub(r"(pastebin\.com)(/)", r"\1/raw\2", self._convert_to_str())
        return self 
Example #26
Source File: sp_encoder.py    From ru_transformers with Apache License 2.0
def encode(self, text):
        if text and text[0] != ' ': text = ' ' + text
        text = re.sub(r'(?=[^ ])([\W])([\w])',r'\g<1> \g<2>',text)
        text = text.replace('\n', NEW_LINE)
        stext = re.split(r'(<\|n\|>)', text)
        result = [token
                  for item in stext
                  for token in self.sp.EncodeAsIds(item)
                  if item]
        return list(filter(lambda a: a != self.blank_line_id, result)) 
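The first substitution pads a non-word/word boundary with a space so the sentencepiece model sees separate tokens; on its own:

import re

print(re.sub(r'(?=[^ ])([\W])([\w])', r'\g<1> \g<2>', ' hello,world'))  # prints: ' hello, world'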
Example #27
Source File: yt_encoder.py    From ru_transformers with Apache License 2.0
def decode(self, tokens): # I hate regexps
        if not isinstance(tokens,list):
            tokens = tokens.tolist()
        result = self.bpe.decode(tokens)[0]
        result = re.sub(r'( )?(<\|n\|>)( )?', r'\n', result)
        result = re.sub(r'([\n(]) (\w)',r'\g<1>\g<2>', result)
        result = re.sub(r'(\W)([«"''\n(]|^) (\w)',r'\g<1>\g<2>\g<3>', result)
        result = re.sub(r'(\w)- (\w)',r'\g<1>-\g<2>', result)
        return result 
Example #28
Source File: networking.py    From chepy with GNU General Public License v3.0
def refang_url(self):
        """Refangs a URL so that it is clickable
        
        Returns:
            Chepy: The Chepy object. 

        Examples:
            >>> Chepy("hxxps://app[.]google[.]com/?lol=some data&a=1").refang_url().o
            "https://app.google.com/?lol=some data&a=1"
        """
        self.state = re.sub(r"(^hxx)", "htt", self._convert_to_str())
        self.state = re.sub(r"\[\.\]", ".", self._convert_to_str())
        return self 
Example #29
Source File: codetidy.py    From chepy with GNU General Public License v3.0
def to_camel_case(self, ignore_space: bool = False):
        """Convert string to camel case
        
        Converts the input string to camel case. Camel case is all lower case 
        except letters after word boundaries which are uppercase. e.g. thisIsCamelCase 

        Args:
            ignore_space (bool, optional): Ignore space boundaries. Defaults to False.
        
        Returns:
            Chepy: The Chepy object.

        Examples:
            >>> Chepy("some Data_test").to_camel_case().o
            "someDataTest"
            
            To ignore space, we can set the `ignore_space` to True
            >>> Chepy("some Data_test").to_camel_case(ignore_space=True).o
            "some DataTest"
        """
        if ignore_space:
            r = re.compile(r"_.|\-.")
        else:
            r = re.compile(r"_.|\-.|\s.")
        self.state = r.sub(lambda x: x.group()[1].upper(), self._convert_to_str())
        return self 
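The compiled pattern matches a boundary character plus the letter after it, and the lambda uppercases that letter; on its own:

import re

r = re.compile(r"_.|\-.|\s.")
print(r.sub(lambda x: x.group()[1].upper(), "some data_test"))  # prints: someDataTest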
Example #30
Source File: sentence_extracting.py    From exbert with Apache License 2.0
def replace_newlines(s: str) -> str:
    return re.sub(r"\n+", r" ", s)

# String -> String