Python regex.match() Examples

The following are 30 code examples of regex.match(). Each example notes its original project and source file, so you can study the call in context. Note that several of the examples use the standard library's re.match(), which behaves the same way for these patterns. You may also want to check out all available functions/classes of the module regex, or try the search function.
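A quick note before diving in: regex is a third-party module (pip install regex) that is backwards-compatible with re but adds features such as Unicode property classes (\p{...}), which the examples below use heavily. A minimal sketch:

import regex  # third-party: pip install regex

# match() anchors at the start of the string, like re.match()
print(regex.match(r"\p{P}+", "!? rest").group())  # '!?'
print(regex.match(r"\p{P}+", "rest"))             # None: no punctuation at the start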
Example #1
Source File: vocab.py    From lightNLP with Apache License 2.0
def __init__(self, words, tags, rels):
        self.pad_index = 0
        self.unk_index = 1

        self.words = [self.PAD, self.UNK] + sorted(words)
        self.tags = [self.PAD, self.UNK] + sorted(tags)
        self.rels = sorted(rels)

        self.word_dict = {word: i for i, word in enumerate(self.words)}
        self.tag_dict = {tag: i for i, tag in enumerate(self.tags)}
        self.rel_dict = {rel: i for i, rel in enumerate(self.rels)}

        # ids of punctuation that appear in words
        self.puncts = sorted(i for word, i in self.word_dict.items()
                             if regex.match(r'\p{P}+$', word))

        self.n_words = len(self.words)
        self.n_tags = len(self.tags)
        self.n_rels = len(self.rels)
        self.n_train_words = self.n_words 
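A quick sketch of the punctuation test above: regex.match() anchors at the start of the string, and the trailing $ forces the match to the end, so only all-punctuation words qualify.

import regex

for word in ['.', '--', '?!', 'word', '3.14']:
    print(word, bool(regex.match(r'\p{P}+$', word)))
# '.', '--' and '?!' are pure punctuation; 'word' and '3.14' are not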
Example #2
Source File: validate-python2-obsolete.py    From tools with GNU General Public License v2.0
def validate_left_to_right_relations(cols):
    """
    Certain UD relations must always go left-to-right.
    Here we currently check the rule for the basic dependencies.
    The same should also be tested for the enhanced dependencies!
    """
    if is_multiword_token(cols):
        return
    if DEPREL >= len(cols):
        return # this has already been reported in trees()
    #if cols[DEPREL] == u"conj":
    if re.match(r"^(conj|fixed|flat)", cols[DEPREL]):
        ichild = int(cols[ID])
        iparent = int(cols[HEAD])
        if ichild < iparent:
            warn(u"Violation of guidelines: relation %s must go left-to-right" % cols[DEPREL], u"Syntax")


##### Tests applicable to the whole tree 
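Note that re.match() is already anchored at the beginning of the string, so the ^ in the pattern above is redundant (though harmless). A small check with sample DEPREL values:

import re

for deprel in ['conj', 'conj:and', 'fixed', 'nsubj']:
    print(deprel, bool(re.match(r"^(conj|fixed|flat)", deprel)))
# 'conj', 'conj:and' and 'fixed' pass the prefix test; 'nsubj' does not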
Example #3
Source File: validate-python2-obsolete.py    From tools with GNU General Public License v2.0
def validate_sent_id(comments,known_ids,lcode):
    matched=[]
    for c in comments:
        match=sentid_re.match(c)
        if match:
            matched.append(match)
        else:
            if c.startswith(u"# sent_id") or c.startswith(u"#sent_id"):
                warn(u"Spurious sent_id line: '%s' Should look like '# sent_id = xxxxxx' where xxxx is not whitespace. Forward slash reserved for special purposes." %c,u"Metadata")
    if not matched:
        warn(u"Missing the sent_id attribute.",u"Metadata")
    elif len(matched)>1:
        warn(u"Multiple sent_id attribute.",u"Metadata")
    else:
        sid=matched[0].group(1)
        if sid in known_ids:
            warn(u"Non-unique sent_id the sent_id attribute: "+sid,u"Metadata")
        if sid.count(u"/")>1 or (sid.count(u"/")==1 and lcode!=u"ud" and lcode!=u"shopen"):
            warn(u"The forward slash is reserved for special use in parallel treebanks: "+sid,u"Metadata")
        known_ids.add(sid) 
Example #4
Source File: test_partialparse.py    From ctparse with MIT License
def test_partial_parse() -> None:
    match_a = regex.match("(?<R1>a)", "ab")
    match_b = next(regex.finditer("(?<R2>b)", "ab"))

    pp = PartialParse.from_regex_matches(
        (RegexMatch(1, match_a), RegexMatch(2, match_b))
    )

    assert len(pp.prod) == 2
    assert len(pp.rules) == 2

    assert isinstance(pp.score, float)

    def mock_rule(ts: datetime.datetime, a: Time) -> Time:
        return Time()

    pp2 = pp.apply_rule(
        datetime.datetime(day=1, month=1, year=2015), mock_rule, "mock_rule", (0, 1)
    )

    assert pp != pp2

    with pytest.raises(ValueError):
        PartialParse((), ()) 
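The (?<R1>a) syntax above is a regex-module extension: it accepts both re's (?P<name>...) spelling and the shorter (?<name>...) form. A minimal comparison:

import re
import regex

print(regex.match("(?<R1>a)", "ab").group("R1"))  # 'a'
print(re.match("(?P<R1>a)", "ab").group("R1"))    # 'a'
# re only accepts the (?P<name>...) spelling; "(?<R1>a)" raises re.error there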
Example #5
Source File: prep_wikipedia.py    From justcopy-backend with MIT License
def preprocess(article):
    # Take out HTML escaping WikiExtractor didn't clean
    for k, v in article.items():
        article[k] = PARSER.unescape(v)

    # Filter some disambiguation pages not caught by the WikiExtractor
    if article['id'] in BLACKLIST:
        return None
    if '(disambiguation)' in article['title'].lower():
        return None
    if '(disambiguation page)' in article['title'].lower():
        return None

    # Take out List/Index/Outline pages (mostly links)
    if re.match(r'(List of .+)|(Index of .+)|(Outline of .+)',
                article['title']):
        return None

    # Return doc with `id` set to `title`
    return {'id': article['title'], 'text': article['text']} 
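Since re.match() anchors at the start, only titles that begin with one of the three prefixes are filtered; a title merely containing 'List of' is kept. A quick check:

import re

pattern = r'(List of .+)|(Index of .+)|(Outline of .+)'
print(bool(re.match(pattern, 'List of sovereign states')))  # True -> filtered
print(bool(re.match(pattern, 'My List of things')))         # False -> kept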
Example #6
Source File: rules.py    From epitran with MIT License
def _read_rule(self, i, line):
        line = line.strip()
        if line:
            line = unicodedata.normalize('NFD', line)
            s = re.match(r'(?P<symbol>::\w+::)\s*=\s*(?P<value>.+)', line)
            if s:
                self.symbols[s.group('symbol')] = s.group('value')
            else:
                line = self._sub_symbols(line)
                r = re.match(r'(\S+)\s*->\s*(\S+)\s*/\s*(\S*)\s*[_]\s*(\S*)', line)
                try:
                    a, b, X, Y = r.groups()
                except AttributeError:
                    raise DatafileError('Line {}: "{}" cannot be parsed.'.format(i + 1, line))
                X, Y = X.replace('#', '^'), Y.replace('#', '$')
                a, b = a.replace('0', ''), b.replace('0', '')
                try:
                    if re.search(r'[?]P[<]sw1[>].+[?]P[<]sw2[>]', a):
                        return self._fields_to_function_metathesis(a, X, Y)
                    else:
                        return self._fields_to_function(a, b, X, Y)
                except Exception as e:
                    raise DatafileError('Line {}: "{}" cannot be compiled as regex: {}'.format(i + 1, line, e))
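For illustration, here is the rule regex in isolation against a made-up rewrite rule in the a -> b / X _ Y format the parser expects (a sketch, not epitran's own test data):

import re

rule = 'a -> b / x _ y'
r = re.match(r'(\S+)\s*->\s*(\S+)\s*/\s*(\S*)\s*[_]\s*(\S*)', rule)
print(r.groups())  # ('a', 'b', 'x', 'y')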
Example #7
Source File: flite.py    From epitran with MIT License
def transliterate(self, text, normpunc=False, ligatures=False):
        """Convert English text to IPA transcription

        Args:
            text (unicode): English text
            normpunc (bool): if True, normalize punctuation downward
            ligatures (bool): if True, use non-standard ligatures instead of
                              standard IPA
        """
        text = unicodedata.normalize('NFC', text)
        acc = []
        for chunk in self.chunk_re.findall(text):
            if self.letter_re.match(chunk):
                acc.append(self.english_g2p(chunk))
            else:
                acc.append(chunk)
        text = ''.join(acc)
        text = self.puncnorm.norm(text) if normpunc else text
        text = ligaturize(text) if (ligatures or self.ligatures) else text
        return text 
Example #8
Source File: validate.py    From tools with GNU General Public License v2.0
def validate_left_to_right_relations(id, tree):
    """
    Certain UD relations must always go left-to-right.
    Here we currently check the rule for the basic dependencies.
    The same should also be tested for the enhanced dependencies!
    """
    testlevel = 3
    testclass = 'Syntax'
    cols = tree['nodes'][id]
    if is_multiword_token(cols):
        return
    if DEPREL >= len(cols):
        return # this has already been reported in trees()
    # According to the v2 guidelines, apposition should also be left-headed, although the definition of apposition may need to be improved.
    if re.match(r"^(conj|fixed|flat|goeswith|appos)", cols[DEPREL]):
        ichild = int(cols[ID])
        iparent = int(cols[HEAD])
        if ichild < iparent:
            # We must recognize the relation type in the test id so we can manage exceptions for legacy treebanks.
            # For conj, flat, and fixed the requirement was introduced already before UD 2.2, and all treebanks in UD 2.3 passed it.
            # For appos and goeswith the requirement was introduced before UD 2.4 and legacy treebanks are allowed to fail it.
            testid = "right-to-left-%s" % lspec2ud(cols[DEPREL])
            testmessage = "Relation '%s' must go left-to-right." % cols[DEPREL]
            warn(testmessage, testclass, testlevel=testlevel, testid=testid, nodeid=id, nodelineno=tree['linenos'][id]) 
Example #9
Source File: query.py    From cltk with MIT License
def match_regex(input_str, pattern, language, context, case_insensitive=True):
    """Take input string and a regex pattern, then yield generator of matches
     in desired format.

     TODO: Rename this `match_pattern` and incorporate the keyword expansion
      code currently in search_corpus.

    :param input_str:
    :param pattern:
    :param language:
    :param context: Integer, 'sentence', or 'paragraph'
    :rtype: str
    """
    if type(context) is str:
        contexts = ['sentence', 'paragraph']
        assert context in contexts or type(context) is int, 'Available contexts: {}'.format(contexts)
    else:
        context = int(context)
    for match in _regex_span(pattern, input_str, case_insensitive=case_insensitive):
        if context == 'sentence':
            yield _sentence_context(match, language)
        elif context == 'paragraph':
            yield _paragraph_context(match)
        else:
            yield _window_match(match, context) 
Example #10
Source File: parse_tlg_indices.py    From cltk with MIT License
def _get_epoch(_str):
    """Take incoming string, return its epoch."""
    _return = None
    if _str.startswith('A.D. '):
        _return = 'ad'
    elif _str.startswith('a. A.D. '):
        _return = None #?
    elif _str.startswith('p. A.D. '):
        _return = 'ad'
    elif regex.match(r'^[0-9]+ B\.C\. *', _str):
        _return = 'bc'
    elif regex.match(r'^a\. *[0-9]+ B\.C\. *', _str):
        _return = 'bc'
    elif regex.match(r'^p\. *[0-9]+ B\.C\. *', _str):
        _return = None  #?
    elif _str == 'Incertum' or _str == 'Varia':
        _return = _str
    return _return 
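A quick check of the epoch patterns above with made-up date strings (the real TLG formats may vary):

import regex

print(bool(regex.match(r'^[0-9]+ B\.C\. *', '25 B.C.')))          # True  -> 'bc'
print(bool(regex.match(r'^a\. *[0-9]+ B\.C\. *', 'a. 25 B.C.')))  # True  -> 'bc'
print(bool(regex.match(r'^[0-9]+ B\.C\. *', 'A.D. 14')))          # False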
Example #11
Source File: client.py    From ibis with Apache License 2.0
def list_tables(self, like=None, database=None):
        """
        List tables in the current (or indicated) database. Like the SHOW
        TABLES command.

        Parameters
        ----------
        like : string, default None
          e.g. 'foo*' to match all tables starting with 'foo'
        database : string, default None
          If not passed, uses the current/default database

        Returns
        -------
        results : list of strings
        """
        results = [t.name for t in self._catalog.listTables(dbName=database)]
        if like:
            results = [
                table_name
                for table_name in results
                if re.match(like, table_name) is not None
            ]

        return results 
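Note that like is treated as a regular expression anchored at the start, not a shell glob: as a regex, 'foo*' means 'fo' followed by zero or more 'o'. If you want true glob semantics, you could translate the glob first, e.g. (a sketch, not part of the ibis API):

import fnmatch
import re

glob = 'foo*'
pattern = fnmatch.translate(glob)  # e.g. '(?s:foo.*)\\Z'
print(bool(re.match(pattern, 'foobar')))  # True
print(bool(re.match(pattern, 'fo')))      # False, though the raw regex 'foo*' would match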
Example #12
Source File: query.py    From cltk with MIT License
def _window_match(match, window=100):
    """Take incoming match and highlight in context.
    :rtype: str
    :param match: Regex match.
    :param window: Characters on each side of match to return.
    :type window: int
    """
    window = int(window)
    start = match.start()
    end = match.end()
    snippet_left = match.string[start - window:start]
    snippet_match = match.string[match.start():match.end()]
    snippet_right = match.string[end:end + window]

    snippet = snippet_left + '*' + snippet_match + '*' + snippet_right

    return snippet 
Example #13
Source File: source.py    From pythonparser with MIT License
def _extract_encoding(self, source):
        if isinstance(source, bytes):
            re = self._encoding_bytes_re
            nl = b"\n"
        else:
            re = self._encoding_re
            nl = "\n"
        match = re.match(source)
        if not match:
            index = source.find(nl)
            if index != -1:
                match = re.match(source[index + 1:])
        if match:
            encoding = match.group(1)
            if isinstance(encoding, bytes):
                return encoding.decode("ascii")
            return encoding
        return "ascii" 
Example #14
Source File: quotations.py    From talon with Apache License 2.0
def _correct_splitlines_in_headers(markers, lines):
    """
    Corrects markers by removing splitlines deemed to be inside header blocks.
    """
    updated_markers = ""
    i = 0
    in_header_block = False
    for m in markers:
        # Only set in_header_block flag when we hit an 's' and line is a header
        if m == 's':
            if not in_header_block:
                if bool(re.search(RE_HEADER, lines[i])):
                    in_header_block = True
            else:
                if QUOT_PATTERN.match(lines[i]):
                    m = 'm'
                else:
                    m = 't'

        # If the line is not a header line, set in_header_block false.
        if not bool(re.search(RE_HEADER, lines[i])):
            in_header_block = False

        # Add the marker to the new updated markers string.
        updated_markers += m
        i += 1

    return updated_markers 
Example #15
Source File: tpu_lm_finetuning.py    From ru_transformers with Apache License 2.0
def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False):
    if not args.save_total_limit:
        return
    if args.save_total_limit <= 0:
        return

    # Check if we should delete older checkpoint(s)
    glob_checkpoints = glob.glob(os.path.join(args.output_dir, '{}-*'.format(checkpoint_prefix)))
    if len(glob_checkpoints) <= args.save_total_limit:
        return

    ordering_and_checkpoint_path = []
    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        log_info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint) 
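A quick sketch of the sort-key extraction with made-up paths; capturing the step as an integer gives numeric rather than lexicographic ordering:

import re

paths = ['out/checkpoint-1000', 'out/checkpoint-50', 'out/checkpoint-500']
ordering = []
for path in paths:
    m = re.match('.*{}-([0-9]+)'.format('checkpoint'), path)
    if m:
        ordering.append((int(m.groups()[0]), path))
print(sorted(ordering))
# [(50, 'out/checkpoint-50'), (500, 'out/checkpoint-500'), (1000, 'out/checkpoint-1000')]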
Example #16
Source File: hotpot_preliminary_doc_retri.py    From semanticRetrievalMRS with MIT License
def filter_word(text):
    """Take out english stopwords, punctuation, and compound endings."""
    text = normalize(text)
    if regex.match(r'^\p{P}+$', text):
        return True
    if text.lower() in STOPWORDS:
        return True
    return False 
Example #17
Source File: text_clean.py    From semanticRetrievalMRS with MIT License
def filter_document_id(input_string):
    pid_words = input_string.strip().replace('_', ' ')
    match = re.search('[a-zA-Z]', pid_words)
    if match is None:
        return True
    elif check_arabic(pid_words):
        return True
    else:
        return False 
Example #18
Source File: hotpot_preliminary_doc_retri.py    From semanticRetrievalMRS with MIT License
def filter_document_id(input_string, remove_disambiguation_doc=True):
    pid_words = input_string.strip().replace('_', ' ')
    match = re.search('[a-zA-Z]', pid_words)
    if match is None:  # filter id that contains no alphabets characters
        return True
    elif check_arabic(pid_words):  # remove id that contain arabic characters.
        return True
    else:
        if remove_disambiguation_doc:
            if filter_disamb_doc(input_string):
                return True
        return False 
Example #19
Source File: text_clean.py    From semanticRetrievalMRS with MIT License
def filter_word(text):
    """Take out english stopwords, punctuation, and compound endings."""
    text = normalize(text)
    if regex.match(r'^\p{P}+$', text):
        return True
    if text.lower() in STOPWORDS:
        return True
    return False 
Example #20
Source File: utils.py    From neural_chat with MIT License
def filter_word(text):
    """Take out english stopwords, punctuation, and compound endings."""
    text = normalize(text)
    if regex.match(r'^\p{P}+$', text):
        return True
    if text.lower() in STOPWORDS:
        return True
    return False 
Example #21
Source File: text_clean.py    From combine-FEVER-NSMN with MIT License
def filter_document_id(input_string):
    pid_words = input_string.strip().replace('_', ' ')
    match = re.search('[a-zA-Z]', pid_words)
    if match is None:
        return True
    elif check_arabic(pid_words):
        return True
    else:
        return False 
Example #22
Source File: text_clean.py    From combine-FEVER-NSMN with MIT License
def filter_word(text):
    """Take out english stopwords, punctuation, and compound endings."""
    text = normalize(text)
    if regex.match(r'^\p{P}+$', text):
        return True
    if text.lower() in STOPWORDS:
        return True
    return False 
Example #23
Source File: quotations.py    From talon with Apache License 2.0
def is_splitter(line):
    '''
    Returns a match object if the provided string is a splitter and
    None otherwise.
    '''
    for pattern in SPLITTER_PATTERNS:
        matcher = re.match(pattern, line)
        if matcher:
            return matcher 
Example #24
Source File: vocab.py    From lightNLP with Apache License 2.0
def extend(self, words):
        self.words.extend(sorted(set(words).difference(self.word_dict)))
        self.word_dict = {word: i for i, word in enumerate(self.words)}
        self.puncts = sorted(i for word, i in self.word_dict.items()
                             if regex.match(r'\p{P}+$', word))
        self.n_words = len(self.words) 
Example #25
Source File: parse_tlg_indices.py    From cltk with MIT License
def select_id_by_name(query):
    """Do a case-insensitive regex match on author name, returns TLG id."""
    id_author = get_id_author()
    comp = regex.compile(r'{}'.format(query.casefold()), flags=regex.VERSION1)
    matches = []
    for _id, author in id_author.items():
        match = comp.findall(author.casefold())
        if match:
            matches.append((_id, author))
    return matches 
Example #26
Source File: quotations.py    From talon with Apache License 2.0
def mark_message_lines(lines):
    """Mark message lines with markers to distinguish quotation lines.

    Markers:

    * e - empty line
    * m - line that starts with quotation marker '>'
    * f - forward line ('---- Forwarded message ----')
    * s - splitter line
    * t - presumably lines from the last message in the conversation

    >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question'])
    'tsem'
    """
    markers = ['e' for _ in lines]
    i = 0
    while i < len(lines):
        if not lines[i].strip():
            markers[i] = 'e'  # empty line
        elif QUOT_PATTERN.match(lines[i]):
            markers[i] = 'm'  # line with quotation marker
        elif RE_FWD.match(lines[i]):
            markers[i] = 'f'  # ---- Forwarded message ----
        else:
            # in case splitter is spread across several lines
            splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))

            if splitter:
                # append as many splitter markers as lines in splitter
                splitter_lines = splitter.group().splitlines()
                for j in range(len(splitter_lines)):
                    markers[i + j] = 's'

                # skip splitter lines
                i += len(splitter_lines) - 1
            else:
                # probably the line from the last message in the conversation
                markers[i] = 't'
        i += 1

    return ''.join(markers) 
Example #27
Source File: parse_tlg_indices.py    From cltk with MIT License
def _check_number(_str):
    """check if the string contains only a number followed by ?"""
    if regex.match(r'^[0-9]+\?*', _str):
        return True
    return False 
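Because the pattern has no trailing anchor, match() only tests a prefix: '123abc' also returns True. regex.fullmatch (or a $ anchor) would enforce a strict "number plus question marks only" reading:

import regex

print(bool(regex.match(r'^[0-9]+\?*', '123abc')))     # True: prefix suffices
print(bool(regex.fullmatch(r'[0-9]+\?*', '123abc')))  # False: whole string required
print(bool(regex.fullmatch(r'[0-9]+\?*', '123?')))    # True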
Example #28
Source File: dataset.py    From talon with Apache License 2.0
def parse_msg_sender(filename, sender_known=True):
    """Given a filename returns the sender and the message.

    Here the message is assumed to be a whole MIME message or just
    message body.

    >>> sender, msg = parse_msg_sender('msg.eml')
    >>> sender, msg = parse_msg_sender('msg_body')

    If you don't want to consider the sender's name in your classification
    algorithm:
    >>> parse_msg_sender(filename, False)
    """
    import sys
    kwargs = {}
    if sys.version_info > (3, 0):
        kwargs["encoding"] = "utf8"

    sender, msg = None, None
    if os.path.isfile(filename) and not is_sender_filename(filename):
        with open(filename, **kwargs) as f:
            msg = f.read()
            sender = u''
            if sender_known:
                sender_filename = build_sender_filename(filename)
                if os.path.exists(sender_filename):
                    with open(sender_filename) as sender_file:
                        sender = sender_file.read().strip()
                else:
                    # if sender isn't found then the next line fails
                    # and it is ok
                    lines = msg.splitlines()
                    for line in lines:
                        match = re.match('From:(.*)', line)
                        if match:
                            sender = match.group(1)
                            break
    return (sender, msg) 
Example #29
Source File: __init__.py    From indic_transliteration with MIT License
def do_vyanjana_svara_join(self, vyanjanaanta, svaraadi):
        import regex
        if regex.match("|".join(self['vowels']) + ".*", svaraadi):
            return vyanjanaanta[:-1] + self.vowel_to_mark_map[svaraadi[0]] + svaraadi[1:]
        else:
            raise ValueError(svaraadi + " is not svaraadi.") 
Example #30
Source File: query.py    From cltk with MIT License
def _regex_span(_regex, _str, case_insensitive=True):
    """Return all matches in an input string.
    :rtype : regex.match.span
    :param _regex: A regular expression pattern.
    :param _str: Text on which to run the pattern.
    """
    if case_insensitive:
        flags = regex.IGNORECASE | regex.FULLCASE | regex.VERSION1
    else:
        flags = regex.VERSION1
    comp = regex.compile(_regex, flags=flags)
    matches = comp.finditer(_str)
    for match in matches:
        yield match
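A closing note on the flags: under the regex module's VERSION1 semantics, combining IGNORECASE with FULLCASE gives full Unicode case-folding, which the standard re module does not support. A small sketch:

import regex

flags = regex.IGNORECASE | regex.FULLCASE | regex.VERSION1
comp = regex.compile(r'straße', flags=flags)
print(bool(comp.match('STRASSE')))  # True: 'ß' case-folds to 'ss'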