Python regex.finditer() Examples

The following are 30 code examples of regex.finditer(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module regex, or try the search function.
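Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the basic call: regex.finditer() returns an iterator of match objects, just like re.finditer(), and each match exposes its groups and span.

import regex

# Iterate over every non-overlapping match and inspect its named group and span.
for m in regex.finditer(r"(?P<word>\w+)", "find every word"):
    print(m.group("word"), m.span())
# find (0, 4)
# every (5, 10)
# word (11, 15)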
Example #1
Source File: autosum_arxiv.py    From autosum with MIT License 6 votes
def get_arxiv_meta_archive(aid):
    title = ''
    authors = []
    jref = ''
    txt = ''
    with tarfile.open("./kddcup2003/hep-th-abs.tar.gz", "r:gz") as t:
        for m in t.getmembers():
            if m.name.find(aid) != -1:
                txt = t.extractfile(m).read()
                break
    for m in regex.finditer(r'Title:\s+(.*)(?=Author)', txt, regex.S):
        title = clean_line(m.group(1))
        break
    for m in regex.finditer(r'Authors?:\s+(.*)(?=Comment)', txt, regex.S):
        a = clean_line(m.group(1))
        authors = regex.split(r'(?:,\s*(?:and\s+)?|\s+and\s+)', a)
        break
    for m in regex.finditer(r'Journal-ref:\s+(.*?)(?=\\\\)', txt, regex.S):
        jref = clean_line(m.group(1))
        break

    return title, authors, jref 
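The three loops above use the same technique: a lookahead stops the capture just before the next metadata header, and the regex.S (DOTALL) flag lets .* run across line breaks. A toy illustration of that technique on an invented abstract string:

import regex

abstract = "Title: A result\non widgets\nAuthors: A. Person\nComments: 3 pages"
# With regex.S, .* crosses newlines; the lookahead (?=Author) stops it before the next field.
for m in regex.finditer(r'Title:\s+(.*)(?=Author)', abstract, regex.S):
    print(repr(m.group(1)))   # 'A result\non widgets\n'
    break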
Example #2
Source File: functional_load.py    From CorpusTools with BSD 3-Clause "New" or "Revised" License 6 votes
def neutralize_with_all_envs(trans, env_filters):
    string = ''.join(trans.with_word_boundaries())
    length = len(string)
    for env_filter in env_filters:
        pattern = env_filter.generate_regular_expression()
        for match in re.finditer(pattern, string, overlapped=True):
            mid_index = match.start('MID')
            temp = ''
            for i in range(length):
                if i == mid_index:
                    s = '-'
                else:
                    s = string[i]
                temp += s
            string = temp
    return string


# This function is weirdly named. It should really be something like
# average_minpair_fl
# It has also been changed so as to have two "relativizer" options:
# one to words containing the relevant segments and one to all
# words in the corpus (though it basically does the calculation
# by calling the above minpair_fl() function). 
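Note that overlapped=True in Example #2 is an extension offered by the third-party regex module (imported as re in that project); the standard library re.finditer() does not accept it. A standalone illustration of the difference:

import regex

# Non-overlapping matching resumes after each match; overlapped=True retries from every position.
print([m.group(0) for m in regex.finditer(r"aa", "aaaa")])                   # ['aa', 'aa']
print([m.group(0) for m in regex.finditer(r"aa", "aaaa", overlapped=True)])  # ['aa', 'aa', 'aa']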
Example #3
Source File: _panphon.py    From panphon with MIT License 6 votes
def compile_regex_from_str(self, ft_str):
        """Given a string describing features masks for a sequence of segments,
        return a regex matching the corresponding strings.

        Args:
            ft_str (str): feature masks, each enclosed in square brackets, in
            which the features are delimited by any standard delimiter.

        Returns:
           Pattern: regular expression pattern equivalent to `ft_str`
        """

        sequence = []
        for m in re.finditer(r'\[([^]]+)\]', ft_str):
            ft_mask = fts(m.group(1))
            segs = self.all_segs_matching_fts(ft_mask)
            sub_pat = '({})'.format('|'.join(segs))
            sequence.append(sub_pat)
        pattern = ''.join(sequence)
        regex = re.compile(pattern)
        return regex 
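The core move in compile_regex_from_str() is to turn each feature matrix into an alternation group of matching segments and concatenate the groups. A standalone sketch of that pattern-building step, with invented segment lists:

import regex

# One alternation group per position, concatenated into a single pattern.
segment_groups = [["p", "b"], ["a", "e", "o"]]
pattern = ''.join('({})'.format('|'.join(group)) for group in segment_groups)
print(pattern)                             # (p|b)(a|e|o)
print(bool(regex.match(pattern, "pa")))    # True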
Example #4
Source File: test_partialparse.py    From ctparse with MIT License 6 votes
def test_partial_parse() -> None:
    match_a = regex.match("(?<R1>a)", "ab")
    match_b = next(regex.finditer("(?<R2>b)", "ab"))

    pp = PartialParse.from_regex_matches(
        (RegexMatch(1, match_a), RegexMatch(2, match_b))
    )

    assert len(pp.prod) == 2
    assert len(pp.rules) == 2

    assert isinstance(pp.score, float)

    def mock_rule(ts: datetime.datetime, a: Time) -> Time:
        return Time()

    pp2 = pp.apply_rule(
        datetime.datetime(day=1, month=1, year=2015), mock_rule, "mock_rule", (0, 1)
    )

    assert pp != pp2

    with pytest.raises(ValueError):
        PartialParse((), ()) 
Example #5
Source File: distance.py    From panphon with MIT License 6 votes
def map_to_dogol_prime(self, s):
        """Map a string to Dogolpolsky' classes

        Args:
            s (unicode): IPA word

        Returns:
            (unicode): word with all segments collapsed to D' classes
        """
        segs = []
        for seg in self.fm.seg_regex.finditer(s):
            fts = self.fm.fts(seg.group(0))
            for mask, label in self.dogol_prime:
                if fts >= mask:
                    segs.append(label)
                    break
        return ''.join(segs) 
Example #6
Source File: __init__.py    From date-extractor with Apache License 2.0 5 votes
def getFirstDateFromText(text, debug=False, default_hour=0, default_minute=0, default_second=0, return_precision=False):
    #print("starting getFirstDateFromText")
    global patterns

    for match in regex.finditer(patterns['date_compiled'], text):
        #print("\nmatch is", match.group(0))
        #print("\nmatch.index is", ([item for item in match.groupdict().items() if item[1]]))
        if not isDefinitelyNotDate(match.group(0)):
            match = dict((k, num(v)) for k, v in match.groupdict().items() if num(v))
            return datetime_from_dict(match, debug, default_hour, default_minute, default_second, return_precision)
    #print "finishing getFirstDateFromText"

# the date of a webpage, like a blog or article, will often be the first date mentioned 
Example #7
Source File: tokenizer.py    From SoMaJo with GNU General Public License v3.0 5 votes
def _split_emojis(self, node, token_class="emoticon"):
        boundaries = []
        for m in re.finditer(r"\X", node.value.text):
            if m.end() - m.start() > 1:
                if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}\uFE0F]", m.group()):
                    boundaries.append((m.start(), m.end(), None))
            else:
                if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}]", m.group()):
                    boundaries.append((m.start(), m.end(), None))
        self._split_on_boundaries(node, boundaries, token_class) 
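The \X escape used here is another regex-module extension: it matches one extended grapheme cluster, so a base character plus its combining marks (or an emoji plus its modifiers) comes back as a single match. A standalone illustration:

import regex

text = "cafe\u0301 👍"
# 'e' + COMBINING ACUTE ACCENT is two code points but a single grapheme cluster.
print([m.group() for m in regex.finditer(r"\X", text)])
# ['c', 'a', 'f', 'é', ' ', '👍']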
Example #8
Source File: test_types.py    From ctparse with MIT License 5 votes
def test_init(self):
        m = next(regex.finditer(r"(?P<R1>match me)", "xxx match me xxx"))
        r = RegexMatch(1, m)
        self.assertEqual(r.mstart, 4)
        self.assertEqual(r.mend, 12)
        self.assertEqual(len(r), 8)
        self.assertEqual(r._text, "match me")
        self.assertEqual(repr(r), "RegexMatch[4-12]{1:match me}")
        self.assertEqual(r.nb_str(), "RegexMatch[]{1:match me}") 
Example #9
Source File: test_rule.py    From ctparse with MIT License 5 votes
def test_regex_match(self):
        m = next(regex.finditer("(?P<R1>x)", "x"))
        r = RegexMatch(1, m)
        self.assertTrue(regex_match(1)(r))
        self.assertFalse(regex_match(1)(TestClassA())) 
Example #10
Source File: distance.py    From panphon with MIT License 5 votes
def ftstr2dict(ftstr):
    fts = {}
    for m in re.finditer(r'([-0+])(\w+)', ftstr):
        v, k = m.groups()
        fts[k] = {'-': -1, '0': 0, '+': 1}[v]
    return fts 
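Reading the loop off: r'([-0+])(\w+)' picks up each signed feature name, and the mapping turns the sign into -1, 0, or 1. Assuming the function above is in scope, an illustrative call would give:

print(ftstr2dict("+son -syl 0cor"))
# {'son': 1, 'syl': -1, 'cor': 0}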
Example #11
Source File: _panphon.py    From panphon with MIT License 5 votes
def segment_text(text, seg_regex=SEG_REGEX):
    """Return an iterator of segments in the text.

    Args:
        text (unicode): string of IPA Unicode text
        seg_regex (_regex.Pattern): compiled regex defining a segment (base +
                                    modifiers)

    Return:
        generator: segments in the input text
    """
    for m in seg_regex.finditer(text):
        yield m.group(0) 
Example #12
Source File: _panphon.py    From panphon with MIT License 5 votes
def fts(s):
    """Given string `s` with +/-[alphabetical sequence]s, return list of features.

    Args:
        s (str): string with segments of the sort "+son -syl 0cor"

    Return:
        list: list of (value, feature) tuples
    """
    return [m.groups() for m in FT_REGEX.finditer(s)] 
Example #13
Source File: _panphon.py    From panphon with MIT License 5 votes
def pat(p):
    """Given a string `p` with feature matrices (features grouped with square
    brackets into segments, return a list of sets of (value, feature) tuples.

    Args:
        p (str): list of feature matrices as strings

    Return:
        list: list of sets of (value, feature) tuples
    """
    pattern = []
    for matrix in [m.group(0) for m in MT_REGEX.finditer(p)]:
        segment = set([m.groups() for m in FT_REGEX.finditer(matrix)])
        pattern.append(segment)
    return pattern 
Example #14
Source File: _panphon.py    From panphon with MIT License 5 votes
def filter_string(self, word):
        """Return a string like the input but containing only legal IPA segments

        Args:
            word (unicode): input string to be filtered

        Returns:
            unicode: string identical to `word` but with invalid IPA segments
                     absent

        """
        segs = [m.group(0) for m in self.seg_regex.finditer(word)]
        return ''.join(segs) 
Example #15
Source File: segment.py    From panphon with MIT License 5 votes
def __init__(self, names, features={}, ftstr='', weights=None):
        """Construct a `Segment` object

        Args:
            names (list): ordered list of feature names
            features (dict): name-value pairs for specified features
            ftstr (unicode): a string, each /(+|0|-)\w+/ sequence of which is
                             interpreted as a feature specification
            weights (float): order list of feature weights/saliences
            """
        self.n2s = {-1: '-', 0: '0', 1: '+'}
        self.s2n = {k: v for (v, k) in self.n2s.items()}
        self.names = names
        """Set a feature specification"""
        self.data = {}
        for name in names:
            if name in features:
                self.data[name] = features[name]
            else:
                self.data[name] = 0
        for m in re.finditer(r'(\+|0|-)(\w+)', ftstr):
            v, k = m.groups()
            self.data[k] = self.s2n[v]
        if weights:
            self.weights = weights
        else:
            self.weights = [1 for _ in names] 
Example #16
Source File: event_tagger.py    From estnltk with GNU General Public License v2.0 5 votes
def _match(self, text):
        matches = []
        if self.mapping:
            seq = self.map.keys()
        else:
            seq = self.regex_sequence

        for r in seq:
            for matchobj in re.finditer(r, text, overlapped=True):
                groups = (matchobj.groupdict())
                result = {
                    'start': matchobj.start(),
                    'end': matchobj.end(),
                    'regex': r,
                    'groups':groups
                }

                if self.mapping:
                    for k, v in self.map[r].items():
                        if k not in result.keys():
                            result[k] = v

                matches.append(
                    result
                )

        return matches 
Example #17
Source File: run_coqa.py    From FlowDelta with MIT License 5 votes
def split_with_span(s):
    if s.split() == []:
        return [], []
    else:
        return zip(*[(m.group(0), (m.start(), m.end()-1)) for m in re.finditer(r'\S+', s)]) 
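Assuming the function above is in scope, a quick illustration of its output; note that the end offsets are inclusive because of the -1:

tokens, spans = split_with_span("to be or")
print(tokens)   # ('to', 'be', 'or')
print(spans)    # ((0, 1), (3, 4), (6, 7))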
Example #18
Source File: tokenizer.py    From SoMaJo with GNU General Public License v3.0 5 votes
def _split_left(self, regex, node):
        boundaries = []
        prev_end = 0
        for m in regex.finditer(node.value.text):
            boundaries.append((prev_end, m.start(), None))
            prev_end = m.start()
        self._split_on_boundaries(node, boundaries, token_class=None, lock_match=False) 
Example #19
Source File: tokenizer.py    From SoMaJo with GNU General Public License v3.0 5 votes
def _split_matches(self, regex, node, token_class="regular", repl=None, split_named_subgroups=True, delete_whitespace=False):
        boundaries = []
        split_groups = split_named_subgroups and len(regex.groupindex) > 0
        group_numbers = sorted(regex.groupindex.values())
        for m in regex.finditer(node.value.text):
            if split_groups:
                for g in group_numbers:
                    if m.span(g) != (-1, -1):
                        boundaries.append((m.start(g), m.end(g), None))
            else:
                if repl is None:
                    boundaries.append((m.start(), m.end(), None))
                else:
                    boundaries.append((m.start(), m.end(), m.expand(repl)))
        self._split_on_boundaries(node, boundaries, token_class, delete_whitespace=delete_whitespace) 
Example #20
Source File: tokenizer.py    From SoMaJo with GNU General Public License v3.0 5 votes
def _split_set(self, regex, node, items, token_class="regular", ignore_case=False):
        boundaries = []
        for m in regex.finditer(node.value.text):
            instance = m.group(0)
            if ignore_case:
                instance = instance.lower()
            if instance in items:
                boundaries.append((m.start(), m.end(), None))
        self._split_on_boundaries(node, boundaries, token_class) 
Example #21
Source File: tokenizer.py    From SoMaJo with GNU General Public License v3.0 5 votes
def _split_left(self, regex, node):
        boundaries = []
        prev_end = 0
        for m in regex.finditer(node.value.text):
            boundaries.append((prev_end, m.start(), None))
            prev_end = m.start()
        self._split_on_boundaries(node, boundaries, token_class=None, lock_match=False) 
Example #22
Source File: tokenizer.py    From SoMaJo with GNU General Public License v3.0 5 votes
def _split_abbreviations(self, token_dll, split_multipart_abbrevs=True):
        """Turn instances of abbreviations into tokens."""
        self._split_all_matches(self.single_letter_ellipsis, token_dll, "abbreviation")
        self._split_all_matches(self.and_cetera, token_dll, "abbreviation")
        self._split_all_matches(self.str_abbreviations, token_dll, "abbreviation")
        self._split_all_matches(self.nr_abbreviations, token_dll, "abbreviation")
        self._split_all_matches(self.single_token_abbreviation, token_dll, "abbreviation")
        self._split_all_matches(self.single_letter_abbreviation, token_dll, "abbreviation")
        self._split_all_matches(self.ps, token_dll, "abbreviation")

        for t in token_dll:
            if t.value.markup or t.value._locked:
                continue
            boundaries = []
            for m in self.abbreviation.finditer(t.value.text):
                instance = m.group(0)
                if split_multipart_abbrevs and self.multipart_abbreviation.fullmatch(instance):
                    start, end = m.span(0)
                    s = start
                    for i, c in enumerate(instance, start=1):
                        if c == ".":
                            boundaries.append((s, start + i, None))
                            s = start + i
                else:
                    boundaries.append((m.start(), m.end(), None))
            self._split_on_boundaries(t, boundaries, "abbreviation") 
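The inner loop slices a multipart abbreviation at every period so that each part becomes its own boundary. A standalone sketch of just that slicing step, with an invented abbreviation and offset:

instance, start = "z.B.", 10   # pretend the abbreviation was matched at offset 10
boundaries, s = [], start
for i, c in enumerate(instance, start=1):
    if c == ".":
        boundaries.append((s, start + i, None))
        s = start + i
print(boundaries)   # [(10, 12, None), (12, 14, None)]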
Example #23
Source File: tokenizer.py    From SoMaJo with GNU General Public License v3.0 5 votes
def _split_matches(self, regex, node, token_class="regular", repl=None, split_named_subgroups=True, delete_whitespace=False):
        boundaries = []
        split_groups = split_named_subgroups and len(regex.groupindex) > 0
        group_numbers = sorted(regex.groupindex.values())
        for m in regex.finditer(node.value.text):
            if split_groups:
                for g in group_numbers:
                    if m.span(g) != (-1, -1):
                        boundaries.append((m.start(g), m.end(g), None))
            else:
                if repl is None:
                    boundaries.append((m.start(), m.end(), None))
                else:
                    boundaries.append((m.start(), m.end(), m.expand(repl)))
        self._split_on_boundaries(node, boundaries, token_class, delete_whitespace=delete_whitespace) 
Example #24
Source File: tokenizer.py    From SoMaJo with GNU General Public License v3.0 5 votes
def _split_emojis(self, node, token_class="emoticon"):
        boundaries = []
        for m in re.finditer(r"\X", node.value.text):
            if m.end() - m.start() > 1:
                if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}\uFE0F]", m.group()):
                    boundaries.append((m.start(), m.end(), None))
            else:
                if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}]", m.group()):
                    boundaries.append((m.start(), m.end(), None))
        self._split_on_boundaries(node, boundaries, token_class) 
Example #25
Source File: tokenizer.py    From SoMaJo with GNU General Public License v3.0 5 votes
def _split_set(self, regex, node, items, token_class="regular", ignore_case=False):
        boundaries = []
        for m in regex.finditer(node.value.text):
            instance = m.group(0)
            if ignore_case:
                instance = instance.lower()
            if instance in items:
                boundaries.append((m.start(), m.end(), None))
        self._split_on_boundaries(node, boundaries, token_class) 
Example #26
Source File: tokenizer.py    From SoMaJo with GNU General Public License v3.0 5 votes
def _split_abbreviations(self, token_dll, split_multipart_abbrevs=True):
        """Turn instances of abbreviations into tokens."""
        self._split_all_matches(self.single_letter_ellipsis, token_dll, "abbreviation")
        self._split_all_matches(self.and_cetera, token_dll, "abbreviation")
        self._split_all_matches(self.str_abbreviations, token_dll, "abbreviation")
        self._split_all_matches(self.nr_abbreviations, token_dll, "abbreviation")
        self._split_all_matches(self.single_token_abbreviation, token_dll, "abbreviation")
        self._split_all_matches(self.single_letter_abbreviation, token_dll, "abbreviation")
        self._split_all_matches(self.ps, token_dll, "abbreviation")

        for t in token_dll:
            if t.value.markup or t.value._locked:
                continue
            boundaries = []
            for m in self.abbreviation.finditer(t.value.text):
                instance = m.group(0)
                if split_multipart_abbrevs and self.multipart_abbreviation.fullmatch(instance):
                    start, end = m.span(0)
                    s = start
                    for i, c in enumerate(instance, start=1):
                        if c == ".":
                            boundaries.append((s, start + i, None))
                            s = start + i
                else:
                    boundaries.append((m.start(), m.end(), None))
            self._split_on_boundaries(t, boundaries, "abbreviation") 
Example #27
Source File: strtools.py    From extratools with MIT License 5 votes
def extract(s: str, entities: Iterable[str], useregex=False, ignorecase=True) -> Iterable[str]:
    for m in re.compile(
            r"\b(?:{})\b".format(r"|".join(
                e if useregex else re.escape(e).replace(' ', r"\s+") for e in entities
            )),
            re.I if ignorecase else 0
        ).finditer(s):
        yield m.group(0) 
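Here the entity list is collapsed into one alternation wrapped in word boundaries (spaces inside multi-word entities are relaxed to whitespace runs). Assuming re is the regex module, as elsewhere on this page, and the function above is importable, a sketch of the behaviour:

print(list(extract("Visit Boston or Toronto today", ["boston", "toronto"])))
# ['Boston', 'Toronto']   (ignorecase=True by default, so the matched text keeps its original case)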
Example #28
Source File: strtools.py    From extratools with MIT License 5 votes
def __findeqtagpairspans(
        s: str,
        tag: str,
        useregex: bool = False
    ) -> Iterable[Tuple[Tuple[int, int], ...]]:
    for match in re.finditer(r"(?P<__open>{})(?P<__content>.*?)(?P<__close>\1)".format(tag if useregex else re.escape(tag)), s):
        yield (match.span("__open"), match.span("__content"), match.span("__close")) 
Example #29
Source File: strtools.py    From extratools with MIT License 5 votes
def __findtagpairspans(
        s: str,
        tag: str, closetag: Optional[str] = None,
        useregex: bool = False
    ) -> Iterable[Tuple[Tuple[int, int], ...]]:
    if closetag is None or tag == closetag:
        yield from __findeqtagpairspans(s, tag, useregex=useregex)
        return

    if not useregex:
        tag = re.escape(tag)
        closetag = re.escape(closetag)

    retags = re.compile(r"(?P<__open>{})|(?P<__close>{})".format(tag, closetag))

    startspans = []

    for match in retags.finditer(s):
        opengroup = match.group("__open")
        if opengroup:
            startspans.append(match.span())
            continue

        closegroup = match.group("__close")
        if closegroup and startspans:
            startspan = startspans.pop()
            endspan = match.span()

            yield (startspan, (startspan[1], endspan[0]), endspan) 
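The function keeps a stack of open-tag spans, so nested pairs are yielded innermost-first, each as a triple of (open span, content span, close span). A small illustration with invented tags (the function is module-private, so the call assumes it is in scope):

for spans in __findtagpairspans("<b>bold <b>inner</b></b>", "<b>", "</b>"):
    print(spans)
# ((8, 11), (11, 16), (16, 20))   inner pair first
# ((0, 3), (3, 20), (20, 24))     then the enclosing pair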
Example #30
Source File: strtools.py    From extratools with MIT License 5 votes
def extract(s: str, entities: Iterable[str], useregex=False, ignorecase=True) -> Iterable[str]:
    for m in re.compile(
            r"\b(?:{})\b".format(r"|".join(
                e if useregex else re.escape(e).replace(' ', r"\s+") for e in entities
            )),
            re.I if ignorecase else 0
        ).finditer(s):
        yield m.group(0)