Python regex.escape() Examples

The following are 22 code examples of regex.escape(). You can go to the original project or source file by following the link above each example, or browse all available functions and classes of the regex module.
Example #1
Source File: utils.py    From dragonfly with GNU Lesser General Public License v3.0
def _phrase_to_regex(phrase):
    # Treat whitespace between words as meaning anything other than alphanumeric
    # characters.
    pattern = r"[^\w--_]+".join(regex.escape(word) for word in phrase.split())
    # Treat spaces at the beginning or end of the phrase as matching any
    # whitespace character. This makes it easy to select stuff like non-breaking
    # space, which occurs frequently in browsers.
    # TODO Support newlines. Note that these are frequently implemented as
    # separate text nodes in the accessibility tree, so the obvious
    # implementation would not work well.
    if phrase == " ":
        pattern = r"\s"
    else:
        if phrase.startswith(" "):
            pattern = r"\s" + pattern
        if phrase.endswith(" "):
            pattern = pattern + r"\s"
    # Only match at boundaries of alphanumeric sequences if the phrase ends
    # are alphanumeric.
    if regex.search(r"^[\w--_]", phrase, regex.VERSION1 | regex.UNICODE):
        pattern = r"(?<![\w--_])" + pattern
    if regex.search(r"[\w--_]$", phrase, regex.VERSION1 | regex.UNICODE):
        pattern = pattern + r"(?![\w--_])"
    return pattern 
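A minimal usage sketch (the sample phrase and text are illustrative; VERSION1 is needed for the set subtraction in [\w--_]):

import regex

pattern = _phrase_to_regex("hello world")
m = regex.search(pattern, "say hello,\u00a0world!", regex.VERSION1 | regex.UNICODE)
print(m.group(0))  # 'hello,\xa0world' -- the non-breaking space is matched too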
Example #2
Source File: autosum_arxiv.py    From autosum with MIT License
def search_citing_sentences(aid, txt, match):
    lines = txt.split('\n')
    txt = ' '.join(lines)
    txt = ' '.join(txt.split())
    sentences = split_sentences(txt)
    founds = set()
    for r in match.keys():
        if r:
            regexp_list = [regex.escape('\\cite%s' % r),
                           regex.escape('\\refs{%s}' % r),
                           r'(?<!(bibitem|lref).*?)' + regex.escape(r)]
            print(aid, r)
            for regexp in regexp_list:
                results = search_citation(sentences, regexp)
                founds.update(results)
                print("Regex: '{0!s}', Found: {1:d}".format(regexp, len(results)))
                if len(results):
                    break
    print("_" * 50)
    return founds 
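Note the variable-width lookbehind in the third pattern: stdlib re rejects it, which is one reason the third-party regex module is used here. For instance:

import re
import regex

try:
    re.compile(r'(?<!(bibitem|lref).*?)foo')
except re.error as exc:
    print(exc)  # look-behind requires fixed-width pattern

regex.compile(r'(?<!(bibitem|lref).*?)foo')  # compiles fine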
Example #3
Source File: test_black.py    From black with MIT License
def test_expression_diff(self) -> None:
        source, _ = read_data("expression.py")
        expected, _ = read_data("expression.diff")
        tmp_file = Path(black.dump_to_file(source))
        diff_header = re.compile(
            rf"{re.escape(str(tmp_file))}\t\d\d\d\d-\d\d-\d\d "
            r"\d\d:\d\d:\d\d\.\d\d\d\d\d\d \+\d\d\d\d"
        )
        try:
            result = BlackRunner().invoke(black.main, ["--diff", str(tmp_file)])
            self.assertEqual(result.exit_code, 0)
        finally:
            os.unlink(tmp_file)
        actual = result.output
        actual = diff_header.sub(DETERMINISTIC_HEADER, actual)
        actual = actual.rstrip() + "\n"  # the diff output has a trailing space
        if expected != actual:
            dump = black.dump_to_file(actual)
            msg = (
                "Expected diff isn't equal to the actual. If you made changes to"
                " expression.py and this is an anticipated difference, overwrite"
                f" tests/data/expression.diff with {dump}"
            )
            self.assertEqual(expected, actual, msg) 
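The escaping trick in isolation, with a hypothetical stand-in for the dump file path:

import re

tmp_file = "/tmp/blk_abc123.log"  # hypothetical path
diff_header = re.compile(
    rf"{re.escape(tmp_file)}\t\d\d\d\d-\d\d-\d\d "
    r"\d\d:\d\d:\d\d\.\d\d\d\d\d\d \+\d\d\d\d"
)
print(bool(diff_header.match("/tmp/blk_abc123.log\t2020-01-01 12:00:00.000000 +0000")))  # True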
Example #4
Source File: util.py    From urduhack with MIT License
def remove_punctuation(text: str, marks=None) -> str:
    """
    Remove punctuation from ``text`` by removing all instances of ``marks``.

    Args:
        text (str): Urdu text
        marks (str): If specified, remove only the characters in this string,
            e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.
            Otherwise, all punctuation marks are removed.
    Returns:
        str: returns a ``str`` object containing normalized text.
    Note:
        When ``marks=None``, Python's built-in :meth:`str.translate()` is
        used to remove punctuation; otherwise, a regular expression is used
        instead. The former is roughly 5-10x faster.
    Examples:
        >>> from urduhack.preprocessing import remove_punctuation
        >>> output = remove_punctuation("کر ؟ سکتی ہے۔")
        >>> print(output)
        کر سکتی ہے

    """
    if marks:
        return re.sub('[{}]+'.format(re.escape(marks)), '', text, flags=re.UNICODE)

    return text.translate(PUNCTUATION_TRANSLATE_UNICODE) 
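A minimal sketch of the marks branch, using Latin text for illustration:

import re

text = "Hello, world; how are you?"
print(re.sub('[{}]+'.format(re.escape(',;?')), '', text, flags=re.UNICODE))
# Hello world how are you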
Example #5
Source File: intdict.py    From mwic with MIT License
def expand(self, s):
        if not self._defs:
            return s
        if self._regex is not None:
            regex = self._regex
            substs = self._substs
        else:
            substs = []
            regex = []
            for i, (name, definition) in enumerate(self._defs.items()):
                substs += [definition]
                regex += ['(?P<mwic{i}>{name})'.format(i=i, name=re.escape(name))]
            regex = '|'.join(regex)
            regex = re.compile(regex)
            self._regex = regex
            self._substs = substs
        assert self._regex is not None
        assert self._substs is not None
        def replace(match):
            for i, subst in enumerate(substs):
                if match.group('mwic{i}'.format(i=i)) is not None:
                    return subst
            assert False  # no coverage
        return self._regex.sub(replace, s) 
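The same technique outside the class, with a hypothetical abbreviation table:

import re

defs = {"e.g.": "for example", "i.e.": "that is"}  # hypothetical definitions
substs = list(defs.values())
pattern = re.compile('|'.join(
    '(?P<mwic{i}>{name})'.format(i=i, name=re.escape(name))
    for i, name in enumerate(defs)))

def replace(match):
    for i, subst in enumerate(substs):
        if match.group('mwic{}'.format(i)) is not None:
            return subst

print(pattern.sub(replace, "Cite sources, e.g. books."))
# Cite sources, for example books.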
Example #6
Source File: phrase2vec.py    From mat2vec with MIT License
def exclude_words(phrasegrams, words):
    """Given a list of words, excludes those from the keys of the phrase dictionary."""
    new_phrasegrams = {}
    words_re_list = []
    for word in words:
        we = regex.escape(word)
        words_re_list.append("^" + we + "$|^" + we + "_|_" + we + "$|_" + we + "_")
    word_reg = regex.compile(r""+"|".join(words_re_list))
    for gram in tqdm(phrasegrams):
        valid = True
        for sub_gram in gram:
            if word_reg.search(sub_gram.decode("unicode_escape", "ignore")) is not None:
                valid = False
                break
        if valid:
            new_phrasegrams[gram] = phrasegrams[gram]
    return new_phrasegrams
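A quick check of the per-word pattern, which matches a word only as a whole token inside underscore-joined phrases:

import regex

we = regex.escape("gas")
word_reg = regex.compile("^" + we + "$|^" + we + "_|_" + we + "$|_" + we + "_")
print(bool(word_reg.search("natural_gas")))  # True
print(bool(word_reg.search("gasoline")))     # False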
Example #7
Source File: strtools.py    From extratools with MIT License
def extract(s: str, entities: Iterable[str], useregex=False, ignorecase=True) -> Iterable[str]:
    for m in re.compile(
            r"\b(?:{})\b".format(r"|".join(
                e if useregex else re.escape(e).replace(' ', r"\s+") for e in entities
            )),
            re.I if ignorecase else 0
        ).finditer(s):
        yield m.group(0) 
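A minimal usage sketch (assumes Python 3.7+, where re.escape leaves spaces unescaped so the replace call can see them):

print(list(extract("The New  York skyline", ["new york"])))  # ['New  York']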
Example #8
Source File: cache.py    From synapse with Apache License 2.0
def regexizeTagGlob(tag):
    '''
    Returns:
        a regular expression string with ** and * interpreted as tag globs

    Precondition:
        tag is a valid tagmatch

    Notes:
        A single asterisk will replace exactly one dot-delimited component of a tag
        A double asterisk will replace one or more of any character.

        The returned string does not contain a starting '^' or trailing '$'.
    '''
    return ReRegex.sub(lambda m: r'([^.]+?)' if m.group(1) is None else r'(.+)', regex.escape(tag)) 
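ReRegex is defined elsewhere in the module; a plausible stand-in (an assumption, not the actual synapse source) that matches the escaped forms of * and **:

import regex

ReRegex = regex.compile(r'\\\*(\\\*)?')  # assumed definition

print(regexizeTagGlob('foo.*.bar'))  # foo\.([^.]+?)\.bar
print(regexizeTagGlob('foo.**'))     # foo\.(.+)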
Example #9
Source File: util.py    From MicroTokenizer with MIT License
def prints(*texts, **kwargs):
    """Print formatted message (manual ANSI escape sequences to avoid
    dependency)

    *texts (unicode): Texts to print. Each argument is rendered as paragraph.
    **kwargs: 'title' becomes coloured headline. exits=True performs sys exit.
    """
    exits = kwargs.get('exits', None)
    title = kwargs.get('title', None)
    title = '\033[93m{}\033[0m\n'.format(_wrap(title)) if title else ''
    message = '\n\n'.join([_wrap(text) for text in texts])
    print('\n{}{}\n'.format(title, message))
    if exits is not None:
        sys.exit(exits) 
Example #10
Source File: util.py    From MicroTokenizer with MIT License
def compile_prefix_regex(entries):
    if '(' in entries:
        # Handle deprecated data
        expression = '|'.join(['^' + re.escape(piece)
                               for piece in entries if piece.strip()])
        return re.compile(expression)
    else:
        expression = '|'.join(['^' + piece
                               for piece in entries if piece.strip()])
        return re.compile(expression) 
Example #11
Source File: util.py    From MicroTokenizer with MIT License
def read_regex(path):
    path = ensure_path(path)
    with path.open() as file_:
        entries = file_.read().split('\n')
    expression = '|'.join(['^' + re.escape(piece)
                           for piece in entries if piece.strip()])
    return re.compile(expression) 
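Like compile_prefix_regex above, this boils down to anchoring each escaped literal and joining with alternation; with hypothetical entries:

import re

entries = ['(', '"', '$']  # hypothetical prefix characters
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
print(re.compile(expression).match('$100').group(0))  # $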
Example #12
Source File: lexer.py    From pythonparser with MIT License
def _replace_escape_bytes(self, value):
        chunks = []
        offset = 0
        while offset < len(value):
            match = self._lex_escape_re.search(value, offset)
            if match is None:
                # Append the remainder of the string
                chunks.append(value[offset:])
                break

            # Append the part of the string before the match
            chunks.append(value[offset:match.start()])
            offset = match.end()

            # Process the escape
            if match.group(1) is not None: # single-char
                chr = match.group(1)
                if chr == b"\n":
                    pass
                elif chr == b"\\" or chr == b"'" or chr == b"\"":
                    chunks.append(chr)
                elif chr == b"a":
                    chunks.append(b"\a")
                elif chr == b"b":
                    chunks.append(b"\b")
                elif chr == b"f":
                    chunks.append(b"\f")
                elif chr == b"n":
                    chunks.append(b"\n")
                elif chr == b"r":
                    chunks.append(b"\r")
                elif chr == b"t":
                    chunks.append(b"\t")
                elif chr == b"v":
                    chunks.append(b"\v")
            elif match.group(2) is not None: # oct
                chunks.append(byte(int(match.group(2), 8)))
            elif match.group(3) is not None: # hex
                chunks.append(byte(int(match.group(3), 16)))

        return b"".join(chunks) 
Example #13
Source File: _wikilist.py    From wikitextparser with GNU General Public License v3.0
def convert(self, newstart: str) -> None:
        """Convert to another list type by replacing starting pattern."""
        match = self._match
        ms = match.start()
        for s, e in reversed(match.spans('pattern')):
            self[s - ms:e - ms] = newstart
        self.pattern = escape(newstart) 
Example #14
Source File: dictionary.py    From dateparser with BSD 3-Clause "New" or "Revised" License
def _construct_split_regex(self):
        known_words_group = "|".join(map(re.escape, self._get_sorted_words_from_cache()))
        if self._no_word_spacing:
            regex = r"^(.*?)({})(.*)$".format(known_words_group)
        else:
            regex = r"^(.*?(?:\A|\W|_|\d))({})((?:\Z|\W|_|\d).*)$".format(known_words_group)
        self._split_regex_cache.setdefault(
            self._settings.registry_key, {})[self.info['name']] = \
            re.compile(regex, re.UNICODE | re.IGNORECASE) 
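A sketch of the word-spacing branch with hypothetical known words:

import re

known_words_group = "|".join(map(re.escape, ["january", "feb."]))
pattern = re.compile(
    r"^(.*?(?:\A|\W|_|\d))({})((?:\Z|\W|_|\d).*)$".format(known_words_group),
    re.UNICODE | re.IGNORECASE)
print(pattern.match("15 january 2024").groups())  # ('15 ', 'january', ' 2024')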
Example #15
Source File: xsampa.py    From panphon with MIT License
def read_xsampa_table(self):
        filename = os.path.join('data', 'ipa-xsampa.csv')
        filename = pkg_resources.resource_filename(__name__, filename)
        with open(filename, 'rb') as f:
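            # NOTE: csv here is presumably the third-party unicodecsv module,
            # since the stdlib csv.reader() accepts no encoding argument.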
            xs2ipa = {x[1]: x[0] for x in csv.reader(f, encoding='utf-8')}
        xs = sorted(xs2ipa.keys(), key=len, reverse=True)
        xs_regex = re.compile('|'.join(list(map(re.escape, xs))))
        return xs_regex, xs2ipa 
Example #16
Source File: codec.py    From kraken with Apache License 2.0
def __init__(self, charset: Union[Dict[str, Sequence[int]], Sequence[str], str]) -> None:
        """
        Builds a codec converting between graphemes/code points and integer
        label sequences.

        charset may either be a string, a list or a dict. In the first case
        each code point will be assigned a label, in the second case each
        string in the list will be assigned a label, and in the final case each
        key string will be mapped to the value sequence of integers. In the
        first two cases labels will be assigned automatically.

        As 0 is the blank label in a CTC output layer, output labels and input
        dictionaries are/should be 1-indexed.

        Args:
            charset (unicode, list, dict): Input character set.
        """
        if isinstance(charset, dict):
            self.c2l = charset
        else:
            self.c2l = {k: [v] for v, k in enumerate(sorted(charset), start=1)}
        # map integer labels to code points because regex only works with strings
        self.l2c = {}  # type: Dict[str, str]
        for k, v in self.c2l.items():
            self.l2c[''.join(chr(c) for c in v)] = k

        # sort prefixes for c2l regex
        self.c2l_regex = regex.compile(r'|'.join(
            regex.escape(x) for x in sorted(self.c2l.keys(), key=len, reverse=True)))
        # sort prefixes for l2c regex
        self.l2c_regex = regex.compile(r'|'.join(
            regex.escape(x) for x in sorted(self.l2c.keys(), key=len, reverse=True)))
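Sorting the alternatives longest-first matters: a shorter key that is a prefix of a longer one would otherwise always win. A quick illustration:

import regex

pat = regex.compile('|'.join(
    regex.escape(x) for x in sorted(["c", "ch", "h"], key=len, reverse=True)))
print(pat.findall("chc"))  # ['ch', 'c']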
Example #17
Source File: strtools.py    From extratools with MIT License
def __findeqtagpairspans(
        s: str,
        tag: str,
        useregex: bool = False
    ) -> Iterable[Tuple[Tuple[int, int], ...]]:
    pattern = r"(?P<__open>{})(?P<__content>.*?)(?P<__close>\1)".format(
        tag if useregex else re.escape(tag))
    for match in re.finditer(pattern, s):
        yield (match.span("__open"), match.span("__content"), match.span("__close"))
Example #18
Source File: strtools.py    From extratools with MIT License 5 votes vote down vote up
def __findtagpairspans(
        s: str,
        tag: str, closetag: Optional[str] = None,
        useregex: bool = False
    ) -> Iterable[Tuple[Tuple[int, int], ...]]:
    if closetag is None or tag == closetag:
        yield from __findeqtagpairspans(s, tag, useregex=useregex)
        return

    if not useregex:
        tag = re.escape(tag)
        closetag = re.escape(closetag)

    retags = re.compile(r"(?P<__open>{})|(?P<__close>{})".format(tag, closetag))

    startspans = []

    for match in retags.finditer(s):
        opengroup = match.group("__open")
        if opengroup:
            startspans.append(match.span())
            continue

        closegroup = match.group("__close")
        if closegroup and startspans:
            startspan = startspans.pop()
            endspan = match.span()

            yield (startspan, (startspan[1], endspan[0]), endspan) 
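A quick check with parentheses (innermost pairs are yielded first):

for spans in __findtagpairspans("(a(b)c)", "(", ")"):
    print(spans)
# ((2, 3), (3, 4), (4, 5))
# ((0, 1), (1, 6), (6, 7))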
Example #19
Source File: utils.py    From chepy with GNU General Public License v3.0
def escape_string(self):
        """Escape all special characters in a string
        
        Returns:
            Chepy: The Chepy object.
        """
        self.state = re.escape(self._convert_to_str())
        return self 
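For example (using .o, chepy's output attribute):

>>> from chepy import Chepy
>>> Chepy("(a*b)").escape_string().o
'\\(a\\*b\\)'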
Example #20
Source File: scorer.py    From nmt-chatbot with GNU General Public License v3.0
def check_urls(index, question, answer):
    global full_sentence_valid_url

    full_sentence_valid_url = False
    valid_url = False

    # Disabled
    if score_settings['incorrect_url_modifier_value'] is None:
        return 0

    # Find all URLs in the sentence
    for url in re.finditer('http(?:s?):(//([^/]*?)/(?:[^ ])*?(?=$|[' + re.escape(score_settings['url_delimiters']) + ']))?', answer):

        # Check if result is in cache already and return it
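        # (url_cache is defined elsewhere; indexing it without a membership
        # test suggests it is a defaultdict keyed by URL)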
        if url_cache[url.group(0)][1] > time.time():
            if url_cache[url.group(0)][0] == 0:
                return score_settings['incorrect_url_modifier_value']

        # Url not in cache - check it
        else:

            # Send HEAD request and check HTTP response code
            try:
                request = requests.head(url.group(0))
                code = request.status_code
            except Exception as e:
                code = 0

            # Add to cache
            url_cache[url.group(0)] = [1 if code == 200 else 0, time.time() + 86400]

            # If code is different than 200 - return modifier value
            if code != 200:
                return score_settings['incorrect_url_modifier_value']

        # Check if it's full sentence url
        valid_url = (len(url.group(0)) == len(answer))

    # Everything ok, set if full sentence url and return 0
    full_sentence_valid_url = valid_url
    return 0
Example #21
Source File: version.py    From synapse with Apache License 2.0
def parseVersionParts(text, seps=vseps):
    '''
    Extract a list of major/minor/version integer strings from a string.

    Args:
        text (str): String to parse
        seps (tuple): A tuple or list of separators to use when parsing the version string.

    Examples:
        Parse a simple version string into a major and minor parts::

            parts = parseVersionParts('1.2')

        Parse a complex version string into a major and minor parts::

            parts = parseVersionParts('wowsoft_1.2')

        Parse a simple version string into a major, minor and patch parts.  Parts after the "3." are dropped from the
        results::

            parts = parseVersionParts('1.2.3.4.5')

    Notes:
        This attempts to brute force out integers from the version string by stripping any leading ascii letters and
        part separators, and then regexing out numeric parts optionally followed by part separators.  It will stop at
        the first mixed-character part encountered.  For example, "1.2-3a" would only parse out the "1" and "2" from
        the string.

    Returns:
        dict: A dictionary containing up to three keys ('major', 'minor' and 'patch'), or None if no integer parts could be parsed.
    '''
    # Join seps together
    seps = ''.join(seps)
    # Strip whitespace
    text = text.strip()
    # Strip off leading chars
    text = text.lstrip(string.ascii_letters)
    # Strip off any leading separator which may be present
    text = text.lstrip(seps)
    pattern = r'^(\d+)([{}]+|$)'.format(regex.escape(seps))
    parts = []
    ret = {}
    off = 0
    while True:
        m = regex.search(pattern, text[off:])
        if not m:
            break
        off += m.end()
        p, s = m.groups()
        parts.append(int(p))
    if not parts:
        return None
    keys = ('major', 'minor', 'patch')
    ret.update(zip(keys, parts))
    return ret 
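Given the docstring examples, and assuming the module-level vseps includes '.' and '_':

print(parseVersionParts('1.2.3.4.5'))    # {'major': 1, 'minor': 2, 'patch': 3}
print(parseVersionParts('wowsoft_1.2'))  # {'major': 1, 'minor': 2}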
Example #22
Source File: look.py    From formulas with European Union Public License 1.1
def xmatch(lookup_value, lookup_array, match_type=1):
    res = [Error.errors['#N/A']]
    t_id = _get_type_id(lookup_value)
    if match_type > 0:
        def check(j, x, val, r):
            if x <= val:
                r[0] = j
                return x == val and j > 1
            return j > 1

    elif match_type < 0:
        def check(j, x, val, r):
            if x < val:
                return True
            r[0] = j
            return x == val  # compare the converted value, mirroring the branch above

    else:
        if t_id == 1:
            def sub(m):
                return {'\\': '', '?': '.', '*': '.*'}[m.groups()[0]]

            match = regex.compile(r'^%s$' % regex.sub(
                r'(?<!\\\~)\\(?P<sub>[\*\?])|(?P<sub>\\)\~(?=\\[\*\?])', sub,
                regex.escape(lookup_value)
            ), regex.IGNORECASE).match
        else:
            match = lambda x: x == lookup_value

        # noinspection PyUnusedLocal
        def check(j, x, val, r):
            if match(x):
                r[0] = j

    convert = lambda x: x
    if t_id == 1:
        convert = lambda x: x.upper()

    lookup_value = convert(lookup_value)
    for i, v in _yield_vals(t_id, lookup_array):
        if check(i, convert(v), lookup_value, res):
            break
    return res[0]