Python regex.escape() Examples
The following are 22 code examples of regex.escape(), collected from open-source projects. The project, source file, and license are noted above each example.
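For orientation, regex.escape() plays the same role as re.escape() in the standard library: it backslash-escapes regular-expression metacharacters so arbitrary text can be embedded in a pattern and matched literally. A minimal sketch:

import regex

user_input = "1+1=2?"  # contains the metacharacters '+' and '?'
pattern = regex.compile(regex.escape(user_input))
# Without escaping, '+' and '?' would act as quantifiers; escaped, the
# pattern matches the text verbatim.
print(bool(pattern.search("is it true that 1+1=2?")))  # True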
Example #1
Source File: utils.py From dragonfly with GNU Lesser General Public License v3.0
def _phrase_to_regex(phrase):
    # Treat whitespace between words as meaning anything other than alphanumeric
    # characters.
    pattern = r"[^\w--_]+".join(regex.escape(word) for word in phrase.split())

    # Treat spaces at the beginning or end of the phrase as matching any
    # whitespace character. This makes it easy to select stuff like non-breaking
    # space, which occurs frequently in browsers.
    # TODO Support newlines. Note that these are frequently implemented as
    # separate text nodes in the accessibility tree, so the obvious
    # implementation would not work well.
    if phrase == " ":
        pattern = r"\s"
    else:
        if phrase.startswith(" "):
            pattern = r"\s" + pattern
        if phrase.endswith(" "):
            pattern = pattern + r"\s"

    # Only match at boundaries of alphanumeric sequences if the phrase ends
    # are alphanumeric.
    if regex.search(r"^[\w--_]", phrase, regex.VERSION1 | regex.UNICODE):
        pattern = r"(?<![\w--_])" + pattern
    if regex.search(r"[\w--_]$", phrase, regex.VERSION1 | regex.UNICODE):
        pattern = pattern + r"(?![\w--_])"

    return pattern
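The pattern-building idea above can be illustrated standalone; this is a hedged sketch, with regex.VERSION1 enabling the `--` set-difference syntax used in the character classes (dragonfly's surrounding machinery is omitted):

import regex

words = "hello world".split()
# Escaped words joined by "anything but alphanumerics" separators.
pattern = r"[^\w--_]+".join(regex.escape(w) for w in words)
m = regex.search(pattern, "Say hello,  world!", regex.VERSION1 | regex.UNICODE)
print(m.group(0))  # 'hello,  world'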
Example #2
Source File: autosum_arxiv.py From autosum with MIT License
def search_citing_sentences(aid, txt, match):
    lines = txt.split('\n')
    txt = ' '.join(lines)
    txt = ' '.join(txt.split())
    sentences = split_sentences(txt)
    founds = set()
    for r in match.keys():
        if r:
            regexp_list = [regex.escape('\\cite%s' % r),
                           regex.escape('\\refs{%s}' % r),
                           r'(?<!(bibitem|lref).*?)' + regex.escape('%s' % r)]
            print(aid, r)
            for regexp in regexp_list:
                results = search_citation(sentences, regexp)
                founds.update(results)
                print("Regex: '{0!s}', Found: {1:d}".format(regexp, len(results)))
                if len(results):
                    break
            print("_" * 50)
    return founds
Example #3
Source File: test_black.py From black with MIT License
def test_expression_diff(self) -> None:
    source, _ = read_data("expression.py")
    expected, _ = read_data("expression.diff")
    tmp_file = Path(black.dump_to_file(source))
    diff_header = re.compile(
        rf"{re.escape(str(tmp_file))}\t\d\d\d\d-\d\d-\d\d "
        r"\d\d:\d\d:\d\d\.\d\d\d\d\d\d \+\d\d\d\d"
    )
    try:
        result = BlackRunner().invoke(black.main, ["--diff", str(tmp_file)])
        self.assertEqual(result.exit_code, 0)
    finally:
        os.unlink(tmp_file)
    actual = result.output
    actual = diff_header.sub(DETERMINISTIC_HEADER, actual)
    actual = actual.rstrip() + "\n"  # the diff output has a trailing space
    if expected != actual:
        dump = black.dump_to_file(actual)
        msg = (
            "Expected diff isn't equal to the actual. If you made changes to"
            " expression.py and this is an anticipated difference, overwrite"
            f" tests/data/expression.diff with {dump}"
        )
        self.assertEqual(expected, actual, msg)
Example #4
Source File: util.py From urduhack with MIT License
def remove_punctuation(text: str, marks=None) -> str:
    """Remove punctuation from ``text`` by removing all instances of ``marks``.

    Args:
        text (str): Urdu text
        marks (str): If specified, remove only the characters in this string,
            e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.
            Otherwise, all punctuation marks are removed.

    Returns:
        str: returns a ``str`` object containing normalized text.

    Note:
        When ``marks=None``, Python's built-in :meth:`str.translate()` is
        used to remove punctuation; otherwise, a regular expression is used
        instead. The former's performance is about 5-10x faster.

    Examples:
        >>> from urduhack.preprocessing import remove_punctuation
        >>> output = remove_punctuation("کر ؟ سکتی ہے۔")
        کر سکتی ہے
    """
    if marks:
        return re.sub('[{}]+'.format(re.escape(marks)), '', text, flags=re.UNICODE)

    return text.translate(PUNCTUATION_TRANSLATE_UNICODE)
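The marks branch can be exercised with a self-contained sketch (the function name here is illustrative, not part of urduhack):

import re

def strip_marks(text: str, marks: str) -> str:
    # Escape the caller-supplied marks so each is matched literally
    # inside the character class.
    return re.sub('[{}]+'.format(re.escape(marks)), '', text, flags=re.UNICODE)

print(strip_marks("a,b;c:d", ",;"))  # 'abc:d'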
Example #5
Source File: intdict.py From mwic with MIT License
def expand(self, s):
    if not self._defs:
        return s
    if self._regex is not None:
        regex = self._regex
        substs = self._substs
    else:
        substs = []
        regex = []
        for i, (name, definition) in enumerate(self._defs.items()):
            substs += [definition]
            regex += ['(?P<mwic{i}>{name})'.format(i=i, name=re.escape(name))]
        regex = '|'.join(regex)
        regex = re.compile(regex)
        self._regex = regex
        self._substs = substs
    assert self._regex is not None
    assert self._substs is not None
    def replace(match):
        for i, subst in enumerate(substs):
            if match.group('mwic{i}'.format(i=i)) is not None:
                return subst
        assert False  # no coverage
    return self._regex.sub(replace, s)
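The core trick above, one named group per alternative and then dispatching on whichever group matched, works standalone; a minimal sketch with hypothetical definitions (not mwic's API):

import re

defs = {'e.g.': 'for example', 'i.e.': 'that is'}
substs = list(defs.values())
pattern = re.compile('|'.join(
    '(?P<mwic{}>{})'.format(i, re.escape(name))
    for i, name in enumerate(defs)))

def replace(match):
    # Return the substitution whose named group actually matched.
    for i, subst in enumerate(substs):
        if match.group('mwic{}'.format(i)) is not None:
            return subst

print(pattern.sub(replace, 'Short words, e.g. these, abound.'))
# Short words, for example these, abound.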
Example #6
Source File: phrase2vec.py From mat2vec with MIT License
def exclude_words(phrasegrams, words):
    """Given a list of words, excludes those from the keys of the phrase dictionary."""
    new_phrasergrams = {}
    words_re_list = []
    for word in words:
        we = regex.escape(word)
        words_re_list.append("^" + we + "$|^" + we + "_|_" + we + "$|_" + we + "_")
    word_reg = regex.compile(r"" + "|".join(words_re_list))
    for gram in tqdm(phrasegrams):
        valid = True
        for sub_gram in gram:
            if word_reg.search(sub_gram.decode("unicode_escape", "ignore")) is not None:
                valid = False
                break
        if not valid:
            continue
        if valid:
            new_phrasergrams[gram] = phrasegrams[gram]
    return new_phrasergrams

# Generating word grams.
Example #7
Source File: strtools.py From extratools with MIT License
def extract(s: str, entities: Iterable[str], useregex=False, ignorecase=True) -> Iterable[str]:
    for m in re.compile(
            r"\b(?:{})\b".format(r"|".join(
                e if useregex else re.escape(e).replace(' ', r"s+")
                for e in entities
            )),
            re.I if ignorecase else 0
    ).finditer(s):
        yield m.group(0)
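Note the subtlety in the replace() call: re.escape() escapes each space to backslash-space, so replacing the bare space character with "s+" turns the escaped space into \s+, letting spaces in an entity match any whitespace run. Assuming the definition above (with import re and typing's Iterable in scope), usage looks like:

entities = ["New York", "San Francisco"]
print(list(extract("Flights from new  york to San Francisco", entities)))
# ['new  york', 'San Francisco']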
Example #8
Source File: cache.py From synapse with Apache License 2.0
def regexizeTagGlob(tag):
    '''
    Returns:
        a regular expression string with ** and * interpreted as tag globs

    Precondition:
        tag is a valid tagmatch

    Notes:
        A single asterisk will replace exactly one dot-delimited component of a tag
        A double asterisk will replace one or more of any character.

        The returned string does not contain a starting '^' or trailing '$'.
    '''
    return ReRegex.sub(
        lambda m: r'([^.]+?)' if m.group(1) is None else r'(.+)',
        regex.escape(tag))
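ReRegex is defined elsewhere in synapse; a plausible stand-in (an assumption, not the project's actual definition) makes the function runnable:

import regex

# Assumed stand-in: match an escaped '*', capturing a second escaped
# '*' when the glob is '**'.
ReRegex = regex.compile(r'\\\*(\\\*)?')

print(regexizeTagGlob('foo.*.bar'))  # foo\.([^.]+?)\.bar
print(regexizeTagGlob('foo.**'))     # foo\.(.+)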
Example #9
Source File: util.py From MicroTokenizer with MIT License
def prints(*texts, **kwargs):
    """Print formatted message (manual ANSI escape sequences to avoid dependency)

    *texts (unicode): Texts to print. Each argument is rendered as paragraph.
    **kwargs: 'title' becomes coloured headline. exits=True performs sys exit.
    """
    exits = kwargs.get('exits', None)
    title = kwargs.get('title', None)
    title = '\033[93m{}\033[0m\n'.format(_wrap(title)) if title else ''
    message = '\n\n'.join([_wrap(text) for text in texts])
    print('\n{}{}\n'.format(title, message))
    if exits is not None:
        sys.exit(exits)
Example #10
Source File: util.py From MicroTokenizer with MIT License
def compile_prefix_regex(entries):
    if '(' in entries:
        # Handle deprecated data
        expression = '|'.join(['^' + re.escape(piece)
                               for piece in entries if piece.strip()])
        return re.compile(expression)
    else:
        expression = '|'.join(['^' + piece for piece in entries if piece.strip()])
        return re.compile(expression)
Example #11
Source File: util.py From MicroTokenizer with MIT License
def read_regex(path):
    path = ensure_path(path)
    with path.open() as file_:
        entries = file_.read().split('\n')
    expression = '|'.join(['^' + re.escape(piece)
                           for piece in entries if piece.strip()])
    return re.compile(expression)
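Examples #10 and #11 both reduce to the same move: escape each entry, anchor it, and join the results into one alternation. A compact demonstration:

import re

entries = ['(', '[', '"']
matcher = re.compile('|'.join('^' + re.escape(p) for p in entries if p.strip()))
print(bool(matcher.match('("quoted")')))  # True: the leading '(' matches
print(bool(matcher.match('plain')))       # False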
Example #12
Source File: lexer.py From pythonparser with MIT License
def _replace_escape_bytes(self, value):
    chunks = []
    offset = 0
    while offset < len(value):
        match = self._lex_escape_re.search(value, offset)
        if match is None:
            # Append the remaining of the string
            chunks.append(value[offset:])
            break
        # Append the part of string before match
        chunks.append(value[offset:match.start()])
        offset = match.end()
        # Process the escape
        if match.group(1) is not None:  # single-char
            chr = match.group(1)
            if chr == b"\n":
                pass
            elif chr == b"\\" or chr == b"'" or chr == b"\"":
                chunks.append(chr)
            elif chr == b"a":
                chunks.append(b"\a")
            elif chr == b"b":
                chunks.append(b"\b")
            elif chr == b"f":
                chunks.append(b"\f")
            elif chr == b"n":
                chunks.append(b"\n")
            elif chr == b"r":
                chunks.append(b"\r")
            elif chr == b"t":
                chunks.append(b"\t")
            elif chr == b"v":
                chunks.append(b"\v")
        elif match.group(2) is not None:  # oct
            chunks.append(byte(int(match.group(2), 8)))
        elif match.group(3) is not None:  # hex
            chunks.append(byte(int(match.group(3), 16)))
    return b"".join(chunks)
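The method relies on self._lex_escape_re and a byte() helper defined elsewhere in pythonparser; plausible stand-ins consistent with the three groups it inspects (assumptions, not the project's actual definitions) would be:

import re

# Assumed shape: group 1 = single-character escapes, group 2 = octal
# digits, group 3 = hex digits after 'x'.
_lex_escape_re = re.compile(
    rb"""\\(?:([\\'"abfnrtv\n])|([0-7]{1,3})|x([0-9a-fA-F]{2}))""")

def byte(n):
    # Pack one integer into a single-element bytes object.
    return bytes([n])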
Example #13
Source File: _wikilist.py From wikitextparser with GNU General Public License v3.0
def convert(self, newstart: str) -> None:
    """Convert to another list type by replacing starting pattern."""
    match = self._match
    ms = match.start()
    for s, e in reversed(match.spans('pattern')):
        self[s - ms:e - ms] = newstart
    self.pattern = escape(newstart)
Example #14
Source File: dictionary.py From dateparser with BSD 3-Clause "New" or "Revised" License
def _construct_split_regex(self):
    known_words_group = "|".join(map(re.escape, self._get_sorted_words_from_cache()))
    if self._no_word_spacing:
        regex = r"^(.*?)({})(.*)$".format(known_words_group)
    else:
        regex = r"^(.*?(?:\A|\W|_|\d))({})((?:\Z|\W|_|\d).*)$".format(known_words_group)
    self._split_regex_cache.setdefault(
        self._settings.registry_key, {})[self.info['name']] = \
        re.compile(regex, re.UNICODE | re.IGNORECASE)
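A standalone sketch of the word-spacing branch (hypothetical inputs; dateparser's cache plumbing is omitted, and the words are sorted longest-first so "january" wins over its prefix "jan"):

import re

known_words = ["january", "jan"]
group = "|".join(map(re.escape, sorted(known_words, key=len, reverse=True)))
split_re = re.compile(
    r"^(.*?(?:\A|\W|_|\d))({})((?:\Z|\W|_|\d).*)$".format(group),
    re.UNICODE | re.IGNORECASE)
print(split_re.match("15 january 2020").groups())
# ('15 ', 'january', ' 2020')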
Example #15
Source File: xsampa.py From panphon with MIT License
def read_xsampa_table(self):
    filename = os.path.join('data', 'ipa-xsampa.csv')
    filename = pkg_resources.resource_filename(__name__, filename)
    with open(filename, 'rb') as f:
        xs2ipa = {x[1]: x[0] for x in csv.reader(f, encoding='utf-8')}
    xs = sorted(xs2ipa.keys(), key=len, reverse=True)
    xs_regex = re.compile('|'.join(list(map(re.escape, xs))))
    return xs_regex, xs2ipa
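Sorting the keys longest-first before joining is essential: alternation tries branches left to right, so multi-character X-SAMPA symbols must come before their own prefixes. A small demonstration:

import re

symbols = ['t', 'ts', 's']
pat = re.compile('|'.join(map(re.escape, sorted(symbols, key=len, reverse=True))))
print(pat.findall('tsat'))  # ['ts', 't'] rather than ['t', 's', 't']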
Example #16
Source File: codec.py From kraken with Apache License 2.0
def __init__(self, charset: Union[Dict[str, Sequence[int]], Sequence[str], str]) -> None:
    """
    Builds a codec converting between graphemes/code points and integer
    label sequences.

    charset may either be a string, a list or a dict. In the first case each
    code point will be assigned a label, in the second case each string in
    the list will be assigned a label, and in the final case each key string
    will be mapped to the value sequence of integers. In the first two cases
    labels will be assigned automatically.

    As 0 is the blank label in a CTC output layer, output labels and input
    dictionaries are/should be 1-indexed.

    Args:
        charset (unicode, list, dict): Input character set.
    """
    if isinstance(charset, dict):
        self.c2l = charset
    else:
        self.c2l = {k: [v] for v, k in enumerate(sorted(charset), start=1)}
    # map integer labels to code points because regex only works with strings
    self.l2c = {}  # type: Dict[str, str]
    for k, v in self.c2l.items():
        self.l2c[''.join(chr(c) for c in v)] = k

    # sort prefixes for c2l regex
    self.c2l_regex = regex.compile(r'|'.join(regex.escape(x) for x in sorted(self.c2l.keys(), key=len, reverse=True)))
    # sort prefixes for l2c regex
    self.l2c_regex = regex.compile(r'|'.join(regex.escape(x) for x in sorted(self.l2c.keys(), key=len, reverse=True)))
Example #17
Source File: strtools.py From extratools with MIT License
def __findeqtagpairspans(
        s: str,
        tag: str,
        useregex: bool = False
) -> Iterable[Tuple[Tuple[int, int], ...]]:
    for match in re.finditer(
            r"(?P<__open>{})(?P<__content>.*?)(?P<__close>\1)".format(
                tag if useregex else re.escape(tag)),
            s):
        yield (match.span("__open"), match.span("__content"), match.span("__close"))
Example #18
Source File: strtools.py From extratools with MIT License
def __findtagpairspans(
        s: str,
        tag: str, closetag: Optional[str] = None,
        useregex: bool = False
) -> Iterable[Tuple[Tuple[int, int], ...]]:
    if closetag is None or tag == closetag:
        yield from __findeqtagpairspans(s, tag, useregex=useregex)
        return

    if not useregex:
        tag = re.escape(tag)
        closetag = re.escape(closetag)

    retags = re.compile(r"(?P<__open>{})|(?P<__close>{})".format(tag, closetag))

    startspans = []

    for match in retags.finditer(s):
        opengroup = match.group("__open")
        if opengroup:
            startspans.append(match.span())
            continue

        closegroup = match.group("__close")
        if closegroup and startspans:
            startspan = startspans.pop()
            endspan = match.span()

            yield (startspan, (startspan[1], endspan[0]), endspan)
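Given the definitions above (plus import re and the typing imports), a usage sketch; note that double-underscore name mangling only applies inside classes, so the module-level function is directly callable:

s = "<b>bold <b>nested</b> text</b>"
for spans in __findtagpairspans(s, "<b>", "</b>"):
    print([s[a:b] for a, b in spans])
# ['<b>', 'nested', '</b>']
# ['<b>', 'bold <b>nested</b> text', '</b>']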
Example #19
Source File: utils.py From chepy with GNU General Public License v3.0
def escape_string(self):
    """Escape all special characters in a string

    Returns:
        Chepy: The Chepy object.
    """
    self.state = re.escape(self._convert_to_str())
    return self
Example #20
Source File: scorer.py From nmt-chatbot with GNU General Public License v3.0
def check_urls(index, question, answer):
    global full_sentence_valid_url
    full_sentence_valid_url = False
    valid_url = False

    # Disabled
    if score_settings['incorrect_url_modifier_value'] is None:
        return 0

    # Find all URLs in sentence
    for url in re.finditer('http(?:s?):(//([^/]*?)/(?:[^ ])*?(?=$|[' + re.escape(score_settings['url_delimiters']) + ']))?', answer):

        # Check if result is in cache already and return it
        if url_cache[url.group(0)][1] > time.time():
            if url_cache[url.group(0)][0] == 0:
                return score_settings['incorrect_url_modifier_value']

        # Url not in cache - check it
        else:

            # Send HEAD request and check HTTP response code
            try:
                request = requests.head(url.group(0))
                code = request.status_code
            except Exception as e:
                code = 0

            # Add to cache
            url_cache[url.group(0)] = [1 if code == 200 else 0, time.time() + 86400]

            # If code is different than 200 - return modifier value
            if code != 200:
                return score_settings['incorrect_url_modifier_value']

        # Check if it's full sentence url
        valid_url = (len(url.group(0)) == len(answer))

    # Everything ok, set if full sentence url and return 0
    full_sentence_valid_url = valid_url
    return 0

# Add score by sentence length
Example #21
Source File: version.py From synapse with Apache License 2.0
def parseVersionParts(text, seps=vseps):
    '''
    Extract a list of major/minor/version integer strings from a string.

    Args:
        text (str): String to parse
        seps (tuple): A tuple or list of separators to use when parsing the version string.

    Examples:
        Parse a simple version string into a major and minor parts::

            parts = parseVersionParts('1.2')

        Parse a complex version string into a major and minor parts::

            parts = parseVersionParts('wowsoft_1.2')

        Parse a simple version string into a major, minor and patch parts.
        Parts after the "3." are dropped from the results::

            parts = parseVersionParts('1.2.3.4.5')

    Notes:
        This attempts to brute force out integers from the version string by
        stripping any leading ascii letters and part separators, and then
        regexing out numeric parts optionally followed by part separators. It
        will stop at the first mixed-character part encountered. For example,
        "1.2-3a" would only parse out the "1" and "2" from the string.

    Returns:
        dict: Either an empty dictionary or dictionary containing up to three
        keys, 'major', 'minor' and 'patch'.
    '''
    # Join seps together
    seps = ''.join(seps)
    # Strip whitespace
    text = text.strip()
    # Strip off leading chars
    text = text.lstrip(string.ascii_letters)
    # Strip off any leading separator which may be present
    text = text.lstrip(seps)
    pattern = r'^(\d+)([{}]+|$)'.format(regex.escape(seps))
    parts = []
    ret = {}
    off = 0
    while True:
        m = regex.search(pattern, text[off:])
        if not m:
            break
        off += m.end()
        p, s = m.groups()
        parts.append(int(p))
    if not parts:
        return None
    keys = ('major', 'minor', 'patch')
    ret.update(zip(keys, parts))
    return ret
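The extraction loop can be exercised on its own; a hedged sketch, assuming a separator tuple like ('.', '-', '_') in place of synapse's vseps:

import regex

seps = ''.join(('.', '-', '_'))
pattern = r'^(\d+)([{}]+|$)'.format(regex.escape(seps))
text, off, parts = '1.2.3', 0, []
while True:
    m = regex.search(pattern, text[off:])
    if not m:
        break
    off += m.end()
    parts.append(int(m.group(1)))
print(dict(zip(('major', 'minor', 'patch'), parts)))
# {'major': 1, 'minor': 2, 'patch': 3}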
Example #22
Source File: look.py From formulas with European Union Public License 1.1
def xmatch(lookup_value, lookup_array, match_type=1):
    res = [Error.errors['#N/A']]
    t_id = _get_type_id(lookup_value)
    if match_type > 0:
        def check(j, x, val, r):
            if x <= val:
                r[0] = j
                return x == val and j > 1
            return j > 1
    elif match_type < 0:
        def check(j, x, val, r):
            if x < val:
                return True
            r[0] = j
            return v == val
    else:
        t_id = _get_type_id(lookup_value)
        if t_id == 1:
            def sub(m):
                return {'\\': '', '?': '.', '*': '.*'}[m.groups()[0]]

            match = regex.compile(r'^%s$' % regex.sub(
                r'(?<!\\\~)\\(?P<sub>[\*\?])|(?P<sub>\\)\~(?=\\[\*\?])', sub,
                regex.escape(lookup_value)
            ), regex.IGNORECASE).match
        else:
            match = lambda x: x == lookup_value

        # noinspection PyUnusedLocal
        def check(j, x, val, r):
            if match(x):
                r[0] = j

    convert = lambda x: x
    if t_id == 1:
        convert = lambda x: x.upper()
    lookup_value = convert(lookup_value)
    for i, v in _yield_vals(t_id, lookup_array):
        if check(i, convert(v), lookup_value, res):
            break
    return res[0]