Python regex.match() Examples
The following are 30 code examples of regex.match().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module regex, or try the search function.
Example #1
Source File: vocab.py From lightNLP with Apache License 2.0 | 6 votes |
def __init__(self, words, tags, rels):
    """Build index maps for words, POS tags, and dependency relations.

    The word and tag inventories are prefixed with the PAD/UNK special
    symbols; relation labels get no special symbols.
    """
    self.pad_index = 0
    self.unk_index = 1
    self.words = [self.PAD, self.UNK] + sorted(words)
    self.tags = [self.PAD, self.UNK] + sorted(tags)
    self.rels = sorted(rels)
    self.word_dict = {w: idx for idx, w in enumerate(self.words)}
    self.tag_dict = {t: idx for idx, t in enumerate(self.tags)}
    self.rel_dict = {r: idx for idx, r in enumerate(self.rels)}
    # ids of punctuation that appear in words
    self.puncts = sorted(
        idx for w, idx in self.word_dict.items()
        if regex.match(r'\p{P}+$', w)
    )
    self.n_words = len(self.words)
    self.n_tags = len(self.tags)
    self.n_rels = len(self.rels)
    self.n_train_words = self.n_words
Example #2
Source File: validate-python2-obsolete.py From tools with GNU General Public License v2.0 | 6 votes |
def validate_left_to_right_relations(cols):
    """Check that UD relations which must be left-headed really are.

    Certain UD relations must always go left-to-right; only the basic
    dependencies are checked here (enhanced dependencies would need the
    same test).
    """
    if is_multiword_token(cols):
        return
    if DEPREL >= len(cols):
        return  # this has been already reported in trees()
    deprel = cols[DEPREL]
    if not re.match(r"^(conj|fixed|flat)", deprel):
        return
    # For these relations the dependent must not precede its head.
    if int(cols[ID]) < int(cols[HEAD]):
        warn(u"Violation of guidelines: relation %s must go left-to-right" % deprel, u"Syntax")
Example #3
Source File: validate-python2-obsolete.py From tools with GNU General Public License v2.0 | 6 votes |
def validate_sent_id(comments,known_ids,lcode):
    """Validate the '# sent_id = ...' comment of one sentence.

    Emits warnings for spurious, missing, duplicate, or non-unique
    sent_id lines, and for ids misusing the reserved '/' separator.
    Adds the accepted id to *known_ids* (mutated in place).
    """
    matched=[]
    for c in comments:
        match=sentid_re.match(c)
        if match:
            matched.append(match)
        else:
            # A line that looks like a sent_id attempt but did not match the
            # expected format gets its own warning.
            if c.startswith(u"# sent_id") or c.startswith(u"#sent_id"):
                warn(u"Spurious sent_id line: '%s' Should look like '# sent_id = xxxxxx' where xxxx is not whitespace. Forward slash reserved for special purposes." %c,u"Metadata")
    if not matched:
        warn(u"Missing the sent_id attribute.",u"Metadata")
    elif len(matched)>1:
        warn(u"Multiple sent_id attribute.",u"Metadata")
    else:
        sid=matched[0].group(1)
        if sid in known_ids:
            warn(u"Non-unique sent_id the sent_id attribute: "+sid,u"Metadata")
        # '/' is reserved for parallel treebanks; at most one slash is
        # tolerated, and only for the special 'ud'/'shopen' codes.
        if sid.count(u"/")>1 or (sid.count(u"/")==1 and lcode!=u"ud" and lcode!=u"shopen"):
            warn(u"The forward slash is reserved for special use in parallel treebanks: "+sid,u"Metadata")
        known_ids.add(sid)
Example #4
Source File: test_partialparse.py From ctparse with MIT License | 6 votes |
def test_partial_parse() -> None:
    """Exercise PartialParse construction, rule application, and validation."""
    # Two non-overlapping regex matches over the same string.
    match_a = regex.match("(?<R1>a)", "ab")
    match_b = next(regex.finditer("(?<R2>b)", "ab"))
    pp = PartialParse.from_regex_matches(
        (RegexMatch(1, match_a), RegexMatch(2, match_b))
    )
    assert len(pp.prod) == 2
    assert len(pp.rules) == 2
    assert isinstance(pp.score, float)

    def mock_rule(ts: datetime.datetime, a: Time) -> Time:
        # Minimal rule stub: ignores its inputs and produces a fresh Time.
        return Time()

    # Applying a rule yields a new, distinct parse object.
    pp2 = pp.apply_rule(
        datetime.datetime(day=1, month=1, year=2015), mock_rule, "mock_rule", (0, 1)
    )
    assert pp != pp2
    # Constructing a PartialParse with no productions must fail.
    with pytest.raises(ValueError):
        PartialParse((), ())
Example #5
Source File: prep_wikipedia.py From justcopy-backend with MIT License | 6 votes |
def preprocess(article):
    """Clean one WikiExtractor article dict; return None to drop it.

    Mutates *article* in place (HTML-unescapes every field), then filters
    blacklisted ids, disambiguation pages, and List/Index/Outline pages.
    """
    # Take out HTML escaping WikiExtractor didn't clean
    for key in article:
        article[key] = PARSER.unescape(article[key])

    title = article['title']
    lowered = title.lower()

    # Filter some disambiguation pages not caught by the WikiExtractor
    if article['id'] in BLACKLIST:
        return None
    if '(disambiguation)' in lowered or '(disambiguation page)' in lowered:
        return None

    # Take out List/Index/Outline pages (mostly links)
    if re.match(r'(List of .+)|(Index of .+)|(Outline of .+)', title):
        return None

    # Return doc with `id` set to `title`
    return {'id': title, 'text': article['text']}
Example #6
Source File: rules.py From epitran with MIT License | 6 votes |
def _read_rule(self, i, line):
    """Parse one datafile line into a rewrite function, or record a symbol.

    :param i: zero-based line number (used only for error messages)
    :param line: raw rule text; blank lines are silently ignored
    :raises DatafileError: if the line is neither a symbol definition nor
        a well-formed 'a -> b / X _ Y' rule, or cannot compile as regex
    """
    line = line.strip()
    if line:
        # Normalize to NFD so combining characters compare consistently.
        line = unicodedata.normalize('NFD', line)
        # Symbol definition of the form '::name:: = value'.
        s = re.match(r'(?P<symbol>::\w+::)\s*=\s*(?P<value>.+)', line)
        if s:
            self.symbols[s.group('symbol')] = s.group('value')
        else:
            # Expand previously defined ::symbols:: before parsing the rule.
            line = self._sub_symbols(line)
            # Rule of the form 'a -> b / X _ Y' (context-sensitive rewrite).
            r = re.match(r'(\S+)\s*->\s*(\S+)\s*/\s*(\S*)\s*[_]\s*(\S*)', line)
            try:
                a, b, X, Y = r.groups()
            except AttributeError:
                # r is None: the rule regex did not match this line.
                raise DatafileError('Line {}: "{}" cannot be parsed.'.format(i + 1, line))
            # '#' marks a word boundary; translate to regex anchors.
            X, Y = X.replace('#', '^'), Y.replace('#', '$')
            # '0' denotes the empty string in the rule notation.
            a, b = a.replace('0', ''), b.replace('0', '')
            try:
                # Rules naming sw1/sw2 groups are metathesis (swap) rules.
                if re.search(r'[?]P[<]sw1[>].+[?]P[<]sw2[>]', a):
                    return self._fields_to_function_metathesis(a, X, Y)
                else:
                    return self._fields_to_function(a, b, X, Y)
            except Exception as e:
                raise DatafileError('Line {}: "{}" cannot be compiled as regex: ̪{}'.format(i + 1, line, e))
Example #7
Source File: flite.py From epitran with MIT License | 6 votes |
def transliterate(self, text, normpunc=False, ligatures=False):
    """Convert English text to IPA transcription

    Args:
        text (unicode): English text
        normpunc (bool): if True, normalize punctuation downward
        ligatures (bool): if True, use non-standard ligatures instead of
                          standard IPA
    """
    text = unicodedata.normalize('NFC', text)
    # Transcribe letter chunks via g2p; pass everything else through.
    pieces = [
        self.english_g2p(chunk) if self.letter_re.match(chunk) else chunk
        for chunk in self.chunk_re.findall(text)
    ]
    result = ''.join(pieces)
    if normpunc:
        result = self.puncnorm.norm(result)
    if ligatures or self.ligatures:
        result = ligaturize(result)
    return result
Example #8
Source File: validate.py From tools with GNU General Public License v2.0 | 6 votes |
def validate_left_to_right_relations(id, tree):
    """Check that UD relations which must be left-headed really are.

    Certain UD relations must always go left-to-right. Here we currently
    check the rule for the basic dependencies. The same should also be
    tested for the enhanced dependencies!
    """
    testlevel = 3
    testclass = 'Syntax'
    cols = tree['nodes'][id]
    if is_multiword_token(cols):
        return
    if DEPREL >= len(cols):
        return  # this has been already reported in trees()
    # According to the v2 guidelines, apposition should also be left-headed,
    # although the definition of apposition may need to be improved.
    if not re.match(r"^(conj|fixed|flat|goeswith|appos)", cols[DEPREL]):
        return
    if int(cols[ID]) >= int(cols[HEAD]):
        return
    # We must recognize the relation type in the test id so we can manage
    # exceptions for legacy treebanks: conj/flat/fixed were required before
    # UD 2.2 (all UD 2.3 treebanks pass); appos/goeswith were introduced
    # before UD 2.4 and legacy treebanks are allowed to fail them.
    testid = "right-to-left-%s" % lspec2ud(cols[DEPREL])
    testmessage = "Relation '%s' must go left-to-right." % cols[DEPREL]
    warn(testmessage, testclass, testlevel=testlevel, testid=testid, nodeid=id, nodelineno=tree['linenos'][id])
Example #9
Source File: query.py From cltk with MIT License | 6 votes |
def match_regex(input_str, pattern, language, context, case_insensitive=True):
    """Yield matches of *pattern* in *input_str*, formatted per *context*.

    TODO: Rename this `match_pattern` and incorporate the keyword expansion
    code currently in search_corpus.

    :param input_str: text to search
    :param pattern: regex pattern
    :param language: language hint used for sentence tokenization
    :param context: 'sentence', 'paragraph', or an integer character window
    :rtype : str
    """
    if type(context) is str:
        contexts = ['sentence', 'paragraph']
        assert context in contexts or type(context) is int, 'Available contexts: {}'.format(contexts)
    else:
        context = int(context)
    spans = _regex_span(pattern, input_str, case_insensitive=case_insensitive)
    for found in spans:
        if context == 'sentence':
            yield _sentence_context(found, language)
        elif context == 'paragraph':
            yield _paragraph_context(found)
        else:
            yield _window_match(found, context)
Example #10
Source File: parse_tlg_indices.py From cltk with MIT License | 6 votes |
def _get_epoch(_str):
    """Take incoming string, return its epoch.

    Returns 'ad', 'bc', the literal 'Incertum'/'Varia', or None when the
    epoch is unknown or ambiguous.
    """
    if _str.startswith('A.D. '):
        return 'ad'
    if _str.startswith('a. A.D. '):
        return None  #?
    if _str.startswith('p. A.D. '):
        return 'ad'
    if regex.match(r'^[0-9]+ B\.C\. *', _str):
        return 'bc'
    if regex.match(r'^a\. *[0-9]+ B\.C\. *', _str):
        return 'bc'
    if regex.match(r'^p\. *[0-9]+ B\.C\. *', _str):
        return None  #?
    if _str == 'Incertum' or _str == 'Varia':
        return _str
    return None
Example #11
Source File: client.py From ibis with Apache License 2.0 | 6 votes |
def list_tables(self, like=None, database=None):
    """
    List tables in the current (or indicated) database. Like the SHOW
    TABLES command.

    Parameters
    ----------
    like : string, default None
      e.g. 'foo*' to match all tables starting with 'foo'
    database : string, default None
      If not passed, uses the current/default database

    Returns
    -------
    results : list of strings
    """
    names = [t.name for t in self._catalog.listTables(dbName=database)]
    if not like:
        return names
    # Keep only the names matching the given regular expression.
    return [name for name in names if re.match(like, name) is not None]
Example #12
Source File: query.py From cltk with MIT License | 6 votes |
def _window_match(match, window=100): """Take incoming match and highlight in context. :rtype : str :param match: Regex match. :param window: Characters on each side of match to return. :type window: int """ window = int(window) start = match.start() end = match.end() snippet_left = match.string[start - window:start] snippet_match = match.string[match.start():match.end()] snippet_right = match.string[end:end + window] snippet = snippet_left + '*' + snippet_match + '*' + snippet_right return snippet
Example #13
Source File: source.py From pythonparser with MIT License | 6 votes |
def _extract_encoding(self, source): if isinstance(source, bytes): re = self._encoding_bytes_re nl = b"\n" else: re = self._encoding_re nl = "\n" match = re.match(source) if not match: index = source.find(nl) if index != -1: match = re.match(source[index + 1:]) if match: encoding = match.group(1) if isinstance(encoding, bytes): return encoding.decode("ascii") return encoding return "ascii"
Example #14
Source File: quotations.py From talon with Apache License 2.0 | 5 votes |
def _correct_splitlines_in_headers(markers, lines):
    """
    Corrects markers by removing splitlines deemed to be inside header blocks.
    """
    corrected = []
    inside_headers = False
    for idx, marker in enumerate(markers):
        line = lines[idx]
        is_header_line = bool(re.search(RE_HEADER, line))
        # Only act on 's' markers: start a header block on a header line,
        # or demote the splitter marker if we are already inside one.
        if marker == 's':
            if inside_headers:
                marker = 'm' if QUOT_PATTERN.match(line) else 't'
            elif is_header_line:
                inside_headers = True
        # Any non-header line terminates the header block.
        if not is_header_line:
            inside_headers = False
        corrected.append(marker)
    return "".join(corrected)
Example #15
Source File: tpu_lm_finetuning.py From ru_transformers with Apache License 2.0 | 5 votes |
def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False): if not args.save_total_limit: return if args.save_total_limit <= 0: return # Check if we should delete older checkpoint(s) glob_checkpoints = glob.glob(os.path.join(args.output_dir, '{}-*'.format(checkpoint_prefix))) if len(glob_checkpoints) <= args.save_total_limit: return ordering_and_checkpoint_path = [] for path in glob_checkpoints: if use_mtime: ordering_and_checkpoint_path.append((os.path.getmtime(path), path)) else: regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path) if regex_match and regex_match.groups(): ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path)) checkpoints_sorted = sorted(ordering_and_checkpoint_path) checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted] number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit) checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete] for checkpoint in checkpoints_to_be_deleted: log_info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint)) shutil.rmtree(checkpoint)
Example #16
Source File: hotpot_preliminary_doc_retri.py From semanticRetrievalMRS with MIT License | 5 votes |
def filter_word(text):
    """Take out english stopwords, punctuation, and compound endings."""
    text = normalize(text)
    # Drop tokens consisting entirely of punctuation.
    if regex.match(r'^\p{P}+$', text):
        return True
    # Drop stopwords, compared case-insensitively.
    return text.lower() in STOPWORDS
Example #17
Source File: text_clean.py From semanticRetrievalMRS with MIT License | 5 votes |
def filter_document_id(input_string):
    """Return True when the document id should be filtered out.

    Ids containing no ASCII letters at all, or containing Arabic
    characters, are dropped.
    """
    cleaned = input_string.strip().replace('_', ' ')
    has_alpha = re.search('[a-zA-Z]', cleaned) is not None
    if not has_alpha:
        return True
    if check_arabic(cleaned):
        return True
    return False
Example #18
Source File: hotpot_preliminary_doc_retri.py From semanticRetrievalMRS with MIT License | 5 votes |
def filter_document_id(input_string, remove_disambiguation_doc=True):
    """Return True when the document id should be filtered out.

    Drops ids without any ASCII letters, ids containing Arabic characters,
    and — when *remove_disambiguation_doc* is set — disambiguation pages.
    """
    pid_words = input_string.strip().replace('_', ' ')
    if re.search('[a-zA-Z]', pid_words) is None:
        # filter id that contains no alphabets characters
        return True
    if check_arabic(pid_words):
        # remove id that contain arabic characters.
        return True
    if remove_disambiguation_doc and filter_disamb_doc(input_string):
        return True
    return False
Example #19
Source File: text_clean.py From semanticRetrievalMRS with MIT License | 5 votes |
def filter_word(text):
    """Take out english stopwords, punctuation, and compound endings."""
    cleaned = normalize(text)
    is_punct = regex.match(r'^\p{P}+$', cleaned) is not None
    # Filter both punctuation-only tokens and (lower-cased) stopwords.
    if is_punct or cleaned.lower() in STOPWORDS:
        return True
    return False
Example #20
Source File: utils.py From neural_chat with MIT License | 5 votes |
def filter_word(text):
    """Take out english stopwords, punctuation, and compound endings."""
    normalized = normalize(text)
    # A token is filtered when it is pure punctuation or a stopword.
    if regex.match(r'^\p{P}+$', normalized):
        return True
    if normalized.lower() in STOPWORDS:
        return True
    return False
Example #21
Source File: text_clean.py From combine-FEVER-NSMN with MIT License | 5 votes |
def filter_document_id(input_string):
    """True iff *input_string* names a document that should be skipped.

    Underscores are treated as spaces; ids with no ASCII letters or with
    Arabic characters are skipped.
    """
    normalized = input_string.strip().replace('_', ' ')
    if re.search('[a-zA-Z]', normalized) is None:
        return True
    elif check_arabic(normalized):
        return True
    else:
        return False
Example #22
Source File: text_clean.py From combine-FEVER-NSMN with MIT License | 5 votes |
def filter_word(text):
    """Take out english stopwords, punctuation, and compound endings."""
    token = normalize(text)
    # Punctuation-only tokens and stopwords are both rejected.
    return bool(regex.match(r'^\p{P}+$', token)) or token.lower() in STOPWORDS
Example #23
Source File: quotations.py From talon with Apache License 2.0 | 5 votes |
def is_splitter(line):
    '''
    Returns Matcher object if provided string is a splitter and None
    otherwise.
    '''
    # Try each known splitter pattern in order; first hit wins.
    for candidate in SPLITTER_PATTERNS:
        found = re.match(candidate, line)
        if found:
            return found
    return None
Example #24
Source File: vocab.py From lightNLP with Apache License 2.0 | 5 votes |
def extend(self, words):
    """Add unseen *words* to the vocabulary and rebuild derived indices."""
    unseen = sorted(set(words).difference(self.word_dict))
    self.words.extend(unseen)
    self.word_dict = {w: idx for idx, w in enumerate(self.words)}
    # ids of punctuation tokens in the re-indexed vocabulary
    self.puncts = sorted(
        idx for w, idx in self.word_dict.items()
        if regex.match(r'\p{P}+$', w)
    )
    self.n_words = len(self.words)
Example #25
Source File: parse_tlg_indices.py From cltk with MIT License | 5 votes |
def select_id_by_name(query):
    """Do a case-insensitive regex match on author name, returns TLG id."""
    id_author = get_id_author()
    pattern = regex.compile(r'{}'.format(query.casefold()), flags=regex.VERSION1)
    # Collect every (id, author) pair whose casefolded name matches.
    return [
        (tlg_id, author)
        for tlg_id, author in id_author.items()
        if pattern.findall(author.casefold())
    ]
Example #26
Source File: quotations.py From talon with Apache License 2.0 | 5 votes |
def mark_message_lines(lines):
    """Mark message lines with markers to distinguish quotation lines.

    Markers:

    * e - empty line
    * m - line that starts with quotation marker '>'
    * s - splitter line
    * t - presumably lines from the last message in the conversation

    >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question'])
    'tsem'
    """
    markers = ['e' for _ in lines]
    i = 0
    while i < len(lines):
        if not lines[i].strip():
            markers[i] = 'e'  # empty line
        elif QUOT_PATTERN.match(lines[i]):
            markers[i] = 'm'  # line with quotation marker
        elif RE_FWD.match(lines[i]):
            markers[i] = 'f'  # ---- Forwarded message ----
        else:
            # in case splitter is spread across several lines
            splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
            if splitter:
                # append as many splitter markers as lines in splitter
                splitter_lines = splitter.group().splitlines()
                for j in range(len(splitter_lines)):
                    markers[i + j] = 's'
                # skip splitter lines (the trailing i += 1 advances past the
                # last marked line)
                i += len(splitter_lines) - 1
            else:
                # probably the line from the last message in the conversation
                markers[i] = 't'
        i += 1
    return ''.join(markers)
Example #27
Source File: parse_tlg_indices.py From cltk with MIT License | 5 votes |
def _check_number(_str): """check if the string contains only a number followed by ?""" if regex.match(r'^[0-9]+\?*', _str): return True return False
Example #28
Source File: dataset.py From talon with Apache License 2.0 | 5 votes |
def parse_msg_sender(filename, sender_known=True):
    """Given a filename returns the sender and the message.

    Here the message is assumed to be a whole MIME message or just
    message body.

    >>> sender, msg = parse_msg_sender('msg.eml')
    >>> sender, msg = parse_msg_sender('msg_body')

    If you don't want to consider the sender's name in your classification
    algorithm:

    >>> parse_msg_sender(filename, False)

    Returns a ``(sender, msg)`` tuple; both are None when *filename* is
    not a regular message file.
    """
    import sys
    kwargs = {}
    # On Python 3 open text files as UTF-8; Python 2's open() has no
    # encoding keyword.
    if sys.version_info > (3, 0):
        kwargs["encoding"] = "utf8"
    sender, msg = None, None
    if os.path.isfile(filename) and not is_sender_filename(filename):
        with open(filename, **kwargs) as f:
            msg = f.read()
            sender = u''
            if sender_known:
                # Prefer the companion sender file when it exists.
                sender_filename = build_sender_filename(filename)
                if os.path.exists(sender_filename):
                    with open(sender_filename) as sender_file:
                        sender = sender_file.read().strip()
                else:
                    # if sender isn't found then the next line fails
                    # and it is ok
                    # Fall back to the first 'From:' header inside the message.
                    lines = msg.splitlines()
                    for line in lines:
                        match = re.match('From:(.*)', line)
                        if match:
                            sender = match.group(1)
                            break
    return (sender, msg)
Example #29
Source File: __init__.py From indic_transliteration with MIT License | 5 votes |
def do_vyanjana_svara_join(self, vyanjanaanta, svaraadi):
    """Join a consonant-final chunk with a vowel-initial chunk.

    Drops the final character of *vyanjanaanta* and inserts the dependent
    vowel mark corresponding to the first character of *svaraadi*.

    Raises:
        ValueError: if *svaraadi* does not begin with one of this
            scheme's vowels.
    """
    import regex
    vowel_pattern = "|".join(self['vowels']) + ".*"
    if not regex.match(vowel_pattern, svaraadi):
        raise ValueError(svaraadi + " is not svaraadi.")
    mark = self.vowel_to_mark_map[svaraadi[0]]
    return vyanjanaanta[:-1] + mark + svaraadi[1:]
Example #30
Source File: query.py From cltk with MIT License | 5 votes |
def _regex_span(_regex, _str, case_insensitive=True):
    """Return all matches in an input string.

    :rtype : regex.match.span
    :param _regex: A regular expression pattern.
    :param _str: Text on which to run the pattern.
    """
    # VERSION1 enables the regex module's newer matching behavior;
    # case-insensitive searches additionally use full Unicode casefolding.
    if case_insensitive:
        flags = regex.IGNORECASE | regex.FULLCASE | regex.VERSION1
    else:
        flags = regex.VERSION1
    pattern = regex.compile(_regex, flags=flags)
    yield from pattern.finditer(_str)