Python regex.split() Examples
The following are 30
code examples of regex.split().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module regex, or try the search function.
Example #1
Source File: VSMTagger.py From Ossian with Apache License 2.0 | 6 votes |
def _process_text_line(self, text):
    """Tokenise ``text`` and return it as a normalised, space-joined string.

    The text is split with ``self.tokenisation_pattern``; empty tokens are
    dropped.  When ``self.replace_whitespace`` is truthy, every
    whitespace-only token is replaced by that value.  Tokens are then
    stripped of surrounding spaces, emptied tokens are discarded, and the
    remainder is lowercased and joined with single spaces.
    """
    pieces = [piece for piece in new_regex.split(self.tokenisation_pattern, text)
              if piece != '']

    if self.replace_whitespace:
        pieces = [self.replace_whitespace if piece.isspace() else piece
                  for piece in pieces]

    normalised = []
    for piece in pieces:
        piece = piece.strip(u' ')
        # Tokens that consisted only of spaces would otherwise yield
        # runs of multiple spaces in the joined output.
        if piece != u'':
            normalised.append(piece.lower())
    return ' '.join(normalised)
Example #2
Source File: autosum_arxiv.py From autosum with MIT License | 6 votes |
def search_citing_sentences(aid, txt, match):
    """Collect the sentences of ``txt`` that cite any key of ``match``.

    ``txt`` is whitespace-normalised and split into sentences with
    ``split_sentences``.  For every truthy key ``r`` of ``match`` three
    patterns are tried in order (``\\cite<r>``, ``\\refs{<r>}``, then the
    bare key not preceded by ``bibitem``/``lref``); the first pattern that
    yields results wins for that key.  Progress is printed to stdout.

    :param aid: article identifier, only used in the progress output.
    :param txt: full text to search.
    :param match: mapping whose keys are the reference keys to look for.
    :return: set of everything returned by ``search_citation``.
    """
    # str.split() without arguments already treats newlines as whitespace,
    # so a single pass both joins lines and collapses repeated spaces.
    txt = ' '.join(txt.split())
    sentences = split_sentences(txt)
    founds = set()
    for r in match.keys():
        if not r:
            continue
        regexp_list = [
            regex.escape('\\cite%s' % r),
            regex.escape('\\refs{%s}' % r),
            r'(?<!(bibitem|lref).*?)' + regex.escape('%s' % r),
        ]
        # Formatted explicitly so the output is identical under the
        # Python 2 print statement and the Python 3 print function.
        print("%s %s" % (aid, r))
        for regexp in regexp_list:
            results = search_citation(sentences, regexp)
            founds.update(results)
            print("Regex: '{0!s}', Found: {1:d}".format(regexp, len(results)))
            if len(results):
                break
        print("_" * 50)
    return founds
Example #3
Source File: autosum_arxiv.py From autosum with MIT License | 6 votes |
def get_arxiv_meta_archive(aid):
    """Read title, authors and journal reference for ``aid`` from the archive.

    The first member of ``./kddcup2003/hep-th-abs.tar.gz`` whose name
    contains ``aid`` is read, and its ``Title:``, ``Author(s):`` and
    ``Journal-ref:`` fields are extracted with regular expressions.

    :return: ``(title, authors, jref)``; fields that are not found keep
        their defaults ``''``, ``[]`` and ``''``.
    """
    abstract = ''
    with tarfile.open("./kddcup2003/hep-th-abs.tar.gz", "r:gz") as archive:
        for member in archive.getmembers():
            if aid in member.name:
                abstract = archive.extractfile(member).read()
                break

    # Only the first occurrence of each field is used.
    title = ''
    title_match = regex.search(r'Title:\s+(.*)(?=Author)', abstract, regex.S)
    if title_match:
        title = clean_line(title_match.group(1))

    authors = []
    authors_match = regex.search(r'Authors?:\s+(.*)(?=Comment)', abstract, regex.S)
    if authors_match:
        author_line = clean_line(authors_match.group(1))
        authors = regex.split(r'(?:,\s*(?:and\s+)?|\s+and\s+)', author_line)

    jref = ''
    jref_match = regex.search(r'Journal-ref:\s+(.*?)(?=\\\\)', abstract, regex.S)
    if jref_match:
        jref = clean_line(jref_match.group(1))

    return title, authors, jref
Example #4
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _set_splitters(self, settings=None):
    """Compute and store ``self._splitters``.

    ``capturing`` holds the tokens that are not filtered out of the token
    list after a split (``ALWAYS_KEEP_TOKENS``).  ``wordchars`` holds the
    tokens that split a string only if they are not surrounded by letters
    on both sides: purely non-word tokens from the locale's ``skip`` list
    or the capturing set that also appear in ``self._get_wordchars``.
    """
    capturing = set(ALWAYS_KEEP_TOKENS)
    known_wordchars = self._get_wordchars(settings)
    candidates = set(self.info.get('skip', [])) | capturing

    wordchar_splitters = set()
    for candidate in candidates:
        is_non_word = re.match(r'^\W+$', candidate, re.UNICODE)
        if is_non_word and candidate in known_wordchars:
            wordchar_splitters.add(candidate)

    self._splitters = {
        'wordchars': wordchar_splitters,
        'capturing': capturing,
    }
Example #5
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 6 votes |
def is_applicable(self, date_string, strip_timezone=False, settings=None):
    """
    Check if the locale is applicable to translate date string.

    :param date_string:
        A string representing date and/or time in a recognizably valid format.
    :type date_string: str|unicode

    :param strip_timezone:
        If True, timezone is stripped from date string.
    :type strip_timezone: bool

    :param settings:
        Settings object; its ``NORMALIZE`` attribute is read, so it must
        not be left as ``None``.

    :return: boolean value representing if the locale is applicable for the date string or not.
    """
    candidate = date_string
    if strip_timezone:
        candidate, _ = pop_tz_offset_from_string(candidate, as_offset=False)

    candidate = self._translate_numerals(candidate)
    if settings.NORMALIZE:
        candidate = normalize_unicode(candidate)
    candidate = self._simplify(candidate, settings=settings)

    locale_dictionary = self._get_dictionary(settings)
    return locale_dictionary.are_tokens_valid(locale_dictionary.split(candidate))
Example #6
Source File: iocextract.py From python-iocextract with GNU General Public License v2.0 | 6 votes |
def defang(ioc):
    """Defang a URL, domain, or IPv4 address.

    :param ioc: String URL, domain, or IPv4 address.
    :rtype: str
    """
    # A value that parses with a network location is treated as a URL:
    # only its scheme and netloc are defanged.
    try:
        parts = urlparse(ioc)
        if parts.netloc:
            safe_netloc = parts.netloc.replace('.', '[.]')
            safe_scheme = parts.scheme.replace('t', 'x')
            return parts._replace(netloc=safe_netloc, scheme=safe_scheme).geturl()
    except ValueError:
        pass

    # Otherwise treat it as a domain or IP: defang up to the first slash
    # and keep everything after it unmodified.
    head, slash, tail = ioc.partition('/')
    return head.replace('.', '[.]') + slash + tail
Example #7
Source File: iocextract.py From python-iocextract with GNU General Public License v2.0 | 6 votes |
def _is_ipv6_url(url):
    """URL network location is an IPv6 address, not a domain.

    :param url: String URL
    :rtype: bool
    """
    netloc = urlparse(url).netloc

    # Handle RFC 2732 IPv6 URLs with and without port, as well as
    # non-RFC IPv6 URLs: a "]:" marks a bracketed address followed by a
    # port, so everything after the last colon is dropped.
    if ']:' in netloc:
        candidate = netloc.rsplit(':', 1)[0]
    else:
        candidate = netloc

    bare_address = candidate.replace('[', '').replace(']', '')
    try:
        ipaddress.IPv6Address(unicode(bare_address))
    except ValueError:
        return False
    else:
        return True
Example #8
Source File: iocextract.py From python-iocextract with GNU General Public License v2.0 | 6 votes |
def extract_unencoded_urls(data, refang=False, strip=False):
    """Extract only unencoded URLs.

    :param data: Input text
    :param bool refang: Refang output?
    :param bool strip: Strip possible garbage from the end of URLs
    :rtype: Iterator[:class:`str`]
    """
    matches = itertools.chain(
        GENERIC_URL_RE.finditer(data),
        BRACKET_URL_RE.finditer(data),
        BACKSLASH_URL_RE.finditer(data),
    )

    for match in matches:
        found = match.group(1)
        if refang:
            found = refang_url(found)
        if strip:
            # Keep only the part before the first URL_SPLIT_STR match.
            found = re.split(URL_SPLIT_STR, found)[0]
        yield found
Example #9
Source File: kraken.py From nidaba with GNU General Public License v2.0 | 6 votes |
def setup(*args, **kwargs):
    """Import the kraken modules and populate the model database.

    The kraken modules are bound to module-level globals, ``mod_db`` is
    built from ``nidaba_cfg['ocropus_models']`` and, when a ``modeldata``
    keyword argument is given, extended with every model found through a
    ``*/*/DESCRIPTION`` glob below that directory.  Finally the available
    model names are published in ``ocr_kraken.arg_values['model']``.

    Raises:
        NidabaPluginException: if any of the kraken imports fails.
    """
    try:
        # The imports below rebind these module-level names, so they must
        # be declared global before the import statements run.
        global binarization
        global pageseg
        global rpred
        global models
        global mod_db
        from kraken import binarization
        from kraken import pageseg
        from kraken import rpred
        from kraken.lib import models
        # pronn/clstm models get prioritized over pyrnn ones
        mod_db = {k: storage.get_abs_path(*v) for k, v in nidaba_cfg['ocropus_models'].iteritems()}
        if kwargs.get('modeldata'):
            md = kwargs.get('modeldata')
            # NOTE(review): a list value is resolved through the storage
            # layer; presumably it is a storage tuple -- confirm whether
            # it should be unpacked like the config values above.
            if isinstance(md, list):
                md = storage.get_abs_path(md)
            for model in glob.glob(md + '/*/*/DESCRIPTION'):
                with open(model) as fp:
                    meta = json.load(fp)
                    # Key: name of the directory two levels above the
                    # DESCRIPTION file; value: path of the model file
                    # named in the metadata, next to DESCRIPTION.
                    mod_db[model.split('/')[-2]] = os.path.join(os.path.dirname(model), meta['name'])
        ocr_kraken.arg_values['model'] = mod_db.keys()
    except ImportError as e:
        raise NidabaPluginException(e.message)
Example #10
Source File: data.py From qb with MIT License | 6 votes |
def _split_doc(doc):
    """Return the first paragraph group of ``doc`` as a single string.

    The document is split on runs of newlines; blank paragraphs are
    skipped.  Paragraphs are accumulated until adding the next one would
    push the accumulated length past ``GROUP_LENGTH``, at which point the
    group collected so far is returned.  Only this first group is ever
    returned; later paragraphs are ignored.

    :param doc: document text.
    :return: the space-joined first group, or ``None`` when ``doc``
        contains no non-blank paragraph.
    """
    curr = []
    curr_len = 0
    for split in regex.split(r'\n+', doc):
        split = split.strip()
        if not split:
            continue
        # Stop as soon as the next paragraph would exceed the length limit.
        if curr and curr_len + len(split) > GROUP_LENGTH:
            break
        curr.append(split)
        curr_len += len(split)
    if curr:
        return ' '.join(curr)
    return None
Example #11
Source File: retriever_reader.py From neural_chat with MIT License | 6 votes |
def _split_doc(self, doc): """Given a doc, split it into chunks (by paragraph).""" GROUP_LENGTH = 0 docs = [] curr = [] curr_len = 0 for split in regex.split(r'\n+', doc): split = split.strip() if len(split) == 0: continue # Maybe group paragraphs together until we hit a length limit if len(curr) > 0 and curr_len + len(split) > GROUP_LENGTH: # yield ' '.join(curr) docs.append(' '.join(curr)) curr = [] curr_len = 0 curr.append(split) curr_len += len(split) if len(curr) > 0: # yield ' '.join(curr) docs.append(' '.join(curr)) return docs
Example #12
Source File: utils.py From chepy with GNU General Public License v3.0 | 6 votes |
def split_by(self, pattern: str = "\n", trim=True):
    """Split a string by the given pattern

    Args:
        pattern (str, optional): Regular expression to split by. Defaults to '\\n'.
        trim (bool, optional): Trim whitespace from each part after the
            split. Defaults to True

    Returns:
        Chepy: The Chepy object.
    """
    parts = re.split(pattern, self._convert_to_str())
    self.state = list(map(pydash.trim, parts)) if trim else parts
    return self
Example #13
Source File: scorer.py From nmt-chatbot with GNU General Public License v3.0 | 6 votes |
def ascii_emoticons(index, question, answer):
    """Score ``answer`` by the number of ASCII-emoticon runs it contains.

    A token counts as an emoticon when it is longer than one character and
    its share of non-alphanumeric characters exceeds
    ``score_settings['ascii_emoticon_non_char_to_all_chars_ratio']``.
    Consecutive emoticon tokens count once.  The run count is multiplied by
    ``score_settings['ascii_emoticon_modifier_value']``.

    Sets the global ``valid_emoticon`` to True when the score is non-zero.
    Returns 0 when the modifier is ``None`` (scoring disabled).
    """
    global valid_emoticon
    valid_emoticon = False

    # Disabled
    modifier = score_settings['ascii_emoticon_modifier_value']
    if modifier is None:
        return 0

    def is_emoticon(token):
        if len(token) <= 1:
            return False
        non_alnum = len(re.findall('[^a-zA-Z0-9]', token))
        return non_alnum / len(token) > score_settings['ascii_emoticon_non_char_to_all_chars_ratio']

    # Classify every token first, then count the positions where an
    # emoticon run starts (an emoticon not preceded by another one).
    flags = [is_emoticon(token) for token in answer.split()]
    runs = 0
    previous = False
    for flag in flags:
        if flag and not previous:
            runs += 1
        previous = flag

    score = runs * modifier
    if score:
        valid_emoticon = True
    return score


# Check if sentence includes 'unk' token
Example #14
Source File: build_corpus.py From wordvectors with MIT License | 6 votes |
def sentence_segment(text):
    '''
    Args:
      text: A string. A unsegmented paragraph.

    Returns:
      A list of sentences. For the branches whose pattern contains a
      capture group, the split result also contains the captured
      punctuation or None entries (only the German branch filters those
      out and lowercases the first character of each sentence).
    '''
    global lcode
    if lcode in ['ja', 'zh']:
        sents = regex.split(u"([。!?])?[\n]+|[。!?]", text)
    elif lcode in ['th']:
        # Must be a regex split: str.split() would look for the literal
        # text "[\n]+" and never split anything.
        sents = regex.split("[\n]+", text)
    elif lcode in ['hi', 'bn']:  # hindi, bengali
        sents = regex.split(u"([.।?!])?[\n]+|[.।?!] ", text)
    elif lcode in ['de']:  # german
        sents = regex.split("([.?!])?[\n]+|[.?!] ", text)
        sents = [sent[0].lower() + sent[1:] for sent in sents
                 if sent is not None and len(sent) > 1]
    else:
        sents = regex.split("([.?!])?[\n]+|[.?!] ", text)
    return sents
Example #15
Source File: generate.py From justcopy-backend with MIT License | 6 votes |
def search_docs(inputs, max_ex=5, opts=None):
    """Given a set of document ids (returned by ranking for a question),
    search for top N best matching (by heuristic) paragraphs that contain
    the answer.

    :param inputs: ``(doc_ids, q_tokens, answer)`` triple.
    :param max_ex: maximum number of examples to keep.
    :param opts: options passed through to ``find_answer``; required.
    :return: second element of each retained ``find_answer`` result, in
        heap order.
    :raises RuntimeError: if ``opts`` is missing or empty.
    """
    if not opts:
        raise RuntimeError('Options dict must be supplied.')

    doc_ids, q_tokens, answer = inputs
    heap = []
    for doc_rank, doc_id in enumerate(doc_ids):
        paragraphs = re.split(r'\n+', fetch_text(doc_id))
        for para_rank, paragraph in enumerate(paragraphs):
            found = find_answer(paragraph, q_tokens, answer, opts)
            if not found:
                continue
            # Reverse ranking, giving priority to early docs + paragraphs;
            # the random component breaks remaining ties.
            score = (found[0], -doc_rank, -para_rank, random.random())
            entry = (score, found[1])
            if len(heap) < max_ex:
                heapq.heappush(heap, entry)
            else:
                heapq.heappushpop(heap, entry)
    return [example for _, example in heap]
Example #16
Source File: helpers.py From talon with Apache License 2.0 | 6 votes |
def extract_names(sender):
    """Tries to extract sender's names from `From:` header.

    It could extract not only the actual names but e.g.
    the name of the company, parts of email, etc.

    >>> extract_names('Sergey N. Obukhov <serobnic@mail.ru>')
    ['Sergey', 'Obukhov', 'serobnic']
    >>> extract_names('')
    []
    """
    sender = to_unicode(sender, precise=True)

    # Every non-alphabetical character becomes a word separator.
    letters_only = "".join(ch if ch.isalpha() else ' ' for ch in sender)

    # Drop words that are too short and words from the "black" list,
    # i.e. words like `ru`, `gmail`, `com`, `org`, etc.
    candidates = []
    for word in letters_only.split():
        if len(word) > 1 and word not in BAD_SENDER_NAMES:
            candidates.append(word)

    # Remove duplicates (the resulting order is that of the set).
    return list(set(candidates))
Example #17
Source File: helpers.py From talon with Apache License 2.0 | 6 votes |
def capitalized_words_percent(s):
    '''Returns capitalized words percent.

    Only whitespace-separated words longer than two characters are
    considered.  A word is "valid" when ``INVALID_WORD_START`` does not
    match it, and "capitalized" when its first character is uppercase and
    its second is not.  The result is
    ``100 * capitalized / valid`` as a float, or ``0`` when there is no
    valid word or at most one considered word.
    '''
    s = to_unicode(s, precise=True)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    words = [w for w in re.split(r'\s', s) if w.strip() and len(w) > 2]

    capitalized_words_counter = 0
    valid_words_counter = 0
    for word in words:
        if not INVALID_WORD_START.match(word):
            valid_words_counter += 1
            # Safe to index word[1]: every word has more than two characters.
            if word[0].isupper() and not word[1].isupper():
                capitalized_words_counter += 1

    if valid_words_counter > 0 and len(words) > 1:
        return 100 * float(capitalized_words_counter) / valid_words_counter
    return 0
Example #18
Source File: retriever_reader.py From ParlAI with MIT License | 6 votes |
def add_cmdline_args(argparser):
    """
    Add command-line arguments specifically for this agent.

    Creates the 'RetrieverReader Arguments' group on ``argparser``,
    registers the agent's options on it and returns the group.
    """
    group = argparser.add_argument_group('RetrieverReader Arguments')
    option_specs = (
        ('--retriever-model-file', {'type': str, 'default': None}),
        ('--reader-model-file', {'type': str, 'default': None}),
        (
            '--num-retrieved',
            {'type': int, 'default': 5, 'help': 'how many passages to retrieve'},
        ),
        (
            '--split-paragraphs',
            {
                'type': 'bool',
                'default': True,
                'help': 'Whether to split the retrieved passages into paragraphs',
            },
        ),
    )
    for flag, options in option_specs:
        group.add_argument(flag, **options)
    return group
Example #19
Source File: retriever_reader.py From ParlAI with MIT License | 6 votes |
def _split_doc(self, doc): """ Given a doc, split it into chunks (by paragraph). """ GROUP_LENGTH = 0 docs = [] curr = [] curr_len = 0 for split in regex.split(r'\n+', doc): split = split.strip() if len(split) == 0: continue # Maybe group paragraphs together until we hit a length limit if len(curr) > 0 and curr_len + len(split) > GROUP_LENGTH: # yield ' '.join(curr) docs.append(' '.join(curr)) curr = [] curr_len = 0 curr.append(split) curr_len += len(split) if len(curr) > 0: # yield ' '.join(curr) docs.append(' '.join(curr)) return docs
Example #20
Source File: drqa.py From justcopy-backend with MIT License | 6 votes |
def _split_doc(self, doc): """Given a doc, split it into chunks (by paragraph).""" curr = [] curr_len = 0 for split in regex.split(r'\n+', doc): split = split.strip() if len(split) == 0: continue # Maybe group paragraphs together until we hit a length limit if len(curr) > 0 and curr_len + len(split) > self.GROUP_LENGTH: yield ' '.join(curr) curr = [] curr_len = 0 curr.append(split) curr_len += len(split) if len(curr) > 0: yield ' '.join(curr)
Example #21
Source File: build_corpus.py From wordvectors with MIT License | 6 votes |
def word_segment(sent):
    '''
    Args:
      sent: A string. A sentence.

    Returns:
      A list of words, produced by the tokenizer selected through the
      global language code `lcode`.
    '''
    global lcode
    if lcode in ['ko']:
        return [word for word, _ in kkma.pos(sent)]
    if lcode in ['ja']:
        return mecab.parse(sent.encode('utf8')).split()
    if lcode in ['th']:
        return pythai.split(sent)
    if lcode in ['vi']:
        return ViTokenizer.tokenize(sent).split()
    if lcode in ['zh']:
        return list(jieba.cut(sent, cut_all=False))
    # elif lcode in ['ar']:
    #     words = segmenter.segment(sent).split()
    # Mostly european languages
    return sent.split()
Example #22
Source File: freshness_date_parser.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _are_all_words_units(self, date_string):
    """Return True when every word of ``date_string`` starts with a unit
    or one of the other skippable patterns.

    Words are the non-empty pieces obtained by splitting the
    whitespace-normalised string on non-word characters; each must match
    (from its start) ``_UNITS``, ``ago``, ``in``, digits, ``:`` or
    ``am``/``pm``.
    """
    skip = [_UNITS, r'ago|in|\d+', r':|[ap]m']
    skippable = r'%s' % '|'.join(skip)

    normalised = re.sub(r'\s+', ' ', date_string.strip())
    for word in re.split(r'\W', normalised):
        if word and not re.match(skippable, word):
            return False
    return True
Example #23
Source File: tokenize.py From StrepHit with GNU General Public License v3.0 | 5 votes |
def tokenize(self, sentence):
    """ Tokenize the given sentence.
        You can also pass a generic text, but you will lose the sentence segmentation.

        :param str sentence: a natural language sentence or text to be tokenized
        :return: the list of tokens, without empty ones
        :rtype: list
    """
    raw_tokens = regex.split(self.tokenization_regex, unicode(sentence))
    logger.debug("'%s' tokenized into %s using regex %s" % (sentence, raw_tokens, self.tokenization_regex))

    # Skip empty tokens
    kept = []
    for candidate in raw_tokens:
        if candidate:
            kept.append(candidate)
    return kept
Example #24
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _split_tokens_by_known_words(self, tokens, keep_formatting, settings=None): dictionary = self._get_dictionary(settings) for i, token in enumerate(tokens): tokens[i] = dictionary.split(token, keep_formatting) return list(chain.from_iterable(tokens))
Example #25
Source File: autosum_arxiv.py From autosum with MIT License | 5 votes |
def clean_line(txt):
    """Return ``txt`` as a single line with whitespace runs collapsed.

    Newlines become spaces, repeated whitespace is reduced to one space
    and leading/trailing whitespace is removed.
    """
    single_line = txt.replace('\n', ' ')
    words = single_line.split()
    return ' '.join(words)
Example #26
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _split_tokens_with_regex(self, tokens, regex): tokens = tokens[:] for i, token in enumerate(tokens): tokens[i] = re.split(regex, token) return filter(bool, chain.from_iterable(tokens))
Example #27
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _word_split(self, string, settings): if 'no_word_spacing' in self.info: return self._split(string, keep_formatting=True, settings=settings) else: return string.split()
Example #28
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _sentence_split(self, string, settings): abbreviations = self._get_abbreviations(settings=settings) digit_abbreviations = ['[0-9]'] # numeric date with full stop abbreviation_string = '' for abbreviation in abbreviations: abbreviation_string += '(?<! ' + abbreviation[:-1] + ')' # negative lookbehind if self.shortname in ['fi', 'cs', 'hu', 'de', 'da']: for digit_abbreviation in digit_abbreviations: abbreviation_string += '(?<!' + digit_abbreviation + ')' # negative lookbehind splitters_dict = {1: r'[\.!?;…\r\n]+(?:\s|$)*', # most European, Tagalog, Hebrew, Georgian, # Indonesian, Vietnamese 2: r'(?:[¡¿]+|[\.!?;…\r\n]+(?:\s|$))+', # Spanish 3: r'[|!?;\r\n]+(?:\s|$)+', # Hindi and Bangla 4: r'[。…‥\.!??!;\r\n]+(?:\s|$)+', # Japanese and Chinese 5: r'[\r\n]+', # Thai 6: r'[\r\n؟!\.…]+(?:\s|$)+'} # Arabic and Farsi if 'sentence_splitter_group' not in self.info: split_reg = abbreviation_string + splitters_dict[1] sentences = re.split(split_reg, string) else: split_reg = abbreviation_string + splitters_dict[self.info['sentence_splitter_group']] sentences = re.split(split_reg, string) for i in sentences: if not i: sentences.remove(i) return sentences
Example #29
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _translate_numerals(self, date_string):
    """Rewrite decimal-digit tokens of ``date_string`` via ``int``.

    The string is split with ``NUMERAL_PATTERN``; every token consisting
    only of decimal characters is converted through ``int`` and padded
    with zeros back to its original length.  Byte strings produced along
    the way are decoded as UTF-8 before everything is joined again.
    """
    pieces = []
    for piece in NUMERAL_PATTERN.split(date_string):
        if piece.isdecimal():
            piece = str(int(piece)).zfill(len(piece))
        if isinstance(piece, bytes):
            piece = piece.decode('utf-8')
        pieces.append(piece)
    return u''.join(pieces)
Example #30
Source File: autosum_arxiv.py From autosum with MIT License | 5 votes |
def split_sentences(text):
    """Return the list of sentences found in ``text``.

    Sentences are split on the whitespace that follows end-of-sentence
    punctuation (optionally followed by a quote), except after a fixed
    list of abbreviations.  Matching is case-insensitive.

    Reference:
    http://stackoverflow.com/questions/8465335/a-regex-for-extracting-sentence-from-a-paragraph-in-python
    """
    sentence_boundary = regex.compile(r"""
        # Split sentences on whitespace between them.
        (?:               # Group for two positive lookbehinds.
          (?<=[.!?])      # Either an end of sentence punct,
        | (?<=[.!?]['"])  # or end of sentence punct and quote.
        )                 # End group of two positive lookbehinds.
        (?<!  Mr\.   )    # Don't end sentence on "Mr."
        (?<!  Mrs\.  )    # Don't end sentence on "Mrs."
        (?<!  Jr\.   )    # Don't end sentence on "Jr."
        (?<!  Dr\.   )    # Don't end sentence on "Dr."
        (?<!  Prof\. )    # Don't end sentence on "Prof."
        (?<!  Sr\.   )    # Don't end sentence on "Sr."
        (?<!  Sen\.  )
        (?<!  Ms\.   )
        (?<!  Rep\.  )
        (?<!  Gov\.  )
        (?<!  et\ al\.  )
        (?<!  i\.e\.  )
        (?<!  U\.S\.  )
        (?<!  p\.  )      # Don't end sentence on "p." (page)
        \s+               # Split on whitespace between sentences.
        """, regex.IGNORECASE | regex.VERBOSE)
    return sentence_boundary.split(text)