Python regex.split() Examples

The following are 30 code examples of regex.split(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the regex module, or try the search function.
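As a quick reference before the project examples, here is a minimal sketch of how regex.split() behaves (the patterns and strings below are illustrative only):

import regex  # third-party module (pip install regex); its split() mirrors re.split()

text = "alpha, beta;gamma  delta"
# Split on commas, semicolons, or runs of whitespace.
print(regex.split(r"[,;\s]+", text))
# -> ['alpha', 'beta', 'gamma', 'delta']

# With a capturing group in the pattern, the captured separators are kept in the result.
print(regex.split(r"([,;])", "a,b;c"))
# -> ['a', ',', 'b', ';', 'c']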
Example #1
Source File: VSMTagger.py    From Ossian with Apache License 2.0
def _process_text_line(self, text):            

        split_text = [token for token in new_regex.split(self.tokenisation_pattern, text) \
                            if token != '']
        if self.replace_whitespace:
            new_text = []
            for token in split_text:
                if token.isspace():
                    new_text.append(self.replace_whitespace)                        
                else:
                    new_text.append(token)  
            split_text = new_text
        
        split_text = [token.strip(u' ') for token in split_text]  ## strip surrounding spaces
        split_text = [token for token in split_text if token != u'']  ## drop empty tokens
        split_text = [token.lower() for token in split_text]     ## lowercase
        text = ' '.join(split_text) 
        return text 
Example #2
Source File: autosum_arxiv.py    From autosum with MIT License
def search_citing_sentences(aid, txt, match):
    lines = txt.split('\n')
    txt = ' '.join(lines)
    txt = ' '.join(txt.split())
    sentences = split_sentences(txt)
    founds = set()
    for r in match.keys():
        if r:
            regexp_list = [regex.escape(r'\cite%s' % r),
                           regex.escape(r'\refs{%s}' % r),
                           r'(?<!(bibitem|lref).*?)' + regex.escape('%s' % r)]
            print(aid, r)
            for regexp in regexp_list:
                results = search_citation(sentences, regexp)
                founds.update(results)
                print("Regex: '{0!s}', Found: {1:d}".format(regexp, len(results)))
                if len(results):
                    break
    print("_" * 50)
    return founds 
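The third pattern above uses a variable-length negative lookbehind, (?<!(bibitem|lref).*?), which the third-party regex module accepts but the standard-library re module rejects. A small sketch of the difference (pattern and strings are illustrative):

import re
import regex

pattern = r"(?<!code:\s*)\b\d+\b"   # variable-length lookbehind

try:
    re.split(pattern, "id: 42, code: 42")
except re.error as err:
    print("re rejects it:", err)    # look-behind requires fixed-width pattern

# regex accepts it and only splits on numbers not preceded by "code: "
print(regex.split(pattern, "id: 42, code: 42"))
# -> ['id: ', ', code: 42']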
Example #3
Source File: autosum_arxiv.py    From autosum with MIT License
def get_arxiv_meta_archive(aid):
    title = ''
    authors = []
    jref = ''
    txt = ''
    with tarfile.open("./kddcup2003/hep-th-abs.tar.gz", "r:gz") as t:
        for m in t.getmembers():
            if m.name.find(aid) != -1:
                txt = t.extractfile(m).read()
                break
    for m in regex.finditer(r'Title:\s+(.*)(?=Author)', txt, regex.S):
        title = clean_line(m.group(1))
        break
    for m in regex.finditer(r'Authors?:\s+(.*)(?=Comment)', txt, regex.S):
        a = clean_line(m.group(1))
        authors = regex.split(r'(?:,\s*(?:and\s+)?|\s+and\s+)', a)
        break
    for m in regex.finditer(r'Journal-ref:\s+(.*?)(?=\\\\)', txt, regex.S):
        jref = clean_line(m.group(1))
        break

    return title, authors, jref 
Example #4
Source File: locale.py    From dateparser with BSD 3-Clause "New" or "Revised" License
def _set_splitters(self, settings=None):
        splitters = {
            'wordchars': set(),  # The ones that split string only if they are not surrounded by letters from both sides
            'capturing': set(),  # The ones that are not filtered out from tokens after split
        }
        splitters['capturing'] |= set(ALWAYS_KEEP_TOKENS)

        wordchars = self._get_wordchars(settings)
        skip = set(self.info.get('skip', [])) | splitters['capturing']
        for token in skip:
            if not re.match(r'^\W+$', token, re.UNICODE):
                continue
            if token in wordchars:
                splitters['wordchars'].add(token)

        self._splitters = splitters 
Example #5
Source File: locale.py    From dateparser with BSD 3-Clause "New" or "Revised" License
def is_applicable(self, date_string, strip_timezone=False, settings=None):
        """
        Check if the locale is applicable to translate date string.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str|unicode

        :param strip_timezone:
            If True, timezone is stripped from date string.
        :type strip_timezone: bool

        :return: boolean value representing if the locale is applicable for the date string or not.
        """
        if strip_timezone:
            date_string, _ = pop_tz_offset_from_string(date_string, as_offset=False)

        date_string = self._translate_numerals(date_string)
        if settings.NORMALIZE:
            date_string = normalize_unicode(date_string)
        date_string = self._simplify(date_string, settings=settings)
        dictionary = self._get_dictionary(settings)
        date_tokens = dictionary.split(date_string)

        return dictionary.are_tokens_valid(date_tokens) 
Example #6
Source File: iocextract.py    From python-iocextract with GNU General Public License v2.0
def defang(ioc):
    """Defang a URL, domain, or IPv4 address.

    :param ioc: String URL, domain, or IPv4 address.
    :rtype: str
    """
    # If it's a url, defang just the scheme and netloc.
    try:
        parsed = urlparse(ioc)
        if parsed.netloc:
            parsed = parsed._replace(netloc=parsed.netloc.replace('.', '[.]'),
                                     scheme=parsed.scheme.replace('t', 'x'))
            return parsed.geturl()
    except ValueError:
        pass

    # If it's a domain or IP, defang up to the first slash.
    split_list = ioc.split('/')
    defanged = split_list[0].replace('.', '[.]')
    # Include everything after the first slash without modification.
    if len(split_list) > 1:
        defanged = '/'.join([defanged] + split_list[1:])

    return defanged 
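Assuming the defang function above is in scope (with urlparse imported from urllib.parse), an illustrative call looks like this; outputs are shown as comments:

print(defang('http://example.com/path'))   # hxxp://example[.]com/path
print(defang('8.8.8.8/admin'))             # 8[.]8[.]8[.]8/admin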
Example #7
Source File: iocextract.py    From python-iocextract with GNU General Public License v2.0
def _is_ipv6_url(url):
    """URL network location is an IPv6 address, not a domain.

    :param url: String URL
    :rtype: bool
    """
    # Fix urlparse exception.
    parsed = urlparse(url)

    # Handle RFC 2732 IPv6 URLs with and without port, as well as non-RFC IPv6 URLs.
    if ']:' in parsed.netloc:
        ipv6 = ':'.join(parsed.netloc.split(':')[:-1])
    else:
        ipv6 = parsed.netloc

    try:
        ipaddress.IPv6Address(unicode(ipv6.replace('[', '').replace(']', '')))
    except ValueError:
        return False

    return True 
Example #8
Source File: iocextract.py    From python-iocextract with GNU General Public License v2.0
def extract_unencoded_urls(data, refang=False, strip=False):
    """Extract only unencoded URLs.

    :param data: Input text
    :param bool refang: Refang output?
    :param bool strip: Strip possible garbage from the end of URLs
    :rtype: Iterator[:class:`str`]
    """
    unencoded_urls = itertools.chain(
        GENERIC_URL_RE.finditer(data),
        BRACKET_URL_RE.finditer(data),
        BACKSLASH_URL_RE.finditer(data),
    )
    for url in unencoded_urls:
        if refang:
            url = refang_url(url.group(1))
        else:
            url = url.group(1)

        if strip:
            url = re.split(URL_SPLIT_STR, url)[0]

        yield url 
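When strip is set, the code above keeps only the part of the URL before the first separator matched by URL_SPLIT_STR. The pattern below is a hypothetical stand-in (iocextract's real URL_SPLIT_STR may differ), just to show the split-and-take-the-first-piece idiom:

import re

URL_SPLIT_STR = r"""["'>),]"""   # hypothetical separator class, not the library's actual pattern
url = 'http://example.com/payload.exe",next_field'
print(re.split(URL_SPLIT_STR, url)[0])   # http://example.com/payload.exe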
Example #9
Source File: kraken.py    From nidaba with GNU General Public License v2.0
def setup(*args, **kwargs):
    try:
        global binarization
        global pageseg
        global rpred
        global models
        global mod_db
        from kraken import binarization
        from kraken import pageseg
        from kraken import rpred
        from kraken.lib import models
        # pronn/clstm models get prioritized over pyrnn ones
        mod_db = {k: storage.get_abs_path(*v) for k, v in nidaba_cfg['ocropus_models'].iteritems()}
        if kwargs.get('modeldata'):
            md = kwargs.get('modeldata')
            if isinstance(md, list):
                md = storage.get_abs_path(md)
            for model in glob.glob(md + '/*/*/DESCRIPTION'):
                with open(model) as fp:
                    meta = json.load(fp)
                    mod_db[model.split('/')[-2]] = os.path.join(os.path.dirname(model), meta['name'])
        ocr_kraken.arg_values['model'] = mod_db.keys()

    except ImportError as e:
        raise NidabaPluginException(e.message) 
Example #10
Source File: data.py    From qb with MIT License
def _split_doc(doc):
    """Given a doc, split it into chunks (by paragraph)."""
    curr = []
    curr_len = 0
    for split in regex.split(r'\n+', doc):
        split = split.strip()
        if len(split) == 0:
            continue
        # Maybe group paragraphs together until we hit a length limit
        if len(curr) > 0 and curr_len + len(split) > GROUP_LENGTH:
            yield ' '.join(curr)
            curr = []
            curr_len = 0
        curr.append(split)
        curr_len += len(split)
    if len(curr) > 0:
        yield ' '.join(curr)
Example #11
Source File: retriever_reader.py    From neural_chat with MIT License
def _split_doc(self, doc):
        """Given a doc, split it into chunks (by paragraph)."""
        GROUP_LENGTH = 0
        docs = []
        curr = []
        curr_len = 0
        for split in regex.split(r'\n+', doc):
            split = split.strip()
            if len(split) == 0:
                continue
            # Maybe group paragraphs together until we hit a length limit
            if len(curr) > 0 and curr_len + len(split) > GROUP_LENGTH:
                # yield ' '.join(curr)
                docs.append(' '.join(curr))
                curr = []
                curr_len = 0
            curr.append(split)
            curr_len += len(split)
        if len(curr) > 0:
            # yield ' '.join(curr)
            docs.append(' '.join(curr))
        return docs 
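These paragraph splitters rely on regex.split(r'\n+', doc) plus an empty-string check, because splitting on runs of newlines can leave empty pieces at the edges. A minimal sketch:

import regex

doc = "First paragraph.\n\nSecond paragraph.\nStill the second.\n"
print(regex.split(r'\n+', doc))
# -> ['First paragraph.', 'Second paragraph.', 'Still the second.', '']
# The trailing '' is why the loop above skips pieces with len(split) == 0.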
Example #12
Source File: utils.py    From chepy with GNU General Public License v3.0
def split_by(self, pattern: str = "\n", trim=True):
        """Split a string by the given pattern
        
        Args:
            pattern (str, optional): Pattern to split by. Defaults to '\\n'.
            trim (bool, optional): Trim whitespace after split. Defaults to True.
        
        Returns:
            Chepy: The Chepy object.
        """
        if trim:
            self.state = list(
                map(pydash.trim, re.split(pattern, self._convert_to_str()))
            )
        else:
            self.state = re.split(pattern, self._convert_to_str())
        return self 
Example #13
Source File: scorer.py    From nmt-chatbot with GNU General Public License v3.0
def ascii_emoticons(index, question, answer):
    global valid_emoticon

    valid_emoticon = False

    # Disabled
    if score_settings['ascii_emoticon_modifier_value'] is None:
        return 0

    # Split by words (tokens)
    tokens = answer.split()

    # Calculate emoticon score
    score = [
        1 if len(token) > 1
        and len(re.findall('[^a-zA-Z0-9]', token)) / len(token) > score_settings['ascii_emoticon_non_char_to_all_chars_ratio']
        else 0
        for token in tokens
    ]
    score = sum([
        1 if (index > 0 and score[index - 1] == 0 and value == 1) or (index == 0 and value == 1) else 0
        for index, value in enumerate(score)
    ]) * score_settings['ascii_emoticon_modifier_value']

    if score:
        valid_emoticon = True

    return score

Example #14
Source File: build_corpus.py    From wordvectors with MIT License
def sentence_segment(text):
    '''
    Args:
      text: A string. An unsegmented paragraph.
    
    Returns:
      A list of sentences.
    '''
    global lcode
    if lcode in ['ja', 'zh']:
        sents = regex.split(u"([。!?])?[\n]+|[。!?]", text) 
    elif lcode in ['th']:
        sents = regex.split("[\n]+", text) 
    elif lcode in ['hi', 'bn']: # hindi, bengali
        sents = regex.split(u"([.।?!])?[\n]+|[.।?!] ", text)
    elif lcode in ['de']: # german
        sents = regex.split("([.?!])?[\n]+|[.?!] ", text)
        sents = [sent[0].lower() + sent[1:] for sent in sents if sent is not None and len(sent) > 1]
    else:
        sents = regex.split("([.?!])?[\n]+|[.?!] ", text)
    return sents 
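Because these split patterns contain an optional capturing group, regex.split keeps the captured punctuation in the result and inserts None where the group did not participate in a match, which is what the "sent is not None" filter in the German branch guards against. A small illustration:

import regex

print(regex.split("([.?!])?[\n]+|[.?!] ", "One. Two!\nThree"))
# -> ['One', None, 'Two', '!', 'Three']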
Example #15
Source File: generate.py    From justcopy-backend with MIT License
def search_docs(inputs, max_ex=5, opts=None):
    """Given a set of document ids (returned by ranking for a question), search
    for top N best matching (by heuristic) paragraphs that contain the answer.
    """
    if not opts:
        raise RuntimeError('Options dict must be supplied.')

    doc_ids, q_tokens, answer = inputs
    examples = []
    for i, doc_id in enumerate(doc_ids):
        for j, paragraph in enumerate(re.split(r'\n+', fetch_text(doc_id))):
            found = find_answer(paragraph, q_tokens, answer, opts)
            if found:
                # Reverse ranking, giving priority to early docs + paragraphs
                score = (found[0], -i, -j, random.random())
                if len(examples) < max_ex:
                    heapq.heappush(examples, (score, found[1]))
                else:
                    heapq.heappushpop(examples, (score, found[1]))
    return [e[1] for e in examples] 
Example #16
Source File: helpers.py    From talon with Apache License 2.0
def extract_names(sender):
    """Tries to extract sender's names from `From:` header.

    It could extract not only the actual names but e.g.
    the name of the company, parts of email, etc.

    >>> extract_names('Sergey N.  Obukhov <serobnic@mail.ru>')
    ['Sergey', 'Obukhov', 'serobnic']
    >>> extract_names('')
    []
    """
    sender = to_unicode(sender, precise=True)
    # Remove non-alphabetical characters
    sender = "".join([char if char.isalpha() else ' ' for char in sender])
    # Remove too short words and words from "black" list i.e.
    # words like `ru`, `gmail`, `com`, `org`, etc.
    sender = [word for word in sender.split() if len(word) > 1 and
              word not in BAD_SENDER_NAMES]
    # Remove duplicates
    names = list(set(sender))
    return names 
Example #17
Source File: helpers.py    From talon with Apache License 2.0
def capitalized_words_percent(s):
    '''Returns capitalized words percent.'''
    s = to_unicode(s, precise=True)
    words = re.split(r'\s', s)
    words = [w for w in words if w.strip()]
    words = [w for w in words if len(w) > 2]    
    capitalized_words_counter = 0
    valid_words_counter = 0
    for word in words:
        if not INVALID_WORD_START.match(word):
            valid_words_counter += 1
            if word[0].isupper() and not word[1].isupper():
                capitalized_words_counter += 1
    if valid_words_counter > 0 and len(words) > 1:
        return 100 * float(capitalized_words_counter) / valid_words_counter

    return 0 
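Note that re.split(r'\s', s) splits on every single whitespace character, so consecutive spaces yield empty strings; that is why the function filters with w.strip(). Compare with str.split():

import re

s = "Hello  World\tagain"
print(re.split(r'\s', s))   # ['Hello', '', 'World', 'again']
print(s.split())            # ['Hello', 'World', 'again']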
Example #18
Source File: retriever_reader.py    From ParlAI with MIT License
def add_cmdline_args(argparser):
        """
        Add command-line arguments specifically for this agent.
        """
        agent = argparser.add_argument_group('RetrieverReader Arguments')
        agent.add_argument('--retriever-model-file', type=str, default=None)
        agent.add_argument('--reader-model-file', type=str, default=None)
        agent.add_argument(
            '--num-retrieved', type=int, default=5, help='how many passages to retrieve'
        )
        agent.add_argument(
            '--split-paragraphs',
            type='bool',
            default=True,
            help='Whether to split the retrieved passages into ' 'paragraphs',
        )
        return agent 
Example #19
Source File: retriever_reader.py    From ParlAI with MIT License
def _split_doc(self, doc):
        """
        Given a doc, split it into chunks (by paragraph).
        """
        GROUP_LENGTH = 0
        docs = []
        curr = []
        curr_len = 0
        for split in regex.split(r'\n+', doc):
            split = split.strip()
            if len(split) == 0:
                continue
            # Maybe group paragraphs together until we hit a length limit
            if len(curr) > 0 and curr_len + len(split) > GROUP_LENGTH:
                # yield ' '.join(curr)
                docs.append(' '.join(curr))
                curr = []
                curr_len = 0
            curr.append(split)
            curr_len += len(split)
        if len(curr) > 0:
            # yield ' '.join(curr)
            docs.append(' '.join(curr))
        return docs 
Example #20
Source File: drqa.py    From justcopy-backend with MIT License
def _split_doc(self, doc):
        """Given a doc, split it into chunks (by paragraph)."""
        curr = []
        curr_len = 0
        for split in regex.split(r'\n+', doc):
            split = split.strip()
            if len(split) == 0:
                continue
            # Maybe group paragraphs together until we hit a length limit
            if len(curr) > 0 and curr_len + len(split) > self.GROUP_LENGTH:
                yield ' '.join(curr)
                curr = []
                curr_len = 0
            curr.append(split)
            curr_len += len(split)
        if len(curr) > 0:
            yield ' '.join(curr) 
Example #21
Source File: build_corpus.py    From wordvectors with MIT License
def word_segment(sent):
    '''
    Args:
      sent: A string. A sentence.
    
    Returns:
      A list of words.
    '''
    global lcode
    if lcode in ['ko']:
        words = [word for word, _ in kkma.pos(sent)]
    elif lcode in ['ja']:
        words = mecab.parse(sent.encode('utf8')).split() 
    elif lcode in ['th']:
        words = pythai.split(sent)
    elif lcode in ['vi']:
        words = ViTokenizer.tokenize(sent).split()        
    elif lcode in ['zh']:
        words = list(jieba.cut(sent, cut_all=False)) 
#     elif lcode in ['ar']:
#         words = segmenter.segment(sent).split()
    else: # Mostly european languages
        words = sent.split()
    
    return words 
Example #22
Source File: freshness_date_parser.py    From dateparser with BSD 3-Clause "New" or "Revised" License
def _are_all_words_units(self, date_string):
        skip = [_UNITS,
                r'ago|in|\d+',
                r':|[ap]m']

        date_string = re.sub(r'\s+', ' ', date_string.strip())

        words = filter(lambda x: x if x else False, re.split(r'\W', date_string))
        words = filter(lambda x: not re.match(r'%s' % '|'.join(skip), x), words)
        return not list(words) 
Example #23
Source File: tokenize.py    From StrepHit with GNU General Public License v3.0
def tokenize(self, sentence):
        """ Tokenize the given sentence.
            You can also pass a generic text, but you will lose the sentence segmentation.

            :param str sentence: a natural language sentence or text to be tokenized
            :return: the list of tokens
            :rtype: list
        """
        tokens = regex.split(self.tokenization_regex, unicode(sentence))
        logger.debug("'%s' tokenized into %s using regex %s" % (sentence, tokens, self.tokenization_regex))
        # Skip empty tokens
        return [token for token in tokens if token] 
Example #24
Source File: locale.py    From dateparser with BSD 3-Clause "New" or "Revised" License
def _split_tokens_by_known_words(self, tokens, keep_formatting, settings=None):
        dictionary = self._get_dictionary(settings)
        for i, token in enumerate(tokens):
            tokens[i] = dictionary.split(token, keep_formatting)
        return list(chain.from_iterable(tokens)) 
Example #25
Source File: autosum_arxiv.py    From autosum with MIT License
def clean_line(txt):
    # join multiple lines
    txt = txt.replace('\n', ' ')
    # remove multiple spaces
    txt = ' '.join(txt.split())
    return txt 
Example #26
Source File: locale.py    From dateparser with BSD 3-Clause "New" or "Revised" License
def _split_tokens_with_regex(self, tokens, regex):
        tokens = tokens[:]
        for i, token in enumerate(tokens):
            tokens[i] = re.split(regex, token)
        return filter(bool, chain.from_iterable(tokens)) 
Example #27
Source File: locale.py    From dateparser with BSD 3-Clause "New" or "Revised" License
def _word_split(self, string, settings):
        if 'no_word_spacing' in self.info:
            return self._split(string, keep_formatting=True, settings=settings)
        else:
            return string.split() 
Example #28
Source File: locale.py    From dateparser with BSD 3-Clause "New" or "Revised" License
def _sentence_split(self, string, settings):
        abbreviations = self._get_abbreviations(settings=settings)
        digit_abbreviations = ['[0-9]']  # numeric date with full stop
        abbreviation_string = ''

        for abbreviation in abbreviations:
            abbreviation_string += '(?<! ' + abbreviation[:-1] + ')'  # negative lookbehind
        if self.shortname in ['fi', 'cs', 'hu', 'de', 'da']:
            for digit_abbreviation in digit_abbreviations:
                abbreviation_string += '(?<!' + digit_abbreviation + ')'  # negative lookbehind

        splitters_dict = {1: r'[\.!?;…\r\n]+(?:\s|$)*',  # most European, Tagalog, Hebrew, Georgian,
                          # Indonesian, Vietnamese
                          2: r'(?:[¡¿]+|[\.!?;…\r\n]+(?:\s|$))+',  # Spanish
                          3: r'[|!?;\r\n]+(?:\s|$)+',  # Hindi and Bangla
                          4: r'[。…‥\.!??!;\r\n]+(?:\s|$)+',  # Japanese and Chinese
                          5: r'[\r\n]+',  # Thai
                          6: r'[\r\n؟!\.…]+(?:\s|$)+'}  # Arabic and Farsi
        if 'sentence_splitter_group' not in self.info:
            split_reg = abbreviation_string + splitters_dict[1]
            sentences = re.split(split_reg, string)
        else:
            split_reg = abbreviation_string + splitters_dict[self.info['sentence_splitter_group']]
            sentences = re.split(split_reg, string)

        sentences = [sentence for sentence in sentences if sentence]
        return sentences 
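The chained negative lookbehinds built from the abbreviation list keep the splitter from breaking right after an abbreviation. A simplified sketch with a hypothetical two-entry abbreviation list (stdlib re suffices here because each individual lookbehind is fixed-width):

import re

# roughly what the loop above would build from the abbreviations ['mr.', 'dr.']
split_reg = '(?<! mr)(?<! dr)' + r'[\.!?;…\r\n]+(?:\s|$)*'
print(re.split(split_reg, 'see mr. smith. he arrived today.'))
# -> ['see mr. smith', 'he arrived today', '']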
Example #29
Source File: locale.py    From dateparser with BSD 3-Clause "New" or "Revised" License
def _translate_numerals(self, date_string):
        date_string_tokens = NUMERAL_PATTERN.split(date_string)
        for i, token in enumerate(date_string_tokens):
            if token.isdecimal():
                date_string_tokens[i] = str(int(token)).zfill(len(token))
                if isinstance(date_string_tokens[i], bytes):
                    date_string_tokens[i] = date_string_tokens[i].decode('utf-8')
        return u''.join(date_string_tokens) 
Example #30
Source File: autosum_arxiv.py    From autosum with MIT License
def split_sentences(text):
    """Returns split sentences list
       Reference:
       http://stackoverflow.com/questions/8465335/a-regex-for-extracting-
              sentence-from-a-paragraph-in-python
    """
    sentenceEnders = regex.compile(r"""
        # Split sentences on whitespace between them.
        (?:               # Group for two positive lookbehinds.
          (?<=[.!?])      # Either an end of sentence punct,
        | (?<=[.!?]['"])  # or end of sentence punct and quote.
        )                 # End group of two positive lookbehinds.
        (?<!  Mr\.   )    # Don't end sentence on "Mr."
        (?<!  Mrs\.  )    # Don't end sentence on "Mrs."
        (?<!  Jr\.   )    # Don't end sentence on "Jr."
        (?<!  Dr\.   )    # Don't end sentence on "Dr."
        (?<!  Prof\. )    # Don't end sentence on "Prof."
        (?<!  Sr\.   )    # Don't end sentence on "Sr."
        (?<!  Sen\.  )
        (?<!  Ms\.   )
        (?<!  Rep\.  )
        (?<!  Gov\.  )
        (?<!  et\ al\.  )
        (?<!  i\.e\.  )
        (?<!  U\.S\.  )
        (?<!  p\.  )      # Don't end sentence on "p." (page)
        \s+               # Split on whitespace between sentences.
        """, regex.IGNORECASE | regex.VERBOSE)
    sentenceList = sentenceEnders.split(text)
    return sentenceList
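Assuming the split_sentences function above is in scope, a quick illustrative check:

print(split_sentences("Dr. Smith arrived. He spoke to Mr. Jones! All went well."))
# -> ['Dr. Smith arrived.', 'He spoke to Mr. Jones!', 'All went well.']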