Python re.U Examples

The following code examples show how to use the re.U (re.UNICODE) flag. They are extracted from open-source Python projects.
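
For orientation, here is a minimal, self-contained illustration of what the flag does. Under Python 2, \w, \b and friends are ASCII-only unless re.U is set; under Python 3, str patterns already match per the Unicode tables, so re.U is a no-op there.

import re

# Python 2:
#   re.findall(r'\w+', u'naïve café')        -> [u'na', u've', u'caf']
#   re.findall(r'\w+', u'naïve café', re.U)  -> [u'naïve', u'café']
print(re.findall(r'\w+', u'naïve café', re.U))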

Example 1
Project: gransk   Author: pcbje   File: find_entities.py    (Apache License 2.0)
def setup(self, config):
    """
    Compile configured regular expressions.

    :param config: Configuration object.
    :type config: ``dict``
    """
    self.matches = {}

    patterns = []

    for entity_type, pattern_conf in config.get(helper.ENTITIES, {}).items():
      patterns.append(
          r'\b(?P<{}>{})\b'.format(entity_type, pattern_conf[helper.PATTERN]))

    self.pattern = regex.compile(
        '|'.join(patterns),
        regex.I | regex.U) 
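
The example above uses the third-party regex module, but the same named-group alternation works with the standard library's re module; a minimal sketch (the entity names and patterns below are made up for illustration):

import re

patterns = {
    'email': r'\S+@\S+\.\w+',
    'phone': r'\+?\d[\d \-]{7,}\d',
}
pattern = re.compile(
    '|'.join(r'\b(?P<{}>{})\b'.format(name, pat)
             for name, pat in patterns.items()),
    re.I | re.U)

m = pattern.search(u'mail me at jane@example.com')
print(m.lastgroup, m.group(0))  # email jane@example.com
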
Example 2
Project: SerpScrap   Author: ecoron   File: urlscrape.py    (MIT License)
def split_into_sentences(text):
    potential_end_pat = re.compile(r"".join([
        r"([\w\.'’&\]\)]+[\.\?!])",  # A word that ends with punctuation
        r"([‘’“”'\"\)\]]*)",  # Followed by optional quote/parens/etc
        r"(\s+(?![a-z\-–—]))",  # Followed by whitespace + non-(lowercase or dash)
        ]),
        re.U
    )
    dot_iter = re.finditer(potential_end_pat, text)
    end_indices = [
        (x.start() + len(x.group(1)) + len(x.group(2)))
        for x in dot_iter
        if is_sentence_ender(x.group(1))
    ]
    spans = zip([None] + end_indices, end_indices + [None])
    sentences = [
        text[start:end].strip() for start, end in spans
    ]
    return sentences 
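
For context, a quick driver for the function above. The original module defines is_sentence_ender elsewhere, so the stub below is an assumption, not the project's logic:

import re

def is_sentence_ender(word):
    # naive stand-in: any word ending in terminal punctuation
    return word[-1] in '.?!'

print(split_into_sentences("It works. Really well!"))
# ['It works.', 'Really well!']
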
Example 3
Project: mm-randbot   Author: arvego   File: vk_utils.py    (license)
def replace_wiki_links(text, raw_link=False):
    """
    ?????? ????-?????? ???? '[user_id|link_text]' ?? ??????????? HTML
    :param text: ????? ??? ?????????
    :param raw_link: ?????? ?????? ?????? ????-??????
    """
    link_format = "{1} (vk.com/{0})" if raw_link else "<a href=\"https://vk.com/{0}\">{1}</a>"
    pattern = re.compile(r"\[([^|]+)\|([^|]+)\]", re.U)
    results = pattern.findall(text)  # re.U is already baked into the compiled pattern; a positional arg here would be read as `pos`
    for i in results:
        user_id = i[0]
        link_text = i[1]
        before = "[{0}|{1}]".format(user_id, link_text)
        after = link_format.format(user_id, link_text)
        text = text.replace(before, after)
    return text 
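
With the function defined above (and `import re` at module scope), usage looks like:

text = "Thanks [id123|Alice] and [club45|Our Group]!"
print(replace_wiki_links(text))
# Thanks <a href="https://vk.com/id123">Alice</a> and <a href="https://vk.com/club45">Our Group</a>!
print(replace_wiki_links(text, raw_link=True))
# Thanks Alice (vk.com/id123) and Our Group (vk.com/club45)!
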
Example 4
Project: doork   Author: AeonDave   File: search.py    (license)
def _extract_info(self, soup):
        empty_info = {'from': 0, 'to': 0, 'total': 0}
        div_ssb = soup.find('div', id='ssb')
        if not div_ssb:
            self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup)
            return empty_info
        p = div_ssb.find('p')
        if not p:
            self._maybe_raise(ParseError, """<p> tag within <div id="ssb"> was not found on Google search page""", soup)
            return empty_info
        txt = ''.join(p.findAll(text=True))
        txt = txt.replace(',', '')
        matches = re.search(r'%s (\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U)
        if not matches:
            return empty_info
        return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))} 
Example 5
Project: doork   Author: AeonDave   File: search.py    (license)
def _html_unescape(self, str):
        def entity_replacer(m):
            entity = m.group(1)
            if entity in name2codepoint:
                return unichr(name2codepoint[entity])
            else:
                return m.group(0)

        def ascii_replacer(m):
            cp = int(m.group(1))
            if cp <= 255:
                return unichr(cp)
            else:
                return m.group(0)

        # re.U must be passed as the flags keyword: the fourth positional
        # argument of re.sub is `count`, not `flags`.
        s = re.sub(r'&#(\d+);', ascii_replacer, str, flags=re.U)
        return re.sub(r'&([^;]+);', entity_replacer, s, flags=re.U)
Example 6
Project: doork   Author: AeonDave   File: search.py    (license)
def _extract_info(self, soup):
        empty_info = {'from': 0, 'to': 0, 'total': 0}
        td_rsb = soup.find('td', 'rsb')
        if not td_rsb:
            self._maybe_raise(ParseError, "Td with number of results was not found on Blogs search page", soup)
            return empty_info
        font = td_rsb.find('font')
        if not font:
            self._maybe_raise(ParseError, """<p> tag within <tr class='rsb'> was not found on Blogs search page""", soup)
            return empty_info
        txt = ''.join(font.findAll(text=True))
        txt = txt.replace(',', '')
        matches = None  # guard against self.hl being neither 'es' nor 'en'
        if self.hl == 'es':
            matches = re.search(r'Resultados (\d+) - (\d+) de (?:aproximadamente )?(\d+)', txt, re.U)
        elif self.hl == 'en':
            matches = re.search(r'Results (\d+) - (\d+) of (?:about )?(\d+)', txt, re.U)
        if not matches:
            return empty_info
        return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))} 
Example 7
Project: doork   Author: AeonDave   File: sponsoredlinks.py    (license)
def _html_unescape(self, str):
        def entity_replacer(m):
            entity = m.group(1)
            if entity in name2codepoint:
                return unichr(name2codepoint[entity])
            else:
                return m.group(0)

        def ascii_replacer(m):
            cp = int(m.group(1))
            if cp <= 255:
                return unichr(cp)
            else:
                return m.group(0)

        # re.U must be passed as the flags keyword: the fourth positional
        # argument of re.sub is `count`, not `flags`.
        s = re.sub(r'&#(\d+);', ascii_replacer, str, flags=re.U)
        return re.sub(r'&([^;]+);', entity_replacer, s, flags=re.U)
Example 8
Project: topic-ensemble   Author: derekgreene   File: util.py    (license)
def preprocess_simple( docs, stopwords, min_df = 3, min_term_length = 2, ngram_range = (1,1), apply_tfidf = True, apply_norm = True ):
	"""
	Preprocess a list containing text documents stored as strings, where the documents have already been tokenized and are separated by whitespace
	"""
	token_pattern = re.compile(r"[\s\-]+", re.U)

	def custom_tokenizer( s ):
		return [x.lower() for x in token_pattern.split(s) if (len(x) >= min_term_length) ]

	# Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
	if apply_norm:
		norm_function = "l2"
	else:
		norm_function = None
	tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode", tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function, min_df = min_df, ngram_range = ngram_range) 
	X = tfidf.fit_transform(docs)
	terms = []
	# store the vocabulary map
	v = tfidf.vocabulary_
	for i in range(len(v)):
		terms.append("")
	for term in v.keys():
		terms[ v[term] ] = term
	return (X,terms) 
Example 9
Project: itchatRobot   Author: benkris1   File: MyRobot.py    (license)
def text_reply(msg):
    #logger.info(json.dumps(msg).decode("unicode_escape"))
    if msg['isAt'] and myRobot.groupOnline:
        match = re.match("@\S+\s+?(.*)",msg['Content'],re.U)
        temp = match.group(1) if match else msg['Content']
        logger.info(temp)
        reply = tuLing.reply(temp,msg["FromUserName"])
        logger.info(u"%s group %s msg :[%s],reply:[%s]",msg["User"].get("NickName"),msg['ActualNickName'],msg["Text"],reply)
        itchat.send(u'@%s\u2005 ??: %s' % (msg['ActualNickName'], reply), msg['FromUserName']) 
Example 10
Project: flinck   Author: Kraymer   File: brain.py    (license)
def search_filename(fname, fields):
    """Extract movie title/date from filename and return dict with movies infos
    """
    path_tokens = os.path.normpath(fname).split(os.sep)
    candidate = path_tokens[-1]
    res = re.split(FNAME_SPLIT_RE, candidate,
                   flags=re.I | re.U)[0].strip()
    res = scrub(res, '[({])}', ' ')
    res = ' '.join([x for x in re.split(r'[\s\._]', res, flags=re.U) if x])
    years = re.findall(r'((?:19|20)\d\d)', res)
    if years:
        toks = re.split(r'(%s)' % years[-1], res)
    else:
        toks = [res]
    title = toks[0].strip()
    year = toks[1] if len(toks) > 1 else None
    item = search_by(title, year, fields)
    if item:
        item['filename'] = fname
        return item 
Example 11
Project: tv_grab_es_movistartv   Author: MovistarTV   File: tv_grab_es_movistartv.py    (license)
def __get_series_data(program, ext_info):
        episode = int(program['episode'])
        season = int(program['season'])
        desc = ext_info['synopsis'] if ext_info else u'Año: %s' % program['year']
        if season == 0:
            sn = re.findall(r'.*\sT(\d*/?\d+).*', program['full_title'], re.U)
            season = int(sn[0].replace('/', '')) if sn else season
        if 'episode_title' in program:
            title = program['serie']
            stitle = '%ix%02d %s' % (season, episode, program['episode_title'])
        else:
            title = re.findall(r'(.*)\sT\d*/?\d+.*', program['full_title'], re.U)
            title = title[0] if title else program['full_title']
            stitle = '%ix%02d %s' % (
                season, episode, ext_info['originalTitle']
                if ext_info and 'originalTitle' in ext_info else 'Episodio %i' % episode
            )
        return {
            'title': title,
            'sub-title': stitle,
            'season': season if season > 0 else '',
            'episode': episode,
            'desc': desc
        } 
Example 12
Project: oil   Author: oilshell   File: test_re.py    (license)
def test_ignore_case(self):
        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
        self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
        self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
        self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
        self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
        self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
        self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
        self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")

        if have_unicode:
            assert u(r'\u212a').lower() == u'k' # 'K'
            self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
            self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
            assert u(r'\u017f').upper() == u'S' # 'ſ'
            self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
            self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I)) 
Example 13
Project: oil   Author: oilshell   File: test_re.py    (license)
def test_ignore_case_set(self):
        self.assertTrue(re.match(r'[19A]', 'A', re.I))
        self.assertTrue(re.match(r'[19a]', 'a', re.I))
        self.assertTrue(re.match(r'[19a]', 'A', re.I))
        self.assertTrue(re.match(r'[19A]', 'a', re.I))
        if have_unicode:
            self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
            self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
            self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
            self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
            assert u(r'\u212a').lower() == u'k' # 'K'
            self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
            assert u(r'\u017f').upper() == u'S' # 'ſ'
            self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I)) 
Example 14
Project: python2-tracer   Author: extremecoders-re   File: test_re.py    (license)
def test_ignore_case(self):
        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
        self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
        self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
        self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
        self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
        self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
        self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
        self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")

        if have_unicode:
            assert u(r'\u212a').lower() == u'k' # 'K'
            self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
            self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
            assert u(r'\u017f').upper() == u'S' # 'ſ'
            self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
            self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I)) 
Example 15
Project: python2-tracer   Author: extremecoders-re   File: test_re.py    (license)
def test_ignore_case_set(self):
        self.assertTrue(re.match(r'[19A]', 'A', re.I))
        self.assertTrue(re.match(r'[19a]', 'a', re.I))
        self.assertTrue(re.match(r'[19a]', 'A', re.I))
        self.assertTrue(re.match(r'[19A]', 'a', re.I))
        if have_unicode:
            self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
            self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
            self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
            self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
            assert u(r'\u212a').lower() == u'k' # 'K'
            self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
            assert u(r'\u017f').upper() == u'S' # 'ſ'
            self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I)) 
Example 16
Project: web_ctp   Author: molebot   File: test_re.py    (license)
def test_sre_character_class_literals(self):
        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
            if i < 256:
                self.assertIsNotNone(re.match(r"[\%o]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\%o8]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\%03o]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\%03o0]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\%03o8]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\x%02x]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\x%02x0]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\x%02xz]" % i, chr(i)))
            if i < 0x10000:
                self.assertIsNotNone(re.match(r"[\u%04x]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\u%04x0]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\u%04xz]" % i, chr(i)))
            self.assertIsNotNone(re.match(r"[\U%08x]" % i, chr(i)))
            self.assertIsNotNone(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
            self.assertIsNotNone(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
        self.assertIsNotNone(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
        self.assertRaises(re.error, re.match, r"[\911]", "")
        self.assertRaises(re.error, re.match, r"[\x1z]", "")
        self.assertRaises(re.error, re.match, r"[\u123z]", "")
        self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
        self.assertRaises(re.error, re.match, r"[\U00110000]", "") 
Example 17
Project: web_ctp   Author: molebot   File: test_re.py    (license)
def test_sre_byte_literals(self):
        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
            self.assertIsNotNone(re.match((r"\%03o" % i).encode(), bytes([i])))
            self.assertIsNotNone(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
            self.assertIsNotNone(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
            self.assertIsNotNone(re.match((r"\x%02x" % i).encode(), bytes([i])))
            self.assertIsNotNone(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
            self.assertIsNotNone(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
        self.assertIsNotNone(re.match(br"\u", b'u'))
        self.assertIsNotNone(re.match(br"\U", b'U'))
        self.assertIsNotNone(re.match(br"\0", b"\000"))
        self.assertIsNotNone(re.match(br"\08", b"\0008"))
        self.assertIsNotNone(re.match(br"\01", b"\001"))
        self.assertIsNotNone(re.match(br"\018", b"\0018"))
        self.assertIsNotNone(re.match(br"\567", bytes([0o167])))
        self.assertRaises(re.error, re.match, br"\911", b"")
        self.assertRaises(re.error, re.match, br"\x1", b"")
        self.assertRaises(re.error, re.match, br"\x1z", b"") 
Example 18
Project: plugin.video.skystreaming   Author: Ideneal   File: utility.py    (license)
def m3u2list(data):
    """convert an m3u data to a list"""
    matches = re.compile('^#EXTINF:-?[0-9]*(.*?),(.*?)\n(.*?)$', re.I + re.M + re.U + re.S).findall(data)
    li = []
    for params, display_name, url in matches:
        item_data = {'params': params, 'display_name': display_name, 'url': url}
        li.append(item_data)

    playlist = []
    for channel in li:
        item_data = {'display_name': channel['display_name'], 'url': channel['url']}
        matches = re.compile(' (.+?)="(.+?)"', re.I + re.M + re.U + re.S).findall(channel['params'])
        for field, value in matches:
            item_data[field.strip().lower().replace('-', '_')] = value.strip()
        playlist.append(item_data)
    return playlist 
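
A quick usage sketch with a tiny M3U fragment (assuming `import re` at module scope):

sample = (
    '#EXTM3U\n'
    '#EXTINF:-1 tvg-id="sky1" group-title="News",Sky News\n'
    'http://example.com/stream1\n'
)
print(m3u2list(sample))
# [{'display_name': 'Sky News', 'url': 'http://example.com/stream1',
#   'tvg_id': 'sky1', 'group_title': 'News'}]
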
Example 19
Project: pefile.pypy   Author: cloudtracer   File: test_re.py    (license)
def test_ignore_case(self):
        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
        self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
        self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
        self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
        self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
        self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
        self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
        self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")

        if have_unicode:
            assert u(r'\u212a').lower() == u'k' # 'K'
            self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
            self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
            assert u(r'\u017f').upper() == u'S' # 'ſ'
            self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
            self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I)) 
Example 20
Project: pefile.pypy   Author: cloudtracer   File: test_re.py    (license)
def test_ignore_case_set(self):
        self.assertTrue(re.match(r'[19A]', 'A', re.I))
        self.assertTrue(re.match(r'[19a]', 'a', re.I))
        self.assertTrue(re.match(r'[19a]', 'A', re.I))
        self.assertTrue(re.match(r'[19A]', 'a', re.I))
        if have_unicode:
            self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
            self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
            self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
            self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
            assert u(r'\u212a').lower() == u'k' # 'K'
            self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
            assert u(r'\u017f').upper() == u'S' # 'ſ'
            self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I)) 
Example 21
Project: dbs-back   Author: Beit-Hatfutsot   File: phonetic.py    (license)
def is_hebrew(string):
    'A hacky way to check if our string is in Hebrew - check the first char'
    # Drop digits from the string
    string = re.sub('\d', '', string)
    # Drop special characters from the string
    string = re.sub('\W', '', string, flags = re.U)
    # Strip the string
    string = string.strip()
    # Support empty strings
    if not string:
        return None
    # Make sure the string is UTF-8
    if type(string) != unicode:
        string = string.decode('utf-8')
    HEBREW_AB = u'אבגדהוזחטיכךלמםנןסעפףצץקרשת'
    if string[0] in HEBREW_AB:
        return True
    else:
        return False 
Example 22
Project: ouroboros   Author: pybee   File: test_re.py    (license)
def test_sre_character_class_literals(self):
        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
            if i < 256:
                self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
                self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
                self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
                self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
            if i < 0x10000:
                self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
                self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
                self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
            self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
            self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
        self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
        self.assertRaises(re.error, re.match, r"[\911]", "")
        self.assertRaises(re.error, re.match, r"[\x1z]", "")
        self.assertRaises(re.error, re.match, r"[\u123z]", "")
        self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
        self.assertRaises(re.error, re.match, r"[\U00110000]", "") 
Example 23
Project: ouroboros   Author: pybee   File: test_re.py    (license)
def test_sre_byte_literals(self):
        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
            self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
            self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
            self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
            self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
        self.assertTrue(re.match(br"\u", b'u'))
        self.assertTrue(re.match(br"\U", b'U'))
        self.assertTrue(re.match(br"\0", b"\000"))
        self.assertTrue(re.match(br"\08", b"\0008"))
        self.assertTrue(re.match(br"\01", b"\001"))
        self.assertTrue(re.match(br"\018", b"\0018"))
        self.assertTrue(re.match(br"\567", bytes([0o167])))
        self.assertRaises(re.error, re.match, br"\911", b"")
        self.assertRaises(re.error, re.match, br"\x1", b"")
        self.assertRaises(re.error, re.match, br"\x1z", b"") 
Example 24
Project: completor.vim   Author: maralla   File: omni.py    (license)
def has_omnifunc(self, ft):
        if ft not in self.trigger_cache:
            name = '{}_omni_trigger'.format(ft)
            option = self.get_option(name)
            if not option:
                return False

            try:
                self.trigger_cache[ft] = re.compile(
                    to_unicode(option, 'utf-8'), re.X | re.U)
            except Exception:
                return False

        try:
            return bool(vim.current.buffer.options['omnifunc'])
        except vim.error:
            return False 
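
A minimal standalone sketch of the idea: compiling a user-supplied trigger with re.X | re.U lets the option be written with whitespace and comments (the option text below is illustrative, not from completor.vim):

import re

option = r"""
    \w+     # an identifier...
    \.$     # ...followed by a trailing dot, e.g. "obj."
"""
trigger = re.compile(option, re.X | re.U)
print(bool(trigger.search(u"obj.")))  # True
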
Example 25
Project: kbe_server   Author: xiaohaoppy   File: test_re.py    (license)
def test_sre_character_class_literals(self):
        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
            if i < 256:
                self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
                self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
                self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
                self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
            if i < 0x10000:
                self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
                self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
                self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
            self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
            self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
        self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
        self.assertRaises(re.error, re.match, r"[\911]", "")
        self.assertRaises(re.error, re.match, r"[\x1z]", "")
        self.assertRaises(re.error, re.match, r"[\u123z]", "")
        self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
        self.assertRaises(re.error, re.match, r"[\U00110000]", "") 
Example 26
Project: kbe_server   Author: xiaohaoppy   File: test_re.py    (license)
def test_sre_byte_literals(self):
        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
            self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
            self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
            self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
            self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
        self.assertTrue(re.match(br"\u", b'u'))
        self.assertTrue(re.match(br"\U", b'U'))
        self.assertTrue(re.match(br"\0", b"\000"))
        self.assertTrue(re.match(br"\08", b"\0008"))
        self.assertTrue(re.match(br"\01", b"\001"))
        self.assertTrue(re.match(br"\018", b"\0018"))
        self.assertTrue(re.match(br"\567", bytes([0o167])))
        self.assertRaises(re.error, re.match, br"\911", b"")
        self.assertRaises(re.error, re.match, br"\x1", b"")
        self.assertRaises(re.error, re.match, br"\x1z", b"") 
Example 27
Project: QTodoTxt2   Author: QTodoTxt   File: filters.py    (license)
def compile(searchString):
        r"""
        Return the user's searchString compiled to a regular expression.

        Example terms: @call +work (A) carrots
        Term may be prefixed with ! or ~ for negation.
        Terms may be combined with "," or " " (AND) or with "|" (OR).
        Terms only match the beginning of a word in the task.
        Terms are case-insensitive.
        Expressions may NOT be nested with parentheses.
        Only \-character special regular expression sets are allowed, everything else is escaped.
        """
        if not searchString:
            return None

        terms = SimpleTextFilter._splitter.split(searchString)
        terms = [SimpleTextFilter._term2re(term) for term in terms]

        return re.compile("".join(terms), re.I | re.U) 
Example 28
Project: formpack   Author: kobotoolbox   File: string.py    (license)
def slugify(string, separator=r'-'):
    r"""
    Slugify a unicode string using unicodedata to normalize the string.
    :Example:
        >>> slugify(u"H\xe9ll\xf8 W\xc3\xb6rld")
        'hell-world'
        >>> slugify("Bonjour, tout l'monde !", separator="_")
        'bonjour_tout_lmonde'
        >>> slugify("\tStuff with -- dashes and...   spaces   \n")
        'stuff-with-dashes-and-spaces'
    """

    string = normalize(string)
    string = re.sub(r'[^\w\s' + separator + ']', '', string, flags=re.U)
    string = string.strip().lower()
    return re.sub(r'[' + separator + '\s]+', separator, string, flags=re.U) 
Example 29
Project: t-hoarder_kit   Author: congosto   File: tweets_entity.py    (license)
def token_words (self,source):
    list_words=[]
    source_without_urls=u''
    # remove urls from the tweet
    urls=re.findall (r'(http[s]*://\S+)', source,re.U)
    for url in urls:
      start=source.find(url)
      end=len(url)
      source_without_urls=source_without_urls+source[0:start-1]
      source=source[start+end:] 
    source_without_urls=source_without_urls+source
    list_tokens=re.findall (r'[#@]*\w+', source_without_urls,re.U) 
    # remove users and hashtags
    for token in list_tokens:
      if (token.find('#') == -1) and (token.find('@') == -1):
        number= re.search(r'\d+',token)
        if not number:
          token=token.lower()
          list_words.append(token)
    return list_words 
Example 30
Project: t-hoarder_kit   Author: congosto   File: tweets_entity.py    (license)
def set_user_mention_day(self,date,text):
    list_mentions=re.findall (r'@\w+', text)
    if len (list_mentions) >0:
      user=list_mentions[0]
      if re.match(r'[\.]*(@\w+)[^\t\n]+',text):
        if user in self.top_users_reply:
          index= self.top_users_reply.index(user)
          self.dict_top_users_reply_day.store(date,index,1)
      elif re.match('[rt[\s]*(@\w+)[:]*',text,re.U):
        if user in self.top_users_RT:
          index= self.top_users_RT.index(user)
          self.dict_top_users_RT_day.store(date,index,1)
      for user in list_mentions:
        if user in self.top_users_mention:
          index= self.top_users_mention.index(user)
          self.dict_top_users_mention_day.store(date,index,1)
    return 
Example 31
Project: t-hoarder_kit   Author: congosto   File: tweets_entity.py    (license)
def get_tweet (tweet):
   data = tweet.split('\t')
   if len (data) >= 10:
     id_tweet = data[0]
     timestamp = data[1]
     date_hour =re.findall(r'(\d\d\d\d)-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)',timestamp,re.U)
     (year,month,day,hour,minutes,seconds) = date_hour[0]
     author= data[2]
     text = data[3]
     app = data[4]
     user_id = data[6]
     followers = data[6]
     following = data[7]
     statuses = data[8]
     loc = data[9]
     return (year,month,day,hour,minutes,seconds, author,text,app,user_id,followers,following,statuses,loc)
   else:
     print ' tweet not match'
     return None 
Example 32
Project: t-hoarder_kit   Author: congosto   File: users_types.py    (license)
def get_tweet (tweet):
   data = tweet.split('\t')
   if len (data) >= 8:
     id_tweet = data[0]
     timestamp = data[1]
     date_hour =re.findall(r'(\d\d\d\d)-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)',timestamp,re.U)
     (year,month,day,hour,minutes,seconds) = date_hour[0]
     author= data[2]
     text = data[3]
     app = data[4]
     id_user = data[5]
     followers = data[6]
     following = data [7]
     return (id_tweet,year,month,day,hour,minutes,seconds, author,text,app,id_user,followers,following)
   else:
     print ' tweet not match'
     return None 
Example 33
Project: t-hoarder_kit   Author: congosto   File: tweets_spread.py    (license)
def get_tweet_source (text):
  source=None
  text_aux=text
  start=text_aux.find('RT')
  while  start !=  -1:
    #print start
    text=text_aux[start:]
    #print text
    RT = re.match('[RT[\s]*(@\w+)[:]*',text,re.U)
    if RT:
      source=RT.group(1)
      text_aux=text[len(RT.group(0)):]
      #print text_aux
      #print source
      start=text_aux.find('RT')
    else:
      break
  return (source, text_aux) 
Example 34
Project: sentiment-domain-adaptation   Author: cmward   File: tokenizer.py    (license)
def __init__(self):
        self.emoticon_pattern = r'[=:;]-?\s?[\)\(D]'
        self.repeated_pattern = re.compile(r'(.)\1{2,}')
        self.user_pattern = re.compile(r'@+[\w_]+')
        self.hashtag_pattern = re.compile(r'\#+[\w_]+[\w\'_\-]*[\w_]+')
        self.url_pattern = re.compile(
            r'(https?:\/\/(www\.)?)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.' + \
            r'[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)')
        word_pattern = r"""
            (?:<[^>]+>)                    # HTML tags
            |
            (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
            |
            (?:[\w_\-']+)                  # Words
            |
            (?:\.(?:\s*\.){1,})            # Ellipsis dots.
            |
            (?:\S)                         # Everything else that isn't whitespace.
            """
        self.word_pattern = re.compile(word_pattern,
                                       re.VERBOSE | re.I | re.U) 
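
To see the verbose pattern in action, the same pattern can be compiled standalone and run over a sample string:

import re

word_pattern = re.compile(r"""
    (?:<[^>]+>)                    # HTML tags
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_\-']+)                  # Words
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """, re.VERBOSE | re.I | re.U)
print(word_pattern.findall(u"c'était <b>great</b> ... 10:30"))
# ["c'était", '<b>', 'great', '</b>', '...', '10:30']
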
Example 35
Project: Albireo   Author: lordfriend   File: import.py    (license)
def __parse_episode_number(self, eps_title):
        '''
        parse the episode number from an episode title. It uses a list of regular
        expressions; the position in the list is the priority of the regular expression.
        :param eps_title: the title of episode.
        :return: episode number if matched, otherwise, -1
        '''
        try:
            for regex in episode_regex_tuple:
                search_result = re.search(regex, eps_title, re.U | re.I)
                if search_result is not None:
                    return int(search_result.group(1))

            return -1
        except Exception:
            return -1 
Example 36
Project: Albireo   Author: lordfriend   File: AbstractScanner.py    (license)
def parse_episode_number(self, eps_title):
        '''
        parse the episode number from an episode title. It uses a list of regular
        expressions; the position in the list is the priority of the regular expression.
        :param eps_title: the title of episode.
        :return: episode number if matched, otherwise, -1
        '''
        try:
            for regex in episode_regex_tuple:
                search_result = re.search(regex, eps_title, re.U | re.I)
                if search_result is not None:
                    matched_number = int(search_result.group(1))
                    if self.bangumi.eps_no_offset is not None:
                        matched_number = matched_number + self.bangumi.eps_no_offset
                    return matched_number

            return -1
        except Exception as error:
            logger.warn(error)
            return -1 
Example 37
Project: Albireo   Author: lordfriend   File: feed.py    (license)
def parse_episode_number(self, eps_title):
        '''
        parse the episode number from an episode title. It uses a list of regular
        expressions; the position in the list is the priority of the regular expression.
        :param eps_title: the title of episode.
        :return: episode number if matched, otherwise, -1
        '''
        try:
            for regex in episode_regex_tuple:
                search_result = re.search(regex, eps_title, re.U | re.I)
                if search_result is not None:
                    return int(search_result.group(1))

            return -1
        except Exception:
            return -1 
Example 38
Project: pythainlp   Author: PyThaiNLP   File: __init__.py    (license)
def normalize(text):
    """
    ??????????????????????????????????
    normalize(text)
    ?????? str
    ????????
    >>> print(normalize("?????")=="????") # ? ? ? ? ? ??? ????
    True
    """
    if six.PY2:
        for data in rule2py2:
            text=re.sub(data[0].replace(u"t",u"[????]"),data[1],text,re.U)
    else:
        for data in rule2:
            text=re.sub(data[0].replace("t","[????]"),data[1],text,re.U)
    for data in list(zip(rule1,rule1)):
        text=re.sub(data[0].replace(u"t",u"[????]")+"+",data[1],text,re.U)
    return text 
Example 39
Project: kinect-2-libras   Author: inessadl   File: textwrap.py    (Apache License 2.0)
def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True,
                 break_on_hyphens=True):
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens

        # recompile the regexes for Unicode mode -- done in this clumsy way for
        # backwards compatibility because it's rather common to monkey-patch
        # the TextWrapper class' wordsep_re attribute.
        self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
        self.wordsep_simple_re_uni = re.compile(
            self.wordsep_simple_re.pattern, re.U)


    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override) 
Example 40
Project: earthy   Author: alvations   File: __init__.py    (Apache License 2.0)
def __init__(self):
        # Initialize the standard TreebankWordTokenizer.
        super(self.__class__, self).__init__()
        # Adding to TreebankWordTokenizer, the splits on
        # - chevron quotes u'\xab' and u'\xbb'.
        # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
        improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
        improved_close_quote_regex = re.compile(u'([»”’])', re.U)
        improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
        self.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
        self.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
        self.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 ')) 
Example 41
Project: idol   Author: nondanee   File: tools.py    (GNU General Public License v3.0)
def translate(original):#original <type 'unicode'>
    waittrans = re.findall(u'[?-?|?-?|?-?|?|?]+',original,re.U)
    findnum = len(waittrans)
    subnum = 0
    waitfill = original
    while(findnum!=subnum):
        waitfill = re.sub(u'[?-?|?-?|?-?|?|?]+',"%s",waitfill,re.U)
        subnum = len(re.findall('%s',waitfill))
    # if len(re.findall('%',waitfill)) != subnum:
    waitfill = re.sub(u'%(?!s)','?'.decode("utf-8"),waitfill)
    
    filltext=[]
    print "workload",len(waittrans)
    for line in waittrans:     
              
        if line in { u"?" : "", u"?" : ""}:
            filltext.append(line)
            continue

        send = line.encode("utf-8") 
        gettrans = baidufanyi(send)

        if re.search(u"[???]",gettrans[-1]):
            gettrans = gettrans[0:-1]              
           
        filltext.append(gettrans)

    translation = waitfill %tuple(filltext)
    translation = re.sub("?".decode("utf-8"),'%',translation)
    return translation 
Example 42
Project: Qkou_kit   Author: pddg   File: stream.py    (MIT License)
def tweetassembler(**args):
    in_reply_to_status = args['in_reply_to_status']
    if in_reply_to_status is not None:
        regex = u'.*??.*'
        if re.match(regex, in_reply_to_status.text, re.U):
            # get the ID of the status this one is replying to
            id = in_reply_to_status.in_reply_to_status_id
            # fetch that status
            qkou_status = api.get_status(id)
            entities = qkou_status.entities['hashtags']
            # if the status carries hashtags
            if len(entities) > 0:
                hashtag = entities[0]['text']
                # extract the item number from the hashtag
                info_num = re.search("(?<=lec)[0-9]*", hashtag)
                news_num = re.search("(?<=news)[0-9]*", hashtag)
                if info_num is not None:
                    qkou_id = info_num.group()
                    log.debug("[ Stream ] Info??????")
                    dm_text = get_info(qkou_id)
                elif news_num is not None:
                    news_id = news_num.group()
                    log.debug("[ Stream ] News??????")
                    dm_text = get_news(news_id)
                else:
                    pass
                try:
                    api.send_direct_message(
                        user_id=in_reply_to_status.user.id, text=dm_text)
                    log.debug('[ Stream ] DM sent')
                except Exception as e:
                    log.exception(e)
            else:
                pass 
Example 43
Project: sublime-text-3-packages   Author: nickjj   File: WordCount.py    (MIT License)
def load(self):
		Pref.view                   = False
		Pref.elapsed_time           = 0.4
		Pref.running                = False

		Pref.wrdRx                  = re.compile(s.get('word_regexp', "^[^\w]?`*\w+[^\w]*$"), re.U)
		Pref.wrdRx                  = Pref.wrdRx.match
		Pref.splitRx                = s.get('word_split', None)
		if Pref.splitRx:
			Pref.splitRx            = re.compile(Pref.splitRx, re.U)
			Pref.splitRx            = Pref.splitRx.findall

		Pref.enable_live_count      = s.get('enable_live_count', True)
		Pref.enable_readtime        = s.get('enable_readtime', False)
		Pref.enable_line_word_count = s.get('enable_line_word_count', False)
		Pref.enable_line_char_count = s.get('enable_line_char_count', False)
		Pref.enable_count_lines     = s.get('enable_count_lines', False)
		Pref.enable_count_chars     = s.get('enable_count_chars', False)
		Pref.enable_count_pages     = s.get('enable_count_pages', True)

		Pref.words_per_page         = s.get('words_per_page', 300)
		Pref.page_count_mode_count_words = s.get('page_count_mode_count_words', True)
		Pref.char_ignore_whitespace = s.get('char_ignore_whitespace', True)
		Pref.readtime_wpm           = s.get('readtime_wpm', 200)
		Pref.whitelist              = [x.lower() for x in s.get('whitelist_syntaxes', []) or []]
		Pref.blacklist              = [x.lower() for x in s.get('blacklist_syntaxes', []) or []]
		Pref.strip                  = s.get('strip', [])

		for window in sublime.windows():
			for view in window.views():
				view.erase_status('WordCount')
				view.settings().erase('WordCount') 
Example 44
Project: abusehelper   Author: Exploit-install   File: iprange.py    (license)
def __init__(self, version, pattern):
        self._version = version

        self._ip_rex = re.compile(r"(" + pattern + r")", re.U | re.I)
        self._cidr_rex = re.compile(r"\s*/\s*(\d{1,5})", re.U | re.I)
        self._range_rex = re.compile(r"\s*-\s*(" + pattern + r")", re.U | re.I) 
Example 45
Project: abusehelper   Author: Exploit-install   File: test_atoms.py    (license)
def test_from_re(self):
        # re.U and re.S flags are implicitly set
        self.assertEqual(RegExp.from_re(re.compile("a", re.U)), RegExp("a"))
        self.assertEqual(RegExp.from_re(re.compile("a", re.S)), RegExp("a"))

        # re.I flag can be set explicitly
        self.assertEqual(
            RegExp.from_re(re.compile("a", re.I)),
            RegExp("a", ignore_case=True))

        # re.M, re.L and re.X are forbidden
        for flag in [re.M, re.L, re.X]:
            self.assertRaises(ValueError, RegExp.from_re, re.compile("a", flag)) 
Example 46
Project: abusehelper   Author: Exploit-install   File: atoms.py    (license)
def init(self, pattern, ignore_case=False):
        Atom.init(self)

        flags = re.U | re.S | (re.I if ignore_case else 0)
        self._regexp = re.compile(pattern, flags) 
Example 47
Project: abusehelper   Author: Exploit-install   File: rulelang.py    (license)
def format_regexp(format, regexp):
    escape_slash_rex = re.compile(r"((?:^|[^\\])(?:\\\\)*?)(/+)", re.U)

    def escape_slash(match):
        return match.group(1) + match.group(2).replace("/", "\\/")

    pattern = regexp.pattern
    pattern = escape_slash_rex.sub(escape_slash, pattern)

    result = "/" + pattern + "/"
    if regexp.ignore_case:
        result += "i"
    yield result 
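
Since the `regexp` argument here is the project's own atom type (with .pattern and .ignore_case attributes), a stand-in object is enough to exercise the function; the namedtuple below is purely illustrative:

from collections import namedtuple

FakeRegExp = namedtuple('FakeRegExp', ['pattern', 'ignore_case'])
print(''.join(format_regexp(None, FakeRegExp(r'ab/cd', True))))
# /ab\/cd/i
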
Example 48
Project: abusehelper   Author: Exploit-install   File: message.py    (license)
def escape_whitespace(unicode_string):
    r"""
    Return the given unicode string with the whitespace escaped
    using 'unicode-escape' encoding.

    >>> escape_whitespace(u"space is not escaped")
    u'space is not escaped'

    >>> escape_whitespace(u"multi\nline\nwith\ttabs")
    u'multi\\nline\\nwith\\ttabs'
    """

    return re.sub(r"\s", lambda x: unicode(x.group(0).encode("unicode-escape")), unicode_string, re.U) 
Example 49
Project: CodingDojo   Author: ComputerSocietyUNB   File: text.py    (license)
def slugify(value, allow_unicode=False):
    """
    Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens.
    Remove characters that aren't alphanumerics, underscores, or hyphens.
    Convert to lowercase. Also strip leading and trailing whitespace.
    """
    value = force_text(value)
    if allow_unicode:
        value = unicodedata.normalize('NFKC', value)
        value = re.sub('[^\w\s-]', '', value, flags=re.U).strip().lower()
        return mark_safe(re.sub('[-\s]+', '-', value, flags=re.U))
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub('[^\w\s-]', '', value).strip().lower()
    return mark_safe(re.sub('[-\s]+', '-', value)) 
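
The Unicode branch in isolation (standalone, without Django's helpers):

import re

value = u'Héllo Wörld'
value = re.sub(r'[^\w\s-]', '', value, flags=re.U).strip().lower()
print(re.sub(r'[-\s]+', '-', value, flags=re.U))  # héllo-wörld
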
Example 50
Project: CodingDojo   Author: ComputerSocietyUNB   File: html.py    (license)
def remove_tags(html, tags):
    """Returns the given HTML with given tags removed."""
    warnings.warn(
        "django.utils.html.remove_tags() and the removetags template filter "
        "are deprecated. Consider using the bleach library instead.",
        RemovedInDjango110Warning, stacklevel=3
    )
    tags = [re.escape(tag) for tag in tags.split()]
    tags_re = '(%s)' % '|'.join(tags)
    starttag_re = re.compile(r'<%s(/?>|(\s+[^>]*>))' % tags_re, re.U)
    endtag_re = re.compile('</%s>' % tags_re)
    html = starttag_re.sub('', html)
    html = endtag_re.sub('', html)
    return html