Python re.L Examples

The following are 30 code examples that use the re.L flag (an alias of re.LOCALE). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module re, or try the search function.
Example #1
Source File: Bayes.py    From weiboanalysis with Apache License 2.0 7 votes vote down vote up
def loadDataSet(path):
    """Load a labelled Weibo corpus and tokenise every post.

    Each usable line starts with an integer label in its first two
    characters; the rest of the line is the post text, which is segmented
    with jieba.

    Args:
        path: Path of the UTF-8 encoded corpus file.

    Returns:
        A ``(line_cut, label)`` tuple. ``line_cut[i]`` is the list of distinct
        tokens kept for the i-th usable line and ``label[i]`` is its integer
        label; both lists always have the same length.
    """
    # re.LOCALE is rejected for str patterns on Python 3.6+ (ValueError).
    # re.ASCII restricts \w to ASCII letters, digits and underscore, so only
    # tokens made purely of those characters are dropped below.
    ascii_word = re.compile(r'\w', re.ASCII)
    line_cut = []
    label = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = line.strip()
            try:
                tag = int(temp[:2])  # leading label
            except ValueError:
                continue  # no numeric label: skip the line entirely
            sentence = temp[2:].lstrip().replace('\u200b', '')  # post text
            word_list = []
            for word in jieba.cut(sentence.strip()):
                result = ascii_word.sub("", word)
                if not result or result == ' ':  # nothing but ASCII word chars / blank
                    continue
                word_list.append(word)
            word_list = list(set(word_list) - set(stop) - set('\u200b')
                             - set(' ') - set('\u3000') - set('️'))
            # Append both together so the two result lists can never drift apart.
            line_cut.append(word_list)
            label.append(tag)
    return line_cut, label  # tokens and label of every usable post
Example #2
Source File: Bayes.py    From weiboanalysis with Apache License 2.0 7 votes vote down vote up
def loadDataSet(path):
    """Load a labelled Weibo corpus and tokenise every post.

    Each usable line starts with an integer label in its first two
    characters; the rest of the line is the post text, which is segmented
    with jieba.

    Args:
        path: Path of the UTF-8 encoded corpus file.

    Returns:
        A ``(line_cut, label)`` tuple. ``line_cut[i]`` is the list of distinct
        tokens kept for the i-th usable line and ``label[i]`` is its integer
        label; both lists always have the same length.
    """
    # Work on str with re.ASCII instead of encoding every token to bytes and
    # matching with re.LOCALE: the result no longer depends on the process
    # locale, and no byte of a multi-byte UTF-8 sequence can be stripped
    # (which would make the decode step fail).
    ascii_word = re.compile(r'\w', re.ASCII)
    line_cut = []
    label = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = line.strip()
            try:
                tag = int(temp[:2])  # leading label
            except ValueError:
                continue  # no numeric label: skip the line entirely
            sentence = temp[2:].lstrip().replace('\u200b', '')  # post text
            word_list = []
            for word in jieba.cut(sentence.strip()):
                result = ascii_word.sub("", word)
                if not result or result == ' ':  # nothing but ASCII word chars / blank
                    continue
                word_list.append(word)
            word_list = list(set(word_list) - set(stop) - set('\u200b')
                             - set(' ') - set('\u3000') - set('️'))
            # Append both together so the two result lists can never drift apart.
            line_cut.append(word_list)
            label.append(tag)
    return line_cut, label  # tokens and label of every usable post
Example #3
Source File: tool.py    From weiboanalysis with Apache License 2.0 6 votes vote down vote up
def pynlp_build_key_word(filename):
    """Build a keyword list from the most frequent tokens of a text file.

    For every line, everything from the first URL to the end of the line is
    removed, the rest is segmented with pynlpir, and tokens are counted.
    Tokens consisting only of ASCII word characters and single-character
    tokens are not counted.

    Args:
        filename: Path of the UTF-8 encoded text file.

    Returns:
        A list with the top 20% most frequent tokens, minus the stop words
        and a few blank characters. The order of the list is unspecified.
    """
    # "https?" so that both http:// and https:// links are stripped.
    url_pattern = re.compile(r'https?://.+$')
    # re.LOCALE is rejected for str patterns on Python 3.6+; re.ASCII keeps
    # \w limited to ASCII letters, digits and underscore.
    ascii_word = re.compile(r'\w', re.ASCII)
    d = {}
    with open(filename, encoding="utf-8") as fp:
        for line in fp:
            s = url_pattern.sub('', line)  # drop the URL part of the line
            temp = pynlpir.segment(s, pos_tagging=False)  # word segmentation
            for i in temp:
                if '@' in i:
                    # NOTE(review): removing while iterating makes the loop
                    # skip the token that follows; this may be what drops the
                    # user name after a bare '@' - confirm before changing.
                    temp.remove(i)
                result = ascii_word.sub("", i)
                if not result or result == ' ':  # only ASCII word chars / blank
                    continue
                if len(i) > 1:  # keep single characters out of the statistics
                    d[i] = d.get(i, 0) + 1
    kw_list = sorted(d, key=d.get, reverse=True)
    size = int(len(kw_list) * 0.2)  # keep the top 20%
    mood = set(kw_list[:size])
    return list(mood - set(stop) - set('\u200b') - set(' ') - set('\u3000'))
Example #4
Source File: tree.py    From uproot with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _branch_flags(flags):
        flagsbyte = 0
        for flag in flags:
            if flag == "i":
                flagsbyte += re.I
            elif flag == "L":
                flagsbyte += re.L
            elif flag == "m":
                flagsbyte += re.M
            elif flag == "s":
                flagsbyte += re.S
            elif flag == "u":
                flagsbyte += re.U
            elif flag == "x":
                flagsbyte += re.X
        return flagsbyte 
Example #5
Source File: tree.py    From uproot with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _branch_flags(flags):
        flagsbyte = 0
        for flag in flags:
            if flag == "i":
                flagsbyte += re.I
            elif flag == "L":
                flagsbyte += re.L
            elif flag == "m":
                flagsbyte += re.M
            elif flag == "s":
                flagsbyte += re.S
            elif flag == "u":
                flagsbyte += re.U
            elif flag == "x":
                flagsbyte += re.X
        return flagsbyte 
Example #6
Source File: tools.py    From hadrian with Apache License 2.0 6 votes vote down vote up
def getmatch(self, haystack):
    """Match ``haystack`` against ``self.pattern`` and build a ``Match``.

    Returns ``None`` when ``haystack`` is not a string or when the pattern
    does not match at its start. Otherwise returns ``Match(haystack, x)``
    where ``x`` is ``haystack`` itself if ``self.to`` is ``None``, or the
    result of substituting ``self.to`` for the pattern.
    """
    if not isinstance(haystack, basestring):
        return None

    # Each flag letter is accepted in lower or upper case.
    flags = 0
    if self.flags is not None:
        letter_bits = (("i", re.I), ("l", re.L), ("m", re.M),
                       ("s", re.S), ("u", re.U), ("x", re.X))
        for letter, bit in letter_bits:
            if letter in self.flags or letter.upper() in self.flags:
                flags |= bit

    if re.match(self.pattern, haystack, flags=flags) is None:
        return None
    if self.to is None:
        return Match(haystack, haystack)
    replaced = re.sub(self.pattern, self.to, haystack, flags=flags)
    return Match(haystack, replaced)
Example #7
Source File: utils.py    From LDA_RecEngine with Apache License 2.0 6 votes vote down vote up
def preprocessing(content):
    """Strip digits, punctuation and redundant whitespace from ``content``.

    Args:
        content: Text to clean.

    Returns:
        The cleaned text. If collapsing whitespace fails, a warning is
        printed and the text is returned without that last step.
    """
    remove_punc = ('。 ; 。 、 」 「 , ( ) —').split(' ')
    # Step 1: remove digits. re.L is left out on purpose: it does not change
    # what \d matches and is rejected for str patterns on Python 3.6+.
    # \d+ (instead of \d*) avoids a substitution at every empty match.
    content = re.sub(r'\d+', "", content)
    # Step 2: remove ASCII punctuation.
    content = re.sub('[%s]' % re.escape(string.punctuation), "", content)
    # Step 3: remove newlines and the punctuation marks listed above, then
    # collapse runs of whitespace.
    content = content.replace('\n', '')
    for punc in remove_punc:
        content = content.replace(punc, '')
    try:
        content = parsing.strip_multiple_whitespaces(content)
    except Exception:  # stay best-effort, but let KeyboardInterrupt/SystemExit through
        print('Warning : failed to strip whitespaces @ ')

    return content
Example #8
Source File: test_re.py    From ironpython3 with Apache License 2.0 5 votes vote down vote up
def check_en_US_utf8(self):
    # With LC_CTYPE set to en_US.utf8, the two-byte sequence C5 E5 must match
    # itself, while the single bytes C5 and E5 must not match each other.
    # Checked with the re.L | re.I flags argument and with the inline (?Li)
    # prefix.
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    swapped = ((b'\xc5', b'\xe5'), (b'\xe5', b'\xc5'))

    locale_icase = re.L | re.I
    self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', locale_icase))
    for pattern, subject in swapped:
        self.assertIsNone(re.match(pattern, subject, locale_icase))

    self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
    for pattern, subject in swapped:
        self.assertIsNone(re.match(b'(?Li)' + pattern, subject))
Example #9
Source File: test_re.py    From ironpython3 with Apache License 2.0 5 votes vote down vote up
def check_en_US_iso88591(self):
    # With LC_CTYPE set to en_US.iso88591, bytes C5 and E5 must match each
    # other case-insensitively, both with the re.L | re.I flags argument and
    # with the inline (?Li) prefix.
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    cases = (
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    )
    locale_icase = re.L | re.I
    for pattern, subject in cases:
        self.assertTrue(re.match(pattern, subject, locale_icase))
    for pattern, subject in cases:
        self.assertTrue(re.match(b'(?Li)' + pattern, subject))
Example #10
Source File: test_sre_yield.py    From sre_yield with Apache License 2.0 5 votes vote down vote up
def testParseErrors(self):
    # AllStrings must raise ParseError for the IGNORECASE and UNICODE flags.
    # re.L is deliberately not exercised: it causes a failure inside
    # sre_parse under Python 3.6.
    for rejected_flag in (re.I, re.U):
        self.assertRaises(
            sre_yield.ParseError, sre_yield.AllStrings, "a", rejected_flag
        )
Example #11
Source File: test_re.py    From ironpython3 with Apache License 2.0 5 votes vote down vote up
def test_flags(self):
    # Every single flag must yield a usable compiled pattern.
    for flag in [re.I, re.M, re.X, re.S]:
        self.assertTrue(re.compile('^pattern$', flag))
    # re.L is only accepted together with a bytes pattern on Python 3.6+
    # (a str pattern raises ValueError), so it gets a bytes pattern.
    self.assertTrue(re.compile(b'^pattern$', re.L))
Example #12
Source File: test_re.py    From gcblue with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_constants(self):
    # Each one-letter flag alias must equal its long-named constant.
    alias_pairs = (
        (re.I, re.IGNORECASE),
        (re.L, re.LOCALE),
        (re.M, re.MULTILINE),
        (re.S, re.DOTALL),
        (re.X, re.VERBOSE),
    )
    for short, full in alias_pairs:
        self.assertEqual(short, full)
Example #13
Source File: test_re.py    From ironpython3 with Apache License 2.0 5 votes vote down vote up
def test_constants(self):
    # The abbreviated flag names are aliases of the spelled-out ones.
    abbreviations = (re.I, re.L, re.M, re.S, re.X)
    spelled_out = (re.IGNORECASE, re.LOCALE, re.MULTILINE, re.DOTALL,
                   re.VERBOSE)
    for abbreviated, verbose_name in zip(abbreviations, spelled_out):
        self.assertEqual(abbreviated, verbose_name)
Example #14
Source File: test_re.py    From gcblue with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_flags(self):
    # Compiling with any single flag must not return None.
    anchored = '^pattern$'
    for flag in (re.I, re.M, re.X, re.S, re.L):
        compiled = re.compile(anchored, flag)
        self.assertNotEqual(compiled, None)
Example #15
Source File: header.py    From quantipy with MIT License 5 votes vote down vote up
def _getMultRespDef(self, mrDef):
        """Get 'normal' multiple response defintions.
        This is a helper function for the multRespDefs getter function.
        A multiple response definition <mrDef> in the string format returned
        by the IO module is converted into a multiple response definition of
        the form multRespSet = {<setName>: {"setType": <setType>, "label":
        <lbl>, "varNames": <list_of_varNames>}}. SetType may be either 'D'
        (multiple dichotomy sets) or 'C' (multiple category sets). If setType
        is 'D', the multiple response definition also includes '"countedValue":
        countedValue'"""
        regex = b"\$(?P<setName>\S+)=(?P<setType>[CD])\n?"
        m = re.search(regex + b".*", mrDef, re.I | re.L)
        if not m:
            return {}
        setType = m.group("setType")
        if setType == b"C":  # multiple category sets
            regex += b" (?P<lblLen>\d+) (?P<lblVarNames>.+) ?\n?"
            matches = re.findall(regex, mrDef, re.I)
            setName, setType, lblLen, lblVarNames = matches[0]
        else:               # multiple dichotomy sets
            # \w+ won't always work (e.g. thai) --> \S+
            regex += (b"(?P<valueLen>\d+) (?P<countedValue>\S+)" +
                      b" (?P<lblLen>\d+) (?P<lblVarNames>.+) ?\n?")
            matches = re.findall(regex, mrDef, re.I | re.L)
            setName, setType, valueLen = matches[0][:3]
            countedValue, lblLen, lblVarNames = matches[0][3:]
        lbl = lblVarNames[:int(lblLen)]
        varNames = lblVarNames[int(lblLen):].split()
        multRespSet = {setName: {b"setType": setType, b"label": lbl,
                                 b"varNames": varNames}}
        if setType == b"D":
            multRespSet[setName][b"countedValue"] = countedValue
        return multRespSet 
Example #16
Source File: test_re.py    From Project-New-Reign---Nemesis-Main with GNU General Public License v3.0 5 votes vote down vote up
def check_en_US_iso88591(self):
    # Under LC_CTYPE=en_US.iso88591 the bytes C5 and E5 are expected to match
    # one another when LOCALE and IGNORECASE are both active, whether the
    # flags are passed as an argument or written inline as (?Li).
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')

    def via_flags(pattern, subject):
        return re.match(pattern, subject, re.L | re.I)

    def via_inline(pattern, subject):
        return re.match(b'(?Li)' + pattern, subject)

    for matcher in (via_flags, via_inline):
        self.assertTrue(matcher(b'\xc5\xe5', b'\xc5\xe5'))
        self.assertTrue(matcher(b'\xc5', b'\xe5'))
        self.assertTrue(matcher(b'\xe5', b'\xc5'))
Example #17
Source File: test_re.py    From Project-New-Reign---Nemesis-Main with GNU General Public License v3.0 5 votes vote down vote up
def test_getlower(self):
    import _sre

    # _sre.getlower must map 'A' to 'a' for every flag mode checked here.
    upper_a, lower_a = ord('A'), ord('a')
    for mode in (0, re.LOCALE, re.UNICODE, re.ASCII):
        self.assertEqual(_sre.getlower(upper_a, mode), lower_a)

    # Case-insensitive matching returns the subject text unchanged, for str
    # and bytes alike.
    samples = (
        ("abc", "ABC", re.I),
        (b"abc", b"ABC", re.I),
        ("abc", "ABC", re.I | re.A),
        (b"abc", b"ABC", re.I | re.L),
    )
    for pattern, subject, flags in samples:
        self.assertEqual(re.match(pattern, subject, flags).group(0), subject)
Example #18
Source File: test_re.py    From Project-New-Reign---Nemesis-Main with GNU General Public License v3.0 5 votes vote down vote up
def test_constants(self):
    # Look each alias up by name and compare it with its long-named twin.
    names = (
        ("I", "IGNORECASE"),
        ("L", "LOCALE"),
        ("M", "MULTILINE"),
        ("S", "DOTALL"),
        ("X", "VERBOSE"),
    )
    for short_name, long_name in names:
        self.assertEqual(getattr(re, short_name), getattr(re, long_name))
Example #19
Source File: test_re.py    From Project-New-Reign---Nemesis-Main with GNU General Public License v3.0 5 votes vote down vote up
def test_flags(self):
    # Flags shared by both pattern types; re.U is additionally tried with the
    # str pattern and re.L with the bytes pattern.
    shared = [re.I, re.M, re.X, re.S, re.A]
    for pattern, extra in (('^pattern$', re.U), (b'^pattern$', re.L)):
        for flag in shared + [extra]:
            self.assertTrue(re.compile(pattern, flag))
Example #20
Source File: test_re.py    From Fluid-Designer with GNU General Public License v3.0 5 votes vote down vote up
def check_en_US_iso88591(self):
    # After switching LC_CTYPE to en_US.iso88591, C5 and E5 must match each
    # other under LOCALE + IGNORECASE (flags argument and inline form).
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    first, second = b'\xc5', b'\xe5'
    pair = b'\xc5\xe5'
    flags = re.L | re.I

    # Flags passed as an argument.
    self.assertTrue(re.match(pair, pair, flags))
    self.assertTrue(re.match(first, second, flags))
    self.assertTrue(re.match(second, first, flags))

    # Flags embedded in the pattern.
    inline = b'(?Li)'
    self.assertTrue(re.match(inline + pair, pair))
    self.assertTrue(re.match(inline + first, second))
    self.assertTrue(re.match(inline + second, first))
Example #21
Source File: test_re.py    From Project-New-Reign---Nemesis-Main with GNU General Public License v3.0 5 votes vote down vote up
def test_locale(self):
    # Hand check_flags a bytes pattern, the LOCALE flag and the text
    # "re.compile(b'bytes pattern', re.LOCALE)" (presumably the expected
    # repr - check_flags is defined elsewhere).
    pattern = b'bytes pattern'
    expected_text = "re.compile(b'bytes pattern', re.LOCALE)"
    self.check_flags(pattern, re.L, expected_text)
Example #22
Source File: test_re.py    From medicare-demo with Apache License 2.0 5 votes vote down vote up
def test_constants(self):
    # Short flag names and their long forms must be interchangeable.
    for alias, canonical in [
        (re.I, re.IGNORECASE),
        (re.L, re.LOCALE),
        (re.M, re.MULTILINE),
        (re.S, re.DOTALL),
        (re.X, re.VERBOSE),
    ]:
        self.assertEqual(alias, canonical)
Example #23
Source File: test_re.py    From medicare-demo with Apache License 2.0 5 votes vote down vote up
def test_flags(self):
    # re.compile must hand back an object (not None) for each single flag.
    single_flags = [re.I, re.M, re.X, re.S, re.L]
    for one_flag in single_flags:
        result = re.compile('^pattern$', one_flag)
        self.assertNotEqual(result, None)
Example #24
Source File: test_re.py    From CTFCrackTools-V2 with GNU General Public License v3.0 5 votes vote down vote up
def test_constants(self):
    # Verify the five one-letter aliases against the full constant names.
    one_letter = (re.I, re.L, re.M, re.S, re.X)
    full_names = (re.IGNORECASE, re.LOCALE, re.MULTILINE, re.DOTALL,
                  re.VERBOSE)
    for index, alias in enumerate(one_letter):
        self.assertEqual(alias, full_names[index])
Example #25
Source File: test_re.py    From CTFCrackTools-V2 with GNU General Public License v3.0 5 votes vote down vote up
def test_flags(self):
    # Each flag on its own must produce a compiled pattern.
    for flag in [re.I, re.M, re.X, re.S]:
        self.assertTrue(re.compile('^pattern$', flag))
    # A str pattern combined with re.L raises ValueError on Python 3.6+;
    # a bytes pattern is accepted everywhere (and is a plain str on Python 2).
    self.assertTrue(re.compile(b'^pattern$', re.L))
Example #26
Source File: test_re.py    From CTFCrackTools-V2 with GNU General Public License v3.0 5 votes vote down vote up
def check_en_US_iso88591(self):
    # In the en_US.iso88591 locale, C5 and E5 must be matched as the same
    # letter when LOCALE and IGNORECASE are combined - via the flags argument
    # first, then via the inline (?Li) prefix.
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    pattern_subject_pairs = [
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    ]
    for pattern, subject in pattern_subject_pairs:
        found = re.match(pattern, subject, re.L | re.I)
        self.assertTrue(found)
    for pattern, subject in pattern_subject_pairs:
        found = re.match(b'(?Li)' + pattern, subject)
        self.assertTrue(found)
Example #27
Source File: test_re.py    From CTFCrackTools-V2 with GNU General Public License v3.0 5 votes vote down vote up
def check_en_US_utf8(self):
    # In the en_US.utf8 locale, the byte pair C5 E5 must match itself, but the
    # single bytes C5 and E5 must not match one another - via the flags
    # argument first, then via the inline (?Li) prefix.
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')

    def via_flags(pattern, subject):
        return re.match(pattern, subject, re.L | re.I)

    def via_inline(pattern, subject):
        return re.match(b'(?Li)' + pattern, subject)

    for matcher in (via_flags, via_inline):
        self.assertTrue(matcher(b'\xc5\xe5', b'\xc5\xe5'))
        self.assertIsNone(matcher(b'\xc5', b'\xe5'))
        self.assertIsNone(matcher(b'\xe5', b'\xc5'))
Example #28
Source File: test_re.py    From CTFCrackTools with GNU General Public License v3.0 5 votes vote down vote up
def test_constants(self):
    # Compare every abbreviated flag with the constant it abbreviates.
    expected_aliases = [
        ("I", re.IGNORECASE),
        ("L", re.LOCALE),
        ("M", re.MULTILINE),
        ("S", re.DOTALL),
        ("X", re.VERBOSE),
    ]
    for letter, constant in expected_aliases:
        self.assertEqual(getattr(re, letter), constant)
Example #29
Source File: test_re.py    From CTFCrackTools with GNU General Public License v3.0 5 votes vote down vote up
def test_flags(self):
    # Compiling with a single flag must give a truthy pattern object.
    for flag in [re.I, re.M, re.X, re.S]:
        self.assertTrue(re.compile('^pattern$', flag))
    # re.L cannot be combined with a str pattern on Python 3.6+ (ValueError),
    # so it is checked against the equivalent bytes pattern.
    self.assertTrue(re.compile(b'^pattern$', re.L))
Example #30
Source File: test_re.py    From CTFCrackTools with GNU General Public License v3.0 5 votes vote down vote up
def check_en_US_iso88591(self):
    # Set LC_CTYPE to en_US.iso88591 and require C5 / E5 to match each other
    # with LOCALE + IGNORECASE, once through the flags argument and once
    # through the inline (?Li) prefix.
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    subjects = (b'\xc5\xe5', b'\xe5', b'\xc5')

    plain_patterns = (b'\xc5\xe5', b'\xc5', b'\xe5')
    for pattern, subject in zip(plain_patterns, subjects):
        self.assertTrue(re.match(pattern, subject, re.L | re.I))

    inline_patterns = (b'(?Li)\xc5\xe5', b'(?Li)\xc5', b'(?Li)\xe5')
    for pattern, subject in zip(inline_patterns, subjects):
        self.assertTrue(re.match(pattern, subject))