Python re.findall() Examples

The following are code examples for showing how to use re.findall(). They are from open source Python projects. You can vote up the examples you like or vote down the ones you don't like.

Example 1
Project: pyblish-win   Author: pyblish   File: test_re.py    GNU Lesser General Public License v3.0 7 votes vote down vote up
def test_string_boundaries(self):
        # See http://bugs.python.org/issue10713
        self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
                         "abc")
        # There's a word boundary at the start of a string.
        self.assertTrue(re.match(r"\b", "abc"))
        # A non-empty string includes a non-boundary zero-length match.
        self.assertTrue(re.search(r"\B", "abc"))
        # There is no non-boundary match at the start of a string.
        self.assertFalse(re.match(r"\B", "abc"))
        # However, an empty string contains no word boundaries, and also no
        # non-boundaries.
        self.assertIsNone(re.search(r"\B", ""))
        # This one is questionable and different from the perlre behaviour,
        # but describes current behavior.
        self.assertIsNone(re.search(r"\b", ""))
        # A single word-character string has two boundaries, but no
        # non-boundary gaps.
        self.assertEqual(len(re.findall(r"\b", "a")), 2)
        self.assertEqual(len(re.findall(r"\B", "a")), 0)
        # If there are no words, there are no boundaries
        self.assertEqual(len(re.findall(r"\b", " ")), 0)
        self.assertEqual(len(re.findall(r"\b", "   ")), 0)
        # Can match around the whitespace.
        self.assertEqual(len(re.findall(r"\B", " ")), 2) 
Example 2
Project: crawler   Author: fst034356   File: crawlProxy.py    MIT License 7 votes vote down vote up
def getProxyThird(days=1):
        '''
        Scrape proxies from youdaili: http://www.youdaili.net/Daili/http/

        :param days: number of leading listing links to follow
        :return: generator yielding every ``ip:port`` string that is found
        '''
        index_url = "http://www.youdaili.net/Daili/http/"
        listing_links = getHtmlTree(index_url).xpath(
            './/div[@class="chunlist"]/ul/li/p/a/@href')
        proxy_pattern = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}'
        for link in listing_links[0:days]:
            # xpath=False is forwarded as-is; the result is searched as text.
            page_source = str(getHtmlTree(link, xpath=False))
            for address in re.findall(proxy_pattern, page_source):
                yield address
Example 3
Project: neu-eone.py   Author: yearsyan   File: nespider.py    MIT License 6 votes vote down vote up
def __decode_first_text(self, text):
        """Parse the first result page and cache what it contains on ``self``.

        Sets ``self.query_url`` (captured from the page's inline script),
        ``self.sum`` (the integer captured in front of ``(最大显示记录``) and
        ``self.first`` (one dict per matched record).

        :param text: HTML source of the first result page.
        :raises IndexError: if the record count or the query URL is not found.
        """
        # Raw strings keep regex escapes such as \d, \( and \s intact instead
        # of relying on Python passing unknown string escapes through.
        record_pattern = (
            r'<tr><td class=label1>作者:<td class=content valign=top>(.*?)'
            r'<td class=label1>索书号:<td class=content valign=top>(.*?) \n'
            r'<tr><td class=label1>出版社:<td class=content valign=top>(.*?)'
            r'<td class=label>年份:<td class=content valign=top>(.*?) '
        )
        records = re.findall(record_pattern, text)
        result_num = int(re.findall(r'(\d+) \(最大显示记录', text)[0])
        self.query_url = re.findall(
            r'10,"(.*?)1"\)</script></div>[\s\S]*?记录', text)[0]
        self.sum = result_num
        # Cache the first page's records.
        self.first = [
            {
                'writer': writer,
                'index': index,
                'publisher': publisher,
                'year': year,
            }
            for writer, index, publisher, year in records
        ]
Example 4
Project: neu-eone.py   Author: yearsyan   File: nespider.py    MIT License 6 votes vote down vote up
def get_books(self, page):
        """Return the book records shown on result page ``page``.

        :param page: page number, as an int or a numeric string.
        :return: ``self.first`` for page 1; ``None`` when the page starts
            beyond ``self.sum`` records; otherwise a list of dicts with the
            keys ``writer``, ``index``, ``publisher`` and ``year`` parsed
            from a fresh request.
        """
        # The first page was already parsed; anything else needs a second request.
        if page == 1 or page == '1':
            return self.first
        # Convert before doing arithmetic: the original evaluated
        # ``int(page - 1)``, which raises TypeError for string page numbers.
        page_no = int(page)
        if (page_no - 1) * 10 > self.sum:
            return None
        offset = str(page_no - 1) + '1'
        text = str(requests.get(self.query_url + offset, proxies=proxies['library']).content, 'utf8')
        record_pattern = (
            r'<tr><td class=label1>作者:<td class=content valign=top>(.*?)'
            r'<td class=label1>索书号:<td class=content valign=top>(.*?) \n'
            r'<tr><td class=label1>出版社:<td class=content valign=top>(.*?)'
            r'<td class=label>年份:<td class=content valign=top>(.*?) '
        )
        return [
            {
                'writer': writer,
                'index': index,
                'publisher': publisher,
                'year': year,
            }
            for writer, index, publisher, year in re.findall(record_pattern, text)
        ]
Example 5
Project: neu-eone.py   Author: yearsyan   File: nespider.py    MIT License 6 votes vote down vote up
def __library_url(self):
        """Request the portal's library sub-page and return the URL embedded in it.

        Sends a GET request with ``self.index_cookies`` and browser-like
        headers, then returns the first ``var url = "..."`` value found in the
        response text.

        :return: the captured URL string.
        :raises IndexError: if the response contains no ``var url = "..."``.
        """
        lib_headers = {
            'Referer': 'https://portal.neu.edu.cn/tp_up/view?m=up',
            # NOTE(review): the backslash continuation below sits inside the
            # string literal, so the next line's leading spaces become part of
            # the User-Agent value -- confirm whether that is intended.
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) \
            Chrome/74.0.3729.131 Safari/537.36',
            'Origin': 'https://portal.neu.edu.cn',
            'Content-Type': 'application/json;charset=UTF-8'
        }
        page = requests.get('https://portal.neu.edu.cn/tp_up/up/subgroup/library',
                            cookies=self.index_cookies,
                            headers=lib_headers,
                            proxies=proxies['index'])
        # Only the first match is used; no match raises IndexError.
        library_url = re.findall('var url = "(.*?)"', page.text)[0]
        return library_url

    # Log in to the campus-card platform through the one-stop portal; private method
Example 6
Project: neu-eone.py   Author: yearsyan   File: nespider.py    MIT License 6 votes vote down vote up
def __get_exam_detial(self, batch_id):
        """Fetch the exam table of one batch and return it as a list of dicts.

        Every nine consecutive ``<td>`` cells of the response form one exam
        with the keys ``id``, ``name``, ``type``, ``date``, ``time``,
        ``location``, ``seat``, ``condition`` and ``info``.  Trailing cells
        that do not fill a whole group of nine are ignored.

        :param batch_id: value substituted into the ``examBatch.id`` query field.
        :raises IndexError: if a location cell contains no ``<a>...</a>`` link.
        """
        response = requests.get(
            'http://219.216.96.4/eams/stdExamTable!examTable.action?examBatch.id=%s' % batch_id,
            cookies=self.aao_cookies, headers=self.__headers, proxies=proxies['aao'])
        # Requests sent too quickly prevent fetching the next exam batch;
        # 0.5 s is the limit.
        time.sleep(0.5)
        # Raw strings: \s and \S are regex escapes, not Python string escapes.
        cells = re.findall(r'<td>([\s\S]*?)</td>', response.text)
        complete = len(cells) - len(cells) % 9
        exams = []
        for start in range(0, complete, 9):
            row = cells[start:start + 9]
            exams.append({
                'id': row[0],
                'name': row[1],
                'type': row[2],
                'date': row[3],
                'time': row[4],
                'location': re.findall(r'<a.*?>(.*?)</a>', row[5])[0],
                'seat': row[6],
                'condition': row[7],
                'info': row[8].strip()
            })
        return exams

    # Get the exams for the current semester
Example 7
Project: programsynthesishunting   Author: flexgp   File: subtree_parse.py    GNU General Public License v3.0 6 votes vote down vote up
def get_num_from_str(string):
    """
    Given a string of a snippet, return the indexes of that snippet.

     in: '[1, 2] <RE>'

     out: [1, 2]

    :param string: A string defining a snippet.
    :return: The indexes of that snippet, as a list of two ints (taken from
        the first ``[a, b]`` occurrence).
    :raises IndexError: if the string holds no ``[a, b]`` index portion.
    """

    # Capture both numbers and convert them with int() rather than eval()-ing
    # the matched text; this also accepts zero-padded numbers such as "[01, 2]".
    first, second = re.findall(r"\[(\d+), (\d+)\]", string)[0]

    return [int(first), int(second)]
Example 8
Project: programsynthesishunting   Author: flexgp   File: optimize_constants.py    GNU General Public License v3.0 6 votes vote down vote up
def make_consts_consecutive(s):
    """
    Renumber the constants ``c[k]`` of a phenotype so they run from zero.

    A phenotype may use, say, c[7] and c[9] without c[0]; those are remapped
    in ascending order (7 -> 0, 9 -> 1) so only c[0], c[1], ... remain.

    :param s: A given phenotype string.
    :return: Tuple of the renumbered phenotype string and the number of
        constant indexes found.
    """

    index_tokens = set(re.findall(r"c\[(\d+)\]", s))
    ordered = sorted(int(token) for token in index_tokens)

    for new_index, old_index in enumerate(ordered):
        s = s.replace("c[%d]" % old_index, "c[%d]" % new_index)

    return s, len(ordered)
Example 9
Project: Ansible-Example-AB2018   Author: umit-ozturk   File: ce_dldp_interface.py    MIT License 6 votes vote down vote up
def judge_is_mac_same(mac1, mac2):
    """Judge whether two macs are the same.

    Identical strings match outright; otherwise the hex groups are compared
    pairwise, ignoring case and leading zeros.
    """

    if mac1 == mac2:
        return True

    hex_group = r'([0-9A-Fa-f]+)'
    groups1 = re.findall(hex_group, mac1)
    groups2 = re.findall(hex_group, mac2)
    if len(groups1) != len(groups2):
        return False

    return all(
        left.lstrip('0').lower() == right.lstrip('0').lower()
        for left, right in zip(groups1, groups2)
    )
Example 10
Project: Ansible-Example-AB2018   Author: umit-ozturk   File: ce_dldp_interface.py    MIT License 6 votes vote down vote up
def check_macaddr(self):
        """Check mac-address whether valid.

        ``self.local_mac`` passes when it is at most 16 characters long, has
        exactly three hex groups and two dashes, and uses only hex digits
        and dashes.
        """

        mac = self.local_mac
        if len(mac) > 16:
            return False

        hex_groups = re.findall(r'([0-9a-fA-F]+)', mac)
        if len(hex_groups) != 3 or mac.count('-') != 2:
            return False

        allowed = '0123456789abcdef-'
        return all(char.lower() in allowed for char in mac)
Example 11
Project: Ansible-Example-AB2018   Author: umit-ozturk   File: ce_evpn_bgp.py    MIT License 6 votes vote down vote up
def get_peers_enable(self):
        """get evpn peer address enable list

        Splits ``self.config`` at ``l2vpn-family evpn`` and, for every
        ``peer <ipv4> as-number <n>`` entry in the first part, reports whether
        ``peer <ipv4> enable`` exists in the second part.

        :return: list of dicts with ``peer_address``, ``as_number`` and
            ``peer_enable`` (``'true'``/``'false'``), or ``None`` when
            ``self.config_list`` does not hold two items on entry or no peer
            is found.
        """

        if len(self.config_list) != 2:
            return None
        self.config_list = self.config.split('l2vpn-family evpn')
        # The dots are escaped so only literal dotted-quad addresses match;
        # a bare "." would accept any character between the numbers.
        found = re.findall(
            r"peer ([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)\s?as-number\s?(\S*)", self.config_list[0])
        if not found:
            return None

        peers = list()
        for address, as_number in found:
            enabled = is_config_exist(self.config_list[1], "peer %s enable" % address)
            peers.append(dict(peer_address=address, as_number=as_number,
                              peer_enable='true' if enabled else 'false'))
        return peers
Example 12
Project: Ansible-Example-AB2018   Author: umit-ozturk   File: ce_evpn_bgp.py    MIT License 6 votes vote down vote up
def get_peers_advertise_type(self):
        """get evpn peer address advertise type list

        Splits ``self.config`` at ``l2vpn-family evpn`` and, for every
        ``peer <ipv4> as-number <n>`` entry in the first part, adds one result
        per advertise type (``arp``, ``irb``) configured in the second part.

        :return: list of dicts with ``peer_address``, ``as_number`` and
            ``advertise_type``, or ``None`` when ``self.config_list`` does not
            hold two items on entry or no peer is found.
        """

        if len(self.config_list) != 2:
            return None
        self.config_list = self.config.split('l2vpn-family evpn')
        # The dots are escaped so only literal dotted-quad addresses match;
        # a bare "." would accept any character between the numbers.
        found = re.findall(
            r"peer ([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)\s?as-number\s?(\S*)", self.config_list[0])
        if not found:
            return None

        peers = list()
        for address, as_number in found:
            # Both lookups run before anything is appended, as before.
            has_arp = is_config_exist(self.config_list[1], "peer %s advertise arp" % address)
            has_irb = is_config_exist(self.config_list[1], "peer %s advertise irb" % address)
            if has_arp:
                peers.append(dict(peer_address=address, as_number=as_number, advertise_type='arp'))
            if has_irb:
                peers.append(dict(peer_address=address, as_number=as_number, advertise_type='irb'))
        return peers
Example 13
Project: Ansible-Example-AB2018   Author: umit-ozturk   File: ce_vxlan_global.py    MIT License 6 votes vote down vote up
def get_bd_list(self):
        """get bridge domain list

        Returns the text of every ``data/evc/bds/bd/bdId`` element of the
        NETCONF reply, or an empty list when the reply contains ``<data/>``.
        """

        conf_str = CE_NC_GET_BRIDGE_DOMAIN
        xml_str = get_nc_config(self.module, conf_str)
        if "<data/>" in xml_str:
            return list()

        # Drop line breaks and the namespace declarations so the plain
        # element paths below can be used.
        for noise in ('\r', '\n',
                      'xmlns="urn:ietf:params:xml:ns:netconf:base:1.0"',
                      'xmlns="http://www.huawei.com/netconf/vrp"'):
            xml_str = xml_str.replace(noise, "")

        # get bridge domain info
        root = ElementTree.fromstring(xml_str)
        bd_nodes = root.findall("data/evc/bds/bd/bdId")
        return [node.text for node in bd_nodes if node.tag == "bdId"]
Example 14
Project: Ansible-Example-AB2018   Author: umit-ozturk   File: ce_aaa_server.py    MIT License 6 votes vote down vote up
def get_authentication_scheme(self, **kwargs):
        """ Get scheme of authentication

        Returns a list of (firstAuthenMode, secondAuthenMode,
        authenSchemeName) tuples, or an empty list when the reply contains
        ``<data/>`` or nothing matches.
        """

        module = kwargs["module"]
        request = CE_GET_AUTHENTICATION_SCHEME

        xml_str = self.netconf_get_config(module=module, conf_str=request)
        if "<data/>" in xml_str:
            return list()

        scheme_pattern = (
            r'.*<firstAuthenMode>(.*)</firstAuthenMode>.*\s*'
            r'<secondAuthenMode>(.*)</secondAuthenMode>.*\s*'
            r'<authenSchemeName>(.*)</authenSchemeName>.*'
        )
        matches = re.findall(scheme_pattern, xml_str)
        return matches if matches else list()
Example 15
Project: Ansible-Example-AB2018   Author: umit-ozturk   File: ce_aaa_server.py    MIT License 6 votes vote down vote up
def get_authorization_scheme(self, **kwargs):
        """ Get scheme of authorization

        Returns a list of (firstAuthorMode, secondAuthorMode,
        authorSchemeName) tuples, or an empty list when the reply contains
        ``<data/>`` or nothing matches.
        """

        module = kwargs["module"]
        request = CE_GET_AUTHORIZATION_SCHEME

        xml_str = self.netconf_get_config(module=module, conf_str=request)
        if "<data/>" in xml_str:
            return list()

        scheme_pattern = (
            r'.*<firstAuthorMode>(.*)</firstAuthorMode>.*\s*'
            r'<secondAuthorMode>(.*)</secondAuthorMode>.*\s*'
            r'<authorSchemeName>(.*)</authorSchemeName>.*'
        )
        matches = re.findall(scheme_pattern, xml_str)
        return matches if matches else list()
Example 16
Project: Ansible-Example-AB2018   Author: umit-ozturk   File: ce_aaa_server.py    MIT License 6 votes vote down vote up
def get_authorization_domain(self, **kwargs):
        """ Get domain of authorization

        Returns a list of (domainName, authorSchemeName) tuples, or an empty
        list when the reply contains ``<data/>`` or nothing matches.
        """

        module = kwargs["module"]
        request = CE_GET_AUTHORIZATION_DOMAIN

        xml_str = self.netconf_get_config(module=module, conf_str=request)
        if "<data/>" in xml_str:
            return list()

        domain_pattern = (
            r'.*<domainName>(.*)</domainName>.*\s*'
            r'<authorSchemeName>(.*)</authorSchemeName>.*'
        )
        matches = re.findall(domain_pattern, xml_str)
        return matches if matches else list()
Example 17
Project: Ansible-Example-AB2018   Author: umit-ozturk   File: ce_aaa_server.py    MIT License 6 votes vote down vote up
def get_accounting_scheme(self, **kwargs):
        """ Get scheme of accounting

        Returns a list of (accountingMode, acctSchemeName) tuples, or an
        empty list when the reply contains ``<data/>`` or nothing matches.
        """

        module = kwargs["module"]
        request = CE_GET_ACCOUNTING_SCHEME

        xml_str = self.netconf_get_config(module=module, conf_str=request)
        if "<data/>" in xml_str:
            return list()

        scheme_pattern = (
            r'.*<accountingMode>(.*)</accountingMode>.*\s*'
            r'<acctSchemeName>(.*)</acctSchemeName>.*'
        )
        matches = re.findall(scheme_pattern, xml_str)
        return matches if matches else list()
Example 18
Project: stream-finder   Author: kiwiholmberg   File: twitch.py    MIT License 5 votes vote down vote up
def get_client_id():
    """Download ``global.js`` from the CDN and return the client ID inside it.

    Raises ``Exception`` when the download answers with a status of 400 or
    above, or when the script does not contain exactly one client ID.
    """
    response = requests.get(CDN_BASE + 'global.js')
    if response.status_code >= 400:
        raise Exception('Error fetching global.js script.')

    # Find the client ID with a regex that totally wont match anything else /s
    candidates = re.findall(r'clientID:"(\w*)"', response.text)
    if len(candidates) == 1:
        return candidates[0]
    raise Exception(
        'Error finding client ID in twitch global-frontend script. Got {}'.format(candidates))
Example 19
Project: leapp-repository   Author: oamg   File: pam.py    Apache License 2.0 5 votes vote down vote up
def parse(self, config):
        """
        Parse configuration and return list of modules that are present in the
        configuration.

        Lines whose first non-blank character is ``#`` are skipped; for every
        other line the ``pam_*`` name in front of ``.so`` is collected.
        """
        module_line = re.compile(r"^[ \t]*[^#\s]+.*(pam_\S+)\.so.*$", re.MULTILINE)
        return module_line.findall(config)
Example 20
Project: mycode   Author: gmraabe   File: ipchecker.py    GNU General Public License v3.0 5 votes vote down vote up
def get_external_ip():
    """Return the public IP address reported by checkip.dyndns.org.

    :return: the dotted-quad addresses found in the response body joined into
        one string (empty when none is found).
    """
    url = 'http://checkip.dyndns.org'
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        body = response.read().decode('utf-8')
    # Raw string with escaped dots: a bare "." would match any character.
    return ''.join(re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', body))
Example 21
Project: webnull   Author: macrael   File: webnull.py    BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def transform_body(self, search_re, replacement_string):
        """Rewrite the managed body and return the hostnames that were touched.

        Every line matching ``search_re`` is replaced by
        ``replacement_string``; the body is only written back when at least
        one hostname matched.  An empty managed body prints a notice and
        exits with status 1.
        """
        managed = self.current_body()
        if managed == '':
            print('Your hostsfile is not managed by webnull, we won\'t change anything')
            exit(1)
            return None

        host_column = re.compile(r'^[^\t]+\t+([^\t]+)$')
        matched_hostnames = {
            host_column.match(line).group(1)
            for line in re.findall(search_re, managed, flags=re.MULTILINE)
        }
        if matched_hostnames:
            rewritten = re.sub(search_re, replacement_string, managed, flags=re.MULTILINE)
            self.write_body(rewritten)

        return matched_hostnames
Example 22
Project: pyblish-win   Author: pyblish   File: util.py    GNU Lesser General Public License v3.0 5 votes vote down vote up
def find_library(name):
            """Return the file name of library ``name`` as listed by ldconfig.

            Scans the output of ``/sbin/ldconfig -r`` for ``lib<name>.*``
            entries and returns the one that sorts last by ``_num_version``.
            When nothing is listed, falls back to
            ``_get_soname(_findLib_gcc(name))``.
            """
            ename = re.escape(name)
            expr = r':-l%s\.\S+ => \S*/(lib%s\.\S+)' % (ename, ename)
            f = os.popen('/sbin/ldconfig -r 2>/dev/null')
            try:
                data = f.read()
            finally:
                f.close()
            res = re.findall(expr, data)
            if not res:
                return _get_soname(_findLib_gcc(name))
            # key= gives the same order as the old cmp= comparison of
            # _num_version values, but also works on Python 3, where
            # list.sort() no longer accepts cmp.
            res.sort(key=_num_version)
            return res[-1]
Example 23
Project: pyblish-win   Author: pyblish   File: test_bytes.py    GNU Lesser General Public License v3.0 5 votes vote down vote up
def test_regexps(self):
        # Checks re.findall() on a bytearray subject: the expected result is
        # a list of bytearray pieces, one per word.
        def by(s):
            # Build a bytearray from the code points of the text ``s``.
            return bytearray(map(ord, s))
        b = by("Hello, world")
        self.assertEqual(re.findall(r"\w+", b), [by("Hello"), by("world")]) 
Example 24
Project: pyblish-win   Author: pyblish   File: test_collections.py    GNU Lesser General Public License v3.0 5 votes vote down vote up
def test_name_conflicts(self):
        # Some names like "self", "cls", "tuple", "itemgetter", and "property"
        # failed when used as field names.  Test to make sure these now work.
        T = namedtuple('T', 'itemgetter property self cls tuple')
        t = T(1, 2, 3, 4, 5)
        self.assertEqual(t, (1,2,3,4,5))
        newt = t._replace(itemgetter=10, property=20, self=30, cls=40, tuple=50)
        self.assertEqual(newt, (10,20,30,40,50))

        # Broader test of all interesting names in a template
        # verbose=True is relied on to print the class template; the captured
        # stdout is the source of candidate field names.
        with test_support.captured_stdout() as template:
            T = namedtuple('T', 'x', verbose=True)
        # Every alphabetic word of the template, minus Python keywords,
        # becomes a field name of a new namedtuple.
        words = set(re.findall('[A-Za-z]+', template.getvalue()))
        words -= set(keyword.kwlist)
        T = namedtuple('T', words)
        # test __new__
        values = tuple(range(len(words)))
        t = T(*values)
        self.assertEqual(t, values)
        t = T(**dict(zip(T._fields, values)))
        self.assertEqual(t, values)
        # test _make
        t = T._make(values)
        self.assertEqual(t, values)
        # exercise __repr__
        repr(t)
        # test _asdict
        self.assertEqual(t._asdict(), dict(zip(T._fields, values)))
        # test _replace
        t = T._make(values)
        newvalues = tuple(v*10 for v in values)
        newt = t._replace(**dict(zip(T._fields, newvalues)))
        self.assertEqual(newt, newvalues)
        # test _fields
        self.assertEqual(T._fields, tuple(words))
        # test __getnewargs__
        self.assertEqual(t.__getnewargs__(), values) 
Example 25
Project: pyblish-win   Author: pyblish   File: test_multibytecodec_support.py    GNU Lesser General Public License v3.0 5 votes vote down vote up
def _test_mapping_file_ucm(self):
        # Reads the whole mapping file and checks every
        # <a u="XXXX" b="HH HH ..."/> entry through self._testpoint().
        with self.open_mapping_file() as f:
            ucmdata = f.read()
        # Each match is (4-digit hex code point, space-separated hex byte values).
        uc = re.findall('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>', ucmdata)
        for uni, coded in uc:
            unich = unichr(int(uni, 16))
            # Join the hex byte values into the encoded string.
            codech = ''.join(chr(int(c, 16)) for c in coded.split())
            self._testpoint(codech, unich) 
Example 26
Project: pyblish-win   Author: pyblish   File: test_re.py    GNU Lesser General Public License v3.0 5 votes vote down vote up
def test_bug_1661(self):
        # Verify that flags do not get silently ignored with compiled patterns
        pattern = re.compile('.')
        self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
        self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
        self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
        self.assertRaises(ValueError, re.compile, pattern, re.I) 
Example 27
Project: pyblish-win   Author: pyblish   File: test_re.py    GNU Lesser General Public License v3.0 5 votes vote down vote up
def test_re_findall(self):
        self.assertEqual(re.findall(":+", "abc"), [])
        self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
        self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
        self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
                                                               (":", ":"),
                                                               (":", "::")]) 
Example 28
Project: pyblish-win   Author: pyblish   File: test_re.py    GNU Lesser General Public License v3.0 5 votes vote down vote up
def test_bug_117612(self):
        self.assertEqual(re.findall(r"(a|(b))", "aba"),
                         [("a", ""),("b", "b"),("a", "")]) 
Example 29
Project: pyblish-win   Author: pyblish   File: test_re.py    GNU Lesser General Public License v3.0 5 votes vote down vote up
def test_re_escape_non_ascii_bytes(self):
        b = b'y\xe2\x98\xa0y\xe2\x98\xa0y'
        b_escaped = re.escape(b)
        self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
        self.assertMatch(b_escaped, b)
        res = re.findall(re.escape(b'\xe2\x98\xa0'), b)
        self.assertEqual(len(res), 2) 
Example 30
Project: pyblish-win   Author: pyblish   File: test_re.py    GNU Lesser General Public License v3.0 5 votes vote down vote up
def test_issue17998(self):
        # Every quantifier, greedy and lazy, followed by 'yz' must match the
        # whole of 'xyz' under re.S -- for byte strings and, when available,
        # for unicode strings.
        patterns = ['.' + reps + mod + 'yz'
                    for reps in ('*', '+', '?', '{1}')
                    for mod in ('', '?')]
        for pattern in patterns:
            found = re.compile(pattern, re.S).findall('xyz')
            self.assertEqual(found, ['xyz'], msg=pattern)
            if not have_unicode:
                continue
            pattern = unicode(pattern)
            found = re.compile(pattern, re.S).findall(u'xyz')
            self.assertEqual(found, [u'xyz'], msg=pattern)
Example 31
Project: pyblish-win   Author: pyblish   File: test_re.py    GNU Lesser General Public License v3.0 5 votes vote down vote up
def test_keyword_parameters(self):
        # Issue #20283: Accepting the string keyword parameter.
        pat = re.compile(r'(ab)')
        self.assertEqual(
            pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
        self.assertEqual(
            pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
        self.assertEqual(
            pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
        self.assertEqual(
            pat.split(string='abracadabra', maxsplit=1),
            ['', 'ab', 'racadabra']) 
Example 32
Project: pyblish-win   Author: pyblish   File: AutoExpand.py    GNU Lesser General Public License v3.0 5 votes vote down vote up
def getwords(self):
        "Return a list of words that match the prefix before the cursor."
        word = self.getprevword()
        if not word:
            return []
        candidate = r"\b" + word + r"\w+\b"
        wbefore = re.findall(candidate, self.text.get("1.0", "insert wordstart"))
        wafter = re.findall(candidate, self.text.get("insert wordend", "end"))
        if not (wbefore or wafter):
            return []
        words = []
        seen = set()
        # Words before the cursor are taken from the nearest one backwards,
        # words after it in reading order; each word is listed only once.
        for w in wbefore[::-1] + wafter:
            if w in seen:
                continue
            seen.add(w)
            words.append(w)
        # The typed prefix itself always comes last.
        words.append(word)
        return words
Example 33
Project: python-samples   Author: dek-odoo   File: dek_program060.py    Apache License 2.0 5 votes vote down vote up
def do(sentence):
    """Print the list of single characters in ``sentence`` that are a digit or ``+``.

    :param sentence: text to scan.
    """
    # NOTE(review): ``[\d+]`` is a character class (one digit or one '+'),
    # not "one or more digits"; kept as written -- confirm the intent.
    result = re.findall(r'[\d+]', sentence)
    # print() as a call works on Python 2 and 3; the print statement is a
    # SyntaxError on Python 3.
    print(result)
Example 34
Project: BASS   Author: Cisco-Talos   File: avclass_common.py    GNU General Public License v2.0 5 votes vote down vote up
def __norm_cat(self, label, hashes):
        """Map the tokens of an AV label to category names.

        The label is split on non-alphanumeric characters.  Each token is
        lower-cased, stripped of trailing digits, and dropped when it is
        shorter than four characters or is a prefix of one of ``hashes``.
        A remaining token contributes the first key of ``self.cat`` whose
        values contain it; tokens found in no category are ignored.

        :param label: AV label string (may be empty or None).
        :param hashes: iterable of hash strings of the sample.
        :return: list of category keys, in token order.
        """
        if not label:
            return []

        # Initialize list of categories to return
        ret = []

        # Split label into tokens and process each token
        for token in re.split("[^0-9a-zA-Z]", label):
            # Convert to lowercase
            token = token.lower()

            # Remove digits at the end
            # FIXME: What if it is a hash, and removes digits at the end???
            end_len = len(re.findall(r"\d*$", token)[0])
            if end_len:
                token = token[:-end_len]

            # Ignore short token
            if len(token) < 4:
                continue

            # Ignore token if prefix of a hash of the sample
            # Most AVs use MD5 prefixes in labels,
            # but we check SHA1 and SHA256 as well
            if any(hash_str[0:len(token)] == token for hash_str in hashes):
                continue

            # items() works on Python 2 and 3; iteritems() exists only on 2.
            for keys, values in self.cat.items():
                if token in values:
                    ret.append(keys)
                    break
        return ret
Example 35
Project: kvmd   Author: pikvm   File: __init__.py    GNU General Public License v3.0 5 votes vote down vote up
def _find_storage() -> _Storage:
    """Return the storage described by the first matching fstab entry.

    An entry matches when it has six fields and its options field carries at
    least one ``X-kvmd.otgmsd-root=`` or ``X-kvmd.otgmsd-user=`` option.
    Raises RuntimeError when no entry matches.
    """
    with open(_FSTAB_PATH) as fstab_file:
        for raw_line in fstab_file.read().split("\n"):
            entry = raw_line.strip()
            # Skip blank lines and comments.
            if not entry or entry.startswith("#"):
                continue
            fields = entry.split()
            if len(fields) != 6:
                continue
            options = dict(re.findall(r"X-kvmd\.otgmsd-(root|user)=([^,]+)", fields[3]))
            if not options:
                continue
            return _Storage(
                mount_path=fields[1],
                root_path=options.get("root", ""),
                user=options.get("user", ""),
            )
    raise RuntimeError(f"Can't find MSD mountpoint in {_FSTAB_PATH}")
Example 36
Project: fortran_input_reader   Author: miroi   File: docopt.py    MIT License 5 votes vote down vote up
def parse(class_, source):
        """Build an instance from an argument description.

        :param class_: callable invoked as ``class_(name, value)``.
        :param source: description text containing ``<name>`` and optionally
            ``[default: value]`` (matched case-insensitively).
        :return: ``class_(name, value)``; ``value`` is None without a default.
        :raises IndexError: if ``source`` contains no ``<...>`` name.
        """
        # Raw strings: \S, \[ and \] are regex escapes, not string escapes.
        name = re.findall(r'(<\S*?>)', source)[0]
        value = re.findall(r'\[default: (.*)\]', source, flags=re.I)
        return class_(name, value[0] if value else None)
Example 37
Project: fortran_input_reader   Author: miroi   File: docopt.py    MIT License 5 votes vote down vote up
def parse(class_, option_description):
        """Build an instance from one option description line.

        The text before the first double space lists the option's spellings
        and an optional argument placeholder; the rest is the description,
        which may carry ``[default: value]`` (matched case-insensitively).

        :param class_: callable invoked as
            ``class_(short, long, argcount, value)``.
        :param option_description: e.g. ``'-o FILE --output=FILE  Out [default: x]'``.
        :return: the result of ``class_``; ``value`` is ``False`` for options
            without an argument, otherwise the default or ``None``.
        """
        short, long_, argcount, value = None, None, 0, False
        options, _, description = option_description.strip().partition('  ')
        options = options.replace(',', ' ').replace('=', ' ')
        for s in options.split():
            if s.startswith('--'):
                long_ = s
            elif s.startswith('-'):
                short = s
            else:
                # Anything that is not a flag is the argument placeholder.
                argcount = 1
        if argcount:
            # Raw string: \[ and \] are regex escapes, not string escapes.
            matched = re.findall(r'\[default: (.*)\]', description, flags=re.I)
            value = matched[0] if matched else None
        return class_(short, long_, argcount, value)
Example 38
Project: fortran_input_reader   Author: miroi   File: docopt.py    MIT License 5 votes vote down vote up
def parse_section(name, source):
    """Return every section of ``source`` whose first line contains ``name``.

    A section is that line plus the following lines that start with a space
    or a tab.  Matching ignores case; each section is returned stripped.
    """
    flags = re.IGNORECASE | re.MULTILINE
    section_re = re.compile(
        '^([^\n]*' + name + '[^\n]*\n?(?:[ \t].*?(?:\n|$))*)', flags)
    sections = []
    for chunk in section_re.findall(source):
        sections.append(chunk.strip())
    return sections
Example 39
Project: crawler   Author: fst034356   File: crawlProxy.py    MIT License 5 votes vote down vote up
def getProxySecond(proxy_num=100):
        '''
        Scrape proxies from 66ip (http://www.66ip.cn/); the site offers an API
        from which the proxies can be extracted directly.

        :param proxy_num: value placed in the ``tqsl`` field of the request URL
        :return: generator yielding every ``ip:port`` string that is found
        '''
        api_url = "http://m.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format(
            proxy_num)
        # xpath=False is forwarded as-is; the result is searched as text.
        page_source = str(getHtmlTree(api_url, xpath=False))
        proxy_pattern = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}'
        for address in re.findall(proxy_pattern, page_source):
            yield address
Example 40
Project: crawler   Author: fst034356   File: tools.py    MIT License 5 votes vote down vote up
def verifyProxy(proxy):
    '''
    Check whether the proxy string contains a well-formed ``ip:port`` part.

    :param proxy: proxy string to check
    :return: True if an ``ip:port`` pattern occurs anywhere in it, else False
    '''
    address_matches = re.findall(
        r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}", proxy)
    return bool(address_matches)
Example 41
Project: crawler   Author: fst034356   File: spider_dingdian.py    MIT License 5 votes vote down vote up
def get_chapter(self, response):
        """Yield one Request per chapter link found in ``response.text``.

        Each request goes to ``response.url`` plus the link's href and carries
        the chapter's 1-based position, name and URL together with the
        ``novel_id`` from ``response.meta``.
        """
        chapter_links = re.findall(
            r'<td class="L"><a href="(.*?)">(.*?)</a></b>',
            response.text)
        for position, (href, chaptername) in enumerate(chapter_links, 1):
            chapterurl = response.url + href
            chapter_meta = {
                'num': position,
                'novel_id': response.meta['novel_id'],
                'chaptername': chaptername,
                'chapterurl': chapterurl,
            }
            yield Request(chapterurl, callback=self.get_chaptercontent, meta=chapter_meta)
Example 42
Project: crawler   Author: fst034356   File: download.py    MIT License 5 votes vote down vote up
def __init__(self):
        """Set up the User-Agent pool and download the proxy IP list.

        ``self.user_agent_list`` holds a fixed set of browser User-Agent
        strings.  ``self.iplist`` is filled from http://haoip.cc/tiqu.htm at
        construction time, so creating an instance performs a network request.
        """
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]

        self.iplist = []
        html = requests.get("http://haoip.cc/tiqu.htm")
        # Every piece of text between "r/>" and the next "<b" is one entry;
        # re.S lets an entry span line breaks.
        iplisten = re.findall(r'r/>(.*?)<b', html.text, re.S)
        for ip in iplisten:
            # Drop embedded newlines, then surrounding whitespace.
            i = re.sub('\n', '', ip)
            self.iplist.append(i.strip()) 
Example 43
Project: DouYinService   Author: 01ly   File: get_signature.py    GNU General Public License v3.0 5 votes vote down vote up
def get_matches_from_url_page(url,pattern,headers):
    """Fetch ``url`` and return the first regex match found in the page text.

    :param url: address requested with ``requests.get``.
    :param pattern: regular expression handed to ``re.findall``.
    :param headers: HTTP headers sent along with the request.
    :return: the first element of the ``re.findall`` result, or ``None``
        when the pattern does not match anywhere in the response text.
    """
    page_text = requests.get(url, headers=headers).text
    matches = re.findall(pattern, page_text)
    return matches[0] if matches else None
Example 44
Project: webtoon-dl   Author: puilp0502   File: utils.py    BSD 2-Clause "Simplified" License 5 votes vote down vote up
def parse_extension(uri):
    """Parse the extension of URI.

    Only the last segment of the URI's path is inspected, so dots in the
    host name, in parent directories, in the query string or in the
    fragment are never mistaken for an extension.

    :param uri: a URI or plain file name, e.g. ``http://host/dir/01.jpg?v=1.2``.
    :return: the last ``.``-prefixed run of word characters of the file
        name, leading dot included (e.g. ``".jpg"``).
    :raises IndexError: if the file name carries no extension.
    """
    from urllib.parse import urlsplit

    # urlsplit() drops "?query" and "#fragment"; rsplit() keeps the file name only.
    filename = urlsplit(uri).path.rsplit("/", 1)[-1]
    return re.findall(r"\.\w+", filename)[-1]
Example 45
Project: webtoon-dl   Author: puilp0502   File: naver.py    BSD 2-Clause "Simplified" License 5 votes vote down vote up
def initialize(url):
    """
    Store the episode URL prefix for the comic referenced by url.

    The first ``titleId=<digits>`` value found in url is used to build the
    detail-page prefix, which is saved in the module-level ``_base_url``.

    Raises IndexError if url contains no ``titleId=<digits>`` part.

    :url:
        A URL carrying a ``titleId`` parameter.
    """
    global _base_url
    title_ids = re.findall(r"titleId=(\d+)", url)
    prefix = "http://comic.naver.com/webtoon/detail.nhn?titleId="
    suffix = "&no="
    _base_url = "".join((prefix, title_ids[0], suffix))
Example 46
Project: webtoon-dl   Author: puilp0502   File: naver.py    BSD 2-Clause "Simplified" License 5 votes vote down vote up
def get_image_list(src):
    """
    Return a lazy iterator over the images found in the given source.

    Every match of ``_image_list_pattern`` contributes its first element
    (index 0 of the ``re.findall`` result item).

    :src:
        A HTML source.
    """
    def _first_item(match):
        return match[0]

    matches = re.findall(_image_list_pattern, src)
    return map(_first_item, matches)
Example 47
Project: webtoon-dl   Author: puilp0502   File: naver.py    BSD 2-Clause "Simplified" License 5 votes vote down vote up
def get_next_episode_url(src):
    """
    Build the URL of the next episode from the given source src.

    The first match of ``_next_episode_pattern`` in src is appended to the
    module-level ``_base_url``.

    Raises EndOfComic exception if the pattern does not match at all.

    :src:
        A HTML source.
    """
    try:
        next_episode = re.findall(_next_episode_pattern, src)[0]
    except IndexError:
        # No match means there is no following episode.
        raise EndOfComic
    else:
        return _base_url + next_episode
Example 48
Project: webtoon-dl   Author: puilp0502   File: naver.py    BSD 2-Clause "Simplified" License 5 votes vote down vote up
def get_episode_name(src):
    """
    Return the episode name found in the given source.

    The first match of ``_episode_name_pattern`` is taken and its HTML
    character references are converted back to plain characters.

    Raises IndexError if the pattern does not match.

    :src:
        A HTML source.
    """
    candidates = re.findall(_episode_name_pattern, src)
    escaped_name = candidates[0]
    return html.unescape(escaped_name)
Example 49
Project: Autoline   Author: zjh1218   File: report.py    Apache License 2.0 5 votes vote down vote up
def parser_detail_info(self):
        detail_data = []
        output_dir = os.getcwd() + "/logs/%s/%s" % (self.project_id, self.build_no)
        tree = ET.parse(output_dir + "/output.xml")
        root = tree.getroot()
        for test in root.iter("test"):
            detail_data.append({
                "status": test.find("status").attrib["status"].lower(),
                "name": test.attrib["name"].split(" ")[1],
                "starttime": test.find("status").attrib["starttime"],
                "endtime": test.find("status").attrib["endtime"]
            })
            for kw in test.iter("kw"):
                text = ""
                image = ""
                for msg in kw.iter("msg"):
                    if "<a" in msg.text:
                        img = re.findall('src="images/(.+)" width', msg.text)
                        if len(img) != 0:
                            image = img[0]
                    else:
                        text = text + msg.text + "<br>"
                #print(text)
                """    
                msg = kw.find("msg")
                if msg is not None:
                    text = kw.find("msg").text

                if "<a" in text:
                    image = re.findall('src="images/(.+)" width', text)[0]
                """
                detail_data.append({
                    "status": kw.find("status").attrib["status"].lower(),
                    "keyword": kw.attrib["name"],
                    "msg": text,
                    "image": image,
                    "project_id": self.project_id,
                    "build_no": self.build_no
                })

        return detail_data 
Example 50
Project: neu-eone.py   Author: yearsyan   File: nespider.py    MIT License 5 votes vote down vote up
def __init__(self, keyword):
        # 查询参数
        param = '?func=find-b&find_code=WRD&request=%s&filter_code_1=WLN&filter_request_1=&filter_code_2=WYR&filter_request_2=&filter_code_3=WYR&filter_request_3=&filter_code_4=WFM&filter_request_4=&filter_code_5=WSL&filter_request_5=' % keyword
        query_page = str(requests.get('http://202.118.8.7:8991/F/', proxies=proxies['library']).content,
                         'utf8')  # 与line254相同 不能直接获取text 需要手动抓换
        post_url = \
        re.findall('<form method=get name=form1 action="(.*?)" onsubmit="return presearch\(this\);">', query_page)[0]
        query_res = requests.get(post_url + param, proxies=proxies['library'])
        query_text = str(query_res.content, 'utf8')
        self.__decode_first_text(query_text)  # 进行第一页的查询