Python requests_html.HTML Examples

The following are 30 code examples of requests_html.HTML(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module requests_html, or try the search function.
Example #1
Source File: test_requests_html.py    From requests-html with MIT License 6 votes vote down vote up
async def test_bare_js_async_eval():
    """Render a bare HTML document asynchronously and verify that its inline
    JavaScript executed (the #replace div's text is rewritten by the script).

    Fix: the body uses ``await``, so the function must be declared
    ``async def`` — ``await`` inside a plain ``def`` is a SyntaxError.
    """
    doc = """
    <!DOCTYPE html>
    <html>
    <body>
    <div id="replace">This gets replaced</div>

    <script type="text/javascript">
      document.getElementById("replace").innerHTML = "yolo";
    </script>
    </body>
    </html>
    """

    # async_=True yields an HTML object whose render step is awaitable.
    html = HTML(html=doc, async_=True)
    await html.arender()

    # The script ran during rendering, so the DOM now holds the new text.
    assert html.find('#replace', first=True).text == 'yolo'
    await html.browser.close()
Example #2
Source File: spys_one_provider.py    From scylla with Apache License 2.0 6 votes vote down vote up
def parse(self, html: HTML) -> [ProxyIP]:
    """Extract proxy ip/port pairs from a spys.one listing table.

    :param html: parsed page from requests-html
    :return: list of ProxyIP objects found in the table rows
    """
    proxies: [ProxyIP] = []

    for row in html.find('table tr[onmouseover]'):
        cell = row.find('.spy14', first=True)
        if not cell:
            continue

        cell_text = cell.text
        # The IP and the port live in the same cell; the port follows a
        # ":\n" separator.
        addr = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', cell_text).group(0)
        prt = re.search(r':\n(\d{2,5})', cell_text).group(1)

        if addr and prt:
            proxies.append(ProxyIP(ip=addr, port=prt))

    return proxies
Example #3
Source File: proxynova_provider.py    From scylla with Apache License 2.0 6 votes vote down vote up
def parse(self, html: HTML) -> [ProxyIP]:
    """Extract proxies from proxynova's table, decoding the obfuscated IP.

    The IP is hidden inside a ``document.write('12345678<half>'.substr(8) +
    '<half>')`` script; the two captured halves are concatenated to recover
    the dotted address.

    :param html: parsed listing page from requests-html
    :return: list of ProxyIP objects
    """
    ip_list: [ProxyIP] = []

    for tr in html.find('#tbl_proxy_list > tbody:nth-child(2) > tr'):
        # Idiom fix: `not in` instead of `not ... in`.
        if 'data-proxy-id' not in tr.attrs:
            continue

        script_element = tr.find('td:nth-child(1) > abbr > script', first=True)
        port_element = tr.find('td:nth-child(2)', first=True)
        if not script_element or not port_element:
            continue

        # Expect exactly one obfuscation pattern per row; skip otherwise.
        groups = re.findall(r"document\.write\('12345678(\d{1,3}\.\d{1,3})'\.substr\(8\) \+ '(\d{1,3}\.\d{1,3}\.\d{1,3})'\)", script_element.text)
        if not groups or len(groups) != 1:
            continue
        ip = groups[0][0] + groups[0][1]
        port = port_element.text
        ip_list.append(ProxyIP(ip=ip, port=port))
    return ip_list
Example #4
Source File: http_proxy_provider.py    From scylla with Apache License 2.0 6 votes vote down vote up
def parse(self, html: HTML) -> [ProxyIP]:
    """Collect proxies from a ``table.proxytbl`` listing.

    The port cell embeds the number after a CDATA marker, so it is
    recovered with a regex; rows without a match are skipped silently.
    """
    results: [ProxyIP] = []

    for row in html.find('table.proxytbl tr'):
        ip_cell = row.find('td:nth-child(1)', first=True)
        port_cell = row.find('td:nth-child(2)', first=True)

        if not (ip_cell and port_cell):
            continue

        try:
            # re.search returns None when the marker is absent, so .group
            # raises AttributeError -- treated as "no proxy in this row".
            prt = re.search(r'//]]> (\d+)', port_cell.text).group(1)
            results.append(ProxyIP(ip=ip_cell.text, port=prt))
        except AttributeError:
            pass

    return results
Example #5
Source File: a2u_provider.py    From scylla with Apache License 2.0 6 votes vote down vote up
def parse(self, html: HTML) -> [ProxyIP]:
    """Pull every ``ip:port`` pair out of the raw page body."""
    found: [ProxyIP] = []

    body = html.raw_html.decode('utf-8')

    # The page is plain text; grab complete ip:port tokens first, then
    # split each token into its two halves.
    for pair in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,5}', body):
        addr = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', pair).group(0)
        prt = re.search(r':(\d{2,5})', pair).group(1)

        if addr and prt:
            found.append(ProxyIP(ip=addr, port=prt))

    return found
Example #6
Source File: trends.py    From twitter-scraper with MIT License 6 votes vote down vote up
def get_trends():
    """Return the list of trending topic names from Twitter's trends module.

    Fetches the JSON trends endpoint, parses the embedded HTML fragment,
    and reads the ``data-trend-name`` attribute of each list item.
    """
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8",
        "X-Twitter-Active-User": "yes",
        "X-Requested-With": "XMLHttpRequest",
        "Accept-Language": "en-US",
    }

    response = session.get("https://twitter.com/i/trends", headers=headers)
    module_html = response.json()["module_html"]

    # The endpoint returns an HTML fragment; parse it so the <li> items
    # (one per trend) can be queried.
    page = HTML(html=module_html, url="bunk", default_encoding="utf-8")

    return [item.attrs["data-trend-name"] for item in page.find("li")]
Example #7
Source File: plain_text_provider.py    From scylla with Apache License 2.0 6 votes vote down vote up
def parse(self, html: HTML) -> [ProxyIP]:
    """Parse a plain-text proxy list with one ``ip:port`` per line."""
    if html is None:
        return []

    proxies: [ProxyIP] = []
    body = html.raw_html.decode('utf-8')

    for line in body.split('\n'):
        # Skip blank lines and anything not shaped like ip:port.
        if line.strip() == '' or not re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:(\d{2,5})', line):
            continue
        addr = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', line).group(0)
        prt = re.search(r':(\d{2,5})', line).group(1)

        if addr and prt:
            proxies.append(ProxyIP(ip=addr, port=prt))

    return proxies
Example #8
Source File: test_requests_html.py    From requests-html with MIT License 6 votes vote down vote up
def test_bare_render():
    """Render a bare document with a custom script and check the values the
    script reports, plus basic parsing of the rendered page."""
    doc = """<a href='https://httpbin.org'>"""
    html = HTML(html=doc)
    script = """
        () => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }
    """
    val = html.render(script=script, reload=False)

    # The script returns a JS object; each key must survive the round trip.
    for key in ('width', 'height', 'deviceScaleFactor'):
        assert key in val

    assert html.find('html')
    assert 'https://httpbin.org' in html.links
Example #9
Source File: test_requests_html.py    From requests-html with MIT License 6 votes vote down vote up
async def test_bare_arender():
    """Async variant of the bare-render test: run a custom script via
    ``arender`` and verify the reported values and parsed links.

    Fix: the body uses ``await``, so the function must be declared
    ``async def`` — ``await`` inside a plain ``def`` is a SyntaxError.
    """
    doc = """<a href='https://httpbin.org'>"""
    html = HTML(html=doc, async_=True)
    script = """
        () => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }
    """
    val = await html.arender(script=script, reload=False)
    # The script returns a JS object; each key must survive the round trip.
    for value in ('width', 'height', 'deviceScaleFactor'):
        assert value in val

    assert html.find('html')
    assert 'https://httpbin.org' in html.links
    await html.browser.close()
Example #10
Source File: ipaddress_provider.py    From scylla with Apache License 2.0 5 votes vote down vote up
def parse(self, html: HTML) -> [ProxyIP]:
    """Read ``ip:port`` from the first cell of each ``.proxylist`` row."""
    entries: [ProxyIP] = []

    for row in html.find('.proxylist tbody tr'):
        # The first cell holds "ip:port" as a single string.
        cell_text = row.find('td:nth-child(1)', first=True).text
        addr, prt = cell_text.split(":")
        entries.append(ProxyIP(ip=addr, port=prt))

    return entries
Example #11
Source File: data5u_provider.py    From scylla with Apache License 2.0 5 votes vote down vote up
def parse(self, html: HTML) -> [ProxyIP]:
    """Pair the first two ``<span>`` elements of each ``.l2`` row as the
    ip and port of a proxy."""
    collected: [ProxyIP] = []

    for row in html.find('.wlist > ul > li:nth-child(2) .l2'):
        ip_span = row.find('span:nth-child(1)', first=True)
        port_span = row.find('span:nth-child(2)', first=True)

        if ip_span and port_span:
            collected.append(ProxyIP(ip=ip_span.text, port=port_span.text))

    return collected
Example #12
Source File: proxy_scraper_provider.py    From scylla with Apache License 2.0 5 votes vote down vote up
def parse(self, html: HTML) -> [ProxyIP]:
    """Parse a JSON proxy-scraper payload into ProxyIP objects.

    :param html: page whose raw body is a JSON document with a ``usproxy``
        key holding a list of ``{ip, port}`` dicts
    :return: list of ProxyIP objects (empty when the payload is malformed)
    """
    ip_list: [ProxyIP] = []

    text = html.raw_html.decode('utf-8')
    obj = json.loads(text)
    # Fixes: isinstance() is the correct type check (type(...) != list is
    # an anti-pattern), and .get() avoids a KeyError when the expected key
    # is missing from the payload -- both cases now return an empty list.
    if not obj or not isinstance(obj.get('usproxy'), list):
        return ip_list

    for ip_port in obj['usproxy']:
        p = ProxyIP(ip=ip_port['ip'], port=ip_port['port'])
        ip_list.append(p)

    return ip_list
Example #13
Source File: test_requests_html.py    From requests-html with MIT License 5 votes vote down vote up
def test_html_loading():
    """An HTML object built from a string exposes links, raw bytes and text."""
    html = HTML(html="""<a href='https://httpbin.org'>""")

    assert 'https://httpbin.org' in html.links
    assert isinstance(html.raw_html, bytes)
    assert isinstance(html.html, str)
Example #14
Source File: free_proxy_list_provider.py    From scylla with Apache License 2.0 5 votes vote down vote up
def parse(self, html: HTML) -> [ProxyIP]:
    """Read ip and port from the first two cells of each listing row."""
    harvested: [ProxyIP] = []

    for tr in html.find('#proxylisttable tbody tr'):
        addr = tr.find('td:nth-child(1)', first=True).text
        prt = tr.find('td:nth-child(2)', first=True).text
        harvested.append(ProxyIP(ip=addr, port=prt))

    return harvested
Example #15
Source File: test_requests_html.py    From requests-html with MIT License 5 votes vote down vote up
def test_absolute_links(url, link, expected):
    """Relative links resolve against the page URL, or against a ``<base>``
    tag when one is present."""
    head_tpl = """<head><base href='{}'></head>"""
    body_tpl = """<body><a href='{}'>Next</a></body>"""

    # No <base>: the page URL itself is the base.
    page = HTML(html=body_tpl.format(link), url=url)
    assert page.absolute_links.pop() == expected

    # With <base>: the base href wins over the (different) page URL.
    page = HTML(
        html=head_tpl.format(url) + body_tpl.format(link),
        url='http://example.com/foobar/')
    assert page.absolute_links.pop() == expected
Example #16
Source File: kuaidaili_provider.py    From scylla with Apache License 2.0 5 votes vote down vote up
def parse(self, html: HTML) -> [ProxyIP]:
    """Read proxies from kuaidaili's table via its ``data-title`` cells."""
    gathered: [ProxyIP] = []

    for row in html.find('#list table tr'):
        ip_cell = row.find('td[data-title="IP"]', first=True)
        port_cell = row.find('td[data-title="PORT"]', first=True)

        if ip_cell and port_cell:
            gathered.append(ProxyIP(ip=ip_cell.text, port=port_cell.text))

    return gathered
Example #17
Source File: test_requests_html.py    From requests-html with MIT License 5 votes vote down vote up
def test_parser():
    """Both ``find()`` and ``element()`` locate content in a parsed fragment."""
    html = HTML(html="""<a href='https://httpbin.org'>httpbin.org\n</a>""")

    assert html.find('html')
    assert html.element('a').text().strip() == 'httpbin.org'
Example #18
Source File: proxylists_provider.py    From scylla with Apache License 2.0 5 votes vote down vote up
def parse(self, html: HTML) -> [ProxyIP]:
    """Regex-extract ip and port from the first two cells of each row in
    the nested listing table."""
    found: [ProxyIP] = []

    for row in html.find('table table tr'):
        ip_cell = row.find('td:nth-of-type(1)', first=True)
        port_cell = row.find('td:nth-of-type(2)', first=True)
        if not (ip_cell and port_cell):
            continue
        addr = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ip_cell.text).group(0)
        prt = re.search(r'\d{2,5}', port_cell.text).group(0)
        found.append(ProxyIP(ip=addr, port=prt))

    return found
Example #19
Source File: cool_proxy_provider.py    From scylla with Apache License 2.0 5 votes vote down vote up
def parse(self, html: HTML) -> [ProxyIP]:
    """Read rows from the proxy table, stripping any inline
    ``document.write(...)`` noise from the IP cell."""
    result: [ProxyIP] = []

    for row in html.find('table tr'):
        ip_cell = row.find('td:nth-child(1)', first=True)
        port_cell = row.find('td:nth-child(2)', first=True)

        if ip_cell and port_cell:
            clean_ip = re.sub(r'document\.write\(.+\)', '', ip_cell.text)
            result.append(ProxyIP(ip=clean_ip, port=port_cell.text))

    return result
Example #20
Source File: base_provider.py    From scylla with Apache License 2.0 5 votes vote down vote up
def parse(self, html: HTML) -> [ProxyIP]:
    """Parse the document in order to get a list of proxies.

    Abstract hook: concrete provider subclasses override this to turn a
    fetched page into proxy entries.

    :param html: the HTML object from requests-html
    :return: a list of proxy ips
    """
    raise NotImplementedError
Example #21
Source File: problems.py    From codechef-cli with GNU General Public License v3.0 5 votes vote down vote up
def get_description(problem_code, contest_code):
    """Fetch a problem's description from the CodeChef API and format it
    for terminal display.

    :param problem_code: problem identifier within the contest
    :param contest_code: contest identifier the problem belongs to
    :return: a single-item list; on success a dict with a 'data' key holding
        the formatted text, otherwise a dict with an error 'code'
        (404 = problem not found, 503 = bad JSON / unexpected status)
    """
    url = f'/api/contests/{contest_code}/problems/{problem_code}'
    resp = request(url=url)

    try:
        resp_json = resp.json()
    except ValueError:
        # Response body was not valid JSON -- report service unavailable.
        return [{'code': 503}]

    if resp_json["status"] == "success":
        problem = [
            '',
            style_text('Name: ', "BOLD") + resp_json.get('problem_name', ''),
            style_text("Description:", "BOLD"),
            # Strip HTML tags from the body; the regex removes opening and
            # closing tags but leaves the inner text intact.
            re.sub(r'(<|<\/)\w+>', '', resp_json.get("body", '')),
            '',
            style_text("Author: ", "BOLD") + resp_json.get('problem_author', ''),
            style_text("Date Added: ", "BOLD") + resp_json.get('date_added', ''),
            style_text("Max Time Limit: ", "BOLD") + f"{resp_json.get('max_timelimit', '')} secs",
            style_text("Source Limit: ", "BOLD") + f"{resp_json.get('source_sizelimit', '')} Bytes",
            style_text("Languages: ", "BOLD") + resp_json.get('languages_supported', ''),
            ''
        ]
        if resp_json.get('tags'):
            # Tags arrive as an HTML fragment of <a> links; parse it and
            # join the anchor texts into one space-separated line.
            problem.append(
                style_text('Tags: ', 'BOLD') +
                " ".join([tag.text for tag in HTML(html=resp_json['tags']).find('a')])
            )
            problem.append('')
        if resp_json.get('editorial_url'):
            problem.append(style_text('Editorial: ', 'BOLD') + resp_json['editorial_url'])
            problem.append('')

        return [{"data": "\n".join(problem)}]
    elif resp_json["status"] == "error":
        return [{
            'data': 'Problem not found. Use `--search` to search in a specific contest',
            'code': 404
        }]
    # Any other status value falls through to a generic failure code.
    return [{'code': 503}]
Example #22
Source File: xici_provider.py    From scylla with Apache License 2.0 5 votes vote down vote up
def parse(self, html: HTML) -> [ProxyIP]:
    """Read the ip (2nd cell) and port (3rd cell) from each ``#ip_list`` row."""
    scraped: [ProxyIP] = []

    for row in html.find('#ip_list tr'):
        ip_cell = row.find('td:nth-child(2)', first=True)
        port_cell = row.find('td:nth-child(3)', first=True)

        if ip_cell and port_cell:
            scraped.append(ProxyIP(ip=ip_cell.text, port=port_cell.text))

    return scraped
Example #23
Source File: worker.py    From scylla with Apache License 2.0 5 votes vote down vote up
def get_html(self, url: str, render_js: bool = True) -> Union[HTML, None]:
        """Fetch a URL and return its parsed HTML, optionally rendering JS.

        :param url: the URL to fetch
        :param render_js: whether to execute the page's JavaScript after
            download, defaults to True
        :return: the parsed ``HTML`` object from requests-html, or ``None``
            when the request fails or the response status is not OK
        :rtype: Union[HTML, None]
        """

        try:
            # TODO: load config for timeout
            response: HTMLResponse = self.session.get(url, timeout=30)
        except requests.RequestException:
            # Network-level failure: log and give up on this URL.
            logger.warning('[Worker] Cannot get this url: ' + url)
            return None
        except (KeyboardInterrupt, SystemExit, InterruptedError):
            # Shut the worker down cleanly on user/system interruption.
            self.stop()
            return None

        if response.ok:
            if render_js:
                logger.debug('starting render js...')
                # Run the page's JavaScript (headless browser) so that
                # dynamically generated content is present in the DOM.
                response.html.render(wait=1.5, timeout=10.0)
                logger.debug('end render js...')
            return response.html
        else:
            return None
Example #24
Source File: crawling.py    From aspider with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def parse_output(self, url, text):
    """Build an ordered record (title, url, datetime, text) for a fetched page."""
    page = HTML(html=text)
    title_element = page.find('title', first=True)

    record = OrderedDict()
    record['title'] = title_element.text
    record['url'] = url
    record['datetime'] = now_time()
    record['text'] = text
    return record
Example #25
Source File: douban_250_scores.py    From aspider with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def process_page(text):
    """Parse one Douban top-250 listing page and append each movie entry
    (rank, score, title) to the module-level ``movies_250`` list."""
    page = HTML(html=text)
    for entry in page.find('#content  ol.grid_view > li'):
        rank = int(entry.find('em', first=True).text)
        title = entry.find('.info  span.title', first=True).text
        score = float(entry.find('.info  .rating_num', first=True).text)
        movies_250.append(Movie(rank, score, title))
Example #26
Source File: douban_requests.py    From aspider with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def get_root():
    """Fetch the root listing page, queue its pagination links, and parse it.

    Fix: the pagination pattern is now a raw string; ``'\\?start=.+'`` as a
    plain string relies on the invalid escape sequence ``\\?``, which emits
    a DeprecationWarning (SyntaxWarning in newer Pythons). The matched
    value is unchanged.
    """
    res = requests.get(root_url)
    text = res.text
    html_page = HTML(html=text, url=root_url)
    links = html_page.links
    # Collect every "?start=..." pagination link for later crawling.
    pattern = r'\?start=.+'
    for link in links:
        if re.search(pattern, link):
            pages_list.append(link)
    parse_page(text)
Example #27
Source File: douban_requests.py    From aspider with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def parse_page(text):
    """Print the primary title of every movie on one listing page."""
    page = HTML(html=text)
    selector = '#content > div > div.article > ol > li > div > div.info > div.hd > a > span:nth-child(1)'
    for span in page.find(selector):
        print(span.text)
Example #28
Source File: douban_aspider.py    From aspider with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def parse_page(text):
    """Print the primary (first) title span of each movie entry on the page."""
    document = HTML(html=text)
    matches = document.find('#content > div > div.article > ol > li > div > div.info > div.hd > a > span:nth-child(1)')
    for match in matches:
        print(match.text)
Example #29
Source File: test_helpers.py    From codechef-cli with GNU General Public License v3.0 5 votes vote down vote up
def test_get_csrf_token(self):
    """Should return the token stored in the element's ``value`` attribute."""
    fragment = HTML(html="<input id='a' value='b' />")
    self.assertEqual(get_csrf_token(fragment, "a"), 'b')
Example #30
Source File: test_helpers.py    From codechef-cli with GNU General Public License v3.0 5 votes vote down vote up
def test_get_csrf_token_no_value(self):
    """Should return None when the element carries no ``value`` attribute."""
    fragment = HTML(html="<input id='a' />")
    self.assertIsNone(get_csrf_token(fragment, "a"))