Python lxml.etree.HTML Examples

The following are 16 code examples of lxml.etree.HTML(), drawn from open-source projects; the originating project and source file are credited above each example. You may also want to check out all available functions and classes of the lxml.etree module.
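All of the examples below follow the same basic pattern: lxml.etree.HTML() parses a string (or bytes) of possibly broken markup into an element tree, and xpath() queries pull data back out. Here is a minimal sketch of that pattern, using a made-up document:

from lxml import etree

# etree.HTML() recovers from invalid markup and returns the root element.
root = etree.HTML('<html><body><h1>Title</h1><p class="intro">Hello')

# xpath() returns a list; it is empty when nothing matches.
titles = root.xpath('//h1/text()')                 # ['Title']
intros = root.xpath('//p[@class="intro"]/text()')  # ['Hello']
print(titles[0] if titles else 'no title found')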
Example #1
Source File: diagnose.py    From pledgeservice with Apache License 2.0
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b - a))
Example #2
Source File: html_parser.py    From amazon-review-spider with MIT License
def get_reviews_info(self, content):
        content = str(content)
        content = content.replace("<br>", "")
        content = content.replace("<br />", "")
        html = etree.HTML(content)

        star_list = html.xpath('//a/i[@data-hook="review-star-rating"]/span[@class="a-icon-alt"]/text()')
        title_list = html.xpath('//div[@class="a-row"]/a[@data-hook="review-title"]/text()')

        review_body_list = html.xpath('//div[@class="a-row review-data"]/span['
                                      '@data-hook="review-body"]/text()')

        all_review_list = []
        for index in range(len(star_list)):
            # First character of the rating text, e.g. "4.0 out of 5 stars" -> "4"
            star_num = star_list[index][:1]
            # Skip reviews rated below four stars
            if int(star_num) < 4:
                continue
            all_review_list.append(
                {"star": star_num, "title": title_list[index], "body": review_body_list[index],
                 'trans': self.trans.transEn2Zh(review_body_list[index])})

        return all_review_list 
Example #3
Source File: parser.py    From scraper-fourone-jobs with GNU General Public License v2.0
def parse(self, html: bytes) -> ApplyContactPerson:
        # Extract the information via XPath
        tree = etree.HTML(html)
        contact_name = str(tree.xpath(Config.CONTACT_PERSON_XPATH)[0])

        # The fields below carry the "txticon" style and are obfuscated
        raw_email = str(tree.xpath(Config.EMAIL_XPATH)[0])
        raw_telphone = str(tree.xpath(Config.TELPHONE_XPATH)[0])
        raw_mobile = str(tree.xpath(Config.MOBILE_PHONE_XPATH)[0])

        # Get the URL of the CSS file that defines the custom font
        custom_font_path = self._find_custom_font(html)
        woff: WOFFContent = WebOpenFontReader.read(custom_font_path)

        # Decode the obfuscated contents
        decoder = FontTextDecoder(woff, Config.FONT_GLYPHID_TRANSLATOR)
        email = decoder.decode(raw_email)
        telphone = decoder.decode(raw_telphone)
        mobile = decoder.decode(raw_mobile)
        return ApplyContactPerson(contact_name, email, telphone, mobile) 
Example #4
Source File: diagnose.py    From ServerlessCrawler-VancouverRealState with MIT License
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document."""
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0,3)
        if choice == 0:
            # New tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            elements.append(rsentence(random.randint(1,4)))
        elif choice == 2:
            # Close a tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
    return "<html>" + "\n".join(elements) + "</html>" 
Example #5
Source File: crawl.py    From Vxscan with Apache License 2.0
def jsparse(self, r):
        try:
            html = etree.HTML(r.text)
            result = html.xpath('//script/@src')
            for i in result:
                if not re.search(
                    r'jquery|bootstrap|adsbygoogle|angular|javascript|#|vue|react|51.la/=|map\.baidu\.com|canvas|cnzz\.com|slick\.js|autofill-event\.js|tld\.js|clipboard|Chart\.js',
                    i):
                    if '://' not in i:
                        i = re.sub(r'^/|^\.\./', '', i)
                        i = self.host + '/' + i
                    self.js.append(i)
        except (AttributeError, ValueError):
            pass
        except Exception as e:
            logging.exception(e) 
Example #6
Source File: district.py    From BeikeSpider with Apache License 2.0
def get_districts(city):
    """
    获取各城市的区县中英文对照信息
    :param city: 城市
    :return: 英文区县名列表
    """
    url = 'https://{0}.ke.com/xiaoqu/'.format(city)
    headers = create_headers()
    response = requests.get(url, timeout=10, headers=headers)
    html = response.content
    root = etree.HTML(html)
    elements = root.xpath("/html/body/div[3]/div[1]/dl[2]/dd/div/div/a")
    en_names = list()
    ch_names = list()
    for element in elements:
        link = element.attrib['href']
        en_names.append(link.split('/')[-2])
        ch_names.append(element.text)

    # Print the English and Chinese district name lists
    for index, name in enumerate(en_names):
        chinese_city_district_dict[name] = ch_names[index]
        print(name + ' -> ' + ch_names[index])
    return en_names 
Example #7
Source File: fetch.py    From twstock with MIT License
def fetch_data(url):
    r = requests.get(url, proxies=get_proxies())
    root = etree.HTML(r.text)
    trs = root.xpath('//tr')[1:]

    result = []
    typ = ''
    for tr in trs:
        tr = list(map(lambda x: x.text, tr.iter()))
        if len(tr) == 4:
            # A four-column row marks the start of a new type section
            typ = tr[2].strip(' ')
        else:
            # Any other row is a data row under the current type
            result.append(make_row_tuple(typ, tr))
    return result 
Example #8
Source File: fstoppers.py    From daily-wallpaper with MIT License
def resolve_url(self):
        url = URL.format('/potd')
        try:
            r = requests.get(url)
            if r.status_code == 200:
                parser = etree.HTMLParser(recover=True)
                html = etree.HTML(r.content, parser)
                # The first image wrapped in a link leads to the
                # photo-of-the-day page.
                for element in html.iter('img'):
                    if 'href' in element.getparent().attrib:
                        url = URL.format(element.getparent().attrib['href'])
                        break
                if url is not None:
                    r = requests.get(url)
                    if r.status_code == 200:
                        html = etree.HTML(r.content, parser)
                        # The full-size image URL sits in the data-xlarge
                        # attribute of the div with class "photo".
                        for element in html.iter('div'):
                            if 'class' in element.attrib and \
                                    element.attrib['class'] == 'photo':
                                if 'data-xlarge' in element.attrib:
                                    self._url = element.attrib['data-xlarge']
                                return True
        except Exception:
            pass
        return False 
Example #9
Source File: nasa.py    From daily-wallpaper with MIT License
def resolve_url(self):
        url = URL
        try:
            r = requests.get(url)
            if r.status_code == 200:
                parser = etree.HTMLParser(recover=True)
                html = etree.HTML(r.content, parser)
                images = html.iter('img')
                if images is not None:
                    images = list(images)
                    if len(images) > 0:
                        image_url = images[0].getparent().attrib['href']
                        self._url = 'https://apod.nasa.gov/' + image_url
                        return True
        except Exception:
            pass
        return False 
Example #10
Source File: data_spider.py    From dialogbot with Apache License 2.0
def symptom_spider(self, url):
        """症状信息解析"""
        html = self.get_html(url)
        selector = etree.HTML(html)
        symptoms = selector.xpath('//a[@class="gre" ]/text()')
        ps = selector.xpath('//p')
        detail = []
        for p in ps:
            info = p.xpath('string(.)') \
                .replace('\r', '') \
                .replace('\n', '') \
                .replace('\xa0', '') \
                .replace('   ', '') \
                .replace('\t', '')
            detail.append(info)
        return symptoms, detail 
Example #11
Source File: data_spider.py    From dialogbot with Apache License 2.0
def common_spider(self, url):
        """通用解析模块"""
        html = self.get_html(url)
        selector = etree.HTML(html)
        ps = selector.xpath('//p')
        infobox = []
        for p in ps:
            info = p.xpath('string(.)') \
                .replace('\r', '') \
                .replace('\n', '') \
                .replace('\xa0', '') \
                .replace('   ', '') \
                .replace('\t', '')
            if info:
                infobox.append(info)
        return '\n'.join(infobox) 
Example #12
Source File: get_title.py    From Vxscan with Apache License 2.0
def get_title(url):
    code = 0

    try:
        r = req.get(url)
        code = r.status_code
        # Sniff the encoding, then decode only the first 10 kB for the title
        coding = chardet.detect(r.content).get('encoding')
        text = r.content[:10000].decode(coding)
        html = etree.HTML(text)
        title = html.xpath('//title/text()')
        if title:
            return url + ' | ' + title[0]
        else:
            return url + ' | Status_code: ' + str(code)
    except Exception:
        pass

    return url + ' | Status_code: ' + str(code) 
Example #13
Source File: response.py    From pledgeservice with Apache License 2.0
def showbrowser(self):
        """
        Show this response in a browser window (for debugging purposes,
        when it's hard to read the HTML).
        """
        import webbrowser
        import tempfile
        f = tempfile.NamedTemporaryFile(prefix='webtest-page',
                                        suffix='.html')
        name = f.name
        f.close()
        f = open(name, 'w')
        if PY3:
            f.write(self.body.decode(self.charset or 'ascii', 'replace'))
        else:
            f.write(self.body)
        f.close()
        if name[0] != '/':  # pragma: no cover
            # windows ...
            url = 'file:///' + name
        else:
            url = 'file://' + name
        webbrowser.open_new(url) 
Example #14
Source File: doub.spider.py    From ParrotSecCN_Community_QQbot with GNU General Public License v2.0
def getNeedInfo(sourceHtml):
    """
        获取SS_SSR的请求地址
    """
    selector = etree.HTML(sourceHtml)

    lists = []
    for i in range(5, 9):
        ca_1 = selector.xpath(
            '/html/body/section/div[3]/div/div[1]/table/tbody/tr[' +
            str(i) +
            ']/td/a/@href')
        for j in ca_1:
            print(j)
            lists.append(j)

    return lists
# lists = [j for j in selector.xpath('/html/body/section/div[3]/div/div[1]/table/tbody/tr['+str(i)+']/td/a/@href')] 
Example #15
Source File: newrelic.py    From changelogs with MIT License
def get_content(session, urls):
    log = ""
    for url in urls:
        r = session.get(url)
        if r.status_code == 200:
            root = etree.HTML(r.content)
            try:
                article = root.xpath("//article/div[@class='content']")[0]
                content = etree.tostring(article, method="text", encoding='utf-8')
                if sys.version_info > (3, 0):
                    content = content.decode("utf-8")
                # drop the first two lines and the trailing line
                content = '\n'.join(content.split('\n')[2:-1])
                log += "# {version}\n{content}\n\n".format(
                    version=url.split("-")[-1],
                    content=content,
                )
            except IndexError:
                pass
    return log 
Example #16
Source File: spider.py    From job-web-demo with MIT License
def _parse_company_detail(self, detail_url):
        resp = self._request('get', detail_url)
        resp.encoding = resp.apparent_encoding
        html = etree.HTML(resp.text)
        name = html.xpath('//div[@class="company_main"]/h1/a/text()')
        # Check first, so a failed extraction does not raise an exception
        if not name:
            self.logger.debug('Got an error page')
            time.sleep(30)
            return self._parse_company_detail(detail_url)
        # The returned dict must contain these keys, or the write step fails
        supply = {
            'details': unescape(str(etree.tostring(html.xpath(
                '//span[@class="company_content"]')[0]), encoding='utf8')).replace(
                '<span class="company_content">', '').replace('\n', '').replace('\xa0', ''),
            'website': html.xpath('//div[@class="company_main"]/a[1]/@href')[0].split('?')[0],
        }
        return supply