Python lxml.etree.HTMLParser() Examples

The following are 30 code examples of lxml.etree.HTMLParser(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml.etree, or try the search function.
Example #1
Source File: nasa.py    From daily-wallpaper with MIT License 6 votes vote down vote up
def resolve_url(self):
        """Resolve today's APOD image URL.

        Fetches the page at the module-level ``URL``, takes the link
        wrapped around the first ``<img>`` element, and stores the
        absolute image URL in ``self._url``.

        Returns:
            bool: True when an image link was found and stored,
            False on any failure (non-200 response, no image, errors).
        """
        try:
            r = requests.get(URL)
            if r.status_code == 200:
                parser = etree.HTMLParser(recover=True)
                html = etree.HTML(r.content, parser)
                # iter() always returns an iterator (never None), so the
                # original `is not None` / list() dance was dead code —
                # just take the first <img> if one exists.
                first_img = next(html.iter('img'), None)
                if first_img is not None:
                    image_url = first_img.getparent().attrib['href']
                    self._url = 'https://apod.nasa.gov/' + image_url
                    return True
        except Exception:
            # Best effort: network, parse, or missing-attribute errors
            # all simply mean "could not resolve today's wallpaper".
            pass
        return False
Example #2
Source File: fstoppers.py    From daily-wallpaper with MIT License 6 votes vote down vote up
def resolve_url(self):
        """Resolve the Fstoppers "Photo of the Day" image URL.

        Loads the /potd listing, follows the first linked image to its
        detail page, and stores that page's ``data-xlarge`` attribute
        (when present) in ``self._url``.

        Returns:
            bool: True once a div with class "photo" is found on the
            detail page (even if it lacks ``data-xlarge``, matching the
            original control flow), False otherwise.
        """
        url = URL.format('/potd')
        try:
            r = requests.get(url)
            if r.status_code == 200:
                parser = etree.HTMLParser(recover=True)
                html = etree.HTML(r.content, parser)
                for element in html.iter('img'):
                    if 'href' in element.getparent().attrib:
                        url = URL.format(element.getparent().attrib['href'])
                        break
                # `url` is always a string here, so the old
                # `if url is not None` guard was dead code.  If no linked
                # <img> was found, `url` still points at /potd and that
                # page is simply fetched again (original behavior).
                r = requests.get(url)
                if r.status_code == 200:
                    html = etree.HTML(r.content, parser)
                    for element in html.iter('div'):
                        # .get() collapses the membership-test + lookup pair.
                        if element.attrib.get('class') == 'photo':
                            if 'data-xlarge' in element.attrib:
                                self._url = element.attrib['data-xlarge']
                            return True
        except Exception:
            # Best effort: any failure means "could not resolve".
            pass
        return False
Example #3
Source File: spider.py    From You-are-Pythonista with GNU General Public License v3.0 6 votes vote down vote up
def parse_page(url):
    """Fetch dianping.com shop pages from a hard-coded URL list.

    `url` is used only to build the per-request headers (see
    build_headers(url)); the hrefs that were once scraped from the
    listing page via xpath (commented out below) are pinned inline.

    NOTE(review): the loop `break`s after the first href, so only the
    first shop page is ever fetched and dumped to test.html — this looks
    like leftover debugging code; confirm before relying on it.
    """
    # headers = build_headers()
    #
    # result = requests.get(url,headers=headers).text
    #
    # parse = etree.HTMLParser(encoding='utf-8')
    # html = etree.HTML(result,parser=parse)
    #
    # hrefs = html.xpath(r'//div[@id="shop-all-list"]//div[@class="tit"]/a/@href')

    hrefs = ['http://www.dianping.com/shop/23093707', 'http://www.dianping.com/brands/b23093707', 'http://www.dianping.com/shop/2461336', 'http://www.dianping.com/shop/90085699', 'http://www.dianping.com/shop/13810171', 'http://www.dianping.com/brands/b13810171', 'http://www.dianping.com/shop/58322041', 'http://www.dianping.com/shop/80620237', 'http://www.dianping.com/shop/130946881', 'http://www.dianping.com/brands/b130946881', 'http://www.dianping.com/shop/32704021', 'http://www.dianping.com/brands/b18005322', 'http://www.dianping.com/shop/75141698', 'http://www.dianping.com/brands/b10008473', 'http://www.dianping.com/shop/92384680', 'http://www.dianping.com/shop/47008792', 'http://www.dianping.com/brands/b47008792', 'http://www.dianping.com/shop/67997136', 'http://www.dianping.com/brands/b4087801', 'http://www.dianping.com/shop/111533101', 'http://www.dianping.com/shop/98779037', 'http://www.dianping.com/shop/102025765', 'http://www.dianping.com/brands/b23093707']


    every_page_headers = build_headers(url)
    print(every_page_headers)
    for href in hrefs:
        result = requests.get(href,headers=every_page_headers).text
        # Dump the fetched page for inspection, then stop after the
        # first href (debugging shortcut).
        with open('test.html','w',encoding='utf-8') as fp:
            fp.write(result)
        break 
Example #4
Source File: Fun.py    From NotSoBot with MIT License 6 votes vote down vote up
async def se(self, ctx, em:str):
		"""Returns a steam emoji image.

		Fix: the body uses `await`, which is a SyntaxError inside a plain
		`def` — this must be a coroutine function (`async def`), matching
		how discord.py command callbacks are defined.
		"""
		em = em.lower()
		desc = None
		if em == ':b1:' or em == 'b1':
			b = self.files_path('b1.png')
		else:
			url = "https://steamcommunity-a.akamaihd.net/economy/emoticonhover/{0}".format(em)
			txt = await self.get_text(url)
			if not txt:
				await self.bot.say(":warning: `Emoticon Not Found/Invalid`\nRemember to do :steam_emoticon: (optional ':').")
				return
			root = etree.fromstring(txt, etree.HTMLParser())
			base = root.find('.//img[@class="emoticon_large"]')
			# src is a data URI; the first 22 chars are presumably the
			# "data:image/png;base64," prefix — TODO confirm.
			b = BytesIO(base64.b64decode(base.attrib['src'][22:]))
			desc = '**{0}**'.format(root.find('.//div[@class="emoticon_hover_desc"]').text)
		await self.bot.upload(b, filename='steam.png', content=desc) 
Example #5
Source File: test_examples.py    From dataflows with MIT License 6 votes vote down vote up
def country_population():
    """Yield ``{'name': ..., 'population': ...}`` dicts scraped from
    Wikipedia's list of countries and dependencies by population.

    Rows with 3 or fewer <td> cells (headers/spacers) are skipped.
    Raises on network failure; may raise AttributeError if a data row's
    second cell has no <a> (original behavior, unchanged).
    """
    from lxml import etree
    from urllib.request import urlopen
    page = urlopen('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population').read()
    parser = etree.HTMLParser()
    tree = etree.fromstring(page, parser=parser)
    for table in tree.findall('.//table'):
        # Only the main data table carries the "wikitable" class.
        if 'wikitable' in table.attrib.get('class', ''):
            for row in table.findall('.//tr'):
                cells = row.findall('td')
                if len(cells) > 3:
                    # `yield` is a statement, not a function — dropped the
                    # call-style parentheses and dict() wrapper.
                    yield {
                        'name': cells[1].find('.//a').attrib.get('title'),
                        'population': cells[2].text,
                    }
Example #6
Source File: tweets_scrape.py    From tweet_scrapper with GNU General Public License v3.0 6 votes vote down vote up
def __init__(self, twitter_request_url, twitter_request_header,
                 twitter_request_params=None, twitter_request_proxies=None, scrape_pages=2,
                 twitter_file_path=None, twitter_file_format='csv'):
        """Configure the tweet scraper.

        Args:
            twitter_request_url: endpoint URL to request tweets from.
            twitter_request_header: request headers; when None, the
                existing value of ``__twitter_request_header__`` is left
                untouched.
            twitter_request_params: optional query parameters.
            twitter_request_proxies: optional proxy mapping.
            scrape_pages: number of result pages to walk (default 2).
            twitter_file_path: optional output file path for persistence.
            twitter_file_format: persistence format (default 'csv').
        """

        self.__twitter_request_url__ = twitter_request_url
        # Only override the header when one is supplied.
        # NOTE(review): if the class declares no default for
        # __twitter_request_header__, a later read would raise
        # AttributeError when None is passed — confirm against the class.
        if twitter_request_header is not None:
            self.__twitter_request_header__ = twitter_request_header
        self.__twitter_request_params__ = twitter_request_params
        self.__twitter_request_proxies__ = twitter_request_proxies
        self.scrape_pages = scrape_pages
        self.__twitter_tweet_persist_file_path__ = twitter_file_path
        self.__twitter_tweet_persist_file_format__ = twitter_file_format

        # Pre-compile the hashtag regex (pattern defined on the class).
        self.hashtag_capture = re.compile(self._tweet_hastag_pattern_)

        # Shared parser for all page parses; strips noise up front.
        self.html_parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
        self.proxy_json = None 
Example #7
Source File: avsox.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getNum(a):
    """Extract the product identification code from an avsox detail page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    hits = tree.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')
    # str(list) then strip of "[' ']" characters — crude but matches the
    # file-wide convention for flattening xpath results.
    return str(hits).strip(" ['']")
Example #8
Source File: xcity.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getRelease(a):
    """Return the release date (YYYY-MM-DD) from an xcity detail page.

    Returns '' when the field is missing or no date pattern is found.
    """
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')[0]
    except Exception:
        # Was a bare `except:`; Exception keeps the same best-effort
        # semantics without swallowing KeyboardInterrupt/SystemExit.
        return ''
    try:
        # Raw string avoids the invalid-escape-sequence warning.
        return re.findall(r'\d{4}/\d{2}/\d{2}', result)[0].replace('/', '-')
    except Exception:
        return ''
Example #9
Source File: avsox.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getRelease(a):
    """Extract the release-date text from an avsox detail page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    texts = tree.xpath('//span[contains(text(),"发行时间:")]/../text()')
    return str(texts).strip(" ['']")
Example #10
Source File: avsox.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getCover(htmlcode):
    """Extract the large cover-image URL from an avsox detail page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    srcs = tree.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')
    return str(srcs).strip(" ['']")
Example #11
Source File: avsox.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getCover_small(htmlcode):
    """Extract the thumbnail URL from an avsox search-result page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    srcs = tree.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')
    return str(srcs).strip(" ['']")
Example #12
Source File: xcity.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getRuntime(a):
    """Return the runtime in minutes (as a digit string) from an xcity
    detail page, or '' when the field or digits are missing."""
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[3]/text()')[0]
    except Exception:
        # Was a bare `except:`; Exception keeps best-effort semantics
        # without swallowing KeyboardInterrupt/SystemExit.
        return ''
    try:
        # Raw string avoids the invalid-escape-sequence warning.
        return re.findall(r'\d+', result1)[0]
    except Exception:
        return ''
Example #13
Source File: mgstage.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getSeries(a):
    """Extract the series name from an mgstage detail page.

    The value may appear as link text or plain text under the シリーズ
    row; both are collected and concatenated.
    """
    tree = etree.fromstring(a, etree.HTMLParser())
    linked = str(tree.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()'))
    linked = linked.strip(" ['']").strip('\\n    ').strip('\\n')
    plain = str(tree.xpath('//th[contains(text(),"シリーズ")]/../td/text()'))
    plain = plain.strip(" ['']").strip('\\n    ').strip('\\n')
    return (linked + plain).strip('+').replace("', '", '').replace('"', '')
Example #14
Source File: xcity.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getTitle(a):
    """Extract the programme title from an xcity detail page.

    Raises IndexError when the title node is absent (same as original).
    """
    tree = etree.fromstring(a, etree.HTMLParser())
    titles = tree.xpath('//*[@id="program_detail_title"]/text()')
    return titles[0]
Example #15
Source File: xcity.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getActor(a):
    """Extract the (first) actor name from an xcity detail page.

    Raises IndexError when the actor node is absent (same as original).
    """
    tree = etree.fromstring(a, etree.HTMLParser())
    names = tree.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[3]/a/text()')
    return names[0]
Example #16
Source File: xcity.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getStudio(a):
    """Return the studio name from an xcity detail page.

    Tries the positional xpath first, then falls back to the labelled
    片商 ("studio") field.
    """
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']")
    except Exception:
        # Was a bare `except:`; Exception preserves the fallback path
        # without swallowing KeyboardInterrupt/SystemExit.
        result = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']")
    return result.strip('+').replace("', '", '').replace('"', '')
Example #17
Source File: javdb.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getStudio(a):
    """Extract the studio (片商) from a javdb detail page.

    The value may be plain text or link text next to the label; both
    are collected and concatenated.
    """
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']")
    return (plain + linked).strip('+').replace("', '", '').replace('"', '')
Example #18
Source File: avsox.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getLabel(a):
    """Extract the series/label (系列) link text from an avsox page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    hits = tree.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')
    return str(hits).strip(" ['']")
Example #19
Source File: avsox.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getStudio(a):
    """Extract the studio (制作商) from an avsox detail page; multiple
    entries are joined with spaces."""
    tree = etree.fromstring(a, etree.HTMLParser())
    hits = tree.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')
    return str(hits).strip(" ['']").replace("', '", ' ')
Example #20
Source File: avsox.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getTitle(a):
    """Return the title from an avsox detail page, or '' on any failure.

    '/' characters are removed from the result — presumably because the
    title is later used in file paths; confirm with callers.
    """
    try:
        html = etree.fromstring(a, etree.HTMLParser())
        result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']")
        return result.replace('/', '')
    except Exception:
        # Was a bare `except:`; Exception keeps the same best-effort
        # semantics without swallowing KeyboardInterrupt/SystemExit.
        return ''
Example #21
Source File: ADC_function.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getXpathSingle(htmlcode,xpath):
    """Evaluate an arbitrary xpath against `htmlcode` and flatten the
    result list to a single string (file-wide strip convention)."""
    document = etree.fromstring(htmlcode, etree.HTMLParser())
    raw = document.xpath(xpath)
    return str(raw).strip(" ['']")
Example #22
Source File: javdb.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getSeries(a):
    """Extract the series (系列) from a javdb detail page.

    The value may be plain text or link text next to the label; both
    are collected and concatenated.
    """
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
    return (plain + linked).strip('+').replace("', '", '').replace('"', '')
Example #23
Source File: javdb.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getDirector(a):
    """Extract the director (導演) from a javdb detail page.

    The value may be plain text or link text next to the label; both
    are collected and concatenated.
    """
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
    return (plain + linked).strip('+').replace("', '", '').replace('"', '')
Example #24
Source File: javdb.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getCover(htmlcode):
    """Extract the video cover-image URL from a javdb detail page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    srcs = tree.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")
    return str(srcs).strip(" ['']")
Example #25
Source File: javdb.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getCover_small(a, index=0):
    """Return the small cover-image URL at position `index`.

    javdb sometimes returns multiple results — do NOT just take the
    first one; pick the entry with the correct index number.

    Raises IndexError if `index` is out of range (original behavior).
    """
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
    # Protocol-relative URLs ("//...") need an explicit scheme.
    # PEP 8 idiom: `x not in y`, not `not x in y`.
    if 'https' not in result:
        result = 'https:' + result
    return result
Example #26
Source File: javdb.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getTag(a):
    """Return the list of category/tag (類別) strings from a javdb page."""
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
        return result
    except Exception:
        # Was a bare `except:`.  Note xpath() returns [] rather than
        # raising when nothing matches, so this fallback is likely dead;
        # kept for safety with narrower exception handling.
        result = html.xpath('//strong[contains(text(),"類別")]/../span/text()')
        return result
Example #27
Source File: javdb.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getRelease(a):
    """Extract the release time (時間) from a javdb detail page.

    The value may be plain text or link text next to the label; both
    are collected and concatenated.
    """
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']")
    return (plain + linked).strip('+')
Example #28
Source File: javdb.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getLabel(a):
    """Extract the label/series (系列) from a javdb detail page.

    The value may be plain text or link text next to the label; both
    are collected and concatenated.
    """
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
    return (plain + linked).strip('+').replace("', '", '').replace('"', '')
Example #29
Source File: javdb.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getRuntime(a):
    """Extract the runtime (時長) from a javdb detail page.

    NOTE(review): rstrip('mi') strips trailing 'm'/'i' *characters*, not
    the suffix "min" — e.g. a trailing 'n' would survive. Behavior kept
    as-is; confirm the intended suffix with real page data.
    """
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
    return (plain + linked).strip('+').rstrip('mi')
Example #30
Source File: xcity.py    From AV_Data_Capture with GNU General Public License v3.0 5 votes vote down vote up
def getCover_small(a, index=0):
    """Return the small cover-image URL at position `index`.

    The page sometimes carries multiple results — do NOT just take the
    first one; pick the entry with the correct index number.

    Raises IndexError if `index` is out of range (original behavior).
    """
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
    # Protocol-relative URLs ("//...") need an explicit scheme.
    # PEP 8 idiom: `x not in y`, not `not x in y`.
    if 'https' not in result:
        result = 'https:' + result
    return result