Python html.parser.HTMLParser() Examples

The following are 30 code examples of html.parser.HTMLParser(), collected from open-source projects. The original project and source file are noted above each example. You may also want to check out all available functions and classes of the html.parser module.
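
Most of the examples below follow the same pattern: subclass HTMLParser, override the handle_* callbacks you care about, and push markup through feed(). A minimal, self-contained sketch of that pattern (the class name and sample markup are illustrative only):

from html.parser import HTMLParser

class TitleExtractor(HTMLParser):
    """Collect the text inside <title> elements."""
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self._in_title = False
        self.title = ""

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_title:
            self.title += data

parser = TitleExtractor()
parser.feed("<html><head><title>Hello &amp; welcome</title></head></html>")
print(parser.title)  # Hello & welcome
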
Example #1
Source File: matcher.py    From romcollectionbrowser with GNU General Public License v2.0
def resolveParseResult(self, result, itemName):
        """ This method is due to the fact that our result set is a list of dicts """

        resultValue = ""

        try:
            resultValue = result[itemName][0]
            resultValue = util.html_unescape(resultValue)
            resultValue = resultValue.strip()
            # unescape ugly html encoding from websites
            resultValue = HTMLParser().unescape(resultValue)

        except Exception as e:
            # log.warn("Error while resolving item: " + itemName + " : " + str(exc))
            log.warn("Error while resolving item: {0} : {1} {2}".format(itemName, type(e), str(e)))

        try:
            log.debug("Result " + itemName + " = " + resultValue)
        except:
            pass

        return resultValue 
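
A note on the unescape() call above (which recurs in many of the examples that follow): HTMLParser.unescape() was an undocumented helper, deprecated in Python 3.4 and removed in Python 3.9. On current Python the module-level html.unescape() does the same job:

import html

print(html.unescape("Tom &amp; Jerry &#8211; &quot;classics&quot;"))
# Tom & Jerry – "classics"
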
Example #2
Source File: distribution_point.py    From python-jss with GNU General Public License v3.0
def _scrape_tokens(self):
        """Scrape JCDS upload URL and upload access token from the jamfcloud instance."""
        jss = self.connection['jss']
        response = jss.scrape('legacy/packages.html?id=-1&o=c')
        matches = re.search(r'data-base-url="([^"]*)"', response.content.decode("utf-8"))
        if matches is None:
            raise JSSError('Did not find the JCDS base URL on the packages page. Is this actually Jamfcloud?')

        jcds_base_url = matches.group(1)

        matches = re.search(r'data-upload-token="([^"]*)"', response.content.decode("utf-8"))
        if matches is None:
            raise JSSError('Did not find the JCDS upload token on the packages page. Is this actually Jamfcloud?')

        jcds_upload_token = matches.group(1)

        h = HTMLParser()
        jcds_base_url = h.unescape(jcds_base_url)
        self.connection['jcds_base_url'] = jcds_base_url
        self.connection['jcds_upload_token'] = jcds_upload_token
        self.connection["url"] = jcds_base_url  # This is to make JSSImporter happy because it accesses .connection 
Example #3
Source File: serialeco.py    From script.module.openscrapers with GNU General Public License v3.0
def search_ep(self, titles, season, episode, year):
		try:
			for title in titles:
				data = {
					'fid_name': title,
					'sezon': season,
					'odcinek': episode,
					'title': title
				}

				result = requests.post('http://178.19.110.218/forumserialeco/skrypt/szukaj3.php', data=data).content
				result = result.decode('utf-8')
				h = HTMLParser()
				result = h.unescape(result)
				if result:
					return title, season, episode
		except:
			return 
Example #4
Source File: 29 PythonCeHui.py    From Python-Spider with Apache License 2.0
def note_msg(msg):
    print_msg(get_whole_msg(msg))
    content = HTMLParser().unescape(msg['Content'])
    try:
        content_tree = ETree.fromstring(content)
    except Exception:
        # invite/remove to chatroom notice, not a revoked message
        return
    if content_tree is None:
        return
    revoked = content_tree.find('revokemsg')
    if revoked is None:
        return
    old_msg_id = revoked.find('msgid').text
    old_msg = msg_store.get(old_msg_id)
    if old_msg is None:
        return
    msg_send = get_whole_msg(old_msg, download=True)
    for m in msg_send:
        bot.send(m, toUserName='filehelper')
    clear_timeouted_message() 
Example #5
Source File: wechat-anti-revoke-py3.py    From wechat-anti-revoke with Apache License 2.0
def note_msg(msg):
    print_msg(get_whole_msg(msg))
    content = HTMLParser().unescape(msg['Content'])
    try:
        content_tree = ETree.fromstring(content)
    except Exception:
        # invite/remove to chatroom notice, not a revoked message
        return
    if content_tree is None:
        return
    revoked = content_tree.find('revokemsg')
    if revoked is None:
        return
    old_msg_id = revoked.find('msgid').text
    old_msg = msg_store.get(old_msg_id)
    if old_msg is None:
        return
    msg_send = get_whole_msg(old_msg, download=True)
    for m in msg_send:
        bot.send(m, toUserName='filehelper')
    clear_timeouted_message() 
Example #6
Source File: html_linter.py    From html-linter with Apache License 2.0
def get_attribute_line_column(tag_definition, line, column, attribute):
    """Returns the line and column of the provided attribute.

    Args:
        tag_definition: str with the definition of the tag.
        line: line where the tag starts.
        column: column where the tag starts (1-based).
        attribute: str representing the attribute to find.

    Return:
       A (line, column) tuple representing the position of the attribute.
    """
    for match in HTMLParser.attrfind.finditer(tag_definition):
        if match.group(1).lower() == attribute:
            return get_line_column(tag_definition, line, column, match.start(1))

    assert False, 'Could not find the requested attribute %s' % attribute 
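
HTMLParser.attrfind used above is an internal regular expression of the parser module rather than documented API, so it can change between Python versions. The documented route is to let the parser report attributes through handle_starttag() and getpos(), then search get_starttag_text() yourself when you need per-attribute columns; a rough sketch of the first half of that (the class name is made up for illustration):

from html.parser import HTMLParser

class AttrLocator(HTMLParser):
    """Record (tag, attrs, line, column) for every start tag seen."""
    def __init__(self):
        super().__init__()
        self.seen = []

    def handle_starttag(self, tag, attrs):
        line, col = self.getpos()  # position of this tag's opening '<' (col is 0-based)
        self.seen.append((tag, dict(attrs), line, col))

locator = AttrLocator()
locator.feed('<meta charset="utf-8">\n<a href="/docs" id="home">Docs</a>')
for entry in locator.seen:
    print(entry)
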
Example #7
Source File: html_linter.py    From html-linter with Apache License 2.0
def __init__(self, html):
        self._messages = []

        # Variables used to get the indentation
        self._last_data = ''
        self._last_data_position = (0, 1)
        self._last_indent = 0

        # Variables used to check if a charset tag should be required.
        self._first_meta_line_col = None
        self._after_head_line_col = None
        self._has_charset = False

        # Variables to extend the feature set of HTMLParser.
        self._endtag_text = None

        HTMLParser.HTMLParser.__init__(self)

        # In case we are dealing with Python 3, set it to non-strict mode.
        if hasattr(self, 'strict'):
            self.strict = False

        self.feed(html)
        self.close() 
Example #8
Source File: filmwebbooster.py    From script.module.openscrapers with GNU General Public License v3.0
def search_ep(self, titles, season, episode, year):
		try:
			searchtitles = titles
			for searchtitle in searchtitles:

				response = requests.get(self.base_link + self.search_serial % searchtitle)
				result = response.content
				h = HTMLParser()
				result = h.unescape(result)
				result = client.parseDOM(result, 'ul', attrs={'class': 'resultsList hits'})
				items = client.parseDOM(result, 'li')
				items = [x for x in items if not str(x).startswith("<a href")]
				orgtitles = []
				for content in items:
					try:
						orgtitle = str(
							client.parseDOM(content, 'div', attrs={'class': 'filmPreview__originalTitle'})[0])
					except:
						orgtitle = "0"
						pass
					orgtitles.append(orgtitle)
				ids = client.parseDOM(items, 'data', ret='data-id')
				titles = client.parseDOM(result, 'data', ret='data-title')
				years = client.parseDOM(result, 'span', attrs={'class': 'filmPreview__year'})

				for item in zip(titles, ids, years, orgtitles):
					f_title = str(item[0])
					f_id = str(item[1])
					f_year = str(item[2])
					f_orgtitle = str(item[3])
					teststring = cleantitle.normalize(cleantitle.getsearch(searchtitle))
					words = cleantitle.normalize(cleantitle.getsearch(f_title)).split(" ")
					if self.contains_all_wors(teststring, words) and year == f_year:
						return (f_title, f_id, f_year, f_orgtitle, "SERIAL", season, episode)
		except:
			return 
Example #9
Source File: kinonet.py    From script.module.openscrapers with GNU General Public License v3.0
def search(self, title, localtitle, year, is_movie_search):
		try:
			titles = []
			titles.append(cleantitle.normalize(cleantitle.getsearch(title)))
			titles.append(cleantitle.normalize(cleantitle.getsearch(localtitle)))
			titles.append(title)
			titles.append(localtitle)
			for title in titles:
				try:
					url = self.search_link + str(title)
					result = self.session.get(url).content
					result = result.decode('utf-8')
					h = HTMLParser()
					result = h.unescape(result)
					result = client.parseDOM(result, 'div', attrs={'class': 'card-body p-2'})

					for item in result:
						try:
							nazwa = re.findall("""Film online: (.*?)\"""", item)[0]
							try:
								nazwa = re.findall(""">(.*?)<""", nazwa)[0]
							except:
								pass
							name = cleantitle.normalize(cleantitle.getsearch(nazwa))
							rok = re.findall("""Rok wydania filmu online\".*>(.*?)<""", item)[0]
							item = str(item).replace("<span style='color:red'>", "").replace("</span>", "")
							link = re.findall("""href=\"(.*?)\"""", item)[0]
							if link.startswith('//'):
								link = "https:" + link
							name = name.replace("  ", " ")
							title = title.replace("  ", " ")
							words = name.split(" ")
							if self.contains_all_words(title, words) and str(year) in rok:
								return link
						except:
							continue
				except:
					continue
		except:
			return 
Example #10
Source File: serialeco.py    From script.module.openscrapers with GNU General Public License v3.0
def sources(self, url, hostDict, hostprDict):
		try:
			sources = []
			data = {
				'fid_name': url[0],
				'sezon': url[1],
				'odcinek': url[2],
				'title': url[0]
			}

			result = requests.post('http://178.19.110.218/forumserialeco/skrypt/szukaj3.php', data=data).content
			result = result.decode('utf-8')
			h = HTMLParser()
			result = h.unescape(result)
			if result:
				wersja = re.findall("""wersja: <b>(.*?)<\/b>""", result)
				id = re.findall("""url='(.*?)'""", result)
				for item in zip(wersja, id):
					try:
						if item[1]:
							info = self.get_lang_by_type(item[0])
							content = client.request("http://seriale.co/frame.php?src=" + item[1])
							video_link = str(client.parseDOM(content, 'iframe', ret='src')[0])
							valid, host = source_utils.is_host_valid(video_link, hostDict)
							if valid:
								sources.append(
									{'source': host, 'quality': 'SD', 'language': info[0], 'url': video_link,
									 'info': info[1], 'direct': False,
									 'debridonly': False})
							else:
								continue
					except:
						continue
				return sources
		except:
			return sources 
Example #11
Source File: diagnose.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: Some markup.
    """
    parser = AnnouncingParser()
    parser.feed(data) 
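
AnnouncingParser is defined elsewhere in bs4.diagnose; conceptually it is just an HTMLParser subclass that prints each event as it fires. If you want to run the same trace idea without Beautiful Soup, a stand-in might look like this (the printed format is an assumption, not the bs4 implementation):

from html.parser import HTMLParser

class AnnouncingParser(HTMLParser):
    """Print every parser event as it happens (stand-in, not bs4's class)."""
    def handle_starttag(self, tag, attrs):
        print("START", tag, attrs)

    def handle_endtag(self, tag):
        print("END", tag)

    def handle_data(self, data):
        print("DATA", repr(data))

    def handle_comment(self, data):
        print("COMMENT", repr(data))

def htmlparser_trace(data):
    parser = AnnouncingParser()
    parser.feed(data)

htmlparser_trace("<p>Hello <!-- hi --><b>world</b></p>")
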
Example #12
Source File: html2text.py    From RedditBots with MIT License
def close(self):
        HTMLParser.HTMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')

        self.outtext = self.outtext.join(self.outtextlist)
        if self.unicode_snob:
            nbsp = unichr(name2cp('nbsp'))
        else:
            nbsp = u' '
        self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)

        return self.outtext 
Example #13
Source File: diagnose.py    From Tautulli with GNU General Public License v3.0
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data) 
Example #14
Source File: diagnose.py    From Tautulli with GNU General Public License v3.0
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a)) 
Example #15
Source File: Erome.py    From bulk-downloader-for-reddit with GNU General Public License v3.0
def getLinks(self,url,lineNumber=129):
 
        content = []
        lineNumber = None

        class EromeParser(HTMLParser):
            tag = None
            def handle_starttag(self, tag, attrs):
                self.tag = {tag:{attr[0]: attr[1] for attr in attrs}}

        pageSource = (urllib.request.urlopen(url).read().decode().split('\n'))

        """ FIND WHERE ALBUM STARTS IN ORDER NOT TO GET WRONG LINKS"""
        for i in range(len(pageSource)):
            obj = EromeParser()
            obj.feed(pageSource[i])
            tag = obj.tag
            
            if tag is not None:
                if "div" in tag:
                    if "id" in tag["div"]:
                        if tag["div"]["id"] == "album":
                            lineNumber = i
                            break

        for line in pageSource[lineNumber:]:
            obj = EromeParser()
            obj.feed(line)
            tag = obj.tag
            if tag is not None:
                if "img" in tag:
                    if "class" in tag["img"]:
                        if tag["img"]["class"]=="img-front":
                            content.append(tag["img"]["src"])
                elif "source" in tag:
                    content.append(tag["source"]["src"])
                    
        return [
            link for link in content \
            if link.endswith("_480p.mp4") or not link.endswith(".mp4")
        ] 
Example #16
Source File: utils.py    From polaris with GNU Affero General Public License v3.0
def remove_html(text):
    text = re.sub('<[^<]+?>', '', text)
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    return text
    s = HTMLParser()
    s.reset()
    s.reset()
    s.strict = False
    s.convert_charrefs = True
    s.fed = []
    s.feed(text)
    return ''.join(s.fed) 
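
Note that everything after the first return in remove_html() above is unreachable, and it would not work as written anyway: a plain HTMLParser instance never appends to a fed attribute. A working HTMLParser-based tag stripper along the same lines (a sketch, not the polaris implementation) would be:

from html.parser import HTMLParser

class TagStripper(HTMLParser):
    """Keep only the text content, dropping all tags."""
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.fed = []

    def handle_data(self, data):
        self.fed.append(data)

    def get_text(self):
        return "".join(self.fed)

def strip_html(text):
    stripper = TagStripper()
    stripper.feed(text)
    return stripper.get_text()

print(strip_html("<b>bold</b> &amp; <i>italic</i>"))  # bold & italic
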
Example #17
Source File: headlines.py    From mlb-led-scoreboard with GNU General Public License v3.0
def __strings_for_feed(self, feed, max_entries):
    spaces = " " * HEADLINE_SPACER_SIZE
    title = feed.feed.title.encode("ascii", "ignore")
    headlines = ""

    for idx, entry in enumerate(feed.entries):
      if idx < max_entries:
        h = HTMLParser()
        text = h.unescape(entry.title.encode("ascii", "ignore"))
        headlines += text + spaces
    return title + spaces + headlines 
Example #18
Source File: html2text.py    From RedditBots with MIT License
def feed(self, data):
        data = data.replace("</' + 'script>", "</ignore>")
        HTMLParser.HTMLParser.feed(self, data) 
Example #19
Source File: filmwebbooster.py    From script.module.openscrapers with GNU General Public License v3.0
def search(self, title, localtitle, year):
		try:
			searchtitles = (str(localtitle), str(title))
			for searchtitle in searchtitles:

				response = requests.get(self.base_link + self.search_film % searchtitle)
				result = response.content
				h = HTMLParser()
				result = h.unescape(result)
				result = client.parseDOM(result, 'ul', attrs={'class': 'resultsList hits'})
				items = client.parseDOM(result, 'li')
				items = [x for x in items if not str(x).startswith("<a href")]
				orgtitles = []
				for content in items:
					try:
						orgtitle = str(
							client.parseDOM(content, 'div', attrs={'class': 'filmPreview__originalTitle'})[0])
					except:
						orgtitle = "0"
						pass
					orgtitles.append(orgtitle)
				ids = client.parseDOM(items, 'data', ret='data-id')
				titles = client.parseDOM(result, 'data', ret='data-title')
				years = client.parseDOM(result, 'span', attrs={'class': 'filmPreview__year'})

				for item in zip(titles, ids, years, orgtitles):
					f_title = str(item[0])
					f_id = str(item[1])
					f_year = str(item[2])
					f_orgtitle = str(item[3])
					teststring = cleantitle.normalize(cleantitle.getsearch(searchtitle))
					words = cleantitle.normalize(cleantitle.getsearch(f_title)).split(" ")
					if self.contains_all_wors(teststring, words) and year == f_year:
						return (f_title, f_id, f_year, f_orgtitle, "FILM")
		except:
			return 
Example #20
Source File: kinonet.py    From script.module.openscrapers with GNU General Public License v3.0
def search_ep(self, titles, season, episode, year):
		try:
			query = 'S{:02d}E{:02d}'.format(int(season), int(episode))
			for title in titles:
				url = self.search_link + str(title)
				result = self.session.get(url).content
				result = result.decode('utf-8')
				h = HTMLParser()
				result = h.unescape(result)
				result = client.parseDOM(result, 'div', attrs={'class': 'card-body p-2'})

				for item in result:
					nazwa = re.findall("""Film online: (.*?)\"""", item)[0]
					name = cleantitle.normalize(cleantitle.getsearch(nazwa))
					rok = re.findall("""Rok wydania filmu online\".*>(.*?)<""", item)[0]
					item = str(item).replace("<span style='color:red'>", "").replace("</span>", "")
					link = re.findall("""href=\"(.*?)\"""", item)[0]
					if link.startswith('//'):
						link = "https:" + link
					name = name.replace("  ", " ")
					title = title.replace("  ", " ")
					words = title.split(" ")
					if self.contains_all_words(name, words) and str(year) in rok:
						content = requests.get(link.replace('filmy', 'seriale')).content
						content = client.parseDOM(content, 'div', attrs={'class': 'tabela_wiersz mb-1'})
						for odcinek in content:
							if query.lower() in odcinek.lower():
								link = str(client.parseDOM(odcinek, 'a', ret='href')[0])
								return self.base_link + link

		except:
			return 
Example #21
Source File: _htmlparser.py    From python-for-android with Apache License 2.0
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        else:
            real_name = int(name)

        try:
            data = chr(real_name)
        except (ValueError, OverflowError) as e:
            data = "\N{REPLACEMENT CHARACTER}"

        self.handle_data(data) 
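
For context on this workaround: handle_charref() (and handle_entityref()) only fire when the parser runs with convert_charrefs=False. Since Python 3.5 the default is True, in which case character references are decoded automatically and delivered through handle_data(). A small sketch of the difference:

from html.parser import HTMLParser

class Demo(HTMLParser):
    def handle_data(self, data):
        print("DATA   ", repr(data))

    def handle_charref(self, name):
        print("CHARREF", name)

p = Demo()                           # convert_charrefs=True, the 3.5+ default
p.feed("A&#x26;B")                   # prints: DATA 'A&B'
p.close()

p = Demo(convert_charrefs=False)     # the only mode in which handle_charref() is reachable
p.feed("A&#x26;B")                   # prints: DATA 'A', then CHARREF x26, then DATA 'B'
p.close()
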
Example #22
Source File: client.py    From script.module.openscrapers with GNU General Public License v3.0
def _replaceHTMLCodes(txt):
	txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
	txt = HTMLParser().unescape(txt)
	txt = txt.replace("&quot;", "\"")
	txt = txt.replace("&amp;", "&")
	txt = txt.strip()
	return txt 
Example #23
Source File: getSum.py    From script.module.openscrapers with GNU General Public License v3.0
def replaceHTMLCodes(text):
	text = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", text)
	text = HTMLParser().unescape(text)
	text = text.replace("&quot;", "\"")
	text = text.replace("&amp;", "&")
	text = text.replace("%2B", "+")
	text = text.replace("\/", "/")
	text = text.replace("\\", "")
	text = text.strip()
	return text 
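
replaceHTMLCodes() above and _replaceHTMLCodes() in Example #22 both first repair numeric character references that are missing their trailing semicolon, then unescape the result. A compact sketch of the same idea on current Python, using html.unescape() since HTMLParser().unescape() is gone in 3.9+:

import re
from html import unescape

def replace_html_codes(txt):
    # Re-attach the semicolon that some sites drop from numeric references,
    # then decode all entities in one pass.
    txt = re.sub(r"(&#[0-9]+)([^;^0-9]+)", r"\1;\2", txt)
    return unescape(txt).strip()

print(replace_html_codes("Fish &#38 Chips &amp; more"))  # Fish & Chips & more
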
Example #24
Source File: __init__.py    From script.module.openscrapers with GNU General Public License v3.0
def unescape(html_text):
        if sys.version_info >= (3, 0):
            if sys.version_info >= (3, 4):
                return html.unescape(html_text)

            return HTMLParser().unescape(html_text)

        return HTMLParser().unescape(html_text)

    # ------------------------------------------------------------------------------- #
    # Decode Brotli on older versions of urllib3 manually
    # ------------------------------------------------------------------------------- # 
Example #25
Source File: diagnose.py    From plugin.git.browser with GNU General Public License v3.0
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a)) 
Example #26
Source File: diagnose.py    From plugin.git.browser with GNU General Public License v3.0
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data) 
Example #27
Source File: test_client.py    From pyvas with MIT License
def test_download_report_with_html_format(self, client, report):
        r_format = client.list_report_formats(name="HTML").data[0]
        response = client.download_report(uuid=report["@id"],
                                          format_uuid=r_format["@id"])
        assert isinstance(response, six.string_types)
        parser = HTMLParser()
        parser.feed(response)
        parser.close()
        assert parser 
Example #28
Source File: test_jcds.py    From python-jss with GNU General Public License v3.0
def test_jcds_scrape_token(self, cloud_j):  # type: (JSS) -> None
        """Assert that we can scrape the jcds upload token from the ``legacy/packages.html`` page,
        from <div class='chunked-uploader' data-upload-token>"""
        response = cloud_j.scrape('legacy/packages.html?id=-1&o=c')
        re_data_base_url = re.compile('data-upload-token="([^"]*)"')
        # print(response.content)
        matches = re_data_base_url.search(response.content)
        print(matches.group(1))
        h = HTMLParser()

        print(h.unescape(matches.group(1))) 
Example #29
Source File: test_jcds.py    From python-jss with GNU General Public License v3.0
def test_jcds_scrape_baseurl(self, cloud_j):  # type: (JSS) -> None
        """Assert that we can scrape the jcds upload base url from the ``legacy/packages.html`` page,
        from <div class='chunked-uploader' data-base-url>"""
        response = cloud_j.scrape('legacy/packages.html?id=-1&o=c')
        re_data_base_url = re.compile('data-base-url="([^"]*)"')
        # print(response.content)
        matches = re_data_base_url.search(response.content)
        print(matches.group(1))
        h = HTMLParser()

        print(h.unescape(matches.group(1))) 
Example #30
Source File: html2text.py    From PyDataset with MIT License
def close(self):
        HTMLParser.HTMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')

        self.outtext = self.outtext.join(self.outtextlist)
        if self.unicode_snob:
            nbsp = unichr(name2cp('nbsp'))
        else:
            nbsp = u' '
        self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)

        return self.outtext