Python html.parser.HTMLParser() Examples
The following are 30 code examples of html.parser.HTMLParser().
You can go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the html.parser module.
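
For orientation, here is a minimal sketch of the stdlib API these examples build on: you subclass html.parser.HTMLParser, feed it markup, and override the handle_* callbacks that fire as tags and text are encountered. The class and markup below are illustrative, not taken from any example on this page.

from html.parser import HTMLParser

class LinkCollector(HTMLParser):
    """Collect the href of every <a> start tag fed to the parser."""

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        # attrs arrives as a list of (name, value) tuples.
        if tag == "a":
            href = dict(attrs).get("href")
            if href is not None:
                self.links.append(href)

parser = LinkCollector()
parser.feed('<p><a href="https://example.com">example</a></p>')
print(parser.links)  # ['https://example.com']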
Example #1
Source File: matcher.py From romcollectionbrowser with GNU General Public License v2.0
def resolveParseResult(self, result, itemName):
    """ This method is due to the fact that our result set is a list of dicts """
    resultValue = ""
    try:
        resultValue = result[itemName][0]
        resultValue = util.html_unescape(resultValue)
        resultValue = resultValue.strip()
        # unescape ugly html encoding from websites
        resultValue = HTMLParser().unescape(resultValue)
    except Exception as e:
        # log.warn("Error while resolving item: " + itemName + " : " + str(exc))
        log.warn("Error while resolving item: {0} : {1} {2}".format(itemName, type(e), str(e)))
    try:
        log.debug("Result " + itemName + " = " + resultValue)
    except:
        pass
    return resultValue
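
A caveat that applies to this and many of the examples below: HTMLParser.unescape() was deprecated in Python 3.4 and removed in Python 3.9. On current interpreters the equivalent call (a sketch, not part of the example above) is the module-level html.unescape():

import html

# Replacement for HTMLParser().unescape(...) on Python 3.4+
print(html.unescape("Tom &amp; Jerry &lt;3"))  # Tom & Jerry <3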
Example #2
Source File: distribution_point.py From python-jss with GNU General Public License v3.0
def _scrape_tokens(self):
    """Scrape JCDS upload URL and upload access token from the jamfcloud instance."""
    jss = self.connection['jss']
    response = jss.scrape('legacy/packages.html?id=-1&o=c')

    matches = re.search(r'data-base-url="([^"]*)"', response.content.decode("utf-8"))
    if matches is None:
        raise JSSError('Did not find the JCDS base URL on the packages page. Is this actually Jamfcloud?')
    jcds_base_url = matches.group(1)

    matches = re.search(r'data-upload-token="([^"]*)"', response.content.decode("utf-8"))
    if matches is None:
        raise JSSError('Did not find the JCDS upload token on the packages page. Is this actually Jamfcloud?')
    jcds_upload_token = matches.group(1)

    h = HTMLParser()
    jcds_base_url = h.unescape(jcds_base_url)

    self.connection['jcds_base_url'] = jcds_base_url
    self.connection['jcds_upload_token'] = jcds_upload_token
    self.connection["url"] = jcds_base_url  # This is to make JSSImporter happy because it accesses .connection
Example #3
Source File: serialeco.py From script.module.openscrapers with GNU General Public License v3.0
def search_ep(self, titles, season, episode, year):
    try:
        for title in titles:
            data = {
                'fid_name': title,
                'sezon': season,
                'odcinek': episode,
                'title': title
            }
            result = requests.post('http://178.19.110.218/forumserialeco/skrypt/szukaj3.php', data=data).content
            result = result.decode('utf-8')
            h = HTMLParser()
            result = h.unescape(result)
            if result:
                return title, season, episode
    except:
        return
Example #4
Source File: 29 PythonCeHui.py From Python-Spider with Apache License 2.0
def note_msg(msg):
    print_msg(get_whole_msg(msg))
    content = HTMLParser().unescape(msg['Content'])
    try:
        content_tree = ETree.fromstring(content)
    except Exception:
        # invite/remove to chatroom
        return
    if content_tree is None:
        return
    revoked = content_tree.find('revokemsg')
    if revoked is None:
        return
    old_msg_id = revoked.find('msgid').text
    old_msg = msg_store.get(old_msg_id)
    if old_msg is None:
        return
    msg_send = get_whole_msg(old_msg, download=True)
    for m in msg_send:
        bot.send(m, toUserName='filehelper')
    clear_timeouted_message()
Example #5
Source File: wechat-anti-revoke-py3.py From wechat-anti-revoke with Apache License 2.0
def note_msg(msg):
    print_msg(get_whole_msg(msg))
    content = HTMLParser().unescape(msg['Content'])
    try:
        content_tree = ETree.fromstring(content)
    except Exception:
        # invite/remove to chatroom
        return
    if content_tree is None:
        return
    revoked = content_tree.find('revokemsg')
    if revoked is None:
        return
    old_msg_id = revoked.find('msgid').text
    old_msg = msg_store.get(old_msg_id)
    if old_msg is None:
        return
    msg_send = get_whole_msg(old_msg, download=True)
    for m in msg_send:
        bot.send(m, toUserName='filehelper')
    clear_timeouted_message()
Example #6
Source File: html_linter.py From html-linter with Apache License 2.0
def get_attribute_line_column(tag_definition, line, column, attribute):
    """Returns the line and column of the provided attribute.

    Args:
        tag_definition: str with the definition of the tag.
        line: line where the tag starts.
        column: column where the tag starts (1-based).
        attribute: str representing the attribute to find.

    Return:
        A (line, column) tuple representing the position of the attribute.
    """
    for match in HTMLParser.attrfind.finditer(tag_definition):
        if match.group(1).lower() == attribute:
            return get_line_column(tag_definition, line, column, match.start(1))

    assert False, 'Could not find the requested attribute %s' % attribute
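
Note that attrfind here is a private, undocumented regex (this module imports Python 2's HTMLParser module, not the class). In Python 3's html.parser the closest counterpart is the equally private attrfind_tolerant pattern; a hedged sketch, assuming that internal name is still present in your interpreter:

import html.parser

tag_definition = '<meta charset="utf-8" content="text/html">'
# attrfind_tolerant is internal to CPython and may change between releases.
for match in html.parser.attrfind_tolerant.finditer(tag_definition):
    print(match.group(1), match.group(3))
# charset "utf-8"
# content "text/html"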
Example #7
Source File: html_linter.py From html-linter with Apache License 2.0
def __init__(self, html):
    self._messages = []

    # Variables used to get the indentation
    self._last_data = ''
    self._last_data_position = (0, 1)
    self._last_indent = 0

    # Variables used to check if a charset tag should be required.
    self._first_meta_line_col = None
    self._after_head_line_col = None
    self._has_charset = False

    # Variables to extend the feature set of HTMLParser.
    self._endtag_text = None

    HTMLParser.HTMLParser.__init__(self)

    # In case we are dealing with Python 3, set it to non-strict mode.
    if hasattr(self, 'strict'):
        self.strict = False

    self.feed(html)
    self.close()
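
The hasattr(self, 'strict') guard works because HTMLParser's strict mode was deprecated in Python 3.3 and removed in 3.5; on newer interpreters the attribute simply does not exist, so the check degrades gracefully. A quick confirmation:

from html.parser import HTMLParser

parser = HTMLParser()
print(hasattr(parser, "strict"))  # False on Python 3.5 and later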
Example #8
Source File: filmwebbooster.py From script.module.openscrapers with GNU General Public License v3.0
def search_ep(self, titles, season, episode, year):
    try:
        searchtitles = titles
        for searchtitle in searchtitles:
            response = requests.get(self.base_link + self.search_serial % searchtitle)
            result = response.content
            h = HTMLParser()
            result = h.unescape(result)
            result = client.parseDOM(result, 'ul', attrs={'class': 'resultsList hits'})
            items = client.parseDOM(result, 'li')
            items = [x for x in items if not str(x).startswith("<a href")]
            orgtitles = []
            for content in items:
                try:
                    orgtitle = str(client.parseDOM(content, 'div', attrs={'class': 'filmPreview__originalTitle'})[0])
                except:
                    orgtitle = "0"
                    pass
                orgtitles.append(orgtitle)
            ids = client.parseDOM(items, 'data', ret='data-id')
            titles = client.parseDOM(result, 'data', ret='data-title')
            years = client.parseDOM(result, 'span', attrs={'class': 'filmPreview__year'})
            for item in zip(titles, ids, years, orgtitles):
                f_title = str(item[0])
                f_id = str(item[1])
                f_year = str(item[2])
                f_orgtitle = str(item[3])
                teststring = cleantitle.normalize(cleantitle.getsearch(searchtitle))
                words = cleantitle.normalize(cleantitle.getsearch(f_title)).split(" ")
                if self.contains_all_wors(teststring, words) and year == f_year:
                    return (f_title, f_id, f_year, f_orgtitle, "SERIAL", season, episode)
    except:
        return
Example #9
Source File: kinonet.py From script.module.openscrapers with GNU General Public License v3.0
def search(self, title, localtitle, year, is_movie_search):
    try:
        titles = []
        titles.append(cleantitle.normalize(cleantitle.getsearch(title)))
        titles.append(cleantitle.normalize(cleantitle.getsearch(localtitle)))
        titles.append(title)
        titles.append(localtitle)
        for title in titles:
            try:
                url = self.search_link + str(title)
                result = self.session.get(url).content
                result = result.decode('utf-8')
                h = HTMLParser()
                result = h.unescape(result)
                result = client.parseDOM(result, 'div', attrs={'class': 'card-body p-2'})
                for item in result:
                    try:
                        nazwa = re.findall("""Film online: (.*?)\"""", item)[0]
                        try:
                            nazwa = re.findall(""">(.*?)<""", nazwa)[0]
                        except:
                            pass
                        name = cleantitle.normalize(cleantitle.getsearch(nazwa))
                        rok = re.findall("""Rok wydania filmu online\".*>(.*?)<""", item)[0]
                        item = str(item).replace("<span style='color:red'>", "").replace("</span>", "")
                        link = re.findall("""href=\"(.*?)\"""", item)[0]
                        if link.startswith('//'):
                            link = "https:" + link
                        name = name.replace("\xa0", " ")  # collapse non-breaking spaces
                        title = title.replace("\xa0", " ")
                        words = name.split(" ")
                        if self.contains_all_words(title, words) and str(year) in rok:
                            return link
                    except:
                        continue
            except:
                continue
    except:
        return
Example #10
Source File: serialeco.py From script.module.openscrapers with GNU General Public License v3.0
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []
        data = {
            'fid_name': url[0],
            'sezon': url[1],
            'odcinek': url[2],
            'title': url[0]
        }
        result = requests.post('http://178.19.110.218/forumserialeco/skrypt/szukaj3.php', data=data).content
        result = result.decode('utf-8')
        h = HTMLParser()
        result = h.unescape(result)
        if result:
            wersja = re.findall("""wersja: <b>(.*?)<\/b>""", result)
            id = re.findall("""url='(.*?)'""", result)
            for item in zip(wersja, id):
                try:
                    if item[1]:
                        info = self.get_lang_by_type(item[0])
                        content = client.request("http://seriale.co/frame.php?src=" + item[1])
                        video_link = str(client.parseDOM(content, 'iframe', ret='src')[0])
                        valid, host = source_utils.is_host_valid(video_link, hostDict)
                        if valid:
                            sources.append(
                                {'source': host, 'quality': 'SD', 'language': info[0],
                                 'url': video_link, 'info': info[1], 'direct': False,
                                 'debridonly': False})
                        else:
                            continue
                except:
                    continue
        return sources
    except:
        return sources
Example #11
Source File: diagnose.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: Some markup.
    """
    parser = AnnouncingParser()
    parser.feed(data)
Example #12
Source File: html2text.py From RedditBots with MIT License
def close(self):
    HTMLParser.HTMLParser.close(self)

    self.pbr()
    self.o('', 0, 'end')

    self.outtext = self.outtext.join(self.outtextlist)

    if self.unicode_snob:
        nbsp = unichr(name2cp('nbsp'))
    else:
        nbsp = u' '
    self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)

    return self.outtext
Example #13
Source File: diagnose.py From Tautulli with GNU General Public License v3.0
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data)
Example #14
Source File: diagnose.py From Tautulli with GNU General Public License v3.0
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b - a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b - a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b - a))
Example #15
Source File: Erome.py From bulk-downloader-for-reddit with GNU General Public License v3.0
def getLinks(self, url, lineNumber=129):
    content = []
    lineNumber = None

    class EromeParser(HTMLParser):
        tag = None

        def handle_starttag(self, tag, attrs):
            self.tag = {tag: {attr[0]: attr[1] for attr in attrs}}

    pageSource = (urllib.request.urlopen(url).read().decode().split('\n'))

    """ FIND WHERE ALBUM STARTS IN ORDER NOT TO GET WRONG LINKS"""
    for i in range(len(pageSource)):
        obj = EromeParser()
        obj.feed(pageSource[i])
        tag = obj.tag
        if tag is not None:
            if "div" in tag:
                if "id" in tag["div"]:
                    if tag["div"]["id"] == "album":
                        lineNumber = i
                        break

    for line in pageSource[lineNumber:]:
        obj = EromeParser()
        obj.feed(line)
        tag = obj.tag
        if tag is not None:
            if "img" in tag:
                if "class" in tag["img"]:
                    if tag["img"]["class"] == "img-front":
                        content.append(tag["img"]["src"])
            elif "source" in tag:
                content.append(tag["source"]["src"])

    return [
        link for link in content
        if link.endswith("_480p.mp4") or not link.endswith(".mp4")
    ]
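
Because each EromeParser instance only remembers the most recent start tag it saw, the code above creates a fresh parser for every line of page source. A small demonstration of that behavior, using hypothetical markup:

from html.parser import HTMLParser

class LastTag(HTMLParser):
    tag = None

    def handle_starttag(self, tag, attrs):
        self.tag = {tag: dict(attrs)}

p = LastTag()
p.feed('<div id="album"><img class="img-front" src="/a.jpg">')
print(p.tag)  # {'img': {'class': 'img-front', 'src': '/a.jpg'}} -- only the last tag survives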
Example #16
Source File: utils.py From polaris with GNU Affero General Public License v3.0
def remove_html(text):
    text = re.sub('<[^<]+?>', '', text)
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    return text

    # Unreachable: leftover HTMLParser-based stripper that never executes.
    s = HTMLParser()
    s.reset()
    s.reset()
    s.strict = False
    s.convert_charrefs = True
    s.fed = []
    s.feed(text)
    return ''.join(s.fed)
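
The block after the first return above is unreachable, and even if it ran, a bare HTMLParser never appends to the fed list it was given. A working version of that tag-stripping pattern, as a minimal sketch, overrides handle_data:

from html.parser import HTMLParser
from io import StringIO

class MLStripper(HTMLParser):
    """Keep only the text nodes of the markup fed in."""

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def handle_data(self, data):
        self.text.write(data)

def strip_tags(markup):
    stripper = MLStripper()
    stripper.feed(markup)
    return stripper.text.getvalue()

print(strip_tags("<b>bold &amp; plain</b>"))  # bold & plain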
Example #17
Source File: headlines.py From mlb-led-scoreboard with GNU General Public License v3.0
def __strings_for_feed(self, feed, max_entries):
    spaces = " " * HEADLINE_SPACER_SIZE
    title = feed.feed.title.encode("ascii", "ignore")
    headlines = ""

    for idx, entry in enumerate(feed.entries):
        if idx < max_entries:
            h = HTMLParser()
            text = h.unescape(entry.title.encode("ascii", "ignore"))
            headlines += text + spaces

    return title + spaces + headlines
Example #18
Source File: html2text.py From RedditBots with MIT License
def feed(self, data):
    data = data.replace("</' + 'script>", "</ignore>")
    HTMLParser.HTMLParser.feed(self, data)
Example #19
Source File: filmwebbooster.py From script.module.openscrapers with GNU General Public License v3.0
def search(self, title, localtitle, year):
    try:
        searchtitles = (str(localtitle), str(title))
        for searchtitle in searchtitles:
            response = requests.get(self.base_link + self.search_film % searchtitle)
            result = response.content
            h = HTMLParser()
            result = h.unescape(result)
            result = client.parseDOM(result, 'ul', attrs={'class': 'resultsList hits'})
            items = client.parseDOM(result, 'li')
            items = [x for x in items if not str(x).startswith("<a href")]
            orgtitles = []
            for content in items:
                try:
                    orgtitle = str(client.parseDOM(content, 'div', attrs={'class': 'filmPreview__originalTitle'})[0])
                except:
                    orgtitle = "0"
                    pass
                orgtitles.append(orgtitle)
            ids = client.parseDOM(items, 'data', ret='data-id')
            titles = client.parseDOM(result, 'data', ret='data-title')
            years = client.parseDOM(result, 'span', attrs={'class': 'filmPreview__year'})
            for item in zip(titles, ids, years, orgtitles):
                f_title = str(item[0])
                f_id = str(item[1])
                f_year = str(item[2])
                f_orgtitle = str(item[3])
                teststring = cleantitle.normalize(cleantitle.getsearch(searchtitle))
                words = cleantitle.normalize(cleantitle.getsearch(f_title)).split(" ")
                if self.contains_all_wors(teststring, words) and year == f_year:
                    return (f_title, f_id, f_year, f_orgtitle, "FILM")
    except:
        return
Example #20
Source File: kinonet.py From script.module.openscrapers with GNU General Public License v3.0
def search_ep(self, titles, season, episode, year):
    try:
        query = 'S{:02d}E{:02d}'.format(int(season), int(episode))
        for title in titles:
            url = self.search_link + str(title)
            result = self.session.get(url).content
            result = result.decode('utf-8')
            h = HTMLParser()
            result = h.unescape(result)
            result = client.parseDOM(result, 'div', attrs={'class': 'card-body p-2'})
            for item in result:
                nazwa = re.findall("""Film online: (.*?)\"""", item)[0]
                name = cleantitle.normalize(cleantitle.getsearch(nazwa))
                rok = re.findall("""Rok wydania filmu online\".*>(.*?)<""", item)[0]
                item = str(item).replace("<span style='color:red'>", "").replace("</span>", "")
                link = re.findall("""href=\"(.*?)\"""", item)[0]
                if link.startswith('//'):
                    link = "https:" + link
                name = name.replace("\xa0", " ")  # collapse non-breaking spaces
                title = title.replace("\xa0", " ")
                words = title.split(" ")
                if self.contains_all_words(name, words) and str(year) in rok:
                    content = requests.get(link.replace('filmy', 'seriale')).content
                    content = client.parseDOM(content, 'div', attrs={'class': 'tabela_wiersz mb-1'})
                    for odcinek in content:
                        if query.lower() in odcinek.lower():
                            link = str(client.parseDOM(odcinek, 'a', ret='href')[0])
                            return self.base_link + link
    except:
        return
Example #21
Source File: _htmlparser.py From python-for-android with Apache License 2.0
def handle_charref(self, name):
    # XXX workaround for a bug in HTMLParser. Remove this once
    # it's fixed.
    if name.startswith('x'):
        real_name = int(name.lstrip('x'), 16)
    else:
        real_name = int(name)

    try:
        data = chr(real_name)
    except (ValueError, OverflowError) as e:
        data = "\N{REPLACEMENT CHARACTER}"

    self.handle_data(data)
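
handle_charref only fires when the parser is constructed with convert_charrefs=False; since Python 3.5 the default is True, in which case references are decoded and delivered through handle_data instead. A small demonstration:

from html.parser import HTMLParser

class CharrefEcho(HTMLParser):
    def handle_charref(self, name):
        print("charref:", name)

CharrefEcho(convert_charrefs=False).feed("&#65;&#x41;")
# charref: 65
# charref: x41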
Example #22
Source File: client.py From script.module.openscrapers with GNU General Public License v3.0
def _replaceHTMLCodes(txt):
    txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    txt = HTMLParser().unescape(txt)
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    txt = txt.strip()
    return txt
Example #23
Source File: getSum.py From script.module.openscrapers with GNU General Public License v3.0
def replaceHTMLCodes(text):
    text = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", text)
    text = HTMLParser().unescape(text)
    text = text.replace("&quot;", "\"")
    text = text.replace("&amp;", "&")
    text = text.replace("%2B", "+")
    text = text.replace("\/", "/")
    text = text.replace("\\", "")
    text = text.strip()
    return text
Example #24
Source File: __init__.py From script.module.openscrapers with GNU General Public License v3.0
def unescape(html_text):
    if sys.version_info >= (3, 0):
        if sys.version_info >= (3, 4):
            return html.unescape(html_text)
        return HTMLParser().unescape(html_text)
    return HTMLParser().unescape(html_text)

# ------------------------------------------------------------------------------- #
# Decode Brotli on older versions of urllib3 manually
# ------------------------------------------------------------------------------- #
Example #25
Source File: diagnose.py From plugin.git.browser with GNU General Public License v3.0
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b - a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b - a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b - a))
Example #26
Source File: diagnose.py From plugin.git.browser with GNU General Public License v3.0
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data)
Example #27
Source File: test_client.py From pyvas with MIT License
def test_download_report_with_html_format(self, client, report):
    r_format = client.list_report_formats(name="HTML").data[0]
    response = client.download_report(uuid=report["@id"],
                                      format_uuid=r_format["@id"])
    assert isinstance(response, six.string_types)
    parser = HTMLParser()
    parser.feed(response)
    parser.close()
    assert parser
Example #28
Source File: test_jcds.py From python-jss with GNU General Public License v3.0
def test_jcds_scrape_token(self, cloud_j):  # type: (JSS) -> None
    """Assert that we can scrape the jcds upload token from the ``legacy/packages.html`` page,
    from <div class='chunked-uploader' data-upload-token>"""
    response = cloud_j.scrape('legacy/packages.html?id=-1&o=c')
    re_data_base_url = re.compile('data-upload-token="([^"]*)"')
    # print(response.content)
    matches = re_data_base_url.search(response.content)
    print(matches.group(1))
    h = HTMLParser()
    print(h.unescape(matches.group(1)))
Example #29
Source File: test_jcds.py From python-jss with GNU General Public License v3.0
def test_jcds_scrape_baseurl(self, cloud_j):  # type: (JSS) -> None
    """Assert that we can scrape the jcds upload base url from the ``legacy/packages.html`` page,
    from <div class='chunked-uploader' data-base-url>"""
    response = cloud_j.scrape('legacy/packages.html?id=-1&o=c')
    re_data_base_url = re.compile('data-base-url="([^"]*)"')
    # print(response.content)
    matches = re_data_base_url.search(response.content)
    print(matches.group(1))
    h = HTMLParser()
    print(h.unescape(matches.group(1)))
Example #30
Source File: html2text.py From PyDataset with MIT License
def close(self):
    HTMLParser.HTMLParser.close(self)

    self.pbr()
    self.o('', 0, 'end')

    self.outtext = self.outtext.join(self.outtextlist)

    if self.unicode_snob:
        nbsp = unichr(name2cp('nbsp'))
    else:
        nbsp = u' '
    self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)

    return self.outtext