Python html.parser.unescape() Examples
The following are 9 code examples of html.parser.unescape(), collected from open-source projects; each example's header names its source file, project, and license. You may also want to check out the other available functions and classes of the html.parser module.
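Note: HTMLParser.unescape() converts HTML character references such as &amp;, &gt;, and &quot; back into the characters they stand for. The method was deprecated in Python 3.4 and removed in Python 3.9; the module-level html.unescape() is the documented replacement. Below is a minimal sketch of a version-tolerant helper (the name unescape_compat is my own, not from any of the projects that follow):

import html

def unescape_compat(text):
    # html.unescape() has existed since Python 3.4 and is the
    # supported API on current interpreters.
    if hasattr(html, "unescape"):
        return html.unescape(text)
    # Fallback for Python < 3.4, where only the (later removed)
    # HTMLParser.unescape() method existed.
    from html.parser import HTMLParser
    return HTMLParser().unescape(text)

print(unescape_compat("&quot;caf&eacute;&quot; &gt; &amp; &lt;"))  # -> "café" > & <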
Example #1
Source File: interSubs.py From interSubs with MIT License | 5 votes |
def mtranslate_google(word):
    # `re`, `lang_to`, and `lang_from` are module-level names in interSubs.py;
    # `import re` is needed at module scope for re.findall() below.
    import html.parser
    import urllib.request
    import urllib.parse

    agent = {'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36"}

    def unescape(text):
        # HTMLParser.unescape() was removed in Python 3.9; see the
        # compatibility note in the introduction above.
        parser = html.parser.HTMLParser()
        return parser.unescape(text)

    def translate(to_translate, to_language="auto", from_language="auto"):
        base_link = "http://translate.google.com/m?hl=%s&sl=%s&q=%s"
        to_translate = urllib.parse.quote(to_translate)
        link = base_link % (to_language, from_language, to_translate)
        request = urllib.request.Request(link, headers=agent)
        raw_data = urllib.request.urlopen(request).read()
        data = raw_data.decode("utf-8")
        expr = r'class="t0">(.*?)<'
        re_result = re.findall(expr, data)
        if len(re_result) == 0:
            result = ""
        else:
            result = unescape(re_result[0])
        return result

    return [[word, translate(word, lang_to, lang_from)]], ['', '']

# reverso.net
Example #2
Source File: gfc.py From KStock with GNU General Public License v3.0 | 5 votes |
def getNews(symbol):
    # `buildNewsUrl`, `urlopen`, `demjson`, and `unescape` are imported or
    # defined at module scope in gfc.py.
    url = buildNewsUrl(symbol)
    content = urlopen(url).read().decode('utf-8')
    content_json = demjson.decode(content)

    article_json = []
    news_json = content_json['clusters']
    for cluster in news_json:
        for article in cluster:
            if article == 'a':
                article_json.extend(cluster[article])

    return [[unescape(art['t']).strip(), art['u']] for art in article_json]
Example #3
Source File: helpers.py From twitter_bot_utils with GNU General Public License v3.0 | 5 votes |
def format_text(text):
    # `parser` is presumably an HTMLParser instance created at module
    # scope in helpers.py.
    return parser.unescape(text).strip()
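A hypothetical call, under the assumption above that parser is an html.parser.HTMLParser instance:

print(format_text("  Hello &amp; goodbye  "))  # -> "Hello & goodbye"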
Example #4
Source File: useful_twitter.py From chirps with MIT License | 5 votes |
def find_news():  # I'm adventuring with regular expressions for parsing!
    """Finds news for tweeting, along with their links."""
    nyTech = requests.get('https://nytimes.com/section/technology')
    latest = latest_expr.search(nyTech.text)
    news_blocks = news_block_expr.findall(latest.group(1))
    news = []
    for i in range(len(news_blocks)):
        item = (
            news_blocks[i][1].strip() + ' ' + shorten_url(news_blocks[i][0]),
            news_blocks[i][2].strip())  # This is img src.
        if item[1].startswith('Daily Report: '):
            # NOTE: this slices the tuple, not a headline string; it is
            # almost certainly a bug in the original (see the sketch below).
            item = item[14:]
        news.append(item)
    '''tv = requests.get('https://theverge.com', headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Cafari/537.36'})
    feed_patt = r'(?s)<div class="c-compact-river">(.*?)<div class="l-col__sidebar"'
    bunches = re.findall(feed_patt, tv.text)
    verge_news = []
    for cluster in bunches:
        snippets = re.findall(r'<h2.*?><a.*>(.*?)</a></h2>', cluster)
        verge_news.extend(snippets)
    for item in verge_news:
        news.append(parser.unescape(item))
    random.shuffle(news)  # to bring a feel of randomness'''
    return news
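The item = item[14:] line above evaluates to an empty tuple rather than stripping the 'Daily Report: ' prefix (which is 14 characters long), and the check probably belongs on the headline in item[0] rather than the image URL in item[1]. If string-stripping was the intent, a hypothetical fix might look like:

if item[0].startswith('Daily Report: '):
    item = (item[0][len('Daily Report: '):], item[1])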
Example #5
Source File: create_zoom_meeting.py From resilient-community-apps with MIT License | 5 votes |
def _unescape(data):
    """ Return unescaped data such as &gt; -> >, &quot; -> ", etc. """
    # `htmlparser` is presumably an HTMLParser instance imported at
    # module scope.
    try:
        return htmlparser.unescape(data)
    except:
        return data
Example #6
Source File: resilient_common.py From resilient-community-apps with MIT License | 5 votes |
def clean_html(htmlFragment):
    """
    Resilient textarea fields return html fragments. This routine will remove the
    html and insert any code within <div></div> with a linefeed
    :param htmlFragment:
    :return: cleaned up code
    """
    if not htmlFragment or not isinstance(htmlFragment, string_types):
        return htmlFragment

    s = BeautifulSoup(unescape(htmlFragment), "html.parser")
    return ' '.join(s.strings)
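A hypothetical invocation (assuming clean_html and its unescape helper are imported from this module) showing the flattening; note the strings are joined with spaces, not the linefeed the docstring mentions:

fragment = "<div>alpha</div><div>beta &amp; gamma</div>"
print(clean_html(fragment))  # -> "alpha beta & gamma"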
Example #7
Source File: resilient_common.py From resilient-community-apps with MIT License | 5 votes |
def unescape(data):
    """ Return unescaped data such as &gt; -> >, &quot; -> ", etc. """
    try:
        return htmlparser.unescape(data)
    except:
        return data
Example #8
Source File: resilient_common.py From resilient-community-apps with MIT License | 5 votes |
def clean_html(htmlFragment):
    """
    Resilient textarea fields return html fragments. This routine will remove the
    html and insert any code within <div></div> with a linefeed
    :param htmlFragment:
    :return: cleaned up code
    """
    if not htmlFragment or not isinstance(htmlFragment, string_types):
        return htmlFragment

    return BeautifulSoup(unescape(htmlFragment), "html.parser").text
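The only difference from Example #6 is the return expression: soup.text concatenates a fragment's strings with no separator, while ' '.join(s.strings) inserts spaces between them. A small illustration (assumes bs4 is installed):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div>alpha</div><div>beta</div>", "html.parser")
print(soup.text)               # -> "alphabeta"
print(' '.join(soup.strings))  # -> "alpha beta"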
Example #9
Source File: snapshill.py From SnapshillBot with Apache License 2.0 | 4 votes |
def run(self):
    """
    Checks through the submissions and archives and posts comments.
    """
    if not self._setup:
        raise Exception("Snapshill not ready yet!")

    submissions = self.reddit.front.new(limit=self.limit)

    for submission in submissions:
        debugTime = time.time()
        warned = False

        log.debug("Found submission.\n" + submission.permalink)

        if not should_notify(submission):
            log.debug("Skipping.")
            continue

        archives = [ArchiveContainer(fix_url(submission.url), submission.title)]

        if submission.is_self and submission.selftext_html is not None:
            log.debug("Found text post...")

            links = BeautifulSoup(unescape(submission.selftext_html)).find_all("a")

            finishedURLs = []

            for anchor in links:
                if time.time() > debugTime + WARN_TIME and not warned:
                    log.warn(
                        "Spent over {} seconds on post (ID: {})".format(
                            WARN_TIME, submission.name
                        )
                    )
                    warned = True

                log.debug("Found link in text post...")

                url = fix_url(anchor["href"])

                if skip_url(url):
                    continue

                if url in finishedURLs:
                    continue  # skip for sanity

                archives.append(ArchiveContainer(url, anchor.contents[0]))
                finishedURLs.append(url)
                ratelimit(url)

        Notification(
            self.reddit,
            submission,
            self._get_header(submission.subreddit),
            archives,
        ).notify()
        db.commit()
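One note on the snippet above: BeautifulSoup(unescape(submission.selftext_html)) names no parser, so newer Beautiful Soup releases warn that no parser was explicitly specified and may pick different parsers on different machines. Assuming the stdlib parser is acceptable here, a one-line sketch of the fix is to pass it explicitly:

links = BeautifulSoup(unescape(submission.selftext_html), "html.parser").find_all("a")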