Python six.moves.html_parser.HTMLParser() Examples

The following are 10 code examples of six.moves.html_parser.HTMLParser(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module six.moves.html_parser, or try the search function.
Example #1
Source Project: icrawler   Author: hellock   File: bing.py    License: MIT License 5 votes vote down vote up
def parse(self, response):
        """Yield the image URLs embedded in a search-results page.

        Args:
            response: HTTP response object; its ``content`` bytes are decoded
                as UTF-8 (undecodable bytes are ignored) and parsed as HTML.

        Yields:
            dict: ``{'file_url': <url>}`` for every ``div.imgpt`` element
            whose anchor ``m`` attribute contains a ``murl`` entry ending in
            ``.jpg``.
        """
        # HTMLParser.unescape() was deprecated in Python 3.4 and removed in
        # 3.9; html.unescape() is the replacement.  Fall back to the old
        # method only where the ``html`` module has no ``unescape``.
        try:
            from html import unescape
        except ImportError:
            unescape = html_parser.HTMLParser().unescape

        soup = BeautifulSoup(
            response.content.decode('utf-8', 'ignore'), 'lxml')
        image_divs = soup.find_all('div', class_='imgpt')
        pattern = re.compile(r'murl\":\"(.*?)\.jpg')
        for div in image_divs:
            # The ``m`` attribute is HTML-escaped; decode it before searching.
            href_str = unescape(div.a['m'])
            match = pattern.search(href_str)
            if match:
                # On Python 2 the captured text is encoded to a UTF-8 byte
                # string before being formatted into the URL.
                name = (match.group(1)
                        if six.PY3 else match.group(1).encode('utf-8'))
                img_url = '{}.jpg'.format(name)
                yield dict(file_url=img_url)
Example #2
Source Project: -Odoo---   Author: ScottAI   File: helper.py    License: GNU General Public License v3.0 5 votes vote down vote up
def __init__(self):
        """Initialise the base HTML parser, then this instance's own state."""
        html_parser.HTMLParser.__init__(self)
        # ``data`` starts as an empty list and ``recording`` at zero;
        # presumably the handler methods (not visible here) update them.
        self.data = list()
        self.recording = 0
Example #3
Source Project: python-docs-samples   Author: GoogleCloudPlatform   File: wikibot.py    License: Apache License 2.0 5 votes vote down vote up
def message(self, msg):
        """Process incoming message stanzas.

        Be aware that this also includes MUC messages and error messages. It is
        usually a good idea to check the message's type before processing or
        sending replies. If the message is the appropriate type, then the bot
        checks wikipedia to see if the message string exists as a page on the
        site. If so, it sends this link back to the sender in the reply.

        Arguments:
            msg -- The received message stanza. See the SleekXMPP documentation
                for stanza objects and the Message stanza to see how it may be
                used.
        """
        # HTMLParser.unescape() was deprecated in Python 3.4 and removed in
        # 3.9; html.unescape() is the replacement.  Fall back to the old
        # method only where the ``html`` module has no ``unescape``.
        try:
            from html import unescape
        except ImportError:
            unescape = html_parser.HTMLParser().unescape

        if msg['type'] in ('chat', 'normal'):
            msg_body = msg['body']
            encoded_body = urllib.quote_plus(msg_body)
            response = requests.get(
                'https://en.wikipedia.org/w/api.php?'
                'action=query&list=search&format=json&srprop=snippet&'
                'srsearch={}'.format(encoded_body))
            doc = json.loads(response.content)

            # A missing 'query' key and an empty 'search' list are both
            # treated as "nothing found".
            results = doc.get('query', {}).get('search')
            if not results:
                msg.reply('I wasn\'t able to locate info on "{}" Sorry'.format(
                    msg_body)).send()
                return

            # Only the first search hit is reported back.
            snippet = results[0]['snippet']
            title = urllib.quote_plus(results[0]['title'])

            # Strip out html: drop the tags first, then decode the entities
            # left in the remaining text.
            snippet = unescape(re.sub(r'<[^>]*>', '', snippet))
            msg.reply(u'{}...\n(http://en.wikipedia.org/w/?title={})'.format(
                snippet, title)).send()
Example #4
Source Project: syntribos   Author: openstack-archive   File: parser.py    License: Apache License 2.0 5 votes vote down vote up
def _string_data(data, data_type):
        """Replace various objects types with string representations."""
        if data_type == 'json':
            return json.dumps(data)
        elif data_type == 'xml':
            if isinstance(data, str):
                return data
            str_data = ElementTree.tostring(data)
            # No way to stop tostring from HTML escaping even if we wanted
            h = html_parser.HTMLParser()
            return h.unescape(str_data.decode())
        elif data_type == 'yaml':
            return yaml.dump(data)
        else:
            return data 
Example #5
Source Project: yagocd   Author: grundic   File: info.py    License: ISC License 5 votes vote down vote up
def __init__(self):
        """Initialise the base HTML parser and reset the collected state."""
        html_parser.HTMLParser.__init__(self)
        # Start with no collected data and the ``_in_td`` flag cleared;
        # presumably the tag handlers (not visible here) toggle it.
        self.data = []
        self._in_td = False
Example #6
Source Project: lp-aws-saml   Author: lastpass   File: lp-aws-saml.py    License: GNU General Public License v2.0 5 votes vote down vote up
def get_saml_token(session, username, password, saml_cfg_id):
    """
    Log into LastPass and retrieve a SAML token for a given
    SAML configuration.

    ``session`` is used to GET the IdP-initiated launch URL built from
    ``saml_cfg_id``; ``username`` and ``password`` are not used by this
    function.

    Returns the base64-decoded ``SAMLResponse`` field of the returned form.

    Raises ValueError when the returned form has no action; the message
    includes the page's ``<h2>`` error text when one can be scraped.
    """
    logger.debug("Getting SAML token")

    # now logged in, grab the SAML token from the IdP-initiated login
    idp_login = '%s/saml/launch/cfg/%d' % (LASTPASS_SERVER, saml_cfg_id)

    r = session.get(idp_login, verify=should_verify())

    form = extract_form(r.text)
    if not form['action']:
        # HTMLParser.unescape() was deprecated in Python 3.4 and removed in
        # 3.9; html.unescape() is the replacement.  Fall back to the old
        # method only where the ``html`` module has no ``unescape``.
        try:
            from html import unescape
        except ImportError:
            unescape = html_parser.HTMLParser().unescape

        # try to scrape the error message just to make it more user friendly
        error = ""
        for line in r.text.splitlines():
            match = re.search(r'<h2>(.*)</h2>', line)
            if match:
                # Decode entities, then turn the few inline tags the page
                # uses into plain text.  If several lines match, the last
                # one wins.
                msg = unescape(match.group(1))
                msg = msg.replace("<br/>", "\n")
                msg = msg.replace("<b>", "")
                msg = msg.replace("</b>", "")
                error = "\n" + msg

        raise ValueError("Unable to find SAML ACS" + error)

    return b64decode(form['fields']['SAMLResponse'])
Example #7
Source Project: figshare   Author: rmcgibbo   File: utils.py    License: MIT License 5 votes vote down vote up
def strip_html(html):
    """Return the text content of *html* with all markup removed.

    Args:
        html: The HTML string to strip.

    Returns:
        The concatenation of every text node the parser reports, in
        document order.
    """
    class MLStripper(HTMLParser):
        """Parser that records only the text data it is fed."""

        def __init__(self):
            # Run the base initialiser instead of calling reset() alone: on
            # Python 3.4+ it sets ``convert_charrefs``, and feed() raises
            # AttributeError without it.  The explicit base-class call (not
            # super()) avoids relying on HTMLParser being a new-style class.
            HTMLParser.__init__(self)
            self.strict = False
            self.fed = []

        def handle_data(self, d):
            self.fed.append(d)

        def get_data(self):
            return ''.join(self.fed)

    p = MLStripper()
    p.feed(html)
    # close() flushes text the parser may still be buffering, e.g. a
    # trailing "AT&T" that could be the start of a character reference.
    p.close()
    return p.get_data()
Example #8
Source Project: wechat_mall   Author: elfgzp   File: helper.py    License: MIT License 5 votes vote down vote up
def __init__(self):
        """Set up the underlying HTML parser and the per-instance state."""
        html_parser.HTMLParser.__init__(self)
        # Begin with ``recording`` at zero and no captured data; presumably
        # the tag/data handlers (not visible here) change both.
        self.recording, self.data = 0, []
Example #9
Source Project: git-stacktrace   Author: pinterest   File: server.py    License: Apache License 2.0 5 votes vote down vote up
def _get_field(self, field, default=''):
        val = self.params.get(field, [default])
        val = val[0] if isinstance(val, list) else val
        return HTMLParser().unescape(val) 
Example #10
Source Project: readme_renderer   Author: pypa   File: markdown.py    License: Apache License 2.0 4 votes vote down vote up
def _highlight(html):
    """Syntax-highlights HTML-rendered Markdown.

    Plucks sections to highlight that conform to the GitHub fenced code info
    string as defined at https://github.github.com/gfm/#info-string.

    Args:
        html (str): The rendered HTML.

    Returns:
        str: The HTML with Pygments syntax highlighting applied to all code
            blocks.
    """
    # HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9;
    # html.unescape() is the replacement.  Fall back to the old method only
    # where the ``html`` module has no ``unescape``.
    try:
        from html import unescape
    except ImportError:
        unescape = html_parser.HTMLParser().unescape

    formatter = pygments.formatters.HtmlFormatter(nowrap=True)

    code_expr = re.compile(
        r'<pre><code class="language-(?P<lang>.+?)">(?P<code>.+?)'
        r'</code></pre>', re.DOTALL)

    def replacer(match):
        """Return the highlighted ``<pre>`` replacement for one code block."""
        try:
            lang = match.group('lang')
            lang = _LANG_ALIASES.get(lang, lang)
            lexer = pygments.lexers.get_lexer_by_name(lang)
        except ValueError:
            # Unknown language name: fall back to plain-text lexing.
            lexer = pygments.lexers.TextLexer()

        code = match.group('code')

        # Decode html entities in the code. cmark tries to be helpful and
        # translate '"' to '&quot;', but it confuses pygments. Pygments will
        # escape any html entities when re-writing the code, and we run
        # everything through bleach after.
        code = unescape(code)

        highlighted = pygments.highlight(code, lexer, formatter)

        return '<pre>{}</pre>'.format(highlighted)

    result = code_expr.sub(replacer, html)

    return result