Python html.parser.HTMLParser.feed() Examples

The following are 29 code examples of html.parser.HTMLParser.feed(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module html.parser, or try the search function.
Example #1
Source File: Parser.py    From AdvancedHTMLParser with GNU Lesser General Public License v3.0 6 votes vote down vote up
def getFormattedHTML(self, indent='  '):
        '''
            getFormattedHTML - Return this document's HTML after running it through
                AdvancedHTMLFormatter, i.e. pretty-printed instead of keeping the original whitespace

            @param indent - Handed unchanged to AdvancedHTMLFormatter: space/tab/newline used for each
                level of indent, or integer for how many spaces per level

            @return - <str> Formatted html

            @see getHTML - Get HTML with original whitespace

            @see getMiniHTML - Get HTML with only functional whitespace remaining
        '''
        from .Formatter import AdvancedHTMLFormatter

        sourceHtml = self.getHTML()
        # The second constructor argument is None so the content is not double-encoded
        prettyPrinter = AdvancedHTMLFormatter(indent, None)
        prettyPrinter.feed(sourceHtml)
        return prettyPrinter.getHTML()
Example #2
Source File: Formatter.py    From AdvancedHTMLParser with GNU Lesser General Public License v3.0 6 votes vote down vote up
def getHTML(self):
        '''
            getHTML - Get the full HTML as contained within this tree, converted to valid XHTML

                @return - <str> The doctype declaration (only when self.doctype is set) followed by the markup of the root node

                @raises ValueError - If getRoot() returns None (nothing has been loaded through feed yet)
        '''
        # Look the root up once; it is needed both for the "nothing loaded" check and for rendering.
        rootNode = self.getRoot()
        if rootNode is None:
            raise ValueError('Cannot format, use feed to load contents.')

        doctypeStr = '<!%s>\n' %(self.doctype) if self.doctype else ''

        # 6.6.0: If we have a real root tag, print the outerHTML. If we have a fake root tag (for multiple root condition),
        #   then print the innerHTML (skipping the outer root tag). Otherwise, we will miss
        #   untagged text (between the multiple root nodes).
        if rootNode.tagName == INVISIBLE_ROOT_TAG:
            return doctypeStr + rootNode.innerHTML
        return doctypeStr + rootNode.outerHTML
Example #3
Source File: ehp.py    From ehp with MIT License 6 votes vote down vote up
def __init__(self, data):
        """
        Create a node that wraps a run of characters.

        Root.__init__ is invoked with DATA, after which the characters
        themselves are kept on the instance as self.data.

        Example:

        html = Html()
        data = '<body><em>alpha</em></body>'
        dom = html.feed(data)
        x = dom.fst('em')
        x.append(Data('\\nbeta'))

        It outputs.

        <body ><em >alpha
        beta</em></body>
        """

        # Base initialisation first, then the payload.
        Root.__init__(self, DATA)
        self.data = data
Example #4
Source File: ehp.py    From ehp with MIT License 6 votes vote down vote up
def walk_with_root(self):
        """
        Like walk, but every item is paired with its root.

        For each (root, ind) pair produced by self.sail_with_root() this
        generator yields a pair of triples:
        ((root, root.name, root.attr), (ind, ind.name, ind.attr)).

        Example:

        html = Html()
        data = '<body><em>alpha</em></body>'
        dom = html.feed(data)
        
        for (root, name, attr), (ind, name, attr) in dom.walk_with_root():
            print root, name, ind, name

        Output:

        <em >alpha</em> 1 alpha 1
        <body ><em >alpha</em></body> em <em >alpha</em> em
        <body ><em >alpha</em></body> body <body ><em >alpha</em></body> body    
        """

        for parent, node in self.sail_with_root():
            parent_info = (parent, parent.name, parent.attr)
            node_info = (node, node.name, node.attr)
            yield (parent_info, node_info)
Example #5
Source File: ehp.py    From ehp with MIT License 6 votes vote down vote up
def remove(self, item):
        """
        Delete item from this container, like list.remove.

        The position to delete is whatever self.index(item) reports, so
        any error raised by self.index propagates to the caller.

        data = '<a><b></b><b></b></a>'
        html = Html()
        dom = html.feed(data)
        
        for root, ind in dom.sail_with_root():
            if ind.name == 'b':
                root.remove(ind)
        
        print dom
        
        It should print.

        <a ></a>
        """

        del self[self.index(item)]
Example #6
Source File: ehp.py    From ehp with MIT License 6 votes vote down vote up
def take(self, *args):
        """
        Return the first object produced by self.match(*args), i.e. the
        first object matching (key0, value0), (key1, value1), ... .

        None is returned when self.match yields nothing.

        Example:

        data = '<a><b id="foo" size="1"></b></a>'
        html = Html()
        dom = html.feed(data)
        
        print dom.take(('id', 'foo'))
        print dom.take(('id', 'foo'), ('size', '2'))
        """

        # next() with a default stands in for catching StopIteration by hand.
        return next(self.match(*args), None)
Example #7
Source File: __init__.py    From workload-automation with Apache License 2.0 5 votes vote down vote up
def feed(self, data):
        """
        Feed data to HTMLParser.feed, treating self.StopParsingException
        as a normal early exit.

        Any other exception raised while parsing propagates unchanged.
        """
        try:
            HTMLParser.feed(self, data)
        except self.StopParsingException:
            # Raised to cut parsing short; nothing more to do.
            pass
Example #8
Source File: __init__.py    From workload-automation with Apache License 2.0 5 votes vote down vote up
def update_output_v3(self, context):
        """
        Pull each benchmark's HTML result file from the target and report its scores.

        For every entry in self.benchmarks the result file is taken from the
        package's 'files' directory under self.target.package_data_directory,
        pulled into context.output_directory, parsed with VellamoResultParser
        and reported through context.add_metric. The file name is then
        registered with context.add_artifact.

        :param context: object supplying output_directory, add_metric and add_artifact.
        """
        for test in self.benchmarks:  # Get all scores from HTML files
            if test == "Browser":
                # The Browser result file is discovered by listing the device
                # directory; the last entry starting with "Browser" is used.
                filename = None
                files_dir = self.target.path.join(self.target.package_data_directory,
                                                  self.apk.apk_info.package, 'files')
                for entry in self.target.listdir(files_dir, as_root=True):
                    if entry.startswith("Browser"):
                        filename = entry
                # NOTE(review): if no entry starts with "Browser", filename is still
                # None here and is passed to the path joins below -- confirm intended.
            else:
                filename = '{}_results.html'.format(test)

            device_file = self.target.path.join(self.target.package_data_directory,
                                                self.apk.apk_info.package, 'files', filename)
            host_file = os.path.join(context.output_directory, filename)
            self.target.pull(device_file, host_file, as_root=True)
            with open(host_file) as fh:
                parser = VellamoResultParser()
                parser.feed(fh.read())
                for benchmark in parser.benchmarks:
                    # Spaces in names are replaced so metric labels contain none.
                    benchmark.name = benchmark.name.replace(' ', '_')
                    total_label = '{}_Total'.format(benchmark.name)
                    context.add_metric(total_label, benchmark.score)
                    for raw_name, score in list(benchmark.metrics.items()):
                        metric_label = '{}_{}'.format(benchmark.name,
                                                      raw_name.replace(' ', '_'))
                        context.add_metric(metric_label, score)
            context.add_artifact('vellamo_output', kind='raw',
                                 path=filename)
Example #9
Source File: citation.py    From Carnets with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def feed(self, data):
        """
        Remember the raw input on self.data, then parse it with HTMLParser.feed.

        self.data is assigned before parsing starts, so it is available
        while HTMLParser.feed runs.
        """
        self.data = data
        HTMLParser.feed(self, data)
Example #10
Source File: citation.py    From Carnets with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def citation2latex(s):
    """Parse citations in Markdown cells.
    
    This looks for HTML tags having a data attribute named `data-cite`
    and replaces them by a call to the LaTeX cite command. The transformation
    looks like this:
    
    `<cite data-cite="granger">(Granger, 2013)</cite>`
    
    Becomes
    
    `\\cite{granger}`
    
    Any HTML tag can be used, which allows the citations to be formatted
    in HTML in any manner.

    Each entry of the parser's `citelist` is read as (key, start[, end]):
    text before `start` is copied through, the cite command for `key` is
    emitted, and copying resumes at `end`. An entry without an `end`
    sets the resume position to -1, and no trailing text is appended
    when the last entry did so.
    """
    parser = CitationParser()
    parser.feed(s)
    parser.close()

    pieces = []
    cursor = 0
    for citation in parser.citelist:
        pieces.append(s[cursor:citation[1]])
        pieces.append('\\cite{%s}'%citation[0])
        cursor = citation[2] if len(citation)==3 else -1
    if cursor != -1:
        pieces.append(s[cursor:])
    return u''.join(pieces)

#-----------------------------------------------------------------------------
# Classes
#----------------------------------------------------------------------------- 
Example #11
Source File: freesound.py    From RenderChan with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def feed(self, data):
        """
        Parse data with HTMLParser.feed and check that the expected fields were found.

        data is converted with str() before parsing.
        NOTE(review): on Python 3, str() of a bytes object yields its repr
        rather than decoded text -- confirm callers pass text.

        Raises Exception if self.artist, self.title or self.license is
        still None once parsing has finished.
        """
        HTMLParser.feed(self, str(data))
        # Identity test: only a genuinely missing field (None) is an error.
        if self.artist is None or self.title is None or self.license is None:
            raise Exception("Error parsing data from freesound!")
Example #12
Source File: ehp.py    From ehp with MIT License 5 votes vote down vote up
def fromfile(self, filename):
        """
        It builds a structure from a file.

        The file is opened in text mode ('r'), read completely and closed
        (even when reading fails) before its contents are passed to
        self.feed, whose result is returned.
        """

        with open(filename, 'r') as fd:
            data = fd.read()
        return self.feed(data)
Example #13
Source File: ehp.py    From ehp with MIT License 5 votes vote down vote up
def walk(self):
        """
        Like sail, but each object comes with its name and attr.

        Yields (ind, ind.name, ind.attr) for every ind produced by
        self.sail().

        Example:

        html = Html()
        data = '<body> <em> This is all the text.</em></body>'
        dom = html.feed(data)
        
        for ind, name, attr in dom.walk():
            print 'TAG:', ind
            print 'NAME:', name
            print 'ATTR:', attr

        It should print.

        TAG:  
        NAME: 1
        ATTR: 
        TAG:  This is all the text.
        NAME: 1
        ATTR: 
        TAG: <em > This is all the text.</em>
        NAME: em
        ATTR: 
        TAG: <body > <em > This is all the text.</em></body>
        NAME: body
        ATTR: 
        """

        for node in self.sail():
            label, attributes = node.name, node.attr
            yield (node, label, attributes)
Example #14
Source File: ehp.py    From ehp with MIT License 5 votes vote down vote up
def text(self):
        """
        Return the text held inside this xml/html tag as one string.

        It is the result of self.join with an empty delimiter over the
        objects whose name is DATA, AMP or CODE.

        Example:

        html = Html()
        data = '<body><em>This is all the text.</em></body>'
        dom = html.feed(data)
        
        print dom.fst('em').text()

        It outputs.

        This is all the text.

        Notice that if you call text() on an item with
        children then it returns all the *printable* characters
        for that node.
        """

        wanted = (DATA, AMP, CODE)
        return self.join('', *wanted)
Example #15
Source File: ehp.py    From ehp with MIT License 5 votes vote down vote up
def match_with_root(self, *args):
        """
        Like Root.match but each match is paired with its parent tag.

        Yields (root, ind) for every pair from self.sail_with_root() in
        which no (key, value) of args differs from ind.attr[key]. With no
        args every pair is yielded.

        Example:

        from ehp import *
        
        html = Html()
        dom  = html.feed('''<body> <p style="color:black"> xxx </p> 
                         <p style = "color:black"> mmm </p></body>''')
        
        for root, ind in dom.match_with_root(('style', 'color:black')):
            del ind.attr['style']
        
        item = dom.fst('body')
        item.attr['style'] = 'color:black'
        
        print dom

        Output.

        <body style="color:black" > <p > xxx </p> 
                         <p > mmm </p></body>
        """

        for parent, node in self.sail_with_root():
            # any() stops at the first mismatching pair, as a break would.
            mismatch = any(node.attr[key] != value for key, value in args)
            if not mismatch:
                yield (parent, node)
Example #16
Source File: ehp.py    From ehp with MIT License 5 votes vote down vote up
def match(self, *args):
        """
        Yield the objects whose attributes match
        (key0, value0), (key1, value1), ... .

        An object from self.sail() is yielded when no given pair differs
        from ind.attr[key]. With no pairs every object is yielded.

        Example:

        data = '<a size="1"><b size="1"></b></a>'
        html = Html()
        dom = html.feed(data)
        
        for ind in dom.match(('size', '1')):
            print ind

        It would print.

        <b size="1" ></b>
        <a size="1" ><b size="1" ></b></a>
        """

        for node in self.sail():
            # any() stops at the first mismatching pair, as a break would.
            mismatch = any(node.attr[key] != value for key, value in args)
            if not mismatch:
                yield(node)
Example #17
Source File: ehp.py    From ehp with MIT License 5 votes vote down vote up
def find_with_root(self, name, *args):
        """
        Like Root.find but each hit is paired with its parent tag.

        Yields (root, ind) for every pair from self.sail_with_root() where
        ind.name equals name and no (key, value) of args differs from
        ind.attr[key].

        from ehp import *
        
        html = Html()
        dom = html.feed('''<body> <p> alpha </p> <p> beta </p> </body>''')
        
        for root, ind in dom.find_with_root('p'):
            root.remove(ind)
        
        print dom

        It would output.

        <body >   </body>        
        """

        for parent, node in self.sail_with_root():
            # Attributes are only inspected once the name has matched.
            if node.name == name and not any(
                    node.attr[key] != value for key, value in args):
                yield(parent, node)
Example #18
Source File: ehp.py    From ehp with MIT License 5 votes vote down vote up
def find(self, name, *args):
        """
        Yield every object whose name matches, optionally filtered by
        attribute pairs.

        An object from self.sail() is yielded when ind.name equals name
        and no (key, value) of args differs from ind.attr[key].

        Example 1:

        data = '<a><b></b><b></b></a>'
        html = Html()
        dom = html.feed(data)

        for ind in dom.find('b'):
            print ind

        It should print.

        <b ></b>
        <b ></b>

        Example 2.

        data = '<body> <p> alpha. </p> <p style="color:green"> beta.</p> </body>'
        html = Html()
        dom  = html.feed(data)
        
        for ind in dom.find('p', ('style', 'color:green')):
            print ind
        
        Output.
    
        
        <p style="color:green" > beta.</p>
        """

        for node in self.sail():
            # Attributes are only inspected once the name has matched.
            if node.name == name and not any(
                    node.attr[key] != value for key, value in args):
                yield(node)
Example #19
Source File: ehp.py    From ehp with MIT License 5 votes vote down vote up
def index(self, item):
        """
        Return the position of item in this container, comparing by
        identity (`is`) instead of equality.

        Raises ValueError when no element is the very same object as item.

        Example:

        data = '<a><b></b><b></b></a>'
        html = Html()
        dom = html.feed(data)
        
        for root, ind in dom.sail_with_root():
            print root.name, ind.name, root.index(ind)


        It would print.

        a b 0
        a b 1
         a 0        

        The line where it appears ' a 0' corresponds to the
        outmost object. The outmost object is an instance of Root
        that contains all the other objects.
        """

        for position, candidate in enumerate(self):
            if candidate is item:
                return position

        raise ValueError
Example #20
Source File: ehp.py    From ehp with MIT License 5 votes vote down vote up
def sail(self):
        """ 
        Iterate over every object nested inside this one.

        For each direct child, everything yielded by that child's own
        sail() comes first, followed by the child itself. The loop runs
        over the slice self[:] rather than over self.

        Example:
        data = '<a> <b> </b> </a>'

        html = Html()
        dom = html.feed(data)

        for ind in dom.sail():
            print type(ind),',', ind.name

        It would output.

        <class 'ehp.Root'> , a
        <class 'ehp.Root'> , b
        """
           
        for child in self[:]:
            for descendant in child.sail():
                yield(descendant)

            yield(child)
Example #21
Source File: Formatter.py    From AdvancedHTMLParser with GNU Lesser General Public License v3.0 5 votes vote down vote up
def parseStr(self, html):
        '''
            parseStr - Parses a string and creates the DOM tree and indexes.

                Calls self.reset() first. A bytes value is decoded with self.encoding
                before it is passed to self.feed; anything else is fed unchanged.

                @param html <str> - valid HTML
        '''
        self.reset()
        text = html.decode(self.encoding) if isinstance(html, bytes) else html
        self.feed(text)
Example #22
Source File: Formatter.py    From AdvancedHTMLParser with GNU Lesser General Public License v3.0 5 votes vote down vote up
def parseFile(self, filename):
        '''
            parseFile - Parses a file and creates the DOM tree and indexes

                Calls self.reset(), obtains the contents and passes them to self.feed.

                @param filename <str/file> - A string to a filename or a file object (any object with a read() method). If file object, it will not be closed, you must close.
        '''
        self.reset()

        # Duck-type on read() instead of isinstance(filename, file): "file" is only a
        # builtin on Python 2, and this also accepts objects such as io.StringIO.
        if hasattr(filename, 'read'):
            contents = filename.read()
        else:
            with codecs.open(filename, 'r', encoding=self.encoding) as f:
                contents = f.read()
        self.feed(contents)
Example #23
Source File: Formatter.py    From AdvancedHTMLParser with GNU Lesser General Public License v3.0 5 votes vote down vote up
def feed(self, contents):
        '''
            feed - Load contents

            The contents are first passed through stripIEConditionals. If parsing raises
            MultipleRootNodeException, the parser is reset and the same contents are parsed
            again wrapped between INVISIBLE_ROOT_TAG_START and INVISIBLE_ROOT_TAG_END.

            @param contents - HTML contents
        '''
        contents = stripIEConditionals(contents)
        try:
            HTMLParser.feed(self, contents)
        except MultipleRootNodeException:
            # Start over with the contents wrapped in the invisible root tag.
            self.reset()
            wrappedContents = "%s%s" %(addStartTag(contents, INVISIBLE_ROOT_TAG_START), INVISIBLE_ROOT_TAG_END)
            HTMLParser.feed(self, wrappedContents)
Example #24
Source File: Parser.py    From AdvancedHTMLParser with GNU Lesser General Public License v3.0 5 votes vote down vote up
def parseStr(self, html):
        '''
            parseStr - Parses a string and creates the DOM tree and indexes.

                Calls self.reset() first. A bytes value is decoded with self.encoding
                before it is passed to self.feed; anything else is fed unchanged.

                @param html <str> - valid HTML
        '''
        self.reset()

        text = html.decode(self.encoding) if isinstance(html, bytes) else html
        self.feed(text)
Example #25
Source File: Parser.py    From AdvancedHTMLParser with GNU Lesser General Public License v3.0 5 votes vote down vote up
def parseFile(self, filename):
        '''
            parseFile - Parses a file and creates the DOM tree and indexes

                Calls self.reset(), obtains the contents and passes them to self.feed.

                @param filename <str/file> - A string to a filename or a file object (any object with a read() method). If file object, it will not be closed, you must close.
        '''
        self.reset()

        # Duck-type on read() instead of isinstance(filename, file): "file" is only a
        # builtin on Python 2, and this also accepts objects such as io.StringIO.
        if hasattr(filename, 'read'):
            contents = filename.read()
        else:
            with codecs.open(filename, 'r', encoding=self.encoding) as f:
                contents = f.read()

        self.feed(contents)
Example #26
Source File: Parser.py    From AdvancedHTMLParser with GNU Lesser General Public License v3.0 5 votes vote down vote up
def feed(self, contents):
        '''
            feed - Feed contents. Use  parseStr or parseFile instead.

            The contents are first passed through stripIEConditionals. If parsing raises
            MultipleRootNodeException, the parser is reset and the same contents are parsed
            again wrapped between INVISIBLE_ROOT_TAG_START and INVISIBLE_ROOT_TAG_END.

            @param contents - Contents
        '''
        contents = stripIEConditionals(contents)
        try:
            HTMLParser.feed(self, contents)
        except MultipleRootNodeException:
            # Start over with the contents wrapped in the invisible root tag.
            self.reset()
            wrappedContents = "%s%s" %(addStartTag(contents, INVISIBLE_ROOT_TAG_START), INVISIBLE_ROOT_TAG_END)
            HTMLParser.feed(self, wrappedContents)
Example #27
Source File: Parser.py    From AdvancedHTMLParser with GNU Lesser General Public License v3.0 5 votes vote down vote up
def getMiniHTML(self):
        '''
            getMiniHTML - Return this document's HTML after running it through AdvancedHTMLMiniFormatter,
                i.e. without any pretty formatting and disregarding original whitespace beyond the functional.

                @return <str> - HTML with only functional whitespace present
        '''
        from .Formatter import AdvancedHTMLMiniFormatter

        sourceHtml = self.getHTML()
        # The constructor argument is None so the content is not double-encoded
        miniFormatter = AdvancedHTMLMiniFormatter(None)
        miniFormatter.feed(sourceHtml)
        return miniFormatter.getHTML()
Example #28
Source File: ehp.py    From ehp with MIT License 4 votes vote down vote up
def join(self, delim, *args):
        """
        It joins all the objects whose name appears in args.

        Every object from self.sail() whose name is in args is rendered
        with '%s' formatting, and the pieces are joined with delim placed
        between them only (never before the first piece). An empty
        string is returned when nothing matches.

        Example 1:

        html = Html()
        data = '<a><b> This is cool. </b><b> That is. </b></a>'
        dom = html.feed(data)
        
        print dom.join('', 'b')
        print type(dom.join('b'))

        It would print.

        <b > This is cool. </b><b > That is. </b>
        <type 'str'>

        Example 2:

        html = Html()
        data = '<a><b> alpha</b><c>beta</c> <b>gamma</a>'
        dom = html.feed(data)
        
        print dom.join('', 'b', 'c')

        It would print.

        <b > alpha</b><c >beta</c><b >gamma</b>

        Example 3:


        html = Html()
        data = '<a><b>alpha</b><c>beta</c><b>gamma</a>'
        dom = html.feed(data)
        
        print dom.join('\\n', DATA)

        It would print.

        alpha
        beta
        gamma
        """

        # '%s' % (ind,) renders each object the way the old '%s%s%s' formatting did;
        # the one-element tuple keeps a tuple-valued ind from being unpacked.
        return delim.join('%s' % (ind,) for ind in self.sail() if ind.name in args)
Example #29
Source File: ehp.py    From ehp with MIT License 4 votes vote down vote up
def fst(self, name, *args):
        """
        Return the first object produced by self.find(name, *args), i.e.
        the first object whose name matches.

        None is returned when self.find yields nothing.

        Example 1:

        html = Html()
        data = '<body> <em> Cool. </em></body>'
        dom = html.feed(data)
        
        print dom.fst('em')

        It outputs.

        <em > Cool. </em>

        Example 2:

        data = '<body> <p> alpha. </p> <p style="color:green"> beta.</p> </body>'
        html = Html()
        dom  = html.feed(data)
        
        for ind in dom.find('p', ('style', 'color:green')):
            print ind
        
        print dom.fst('p', ('style', 'color:green'))
        print dom.fst_with_root('p', ('style', 'color:green'))

        Output:

        <p style="color:green" > beta.</p>
        <p style="color:green" > beta.</p>
        (<ehp.Tag object at 0xb7216c0c>, <ehp.Tag object at 0xb7216d24>)
        """

        # next() with a default stands in for catching StopIteration by hand.
        return next(self.find(name, *args), None)