Python html.parser.HTMLParser.feed() Examples
The following are 29 code examples of html.parser.HTMLParser.feed().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module html.parser.HTMLParser, or try the search function.
Example #1
Source File: Parser.py From AdvancedHTMLParser with GNU Lesser General Public License v3.0 | 6 votes |
def getFormattedHTML(self, indent=' '):
    '''
        getFormattedHTML - Get formatted and xhtml of this document, replacing the
          original whitespace with a pretty-printed version. The current HTML
          (self.getHTML()) is fed through an AdvancedHTMLFormatter and the
          formatter's output is returned.

        @param indent - space/tab/newline of each level of indent, or integer for
          how many spaces per level

        @return - <str> Formatted html

        @see getHTML - Get HTML with original whitespace

        @see getMiniHTML - Get HTML with only functional whitespace remaining
    '''
    from .Formatter import AdvancedHTMLFormatter

    currentHtml = self.getHTML()

    # Do not double-encode
    prettyPrinter = AdvancedHTMLFormatter(indent, None)
    prettyPrinter.feed(currentHtml)

    return prettyPrinter.getHTML()
Example #2
Source File: Formatter.py From AdvancedHTMLParser with GNU Lesser General Public License v3.0 | 6 votes |
def getHTML(self):
    '''
        getHTML - Get the full HTML as contained within this tree, converted to valid XHTML

        @return - <str> The doctype declaration (only when self.doctype is truthy)
          followed by the markup of the root node

        @raises ValueError - If self.getRoot() returns None (nothing has been fed yet)
    '''
    # Fetch the root once and reuse it for both the None check and the output.
    root = self.getRoot()
    if root is None:
        raise ValueError('Cannot format, use feed to load contents.')

    doctypeStr = '<!%s>\n' %(self.doctype) if self.doctype else ''

    # 6.6.0: If we have a real root tag, print the outerHTML. If we have a fake root tag
    #   (for multiple root condition), then print the innerHTML (skipping the outer root tag).
    #   Otherwise, we will miss untagged text (between the multiple root nodes).
    if root.tagName == INVISIBLE_ROOT_TAG:
        return doctypeStr + root.innerHTML
    return doctypeStr + root.outerHTML
Example #3
Source File: ehp.py From ehp with MIT License | 6 votes |
def __init__(self, data):
    """
    Set the node up through Root.__init__ with DATA and keep the
    characters it holds on self.data.

    Example:

    html = Html()
    data = '<body><em>alpha</em></body>'
    dom = html.feed(data)
    x = dom.fst('em')
    x.append(Data('\\nbeta'))

    It outputs.

    <body ><em >alpha
    beta</em></body>
    """
    Root.__init__(self, DATA)

    self.data = data
Example #4
Source File: ehp.py From ehp with MIT License | 6 votes |
def walk_with_root(self):
    """
    Like walk but carries root.

    For every (root, ind) pair produced by self.sail_with_root() it yields
    ((root, root.name, root.attr), (ind, ind.name, ind.attr)).

    Example:

    html = Html()
    data = '<body><em>alpha</em></body>'
    dom = html.feed(data)

    for (root, name, attr), (ind, name, attr) in dom.walk_with_root():
        print root, name, ind, name

    Output:

    <em >alpha</em> 1 alpha 1
    <body ><em >alpha</em></body> em <em >alpha</em> em
    <body ><em >alpha</em></body> body <body ><em >alpha</em></body> body
    """
    for parent, node in self.sail_with_root():
        parent_info = (parent, parent.name, parent.attr)
        node_info = (node, node.name, node.attr)
        yield (parent_info, node_info)
Example #5
Source File: ehp.py From ehp with MIT License | 6 votes |
def remove(self, item):
    """
    This is as list.remove but works with id: the position is looked up
    with self.index(item) and that slot is deleted.

    data = '<a><b></b><b></b></a>'
    html = Html()
    dom = html.feed(data)

    for root, ind in dom.sail_with_root():
        if ind.name == 'b':
            root.remove(ind)

    print dom

    It should print.

    <a ></a>
    """
    del self[self.index(item)]
Example #6
Source File: ehp.py From ehp with MIT License | 6 votes |
def take(self, *args):
    """
    It returns the first object whose one of its attributes matches
    (key0, value0), (key1, value1), ... . When self.match(*args) yields
    nothing, None is returned.

    Example:

    data = '<a><b id="foo" size="1"></b></a>'
    html = Html()
    dom = html.feed(data)

    print dom.take(('id', 'foo'))
    print dom.take(('id', 'foo'), ('size', '2'))
    """
    return next(self.match(*args), None)
Example #7
Source File: __init__.py From workload-automation with Apache License 2.0 | 5 votes |
def feed(self, data):
    """
    Pass ``data`` to HTMLParser.feed; a self.StopParsingException raised
    while parsing is caught here and ends the call quietly.
    """
    try:
        HTMLParser.feed(self, data)
    except self.StopParsingException:
        # Caught on purpose: nothing further is done for this exception.
        return
Example #8
Source File: __init__.py From workload-automation with Apache License 2.0 | 5 votes |
def update_output_v3(self, context):
    """
    For every entry of self.benchmarks: pull its HTML result file from the
    target into context.output_directory, parse it with VellamoResultParser
    and report the parsed scores through context.add_metric. The pulled
    file is then registered with context.add_artifact.

    NOTE(review): the indentation below was reconstructed from a
    whitespace-collapsed listing. In particular the ``else`` is assumed to
    pair with ``if test == "Browser"`` - confirm against the original file.
    """
    for test in self.benchmarks:  # Get all scores from HTML files
        filename = None
        if test == "Browser":
            # The Browser result file is located by prefix: the last
            # directory entry starting with "Browser" wins.
            result_folder = self.target.path.join(self.target.package_data_directory, self.apk.apk_info.package, 'files')
            for result_file in self.target.listdir(result_folder, as_root=True):
                if result_file.startswith("Browser"):
                    filename = result_file
            # NOTE(review): filename is still None here when no entry
            # matched; the joins below would then receive None.
        else:
            filename = '{}_results.html'.format(test)
        device_file = self.target.path.join(self.target.package_data_directory, self.apk.apk_info.package, 'files', filename)
        host_file = os.path.join(context.output_directory, filename)
        self.target.pull(device_file, host_file, as_root=True)
        with open(host_file) as fh:
            parser = VellamoResultParser()
            parser.feed(fh.read())
            for benchmark in parser.benchmarks:
                # Spaces in names are replaced with underscores before
                # they are used to build metric names.
                benchmark.name = benchmark.name.replace(' ', '_')
                context.add_metric('{}_Total'.format(benchmark.name), benchmark.score)
                for name, score in list(benchmark.metrics.items()):
                    name = name.replace(' ', '_')
                    context.add_metric('{}_{}'.format(benchmark.name, name), score)
        context.add_artifact('vellamo_output', kind='raw', path=filename)
Example #9
Source File: citation.py From Carnets with BSD 3-Clause "New" or "Revised" License | 5 votes |
def feed(self, data):
    """
    Keep the raw input on ``self.data`` and then hand it to
    HTMLParser.feed.
    """
    self.data = data

    HTMLParser.feed(self, data)
Example #10
Source File: citation.py From Carnets with BSD 3-Clause "New" or "Revised" License | 5 votes |
def citation2latex(s):
    """Parse citations in Markdown cells.

    This looks for HTML tags having a data attribute named `data-cite`
    and replaces them by a call to the LaTeX cite command. The
    transformation looks like this:

    `<cite data-cite="granger">(Granger, 2013)</cite>`

    Becomes

    `\\cite{granger}`

    Any HTML tag can be used, which allows the citations to be formatted
    in HTML in any manner.
    """
    parser = CitationParser()
    parser.feed(s)
    parser.close()

    pieces = []
    cursor = 0
    for entry in parser.citelist:
        # entry[0] is put inside \cite{}; entry[1] ends the text copied
        # before it; entry[2], when present, is where copying resumes.
        pieces.append(s[cursor:entry[1]])
        pieces.append('\\cite{%s}'%entry[0])
        if len(entry)==3:
            cursor = entry[2]
        else:
            cursor = -1

    # A cursor of -1 means the last entry had no resume position, so no
    # tail is copied.
    if cursor != -1:
        pieces.append(s[cursor:])
    return u''.join(pieces)

#-----------------------------------------------------------------------------
# Classes
#-----------------------------------------------------------------------------
Example #11
Source File: freesound.py From RenderChan with BSD 3-Clause "New" or "Revised" License | 5 votes |
def feed(self, data):
    """
    Feed ``str(data)`` to HTMLParser.feed and then verify that the
    ``artist``, ``title`` and ``license`` attributes are all set.

    Raises:
        Exception: if any of self.artist, self.title or self.license is
            None after parsing.
    """
    HTMLParser.feed(self, str(data))

    # Identity comparison: `== None` can be fooled by objects that
    # override __eq__, `is None` cannot.
    if self.artist is None or self.title is None or self.license is None:
        raise Exception("Error parsing data from freesound!")
Example #12
Source File: ehp.py From ehp with MIT License | 5 votes |
def fromfile(self, filename):
    """
    It builds a structure from a file.

    The file named ``filename`` is opened in mode 'r', read completely
    and its contents are passed to self.feed, whose result is returned.
    """
    # The with-block closes the file even when read() raises.
    with open(filename, 'r') as fd:
        data = fd.read()
    return self.feed(data)
Example #13
Source File: ehp.py From ehp with MIT License | 5 votes |
def walk(self):
    """
    Like sail but carries name and attr: for each object produced by
    self.sail() it yields (object, object.name, object.attr).

    Example:

    html = Html()
    data = '<body> <em> This is all the text.</em></body>'
    dom = html.feed(data)

    for ind, name, attr in dom.walk():
        print 'TAG:', ind
        print 'NAME:', name
        print 'ATTR:', attr

    It should print.

    TAG:
    NAME: 1
    ATTR:
    TAG:  This is all the text.
    NAME: 1
    ATTR:
    TAG: <em > This is all the text.</em>
    NAME: em
    ATTR:
    TAG: <body > <em > This is all the text.</em></body>
    NAME: body
    ATTR:
    """
    for node in self.sail():
        triple = (node, node.name, node.attr)
        yield triple
Example #14
Source File: ehp.py From ehp with MIT License | 5 votes |
def text(self):
    """
    It returns all objects whose name matches DATA, AMP or CODE, joined
    with an empty delimiter through self.join. It basically returns a
    string corresponding to all ascii characters that are inside a
    xml/html tag.

    Example:

    html = Html()
    data = '<body><em>This is all the text.</em></body>'
    dom = html.feed(data)

    print dom.fst('em').text()

    It outputs.

    This is all the text.

    Notice that if you call text() on an item with children
    then it returns all the *printable* characters for that node.
    """
    wanted = (DATA, AMP, CODE)
    return self.join('', *wanted)
Example #15
Source File: ehp.py From ehp with MIT License | 5 votes |
def match_with_root(self, *args):
    """
    Like Root.match but with its parent tag: yields (root, ind) for each
    pair from self.sail_with_root() where ind.attr[key] equals value for
    every (key, value) given in args.

    Example:

    from ehp import *

    html = Html()
    dom = html.feed('''<body> <p style="color:black"> xxx </p>
    <p style = "color:black"> mmm </p></body>''')

    for root, ind in dom.match_with_root(('style', 'color:black')):
        del ind.attr['style']

    item = dom.fst('body')
    item.attr['style'] = 'color:black'

    print dom

    Output.

    <body style="color:black" > <p > xxx </p>
    <p > mmm </p></body>
    """
    for parent, node in self.sail_with_root():
        mismatch = any(node.attr[key] != value for key, value in args)
        if not mismatch:
            yield(parent, node)
Example #16
Source File: ehp.py From ehp with MIT License | 5 votes |
def match(self, *args):
    """
    It returns a sequence of objects whose attributes match
    (key0, value0), (key1, value1), ... . Objects come from self.sail()
    and every given pair has to match.

    Example:

    data = '<a size="1"><b size="1"></b></a>'
    html = Html()
    dom = html.feed(data)

    for ind in dom.match(('size', '1')):
        print ind

    It would print.

    <b size="1" ></b>
    <a size="1" ><b size="1" ></b></a>
    """
    for node in self.sail():
        mismatch = any(node.attr[key] != value for key, value in args)
        if not mismatch:
            yield(node)
Example #17
Source File: ehp.py From ehp with MIT License | 5 votes |
def find_with_root(self, name, *args):
    """
    Like Root.find but returns its parent tag: yields (root, ind) for each
    pair from self.sail_with_root() where ind.name equals name and
    ind.attr[key] equals value for every (key, value) given in args.

    from ehp import *

    html = Html()
    dom = html.feed('''<body> <p> alpha </p> <p> beta </p> </body>''')

    for root, ind in dom.find_with_root('p'):
        root.remove(ind)

    print dom

    It would output.

    <body >   </body>
    """
    for parent, node in self.sail_with_root():
        if node.name != name:
            continue
        if not any(node.attr[key] != value for key, value in args):
            yield(parent, node)
Example #18
Source File: ehp.py From ehp with MIT License | 5 votes |
def find(self, name, *args):
    """
    It is used to find all objects that match name. Objects come from
    self.sail(); when (key, value) pairs are given, ind.attr[key] has to
    equal value for each of them as well.

    Example 1:

    data = '<a><b></b><b></b></a>'
    html = Html()
    dom = html.feed(data)

    for ind in dom.find('b'):
        print ind

    It should print.

    <b ></b>
    <b ></b>

    Example 2.

    data = '<body> <p> alpha. </p> <p style="color:green"> beta.</p> </body>'
    html = Html()
    dom = html.feed(data)

    for ind in dom.find('p', ('style', 'color:green')):
        print ind

    Output.

    <p style="color:green" > beta.</p>
    """
    for node in self.sail():
        if node.name != name:
            continue
        if not any(node.attr[key] != value for key, value in args):
            yield(node)
Example #19
Source File: ehp.py From ehp with MIT License | 5 votes |
def index(self, item):
    """
    This is similar to index but uses id to check for equality: it
    returns the position of the first element of self that *is* item and
    raises ValueError when there is none.

    Example:

    data = '<a><b></b><b></b></a>'
    html = Html()
    dom = html.feed(data)

    for root, ind in dom.sail_with_root():
        print root.name, ind.name, root.index(ind)

    It would print.

    a b 0
    a b 1
     a 0

    The line where it appears ' a 0' corresponds to the
    outmost object. The outmost object is an instance of Root
    that contains all the other objects.
    """
    for position, candidate in enumerate(self):
        if candidate is item:
            return position

    raise ValueError
Example #20
Source File: ehp.py From ehp with MIT License | 5 votes |
def sail(self):
    """
    This is used to navigate through the xml/html document.
    Every xml/html object is represented by a python class
    instance that inherits from Root.

    The method sail is used to return an iterator
    for these objects. Each child's own sail() results are yielded
    before the child itself.

    Example:

    data = '<a> <b> </b> </a>'
    html = Html()
    dom = html.feed(data)

    for ind in dom.sail():
        print type(ind),',', ind.name

    It would output.

    <class 'ehp.Root'> , a
    <class 'ehp.Root'> , b
    """
    # Iterate over a slice of self rather than self directly.
    children = self[:]
    for child in children:
        for descendant in child.sail():
            yield(descendant)
        yield(child)
Example #21
Source File: Formatter.py From AdvancedHTMLParser with GNU Lesser General Public License v3.0 | 5 votes |
def parseStr(self, html):
    '''
        parseStr - Parses a string and creates the DOM tree and indexes.
          The parser is reset first; bytes input is decoded with
          self.encoding before being fed.

        @param html <str> - valid HTML
    '''
    self.reset()

    text = html.decode(self.encoding) if isinstance(html, bytes) else html
    self.feed(text)
Example #22
Source File: Formatter.py From AdvancedHTMLParser with GNU Lesser General Public License v3.0 | 5 votes |
def parseFile(self, filename):
    '''
        parseFile - Parses a file and creates the DOM tree and indexes

        @param filename <str/file> - A string to a filename or a file object. If file object, it will not be closed, you must close.
          Any object exposing a "read" attribute is treated as a file object; anything else
          is opened as a path using self.encoding.
    '''
    self.reset()

    # Duck-type on "read" rather than isinstance(filename, file): the
    # "file" builtin only exists on Python 2.
    if hasattr(filename, 'read'):
        contents = filename.read()
    else:
        with codecs.open(filename, 'r', encoding=self.encoding) as f:
            contents = f.read()

    self.feed(contents)
Example #23
Source File: Formatter.py From AdvancedHTMLParser with GNU Lesser General Public License v3.0 | 5 votes |
def feed(self, contents):
    '''
        feed - Load contents. The contents are passed through stripIEConditionals
          and fed to HTMLParser.feed; when that raises MultipleRootNodeException the
          parser is reset and the contents are fed again wrapped in the invisible root tag.

        @param contents - HTML contents
    '''
    stripped = stripIEConditionals(contents)

    try:
        HTMLParser.feed(self, stripped)
    except MultipleRootNodeException:
        # Start over with the contents enclosed in the invisible root tag.
        self.reset()
        wrapped = "%s%s" %(addStartTag(stripped, INVISIBLE_ROOT_TAG_START), INVISIBLE_ROOT_TAG_END)
        HTMLParser.feed(self, wrapped)
Example #24
Source File: Parser.py From AdvancedHTMLParser with GNU Lesser General Public License v3.0 | 5 votes |
def parseStr(self, html):
    '''
        parseStr - Parses a string and creates the DOM tree and indexes.
          self.reset() is called first; input given as bytes is decoded
          using self.encoding before it reaches self.feed.

        @param html <str> - valid HTML
    '''
    self.reset()

    if not isinstance(html, bytes):
        self.feed(html)
        return

    self.feed(html.decode(self.encoding))
Example #25
Source File: Parser.py From AdvancedHTMLParser with GNU Lesser General Public License v3.0 | 5 votes |
def parseFile(self, filename):
    '''
        parseFile - Parses a file and creates the DOM tree and indexes

        @param filename <str/file> - A string to a filename or a file object. If file object, it will not be closed, you must close.
          Any object exposing a "read" attribute is treated as a file object; anything else
          is opened as a path using self.encoding.
    '''
    self.reset()

    # Duck-type on "read" rather than isinstance(filename, file): the
    # "file" builtin only exists on Python 2.
    if hasattr(filename, 'read'):
        contents = filename.read()
    else:
        with codecs.open(filename, 'r', encoding=self.encoding) as f:
            contents = f.read()

    self.feed(contents)
Example #26
Source File: Parser.py From AdvancedHTMLParser with GNU Lesser General Public License v3.0 | 5 votes |
def feed(self, contents):
    '''
        feed - Feed contents. Use parseStr or parseFile instead.
          The contents go through stripIEConditionals and then HTMLParser.feed; on
          MultipleRootNodeException the parser is reset and the contents are fed again
          wrapped in the invisible root tag.

        @param contents - Contents
    '''
    stripped = stripIEConditionals(contents)

    try:
        HTMLParser.feed(self, stripped)
    except MultipleRootNodeException:
        # Start over with the contents enclosed in the invisible root tag.
        self.reset()
        wrapped = "%s%s" %(addStartTag(stripped, INVISIBLE_ROOT_TAG_START), INVISIBLE_ROOT_TAG_END)
        HTMLParser.feed(self, wrapped)
Example #27
Source File: Parser.py From AdvancedHTMLParser with GNU Lesser General Public License v3.0 | 5 votes |
def getMiniHTML(self):
    '''
        getMiniHTML - Gets the HTML representation of this document without any pretty
          formatting and disregarding original whitespace beyond the functional. The
          current HTML (self.getHTML()) is fed through an AdvancedHTMLMiniFormatter.

        @return <str> - HTML with only functional whitespace present
    '''
    from .Formatter import AdvancedHTMLMiniFormatter

    currentHtml = self.getHTML()

    # Do not double-encode
    miniFormatter = AdvancedHTMLMiniFormatter(None)
    miniFormatter.feed(currentHtml)

    return miniFormatter.getHTML()
Example #28
Source File: ehp.py From ehp with MIT License | 4 votes |
def join(self, delim, *args):
    """
    It joins all the objects whose name appears in args.

    Every matching object from self.sail() is rendered as delim followed
    by the object, so the result starts with delim whenever something
    matched. With no match the empty string is returned.

    Example 1:

    html = Html()
    data = '<a><b> This is cool. </b><b> That is. </b></a>'
    dom = html.feed(data)

    print dom.join('', 'b')
    print type(dom.join('b'))

    It would print.

    <b > This is cool. </b><b > That is. </b>
    <type 'str'>

    Example 2:

    html = Html()
    data = '<a><b> alpha</b><c>beta</c> <b>gamma</a>'
    dom = html.feed(data)

    print dom.join('', 'b', 'c')

    It would print.

    <b > alpha</b><c >beta</c><b >gamma</b>

    Example 3:

    html = Html()
    data = '<a><b>alpha</b><c>beta</c><b>gamma</a>'
    dom = html.feed(data)

    print dom.join('\\n', DATA)

    It would print.

    alpha
    beta
    gamma
    """
    # Collect the pieces and join once instead of rebuilding the whole
    # string for every match.
    pieces = ['%s%s' % (delim, ind) for ind in self.sail() if ind.name in args]
    return ''.join(pieces)
Example #29
Source File: ehp.py From ehp with MIT License | 4 votes |
def fst(self, name, *args):
    """
    It returns the first object whose name matches, i.e. the first item
    produced by self.find(name, *args), or None when there is none.

    Example 1:

    html = Html()
    data = '<body> <em> Cool. </em></body>'
    dom = html.feed(data)

    print dom.fst('em')

    It outputs.

    <em > Cool. </em>

    Example 2:

    data = '<body> <p> alpha. </p> <p style="color:green"> beta.</p> </body>'
    html = Html()
    dom = html.feed(data)

    for ind in dom.find('p', ('style', 'color:green')):
        print ind

    print dom.fst('p', ('style', 'color:green'))
    print dom.fst_with_root('p', ('style', 'color:green'))

    Output:

    <p style="color:green" > beta.</p>
    <p style="color:green" > beta.</p>
    (<ehp.Tag object at 0xb7216c0c>, <ehp.Tag object at 0xb7216d24>)
    """
    return next(self.find(name, *args), None)