Python sgmllib.SGMLParser.reset() Examples
The following are 30
code examples of sgmllib.SGMLParser.reset().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
sgmllib.SGMLParser
, or try the search function
.
Example #1
Source File: sgml.py From learn_python3_spider with MIT License | 6 votes |
def _extract_links(self, response_text, response_url, response_encoding, base_url=None): """ Do the real extraction work """ self.reset() self.feed(response_text) self.close() ret = [] if base_url is None: base_url = urljoin(response_url, self.base_url) if self.base_url else response_url for link in self.links: if isinstance(link.url, six.text_type): link.url = link.url.encode(response_encoding) try: link.url = urljoin(base_url, link.url) except ValueError: continue link.url = safe_url_string(link.url, response_encoding) link.text = to_unicode(link.text, response_encoding, errors='replace').strip() ret.append(link) return ret
Example #2
Source File: sgml.py From learn_python3_spider with MIT License | 6 votes |
def _extract_links(self, response_text, response_url, response_encoding, base_url=None): """ Do the real extraction work """ self.reset() self.feed(response_text) self.close() ret = [] if base_url is None: base_url = urljoin(response_url, self.base_url) if self.base_url else response_url for link in self.links: if isinstance(link.url, six.text_type): link.url = link.url.encode(response_encoding) try: link.url = urljoin(base_url, link.url) except ValueError: continue link.url = safe_url_string(link.url, response_encoding) link.text = to_unicode(link.text, response_encoding, errors='replace').strip() ret.append(link) return ret
Example #3
Source File: BeautifulSoup.py From ru with GNU General Public License v2.0 | 5 votes |
def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 SGMLParser.reset(self) self.currentData = [] self.currentTag = None self.tagStack = [] self.quoteStack = [] self.pushTag(self)
Example #4
Source File: BeautifulSoup.py From ru with GNU General Public License v2.0 | 5 votes |
def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 SGMLParser.reset(self) self.currentData = [] self.currentTag = None self.tagStack = [] self.quoteStack = [] self.pushTag(self)
Example #5
Source File: BeautifulSoup.py From plugin.git.browser with GNU General Public License v3.0 | 5 votes |
def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup if isinstance(markup, unicode): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: if self.markupMassage: if not hasattr(self.markupMassage, "__iter__"): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) # TODO: We get rid of markupMassage so that the # soup object can be deepcopied later on. Some # Python installations can't copy regexes. If anyone # was relying on the existence of markupMassage, this # might cause problems. del(self.markupMassage) self.reset() SGMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag()
Example #6
Source File: BeautifulSoup.py From ru with GNU General Public License v2.0 | 5 votes |
def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup if isinstance(markup, unicode): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: if self.markupMassage: if not hasattr(self.markupMassage, "__iter__"): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) # TODO: We get rid of markupMassage so that the # soup object can be deepcopied later on. Some # Python installations can't copy regexes. If anyone # was relying on the existence of markupMassage, this # might cause problems. del(self.markupMassage) self.reset() SGMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag()
Example #7
Source File: beautifulsoup.py From POC-EXP with GNU General Public License v3.0 | 5 votes |
def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup if isinstance(markup, unicode): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: if self.markupMassage: if not hasattr(self.markupMassage, "__iter__"): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) # TODO: We get rid of markupMassage so that the # soup object can be deepcopied later on. Some # Python installations can't copy regexes. If anyone # was relying on the existence of markupMassage, this # might cause problems. del(self.markupMassage) self.reset() SGMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag()
Example #8
Source File: beautifulsoup.py From POC-EXP with GNU General Public License v3.0 | 5 votes |
def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 SGMLParser.reset(self) self.currentData = [] self.currentTag = None self.tagStack = [] self.quoteStack = [] self.pushTag(self)
Example #9
Source File: BeautifulSoup.py From Cloudmare with GNU General Public License v3.0 | 5 votes |
def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup if isinstance(markup, unicode): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: if self.markupMassage: if not hasattr(self.markupMassage, "__iter__"): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) # TODO: We get rid of markupMassage so that the # soup object can be deepcopied later on. Some # Python installations can't copy regexes. If anyone # was relying on the existence of markupMassage, this # might cause problems. del(self.markupMassage) self.reset() SGMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag()
Example #10
Source File: BeautifulSoup.py From Cloudmare with GNU General Public License v3.0 | 5 votes |
def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 SGMLParser.reset(self) self.currentData = [] self.currentTag = None self.tagStack = [] self.quoteStack = [] self.pushTag(self)
Example #11
Source File: BeautifulSoup.py From doork with MIT License | 5 votes |
def _feed(self, inDocumentEncoding=None): # Convert the document to Unicode. markup = self.markup if isinstance(markup, unicode): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding if markup: if self.markupMassage: if not isList(self.markupMassage): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) # TODO: We get rid of markupMassage so that the # soup object can be deepcopied later on. Some # Python installations can't copy regexes. If anyone # was relying on the existence of markupMassage, this # might cause problems. del(self.markupMassage) self.reset() SGMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag()
Example #12
Source File: BeautifulSoup.py From doork with MIT License | 5 votes |
def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 SGMLParser.reset(self) self.currentData = [] self.currentTag = None self.tagStack = [] self.quoteStack = [] self.pushTag(self)
Example #13
Source File: BeautifulSoup.py From filmkodi with Apache License 2.0 | 5 votes |
def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup if isinstance(markup, unicode): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: if self.markupMassage: if not hasattr(self.markupMassage, "__iter__"): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) # TODO: We get rid of markupMassage so that the # soup object can be deepcopied later on. Some # Python installations can't copy regexes. If anyone # was relying on the existence of markupMassage, this # might cause problems. del(self.markupMassage) self.reset() SGMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag()
Example #14
Source File: BeautifulSoup.py From filmkodi with Apache License 2.0 | 5 votes |
def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 SGMLParser.reset(self) self.currentData = [] self.currentTag = None self.tagStack = [] self.quoteStack = [] self.pushTag(self)
Example #15
Source File: dpc-shodanscan.py From darkc0de-old-stuff with GNU General Public License v3.0 | 5 votes |
def reset(self): SGMLParser.reset(self) self.urls=[]
Example #16
Source File: BeautifulSoup.py From ru with GNU General Public License v2.0 | 5 votes |
def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 SGMLParser.reset(self) self.currentData = [] self.currentTag = None self.tagStack = [] self.quoteStack = [] self.pushTag(self)
Example #17
Source File: BeautifulSoup.py From ru with GNU General Public License v2.0 | 5 votes |
def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup if isinstance(markup, unicode): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: if self.markupMassage: if not hasattr(self.markupMassage, "__iter__"): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) # TODO: We get rid of markupMassage so that the # soup object can be deepcopied later on. Some # Python installations can't copy regexes. If anyone # was relying on the existence of markupMassage, this # might cause problems. del(self.markupMassage) self.reset() SGMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag()
Example #18
Source File: BeautifulSoup.py From ru with GNU General Public License v2.0 | 5 votes |
def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup if isinstance(markup, unicode): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: if self.markupMassage: if not hasattr(self.markupMassage, "__iter__"): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) # TODO: We get rid of markupMassage so that the # soup object can be deepcopied later on. Some # Python installations can't copy regexes. If anyone # was relying on the existence of markupMassage, this # might cause problems. del(self.markupMassage) self.reset() SGMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag()
Example #19
Source File: BeautifulSoup.py From ru with GNU General Public License v2.0 | 5 votes |
def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup if isinstance(markup, unicode): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: if self.markupMassage: if not hasattr(self.markupMassage, "__iter__"): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) # TODO: We get rid of markupMassage so that the # soup object can be deepcopied later on. Some # Python installations can't copy regexes. If anyone # was relying on the existence of markupMassage, this # might cause problems. del(self.markupMassage) self.reset() SGMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag()
Example #20
Source File: BeautifulSoup.py From ru with GNU General Public License v2.0 | 5 votes |
def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 SGMLParser.reset(self) self.currentData = [] self.currentTag = None self.tagStack = [] self.quoteStack = [] self.pushTag(self)
Example #21
Source File: BeautifulSoup.py From ru with GNU General Public License v2.0 | 5 votes |
def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup if isinstance(markup, unicode): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: if self.markupMassage: if not hasattr(self.markupMassage, "__iter__"): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) # TODO: We get rid of markupMassage so that the # soup object can be deepcopied later on. Some # Python installations can't copy regexes. If anyone # was relying on the existence of markupMassage, this # might cause problems. del(self.markupMassage) self.reset() SGMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag()
Example #22
Source File: BeautifulSoup.py From ru with GNU General Public License v2.0 | 5 votes |
def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 SGMLParser.reset(self) self.currentData = [] self.currentTag = None self.tagStack = [] self.quoteStack = [] self.pushTag(self)
Example #23
Source File: BeautifulSoup.py From ru with GNU General Public License v2.0 | 5 votes |
def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup if isinstance(markup, unicode): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: if self.markupMassage: if not hasattr(self.markupMassage, "__iter__"): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) # TODO: We get rid of markupMassage so that the # soup object can be deepcopied later on. Some # Python installations can't copy regexes. If anyone # was relying on the existence of markupMassage, this # might cause problems. del(self.markupMassage) self.reset() SGMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag()
Example #24
Source File: moduleBS.py From D-Tech with GNU General Public License v3.0 | 5 votes |
def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 SGMLParser.reset(self) self.currentData = [] self.currentTag = None self.tagStack = [] self.quoteStack = [] self.pushTag(self)
Example #25
Source File: moduleBS.py From D-Tech with GNU General Public License v3.0 | 5 votes |
def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup if isinstance(markup, unicode): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: if self.markupMassage: if not hasattr(self.markupMassage, "__iter__"): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) # TODO: We get rid of markupMassage so that the # soup object can be deepcopied later on. Some # Python installations can't copy regexes. If anyone # was relying on the existence of markupMassage, this # might cause problems. del(self.markupMassage) self.reset() SGMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag()
Example #26
Source File: BeautifulSoup.py From yalih with Apache License 2.0 | 5 votes |
def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 SGMLParser.reset(self) self.currentData = [] self.currentTag = None self.tagStack = [] self.quoteStack = [] self.pushTag(self)
Example #27
Source File: __init__.py From arnold-usd with Apache License 2.0 | 5 votes |
def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup if isinstance(markup, unicode): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: if self.markupMassage: if not hasattr(self.markupMassage, "__iter__"): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) # TODO: We get rid of markupMassage so that the # soup object can be deepcopied later on. Some # Python installations can't copy regexes. If anyone # was relying on the existence of markupMassage, this # might cause problems. del(self.markupMassage) self.reset() SGMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag()
Example #28
Source File: dpc-shodanscan.py From d4rkc0de with GNU General Public License v2.0 | 5 votes |
def reset(self): SGMLParser.reset(self) self.urls=[]
Example #29
Source File: links.py From d4rkc0de with GNU General Public License v2.0 | 5 votes |
def reset(self): SGMLParser.reset(self) self.urls = []
Example #30
Source File: moduleBS.py From D-TECT with GNU General Public License v3.0 | 5 votes |
def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 SGMLParser.reset(self) self.currentData = [] self.currentTag = None self.tagStack = [] self.quoteStack = [] self.pushTag(self)