Python scrapy.item.Item() Examples

The following are 5 code examples of scrapy.item.Item(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.item , or try the search function .
Example #1
Source File: EuropythonSpyder.py    From Learning-Python-Networking-Second-Edition with MIT License 6 votes vote down vote up
def main():
	"""Main routine for the execution of the Spider"""
	# set up signal to catch items scraped
	def catch_item(sender, item, **kwargs):
		print("Item extracted:", item)
	dispatcher.connect(catch_item, signal=signals.item_passed)
	
	settings = Settings()
	settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
	settings.set("LOG_ENABLED",False)	

	# setup crawler
	from scrapy.crawler import CrawlerProcess

	crawler = CrawlerProcess(settings)

	# define the spider for the crawler
	crawler.crawl(EuropythonSpyder())

	# start scrapy
	print("STARTING ENGINE")
	crawler.start() #iniciar el crawler llamando al spider definido
	print("ENGINE STOPPED") 
Example #2
Source File: utils.py    From scrapy-autounit with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def binary_check(fx_obj, cb_obj, encoding):
    if isinstance(cb_obj, (dict, Item)):
        fx_obj = {
            key: binary_check(value, cb_obj[key], encoding)
            for key, value in fx_obj.items()
        }

    if isinstance(cb_obj, list):
        fx_obj = [
            binary_check(fxitem, cbitem, encoding)
            for fxitem, cbitem in zip(fx_obj, cb_obj)
        ]

    if isinstance(cb_obj, Request):
        headers = {}
        for key, value in fx_obj['headers'].items():
            key = to_bytes(key, encoding)
            headers[key] = [to_bytes(v, encoding) for v in value]
        fx_obj['headers'] = headers
        fx_obj['body'] = to_bytes(fx_obj['body'], encoding)

    if isinstance(cb_obj, six.binary_type):
        fx_obj = fx_obj.encode(encoding)

    return fx_obj 
Example #3
Source File: file.py    From openslack-crawler with Apache License 2.0 6 votes vote down vote up
def another_process_item(self, result, item, info):
        """
            custom process_item func,so it will manage the Request result.
        """

        assert isinstance(result, (Item, Request)), \
            "WoaiduBookFile pipeline' item_completed must return Item or Request, got %s" % \
            (type(result))
        if isinstance(result, Item):
            return result
        elif isinstance(result, Request):
            dlist = [self._process_request(r, info) for r in arg_to_iter(result)]
            dfd = DeferredList(dlist, consumeErrors=1)
            dfd.addCallback(self.item_completed, item, info)
            # XXX:This will cause one item maybe return many times,it depends on how many
            # times the download url failed.But it doesn't matter.Because when raise errors,
            # the items are no longer processed by further pipeline components.And when all
            # url download failed we can drop that item which book_file or book_file_url are
            # empty.
            return dfd.addCallback(self.another_process_item, item, info)
        else:
            raise NofilesDrop 
Example #4
Source File: __init__.py    From learn_python3_spider with MIT License 5 votes vote down vote up
def _get_item_field_attr(self, field_name, key, default=None):
        if isinstance(self.item, Item):
            value = self.item.fields[field_name].get(key, default)
        else:
            value = default
        return value 
Example #5
Source File: __init__.py    From learn_python3_spider with MIT License 5 votes vote down vote up
def _get_item_field_attr(self, field_name, key, default=None):
        if isinstance(self.item, Item):
            value = self.item.fields[field_name].get(key, default)
        else:
            value = default
        return value