Python scrapy.Item() Examples

The following are 12 code examples of scrapy.Item(). You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to go to the original project or source file. You may also want to check out all available functions/classes of the module scrapy, or try the search function.
Example #1
Source File: test_pipelines.py    From scrapy-cluster with MIT License 6 votes vote down vote up
def test_process_item(self):
        """Check which logger method ``process_item`` calls for each kind of item.

        The pipeline's logger methods are replaced with mocks that raise, so
        an exception carrying the expected message shows that the matching
        logger method was invoked while the item was processed.
        """
        item = self._get_item()

        spider = MagicMock()
        spider.name = "link"

        # assertRaises reports "Exception not raised" when nothing is logged.
        # A sentinel assertion inside try/except Exception would instead be
        # swallowed by the handler and surface as a confusing string mismatch.
        self.pipe.logger.info = MagicMock(side_effect=Exception("info"))
        with self.assertRaises(Exception) as raised:
            self.pipe.process_item(item, spider)
        self.assertEqual(str(raised.exception), "info")

        # test unknown item
        class WeirdItem(Item):
            pass
        item2 = WeirdItem()

        self.pipe.logger.warn = MagicMock(side_effect=Exception("warn"))
        with self.assertRaises(Exception) as raised:
            self.pipe.process_item(item2, spider)
        self.assertEqual(str(raised.exception), "warn")
Example #2
Source File: middlewares.py    From scrapy-corenlp with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def process_spider_output(self, response, result, spider):
        """Attach named-entity annotations to scraped items.

        Every element of ``result`` is yielded exactly once. Elements that are
        an ``Item`` or ``dict`` get the accumulated tagger output stored under
        ``self.output_field`` (via ``setdefault``, so an existing value is
        kept); everything else passes through untouched.

        :param response: the response being processed (unused here).
        :param result: iterable of objects produced by the spider.
        :param spider: the spider that produced ``result`` (unused here).
        :return: generator over the elements of ``result``.
        """
        for element in result:
            if not isinstance(element, (Item, dict)):
                # Not an item: nothing to annotate.
                yield element
                continue

            if isinstance(self.field_to_process, list):
                text = ' '.join(
                    [element[field] for field in self.field_to_process]
                )
            elif isinstance(self.field_to_process, string_types):
                text = element[self.field_to_process]
            else:
                # Unsupported field_to_process type: there is no text to tag.
                # Skip the tagging below; falling through would reference an
                # unbound (or stale) ``text`` and yield the element twice.
                yield element
                continue

            tagger = StanfordNERTagger(
                model_filename=self.classifier,
                path_to_jar=self.jar_file
            )
            token_entity_pairs = tagger.tag(
                tokens=self.tokenizer(s=text)
            )
            accumulated = self.accumulate(token_entity_pairs)
            element.setdefault(self.output_field, accumulated)
            yield element
Example #3
Source File: middlewares.py    From realestate-scraper with MIT License 6 votes vote down vote up
def process_spider_output(self, response, result, spider):
        """Stamp scraped items with the scrape date and a parsed update date.

        Called with the results returned from the Spider, after it has
        processed the response. Must return an iterable of Request, dict or
        Item objects.

        The reference timestamp comes from
        ``response.meta['stored_meta']['timestamp']`` when present, otherwise
        from the current time. Every ``dict``/``Item`` in ``result`` gets a
        ``scraped_time`` string (``dd/mm/YYYY``); items that carry
        ``DataAtualizacaoHumanizada`` also get ``updated_time`` when that
        value can be parsed. All elements are yielded in their original order.
        """
        stored_meta = response.meta.get('stored_meta')
        if stored_meta and 'timestamp' in stored_meta:
            ts = datetime.fromtimestamp(stored_meta['timestamp'])
        else:
            ts = datetime.now()

        # Same for every item, so format it once instead of per element.
        scraped_on = ts.strftime('%d/%m/%Y')

        for i in result:
            if isinstance(i, (dict, Item)):
                i['scraped_time'] = scraped_on

                if 'DataAtualizacaoHumanizada' in i:
                    updated = parse(i['DataAtualizacaoHumanizada'],
                                    languages=['pt'],
                                    settings={'RELATIVE_BASE': ts})
                    # NOTE(review): ``parse`` looks like dateparser.parse,
                    # which returns None for unparseable input - confirm.
                    # Leave ``updated_time`` unset rather than crash the
                    # whole output stream on one bad value.
                    if updated is not None:
                        i['updated_time'] = updated.strftime('%d/%m/%Y')
            yield i
Example #4
Source File: pipelines.py    From spidermon with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings.

        Returns a ``PassThroughPipeline`` when ``SPIDERMON_ENABLED`` is false.
        Otherwise validators are loaded from ``SPIDERMON_VALIDATION_SCHEMAS``
        and ``SPIDERMON_VALIDATION_MODELS``. Each setting may be a list/tuple
        of sources, or a dict mapping a class to one source or a list/tuple
        of sources.

        :param crawler: object exposing ``settings`` and ``stats``.
        :return: a ``PassThroughPipeline`` or a configured ``cls`` instance.
        :raises NotConfigured: if a setting has an unsupported type, or if no
            validators were loaded at all.
        """
        spidermon_enabled = crawler.settings.getbool("SPIDERMON_ENABLED")
        if not spidermon_enabled:
            return PassThroughPipeline()

        validators = defaultdict(list)
        allowed_types = (list, tuple, dict)

        def set_validators(loader, schema):
            # A bare list/tuple is registered under the base Item class name.
            if isinstance(schema, (list, tuple)):
                schema = {Item: schema}
            for obj, paths in schema.items():
                key = obj.__name__
                paths = paths if isinstance(paths, (list, tuple)) else [paths]
                # Load everything first so a failing loader adds nothing.
                objects = [loader(v) for v in paths]
                validators[key].extend(objects)

        for loader, name in [
            (cls._load_jsonschema_validator, "SPIDERMON_VALIDATION_SCHEMAS"),
            (cls._load_schematics_validator, "SPIDERMON_VALIDATION_MODELS"),
        ]:
            res = crawler.settings.get(name)
            if not res:
                continue
            if not isinstance(res, allowed_types):
                # The trailing space keeps the two literals from fusing into
                # "list/tupleis required".
                raise NotConfigured(
                    "Invalid <{}> type for <{}> settings, dict or list/tuple "
                    "is required".format(type(res), name)
                )
            set_validators(loader, res)

        if not validators:
            raise NotConfigured("No validators were found")

        return cls(
            validators=validators,
            stats=crawler.stats,
            drop_items_with_errors=crawler.settings.getbool(
                "SPIDERMON_VALIDATION_DROP_ITEMS_WITH_ERRORS"
            ),
            add_errors_to_items=crawler.settings.getbool(
                "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS"
            ),
            errors_field=crawler.settings.get("SPIDERMON_VALIDATION_ERRORS_FIELD"),
        )
Example #5
Source File: pipelines.py    From spidermon with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def find_validators(self, item):
        """Return the validators registered for ``item``'s class name.

        Falls back to the entry stored under the base ``Item`` class name
        when nothing (or an empty list) is registered for the item's own
        class; that fallback is ``[]`` if it is missing too.
        """
        registry = self.validators
        own = registry.get(item.__class__.__name__, [])
        if own:
            return own
        return registry.get(Item.__name__, [])
Example #6
Source File: pipelines.py    From spidermon with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _add_errors_to_item(self, item, errors):
        try:
            if self.errors_field not in item.__class__.fields:
                item.__class__.fields[self.errors_field] = Field()
            if self.errors_field not in item._values:
                item[self.errors_field] = defaultdict(list)
        except AttributeError:
            # The item is just a dict object instead of a Scrapy.Item object
            if self.errors_field not in item:
                item[self.errors_field] = defaultdict(list)
        for field_name, messages in errors.items():
            item[self.errors_field][field_name] += messages 
Example #7
Source File: test_pipeline.py    From scrapy-jsonschema with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_default_item(self):
        """A bare ``Item`` comes back from the pipeline equal to the input."""
        collected_stats = self._get_stats_for_docs(valid_docs, True)
        validator = JsonSchemaValidatePipeline(collected_stats)
        empty_item = Item()
        processed = validator.process_item(empty_item, None)
        assert empty_item == processed
Example #8
Source File: cli.py    From scrapy-autounit with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def parse_data(self, data):
        """Recursively convert ``data`` into plain containers and scalars.

        Dicts and scrapy Items become dicts with converted keys and values,
        lists are converted element by element, bytes go through
        ``to_unicode``, datetimes become ISO-format strings, ints and floats
        are returned unchanged, and anything else is passed to ``str``.
        """
        if isinstance(data, (dict, scrapy.Item)):
            converted = {}
            for raw_key, raw_value in data.items():
                new_key = self.parse_data(raw_key)
                converted[new_key] = self.parse_data(raw_value)
            return converted
        if isinstance(data, list):
            return list(map(self.parse_data, data))
        if isinstance(data, bytes):
            return to_unicode(data)
        if isinstance(data, datetime):
            return data.isoformat()
        if isinstance(data, (int, float)):
            return data
        return str(data)
Example #9
Source File: introspection.py    From ws-backend-community with GNU General Public License v3.0 5 votes vote down vote up
def get_scrapy_item_classes():
        """
        Collect the Scrapy item classes defined in the crawling module.
        :return: A de-duplicated list of whatever
        IntrospectionHelper.get_all_classes_of_type yields for scrapy.Item under
        lib/inspection/web/crawling (per the original documentation, tuples of
        (1) the class name and (2) the class).
        """
        import lib.inspection.web.crawling.item
        import scrapy
        discovered = IntrospectionHelper.get_all_classes_of_type(
            to_find=scrapy.Item,
            path="lib/inspection/web/crawling",
        )
        # Round-trip through a set to drop duplicate entries.
        unique_entries = set(discovered)
        return list(unique_entries)
Example #10
Source File: middlewares.py    From realestate-scraper with MIT License 5 votes vote down vote up
def process_spider_exception(self, response, exception, spider):
        """Hook for exceptions raised by a spider or by process_spider_input().

        Called when a spider or a process_spider_input() method (from another
        spider middleware) raises an exception. Should return either None or
        an iterable of Response, dict or Item objects; this implementation
        does nothing and returns None.
        """
        return None
Example #11
Source File: items.py    From OpenScraper with MIT License 5 votes vote down vote up
def create_item_class(class_name, fields_list):

	"""Build a DictItem subclass named ``class_name`` with one Field per entry of ``fields_list``."""

	# Each field name gets its own fresh Field instance.
	declared_fields = {name: Field() for name in fields_list}
	class_attrs = {'fields': declared_fields}
	return type(str(class_name), (DictItem,), class_attrs)
Example #12
Source File: proxy.py    From fp-server with MIT License 5 votes vote down vote up
def hmset_dict(self, key, item):
        """Store the mapping ``item`` under ``key`` via ``self.cli.hmset``.

        The key/value pairs of ``item`` are flattened into positional
        arguments (k1, v1, k2, v2, ...) for the ``hmset`` call.

        :raises TypeError: if ``item`` is neither a dict nor an Item.
        :raises ValueError: if ``item`` is empty.
        :return: whatever ``self.cli.hmset`` returns.
        """
        accepted = (dict, Item)
        if not isinstance(item, accepted):
            raise TypeError("Error type: %s" % type(item))
        if not item:
            raise ValueError("item is empty")

        flattened = chain.from_iterable(item.items())
        return self.cli.hmset(key, *flattened)