Python scrapy.conf.settings Examples

The following are 14 code examples of scrapy.conf.settings. You can go to the original project or source file by following the links above each example, or check out the other functions and classes available in the scrapy.conf module. Note that scrapy.conf is a legacy module: recent Scrapy releases expose settings through the crawler instead (for example via a from_crawler class method or spider.settings), rather than through a global settings object.
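Every example below reads configuration the same way: import the module-level settings object and index it like a dictionary. Here is a minimal sketch of a MongoDB pipeline built on that pattern; the class name MongoPipeline and the process_item body are illustrative only, and the MONGODB_* keys are assumed to be defined in the project's settings.py.

import pymongo
from scrapy.conf import settings  # legacy import; use crawler.settings in current Scrapy


class MongoPipeline(object):
    def __init__(self):
        # read connection details from the project settings
        client = pymongo.MongoClient(
            host=settings['MONGODB_HOST'],
            port=settings['MONGODB_PORT'],
        )
        db = client[settings['MONGODB_DBNAME']]
        self.collection = db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        # store each scraped item as one MongoDB document
        self.collection.insert_one(dict(item))
        return item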
Example #1
Source File: pipelines.py    From crawler_examples with Apache License 2.0
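# Assumed imports for this snippet: from scrapy.conf import settings; from pymongo import MongoClient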
def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        name = settings['MONGODB_DBNAME']
        client = MongoClient(host=host,port=port)
        db = client[name]
        self.col = db[settings['MONGODB_DOCNAME']] 
Example #2
Source File: pipelines.py    From SourceCodeOfBook with MIT License
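# Assumed imports for this snippet: import pymongo; from scrapy.conf import settings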
def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        db_name = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        db = client[db_name]
        self.post = db[settings['MONGODB_DOCNAME']] 
Example #3
Source File: pipelines.py    From SourceCodeOfBook with MIT License
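# Opens the database named by the MONGODB_DB setting on a default local MongoDB instance (assumes import pymongo; from scrapy.conf import settings)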
def __init__(self):
        self.db = pymongo.MongoClient()[settings['MONGODB_DB']]
        self.handler = None 
Example #4
Source File: pipelines.py    From SourceCodeOfBook with MIT License
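# Part of the same pipeline as Example #3: lazily opens the MONGODB_ERROR collection and stores the failed item there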
def process_error(self, item):
        if not self.handler:
            self.handler = self.db[settings['MONGODB_ERROR']]
        self.handler.insert_one(dict(item)) 
Example #5
Source File: middlewares.py    From SourceCodeOfBook with MIT License
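# Downloader middleware: assigns each request a random proxy from the PROXIES setting (assumes import random; from scrapy.conf import settings)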
def process_request(self, request, spider):
        proxy = random.choice(settings['PROXIES'])
        request.meta['proxy'] = proxy 
Example #6
Source File: middlewares.py    From SourceCodeOfBook with MIT License
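# Downloader middleware: assigns each request a random User-Agent from the USER_AGENT_LIST setting (same assumed imports as Example #5)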
def process_request(self, request, spider):
        ua = random.choice(settings['USER_AGENT_LIST'])
        request.headers['User-Agent'] = ua 
Example #7
Source File: pipelines.py    From LotteryTicket with MIT License
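# Assumes import pymongo and from scrapy.conf import settings; note that pymongo.Connection was removed in PyMongo 3.0 in favour of MongoClient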
def __init__(self):
        self.server = settings['MONGODB_SERVER']
        self.port = settings['MONGODB_PORT']
        self.db = settings['MONGODB_DB']
        self.col = settings['MONGODB_COLLECTION']
        connection = pymongo.Connection(self.server, self.port)
        db = connection[self.db]
        self.collection = db[self.col] 
Example #8
Source File: pipelines.py    From NewsCrawler with MIT License
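# Reads host, port, database and collection from a nested MONGO_CONF dict in the settings (assumes import pymongo; pymongo.Connection is the pre-3.0 name for MongoClient)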
def __init__(self):
        conn = pymongo.Connection(
            settings['MONGO_CONF']['host'],
            settings['MONGO_CONF']['port']
        )
        db = conn[settings['MONGO_CONF']['db']]
        self.news_collection = db[settings['MONGO_CONF']['collection']] 
Example #9
Source File: pipelines.py    From NewsCrawler with MIT License
def __init__(self):
        conn = pymongo.Connection(
            settings['MONGO_CONF']['host'],
            settings['MONGO_CONF']['port']
        )
        db = conn[settings['MONGO_CONF']['db']]
        self.subscription_collection = db[settings['MONGO_CONF']['subscription_collection']] 
Example #10
Source File: pipelines.py    From Wenshu_Spider with MIT License
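# Assumes import pymongo and from scrapy.conf import settings; ensure_index is deprecated in PyMongo 3 and removed in 4, where create_index covers the same use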
def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DBNAME']
        docname = settings['MONGODB_DOCNAME']
        self.client = pymongo.MongoClient(host=host,port=port)
        db = self.client[dbname]
        db[docname].ensure_index('casedocid', unique=True)  # make the document ID ('casedocid') a unique index to avoid inserting duplicate records
        self.post = db[docname] 
Example #11
Source File: pipelines.py    From openslack-crawler with Apache License 2.0
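# Builds the connection from MONGODB_* settings and adds a unique index when MONGODB_UNIQ_KEY is configured (see Example #13 for __get_uniq_key)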
def __init__(self):
        import pymongo
        connection = pymongo.Connection(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        self.db = connection[settings['MONGODB_DB']]
        self.collection = self.db[settings['MONGODB_COLLECTION']]
        if self.__get_uniq_key() is not None:
            self.collection.create_index(self.__get_uniq_key(), unique=True) 
Example #12
Source File: pipelines.py    From openslack-crawler with Apache License 2.0
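# Upserts the item keyed on the configured unique key; note that scrapy's log.msg and PyMongo's Collection.insert/update are deprecated in current releases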
def process_item(self, item, spider):
        if self.__get_uniq_key() is None:
            self.collection.insert(dict(item))
        else:
            self.collection.update(
                {self.__get_uniq_key(): item[self.__get_uniq_key()]},
                dict(item),
                upsert=True)
        log.msg("Item wrote to MongoDB database %s/%s" %
                (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
                level=log.DEBUG, spider=spider)
        return item 
Example #13
Source File: pipelines.py    From openslack-crawler with Apache License 2.0
def __get_uniq_key(self):
        if not settings['MONGODB_UNIQ_KEY'] or settings['MONGODB_UNIQ_KEY'] == "":
            return None
        return settings['MONGODB_UNIQ_KEY'] 
Example #14
Source File: pipelines.py    From Wenshu_Spider with MIT License
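# Assumes from pymongo.errors import DuplicateKeyError; self.post is the collection opened in Example #10's __init__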
def process_item(self, item, spider):
        '''Insert the data.'''
        try:
            data = dict(item)
            self.post.insert_one(data)
            return item
        except DuplicateKeyError:
            # A duplicate key means this record already exists; catch the error
            spider.logger.debug('Duplicate key error collection')
            return item


# 2. Asynchronous item storage - does not work, the data never gets inserted! (Reference: https://zhuanlan.zhihu.com/p/44003499)
# from twisted.internet import defer, reactor
# class WenshuPipeline(object):
#     def __init__(self, mongo_host, mongo_port, mongo_db, mongo_doc):
#         self.mongo_host = mongo_host
#         self.mongo_port = mongo_port
#         self.mongo_db = mongo_db
#         self.mongo_doc = mongo_doc
#
#     @classmethod
#     def from_crawler(cls, crawler):
#         return cls(
#             mongo_host=crawler.settings.get('MONGODB_HOST'),
#             mongo_port=crawler.settings.get('MONGODB_PORT'),
#             mongo_db=crawler.settings.get('MONGODB_DBNAME'),
#             mongo_doc=crawler.settings.get('MONGODB_DOCNAME'),
#         )
#
#     def open_spider(self, spider):
#         self.client = pymongo.MongoClient(host=self.mongo_host,port=self.mongo_port)
#         self.mongodb = self.client[self.mongo_db]
#         self.mongodb[self.mongo_doc].create_index('id', unique=True)  # create a unique index to avoid inserting duplicate data
#
#     def close_spider(self, spider):
#         self.client.close()
#
#     # The operations below are the key part
#     @defer.inlineCallbacks
#     def process_item(self, item, spider):
#         out = defer.Deferred()
#         reactor.callInThread(self._insert, item, out, spider)
#         yield out
#         defer.returnValue(item)
#         return item
#
#     def _insert(self, item, out, spider):
#         time.sleep(10)
#         try:
#             self.mongodb[self.mongo_doc].insert_one(dict(item))
#             reactor.callFromThread(out.callback, item)
#         except DuplicateKeyError:
#             # A duplicate key means this record already exists; catch the error
#             spider.logger.debug('duplicate key error collection')
#             reactor.callFromThread(out.callback, item)
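
The commented-out pipeline above builds its Deferred by hand with callInThread. Since Scrapy also accepts a plain Deferred returned from process_item, a commonly used non-blocking variant is twisted's threads.deferToThread. The sketch below is an illustrative adaptation under that assumption, not code from the original Wenshu_Spider project; the class name is made up, and the MONGODB_* setting keys mirror the ones used above.

import pymongo
from pymongo.errors import DuplicateKeyError
from twisted.internet import threads


class WenshuAsyncPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        pipeline.settings = crawler.settings
        return pipeline

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(
            host=self.settings.get('MONGODB_HOST'),
            port=self.settings.get('MONGODB_PORT'),
        )
        db = self.client[self.settings.get('MONGODB_DBNAME')]
        self.collection = db[self.settings.get('MONGODB_DOCNAME')]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Scrapy accepts a Deferred from process_item, so the blocking insert
        # can run in twisted's thread pool without stalling the reactor.
        return threads.deferToThread(self._insert, item, spider)

    def _insert(self, item, spider):
        try:
            self.collection.insert_one(dict(item))
        except DuplicateKeyError:
            # a duplicate key means this record already exists; ignore it
            spider.logger.debug('duplicate key error collection')
        return item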