# -*- coding:utf-8 -*-
"""Zhihu spider that attaches pre-captured login cookies and scrapes
the lookup/class suggestion pages."""
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import FormRequest

from zhihu.settings import *  # provides HEADER and COOKIES (verify in settings)


class ZhihuLoginSpider(CrawlSpider):
    """Crawl zhihu.com starting from the lookup/class page.

    Instead of submitting the login form, this spider reuses cookies
    captured from an existing session (COOKIES in zhihu.settings).
    """

    name = 'zhihulogin1'
    allowed_domains = ['zhihu.com']
    start_urls = ['http://www.zhihu.com/lookup/class/']
    rules = (
        Rule(LinkExtractor(allow=r'search/')),
        Rule(LinkExtractor(allow=r'')),
    )

    def __init__(self, *args, **kwargs):
        # BUG FIX: CrawlSpider compiles its `rules` inside its own
        # __init__; the original override never chained to the base
        # initializer, so rule-based link following silently broke.
        super(ZhihuLoginSpider, self).__init__(*args, **kwargs)
        self.headers = HEADER    # request headers from zhihu.settings
        self.cookies = COOKIES   # captured login cookies from zhihu.settings

    def start_requests(self):
        """Issue the initial requests with login cookies attached.

        Each start URL gets its own cookie jar (keyed by index) so
        session state is kept per-request chain.
        """
        for i, url in enumerate(self.start_urls):
            yield FormRequest(url,
                              meta={'cookiejar': i},
                              headers=self.headers,
                              cookies=self.cookies,
                              callback=self.parse_item)

    # jump to login page
    def _openpage(self, cat, response):
        """Dump the raw response body to error_pages/ for offline debugging.

        The URL is flattened into a filename by replacing the scheme and
        path separators with underscores.
        """
        path = ('error_pages/' + cat +
                response.url.replace("http://", "_").replace("/", "_") +
                '.html')
        # BUG FIX: the original leaked the file handle; also open in
        # binary mode because response.body is raw bytes.
        with open(path, 'wb') as f:
            f.write(response.body)

    def parse_item(self, response):
        """Extract profile hrefs from the suggest-item list and return them."""
        selector = Selector(response)
        self._openpage("page_", response)
        # .extract() already returns a list of strings; no need for a
        # manual append loop.
        urls = selector.xpath(
            '//ul/li[@class="suggest-item"]/div/a/@href').extract()
        print(urls)
        return urls