Python scrapy.http.FormRequest() Examples

The following are 18 code examples of scrapy.http.FormRequest(), drawn from open-source projects. Each example notes its original project and source file; you may also want to check out the other available functions and classes of the scrapy.http module.
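
All the examples below follow the same basic pattern. As a minimal sketch (the URL, field names, and credentials are placeholders), a FormRequest is a regular Request whose formdata dict is URL-encoded into the request body:

from scrapy.http import FormRequest

def make_login_request(callback):
    # formdata is a dict of str -> str; when it is given, the request
    # method defaults to POST and the dict is URL-encoded into the body.
    return FormRequest(
        "https://example.com/login",                  # placeholder URL
        formdata={"user": "name", "pass": "secret"},  # placeholder credentials
        callback=callback,                            # usually a spider method
    )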
Example #1
Source File: zhihu_user_spider.py    From openslack-crawler with Apache License 2.0
def start_requests(self):
        return [FormRequest(
            "http://www.zhihu.com/login",
            formdata={'email': 'june.chan@foxmail.com',
                      'password': 'czj0617_zhihu'
                      },
            callback=self.after_login
        )] 
Example #2
Source File: coursera_spider.py    From scrapy_example with MIT License
def start_requests(self):
        print('Preparing login')
        # No response object exists yet inside start_requests, so the
        # extra login headers cannot be built from one at this point.
        return [FormRequest("https://accounts.coursera.org/api/v1/login",
                            formdata={
                                "email": "1095511864@qq.com",
                                "password": "HUAZANG.55789260",
                                "webrequest": "true"
                            },
                            callback=self.parse_page
                            )] 
Example #3
Source File: zhizhu_user_topic_spider.py    From Zhihu_Spider with Apache License 2.0
def start_requests(self):
        return [FormRequest(
            "http://www.zhihu.com/login",
            formdata = {'email':'xxx@gmail.com',
                'password':'123456'
                },
            callback = self.after_login
            )] 
Example #4
Source File: zhizhu_user_topic_spider.py    From Zhihu_Spider with Apache License 2.0
def gen_topic_form(self, response):
        # yield the beginning topics
        sel = Selector(response)
        for topic_sel in sel.xpath('//div[@id="zh-profile-topic-list"]/div[contains(@class, "zm-profile-section-item")]'):
            # new user-topic relationship
            yield self.get_UT_item(topic_sel, response.url)

        # get the number of topics of one user
        num_topic = sel.xpath('//div[contains(@class, "zm-profile-section-wrap")]/div[contains(@class, "zm-profile-section-head")]//span[contains(@class, "zm-profile-section-name")]/text()')
        number_str = num_topic.extract()[0]
        # print number_str
        p = re.compile(r'\d+')
        m = p.findall(number_str)
        if m:
            num_topic = int(m[0])
            # crawl the remaining topics of this user
            base_line = 20
            if num_topic > 20:
                while num_topic > 0:
                    yield FormRequest(
                            url = response.url,
                            formdata = {
                                'start': '0',
                                'offset': str(base_line),
                                '_xsrf': self.xsrf
                                },
                            callback=self.parse
                            )
                    num_topic = num_topic - 20
                    base_line += 20 
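
The _xsrf value POSTed above is Zhihu's CSRF token, stashed on the spider earlier in the crawl. A minimal sketch of how such a token is typically captured (the XPath is an assumption about the page layout, not taken from this spider):

def after_login(self, response):
        # Zhihu embeds the CSRF token in a hidden form input; keep it for
        # every later POST, such as the topic-paging request above.
        self.xsrf = response.xpath('//input[@name="_xsrf"]/@value').get()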
Example #5
Source File: zhihu_spider.py    From Zhihu_Spider with Apache License 2.0
def start_requests(self):
        return [FormRequest(
            "http://www.zhihu.com/login",
            formdata = {'email':'example.com',
                'password':'123456'
                },
            callback = self.after_login
            )] 
Example #6
Source File: login1_spider.py    From openslack-crawler with Apache License 2.0
def start_requests(self):
        for i, url in enumerate(self.start_urls):
            yield FormRequest(url, meta={'cookiejar': i}, \
                              headers=self.headers, \
                              cookies=self.cookies,
                              callback=self.parse_item)  # jump to login page 
Example #7
Source File: zhihu_answer_spider.py    From openslack-crawler with Apache License 2.0
def start_requests(self):
        return [FormRequest(
            "http://www.zhihu.com/login",
            formdata={'email': 'june.chan@foxmail.com',
                      'password': 'czj0617_zhihu'
                      },
            callback=self.after_login
        )] 
Example #8
Source File: main.py    From python-examples with MIT License
def parse(self, response):
        # small images 200x200
        #urls = response.xpath('//div[@id="thumbsContainer"]//img/@data-original').extract()
        #urls = response.xpath('//img[@class="res-photo-thumbnail thumb-load lazy-photo-inner"]/@data-original').extract()
        #yield {'image_urls': urls}

        # big images 800x600
        #urls = [url.replace('200%3A200', '800%3A600') for url in urls]
        #yield {'image_urls': urls}

        # big images 1900x1200
        #urls = [url.replace('200%3A200', '1900%3A1200') for url in urls]
        #yield {'image_urls': urls}

        data = {
            'res_id': '16761868',  # place ID, e.g. '16780723'
            'offset': '30',        # change it
            'category': 'all',     # or 'food'
            'action': 'fetch_photos',
            'index': '30',
            'limit': '10',         # change it
        }

        url = 'https://www.zomato.com/php/load_more_res_pics.php'
        yield FormRequest(url, callback=self.parse_post, formdata=data) 
Example #9
Source File: msi_spider.py    From uefi-spider with MIT License
def parse_search(self, response):
        sel = Selector(response)

        ### Parse each sub-product type.
        searches = []
        product_selector = sel.css(".mr20").xpath("@no")
        if product_selector:
            pno = product_selector.extract()[0]

            products = sel.css(".ProdSel-item")
            for product in products:
                no = product.xpath("@no").extract()[0]
                searches.append((no, pno))
        #print searches

        ### Parse the actual products/boards.
        boards = []
        items = sel.css(".Prod-item")
        for item in items:
            title = item.xpath("@title").extract()[0]
            no = item.xpath("@no").extract()[0]
            boards.append((title, no))
        #print boards

        for sub_search in searches:
            search_vars = self._get_vars(sub_search[0], sub_search[1])
            yield FormRequest(url= self.start_urls[0], method= "POST", headers= json_headers,
                formdata= search_vars, callback= self.parse_search)

        for board in boards:
            url = "http://us.msi.com/product/mb/%s.html" % board[0]
            item = MsiUpdateLinkItem()
            item["id"] = board[1]
            item["title"] = board[0]
            item["url"] = url

            yield Request(url= "%s#/?div=BIOS" % url, callback= self.parse_board, 
                meta= {"attrs": item})
Example #10
Source File: inshorts.py    From scrape with MIT License
def parse(self, response):
        try:
            for news in response.css('div.news-card'):
                self.urls_parsed += 1
                try:
                    item = ScrapenewsItem()
                    item['image'] = news.css('div.news-card-image::attr(style)').extract_first()[23:-3]
                    item['title'] = news.css('a.clickable>span::text').extract_first()
                    item['content'] = news.css('div[itemprop*=articleBody]::text').extract_first()
                    item['newsDate'] = news.css('span.time::attr(content)').extract_first()[:-5]
                    item['link'] = news.css('div.read-more>a::attr(href)').extract_first()
                    item['source'] = 105
                    yield item
                    self.urls_scraped += 1
                except Exception as e:
                    logger.error(__name__ + " [UNHANDLED] Unable to Extract Data : " + str(e))
                    self.urls_dropped += 1

            #news_id extraction
            pattern = re.compile(r'var min_news_id\s+=\s+"(.*?)"')
            js = response.xpath('//script[@type="text/javascript"]/text()').extract()[-1]
            self.news_id = pattern.search(js).group(1)

            while (self.pages > 1 and not self.infinite):
                yield FormRequest('https://www.inshorts.com/en/ajax/more_news',
                                    formdata={'news-offset' : self.news_id},
                                    callback=self.parse_more_news,
                                    errback=self.errorRequestHandler,
                                    dont_filter=True)
                self.pages -= 1

            while (self.infinite):
                yield FormRequest('https://www.inshorts.com/en/ajax/more_news',
                                    formdata={'news-offset' : self.news_id},
                                    callback=self.parse_more_news,
                                    errback=self.errorRequestHandler,
                                    dont_filter=True)
        except Exception as e:
            logger.error(__name__ + " [UNHANDLED] " + str(e) + " for response url " + response.url) 
Example #11
Source File: zhihu_ask_spider.py    From openslack-crawler with Apache License 2.0
def start_requests(self):
        return [FormRequest(
            "http://www.zhihu.com/login",
            formdata={'email': 'june.chan@foxmail.com',
                      'password': 'czj0617_zhihu'
                      },
            callback=self.after_login
        )] 
Example #12
Source File: zapimoveis.py    From realestate-scraper with MIT License
def parse(self, response):
        hidden = lambda id: response.xpath(
                '/html/body/input[@id="{}"]/@data-value'.
                format(id)).extract_first()

        total_pages = int(hidden('quantidadeTotalPaginas').replace('.',''))

        hashfragment = OrderedDict([
            ('pagina', None),
            ('semente', self.seed or hidden('semente')),
        ])

        formdata = OrderedDict([
            ('tipoOferta', '1'),
            ('paginaAtual', None),
            ('pathName', parse_url(response.url).path),
            ('hashFragment', ''),
        ])

        headers = {'X-Requested-With': 'XMLHttpRequest'}
        url = 'https://www.zapimoveis.com.br/Busca/RetornarBuscaAssincrona/'

        from_page = self.start
        if self.count:
            to_page = min(self.start + self.count - 1, total_pages)
        else:
            to_page = total_pages

        self.crawler.stats.set_value('total_pages', total_pages)
        self.crawler.stats.set_value('selected_pages',
                                     max(0, to_page - from_page + 1))

        for page in range(from_page, to_page + 1):
            hashfragment['pagina'] = formdata['paginaAtual'] = str(page)
            formdata['hashFragment'] = json.dumps(hashfragment,
                                                  separators=(',', ':'))
            yield FormRequest(
                    url,
                    headers=headers,
                    formdata=formdata,
                    callback=self.parse_busca) 
Example #13
Source File: music.py    From Python_Master_Courses with GNU General Public License v3.0
def parse(self, response):
        for songid in response.xpath('//a/@href').re('/song/(\d+)'):
            print('songIds:', songid)
            data = {'songIds': songid}  # 257524668
            yield FormRequest(url=self.songlink_url, formdata=data, callback=self.parse_song)
            # break 
Example #14
Source File: msi_spider.py    From uefi-spider with MIT License
def parse(self, response):
        ### Generate a search for AMD and Intel chips
        intel_search = self._get_vars(170, 1)
        amd_search   = self._get_vars(171, 1)
        yield FormRequest(url= self.start_urls[0], method= "POST", headers= json_headers,
            formdata= intel_search, callback= self.parse_search)
        yield FormRequest(url= self.start_urls[0], method= "POST", headers= json_headers,
            formdata= amd_search, callback= self.parse_search) 
Example #15
Source File: asus_spider.py    From uefi-spider with MIT License
def parse_again(self, response):
        sel = Selector(response)

        hidden_fields = {}
        inputs = sel.xpath("//input")
        for ele in inputs:
            input_type = ele.xpath(".//@type").extract()[0]
            value = ele.xpath(".//@value").extract()[0]
            name = ele.xpath(".//@name").extract()[0]
            if input_type not in ["hidden"]:
                continue
            hidden_fields[name] = value

        for product_type in self.product_types:
            ### Create a POST form and apply a generated ScriptManager
            form_data = _select_form(1, product_type)
            for field in hidden_fields:
                ### Replace static fields with page-generated inputs.
                form_data[field] = hidden_fields[field]
            #print form_data
            yield FormRequest(formdata= form_data, method= "POST",
                headers= {
                    "Content-Type": "application/x-www-form-urlencoded",
                    #"X-MicrosoftAjax": "Delta=true",
                    "X-Requested-With": "XMLHttpRequest",
                    "User-Agent": self._get_uas()
                },
                url= self.select_urls[0],
                #meta= {"cookiejar": "GLOBAL"},
                callback= self.parse_series)
            return 
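
Scrapy can do this hidden-field harvesting itself: FormRequest.from_response() reads a form out of the page and pre-populates formdata with its input values, much as parse_again does by hand above. A minimal sketch, assuming a login page whose form carries hidden CSRF inputs (the field names are placeholders):

from scrapy.http import FormRequest

def parse_login_page(self, response):
        # from_response() copies the form's hidden <input> values into the
        # request body, so only the visible fields need to be supplied here.
        return FormRequest.from_response(
            response,
            formdata={"email": "user@example.com", "password": "secret"},
            callback=self.after_login,
        )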
Example #16
Source File: intel_spider.py    From uefi-spider with MIT License
def parse(self, response):
    url = "https://downloadcenter.intel.com/SearchResult.aspx?lang=eng"

    search_form = {
      "search_downloads": ".BIO",
      "ctl00$body$submit_search_downloads": "Search downloads",
      "ctl00$body$searchKeyword": "BIO"
    }

    return [FormRequest(url= url, method= "POST",
      formdata= search_form, callback= self.parse_form)] 
Example #17
Source File: hp_spider.py    From uefi-spider with MIT License
def parse_accept(self, response):
    ### At the search form, begin to generate monthly searches, alert if >100 results.
    sel = Selector(response)

    ### This will select the REAL url (with appended query string "tokens").
    url_path = ""
    forms = sel.xpath("//form")
    for form in forms:
      form_ids = form.xpath("@id").extract()
      if len(form_ids) == 0: 
        continue
      if form_ids[0] == "refineSearchForm":
        url_path = form.xpath("@action").extract()[0]

    ### The search load-balances
    domain = response.url[len("http://"):response.url.find(".")]

    url = "http://%s.www2.hp.com/%s"
    form_data = {
      "didYouMean": "",
      "searchCrit": "allwords",
      "docType":"Drivers",
      #"docType":"Patch",
      "dateRange":"all",
      "dateSearchType":"dateRange",
      "startDateYear": None,
      "startDateMonth": None,
      "startDateDay": "1",
      "endDateYear": None,
      "endDateMonth": None,
      "endDateDay":"1",
      "resPerPage":"100",
      "sortCrit":"date",
      "showSummary":"yesX",
      "calledBy":"Search_Main",
      "mode":"text",
      "searchString":"BIOS Update",
      "searchRes":"Search",
      "advSearchFlag":"true",
    }

    ### Pull off the remaining searches, and fill in vars for the 'next' search.
    remaining_searches = response.meta["searches"]

    form_data["startDateYear"] = str(remaining_searches[0][0])
    form_data["startDateMonth"] = str(remaining_searches[0][1])
    form_data["endDateYear"] = str(remaining_searches[0][2])
    form_data["endDateMonth"] = str(remaining_searches[0][3])

    return FormRequest(url= url % (domain, url_path) + "&month=%d&year=%d" % (remaining_searches[0][1], remaining_searches[0][0]), 
      headers= {"Content-Type": "application/x-www-form-urlencoded"},
      formdata= form_data, method= "POST", cookies= self.cookies,
      meta= {"searches": remaining_searches[1:], "this": (form_data["startDateYear"], form_data["startDateMonth"], form_data["endDateYear"], form_data["endDateMonth"])},
      dont_filter= True,
      callback= self.parse_search)
Example #18
Source File: lagou.py    From IPProxyTool with MIT License
def start_requests(self):
        count = self.sql.get_proxy_count(self.name)
        count_httpbin = self.sql.get_proxy_count(config.httpbin_table)

        ids = self.sql.get_proxy_ids(self.name)
        ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)

        for i in range(0, count + count_httpbin):
            table = self.name if (i < count) else config.httpbin_table
            id = ids[i] if i < count else ids_httpbin[i - len(ids)]

            proxy = self.sql.get_proxy_with_id(table, id)
            if proxy is None:
                continue

            for url in self.urls:
                cur_time = time.time()
                yield FormRequest(
                        url = url,
                        headers = self.headers,
                        method = 'POST',
                        meta = {
                            'cur_time': cur_time,
                            'download_timeout': self.timeout,
                            'proxy_info': proxy,
                            'table': table,
                            'id': proxy.id,
                            'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
                            'vali_count': proxy.vali_count,
                        },
                        cookies = {
                            'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488937030',
                            '_ga': 'GA1.2.40497390.1488937014',
                            'TG-TRACK-CODE': 'search_code',
                            'index_location_city': '%E5%8C%97%E4%BA%AC',
                            'LGRID': '20170308093710-bf6755eb-039f-11e7-8025-525400f775ce',
                            'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488881288,1488936799,1488936947,1488937014',
                            'JSESSIONID': 'BDCBB6167F960CE43AF54B75A651F586',
                            'LGSID': '20170308093653-b59316f0-039f-11e7-9229-5254005c3644',
                            'LGUID': '20170308093653-b593185f-039f-11e7-9229-5254005c3644',
                            'user_trace_token': '20170308093654-723efcfac8fb4c28a670d073d5113e02',
                            'SEARCH_ID': '4db4dc3dea1c46b49018ae5421b53ffa'
                        },
                        formdata = {
                            'first': 'true',
                            'kd': 'ios',
                            'pn': '1',
                        },
                        dont_filter = True,
                        callback = self.success_parse,
                        errback = self.error_parse,
                )