我有一个小问题:我需要scrapy循环预定次数。这样做的原因是我正在提交POST请求并抓取结果。但是,结果不在一个页面上,因此需要再次POST,并且“cpipage”递增。 cpipage是页码。这是我的蜘蛛代码,我已将URL更改为nourl.com,因为这不是我正在抓取的网站。
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import FormRequest, Request
#from etmd.items import Etmditems
import scrapy
class EtmdSpider(Spider):
    """Spider that POSTs a search form to dp.asp and scrapes the result table.

    ``parse`` submits the form (a single hard-coded page, cpipage=4);
    ``parse_data`` extracts every ``<td>`` cell and appends it to exported.txt.
    """

    name = "etmd"
    start_urls = ["http://b2.nourl.com/dp.asp"]

    def parse(self, response):
        """Submit the search POST; results are handled by parse_data."""
        url = "http://b2.nourl.com/dp.asp"
        # "cpipage" selects which result page the server returns.
        payload = {"AppKey": "94921000e1999f84a518725", "ComparisonType1_1": "LIKE", "Value1_1": "", "MatchNull1_1" : "N", "ComparisonType2_1" : "LIKE", "MatchNull2_1" : "N", "Value2_1" : "", "ComparisonType3_1": "=", "MatchNull3_1" : "N", "Value3_1" : "", "x":"69", "y":"27", "FieldName1" : "County", "Operator1": "OR", "NumCriteriaDetails1": "1", "Operator2" : "OR", "NumCriteriaDetails2" : "1", "FieldName3": "Year", "Operator3" : "OR", "NumCriteriaDetails3": "1", "PageID" : "2", "GlobalOperator": "AND", "NumCriteria" : "3", "Search" : "1", "cpipage": "4"}
        return FormRequest(url, formdata=payload, callback=self.parse_data)

    def parse_data(self, response):
        """Extract all <td> text from the response and append it to exported.txt."""
        items = []
        sel = Selector(response)
        items.append(sel.xpath('//td').extract())
        # Fix: the original opened the file without ever closing it, leaking
        # one file handle per scraped page. A context manager guarantees the
        # handle is flushed and closed even if write() raises.
        with open("exported.txt", "a") as exportfile:
            exportfile.write(str(items))
        # Parenthesized form works under both Python 2 and Python 3
        # (the original bare `print items` is Python-2-only syntax).
        print(items)
因此在有效负载字典中我有cpipage
,在这种情况下是“4”,但是我需要它一直递增到175。有没有办法在我目前拥有的代码中实现这一点,或者通过脚本(而不是在shell中)运行scrapy蜘蛛?
我已经尝试了一个for循环:
# BUG: this loop never gets past its first iteration -- `return` exits the
# enclosing method immediately, so only ONE FormRequest (with cpipage == "0")
# is ever issued and the remaining 174 pages are never requested.
# The requests must be collected into a list or `yield`ed one by one instead.
for i in range(175):
url = "http://b2.nourl.com/dp.asp"
payload = {"AppKey": "94921000e1999f84a518725", "ComparisonType1_1": "LIKE", "Value1_1": "", "MatchNull1_1" : "N", "ComparisonType2_1" : "LIKE", "MatchNull2_1" : "N", "Value2_1" : "", "ComparisonType3_1": "=", "MatchNull3_1" : "N", "Value3_1" : "", "x":"69", "y":"27", "FieldName1" : "County", "Operator1": "OR", "NumCriteriaDetails1": "1", "Operator2" : "OR", "NumCriteriaDetails2" : "1", "FieldName3": "Year", "Operator3" : "OR", "NumCriteriaDetails3": "1", "PageID" : "2", "GlobalOperator": "AND", "NumCriteria" : "3", "Search" : "1", "cpipage": "%i" %i}
return (FormRequest(url, formdata = payload, callback = self.parse_data))
答案 0 :(得分:1)
return
语句将立即退出该方法。
您应该返回所有请求的列表:
def parse(self, response):
    """Build one FormRequest per result page and hand the whole batch to Scrapy.

    Returns a list of 175 POST requests whose "cpipage" field runs "0".."174";
    each response is processed by ``parse_data``.
    """
    url = "http://b2.nourl.com/dp.asp"

    def form_for(page):
        # Full search form, identical for every page except "cpipage".
        return {"AppKey": "94921000e1999f84a518725", "ComparisonType1_1": "LIKE", "Value1_1": "", "MatchNull1_1" : "N", "ComparisonType2_1" : "LIKE", "MatchNull2_1" : "N", "Value2_1" : "", "ComparisonType3_1": "=", "MatchNull3_1" : "N", "Value3_1" : "", "x":"69", "y":"27", "FieldName1" : "County", "Operator1": "OR", "NumCriteriaDetails1": "1", "Operator2" : "OR", "NumCriteriaDetails2" : "1", "FieldName3": "Year", "Operator3" : "OR", "NumCriteriaDetails3": "1", "PageID" : "2", "GlobalOperator": "AND", "NumCriteria" : "3", "Search" : "1", "cpipage": "%i" % page}

    # NOTE(review): range(175) yields pages 0..174 -- confirm against the
    # site whether numbering should instead start at 1 and include 175.
    return [FormRequest(url, formdata=form_for(page), callback=self.parse_data)
            for page in range(175)]
或者使用 yield 逐个产生请求:
def parse(self, response):
    """Yield one FormRequest per result page, letting Scrapy schedule them lazily.

    Generates 175 POST requests whose "cpipage" field runs "0".."174";
    each response is processed by ``parse_data``.
    """
    url = "http://b2.nourl.com/dp.asp"
    # All form fields except "cpipage" are loop-invariant, so build them once.
    base = {"AppKey": "94921000e1999f84a518725", "ComparisonType1_1": "LIKE", "Value1_1": "", "MatchNull1_1" : "N", "ComparisonType2_1" : "LIKE", "MatchNull2_1" : "N", "Value2_1" : "", "ComparisonType3_1": "=", "MatchNull3_1" : "N", "Value3_1" : "", "x":"69", "y":"27", "FieldName1" : "County", "Operator1": "OR", "NumCriteriaDetails1": "1", "Operator2" : "OR", "NumCriteriaDetails2" : "1", "FieldName3": "Year", "Operator3" : "OR", "NumCriteriaDetails3": "1", "PageID" : "2", "GlobalOperator": "AND", "NumCriteria" : "3", "Search" : "1"}
    for page in range(175):
        payload = dict(base, cpipage="%i" % page)
        yield FormRequest(url, formdata=payload, callback=self.parse_data)