我使用scrapy使用FormRequest登录网站,但由于该网站是在ASP.Net中构建的,为了分页,我也必须使用FormRequest,代码运行正常,但我不知道如何因为我是scrapy的新手,所以将它配置为分页。当我在parse_item()中使用return时,它可以工作,但是我想将它用于分页,当我在parse_item()中使用yield时,我得到了这个错误。
2016-08-29 20:44:59 [scrapy] ERROR: Spider must return Request, BaseItem, dict or None, got 'list' in <GET https://recruiter.cwjobs.co.uk/Recruitment/CandidateSearch/CandidateSearchResults.aspx?SalaryRateTypeId=1&SalaryRangeIds=17%2c18%2c19%2c20%2c21%2c22%2c23%2c24%2c25%2c26&LastActivityId=15&Radius=-1&JobTypeProfile=20&LimitSearch=True&CandidateSearchCriteria=vKYIkjLZq5Af6OEmkANngg%3d%3d&scr=1&iasc=0>
这是我的代码
import scrapy
from scrapy.http import Request
from scrapy.http import FormRequest
from cwjobs.items import CwjobsItem
class RecruiterSpider(scrapy.Spider):
name = "recruiter"
allowed_domains = ["recruiter.cwjobs.co.uk"]
start_urls = (
'https://recruiter.cwjobs.co.uk/loginReturnUrl=%2fhome%3fRHP%3dnav_bar_SignIn/',)
def start_requests(self):
return [FormRequest("https://recruiter.cwjobs.co.uk/login/",formdata={"__EVENTTARGET":"","__EVENTARGUMENT":"","__VIEWSTATE":"QI2hCUmnX2GZ+vtA2RoynX1rSOZ0LG+0ixQlSPqGcTM9qCheVZwbfaMtPeQAfiQCmM/aJhVjQ7bljYbGfVUEhzVsDaNRB+3qBuOc+SYZ+pHoSk2s0cFz6f5ODgqv/6Jj12bUs7OKnyIa8mlPo+xfmhS+oWroHnJyfPvBAGZkInpW5EcmmKqHD2Ede0XdsH2mMM4nPIy+PRsGW1ZeVd6HifZC1RG9bFXlunoIlQDNhDQeOpRmVdcRroybtCCp+1jLrH4EOGKfOCQ+o2WFGBfldPfS1AHGXL9tDHwvrol4Cx/nK01y1E27PWobQ2RlUXINMBNditfn3qTKCKlGRSLHMJ+PpfZJv1ncmNTvtV+kR1O5vTLbw03Ct3HMzw4GI/zmwojQqUXa0Z4vAoe6bqkzZpm1qKtzzsdpsp5uLTaGiv3SAlDXrK/vuvCFGMqZTMAoqJ47WluyIFsA3Y4dak69mF/UMH3+Foizgh+37IHrL6hM2v17NyvfMAgJXncASJ6P85t8R3Xr2Q4Z1kEbKna1Qi4yINI+wrSmZSSdcTnw3oiklUBCATmFbbnPdhNbr9AIK3lm7hu8OxrXRDRjsOulpB5BgS0Xu4O/8G0A4UNWlLGFoaNdOa/P8UZFvTiRL0uZJR1bL9QImr7DT5ChOPPh4Xzf4KdmB/L7/gRiQlhxQ6ek4BxcjruN3sZ6eFNrEAAbFGMuxevrFlBM+FFvwHEOEK03pYtBjrDhGTVeujLJO7TCetqUZ7+PVGs17by20kkOEMvOFKx9mTeW4oFzbqAUQvQjhW+hSEVmNvRzw+lhov0v1OUcrTdGL6C6sk9jKUALgiyOWEabMSGqoWA5eVQEyiFXVuAQ5AJZcKeQ13wDGZ1HFXj/dlE+jA6p0E/FfEc+A5T69bTN6zjvCwkew/DxJxmxBBBxxnMhgbn8qnpbVRkJj9cg/uTJoD7zI7WWnUTK9neMdPCLGa1MPvXNV/YkCGgswrGqKk9B4eWdGQHqhJJj+Fgb7uW1ZycnuyBoHup8rpKEx1wz56voovTuVRBFk60CHv8MDMcmqAbXujUGwKgZCraUVtAgV10eTG8emVCGGAE5LOkl8eo1h7iV/VWZieE3H+VgD7hucFv2Ny7pzqrxZ68xZEu2F7MQgKL92uKGsNyrHcjTwtCcorYoTIXTGOAlZo3FA5LXL2XFAmVCHH3smh0r2yQyQitQ7oaqVX2jgTL4HdXTVny9Qf5pdkDlHneSCmkMVN45ILhmpTWKj27kpSK/QlYvoG+cvKKdXW2wWJ5ZZ2sqHqH4lWNVmgARYG8JDIXLNRRHv+S5MBGg0hQ6llYrparx6azMop5cx3AeMssimtPJvl+FvcNyqpAZpMsiXEpTBWlHUHdyO3PCq8yYpE4SoOn7NmiVqDE69c2z1/pHlH0fQDUsa7UsKHOAHtyznX0E29q8r0zNJEpNhUH/uX/6G7syXljeOB0P1XVTRbZmL8mFBCMxMPCt/vFi+MPKgr2aPlT7RPv+yy4bILRavikMOKFJ+Cf2Q3r0J60feH+bKISzib9VPvfdj2qudb0Ctt7XbTi0vWKmikStwMwZiVlZlpHImSgmokCC7T988NFHhGw+84Kxc7r8CyBTdfqC2flZpCM5VqY1q1kw/YklVnsm0Uv2FBT0gy4kAQxgOw9D4aA3Ahqr7dWiDDiGPc5/U/ci3D6v9pbbCg3rOGAI4zEUFli0n3OPjEIwCzRi3KVkgSenZjGcTNEtA/sqL8WuMzxv9dIporx76Iwxy6D8wPbWogn30WcHfqR2VWoPvH4Q1fz/4a1hnY9P6N8Y3AEVKrc9fnRaQu/LNQAQajqU5PqLAVmZgbJo4w8M839nQk+nxO+vkidRxU0hONe7dgADn9mqYf4ss0ITvzEvoLdFv9DdjcBVXh/ZxFZZeVZAZ0B+bXQ3Sf7oMEmZSL0rBxq47EG1MDLksHnQZF0VbOPsdsJpKK770zbcAe4yLgVRye6RGxObQfOWaJVGhZXjMnk8+HEspMLLLj3jUKPkHMUbK7mvjWs3A2o0Z4g=","__VIEWSTATEGENERATOR":"607E8F97","LoginPanel$txtUsername":"*******","LoginPanel$txtPassword":"*********","LoginPanel$btnSubmit":"Sign in","Register$txtFirstName":"","Register$txtLastName":"","Register$txtCompanyName":"","Register$txtBusinessPhone":"","Register$txtEmailAddress":"","Register$txtPassword":"","Register$txtPasswordConfirm":"","Register$txtCharityNumber":"","txtReminderUsername":""})]
def parse(self,response):
print response.xpath("//h1[@class='account-name']/text()").extract()
return Request("https://recruiter.cwjobs.co.uk/Recruitment/CandidateSearch/CandidateSearchResults.aspx?SalaryRateTypeId=1&SalaryRangeIds=17%2c18%2c19%2c20%2c21%2c22%2c23%2c24%2c25%2c26&LastActivityId=15&Radius=-1&JobTypeProfile=20&LimitSearch=True&CandidateSearchCriteria=vKYIkjLZq5Af6OEmkANngg%3d%3d&scr=1&iasc=0", callback = self.parse_item)
def parse_item(self, response):
candsearch = response.xpath("//input[@id='CandidateSearchResults']/@value").extract()[0]
viewsgenerator = response.xpath("//input[@id='__VIEWSTATEGENERATOR']/@value").extract()[0]
print viewsgenerator
newsearch = response.xpath("//input[@id='NewSearchCriteria']/@value").extract()[0]
searchcriteria = response.xpath("//input[@id='CandidateSearchCriteria']/@value").extract()[0]
viewstate = response.xpath("//input[@id='__VIEWSTATE']/@value").extract()[0]
for i in range(1, 3):
print i
data = {"__EVENTTARGET":"ctl00$cphCentralPanel$ucSearchResults$pgrPager","__EVENTARGUMENT":str(i),"CandidateSearchCriteria":searchcriteria,"NewSearchCriteria":newsearch,"Keywords":"","CandidateSearchResults":candsearch,"__LASTFOCUS":"","__VIEWSTATE":viewstate,"__VIEWSTATEGENERATOR":viewsgenerator,"ctl00$cphCentralPanel$NewOrExistingSavedSearch":"rdoNewSavedSearch", "ctl00$cphCentralPanel$txtSavedSearchName":"","ctl00$cphCentralPanel$ucSearchResults$hdnPopoverLinkClicked":"","ctl00$cphCentralPanel$ucSearchResults$ucFacetedSearch$txtBoolean":"","ctl00$cphCentralPanel$ucSearchResults$ucFacetedSearch$hdnIsAutosuggestChosen":"0","ctl00$cphCentralPanel$ucSearchResults$ucFacetedSearch$searchTypePart$qsSearchType":"rbProfileAndCV", "ctl00$cphCentralPanel$ucSearchResults$ucFacetedSearch$txtPostcode":"","ctl00$cphCentralPanel$ucSearchResults$ucFacetedSearch$ddlRadius":"-1", "ctl00$cphCentralPanel$ucSearchResults$ucFacetedSearch$qsLoc":"rdoPostcode","ctl00$cphCentralPanel$ucSearchResults$ucFacetedSearch$ddlLastActivity":"15","ctl00$cphCentralPanel$ucSearchResults$ddlSort":"Relevancy#0", "ctl00$cphCentralPanel$ucSearchResults$ddlPageSize":"50"}
request = [FormRequest.from_response(response, formdata = data, callback = self.parse_page2)]
yield request
def parse_page2(self,response):
li = response.xpath("//div[@class = 'row card-row']")
for l in li:
item = CwjobsItem()
firstname = l.xpath(".//a[@class='candidate-lnk']//span[@class='firstName']/text()").extract()
lastname = l.xpath(".//a[@class='candidate-lnk']//span[@class='lastName']/text()").extract()
item['name'] = firstname + lastname
det = l.xpath(".//div[@id='current-expected-row']")
for d in det:
currs = d.xpath(".//li[contains(@id, 'CurrentSalary')]/span/text()").extract()
if currs:
item['currs'] = currs[0].strip()
currjobt = d.xpath(".//li[contains(@id, 'CurrentJobTitle')]/span/text()").extract()
if currjobt:
item['currjobt'] = currjobt[0].strip()
Experience = d.xpath(".//li[contains(@id, 'Experience')]/span/text()").extract()
if Experience:
item['Experience'] = Experience[0].strip()
Desiredjob = d.xpath(".//li[contains(@id, 'DesiredJobTitle')]/span/text()").extract()
if Desiredjob:
item['Desiredjob'] = Desiredjob[0].strip()
Desireds = d.xpath(".//li[contains(@id, 'DesiredSalary')]/span/text()").extract()
if Desireds:
item['Desireds'] = Desireds[0].strip()
DesiredLoc = d.xpath(".//li[contains(@id, 'DesiredLocations')]/span/text()").extract()
if DesiredLoc:
item['DesiredLoc'] = DesiredLoc[0].strip()
phone = l.xpath("//span[@class='action-span hiddendata']/@data-hiddendataurl").extract()
if phone:
item['phonel'] = "https://recruiter.cwjobs.co.uk"+ phone[0]
cvl = l.xpath("//a[@class='action-link view-cv-icon cv-action-button']/@href").extract()
if cvl:
item['cvl'] = "https://recruiter.cwjobs.co.uk"+ cvl[0]
emaillink = l.xpath("//a[@class='hiddendata action-link email-candidate']/@data-hiddendataurl").extract()
if emaillink:
emaillink = "https://recruiter.cwjobs.co.uk" + emaillink[0]
item['email'] = emaillink
# request.meta['item'] = item
# yield request
# return
# yield Request(item['cvl'])
# item['email'] = [response.body]
return item
# def parse_page(self,response):
# # item = response.meta['item']
# item['email'] = response.body
# yield item
我该如何解决这个问题?
答案 0 :(得分:0)
当scrapy要求scrapy.Item
或scrapy.Request
时,您会返回一个列表。
罪魁祸首:
request = [FormRequest.from_response(response, formdata = data, callback = self.parse_page2)]
return request
要解决此问题,您应该不要将请求放入列表中,也不要遍历它并生成每个元素。
request = FormRequest.from_response(response, formdata = data, callback = self.parse_page2)
return request
# or
requests = [FormRequest.from_response(response, formdata = data, callback = self.parse_page2)]
for r in requests:
yield r