So I have my scraper using a FormRequest, and I can even see the terminal print out the scraped data from this single-page version:
# assuming the usual imports for this (older) Scrapy API:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import FormRequest
from scrapy.utils.response import open_in_browser
from myproject.items import swimItem  # adjust to your project's items module

class MySpider(BaseSpider):
    name = "swim"
    start_urls = ["example.website"]
    download_delay = 30.0  # per-spider delay (DOWNLOAD_DELAY is the settings.py name)

    def parse(self, response):
        return [FormRequest.from_response(response, formname="TTForm",
                    formdata={"Ctype": "A", "Req_Team": "", "AgeGrp": "0-6",
                              "lowage": "", "highage": "", "sex": "W",
                              "StrkDist": "10025", "How_Many": "50",
                              "foolOldPerl": ""},
                    callback=self.swimparse1, dont_click=True)]

    def swimparse1(self, response):
        open_in_browser(response)
        hxs = Selector(response)
        rows = hxs.xpath(".//tr")
        items = []
        for row in rows[4:54]:
            item = swimItem()
            item["names"] = row.xpath(".//td[2]/text()").extract()
            item["age"] = row.xpath(".//td[3]/text()").extract()
            item["free"] = row.xpath(".//td[4]/text()").extract()
            item["team"] = row.xpath(".//td[6]/text()").extract()
            items.append(item)
        return items
However, when I add a second FormRequest callback, it only scrapes the items from the second one. It also only prints the second page's scrape, as if it skips the first page's scrape entirely:
class MySpider(BaseSpider):
    name = "swim"
    start_urls = ["example.website"]
    download_delay = 30.0

    def parse(self, response):
        return [FormRequest.from_response(response, formname="TTForm",
                    formdata={"Ctype": "A", "Req_Team": "", "AgeGrp": "0-6",
                              "lowage": "", "highage": "", "sex": "W",
                              "StrkDist": "10025", "How_Many": "50",
                              "foolOldPerl": ""},
                    callback=self.swimparse1, dont_click=True)]

    def swimparse1(self, response):
        open_in_browser(response)
        hxs = Selector(response)
        rows = hxs.xpath(".//tr")
        items = []
        for row in rows[4:54]:
            item = swimItem()
            item["names"] = row.xpath(".//td[2]/text()").extract()
            item["age"] = row.xpath(".//td[3]/text()").extract()
            item["free"] = row.xpath(".//td[4]/text()").extract()
            item["team"] = row.xpath(".//td[6]/text()").extract()
            items.append(item)
        # note: the items list built above is discarded -- only the request is returned
        return [FormRequest.from_response(response, formname="TTForm",
                    formdata={"Ctype": "A", "Req_Team": "", "AgeGrp": "0-6",
                              "lowage": "", "highage": "", "sex": "W",
                              "StrkDist": "40025", "How_Many": "50",
                              "foolOldPerl": ""},
                    callback=self.swimparse2, dont_click=True)]

    def swimparse2(self, response):
        open_in_browser(response)
        hxs = Selector(response)
        rows = hxs.xpath(".//tr")
        items = []
        for row in rows[4:54]:
            item = swimItem()
            item["names"] = row.xpath(".//td[2]/text()").extract()
            item["age"] = row.xpath(".//td[3]/text()").extract()
            item["fly"] = row.xpath(".//td[4]/text()").extract()
            item["team"] = row.xpath(".//td[6]/text()").extract()
            items.append(item)
        return items
My guesses:
A) How can I export or return the items from the first scrape into the second scrape, so that I end up with all the item data as if it had been scraped from a single page?
B) Or, if the first scrape is being skipped entirely, how do I stop it from being skipped and pass those items along to the next callback?
Thanks!
PS: Also, I've tried using:
item = response.request.meta = ["item]
item = response.request.meta = []
item = response.request.meta = ["names":item, "age":item, "free":item, "team":item]
All of these produce key errors or other raised exceptions.
I also tried modifying the form request to include meta={"names": item, "age": item, "free": item, "team": item}. That doesn't cause an error, but it doesn't scrape or store anything either.
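For reference, the way request metadata is normally used in Scrapy is to attach the data to the outgoing request via its meta dict and read it back from response.meta in the next callback. A minimal sketch of that pattern, reusing the method and field names from the spider above (the formdata dict is abbreviated here):

    def swimparse1(self, response):
        items = []
        # ... fill items from the first results page, as above ...
        # attach the accumulated items to the next request
        return FormRequest.from_response(response, formname="TTForm",
                   formdata={"StrkDist": "40025"},  # abbreviated; use the full dict
                   meta={"items": items},
                   callback=self.swimparse2, dont_click=True)

    def swimparse2(self, response):
        # recover the first page's items from the request metadata
        items = response.meta["items"]
        # ... append the second page's items to the same list ...
        return items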
EDIT: I tried using yield like this:
class MySpider(BaseSpider):
    name = "swim"
    start_urls = ["www.website.com"]
    download_delay = 30.0

    def parse(self, response):
        open_in_browser(response)
        hxs = Selector(response)
        rows = hxs.xpath(".//tr")
        items = []
        for row in rows[4:54]:
            item = swimItem()
            item["names"] = row.xpath(".//td[2]/text()").extract()
            item["age"] = row.xpath(".//td[3]/text()").extract()
            item["free"] = row.xpath(".//td[4]/text()").extract()
            item["team"] = row.xpath(".//td[6]/text()").extract()
            items.append(item)
        yield [FormRequest.from_response(response, formname="TTForm",
                   formdata={"Ctype": "A", "Req_Team": "", "AgeGrp": "0-6",
                             "lowage": "", "highage": "", "sex": "W",
                             "StrkDist": "10025", "How_Many": "50",
                             "foolOldPerl": ""},
                   callback=self.parse, dont_click=True)]
        for row in rows[4:54]:
            item = swimItem()
            item["names"] = row.xpath(".//td[2]/text()").extract()
            item["age"] = row.xpath(".//td[3]/text()").extract()
            item["fly"] = row.xpath(".//td[4]/text()").extract()
            item["team"] = row.xpath(".//td[6]/text()").extract()
            items.append(item)
        yield [FormRequest.from_response(response, formname="TTForm",
                   formdata={"Ctype": "A", "Req_Team": "", "AgeGrp": "0-6",
                             "lowage": "", "highage": "", "sex": "W",
                             "StrkDist": "40025", "How_Many": "50",
                             "foolOldPerl": ""},
                   callback=self.parse, dont_click=True)]
Still doesn't scrape anything. I know the XPaths are correct, because when I scrape just one form (using return instead of yield) it works perfectly. I've read the docs, and they just aren't much help :(
Answer (score: 3):
You're missing a very simple solution: change return to yield.
Then you don't have to accumulate items in a list; just yield as many items and requests as you want from your function, and Scrapy will do the rest.
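Applied to the spider in the question, that looks roughly like this (a sketch reusing the question's form fields and XPaths; note that each yield must produce a single item or request, never a list, which is also why the yield [FormRequest(...)] variant above scraped nothing):

    def swimparse1(self, response):
        hxs = Selector(response)
        for row in hxs.xpath(".//tr")[4:54]:
            item = swimItem()
            item["names"] = row.xpath(".//td[2]/text()").extract()
            item["age"] = row.xpath(".//td[3]/text()").extract()
            item["free"] = row.xpath(".//td[4]/text()").extract()
            item["team"] = row.xpath(".//td[6]/text()").extract()
            yield item  # each item goes straight to Scrapy's pipeline
        # then request the second form; its items are yielded by swimparse2
        yield FormRequest.from_response(response, formname="TTForm",
                  formdata={"Ctype": "A", "Req_Team": "", "AgeGrp": "0-6",
                            "lowage": "", "highage": "", "sex": "W",
                            "StrkDist": "40025", "How_Many": "50",
                            "foolOldPerl": ""},
                  callback=self.swimparse2, dont_click=True)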
From the Scrapy docs:
from scrapy.selector import Selector
from scrapy.spider import Spider
from scrapy.http import Request
from myproject.items import MyItem

class MySpider(Spider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = [
        'http://www.example.com/1.html',
        'http://www.example.com/2.html',
        'http://www.example.com/3.html',
    ]

    def parse(self, response):
        sel = Selector(response)

        # yield each item as soon as it is built
        for h3 in sel.xpath('//h3').extract():
            yield MyItem(title=h3)

        # and yield follow-up requests from the same callback
        for url in sel.xpath('//a/@href').extract():
            yield Request(url, callback=self.parse)