我的蜘蛛看起来像这样/;
class ScrapeMovies(scrapy.Spider):
start_urls = [
'https://www.trekearth.com/members/page1.htm?sort_by=md'
]
def parse(self, response):
for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
item = loopitem()
website = row.xpath('./td[2]//a/@href/text()').extract_first()
item['name'] = row.xpath('./td[2]//a/text()').extract_first()
yield item
# This part is responsible for scraping all of the pages on a start url commented out for convinience
# next_page=response.xpath('//div[@class="page-nav-btm"]/ul/li[last()]/a/@href').extract_first()
# if next_page is not None:
# next_page=response.urljoin(next_page)
# yield scrapy.Request(next_page, callback=self.parse)
它知道它擦除表格所做的事情(参见起始网址)。我希望它然后去链接(成员名称列),然后从这个链接中提取一些信息(链接是例如https://www.trekearth.com/members/monareng/)并将其作为项目返回。
我该如何处理?
如果有任何不清楚的地方,请不要犹豫要求澄清。
修改 nowy我的代码看起来如下(但仍然不起作用):
class ScrapeMovies(scrapy.Spider):
name='final'
start_urls = [
'https://www.trekearth.com/members/page1.htm?sort_by=md'
]
def parse(self, response):
for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
item = FinalItem()
website = row.xpath('./td[2]//a/@href/text()').extract_first()
item['name'] = row.xpath('./td[2]//a/text()').extract_first()
request = scrapy.Request(website,
callback=self.parse_page2)
request.meta['item'] = item
return request
def parse_page2(self, response):
item = response.meta['item']
item['other_url'] = response.url
item['groups'] = response.xpath('//div[@class="groups-btm"]/ul/li/text()').extract_first()
return item
答案 0 :(得分:0)
使用meta字段将项目转发到下一个回调
def parse_page1(self, response):
item = MyItem(main_url=response.url)
request = scrapy.Request("http://www.example.com/some_page.html",
callback=self.parse_page2)
request.meta['item'] = item
return request
def parse_page2(self, response):
item = response.meta['item']
item['other_url'] = response.url
return item
UPD:要处理所有行,请在循环中使用yield
for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
item = FinalItem()
website = row.xpath('./td[2]//a/@href/text()').extract_first()
item['name'] = row.xpath('./td[2]//a/text()').extract_first()
request = scrapy.Request(website,
callback=self.parse_page2)
request.meta['item'] = item
yield request