在 parse() 方法中,爬虫提取了 4 个网址,并把它们交给 parse_dir_contents() 方法去抓取数据。
但实际上只有第 4 个网址被抓取了,我不明白为什么其他 3 个网址没有被抓取?
import scrapy
from v_one.items import VOneItem
import json
class linkedin(scrapy.Spider):
    """Spider that follows links from a LinkedIn directory page to profile pages.

    ``parse`` handles the directory page listed in ``start_urls`` and schedules
    one request per link found under the ``#seo-dir`` element. Each of those
    responses is handled by ``parse_dir_contents``, which fills a ``VOneItem``.
    """

    name = "linkedin"
    allowed_domains = ["linkedin.com"]
    start_urls = [
        "https://in.linkedin.com/directory/people-s-1-2-4/",
    ]

    def parse(self, response):
        """Yield a ``scrapy.Request`` for every link in the ``#seo-dir`` list.

        Each href is resolved against the response URL and routed to
        ``parse_dir_contents``.
        """
        for href in response.xpath('//*[@id="seo-dir"]/div/div/div/ul/li/a/@href'):
            url = response.urljoin(href.extract())
            # Parenthesized single-argument form prints the same text under
            # Python 2 and is valid syntax under Python 3.
            print("________________" + url)
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Yield one ``VOneItem`` per ``#profile`` element found in the response.

        Nothing is yielded when the page has no ``#profile`` element.
        """
        for _profile in response.xpath('//*[@id="profile"]'):
            url = response.url
            print("____________" + url)
            item = VOneItem()
            # Every expression below starts with ``//``, so it is evaluated
            # against the whole document, not relative to the matched
            # ``#profile`` node. Querying ``response`` directly makes that
            # explicit without changing the result.
            item['name'] = response.xpath('//*[@id="name"]/text()').extract()
            item['headline'] = response.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
            # NOTE(review): these paths include ``tbody``; browsers insert that
            # element when rendering tables, so confirm it exists in the raw
            # HTML that Scrapy receives.
            item['current'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
            item['education'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
            item['link'] = url
            yield item
答案 0 :(得分:0)
检查页面后,我认为 parse_dir_contents 函数中不需要 for 循环。
请把该函数改成如下形式:
def parse_dir_contents(self, response):
    """Build and return a single ``VOneItem`` from a profile page response.

    Each field is filled with the extracted result of a document-wide XPath
    query; ``link`` is set to the URL of the response itself.
    """
    # (item field, XPath expression) pairs, applied in this order.
    field_queries = (
        ('name', '//*[@id="name"]/text()'),
        ('headline', '//*[@id="topcard"]/div/div/div/p/span/text()'),
        ('current', '//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()'),
        ('education', '//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()'),
    )

    profile = VOneItem()
    for field, query in field_queries:
        profile[field] = response.xpath(query).extract()
    profile['link'] = response.url
    return profile
并检查这是否解决了您的问题。