我有这段代码:
from logging import INFO
import scrapy
class LinkedInAnonymousSpider(scrapy.Spider):
    """Search LinkedIn's ``/pub/dir/`` directory by name and follow profile links.

    The names to look up come either from the ``first``/``last`` spider
    arguments or from a text file given as ``input``. Each non-blank line of
    that file starts with a full name; anything after the first tab is
    ignored.
    """

    name = "linkedin_anonymous"
    allowed_domains = ["linkedin.com"]
    start_urls = []
    base_url = "https://www.linkedin.com/pub/dir/?first=%s&last=%s&search=Search"

    def __init__(self, input=None, first=None, last=None, **kwargs):
        """Store the lookup arguments.

        Args:
            input: Path of a file holding one person per line (the name
                shadows the builtin, but it is the public spider argument).
            first: First name of a single person to look up.
            last: Last name of a single person to look up.
            **kwargs: Any further spider arguments, forwarded to
                ``scrapy.Spider``.
        """
        # Let the base class run its own initialisation.
        super(LinkedInAnonymousSpider, self).__init__(**kwargs)
        self.input = input  # source file name
        self.first = first
        self.last = last

    def _search_request(self, first, last):
        """Build the directory-search request for one first/last name pair."""
        url = self.base_url % (first, last)
        # Same request the deprecated Spider.make_requests_from_url(url) built:
        # default callback (parse) with duplicate filtering disabled.
        return scrapy.Request(url, dont_filter=True)

    def start_requests(self):
        """Yield one search request per person to look up.

        Raises:
            Exception: If neither a first/last pair nor an input file was
                supplied.
        """
        if self.first and self.last:  # taking input from command line parameters
            yield self._search_request(self.first, self.last)
        elif self.input:  # taking input from file
            self.log('Input from file: %s' % self.input, INFO)
            # Read everything up front so the file is closed before the
            # first request is handed to the engine.
            with open(self.input, 'r') as source:
                lines = source.readlines()
            for line in lines:
                if not line.strip():  # no blank line
                    continue
                full_name = line.split("\t")[0]
                # split() without arguments tolerates repeated, leading and
                # trailing whitespace in the name column.
                parts = full_name.split()
                if len(parts) < 2:
                    continue  # need both a first and a last name
                last = parts.pop()
                first = " ".join(parts)
                yield self._search_request(first, last)
        else:
            raise Exception('No input.')

    def parse(self, response):
        """Handle a search response and yield requests for full profile pages."""
        # if there is exactly one match the person's profile page is returned
        if response.xpath('//div[@class="profile-overview-content"]').extract():
            yield scrapy.Request(response.url, callback=self.parse_full_profile_page)
            return
        # extracting profile urls from search result
        for sel in response.css('div.profile-card'):
            # Person's full profile URL in LinkedIn
            url = sel.xpath('./*/h3/a/@href').extract_first()
            if not url:
                continue  # card without a profile link: skip instead of IndexError
            # urljoin() leaves absolute links untouched and resolves relative ones.
            yield scrapy.Request(response.urljoin(url), callback=self.parse_full_profile_page)
........
使用这段代码,我可以从 LinkedIn 获取一组人员的个人资料详细信息。
为此,我编写了如下的主程序:
import scrapy
import sys
from linkedin_anonymous_spider import LinkedInAnonymousSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor
if __name__ == "__main__":
    firstname = ['Hasan', 'James']
    lastname = ['Arslan', 'Bond']
    # Walk both lists in lockstep: one crawl per (first, last) pair.
    for first, last in zip(firstname, lastname):
        # A new CrawlerProcess is created for every person.
        crawler = CrawlerProcess(get_project_settings())
        spider = LinkedInAnonymousSpider()
        crawler.crawl(spider, [], first, last)
        # NOTE: start() is called once per iteration; the call made on the
        # second iteration is the one that fails with ReactorNotRestartable.
        crawler.start()
当循环进入第二次迭代时,我收到以下错误:
raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
我该如何解决问题?
感谢。
答案 0 :(得分:2)
您只能运行一个反应器(reactor),因此只需调用一次 crawler.start()。
请尝试将 crawler.start() 移到循环之外。
答案 1 :(得分:2)
这是一个正确的版本:
firstname = ['Hasan', 'James']
lastname = ['Arslan', 'Bond']

# A single CrawlerProcess is shared by every crawl below.
crawler = CrawlerProcess(get_project_settings())
for first, last in zip(firstname, lastname):
    # Register one crawl per person on the shared process.
    crawler.crawl(LinkedInAnonymousSpider, [], first, last)
# start() is called exactly once, after the loop.
crawler.start()