我的程序中有两个Spider,并且我正在使用Selenium从Macys抓取URL,并且我试图按顺序运行Spider。第一个蜘蛛抓取该URL并将其存储在文本文件中,第二个蜘蛛使用从文本文件存储的URL。
这是我的实际代码:
import scrapy
from scrapy.http import Request
from scrapy_selenium import SeleniumRequest
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
INITIAL_PATH = 'initialQueue.txt'
class MacysspiderSpider(scrapy.Spider):
name = 'macysSpider'
allowed_domains = ['macys.com']
start_urls = ['https://www.macys.com/']
def start_requests(self):
for url in self.start_urls:
yield Request(url, callback=self.initial_parse)
def initial_parse(self, response):
department = response.xpath(
'//*[@id="flexid_118"]/a/span').extract()
department_url = response.xpath(
'//*[@id="flexid_118"]/a/@href').extract()
for (e_department, e_department_url) in zip(department, department_url):
absolute_url = ((response.urljoin(e_department_url)))
with open(INITIAL_PATH, 'a') as file:
file.write(absolute_url + '\n')
DEPARTMENT_PATH = 'departmentQueus.txt'
with open(INITIAL_PATH) as f:
urls = f.read().splitlines()
class DepartmentSpider(scrapy.Spider):
name = 'department'
allowed_domains = ['macys.com']
start_urls = urls
def start_requests(self):
for url in self.start_urls:
yield Request(url, callback=self.department_parse)
def department_parse(self, response):
all_products = response.xpath(
'//*[@id="categoryTree"]/ul/li[4]/div[1]/div/div/ul/li[1]/a/@href').extract()
for url in all_products:
with open(DEPARTMENT_PATH, 'a') as file:
file.write(url + '\n')
configure_logging()
runner = CrawlerRunner(get_project_settings())
@defer.inlineCallbacks
def crawl():
yield runner.crawl(MacysspiderSpider)
yield runner.crawl(DepartmentSpider)
reactor.stop()
crawl()
reactor.run()
但是当我运行代码时,我得到的是“ twisted.internet.error.ReactorNotRestartable ”,有人可以建议我如何解决此问题。谢谢
我也尝试使用线程并进行了更改
reaactor.run()
Thread(target=reactor.run, args=(False,)).start()
我仍然遇到同样的问题。