I am trying to write code that can start a Scrapy crawl multiple times. Since the Twisted reactor cannot be restarted, I am using the multiprocessing module to spawn a new process, with all the Scrapy imports placed inside the target function. Here is the code I am using.
# spiders/lni.py
import scrapy
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher


class LniSpider(scrapy.Spider):

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    # ... name, parse(), spider_closed(), etc. omitted ...


# helpers/base_helper.py
import json
import os
from multiprocessing import Process, Queue

from spiders.lni import LniSpider


class BaseHelper():
    SPIDER = LniSpider
    proxies_path = None

    def run_crawler(self, q):
        # All Scrapy imports live inside the target function so the
        # spawned process sets up its own Twisted reactor.
        import scrapy
        from scrapy import Selector
        from scrapy.crawler import CrawlerProcess
        from scrapy.utils.project import get_project_settings

        settings = get_project_settings()
        if self.proxies_path:
            print("Using Proxies from file ", self.proxies_path)
            settings.set("DOWNLOADER_MIDDLEWARES", self.DOWNLOADER_MIDDLEWARES)
            settings.set("RETRY_TIMES", 10)
            settings.set("RETRY_HTTP_CODES", [500, 503, 504, 400, 403, 404, 408])
            settings.set("PROXY_MODE", 0)
            settings.set("PROXY_LIST", self.proxies_path)
        settings.set("DOWNLOAD_DELAY", 0.050)
        print("---------------------------------\n\n\n\n")
        process = CrawlerProcess(settings)
        self.SPIDER.temp_file_folder = self.temp_path
        process.crawl(self.SPIDER)
        process.start()
        q.put(True)

    def crawl(self):
        queue = Queue()
        p = Process(target=self.run_crawler, args=(queue,))
        p.start()
        p.join()
        result = queue.get()
        with open(self.temp_file_path, "r") as data_file:
            data = json.load(data_file)
        os.remove(self.temp_file_path)
        return data
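For context, main.py is roughly the following (simplified; the real file also sets temp_path / temp_file_path and other config before calling crawl()). The module path matches the traceback below:

# main.py (simplified)
from helpers.base_helper import BaseHelper

if __name__ == "__main__":
    helper = BaseHelper()
    data = helper.crawl()
    print(data)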
But running it produces this error:
(scraper) E:\scraper>python main.py
E:\scraper\spiders\lni.py:10: ScrapyDeprecationWarning: Importing from scrapy.xlib.pydispatch is deprecated and will no longer be supported in future Scrapy versions. If you just want to connect signals use the from_crawler class method, otherwise import pydispatch directly if needed. See: https://github.com/scrapy/scrapy/issues/1762
from scrapy.xlib.pydispatch import dispatcher
Traceback (most recent call last):
File "E:\scraper\helpers\base_helper.py", line 98, in crawl
p.start()
File "C:\Anaconda3\envs\scraper\lib\multiprocessing\process.py", line 105, in start
self._popen = self._Popen(self)
File "C:\Anaconda3\envs\scraper\lib\multiprocessing\context.py", line 223, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "C:\Anaconda3\envs\scraper\lib\multiprocessing\context.py", line 322, in _Popen
return Popen(process_obj)
File "C:\Anaconda3\envs\scraper\lib\multiprocessing\popen_spawn_win32.py", line 65, in __init__
reduction.dump(process_obj, to_child)
File "C:\Anaconda3\envs\scraper\lib\multiprocessing\reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
File "C:\Anaconda3\envs\scraper\lib\socket.py", line 185, in __getstate__
raise TypeError("Cannot serialize socket object")
TypeError: Cannot serialize socket object
(scraper) E:\scraper>Traceback (most recent call last):
File "<string>", line 1, in <module>
File "C:\Anaconda3\envs\scraper\lib\multiprocessing\spawn.py", line 99, in spawn_main
new_handle = reduction.steal_handle(parent_pid, pipe_handle)
File "C:\Anaconda3\envs\scraper\lib\multiprocessing\reduction.py", line 87, in steal_handle
_winapi.DUPLICATE_SAME_ACCESS | _winapi.DUPLICATE_CLOSE_SOURCE)
PermissionError: [WinError 5] Access is denied
I am running this code on Windows 10 inside an Anaconda virtual environment; the same code runs fine on Ubuntu. How can I restart the scraper without restarting the entire script?
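From the traceback, my guess is that the difference is multiprocessing's start method: Linux forks the parent, so nothing needs to be pickled, while Windows spawns a fresh interpreter and pickles the Process target and its args. Since target=self.run_crawler is a bound method, self (and whatever socket objects hang off it) gets pickled, which is what fails. Below is a stripped-down sketch of what I think the Windows-safe shape looks like, using a module-level target function that receives only picklable arguments (proxies_path and temp_path stand in for my real config, and the spiders.lni import path is my project layout); I have not verified that this is the right fix:

from multiprocessing import Process, Queue


def run_crawler(q, proxies_path, temp_path):
    # All Scrapy imports stay inside the child so each spawned
    # process builds its own Twisted reactor from scratch.
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from spiders.lni import LniSpider  # my spider module

    settings = get_project_settings()
    if proxies_path:
        settings.set("PROXY_LIST", proxies_path)
    LniSpider.temp_file_folder = temp_path
    process = CrawlerProcess(settings)
    process.crawl(LniSpider)
    process.start()  # blocks until the crawl finishes
    q.put(True)


if __name__ == "__main__":
    q = Queue()
    # Only picklable objects cross the process boundary here.
    p = Process(target=run_crawler, args=(q, None, "temp"))
    p.start()
    p.join()
    print(q.get())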