I am trying to use Scrapy in a threaded environment - see here:
GetJob.py
from threading import Thread

import MysqlConnector
import JobModel
import ProcessJob


class Job():
    def getJobs(self):
        mysql = MysqlConnector.Mysql()
        db = mysql.getConnection()
        cur = db.cursor()
        cur.execute("SELECT * FROM job WHERE status=0 OR days>0")
        print "Get new jobs"

        # Build JobModel objects from the result set
        joblist = []
        for row in cur.fetchall():
            job = JobModel.JobModel()
            job.id = row[0]
            job.user_id = row[1]
            job.name = row[2]
            job.url = row[3]
            job.api = row[4]
            job.max_pages = row[5]
            job.crawl_depth = row[6]
            job.processing_patterns = row[7]
            job.status = row[8]
            job.days = row[9]
            joblist.append(job)

        # Process each job in its own daemon thread
        for job in joblist:
            processJob = ProcessJob.ProcessJob()
            th = Thread(target=processJob.processJob, args=(job,))
            th.daemon = True
            th.start()

        db.close()
ProcessJob.py
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

import MysqlConnector
import spider


class ProcessJob():
    def processJob(self, job):
        # Mark the job as in progress
        mysql = MysqlConnector.Mysql()
        db = mysql.getConnection()
        cur = db.cursor()
        job.status = 1
        update = "UPDATE job SET status=1 WHERE id=" + str(job.id)
        cur.execute(update)
        db.commit()
        db.close()

        # Start the crawl and block until it finishes
        runner = CrawlerRunner()
        d = runner.crawl(spider.MySpider)
        d.addBoth(lambda _: reactor.stop())
        reactor.run()
When I run this, I get the following error:
Traceback (most recent call last):
File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 417, in fireEvent
DeferredList(beforeResults).addCallback(self._continueFiring)
File "/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py", line 317, in addCallback
callbackKeywords=kw)
File "/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py", line 306, in addCallbacks
self._runCallbacks()
File "/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py", line 588, in _runCallbacks
current.result = callback(current.result, *args, **kw)
--- <exception caught here> ---
File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 430, in _continueFiring
callable(*args, **kwargs)
File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 1189, in _reallyStartRunning
self._handleSignals()
File "/usr/local/lib/python2.7/site-packages/twisted/internet/posixbase.py", line 295, in _handleSignals
_SignalReactorMixin._handleSignals(self)
File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 1154, in _handleSignals
signal.signal(signal.SIGINT, self.sigInt)
exceptions.ValueError: signal only works in main thread
What I am doing is:
GetJobs retrieves the new jobs (i.e. new crawlers) from the database. If a job has status 0 (meaning it is waiting to be processed), it is handed to ProcessJob, which should then start the crawler. I can't actually include MySpider here (because of SO restrictions), but the spider works when I start it on the main thread.
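For context, a hypothetical minimal stand-in for spider.MySpider (the real spider is not shown in the question; the name, URL, and parse logic below are illustrative only):

import scrapy

class MySpider(scrapy.Spider):
    # Hypothetical placeholder - not the asker's actual MySpider
    name = "myspider"
    start_urls = ["http://example.com"]

    def parse(self, response):
        # Extract something trivial so the crawl completes
        yield {"url": response.url,
               "title": response.css("title::text").extract_first()}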
Any suggestions here?
Answer 0 (score: 0)

Solution:
reactor.run(0)
# This will turn off signals
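For clarity, reactor.run(0) passes 0 as Twisted's installSignalHandlers argument, so the reactor skips the signal.signal() calls that fail outside the main thread. A minimal sketch of how this could look inside processJob above, assuming the same CrawlerRunner setup as in the question:

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

import spider

def run_crawl_in_thread():
    # Running in a worker thread: do not install signal handlers,
    # since signal.signal() only works in the main thread.
    runner = CrawlerRunner()
    d = runner.crawl(spider.MySpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run(installSignalHandlers=False)  # equivalent to reactor.run(0)

Note that a Twisted reactor can only be started once per process, so this only helps the first thread that calls reactor.run(); subsequent jobs launched by GetJob.py would still need another approach.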