I am new to Scrapy.
I want to scrape pages in a loop: A -> B -> C -> A -> B -> C -> ...
However, the request I return from the item_scraped signal callback is never sent.
I don't understand why that callback's request is not triggered.
Here is my spider code:
import scrapy
from scrapy import signals
import time
import settings
from scrapy.loader.processors import MapCompose
from scrapy.loader import ItemLoader
from items import StudentID, StudentInfo
class GetidSpider(scrapy.Spider):
    name = "getid"
    custom_settings = {
        'ITEM_PIPELINES': {
            'pipelines.GetidPipeline': 300
        }
    }

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(GetidSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def __init__(self, login_id=None, login_pwd=None, Center=None):
        self.login_id = login_id
        self.login_pwd = login_pwd
        self.CENTER = Center

    def start_requests(self):
        yield scrapy.Request("https://sdszone1.e-wsi.com/index.jhtml", self.login)

    def login(self, response):
        return scrapy.FormRequest.from_response(
            response,
            formname='Logon',
            formdata={
                'login': self.login_id,
                'password': self.login_pwd
            },
            callback=self.get_student_id
        )

    def get_student_id(self, response):
        for title in response.xpath('//title/text()').extract():
            if title == "SDS : Main":
                self.student_info_count = 3
                return scrapy.Request('http://sdszone1.e-wsi.com/standard/followup/studyrecord/studentstudyrecord.jhtml',
                                      callback=self.print_student_info)

    def print_student_info(self, response):
        print self.student_info_count
        if self.student_info_count > 0:
            print "in if"
            yield scrapy.Request('http://sdszone1.e-wsi.com/standard/followup/studyrecord/contracts.jhtml?studentCode=18138',
                                 callback=self.save_student_info)
        else:
            print "in else"
            yield scrapy.Request('http://sdszone1.e-wsi.com/standard/index.jhtml')

    def save_student_info(self, response):
        print "in save_student_info"
        print response.xpath('//input[@type="hidden"][@name="profileId"]/@value').extract()
        if response.xpath('//input[@type="hidden"][@name="profileId"]/@value').extract() == "":
            yield scrapy.Request('http://sdszone1.e-wsi.com/standard/index.jhtml')
        else:
            student_info = ItemLoader(item=StudentInfo(), response=response)
            student_info.add_value('item_name', 'student_info')
            student_info.add_xpath('SDS_No', '//table/tr/td[@width="100%"][@class="text"]/text()', MapCompose(unicode.strip, unicode.title))
            student_info.add_xpath('StartLevel', '//table/tbody/tr/td[@class="text"][3]/text()', MapCompose(unicode.strip, unicode.title))
            student_info.add_xpath('EndLevel', '//table/tbody/tr/td[@class="text"][5]/text()', MapCompose(unicode.strip, unicode.title))
            student_info.add_xpath('ProEnglish', '//table/tbody/tr/td[@class="text"][8]/table/tbody/tr/td[2]/text()', MapCompose(unicode.strip, unicode.title))
            yield student_info.load_item()
            del student_info

    def item_scraped(self, item, spider):
        if self.student_count > 0:
            self.student_count -= 1
            print "in student_count"
        elif self.student_info_count > 0:
            self.student_info_count -= 1
            print "in student_info_count"
        return scrapy.Request('http://sdszone1.e-wsi.com/standard/index.jhtml', callback=self.print_student_info)

    def spider_closed(self, spider):
        print "SPIDER IS CLOSED"
And here is the log:
2016-11-19 18:42:36 [scrapy] INFO: Spider opened
2016-11-19 18:42:36 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-11-19 18:42:36 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-11-19 18:42:37 [scrapy] DEBUG: Crawled (404) <GET https://sdszone1.e-wsi.com/robots.txt> (referer: None)
2016-11-19 18:42:38 [scrapy] DEBUG: Crawled (200) <GET https://sdszone1.e-wsi.com/index.jhtml> (referer: None)
2016-11-19 18:42:38 [scrapy] DEBUG: Redirecting (meta refresh) to <GET https://sdszone1.e-wsi.com/standard/index.jhtml> from <POST https://sdszone1.e-wsi.com/index.jhtml?_DARGS=/index.jhtml.3&_dynSessConf=4369572730097781326>
2016-11-19 18:42:38 [scrapy] DEBUG: Redirecting (302) to <GET http://sdszone1.e-wsi.com/standard/index.jhtml> from <GET https://sdszone1.e-wsi.com/standard/index.jhtml>
2016-11-19 18:42:39 [scrapy] DEBUG: Crawled (200) <GET http://sdszone1.e-wsi.com/standard/index.jhtml> (referer: https://sdszone1.e-wsi.com/index.jhtml)
2016-11-19 18:42:39 [scrapy] DEBUG: Crawled (200) <GET http://sdszone1.e-wsi.com/standard/followup/studyrecord/studentstudyrecord.jhtml> (referer: http://sdszone1.e-wsi.com/standard/index.jhtml)
3
in if
2016-11-19 18:42:40 [scrapy] DEBUG: Crawled (200) <GET http://sdszone1.e-wsi.com/standard/followup/studyrecord/contracts.jhtml?studentCode=18138> (referer: http://sdszone1.e-wsi.com/standard/followup/studyrecord/studentstudyrecord.jhtml)
in save_student_info
[u'E530633464']
2016-11-19 18:42:40 [scrapy] DEBUG: Scraped from <200 http://sdszone1.e-wsi.com/standard/followup/studyrecord/contracts.jhtml?studentCode=18138>
None
in student_info_count
2016-11-19 18:42:40 [scrapy] INFO: Closing spider (finished)
SPIDER IS CLOSED
2016-11-19 18:42:40 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 3500,
'downloader/request_count': 7,
'downloader/request_method_count/GET': 6,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 18150,
'downloader/response_count': 7,
'downloader/response_status_count/200': 5,
'downloader/response_status_count/302': 1,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 11, 19, 9, 42, 40, 192000),
'item_scraped_count': 1,
'log_count/DEBUG': 9,
'log_count/INFO': 7,
'request_depth_max': 3,
'response_received_count': 5,
'scheduler/dequeued': 6,
'scheduler/dequeued/memory': 6,
'scheduler/enqueued': 6,
'scheduler/enqueued/memory': 6,
'start_time': datetime.datetime(2016, 11, 19, 9, 42, 36, 494000)}
2016-11-19 18:42:40 [scrapy] INFO: Spider closed (finished)
Done
[Finished in 5.6s]
Here is the pipeline code:
class GetidPipeline(object):
    def __init__(self):
        pass

    def process_item(self, item, spider):
        print item

    def __del__(self):
        pass
From the log it looks like only one page is scraped and then the spider finishes.
I don't understand what is going on.
Thank you.
Answer (score: 1):
Requests (and items) in Scrapy can only be processed by the crawler.engine object, which is why spider callback methods are handled internally by that object (without you noticing). This does not happen for signal handlers, pipelines, extensions, middlewares, and so on; it only applies to spider callback methods.
Normally, when you want to crawl a site and then return an item, you simply chain the requests, starting from the start_requests method, until the last callback returns an item. Still, you can also force Scrapy to add a Request to its engine with:
self.crawler.engine.crawl(
    Request(
        'http://sdszone1.e-wsi.com/standard/index.jhtml',
        callback=self.print_student_info,
    ),
    spider,
)
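Applied to the spider above, this would mean that item_scraped stops returning the Request (return values of signal handlers are simply discarded) and schedules it on the engine instead. A minimal sketch, assuming the same Scrapy 1.x engine.crawl(request, spider) signature shown above and keeping your counter logic:

def item_scraped(self, item, spider):
    if self.student_info_count > 0:
        self.student_info_count -= 1
        # A plain "return scrapy.Request(...)" is ignored in a signal handler,
        # so hand the request to the engine directly.
        self.crawler.engine.crawl(
            scrapy.Request(
                'http://sdszone1.e-wsi.com/standard/index.jhtml',
                callback=self.print_student_info,
                dont_filter=True,  # the loop revisits the same URL on every pass
            ),
            spider,
        )

Note that dont_filter=True matters here: since the loop requests the same URL again and again, the duplicate filter would otherwise drop every request after the first one.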