I am using Scrapy with Python 2.7. I need two yields in my class: the first to scrape the subcategory pages and the second for pagination. I need something like this:
from scrapy.http import Request
from scrapy.spider import BaseSpider


class myClass(BaseSpider):
    cmt = 0

    def __init__(self, *args, **kwargs):
        super(myClass, self).__init__(*args, **kwargs)

    def start_requests(self):
        start_urls = 'https://www.adresse.fr'
        yield Request(start_urls, callback=self.firstDef)

    def firstDef(self, response):
        body = response.css('body').extract_first()
        # put the body in a file
        if self.cmt > 10:
            url = 'https://www.theOtherAdresse.com'
            yield Request(url, callback=self.secondDef)
        print self.cmt
        self.cmt = self.cmt + 1
        yield Request(response.url, callback=self.firstDef)

    def secondDef(self, response):
        body = response.css('body').extract_first()
        # put the body in a file
        print "Finish"
What is wrong with my code? Why can't I get both yields to work?
Update
I read this and switched to CrawlSpider, but I still cannot get secondDef to be called.
Update
My code:
import io
import sys
import urlparse

from scrapy import log
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.spiders import CrawlSpider, Rule

reload(sys)
sys.setdefaultencoding('utf8')


class Myclass(CrawlSpider):
    name = 'myclass'
    allowed_domains = ["amazon.fr"]
    pageNumber = 0
    cmt = 0
    firstPage = True
    # note: this class does not define parse_page1, so these rule
    # callbacks cannot find a method to call
    rules = [
        Rule(LinkExtractor(restrict_xpaths=('//div[@id="mainResults"]//h3[@class="newaps"]/a',)),
             callback='parse_page1', follow=True),
        Rule(LinkExtractor(restrict_xpaths=('//div[@id="bottomBar"]/div[@id="pagn"]/span[@class="pagnLink"]/a',)),
             follow=True),
        Rule(LinkExtractor(restrict_xpaths=(
            '//div[@class="s-item-container"]//a[@class="a-link-normal s-access-detail-page a-text-normal"]',)),
             callback='parse_page1', follow=True),
    ]
    arrayCategories = []
    pageCrawled = []
    fileNumbers = 0
    first = 0
    start_urls = ['https://www.amazon.fr/s/ref=sr_nr_p_6_0?fst=as%3Aoff&rh=n%3A197861031%2Cn%3A!197862031%2Cn%3A212130031%2Cn%3A3008171031%2Cp_76%3A211708031%2Cp_6%3AA1X6FK5RDHNB96&bbn=3008171031&ie=UTF8&qid=1463074601&rnid=211045031',
                  'https://www.amazon.fr/s/ref=sr_nr_p_6_0?fst=as%3Aoff&rh=n%3A197861031%2Cn%3A!197862031%2Cn%3A212130031%2Cn%3A3008171031%2Cp_76%3A211708031%2Cp_6%3AA1X6FK5RDHNB96&bbn=3008171031&ie=UTF8&qid=1463074601&rnid=211045031',
                  'https://www.amazon.fr/s/ref=sr_nr_n_1/275-0316831-3563928?fst=as%3Aoff&rh=n%3A197861031%2Cn%3A%21197862031%2Cn%3A212130031%2Cn%3A3008171031%2Cp_76%3A211708031%2Cp_6%3AA1X6FK5RDHNB96%2Cn%3A212136031&bbn=3008171031&ie=UTF8&qid=1463075247&rnid=3008171031',
                  ]

    def __init__(self, idcrawl=None, iddrive=None, idrobot=None, proxy=None, *args, **kwargs):
        super(Myclass, self).__init__(*args, **kwargs)

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.parse)

    def parse(self, response):
        # note: CrawlSpider uses parse() internally to apply the rules,
        # so overriding it here keeps the rules above from ever running;
        # this request also targets the URL that was just crawled, so the
        # duplicate filter drops it unless dont_filter=True is passed
        yield Request(response.url, callback=self.parse_produit)
        hxs = HtmlXPathSelector(response)
        try:
            nextPageLink = hxs.select("//a[@id='pagnNextLink']/@href").extract()[0]
            nextPageLink = urlparse.urljoin(response.url, nextPageLink)
            self.log('\nGoing to next search page: ' + nextPageLink + '\n', log.DEBUG)
            yield Request(nextPageLink, callback=self.parse)
        except IndexError:
            # no next-page link on this page
            self.log('Whole category parsed: ', log.DEBUG)

    def parse_produit(self, response):
        print self.pageNumber
        body = response.css('body').extract_first()
        f = io.open('./amazon/page%s' % str(self.pageNumber), 'w+', encoding='utf-8')
        f.write(body)
        f.close()
        self.pageNumber = self.pageNumber + 1
Answer (score: 1)
I don't believe having two yields is your problem; I think it is the if self.cmt > 10 statement, which is why I asked whether you ever see self.cmt reach a value greater than 10. Here is a quick demo of two yields in one method:
def example():
    for i in range(1, 5):
        yield i
        yield i * i

for e in example():
    print e
This is its output:

1
1
2
4
3
9
4
16

which is exactly what you would expect.
Another possibility is that Scrapy's duplicate-URL filter is catching your repeated requests. If you add dont_filter=True to the Request, that filter is disabled for the request. See the documentation here.
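As a minimal sketch, applied to the firstDef from the question (URLs and counter are the question's own; this only shows where the flag goes):

def firstDef(self, response):
    body = response.css('body').extract_first()
    # put the body in a file
    if self.cmt > 10:
        yield Request('https://www.theOtherAdresse.com', callback=self.secondDef)
    self.cmt = self.cmt + 1
    # dont_filter=True keeps the dupefilter from discarding this request,
    # whose URL was already crawled to produce the current response
    yield Request(response.url, callback=self.firstDef, dont_filter=True)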
Finally, derive your spider from scrapy.Spider:

class myClass(scrapy.Spider):
Update: do you have any evidence that firstDef() is being called more than once? Because from the code it does not look like it is.
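One quick way to check, sketched with Scrapy's built-in self.log helper (the message text is just illustrative):

def firstDef(self, response):
    self.cmt = self.cmt + 1
    # if this message appears only once in the log, the re-queued request
    # to response.url is being dropped before it reaches this callback
    self.log('firstDef call %d for %s' % (self.cmt, response.url))
    # ... rest of the callback unchanged ...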