我有以下reddit spider代码。当我运行代码时,它会通过第一页并检索链接,但最后它应该使GET请求移动到下一行,但是它会再次通过初始响应并因为失败而失败例外:DEBUG: Filtered duplicate request:
# -*- coding: utf-8 -*-
import scrapy
from reddit.items import RedditItem
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
import re
import HTMLParser
class RedditSpider(CrawlSpider):
name = "reddit"
allowed_domains = ["reddit.com"]
start_urls = (
'http://www.reddit.com/r/progresspics',
)
# rules = (
# Rule(SgmlLinkExtractor(allow=('/(\?)(count)(=\d+)(.*)' )), callback='parse', follow= True),
# )
def parse(self, response):
sel = Selector(response)
#entries = sel.Selector.xpath('//div[@class="entry upvoted"]').extract()
entries = sel.css('.entry')
item = RedditItem()
for entry in entries:
next_button = re.search("(\?)(count)(=\d+.*?[\"])",response.body)
print next_button.group(0)
print entry.css('.title::text').extract()
#print entry.css('.title').xpath('.//a/@href').extract()
image_url = entry.css('.title').xpath('.//a/@href')[0].extract()
print(image_url)
item['title'] = entry.css('.title::text').extract()
item['link'] = image_url
item['desc'] = entry.css('.title::text').extract()
# item['image_urls'] = image_url
# item['image'] = entry.css('.title').xpath('.//a/@href')[0].extract()
yield item
nextbutton='http://www.reddit.com/r/progresspics/'+next_button.group(0).replace('\"','')
nextbutton=HTMLParser.HTMLParser().unescape(nextbutton)
#http://www.reddit.com/r/progresspics/?count=25&after=t3_2bv9c4
#nextbutton = "http://www.reddit.com/r/progresspics/?amp=&count=50&after=t3_2brip7"
print nextbutton
nextrequest = Request(nextbutton, callback=self.parse)
#yield Request(nextbutton, callback=self.parse)
yield nextrequest
答案 0 :(得分:0)
您需要将dont_filter=False
添加到Request
。
有关详细信息,请参阅http://doc.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request。
请尝试使用此功能来检查是否有下一页:
next = sel.xpath('//a[@rel="nofollow next"]/@href').extract()
if next:
yield Request(url=next[0], callback=self.parse)