I have used CrawlSpider successfully before. But after I changed the code to integrate with Redis and added my own middlewares to set the User-Agent and cookies, the spider no longer parses the responses, so it generates no new requests and shuts down shortly after starting.

Even when I add this:

def parse_start_url(self, response):
    return self.parse_item(response)

it only parses the response of the first URL.

Here is my code.

The spider:
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider

from yydzh.items import YydzhItem


class YydzhSpider(CrawlSpider):
    name = 'yydzhSpider'
    allowed_domains = ['yydzh.com']
    start_urls = ['http://www.yydzh.com/thread.php?fid=198']

    rules = (
        Rule(LinkExtractor(allow=r'thread\.php\?fid=198&page=([1-9]|1[0-9])#s',
                           restrict_xpaths="//div[@class='pages']"),
             callback='parse_item', follow=True),
    )

    # def parse_start_url(self, response):
    #     return self.parse_item(response)

    def parse_item(self, response):
        item = YydzhItem()
        for each in response.xpath(
                "//*[@id='ajaxtable']//tr[@class='tr2'][last()]/following-sibling::tr[@class!='tr2']"):
            item['title'] = each.xpath("./td[2]/h3[1]/a//text()").extract()[0]
            item['author'] = each.xpath('./td[3]/a//text()').extract()[0]
            item['category'] = each.xpath('./td[2]/span[1]//text()').extract()[0]
            item['url'] = each.xpath("./td[2]/h3[1]//a/@href").extract()[0]
            yield item
The settings I think are relevant:
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
DOWNLOADER_MIDDLEWARES = {
    'yydzh.middlewares.UserAgentmiddleware': 500,
    'yydzh.middlewares.CookieMiddleware': 600,
}
COOKIES_ENABLED = True
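For completeness, a sketch of the Redis connection settings that scrapy_redis reads alongside the scheduler and dupefilter above; the values here are placeholders for my setup:

# settings.py -- Redis connection used by scrapy_redis (values are placeholders)
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
# REDIS_PARAMS = {'password': '...'}   # only if the Redis server requires auth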
The middlewares: UserAgentmiddleware randomly changes the User-Agent so the crawler is less likely to be noticed by the server, and CookieMiddleware attaches a cookie so that pages which require a login can be crawled.
# middlewares.py
import json
import logging
import random

import redis
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

# `agents`, REDIS_HOST / REDIS_PORT / REDIS_PASS and the cookie helpers
# (init_cookie, remove_cookie, update_cookie) are defined elsewhere in the project.

logger = logging.getLogger(__name__)


class UserAgentmiddleware(UserAgentMiddleware):
    def process_request(self, request, spider):
        # pick a random User-Agent for every request
        agent = random.choice(agents)
        request.headers["User-Agent"] = agent


class CookieMiddleware(RetryMiddleware):
    def __init__(self, settings, crawler):
        RetryMiddleware.__init__(self, settings)
        self.rconn = redis.Redis(host=REDIS_HOST, port=REDIS_PORT,
                                 password=REDIS_PASS, db=1, decode_responses=True)
        init_cookie(self.rconn, crawler.spider.name)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings, crawler)

    def process_request(self, request, spider):
        # attach a random logged-in cookie stored in Redis to the request
        redisKeys = self.rconn.keys()
        while len(redisKeys) > 0:
            elem = random.choice(redisKeys)
            if spider.name + ':Cookies' in elem:
                cookie = json.loads(self.rconn.get(elem))
                request.cookies = cookie
                request.meta["accountText"] = elem.split("Cookies:")[-1]
                break
            else:
                redisKeys.remove(elem)

    def process_response(self, request, response, spider):
        # "You are not logged in or you do not have permission to access this page"
        if '您没有登录或者您没有权限访问此页面' in str(response.body):
            accountText = request.meta["accountText"]
            remove_cookie(self.rconn, spider.name, accountText)
            update_cookie(self.rconn, spider.name, accountText)
            logger.warning("更新Cookie成功!(账号为:%s)" % accountText)  # cookie refreshed
            return request
        return response
Answer 0 (score: 0):
Found the problem: the Redis server still held the request fingerprints from earlier runs, so the dupefilter filtered out every URL before it was requested; restarting (flushing) Redis solved the problem.
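For reference, a minimal sketch of how the persisted state can be cleared without restarting Redis. This assumes the default scrapy-redis key layout ('&lt;spider&gt;:dupefilter' and '&lt;spider&gt;:requests') and a local Redis instance; the state only survives between runs if SCHEDULER_PERSIST is enabled or a previous run did not shut down cleanly.

# settings.py -- scrapy-redis options controlling persistence (sketch)
SCHEDULER_PERSIST = False          # drop queue + dupefilter when the spider closes (the default)
# SCHEDULER_FLUSH_ON_START = True  # or: clear them every time the spider starts

# Alternatively, delete the persisted keys by hand (key names assume the defaults):
import redis

r = redis.Redis(host='localhost', port=6379)
r.delete('yydzhSpider:dupefilter', 'yydzhSpider:requests')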