Scrapy: how do I return a new request after changing request.meta['proxy']?

Time: 2015-05-12 09:49:14

Tags: proxy web-scraping scrapy

I am crawling some pages through a set of proxies, but some of the proxies do not work properly and I get responses like:

DEBUG: Crawled (403) <GET http://xiyuanxiaoqu0571.fang.com/xiangqing/>

DEBUG: Crawled (302) <GET http://yilexincun.fang.com/xiangqing/> 

Below are my spider.py and parse_community(). In parse_community() I try to re-crawl any page whose response.status is not 200, but it does not seem to work.

Any help is much appreciated!

Also, how can I exclude the requests that result in "HTTP 302/403" when I run the crawler?

# -*- coding: utf-8 -*-

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from soufang.items import Community_info
from scrapy import log
import sys 
import random
from soufang.misc.proxy import PROXIES
from imp import reload
reload(sys)
sys.setdefaultencoding( "utf-8" )


class soufangSpider(CrawlSpider):
    name = 'soufang'
    allowed_domains = ['fang.com']
    start_urls = ['http://esf.hz.fang.com/housing/151_2352_1_0_0_0_1_0_0/']

    rules = (
        Rule(LinkExtractor(allow=('/xiangqing/$'), deny=('/\d.+\.html')), callback='parse_community'),
        Rule(LinkExtractor(allow=('/xiangqing/$'), deny=('/\d.+\.html'),
             restrict_xpaths=(u"//div[@class='info rel floatl ml15']/dl/dd[@id='detail_6']")), follow=True),
        Rule(LinkExtractor(deny=('/\d.+\.html'), restrict_xpaths=u"//a[text()='下一页']")),
    )

    handle_httpstatus_list = [302,404,503,403] 



    def parse_community(self, response):

        if response.status != 200:
            # retry the page through a different, randomly chosen proxy
            print response.url
            request = scrapy.Request(response.url)
            p = random.choice(PROXIES)
            request.meta['proxy'] = "http://%s" % p
            return request


        item = Community_info()

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/div[@class='ewmBoxTitle']/span[@class='floatl']/text()").extract()
        item['community'] = temp[0] if temp else ''

        item['city'] = ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='开 发 商:']/../text()").extract()
        item['developer'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='所属区域:']/../text()").extract()
        item['district'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='小区地址:']/../text()").extract()
        item['address'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='邮${nbsp}${nbsp}${nbsp}${nbsp}编:']/../text()").extract()
        item['postcode'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='竣工时间:']/../text()").extract()
        item['yearOfDev'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='firstpic']/dd[text()='本月均价:']/span[1]/text()").extract()
        item['price'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='总 户 数:']/../text()").extract()
        item['household_no'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='物业类别:']/../text()").extract()
        item['community_type'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='物 业 费:']/../text()").extract()
        item['property_fee'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='建筑面积:']/../text()").extract()
        item['total_area'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='占地面积:']/../text()").extract()
        item['area'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='绿 化 率:']/../text()").extract()
        item['greening_rate'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='容 积 率:']/../text()").extract()
        item['volumn_rate'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/div[@class='yihang']/h3[text()='交通状况']/../following-sibling::dl[1]/dt[1]/text()").extract()
        item['transportation'] = temp[0] if temp else ''

        temp = "".join(response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/div[@class='yihang']/h3[text()='周边信息']/../following-sibling::dl[1]//text()").extract())
        item['periphery'] = temp if temp else ''


        log.msg(':'.join([response.url,item['community']]),level=log.INFO)

        return item
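
In case it is relevant, this is roughly the retry logic I am aiming for in parse_community(). The dont_filter=True flag and the explicit callback are guesses on my part (I suspect Scrapy's duplicate filter silently drops the re-issued request for the same URL), so this is only a sketch of what I think should happen, not verified code:

    def parse_community(self, response):
        if response.status != 200:
            # pick another proxy at random and re-issue the same URL;
            # dont_filter=True is my guess at getting past the dupefilter
            p = random.choice(PROXIES)
            return scrapy.Request(response.url,
                                  callback=self.parse_community,
                                  meta={'proxy': "http://%s" % p},
                                  dont_filter=True)
        # ... otherwise extract the item as above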

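For the second question (excluding the pages that keep coming back as 302/403 on later runs), I was wondering whether it would work to take those codes out of handle_httpstatus_list and let the built-in RetryMiddleware re-schedule them via settings.py. This is just my reading of the docs, not something I have tested:

    # settings.py -- untested sketch: let RetryMiddleware re-schedule these statuses
    RETRY_ENABLED = True
    RETRY_TIMES = 3
    RETRY_HTTP_CODES = [403, 503]
    # 302 is normally consumed by RedirectMiddleware first, so it may need
    # separate handling (e.g. REDIRECT_ENABLED = False) -- I am not sure.
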
0 Answers:

No answers