I am crawling some pages through a pool of proxies, but some of the proxies do not work properly and I get
DEBUG: Crawled (403) <GET http://xiyuanxiaoqu0571.fang.com/xiangqing/>
or
DEBUG: Crawled (302) <GET http://yilexincun.fang.com/xiangqing/>
Below are my spider.py and parse_community(). I am trying to re-crawl any page whose response.status is not 200, but it does not seem to work.
Any help is much appreciated!
Also, how can I exclude the proxies that cause these "http 302/403" responses while the crawler is running?
# -*- coding: utf-8 -*-
import random
import sys

import scrapy
from scrapy import log
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor

from soufang.items import Community_info
from soufang.misc.proxy import PROXIES

# Python 2 hack to make UTF-8 the default string encoding
# (reload() is a builtin in Python 2, so no import is needed).
reload(sys)
sys.setdefaultencoding("utf-8")
class soufangSpider(CrawlSpider):
    name = 'soufang'
    allowed_domains = ['fang.com']
    start_urls = ['http://esf.hz.fang.com/housing/151_2352_1_0_0_0_1_0_0/']

    rules = (
        # Community detail pages: scrape them with parse_community().
        Rule(LinkExtractor(allow=(r'/xiangqing/$',), deny=(r'/\d.+\.html',)),
             callback='parse_community'),
        # Follow links to related communities listed on the detail page.
        Rule(LinkExtractor(allow=(r'/xiangqing/$',), deny=(r'/\d.+\.html',),
                           restrict_xpaths=(u"//div[@class='info rel floatl ml15']/dl/dd[@id='detail_6']",)),
             follow=True),
        # Pagination: follow the "下一页" (next page) link.
        Rule(LinkExtractor(deny=(r'/\d.+\.html',),
                           restrict_xpaths=u"//a[text()='下一页']")),
    )

    # Let these non-200 responses reach the callback instead of being
    # filtered out by the HttpError middleware.
    handle_httpstatus_list = [302, 403, 404, 503]
    def parse_community(self, response):
        # If the page did not come back with HTTP 200, re-issue the
        # request through another randomly chosen proxy.
        if response.status != 200:
            print response.url
            request = scrapy.Request(response.url)
            p = random.choice(PROXIES)
            request.meta['proxy'] = "http://%s" % p
            return request
        # Normal case (HTTP 200): extract the community attributes.
        item = Community_info()
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/div[@class='ewmBoxTitle']/span[@class='floatl']/text()").extract()
        item['community'] = temp[0] if temp else ''
        item['city'] = ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='开 发 商:']/../text()").extract()
        item['developer'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='所属区域:']/../text()").extract()
        item['district'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='小区地址:']/../text()").extract()
        item['address'] = temp[0] if temp else ''
        # The label '邮    编' is padded with non-breaking spaces (U+00A0).
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='邮\u00a0\u00a0\u00a0\u00a0编:']/../text()").extract()
        item['postcode'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='竣工时间:']/../text()").extract()
        item['yearOfDev'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='firstpic']/dd[text()='本月均价:']/span[1]/text()").extract()
        item['price'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='总 户 数:']/../text()").extract()
        item['household_no'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='物业类别:']/../text()").extract()
        item['community_type'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='物 业 费:']/../text()").extract()
        item['property_fee'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='建筑面积:']/../text()").extract()
        item['total_area'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='占地面积:']/../text()").extract()
        item['area'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='绿 化 率:']/../text()").extract()
        item['greening_rate'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='容 积 率:']/../text()").extract()
        item['volumn_rate'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/div[@class='yihang']/h3[text()='交通状况']/../following-sibling::dl[1]/dt[1]/text()").extract()
        item['transportation'] = temp[0] if temp else ''
        temp = "".join(response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/div[@class='yihang']/h3[text()='周边信息']/../following-sibling::dl[1]//text()").extract())
        item['periphery'] = temp if temp else ''
        log.msg(':'.join([response.url, item['community']]), level=log.INFO)
        return item
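
For completeness, this is the variant I am planning to try next. My guess (unverified) is that the re-issued request is silently dropped by the scheduler's duplicate filter because the URL has already been seen once, so the dont_filter=True flag, the explicit callback, and removing the failing proxy from PROXIES below are all my own assumptions, not confirmed behaviour:

    def parse_community(self, response):
        if response.status != 200:
            # Guess: dont_filter=True stops the duplicate filter from
            # silently discarding the re-issued URL, and the explicit
            # callback keeps CrawlSpider from routing the retry through
            # its default parse().
            failed = response.meta.get('proxy', '').replace('http://', '')
            if failed in PROXIES:
                PROXIES.remove(failed)  # stop picking the bad proxy again
            retry = scrapy.Request(response.url,
                                   callback=self.parse_community,
                                   dont_filter=True)
            retry.meta['proxy'] = "http://%s" % random.choice(PROXIES)
            return retry
        # ... item extraction as above ...

I have also seen Scrapy's built-in RetryMiddleware suggested, with something like RETRY_HTTP_CODES = [403, 503] in settings.py; as far as I can tell a 302 is normally consumed by the redirect middleware before any retry happens, unless it is listed in handle_httpstatus_list as above.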