I have been trying to extract data from consumercomplaints.in: the complaint titles and the data behind each title's link. I wrote the code below, but I cannot parse the links and extract the data, and I also cannot extract all the related links. Please guide me.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from comp.items import CompItem


class criticspider(CrawlSpider):
    name = "comp"
    allowed_domains = ["consumercomplaints.in"]
    #start_urls = ["http://www.consumercomplaints.in/?search=delhivery&page=2","http://www.consumercomplaints.in/?search=delhivery&page=3","http://www.consumercomplaints.in/?search=delhivery&page=4","http://www.consumercomplaints.in/?search=delhivery&page=5","http://www.consumercomplaints.in/?search=delhivery&page=6","http://www.consumercomplaints.in/?search=delhivery&page=7","http://www.consumercomplaints.in/?search=delhivery&page=8","http://www.consumercomplaints.in/?search=delhivery&page=9","http://www.consumercomplaints.in/?search=delhivery&page=10","http://www.consumercomplaints.in/?search=delhivery&page=11"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
    rules = (
        Rule(SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)), callback="parse", follow=True),
        #Rule(SgmlLinkExtractor(allow=("startrow=\d",)), callback="parse_health", follow=True),
    )

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.select('//table[@width="100%"]')
        items = []
        for site in sites:
            item = CompItem()
            item['title'] = site.select('.//td[@class="complaint"]/a/span/text()').extract()
            item['link'] = site.select('.//td[@class="complaint"]/a/@href').extract()
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield Request(item['link'],
                              meta={'item': item},
                              callback=self.anchor_page)
            # item['intro'] = site.select('.//td[@class="small"]//a[2]/text()').extract()
            # item['heading'] = site.select('.//td[@class="compl-text"]/div/b[1]/text()').extract()
            # item['date'] = site.select('.//td[@class="small"]/text()[2]').extract()
            # item['complaint'] = site.select('.//td[@class="compl-text"]/div/text()').extract()
            items.append(item)

    def anchor_page(self, response):
        hxs = Selector(response)
        old_item = response.request.meta['item']  # receiving the parse method item that was in Request meta
        # parse some more values
        # place them in old_item
        # e.g.
        old_item['data'] = hxs.select('.//td[@class="compl-text"]/div/text()').extract()
        yield old_item
Answer 0 (score: 0)
Are you using an old version of Scrapy? In the latest stable release you don't need to do hxs = Selector(response) or use the hxs.select() method; you can do the same thing with response.xpath().
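For example, the start of the question's parse() could be reduced to something like this (a minimal sketch reusing the question's own XPath; only the selector call changes):

    def parse(self, response):
        # Recent Scrapy responses expose .xpath() directly,
        # so no Selector(response) wrapper is needed.
        sites = response.xpath('//table[@width="100%"]')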
I think the problem in your code is that the result of select() (or response.xpath()) is actually a Python list, so you need to do this:

    link = site.select('.//td[@class="complaint"]/a/@href').extract()
    if link:
        item['link'] = link[0]
You'll probably want to do something similar for the title.
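Applied to the title, that would look roughly like the following (a sketch, assuming the same markup the question's XPath already targets):

    # extract() returns a list of matching text nodes; keep the first one
    title = site.select('.//td[@class="complaint"]/a/span/text()').extract()
    if title:
        item['title'] = title[0]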
Edit: I made a few changes:

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin


class CompItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    data = scrapy.Field()


class criticspider(CrawlSpider):
    name = "comp"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
    rules = (
        Rule(
            SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)),
            callback="parse",
            follow=True),
    )

    def parse(self, response):
        sites = response.xpath('//table[@width="100%"]')
        items = []
        for site in sites:
            item = CompItem()
            item['title'] = site.xpath('.//td[@class="complaint"]/a/span/text()').extract()[0]
            item['link'] = site.xpath('.//td[@class="complaint"]/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
            items.append(item)

    def anchor_page(self, response):
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//td[@class="compl-text"]/div/text()').extract()
        yield old_item
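Assuming this file sits inside a Scrapy project, the spider can then be run from the project directory in the usual way, writing the scraped items to a file, e.g.:

    scrapy crawl comp -o items.json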