我正在尝试使用 scrapy 从网页 http://www.jabong.com/Puma-Wirko-Ind-Black-Sneakers-187839.html 抓取一些内容。下面的脚本可以成功运行,但其中一些字段(brand、mrp、pcode、pdesc)有时会是空值:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from tutorial.items import DmozItem
class DmozSpider(BaseSpider):
    """Scrape product details from a jabong.com product page.

    ``parse`` fills a ``DmozItem`` from fixed XPath expressions, prints it,
    yields it to Scrapy, and then schedules one download per extracted
    image URL; ``save_image`` writes each downloaded image to disk.
    """

    name = "dmozjabong"
    allowed_domains = ["jabong.com"]
    start_urls = [
        "http://www.jabong.com/Puma-Wirko-Ind-Black-Sneakers-187839.html"
    ]

    # (item field, XPath) pairs whose matches are joined with single spaces
    # and stripped.
    # NOTE(review): most of these XPaths are positional (meta[7], tr[8],
    # div[2]/div[1]/...). On a page whose element order differs they match
    # nothing and the field becomes an empty string -- presumably the cause
    # of the intermittently empty brand/mrp/pcode/pdesc values. Anchor them
    # on ids, classes or label text once the differing pages are compared.
    _TEXT_FIELDS = (
        ('title', '/html/head/title'),
        ('link', '//*[@id="refurl"]/@value'),
        ('kwords', '/html/head/meta[7]/@content'),
        ('mdes', '/html/head/meta[6]/@content'),
        ('pname', '//*[@id="qa-title-product"]/text()'),
        ('pcode', '//div[@id="productInfo"]//table/tr[8]/td[2]/text()'),
        ('pdesc', '//*[@id="productInfo"]/div[1]/div[2]/div[1]/p/text()'),
        ('brand', '//*[@id="wrapper"]/div[2]/div[2]/div[1]/a/img/@alt'),
        ('listprice', '//*[@id="before_price"]/span[2]/span/text()'),
        ('mrp', '//*[@id="price_div"]/span[2]/strike/text()'),
    )

    _IMAGE_XPATH = (
        '//*[@id="wrapper"]/div[2]/div[1]/div[3]/div[1]/ul/li[1]/img/@src'
    )

    @staticmethod
    def _joined_text(hxs, xpath):
        """Return all matches of *xpath* joined by spaces and stripped.

        Yields an empty string when the XPath matches nothing.
        """
        return ' '.join(hxs.select(xpath).extract()).strip()

    def parse(self, response):
        """Extract one product item and schedule its image downloads.

        Yields the populated ``DmozItem`` first, then one ``Request`` per
        image URL found on the page, handled by ``save_image``.
        """
        # Local import keeps the block self-contained; the module name
        # differs between Python 2 and Python 3.
        try:
            from urlparse import urljoin
        except ImportError:
            from urllib.parse import urljoin

        hxs = HtmlXPathSelector(response)
        item = DmozItem()

        for field, xpath in self._TEXT_FIELDS:
            item[field] = self._joined_text(hxs, xpath)

        # Kept as a list of URL strings, unlike the joined text fields.
        item['pimg'] = hxs.select(self._IMAGE_XPATH).extract()

        # Each breadcrumb text node is stripped before joining.
        crumbs = hxs.select('id("breadcrumbs")//text()').extract()
        item['bread'] = ' '.join(text.strip() for text in crumbs).strip()

        print(item)
        # Hand the item to Scrapy so pipelines and feed exports receive it.
        yield item

        # The extracted values are URLs, not image data: fetch each one and
        # write the response body instead of writing the URL text to disk.
        for index, src in enumerate(item['pimg']):
            yield Request(
                urljoin(response.url, src),  # tolerate relative src values
                callback=self.save_image,
                meta={'image_index': index},
                # Images may live on a host outside allowed_domains;
                # dont_filter stops the offsite filter from dropping them.
                dont_filter=True,
            )

    def save_image(self, response):
        """Write a downloaded image body to ``image_<index>.jpg``.

        The index comes from the ``image_index`` meta key set in ``parse``.
        """
        filename = 'image_%d.jpg' % response.meta['image_index']
        with open(filename, 'wb') as fh:
            fh.write(response.body)
答案 0 :(得分:0)
不同页面的 HTML 代码中可能存在一些小的“错误”或结构上的变化,导致 scrapy 匹配不到任何内容而返回空值。最好的排查方法是:找 1~2 个能正常抓取的链接和 1~2 个返回空值的链接,然后对比它们的 HTML 代码等,找出其中的差异。