我正在使用 Scrapy 编写一个爬虫(scraper),用来从电子商务网站中提取数据。它是一个递归爬虫,用于访问并解析页面,代码如下:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from kohls_scrapper.items import KohlsItem
class kohlsspider(CrawlSpider):
    """Crawl a Kohl's catalog listing and scrape one item per product page.

    Starting from the catalog URL in ``start_urls``, every link whose URL
    matches ``/product/prd.*`` is followed and handed to ``parse_stuff``.
    """

    name = "Kohls"
    allowed_domains = ["www.kohls.com"]
    start_urls = [
        "http://www.kohls.com/catalog/womens-active-clothing.jsp?CN=4294720878+4294737782+4294719810&cc=womens-LN3.7-S-workout",
    ]
    # The trailing comma matters: without it the parentheses are only
    # grouping and ``rules`` would be a single Rule instead of an iterable
    # of rules.
    rules = (
        Rule(
            LinkExtractor(allow=('/product/prd.*')),
            follow=True,
            callback='parse_stuff',
        ),
    )

    def parse_stuff(self, response):
        """Build a ``KohlsItem`` from a product page response.

        Args:
            response: The response for a URL matched by ``rules``.

        Returns:
            A ``KohlsItem`` with ``prod_name`` (text of the first ``<h1>``
            text node, or ``''`` when there is none), ``colors`` and
            ``price`` (lists of every string matched by their XPaths,
            possibly empty).
        """
        data = KohlsItem()

        # extract_first() avoids the IndexError that ``extract()[0]`` raises
        # on pages without an <h1>, and the value is kept as text rather
        # than being encoded to bytes.
        data['prod_name'] = response.xpath("//h1/text()").extract_first(default='')

        # NOTE(review): in the reported runs these two XPaths matched
        # nothing. Presumably the swatch/pricing markup is not present in the
        # raw HTML that Scrapy downloads (e.g. it is rendered client-side) --
        # verify against ``response.text`` or ``scrapy shell <url>``.
        data['colors'] = response.xpath(
            '//div[@class="pdp-product-swatch"]/a/@title'
        ).extract()
        data['price'] = response.xpath(
            '//*[@id="pdp-Pricing"]/div[2]/div[2]/text()'
        ).extract()

        print('\n\n\n\nProduct Colors :',data['colors'],'\n\n\n\n')
        # TODO: extract data['sizes'] once a working selector is known.
        return data
现在的问题是:从响应页面中,产品名称(prod_name)可以被完整地提取出来,但颜色和价格的提取无法正常工作,得到的结果都是空的(空列表)。我检查过 XPath,看起来没有问题。输出如下:
Product Colors : []
2016-10-14 17:06:44 [scrapy] DEBUG: Scraped from <200 http://www.kohls.com/product/prd-2445710/womens-columbia-three-lakes-fleece-jacket.jsp?bvrrp=9025%2Freviews%2Fproduct%2F3%2F2445710.htm>
{'colors': [], 'prod_name': b"Women's Columbia Three Lakes Fleece Jacket "}
NAA
Product Colors : []
2016-10-14 17:06:44 [scrapy] DEBUG: Scraped from <200 http://www.kohls.com/product/prd-2526246/womens-tek-gear-cowlneck-tunic-dress.jsp?bvrrp=9025%2Freviews%2Fproduct%2F4%2F2526246.htm>
{'colors': [], 'prod_name': b"Women's Tek Gear\xc2\xae Cowlneck Tunic Dress"}
NAA
Product Colors : []
我始终弄不明白:为什么 prod_name 能被完整提取,而另外两个字段却无法通过 XPath 提取出来。有人可以帮忙吗?任何帮助我都将不胜感激……