您好,我想抓取以下链接的产品页面上提供的产品规格表:https://www.amazon.com/dp/B07HJ41HCF,为此,我用scrapy编写了以下蜘蛛图。
def parse(self, response):
item = GraingerItem()
item['url'] = response.url
item['proddescription'] = response.xpath('//*[@id="productDetails_detailBullets_sections1"]/td[1]/th/text()').extract()
item['title'] = response.xpath('//*[@id="productTitle"]/text()').extract()[0].strip()
try:
item['sellername'] = response.xpath('//*[@id="bylineInfo"]/text()').extract()[0].strip()
except IndexError:
item['sellername'] = "No Seller Name"
gg=[]
cc= response.xpath('//*[@class="a-link-normal a-color-tertiary"]')
for bb in cc:
dd=bb.xpath('text()').extract()[0].strip()
gg.append(dd)
gg.append(">")
qq=str(gg)
qr=qq.replace("'","")
qs = qr.replace(">]","")
qt=qs.replace("[","")
qu = qt.replace(",","")
item['travlink'] = qu
try:
item['rating'] = response.xpath('//*[@id="acrPopover"]/span[1]/a/i[1]/span/text()').extract()[0].strip()
except IndexError:
item['rating'] = "Be the First one to review"
try:
item['Crreview'] = response.xpath('//*[@id="acrCustomerReviewText"]/text()').extract()[0].strip()
except IndexError:
item['Crreview'] = "Be the First one to review"
dd = response.xpath('//*[@id="feature-bullets"]/ul')
ft = []
for i in range(2,40):
q = str(i)
trows ="li["+q+"]"
xpathgiven = trows + "/span/text()"
for bullets in dd:
b1= bullets.xpath(xpathgiven).extract()
for ac in b1:
ab = ac.replace("\xa0", "")
ft.append(b1)
ft.append(";")
stft = str(ft)
stft1 = stft.replace("';', [], ';'","")
stft2 = stft1.replace("\\t","")
stft3 = stft2.replace('\\n',"")
stft4 = stft3.replace("'","")
stft5 = stft4.replace("[","")
stft6 = stft5.replace("]","")
stft7 = stft6.replace(",","")
item['feature'] = stft7
description = []
try:
for i in range(1, 100):
q1 = str(i)
trows1 = "[" + q1 + "]"
xpathgiven1 = "//*[@id='productDescription']/p/text()["+q1+"]"
gg = response.xpath(xpathgiven1).extract()
description.append(gg)
description.append(";")
stft = str(description)
dsft1 = stft.replace("';', [], ';'", "")
dsft2 = dsft1.replace("'], ';', ['", ";")
dsft3 = dsft2.replace('\\n', "")
dsft33 = dsft3.replace('\\t', "")
dsft4 = dsft33.replace("'", "")
dsft5 = dsft4.replace("[", "")
dsft6 = dsft5.replace("]", "")
dsft7 = dsft6.replace(",", "")
item['Description'] = dsft7
except IndexError:
item['Description'] = "No Description"
在上面的代码中,一切正常,但item ['proddescription']确实产生了一个空列表,对上述内容的任何帮助将受到高度赞赏