我目前正在使用scrapy刮擦亚马逊页面。我希望scrapy返回易于转换为表的输出(例如,数据框,MySQL等)。例如,这是我的蜘蛛在一个JSON文件(7列2行/页)中输出的内容:
当我将其转换为数据框时,它看起来像这样(我仍然必须清理它):
我的问题本质上是如何折叠初始输出,使其看起来像一张表/轻松地转换成一个表。如果可以通过某种方式将其添加到下面的parse函数中,那就太棒了。我最初尝试使用for循环来获取例如每个列表的第一个值。有任何想法吗?感谢您抽出宝贵的时间阅读本文!
import scrapy
from ..items import AmazonscrapeItem
class AmazonSpiderSpider(scrapy.Spider):
page_number = 2
name = 'amazon_scraper'
start_urls = [
'https://www.amazon.co.uk/s?i=stripbooks&bbn=266239&rh=n%3A266239%2Cp_72%3A184315031%2Cp_36%3A389028011&dc&page=1&fst=as%3Aoff&qid=1598942460&rnid=389022011&ref=sr_pg_1'
]
def parse(self, response, **kwargs):
items = AmazonscrapeItem()
# if multiple classes --> .css("::text").extract()
product_name = response.css('.a-color-base.a-text-normal::text').extract()
product_author = response.css('.a-color-secondary .a-size-base.a-link-normal').css('::text').extract()
product_nbr_reviews = response.css('.a-size-small .a-link-normal .a-size-base').css('::text').extract()
product_type = response.css('.a-spacing-top-small .a-link-normal.a-text-bold').css('::text').extract()
product_price = response.css('.a-spacing-top-small .a-price-whole').css('::text').extract()
product_more_choice = response.css('.a-spacing-top-mini .a-color-secondary .a-link-normal').css('::text').extract()
# this only selects the element that has the image --> need stuff inside src (source attr)
product_imagelink = response.css('.s-image::attr(src)').extract() # want attr of src
items['product_name'] = product_name
items['product_author'] = product_author
items['product_nbr_reviews'] = product_nbr_reviews
items['product_type'] = product_type
items['product_price'] = product_price
items['product_more_choice'] = product_more_choice
items['product_imagelink'] = product_imagelink
# CAN IT BE UNPACKED HERE SOMEHOW??
yield items
next_page = 'https://www.amazon.co.uk/s?i=stripbooks&bbn=266239&rh=n%3A266239%2Cp_72%3A184315031%2Cp_36%3A389028011&dc&page='+ str(AmazonSpiderSpider.page_number)+'&fst=as%3Aoff&qid=1598942460&rnid=389022011&ref=sr_pg_'+ str(AmazonSpiderSpider.page_number)
if AmazonSpiderSpider.page_number <3:
AmazonSpiderSpider.page_number += 1
yield response.follow(next_page, callback=self.parse)