我最近尝试使用 Scrapy 抓取网站。我通过 `base_url` 和 `start_requests` 方法从 CSV 文件(约 60K 条记录)读取链接,但启动爬虫后,它会跳过一些链接,日志如下所示:
2020-05-21 10:03:49 [scrapy.core.engine] DEBUG: Crawled (200)
有人知道如何解决吗?
这是我的代码:
import scrapy
import os
from ..items import PricedataItem
from ..Product_ID import read_csv
# URL template for Digikala product pages; the placeholder is filled
# with a numeric product ID (dkp code) read from the CSV file.
base_url = 'https://www.digikala.com/product/dkp-{}'
class EvaSpider(scrapy.Spider):
    """Spider that scrapes Digikala product pages.

    Product IDs are read from an external CSV (via ``read_csv``); each ID
    is formatted into a product URL and the page is parsed for name,
    breadcrumb categories, brand, price and the embedded product id.
    """

    name = 'Eva'

    def start_requests(self):
        """Yield one request per product ID from the CSV file.

        Scrapy's default dupefilter silently drops any request whose URL
        has already been seen — with a ~60K-row CSV that almost certainly
        contains duplicate IDs, which is why some links appear to be
        "skipped" in the log.  ``dont_filter=True`` disables that filter
        so every row is requested.  The ID is also stripped of stray
        whitespace (CSV rows often carry trailing newlines/spaces that
        would otherwise produce malformed URLs).
        """
        for product_id in read_csv():
            url = base_url.format(str(product_id).strip())
            yield scrapy.Request(url, dont_filter=True)

    def parse(self, response):
        """Extract product fields from a Digikala product page.

        Yields a ``PricedataItem`` whose fields are lists of matched text
        (Scrapy ``extract()`` output); ``Price`` and ``Brand`` fall back
        to the sentinel values ``'null'`` / ``"No Brand"`` when the
        selector matches nothing, preserving the original item shape.
        """
        items = PricedataItem()

        price = response.css('.c-product__seller-price-pure::text').extract()
        brand = response.css(
            '.c-product__title-container--brand-link:nth-child(1)::text'
        ).extract()

        items['ProductName'] = response.css('.c-product__title::text').extract()
        items['Group'] = response.css('.c-breadcrumb li:nth-child(2) span::text').extract()
        items['MainCategory'] = response.css('.c-breadcrumb li:nth-child(3) span::text').extract()
        items['SubCategory'] = response.css('.c-breadcrumb li:nth-child(4) span::text').extract()
        # Empty extract() lists are falsy -> use the sentinel instead.
        items['Price'] = price if price else 'null'
        items['Brand'] = brand if brand else "No Brand"
        items['ProductID'] = response.css('div.container article::attr(data-product-id)').extract()
        yield items