Currently my scraper only picks up roughly 20,000 of the more than 6.5M products. It looks like every category is being crawled, but only the first five pages of each category get scraped. I believe the problem is in my LinkExtractor, but I am not sure.
CrawlSpider:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector


class DigikeyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    partnumber = scrapy.Field()
    manufacturer = scrapy.Field()
    description = scrapy.Field()
    quanity = scrapy.Field()
    minimumquanity = scrapy.Field()
    price = scrapy.Field()


class DigikeySpider(CrawlSpider):
    name = 'digikey'
    allowed_domains = ['digikey.com']
    start_urls = ['https://www.digikey.com/products/en']

    rules = (
        Rule(LinkExtractor(allow=('products', )), callback='parse_item'),
    )

    def parse_item(self, response):
        # one item per row of the search-results table
        for row in response.css('table#productTable tbody tr'):
            item = DigikeyItem()
            item['partnumber'] = row.css('.tr-mfgPartNumber [itemprop="name"]::text').extract_first()
            item['manufacturer'] = row.css('[itemprop="manufacture"] [itemprop="name"]::text').extract_first()
            item['description'] = row.css('.tr-description::text').extract_first()
            item['quanity'] = row.css('.tr-qtyAvailable::text').extract_first()
            item['price'] = row.css('.tr-unitPrice::text').extract_first()
            item['minimumquanity'] = row.css('.tr-minQty::text').extract_first()
            yield item
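For reference: when a Rule is given a callback, CrawlSpider's follow argument defaults to False, so links found on the matched pages themselves are not followed; the 'request_depth_max': 1 in the stats below is consistent with that. A minimal sketch of the same rule with following enabled — one thing to try, not a confirmed fix:

    rules = (
        # follow=True keeps extracting 'products' links from pages that were
        # themselves matched, so deeper pagination pages can be reached
        Rule(LinkExtractor(allow=('products', )), callback='parse_item', follow=True),
    )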
Settings:
BOT_NAME = 'digikey'
SPIDER_MODULES = ['digikey.spiders']
NEWSPIDER_MODULE = 'digikey.spiders'
ROBOTSTXT_OBEY = False
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
}
Output:
2017-11-01 10:53:11 [scrapy.core.engine] INFO: Closing spider (finished)
2017-11-01 10:53:11 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 6,
'downloader/exception_type_count/twisted.web._newclient.ResponseNeverReceived': 6,
'downloader/request_bytes': 1198612,
'downloader/request_count': 988,
'downloader/request_method_count/GET': 988,
'downloader/response_bytes': 23932614,
'downloader/response_count': 982,
'downloader/response_status_count/200': 982,
'dupefilter/filtered': 46,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 11, 1, 17, 53, 11, 421641),
'item_scraped_count': 21783,
'log_count/DEBUG': 22773,
'log_count/ERROR': 2,
'log_count/INFO': 10,
'request_depth_max': 1,
'response_received_count': 982,
'retry/count': 4,
'retry/max_reached': 2,
'retry/reason_count/twisted.web._newclient.ResponseNeverReceived': 4,
'scheduler/dequeued': 988,
'scheduler/dequeued/memory': 988,
'scheduler/enqueued': 988,
'scheduler/enqueued/memory': 988,
'start_time': datetime.datetime(2017, 11, 1, 17, 49, 38, 427669)}
2017-11-01 10:53:11 [scrapy.core.engine] INFO: Spider closed (finished)
PS C:\Users\dalla_000\digikey>
Answer 0 (score: 0)
With this particular site, a two-stage crawl may make sense: first collect the full list of search-result pages, then scrape the products from each of those pages.

One approach could be to use two spiders and a message queue.
The first spider gets the list of all pages to be crawled and writes them to a message queue (e.g., a Kafka topic). The second spider consumes URLs from the Kafka topic and scrapes them. The first spider might look like this:
import scrapy
from bs4 import BeautifulSoup
import re
import math
import urllib
from kafka import KafkaClient, SimpleProducer

ITEMS_PER_PAGE = 500


class CreateXxxxxxxUrlListSpider(scrapy.Spider):
    kafka = KafkaClient('10.0.1.12:9092')
    producer = SimpleProducer(kafka)

    name = "create_xxxxxxx_url_list"
    allowed_domains = ["xxxxxxx.com"]
    start_urls = [
        "http://www.xxxxxxx.com/product-search/en?stock=1"
    ]

    def parse(self, response):
        soup = BeautifulSoup(response.body)
        catfilterlinks = soup.find_all('a', {'class': 'catfilterlink'})
        for catfilterlink in catfilterlinks:
            location = catfilterlink['href'].split("?")[0]
            # item count shown next to each category link, e.g. "(1234 items)"
            items = re.match(".*\(([0-9]+) items\).*", catfilterlink.next_sibling).group(1)
            # emit one URL per result page, ITEMS_PER_PAGE items per page
            for page in range(int(math.ceil(float(items) / ITEMS_PER_PAGE))):
                if page == 0:
                    url = "http://www.xxxxxxx.com" + location + "?" + urllib.urlencode({"stock": 1})
                    self.producer.send_messages("xxxxxxx_search_page_urls", url)
                else:
                    url = "http://www.xxxxxxx.com" + location + "/page/" + str(page + 1) + "?" + urllib.urlencode({"stock": 1})
                    self.producer.send_messages("xxxxxxx_search_page_urls", url)
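The answer's code for the second spider is not shown above. As a rough, hypothetical sketch only — the spider name, consumer group, and parse body are assumptions; the Kafka topic and the old kafka-python SimpleConsumer API match the producer above — the consuming side might look like this:

    import scrapy
    from kafka import KafkaClient, SimpleConsumer


    class ScrapeXxxxxxxSearchPagesSpider(scrapy.Spider):
        # hypothetical name and consumer group; the topic matches the producer above
        name = "scrape_xxxxxxx_search_pages"
        allowed_domains = ["xxxxxxx.com"]

        kafka = KafkaClient('10.0.1.12:9092')
        consumer = SimpleConsumer(kafka, "xxxxxxx_crawler", "xxxxxxx_search_page_urls")

        def start_requests(self):
            # pull each URL off the Kafka topic and turn it into a request
            for offset_and_message in self.consumer:
                url = offset_and_message.message.value
                yield scrapy.Request(url, callback=self.parse)

        def parse(self, response):
            # extract the product rows here, e.g. as in DigikeySpider.parse_item
            pass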