我正在尝试用Scrapy刮除Yelp,一切似乎都很好。我什至确保url和所有xpaths在Scrapy shell中都能正常工作,但我不断得到:抓取0页(以0页/分钟的速度),并且没有给出我可以修复的错误。
我也尝试寻找类似的问题,但似乎与蜘蛛发生的事情无关。
from scrapy import Spider
from ..items import YelpItem
import scrapy
import re
class YelpSpider(Spider):
name = "yelp"
allowed_domains = ['www.yelp.com']
# Defining the list of pages to scrape
start_url = ['https://www.yelp.com/search?find_desc=Dog&find_loc=Boston%2C%20MA&start=0' + str(1 * i) for i in range(0, 3)]
def parse(self, response):
# Defining rows to be scraped
rows = response.xpath('//[@id="wrap"]/div[3]/div[2]/div[2]/div/div[1]/div[1]/div/ul/li').extract_first()
for row in rows:
# Scraping Busines' Name
name = row.xpath('.//p/a/text()').extract_first()
# Scraping Phone number
phone = rows.xpath('.//div[1]/p[1][@class= "lemon--p__373c0__3Qnnj text__373c0__2pB8f textcolor--normal__373c0__K_MKN text-align--right__373c0__3ARv7"]/text()').extract_first()
# scraping area
area = rows.xpath('.//p/span[@class = "lemon--span__373c0__3997G"]/text()').extract_first()
item = YelpItem()
item['name'] = name
item['phone'] = phone
item['area'] = area
yield item
答案 0 :(得分:1)
您需要将start_url
更改为start_urls
。