I have a Scrapy script for Yelp that is, for the most part, working. Essentially I can give it a list of Yelp pages and it should return all the reviews from all of those pages. The script so far is below:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re

from yelp2.items import YelpReviewItem


RESTAURANTS = ['sixteen-chicago']


def createRestaurantPageLinks(self, response):
    reviewsPerPage = 40
    sel = Selector(response)
    totalReviews = int(sel.xpath('//div[@class="rating-info clearfix"]//span[@itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
    pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse) for n in range(totalReviews/reviewsPerPage)]
    return pages
class Yelp2aSpider(Spider):
    name = "yelp2a"
    allowed_domains = ["yelp.com"]
    start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

    def parse(self, response):
        requests = []
        sel = Selector(response)
        reviews = sel.xpath('//div[@class="review review-with-no-actions"]')
        items = []
        for review in reviews:
            item = YelpReviewItem()
            item['venueName'] = sel.xpath('//meta[@property="og:title"]/@content').extract()
            item['reviewer'] = review.xpath('.//li[@class="user-name"]/a/text()').extract()
            item['reviewerLoc'] = review.xpath('.//li[@class="user-location"]/b/text()').extract()
            item['rating'] = review.xpath('.//meta[@itemprop="ratingValue"]/@content').extract()
            item['reviewDate'] = review.xpath('.//meta[@itemprop="datePublished"]/@content').extract()
            item['reviewText'] = review.xpath('.//p[@itemprop="description"]/text()').extract()
            item['url'] = response.url
            items.append(item)
        return items

        if response.url.find('?start=') == -1:
            requests += createRestaurantPageLinks(self, response)
        return requests
However, the problem I'm running into is that this particular script scrapes every page of every requested review except for the first page. If I comment out the last "if" statement, it only scrapes the FIRST page. I suspect all I need is a simple "else" somewhere, but I'm stumped... help would be greatly appreciated!
EDIT: This is the code as it currently stands, based on the assistance received...
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re

from yelp2.items import YelpReviewItem


RESTAURANTS = ['sixteen-chicago']


def createRestaurantPageLinks(self, response):
    reviewsPerPage = 40
    sel = Selector(response)
    totalReviews = int(sel.xpath('//div[@class="rating-info clearfix"]//span[@itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
    pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse) for n in range(totalReviews/reviewsPerPage)]
    return pages
class Yelp2aSpider(Spider):
    name = "yelp2a"
    allowed_domains = ["yelp.com"]
    start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

    def parse(self, response):
        requests = []
        sel = Selector(response)
        reviews = sel.xpath('//div[@class="review review-with-no-actions"]')
        items = []
        for review in reviews:
            item = YelpReviewItem()
            item['venueName'] = sel.xpath('//meta[@property="og:title"]/@content').extract()
            item['reviewer'] = review.xpath('.//li[@class="user-name"]/a/text()').extract()
            item['reviewerLoc'] = review.xpath('.//li[@class="user-location"]/b/text()').extract()
            item['rating'] = review.xpath('.//meta[@itemprop="ratingValue"]/@content').extract()
            item['reviewDate'] = review.xpath('.//meta[@itemprop="datePublished"]/@content').extract()
            item['reviewText'] = review.xpath('.//p[@itemprop="description"]/text()').extract()
            item['url'] = response.url
        yield item

        if response.url.find('?start=') == -1:
            requests += createRestaurantPageLinks(self, response)
        for request in requests:
            yield request
As noted in the comments below, running this code as-is crawls every desired page, but it only returns one review per page rather than all of them.
I tried changing yield item to yield items, but then every crawled URL came back with the error message ERROR: Spider must return Request, BaseItem or None, got 'list' in <GET http://www.yelp.com/biz/[...]>.
Answer 0 (score: 1)
You need to reorganize your methods a bit. First, parse the restaurant page in the parse() method. Then return the review-page requests and handle their responses in a separate method, e.g. parse_review():
import re

from scrapy.item import Item, Field
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request

from yelp2.items import YelpReviewItem


RESTAURANTS = ['sixteen-chicago']


class Yelp2aSpider(Spider):
    name = "yelp2a"
    allowed_domains = ["yelp.com"]
    start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

    def parse(self, response):
        reviewsPerPage = 40
        sel = Selector(response)
        totalReviews = int(sel.xpath('//div[@class="rating-info clearfix"]//span[@itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
        pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse_review) for n in range(totalReviews/reviewsPerPage)]
        return pages

    def parse_review(self, response):
        sel = Selector(response)
        reviews = sel.xpath('//div[@class="review review-with-no-actions"]')
        for review in reviews:
            item = YelpReviewItem()
            item['venueName'] = sel.xpath('//meta[@property="og:title"]/@content').extract()
            item['reviewer'] = review.xpath('.//li[@class="user-name"]/a/text()').extract()
            item['reviewerLoc'] = review.xpath('.//li[@class="user-location"]/b/text()').extract()
            item['rating'] = review.xpath('.//meta[@itemprop="ratingValue"]/@content').extract()
            item['reviewDate'] = review.xpath('.//meta[@itemprop="datePublished"]/@content').extract()
            item['reviewText'] = review.xpath('.//p[@itemprop="description"]/text()').extract()
            item['url'] = response.url
            yield item
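One side note on this restructuring (my own observation, not part of the original answer): parse() above only builds requests for ?start=40, ?start=80, and so on, so the reviews on the very first page are never handed to parse_review(). If that matters, a minimal sketch of one way to cover it, assuming the method names used above, is to feed the initial response through parse_review() as well:

    def parse(self, response):
        reviewsPerPage = 40
        sel = Selector(response)
        totalReviews = int(sel.xpath('//div[@class="rating-info clearfix"]//span[@itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
        # reuse parse_review() for the reviews already present on this first page
        for item in self.parse_review(response):
            yield item
        # then request the remaining pages, handled by parse_review() as before
        for n in range(totalReviews / reviewsPerPage):
            yield Request(url=response.url + '?start=' + str(reviewsPerPage * (n + 1)),
                          callback=self.parse_review)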
Answer 1 (score: 0)
If you are returning items/requests from more than one place, you should replace your return statements with yield statements. This turns your function into a generator, which hands back a new element each time one is produced (yields it), without exiting the function until all of them have been generated. Otherwise, as your code stands now, the function exits after the first return, and the requests for the following pages are never sent.
EDIT: Correction: you should yield one item/request at a time, so:
Replace

for review in reviews:
    item = ...
return items

with

for review in reviews:
    item = ...
    yield item

and replace

return requests

with

for request in requests:
    yield request
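To make the return-versus-yield point concrete outside of Scrapy, here is a minimal, self-contained sketch in plain Python (not from the original answer): the return version exits on the first iteration and hands back only the first value, while the generator keeps producing values until the loop is exhausted.

def first_only(values):
    for v in values:
        return v          # exits on the first iteration; later values are never reached

def each_in_turn(values):
    for v in values:
        yield v           # suspends here and resumes on the next iteration

print(first_only([1, 2, 3]))          # 1
print(list(each_in_turn([1, 2, 3])))  # [1, 2, 3]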
Answer 2 (score: 0)
The final answer did indeed lie in the indentation of a single yield line. This is the code that ended up doing what I needed it to do.
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re

from yelp2.items import YelpReviewItem


RESTAURANTS = ['sixteen-chicago']


def createRestaurantPageLinks(self, response):
    reviewsPerPage = 40
    sel = Selector(response)
    totalReviews = int(sel.xpath('//div[@class="rating-info clearfix"]//span[@itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
    pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse) for n in range(totalReviews/reviewsPerPage)]
    return pages


class YelpXSpider(Spider):
    name = "yelpx"
    allowed_domains = ["yelp.com"]
    start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

    def parse(self, response):
        requests = []
        sel = Selector(response)
        reviews = sel.xpath('//div[@class="review review-with-no-actions"]')
        items = []
        for review in reviews:
            item = YelpReviewItem()
            item['venueName'] = sel.xpath('//meta[@property="og:title"]/@content').extract()
            item['reviewer'] = review.xpath('.//li[@class="user-name"]/a/text()').extract()
            item['reviewerLoc'] = review.xpath('.//li[@class="user-location"]/b/text()').extract()
            item['rating'] = review.xpath('.//meta[@itemprop="ratingValue"]/@content').extract()
            item['reviewDate'] = review.xpath('.//meta[@itemprop="datePublished"]/@content').extract()
            item['reviewText'] = review.xpath('.//p[@itemprop="description"]/text()').extract()
            item['url'] = response.url
            yield item

        if response.url.find('?start=') == -1:
            requests += createRestaurantPageLinks(self, response)
        for request in requests:
            yield request
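For anyone reproducing this, the spider would typically be run from the Scrapy project root with the feed exporter, assuming the rest of the project (yelp2.items with YelpReviewItem) is in place:

# the spider name comes from the class's "name" attribute above
scrapy crawl yelpx -o reviews.json
# note: range(totalReviews/reviewsPerPage) relies on Python 2 integer division;
# under Python 3 it would need totalReviews // reviewsPerPage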
Thanks to everyone for helping out a noob!