我是scrapy的新手,但是当我运行我的代码时,Debug返回时没有错误,当我查看它已经刮掉的数据量时,不应该是这种情况?以下是我的代码。我试图从tripadvisor获得评论。
import HTMLParser
import unicodedata
import re
import time
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
class scrapingtestSpider(CrawlSpider):
name = "scrapingtest"
allowed_domains = ["tripadvisor.com"]
base_uri = "http://www.tripadvisor.com"
start_urls = [
base_uri + "/RestaurantSearch?geo=60763&q=New+York+City%2C+New+York&cat=&pid="
]
htmlparser = HTMLParser.HTMLParser()
def is_ascii(s):
return all(ord(c) < 128 for c in s)
def clean_parsed_string(string):
if len(string) > 0:
ascii_string = string
if is_ascii(ascii_string) == False:
ascii_string = unicodedata.normalize('NFKD', ascii_string).encode('ascii', 'ignore')
return str(ascii_string)
else:
return None
def get_parsed_string(selector, xpath):
return_string = ''
extracted_list = selector.xpath(xpath).extract()
if len(extracted_list) > 0:
raw_string = extracted_list[0].strip()
if raw_string is not None:
return_string = htmlparser.unescape(raw_string)
return return_string
def get_parsed_string_multiple(selector, xpath):
return_string = ''
return selector.xpath(xpath).extract()
def parse(self, response):
tripadvisor_items = []
sel = Selector(response)
snode_restaurants = sel.xpath('//div[@id="EATERY_SEARCH_RESULTS"]/div[starts-with(@class, "listing")]')
# Build item index.
for snode_restaurant in snode_restaurants:
# Cleaning string and taking only the first part before whitespace.
snode_restaurant_item_avg_stars = clean_parsed_string(get_parsed_string(snode_restaurant, 'div[@class="wrap"]/div[@class="entry wrap"]/div[@class="description"]/div[@class="wrap"]/div[@class="rs rating"]/span[starts-with(@class, "rate")]/img[@class="sprite-ratings"]/@alt'))
tripadvisor_item['avg_stars'] = re.match(r'(\S+)', snode_restaurant_item_avg_stars).group()
# Popolate reviews and address for current item.
yield Request(url=tripadvisor_item['url'], meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_search_page)
def parse_fetch_review(self, response):
tripadvisor_item = response.meta['tripadvisor_item']
sel = Selector(response)
counter_page_review = response.meta['counter_page_review']
# TripAdvisor reviews for item.
snode_reviews = sel.xpath('//div[@id="REVIEWS"]/div/div[contains(@class, "review")]/div[@class="col2of2"]/div[@class="innerBubble"]')
# Reviews for item.
for snode_review in snode_reviews:
tripadvisor_review_item = ScrapingtestreviewItem()
tripadvisor_review_item['title'] = clean_parsed_string(get_parsed_string(snode_review, 'div[@class="quote"]/text()'))
# Review item description is a list of strings.
# Strings in list are generated parsing user intentional newline. DOM: <br>
tripadvisor_review_item['description'] = get_parsed_string_multiple(snode_review, 'div[@class="entry"]/p/text()')
# Cleaning string and taking only the first part before whitespace.
snode_review_item_stars = clean_parsed_string(get_parsed_string(snode_review, 'div[@class="rating reviewItemInline"]/span[starts-with(@class, "rate")]/img/@alt'))
tripadvisor_review_item['stars'] = re.match(r'(\S+)', snode_review_item_stars).group()
snode_review_item_date = clean_parsed_string(get_parsed_string(snode_review, 'div[@class="rating reviewItemInline"]/span[@class="ratingDate"]/text()'))
snode_review_item_date = re.sub(r'Reviewed ', '', snode_review_item_date, flags=re.IGNORECASE)
snode_review_item_date = time.strptime(snode_review_item_date, '%B %d, %Y') if snode_review_item_date else None
tripadvisor_review_item['date'] = time.strftime('%Y-%m-%d', snode_review_item_date) if snode_review_item_date else None
tripadvisor_item['reviews'].append(tripadvisor_review_item)
这是DEBUG日志
C:\Users\smash_000\Desktop\scrapingtest\scrapingtest>scrapy crawl scrapingtest -
o items.json
C:\Users\smash_000\Desktop\scrapingtest\scrapingtest\spiders\scrapingtest_spider
.py:6: ScrapyDeprecationWarning: Module `scrapy.spider` is deprecated, use `scra
py.spiders` instead
from scrapy.spider import BaseSpider
C:\Users\smash_000\Desktop\scrapingtest\scrapingtest\spiders\scrapingtest_spider
.py:9: ScrapyDeprecationWarning: Module `scrapy.contrib.spiders` is deprecated,
use `scrapy.spiders` instead
from scrapy.contrib.spiders import CrawlSpider, Rule
2015-07-14 11:07:04 [scrapy] INFO: Scrapy 1.0.1 started (bot: scrapingtest)
2015-07-14 11:07:04 [scrapy] INFO: Optional features available: ssl, http11
2015-07-14 11:07:04 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'sc
rapingtest.spiders', 'FEED_FORMAT': 'json', 'SPIDER_MODULES': ['scrapingtest.spi
ders'], 'FEED_URI': 'items.json', 'BOT_NAME': 'scrapingtest'}
2015-07-14 11:07:04 [scrapy] INFO: Enabled extensions: CloseSpider, FeedExporter
, TelnetConsole, LogStats, CoreStats, SpiderState
2015-07-14 11:07:05 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddl
eware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultH
eadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMidd
leware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2015-07-14 11:07:05 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddlewa
re, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2015-07-14 11:07:05 [scrapy] INFO: Enabled item pipelines:
2015-07-14 11:07:05 [scrapy] INFO: Spider opened
2015-07-14 11:07:05 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 i
tems (at 0 items/min)
2015-07-14 11:07:05 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2015-07-14 11:07:06 [scrapy] DEBUG: Crawled (200) <GET http://www.tripadvisor.co
m/RestaurantSearch?geo=60763&q=New+York+City%2C+New+York&cat=&pid=> (referer: No
ne)
2015-07-14 11:07:06 [scrapy] INFO: Closing spider (finished)
2015-07-14 11:07:06 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 281,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 46932,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2015, 7, 14, 5, 37, 6, 929000),
'log_count/DEBUG': 2,
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2015, 7, 14, 5, 37, 5, 474000)}
2015-07-14 11:07:06 [scrapy] INFO: Spider closed (finished)
答案 0 :(得分:2)
您是否尝试使用print
语句调试代码?
我试图执行你的解析器。如果我按原样复制提供的代码,我会得到相同的结果,因为蜘蛛类scrapingtestSpider
没有parse
方法,并且不会被调用。
如果我对你的代码做了一些格式化(我将start_urls
下的所有内容缩进到类中),我会得到一些错误,帮助方法没有被它们的全局名称定义。
如果我更进一步并且只为抓取工具留下parse
方法,我会收到其他错误,提到tripadvisor_item
未定义....所以代码并没有真正起作用。
尝试在IDE中更好地格式化代码,并向print
方法添加parse
消息,以查看它们是否被调用。当Scrapy抓取第一个URL时,应输入主parse
方法。我认为它不会起作用。
顺便提一句,您添加到Request
的回调名称也是错误的:
yield Request(url=tripadvisor_item['url'], meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_search_page)
应改为
yield Request(url=tripadvisor_item['url'], meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_fetch_review)
修复缩进问题。
在您在解析方法中创建的parse_fetch_review
方法return
或yield
的末尾。