我试图榨取当前的汇率。请你看看是否发现错误。
items.py
import scrapy
class CurrItem(scrapy.Item):
country = scrapy.Field()
currency = scrapy.Field()
value = scrapy.Field()
currencySpider.py
import scrapy
from currency.items import CurrItem
class currencySpider(scrapy.Spider):
name = 'curr'
allowed_domains = ['https://www.travelex.co.uk/']
start_urls = [
'https://www.travelex.co.uk/currency/exchange-rates',
]
def parse(self, response):
for site in response.xpath('//*[@id="rows"]/div[@class="currency-holder"]'):
item = CurrItem()
item['country'] = site.xpath('//*div/span[1]/text()').extract()
item['currency'] = site.xpath('//*div[1]/div/span[2]').extract()
item['value'] = site.xpath('//*div[1]/div/span[3]').extract()
yield item
pipelines.py
import pymongo
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log
class CurrencyPipeline(object):
def __init__(self):
connection = pymongo.MongoClient(
settings['MONGODB_SERVER'],
settings['MONGODB_PORT']
)
db = connection[settings['MONGODB_DB']]
self.collection = db[settings['MONGODB_COLLECTION']]
def process_item(self, item, spider):
valid = True
for data in item:
if not data:
valid = False
raise DropItem("Missing {0}!".format(data))
if valid:
self.collection.insert(dict(item))
log.msg("currency was added to MongoDB database!",
level=log.DEBUG, spider=spider)
return item
settings.py
BOT_NAME = 'currency'
SPIDER_MODULES = ['currency.spiders']
NEWSPIDER_MODULE = 'currency.spiders'
ITEM_PIPELINES = ['currency.pipelines.MongoDBPipeline', ]
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = "ProTech"
MONGODB_COLLECTION = "currency"
命令提示输出:
C:\Users\MyName\currency>scrapy crawl curr
2015-09-27 20:18:02 [scrapy] INFO: Scrapy 1.0.3 started (bot: currency)
2015-09-27 20:18:02 [scrapy] INFO: Optional features available: ssl, http11
2015-09-27 20:18:02 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'cu
rrency.spiders', 'SPIDER_MODULES': ['currency.spiders'], 'BOT_NAME': 'currency'}
2015-09-27 20:18:02 [scrapy] INFO: Enabled extensions: CloseSpider, TelnetConsol
e, LogStats, CoreStats, SpiderState
2015-09-27 20:18:03 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddl
eware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultH
eadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMidd
leware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2015-09-27 20:18:03 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddlewa
re, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2015-09-27 20:18:03 [py.warnings] WARNING: C:\Python27\lib\site-packages\scrapy\
pipelines\__init__.py:21: ScrapyDeprecationWarning: ITEM_PIPELINES defined as a
list or a set is deprecated, switch to a dict
category=ScrapyDeprecationWarning, stacklevel=1)
2015-09-27 20:18:03 [py.warnings] WARNING: C:\Users\MyName\currency\currency\p
ipelines.py:5: ScrapyDeprecationWarning: Module `scrapy.log` has been deprecated
, Scrapy now relies on the builtin Python library for logging. Read the updated
logging entry in the documentation to learn more.
from scrapy import log
Unhandled error in Deferred:
2015-09-27 20:18:03 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
File "C:\Python27\lib\site-packages\scrapy\cmdline.py", line 150, in _run_comm
and
cmd.run(args, opts)
File "C:\Python27\lib\site-packages\scrapy\commands\crawl.py", line 57, in run
self.crawler_process.crawl(spname, **opts.spargs)
File "C:\Python27\lib\site-packages\scrapy\crawler.py", line 153, in crawl
d = crawler.crawl(*args, **kwargs)
File "C:\Python27\lib\site-packages\twisted\internet\defer.py", line 1274, in
unwindGenerator
return _inlineCallbacks(None, gen, Deferred())
--- <exception caught here> ---
File "C:\Python27\lib\site-packages\twisted\internet\defer.py", line 1128, in
_inlineCallbacks
result = g.send(result)
File "C:\Python27\lib\site-packages\scrapy\crawler.py", line 71, in crawl
self.engine = self._create_engine()
File "C:\Python27\lib\site-packages\scrapy\crawler.py", line 83, in _create_en
gine
return ExecutionEngine(self, lambda _: self.stop())
File "C:\Python27\lib\site-packages\scrapy\core\engine.py", line 67, in __init
__
self.scraper = Scraper(crawler)
File "C:\Python27\lib\site-packages\scrapy\core\scraper.py", line 70, in __ini
t__
self.itemproc = itemproc_cls.from_crawler(crawler)
File "C:\Python27\lib\site-packages\scrapy\middleware.py", line 56, in from_cr
awler
return cls.from_settings(crawler.settings, crawler)
File "C:\Python27\lib\site-packages\scrapy\middleware.py", line 32, in from_se
ttings
mwcls = load_object(clspath)
File "C:\Python27\lib\site-packages\scrapy\utils\misc.py", line 49, in load_ob
ject
raise NameError("Module '%s' doesn't define any object named '%s'" % (module
, name))
exceptions.NameError: Module 'currency.pipelines' doesn't define any object name
d 'MongoDBPipeline'
2015-09-27 20:18:03 [twisted] CRITICAL:
答案 0 :(得分:3)
所以,从错误:
exceptions.NameError: Module 'currency.pipelines' doesn't define any object named 'MongoDBPipeline'
看起来scrapy
需要一些实际上与Mongo联系的帮助。您可以在配置文件中看到以下行:
ITEM_PIPELINES = ['currency.pipelines.MongoDBPipeline', ]
您似乎实际上没有定义该项目管道。根据您的上述内容,将CurrencyPipeline
重命名为MongoDBPipeline
可以解决您的问题。
此外,虽然这不会破坏您的计划,但您不应该使用scrapy.log
。 Python
具有一些优秀的内置日志记录功能,文档为here。如果它已被弃用,我建议不要使用它。
答案 1 :(得分:3)
您应该在此处输入正确的pipline类名称:
ITEM_PIPELINES = ['currency.pipelines.MongoDBPipeline', ]
将其更改为:
ITEM_PIPELINES = ['currency.pipelines.CurrencyPipeline', ]