I've started writing a simple Scrapy project that stores its output in MongoDB. I'm new to Python, and there is a problem with the code I've written:
congress.py

import scrapy
from scrapy.selector import Selector
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import HtmlResponse
from congress.items import CongressItem


class CongressSpider(CrawlSpider):
    name = "congres"
    allowed_domains = ["www.congress.gov"]
    start_urls = [
        'https://www.congress.gov/members',
    ]

    # Creating a rule for my crawler. I only want it to continue to the
    # next page; don't follow any other links.
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=("//a[@class='next']",)),
                  callback="parse_page", follow=True),)

    def parse_page(self, response):
        for search in response.selector.xpath(".//li[@class='compact']"):
            yield {
                'member': ' '.join(search.xpath("normalize-space(span/a/text())").extract()).strip(),
                'state': ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item']/span/text())").extract()).strip(),
                'District': ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item'][2]/span/text())").extract()).strip(),
                'party': ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item'][3]/span/text())").extract()).strip(),
                'Served': ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item'][4]/span//li/text())").extract()).strip(),
            }
items.py

import scrapy


class CongressItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    member = scrapy.Field()
    state = scrapy.Field()
    District = scrapy.Field()
    party = scrapy.Field()
    served = scrapy.Field()
pipelines.py

from pymongo import MongoClient
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log


class CongressPipeline(object):
    collection_name = 'members'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert(dict(item))
        return item
settings.py

BOT_NAME = 'congres'

SPIDER_MODULES = ['congres.spiders']
NEWSPIDER_MODULE = 'congres.spiders'

MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'congres'

ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 3

ITEM_PIPELINES = {
    'congress.pipelines.CongresPipeline': 300,
}
The error it shows is:
Unhandled error in Deferred:
2017-07-09 11:15:33 [twisted] CRITICAL: Unhandled error in Deferred:
2017-07-09 11:15:34 [twisted] CRITICAL:
Traceback (most recent call last):
  File "c:\python27\lib\site-packages\twisted\internet\defer.py", line 1386, in _inlineCallbacks
    result = g.send(result)
  File "c:\python27\lib\site-packages\scrapy\crawler.py", line 95, in crawl
    six.reraise(*exc_info)
  File "c:\python27\lib\site-packages\scrapy\crawler.py", line 79, in crawl
    yield self.engine.open_spider(self.spider, start_requests)
NameError: global name 'pymongo' is not defined
Answer 0 (score: 0)

You have only imported MongoClient in pipelines.py:

from pymongo import MongoClient

but in the open_spider method you are using it like this:

self.client = pymongo.MongoClient(self.mongo_uri)

You are getting the error because the pymongo module itself was never imported, so the name pymongo is undefined. Change that last line to:

self.client = MongoClient(self.mongo_uri)
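
For reference, a minimal sketch of the corrected pipeline with the two equivalent ways to resolve the NameError; this assumes everything else in pipelines.py stays exactly as posted:

from pymongo import MongoClient


class CongressPipeline(object):
    collection_name = 'members'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    def open_spider(self, spider):
        # Fix A: MongoClient is the name that was imported at the top
        # of the file, so call it directly (no module prefix).
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    # Fix B (equivalent): add "import pymongo" at the top of the file
    # instead, and keep the original call site unchanged:
    #     self.client = pymongo.MongoClient(self.mongo_uri)

Either form works; Fix A simply reuses the import that is already at the top of the file, while Fix B makes the module name pymongo available so the original line resolves.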