用mongodb编写一个简单的python scrapy爬虫

时间:2017-07-09 05:05:08

标签: python mongodb web-scraping scrapy

我已经开始为mongodb编写一个简单的scrapy模块来使用。我是python的新手,我写的代码存在问题:

congress.py

import scrapy

from scrapy.selector import Selector
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import HtmlResponse
from congress.items import CongressItem

class CongressSpider(CrawlSpider):
    name = "congres"
    allowed_domains = ["www.congress.gov"]
    start_urls = [
            'https://www.congress.gov/members',
        ]
    #creating a rule for my crawler. I only want it to continue to the next page, don't follow any other links.
    rules = (Rule(LinkExtractor(allow=(),restrict_xpaths=("//a[@class='next']",)), callback="parse_page", follow=True),)

    def parse_page(self, response):
        for search in response.selector.xpath(".//li[@class='compact']"):
            yield {'member' : ' '.join(search.xpath("normalize-space(span/a/text())").extract()).strip(),
               'state' : ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item']/span/text())").extract()).strip(),
                'District' : ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item'][2]/span/text())").extract()).strip(),
                'party' : ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item'][3]/span/text())").extract()).strip(),
                'Served' : ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item'][4]/span//li/text())").extract()).strip(),
            }

items.py

import scrapy
class CongressItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()

    member = scrapy.Field()
    state = scrapy.Field()
    District = scrapy.Field()
    party = scrapy.Field()
    served = scrapy.Field()

pipelines.py

from pymongo import MongoClient
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log

class CongressPipeline(object):
    collection_name= 'members'
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )
    def open_spider(self,spider):
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]
    def close_spider(self, spider):
        self.client.close()
    def process_item(self, item, spider):
        self.db[self.collection_name].insert(dict(item))
        return item

settings.py

BOT_NAME = 'congres'

SPIDER_MODULES = ['congres.spiders']
NEWSPIDER_MODULE = 'congres.spiders'





MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'congres'
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 3
ITEM_PIPELINES = {
   'congress.pipelines.CongresPipeline': 300,
}

它显示的错误是

Unhandled error in Deferred:
2017-07-09 11:15:33 [twisted] CRITICAL: Unhandled error in Deferred:

2017-07-09 11:15:34 [twisted] CRITICAL:
Traceback (most recent call last):
File "c:\python27\lib\site-packages\twisted\internet\defer.py", line 1386, 
in _inlineCallbacks
result = g.send(result)
File "c:\python27\lib\site-packages\scrapy\crawler.py", line 95, in crawl
six.reraise(*exc_info)
File "c:\python27\lib\site-packages\scrapy\crawler.py", line 79, in crawl
yield self.engine.open_spider(self.spider, start_requests)
NameError: global name 'pymongo' is not defined

1 个答案:

答案 0 :(得分:0)

您只是在MongoClient

中导入pipelines.py
from pymongo import MongoClient

open_spider方法中,您正以这种方式使用

self.client = pymongo.MongoClient(self.mongo_uri)

您收到错误是因为未导入pymongo。将最后一行更改为

self.client = MongoClient(self.mongo_uri)