scrapy使用mongodb来存储信息

时间:2016-05-22 14:34:08

标签: mongodb scrapy

我正在编写一个网络蜘蛛来获取有关堆栈溢出的用户信息。我正在尝试使用mongodb存储信息。 这是我的网络蜘蛛的代码,工作正常:

class webSpider(Spider):
    name = "user_spider1"
    allowed_domains = ["stackoverflow.com"]
    start_urls = []
def start_requests(self):
    for i in range(1,2):
        self.start_urls.append(  "http://stackoverflow.com/users?page="+ str(i)+"&tab=reputation&filter=week")
    for url in self.start_urls:
        yield self.make_requests_from_url(url)

def parse(self, response):

    htmlTxt = response.body
    baseDomain = etree.HTML(htmlTxt)

    userSubUrl = baseDomain.xpath('//div[@class="user-details"]/a/@href')
    baseUrl = 'http://stackoverflow.com'
    for subUrl in userSubUrl:


        yield Request(baseUrl+subUrl,callback=self.parse_userinfo)

def parse_userinfo(self,response):
    htmlTxt = response.body
    infoDomain = etree.HTML(htmlTxt)

    item['user_name'] = stringprocessor(str(infoDomain.xpath('//h2[@class="user-card-name"]/text()[1]')))
    item['user_location'] = stringprocessor(str(infoDomain.xpath('//ul[@class="list-unstyled"]/li[1]/text()[2]')))
    item['user_reputation'] = stringprocessor(str(infoDomain.xpath('//div[@class="reputation"]/text()[1]')))
    tags = infoDomain.xpath('//div[@class="tag-container row"]/div/a[@class="post-tag"]/text()')

    item['user_tags'] = tags
    yield item

这是我的管道文件和设置,这可能是错误的:

import pymongo


from scrapy import log
from scrapy.conf import settings

class Spider1Pipeline(object):
    def __init__(self):

        connection = pymongo.Connection(
            settings['MONGODB_SERVER'],settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DB']]

        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        log.msg('Item written to MongoDB database ',level=log.DEBUG, spider=spider)
        return item

设置:

BOT_NAME = 'test1'

SPIDER_MODULES = ['test1.spiders']
NEWSPIDER_MODULE = 'test1.spiders'


ROBOTSTXT_OBEY = True

ITEM_PIPELINES = ['test1.pipelines.Spider1Pipeline',]

MONGODB_SERVER='localhost'
MONGODB_PORT=27017
MONGODB_DB='test1'
MONGODB_COLLECTION='user_info'

我得到的错误是这样的:

AttributeError: 'list' object has no attribute 'iteritems'

我真的很困惑。 Plz帮帮我。

2 个答案:

答案 0 :(得分:0)

您的管道看起来不错。你的蜘蛛有点奇怪。这是一个更好的版本:

import scrapy
from scrapy import Request

class WebSpider(scrapy.Spider):
    name = "user_spider1"
    allowed_domains = ["stackoverflow.com"]
    start_urls = []

    def start_requests(self):
        for i in range(1,2):
            self.start_urls.append(  "http://stackoverflow.com/users?page="+ str(i)+"&tab=reputation&filter=week")
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse(self, response):
        userSubUrl = response.xpath('//div[@class="user-details"]/a/@href').extract()
        baseUrl = 'http://stackoverflow.com'
        for subUrl in userSubUrl:
            yield Request(baseUrl+subUrl, callback=self.parse_userinfo)

    def parse_userinfo(self,response):
        item = {}

        stringprocessor = lambda x: x
        item['user_name'] = stringprocessor(str(response.xpath('//h2[@class="user-card-name"]/text()[1]').extract_first()))
        item['user_location'] = stringprocessor(str(response.xpath('//ul[@class="list-unstyled"]/li[1]/text()[2]').extract_first()))
        item['user_reputation'] = stringprocessor(str(response.xpath('//div[@class="reputation"]/text()[1]').extract_first()))
        tags = response.xpath('//div[@class="tag-container row"]/div/a[@class="post-tag"]/text()').extract_first()

        item['user_tags'] = tags
        yield item

答案 1 :(得分:0)

我有同样的问题 替换你的清单

ITEM_PIPELINES = ['test1.pipelines.Spider1Pipeline',]

by dict

ITEM_PIPELINES = {'test1.pipelines.Spider1Pipeline':300}

“您在此设置中为类分配的整数值决定了它们运行的​​顺序,从订单号从低到高的顺序通过管道。通常在0-1000范围内定义这些数字。”< / EM> 来源:http://doc.scrapy.org/en/0.24/topics/item-pipeline.html#activating-an-item-pipeline-component