I'm writing a web spider to collect user information from Stack Overflow, and I'm trying to store the data in MongoDB. Here is my spider code, which works fine:
from scrapy import Spider, Request
from lxml import etree

class webSpider(Spider):
    name = "user_spider1"
    allowed_domains = ["stackoverflow.com"]
    start_urls = []

    def start_requests(self):
        for i in range(1, 2):
            self.start_urls.append("http://stackoverflow.com/users?page=" + str(i) + "&tab=reputation&filter=week")
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse(self, response):
        htmlTxt = response.body
        baseDomain = etree.HTML(htmlTxt)
        userSubUrl = baseDomain.xpath('//div[@class="user-details"]/a/@href')
        baseUrl = 'http://stackoverflow.com'
        for subUrl in userSubUrl:
            yield Request(baseUrl + subUrl, callback=self.parse_userinfo)

    def parse_userinfo(self, response):
        htmlTxt = response.body
        infoDomain = etree.HTML(htmlTxt)
        item['user_name'] = stringprocessor(str(infoDomain.xpath('//h2[@class="user-card-name"]/text()[1]')))
        item['user_location'] = stringprocessor(str(infoDomain.xpath('//ul[@class="list-unstyled"]/li[1]/text()[2]')))
        item['user_reputation'] = stringprocessor(str(infoDomain.xpath('//div[@class="reputation"]/text()[1]')))
        tags = infoDomain.xpath('//div[@class="tag-container row"]/div/a[@class="post-tag"]/text()')
        item['user_tags'] = tags
        yield item
Here is my pipeline file and my settings, which are probably where the mistake is:
import pymongo
from scrapy import log
from scrapy.conf import settings

class Spider1Pipeline(object):
    def __init__(self):
        connection = pymongo.Connection(
            settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        log.msg('Item written to MongoDB database', level=log.DEBUG, spider=spider)
        return item
Settings:
BOT_NAME = 'test1'
SPIDER_MODULES = ['test1.spiders']
NEWSPIDER_MODULE = 'test1.spiders'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = ['test1.pipelines.Spider1Pipeline',]
MONGODB_SERVER='localhost'
MONGODB_PORT=27017
MONGODB_DB='test1'
MONGODB_COLLECTION='user_info'
The error I get is this:
AttributeError: 'list' object has no attribute 'iteritems'
I'm really confused. Please help me.
Answer 0 (score: 0)
Your pipeline looks fine, but your spider is a bit odd. Here is a better version:
import scrapy
from scrapy import Request

class WebSpider(scrapy.Spider):
    name = "user_spider1"
    allowed_domains = ["stackoverflow.com"]
    start_urls = []

    def start_requests(self):
        for i in range(1, 2):
            self.start_urls.append("http://stackoverflow.com/users?page=" + str(i) + "&tab=reputation&filter=week")
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse(self, response):
        userSubUrl = response.xpath('//div[@class="user-details"]/a/@href').extract()
        baseUrl = 'http://stackoverflow.com'
        for subUrl in userSubUrl:
            yield Request(baseUrl + subUrl, callback=self.parse_userinfo)

    def parse_userinfo(self, response):
        item = {}
        stringprocessor = lambda x: x  # placeholder for your own string-cleaning helper
        item['user_name'] = stringprocessor(str(response.xpath('//h2[@class="user-card-name"]/text()[1]').extract_first()))
        item['user_location'] = stringprocessor(str(response.xpath('//ul[@class="list-unstyled"]/li[1]/text()[2]').extract_first()))
        item['user_reputation'] = stringprocessor(str(response.xpath('//div[@class="reputation"]/text()[1]').extract_first()))
        tags = response.xpath('//div[@class="tag-container row"]/div/a[@class="post-tag"]/text()').extract()
        item['user_tags'] = tags
        yield item
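If you would rather yield a proper Item class than the plain dict used above (your original spider yields an undefined item, so presumably you already have an items.py), a minimal sketch could look like this. The class name UserItem is an assumption; only the field names come from the keys used in the spider:

    import scrapy

    # Hypothetical items.py matching the keys filled in parse_userinfo.
    class UserItem(scrapy.Item):
        user_name = scrapy.Field()
        user_location = scrapy.Field()
        user_reputation = scrapy.Field()
        user_tags = scrapy.Field()

The spider would then do item = UserItem() instead of item = {}, and the pipeline's dict(item) call keeps working unchanged.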
Answer 1 (score: 0)
I had the same problem. Scrapy expects ITEM_PIPELINES to be a dict and iterates over it with iteritems(), which is why passing a list raises this AttributeError. Replace your list
ITEM_PIPELINES = ['test1.pipelines.Spider1Pipeline',]
with a dict:
ITEM_PIPELINES = {'test1.pipelines.Spider1Pipeline':300}
"The integer values you assign to classes in this setting determine the order in which they run: items go through pipelines from lower valued to higher valued classes. It's customary to define these numbers in the 0-1000 range." Source: http://doc.scrapy.org/en/0.24/topics/item-pipeline.html#activating-an-item-pipeline-component
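As a concrete illustration of that ordering, a settings.py with two pipelines might look like the sketch below. Only Spider1Pipeline comes from the question; the second pipeline name and both priority values are hypothetical and only there to show how the numbers control execution order:

    # settings.py (sketch) -- ITEM_PIPELINES must be a dict, not a list.
    ITEM_PIPELINES = {
        'test1.pipelines.Spider1Pipeline': 300,    # lower number, runs first
        'test1.pipelines.SomeOtherPipeline': 800,  # hypothetical later stage, runs after
    }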