Pipelines.py
class DotabuffPipeline(object):
def open_spider(self, spider):
self.match_dict = {}
def process_item(self, item, spider):
ID = item['matchID']
if ID in self.match_dict:
self.match_dict[ID] = self.match_dict[ID] + 1
if self.match_dict[ID]==5:
return item
else:
self.match_dict[ID] = 1
firstspider.py
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
import json
from dotabuff.items import DotabuffItem
class DotaSpider(CrawlSpider):
name = "dotaspider"
allow_domains = ['www.dotabuff.com']
start_urls = []
with open('spiders/Steam.json','r') as f:
steam_data = json.load(f)
f.close
steam_members = steam_data['members']
for member in steam_members:
url = 'http://www.dotabuff.com/players/%s/matches?page=1' %str(member-76561197960265728)
start_urls.append(url)
rules = (Rule(LinkExtractor(allow=(r'http://www.dotabuff.com/players/\d+/matches\?page=\d+')), callback="parse_item", follow= True),)
def parse_item(self, response):
sel = Selector(response)
matches = sel.xpath('//td[@class="cell-large"]/a/@href').extract()
for match in matches:
item = DotabuffItem()
match = match.split('/')[-1]
item['matchID'] = match
yield item
我在www.dotabuff.com上搜索一些比赛号码,我在json中有五个蒸汽ID。我想找出我们五人一起比赛的比赛。所以我定义了一个用作计数器的dict来计算出现次数。但它不起作用。
Traceback (most recent call last):
File "e:\anaconda2\lib\site-packages\twisted\internet\defer.py", line 150, in
maybeDeferred
result = f(*args, **kw)
File "e:\anaconda2\lib\site-packages\scrapy\xlib\pydispatch\robustapply.py", l
ine 57, in robustApply
return receiver(*arguments, **named)
File "e:\anaconda2\lib\site-packages\scrapy\extensions\feedexport.py", line 19
3, in item_scraped
slot.exporter.export_item(item)
File "e:\anaconda2\lib\site-packages\scrapy\exporters.py", line 111, in export
_item
itemdict = dict(self._get_serialized_fields(item))
File "e:\anaconda2\lib\site-packages\scrapy\exporters.py", line 63, in _get_se
rialized_fields
field_iter = six.iterkeys(item)
File "e:\anaconda2\lib\site-packages\six.py", line 593, in iterkeys
return d.iterkeys(**kw)
AttributeError: 'NoneType' object has no attribute 'iterkeys'
答案 0 :(得分:0)
查看scrapy here中管道的文档,它说
为每个项目管道组件调用此方法,并且必须 或者返回带有数据的Dict,Item(或任何后代类)对象 或者引发DropItem异常。
您的process_item方法不遵守此规则,并且可以返回None
,这是不可迭代的。