I'm trying to crawl a peer friendship network: I need to pick a given user's five most popular followers and then crawl through each of their follower populations in turn. When I try to fetch the data as a list for the crawl function, I get the errors below.
TypeError: document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping

During handling of the above exception, another exception occurred:

pymongo.errors.ServerSelectionTimeoutError: localhost:27017: [Errno 61] Connection refused

Process finished with exit code 1
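From the PyMongo docs, insert_many expects an iterable of documents, while insert_one takes a single dict. My guess is that passing one dict to insert_many makes PyMongo iterate over its keys, which are plain strings, and that is exactly the TypeError above. A minimal sketch of what I think is happening (test_db and test_coll are just placeholder names):

import pymongo

coll = pymongo.MongoClient()['test_db']['test_coll']
doc = {'followers': [1, 2, 3]}

coll.insert_many(doc)  # iterates the dict's keys (strings) -> TypeError, raised client-side
coll.insert_one(doc)   # the right call for a single dict, but it needs a reachable mongod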
Here is my code:
import pymongo
import twitter_network  # my helper module with the Twitter recipes

def save_to_mongo(data, mongo_db, mongo_db_coll, **mongo_conn_kw):
    client = pymongo.MongoClient(**mongo_conn_kw)
    db = client[mongo_db]
    coll = db[mongo_db_coll]
    try:
        # insert_many expects an iterable of documents (a list of dicts)
        return coll.insert_many(data)
    except:
        # fall back to a single-document insert
        return coll.insert_one(data)
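My first thought is to branch on the payload type instead of relying on the bare except. This is only a sketch (save_to_mongo_v2 is a placeholder name, and it assumes data is always either one dict or a list of dicts):

def save_to_mongo_v2(data, mongo_db, mongo_db_coll, **mongo_conn_kw):
    client = pymongo.MongoClient(**mongo_conn_kw)
    coll = client[mongo_db][mongo_db_coll]
    if isinstance(data, list):
        return coll.insert_many(data)  # bulk insert for a list of documents
    return coll.insert_one(data)       # single-document insert for one dict

The rest of my code follows.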
def load_from_mongo(mongo_db, mongo_db_coll, return_cursor=False,
                    criteria=None, projection=None, **mongo_conn_kw):
    client = pymongo.MongoClient(**mongo_conn_kw)
    db = client[mongo_db]
    coll = db[mongo_db_coll]
    if criteria is None:
        criteria = {}
    if projection is None:
        cursor = coll.find(criteria)
    else:
        cursor = coll.find(criteria, projection)
    if return_cursor:
        return cursor
    else:
        return [item for item in cursor]
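For context, I read results back roughly like this (the seed id in the collection name is just an example):

followers = load_from_mongo('followers_crawl', '12345-follower_ids',
                            projection={'followers': 1, '_id': 0})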
def pickFiveMostPopular(users):
    unsortedList_by_follower_count = []
    # profiles come back keyed by the user ids passed in
    p = twitter_network.get_user_profile(twitter_api, screen_names=None, user_ids=users)
    for user in users:
        unsortedList_by_follower_count.append((user, p[user]['followers_count']))
    sortedList_by_follower_count = sorted(unsortedList_by_follower_count,
                                          key=lambda x: x[1], reverse=True)
    top5 = [x[0] for x in sortedList_by_follower_count[:5]]
    print("The five most popular people who " + screen_name +
          " also follows are: " + getScreenName(top5))
    return top5
def crawl_followers(twitter_api, screen_name, limit=1000000, depth=3, **mongo_conn_kw):
    seed_id = str(twitter_api.users.show(screen_name=screen_name)['id'])
    #_, next_queue = twitter_network.get_friends_followers_ids(twitter_api, user_id=seed_id, friends_limit=0, followers_limit=limit)
    next_queue = pickFiveMostPopular(
        twitter_network.get_reciprocal_friends(twitter_api, screen_name=screen_name,
                                               friends_limit=0, followers_limit=limit))
    # this passes a single dict, so save_to_mongo hits insert_many first
    save_to_mongo({'followers': [_id for _id in next_queue]},
                  'followers_crawl', '{0}-follower_ids'.format(seed_id), **mongo_conn_kw)
    d = 1
    while d < depth:
        d += 1
        (queue, next_queue) = (next_queue, [])
        for fid in queue:
            _, follower_ids = twitter_network.get_friends_followers_ids(
                twitter_api, user_id=fid, friends_limit=0, followers_limit=limit)
            save_to_mongo({'followers': [_id for _id in follower_ids]},
                          'followers_crawl', '{0}-follower_ids'.format(fid))
            next_queue += follower_ids
    print("Done crawling!")
Please tell me how to correctly get the data into the save_to_mongo function. Any help is appreciated!
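Also, I suspect the ServerSelectionTimeoutError simply means mongod isn't running (or isn't reachable) on localhost:27017, since even insert_one needs a live server. A quick connectivity check I plan to run first (the 2000 ms timeout is an arbitrary choice):

client = pymongo.MongoClient(serverSelectionTimeoutMS=2000)
client.admin.command('ping')  # raises ServerSelectionTimeoutError if mongod is down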