Using the library:
from multiprocessing.dummy import Pool as ThreadPool
I am streaming tweets and then processing them as follows:
class StreamListener(tweepy.StreamListener):
    # regexes used to detect links in the tweet text
    httpsCheck = 'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
    httpCheck = 'http?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'

    def on_status(self, status):
        if status.retweeted:
            return
        tweetText = status.text.encode('utf8')
        created_at = status.created_at
        id = status.id
        # skip tweets that contain URLs
        if (re.findall(self.httpCheck, tweetText) or re.findall(self.httpsCheck, tweetText)):
            return
        # only analyze tweets that contain at least one Latin letter
        if (re.search('[a-zA-Z]', tweetText)):
            response = natural_language_understanding.analyze(
                text=tweetText,
                features=Features(
                    entities=EntitiesOptions(
                        emotion=True,
                        sentiment=True,
                        limit=2),
                    keywords=KeywordsOptions(
                        emotion=True,
                        sentiment=True,
                        limit=2)),
                language='en'
            )
            response["tweet"] = tweetText
            response["id"] = id
            response["created_at"] = created_at
            with open('#LFCROMA-SF2.csv', 'a') as csv_file:
                writer = csv.writer(csv_file)
                for key, value in response.items():
                    writer.writerow([key, value])

    def on_error(self, status_code):
        if status_code == 420:  # rate-limited: disconnect the stream
            return False
Now I want to use multi-threading, via a pool, to speed this up:
auth = tweepy.OAuthHandler(settingsTwitter.TWITTER_APP_KEY, settingsTwitter.TWITTER_APP_SECRET)
auth.set_access_token(settingsTwitter.TWITTER_KEY, settingsTwitter.TWITTER_SECRET)
api = tweepy.API(auth)
stream_listener = StreamListener()
stream = tweepy.Stream(auth=api.auth, listener=stream_listener)
pool = ThreadPool(4)
pool.map(stream.filter, languages=["en"], track=["#LFC"])
Doing this, I get the following error:
Traceback (most recent call last):
  File "atomic.py", line 66, in <module>
    pool.map(stream.filter, languages=["en"], track=["#LFC"])
TypeError: map() got an unexpected keyword argument 'languages'
I can't find a way to pass keyword arguments through pool.map().
The goal is to stream tweets, apply some heavy processing to them, and save the results. The bottleneck is that the streaming rate is far higher than the processing rate, which is why I want to process the tweets in a multi-threaded way.
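In other words, what I'm after is roughly a producer/consumer setup: the stream callback should only enqueue tweets, while a fixed set of worker threads drains the queue. A minimal sketch of that idea (heavy_processing is a made-up stand-in for the Watson analyze step):

import threading
import Queue  # Python 2 module name; it is 'queue' on Python 3

work = Queue.Queue()

def heavy_processing(text):
    pass  # hypothetical: the NLU analysis would go here

def worker():
    # each worker pulls tweets off the queue so the stream never blocks
    while True:
        tweet = work.get()
        heavy_processing(tweet)
        work.task_done()

for _ in range(4):
    t = threading.Thread(target=worker)
    t.daemon = True
    t.start()
# on_status would then just call work.put(tweetText) and return immediately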
Answer 0 (score: 1)
A warning first: from multiprocessing.pool import Pool as ThreadPool
will shadow the real pool.ThreadPool.
So import it as from multiprocessing.pool import Pool instead.
>>> help(Pool.map)
Help on function map in module multiprocessing.pool:

map(self, func, iterable, chunksize=None)
    Apply `func` to each element in `iterable`, collecting the results
    in a list that is returned.
You should pass map a function and an iterable.
So remove languages=['en'], because the map function has no languages parameter, and instead try
pool.map(function, <a_list you want to pass to function>)
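For example, keyword arguments can be bound to the callable with functools.partial before handing it to map. A minimal sketch (fetch and the second track term are illustrative, not part of the original code):

from functools import partial
from multiprocessing.dummy import Pool as ThreadPool

def fetch(track, languages=None):
    # made-up worker; stands in for whatever each task should run
    print("fetching %s in %s" % (track, languages))

pool = ThreadPool(4)
# bind the keyword argument once, then map over the per-task values
pool.map(partial(fetch, languages=['en']), [["#LFC"], ["#UCL"]])
pool.close()
pool.join()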
Coming to tweepy: once you have the Stream, follow the documentation and call
stream.filter(languages=['en'], track=["#LFC"])
You don't need pool.map here at all; stream.filter will deliver the results by itself.
To use a custom stream listener, change stream_listener = StreamListener() to stream_listener = CustomStreamListener(). If you still want to call pool.map, you can do it as follows:
with Pool(4) as p:
    # map() takes a callable plus an iterable of arguments; a process
    # Pool cannot pickle a lambda, so use multiprocessing.dummy for threads
    p.map(lambda kw: stream.filter(**kw), [{'languages': ['en'], 'track': ['#LFC']}])
The Twitter API comes with rate limits and access restrictions. If you are on the standard tier, you will get the following error:
An attempt has been made to start a new process before the current process has finished its bootstrapping phase.
This is because you are creating threads and trying to call the same API several times, which is forbidden with a single access token.
If you have multiple API access accounts, then you can do something like this:
auth_list = [auth1, auth2, auth3, auth4]  # ... add more if you have them
def run_stream(auth):  # one Stream, with its own credentials, per worker
    tweepy.Stream(auth=auth, listener=StreamListener()).filter(languages=['en'], track=["#LFC"])
with Pool(4) as p:
    p.map(run_stream, auth_list)
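Keep in mind that filter() blocks until its stream disconnects, so each pool worker ends up running one stream indefinitely, and p.map only returns once all of them stop.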
Please check whether the enterprise version is free of these limitations.
Note: to avoid being blocked, use wait_on_rate_limit=True, wait_on_rate_limit_notify=True:
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
Answer 1 (score: 0)
The best way to solve this turned out to be splitting the pipeline: one script streams the tweets into a file, and a second script reads that file in a tail -f fashion and processes the tweets in parallel (multi-threaded), which ensures there is no processing bottleneck. To understand this, start with the streaming code:
# -*- coding: utf-8 -*-
import re
import csv
import json
import settingsWatson
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 import Features, EntitiesOptions, KeywordsOptions, CategoriesOptions
import os
import time
import settingsTwitter
import tweepy
import datetime
from multiprocessing.dummy import Pool as ThreadPool
natural_language_understanding = NaturalLanguageUnderstandingV1(
    username=settingsWatson.username,
    password=settingsWatson.password,
    version='2018-03-16')
class StreamListener(tweepy.StreamListener):
    tweet = {}
    httpsCheck = 'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
    httpCheck = 'http?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
    idSelf = 0
    fieldNames = ['tweet', 'sequence', 'created_at', 'id']

    def on_status(self, status):
        if status.retweeted:
            return
        tweetText = status.text.encode('utf8')
        created_at = status.created_at
        id = status.id
        if (re.findall(self.httpCheck, tweetText) or re.findall(self.httpsCheck, tweetText)):
            return
        if (re.search('[a-zA-Z]', tweetText)):
            self.idSelf += 1
            self.tweet["tweet"] = tweetText
            self.tweet["id"] = id
            self.tweet["sequence"] = self.idSelf
            self.tweet["created_at"] = created_at
            with open('#ELCLASICO-2018-05-07.csv', 'a') as csv_file:
                #json.dump(self.tweet, json_file, sort_keys=True, indent=4, default = str)
                writer = csv.DictWriter(csv_file, self.tweet.keys())
                #for key, value in self.tweet.items():
                #a = [self.tweet]
                #print a[0]['tweet']
                writer.writerow(self.tweet)

    def on_error(self, status_code):
        if status_code == 420:
            return False
auth = tweepy.OAuthHandler(settingsTwitter.TWITTER_APP_KEY, settingsTwitter.TWITTER_APP_SECRET)
auth.set_access_token(settingsTwitter.TWITTER_KEY, settingsTwitter.TWITTER_SECRET)
api = tweepy.API(auth)
print "Twitter API Authentication is successful!"
stream_listener = StreamListener()
stream = tweepy.Stream(auth=api.auth, listener=stream_listener)
print "Streaming begins!"
def startStream():
    while True:
        try:
            stream = tweepy.Stream(auth=api.auth, listener=stream_listener)
            stream.filter(languages=["en"], track=["#ElClasico"])
        except:
            # restart the stream on any error
            continue

startStream()
The code above streams tweets that match certain conditions and saves them to a CSV file.
However, that was never the real problem. The problem was my "atomic" approach: instead of putting the whole pipeline (stream, process, save) into a single process, I should avoid the bottleneck with a distributed approach, where streaming is one task and processing is another. Note that even this split fails if the processing stays sequential, because processing will never match the speed of streaming; to overcome that, I applied multi-threading on top of the distributed setup.
Here is the code that does the processing:
# -*- coding: utf-8 -*-
import re
import csv
import json
import settingsWatson
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 import Features, EntitiesOptions, KeywordsOptions, CategoriesOptions
import os
import time
import settingsTwitter
import tweepy
import datetime
from multiprocessing.dummy import Pool as ThreadPool
natural_language_understanding = NaturalLanguageUnderstandingV1(
    username=settingsWatson.username,
    password=settingsWatson.password,
    version='2018-03-16')
class FileTailer(object):
    def __init__(self, file, delay=1):
        self.file = file
        self.delay = delay

    def __iter__(self):
        while True:
            where = self.file.tell()
            line = self.file.readline()
            if line and line.endswith('\n'):  # only emit full lines
                yield line
                #response = self.naturalLanguageProcessing(line)
            else:
                print "Waiting for new line"
                # for a partial line, pause and back up
                time.sleep(self.delay)  # ...not actually a recommended approach.
                self.file.seek(where)
class watson:
    entityDict = {"Messi": ["Lionel Messi", "Leo", "Messi"], "Ronaldo": ["Cristiano Ronaldo", "Cristiano", "Ronaldo"], "Iniesta": ["Andres Iniesta", "Iniesta"], "Barcelona": ["Barca", "Barcelona", "FC Barcelona", "#FCBarcelona"], "Real Madrid": ["Real Madrid", "Madrid", "#RMA", "#RealMadrid"]}
    date = "2018-05-06"

    def createFiles(self):
        for entity in self.entityDict:
            fileName = str(entity) + "-" + str(self.date) + ".csv"
            print fileName
            with open(fileName, 'wb') as myFile:
                wr = csv.writer(myFile, quoting=csv.QUOTE_ALL)
        print "This is the entity defined list:"
        print self.entityDict
        for i in self.entityDict:
            for j in self.entityDict[i]:
                print j

    def naturalLanguageProcessing(self, tweetText):
        print "NLP is called with this text: " + tweetText
        try:
            response = natural_language_understanding.analyze(
                text=tweetText,
                features=Features(
                    entities=EntitiesOptions(
                        emotion=True,
                        sentiment=True,
                        limit=2),
                    keywords=KeywordsOptions(
                        emotion=True,
                        sentiment=True,
                        limit=2)),
                language='en'
            )
            response["tweet"] = tweetText
            self.saveResults(response)
        except:
            print "Error occured. Sleeping for one"
            time.sleep(1)
            return None

    def saveResults(self, response):
        print "Saving the results"
        entitiesTweet = response["entities"]
        print "Printing entitiesTweet"
        print entitiesTweet
        for entity in entitiesTweet:
            try:
                for i in self.entityDict:
                    for j in self.entityDict[i]:
                        if (j == entity["text"]):
                            fileName = str(self.entityDict[i]) + "-" + str(self.date) + ".csv"
                            with open(fileName, 'a') as myFile:
                                wr = csv.writer(myFile, quoting=csv.QUOTE_ALL)
                                entity["tweet"] = response["tweet"]
                                wr.writerow([entity])
            except Exception as e:
                print (e)
        #entityToBeInserted = entity
        #entityToBeInserted["tweet"] = response["tweet"]
        #fileName = str(entityItem) + "-" + str(date) + ".csv"
        #with open(fileName, 'a') as myFile:
        #wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        #wr.writerow([entityToBeInserted])
        with open('#ELCLASICO-2018-05-07-Watson.csv', 'a') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow([response])
    #def saveEntityResults(self, entities, date):
def initiator(farzi):
    # 'farzi' is a dummy argument; pool.map requires the worker to accept one
    csv_reader = csv.reader(FileTailer(open('#ELCLASICO-2018-05-07.csv')))
    ob = watson()
    for row in csv_reader:
        tweet = row[1]
        ob.naturalLanguageProcessing(tweet)

pool = ThreadPool(4)
farzi = "farzi"
# mapping over the string runs initiator once per character, i.e. five workers
pool.map(initiator, farzi)
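One caveat with this trick: every worker opens its own FileTailer on the same CSV, so the same tweet can be analyzed more than once. A cleaner variant, sketched under the same file layout and untested against the scripts above, is a single tailer thread feeding a Queue that the worker threads drain:

import csv
import threading
import Queue  # Python 2 module name; use 'queue' on Python 3
from multiprocessing.dummy import Pool as ThreadPool

work = Queue.Queue()

def tail_into_queue():
    # single producer: only one thread reads the file, so no duplicates
    for row in csv.reader(FileTailer(open('#ELCLASICO-2018-05-07.csv'))):
        work.put(row[1])

def consume(_):
    # each consumer owns its watson instance and pulls tweets forever
    ob = watson()
    while True:
        ob.naturalLanguageProcessing(work.get())
        work.task_done()

threading.Thread(target=tail_into_queue).start()
pool = ThreadPool(4)
pool.map_async(consume, range(4))  # four parallel consumers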