前言:这段代码来自 GitHub / YouTube 上的一位热心人:https://github.com/the-javapocalypse/
我做了一些小的调整供个人使用。
我在对推特做情绪分析时一直遇到的一个问题是:机器人发的帖子太多了。我想,既然无法完全避开机器人,也许可以通过删除重复的推文来抵消它们的影响。
例如,对于“#bitcoin”或“#btc”,机器人帐户会用许多不同的用户名发布内容完全相同的推文。它可能会说:“它正在登月!现在购买#btc,否则后悔终生!买,买,买!这是我个人网站的链接[在这里插入个人网站网址]”
这看起来是一条情绪积极的推文。如果有25个帐户各发布2次,而我只分析最近500条包含“#btc”的推文,那么积极情绪的占比就会被人为抬高。
所以我的问题:
请参阅下面的整个脚本:
import tweepy
import csv
import re
from textblob import TextBlob
import matplotlib.pyplot as plt
class SentimentAnalysis:
    """Download tweets for a search term and report their overall sentiment.

    Tweets are fetched through tweepy, scored with TextBlob, appended in
    cleaned form to ``result.csv``, summarised on stdout and finally drawn
    as a pie chart with matplotlib.
    """

    # Matches, in order: @mentions, any single character that is not an ASCII
    # letter/digit/space/tab, and scheme://... URLs. Compiled once because
    # cleanTweet runs for every tweet.
    _NOISE_PATTERN = re.compile(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)")

    def __init__(self):
        # Iterable of tweet objects produced by the most recent search.
        self.tweets = []
        # Cleaned text of every tweet processed so far (one CSV row).
        self.tweetText = []

    def DownloadData(self):
        """Prompt for a search term, analyse matching tweets and report.

        Reads the search term and the number of tweets to request from
        stdin, appends one row of cleaned tweet texts to ``result.csv``,
        prints a summary and shows a pie chart. Nothing is plotted when the
        search yields no tweets.

        Raises:
            ValueError: if the requested tweet count is not an integer.
        """
        # authenticating -- fill in your own Twitter API credentials
        consumerKey = ''
        consumerSecret = ''
        accessToken = ''
        accessTokenSecret = ''
        auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
        auth.set_access_token(accessToken, accessTokenSecret)
        api = tweepy.API(auth)

        # input for term to be searched and how many tweets to search
        searchTerm = input("Enter Keyword/Tag to search about: ")
        NoOfTerms = int(input("Enter how many tweets to search: "))

        # searching for tweets
        # NOTE(review): newer tweepy releases expose this endpoint as
        # ``api.search_tweets`` -- confirm against the installed version.
        self.tweets = tweepy.Cursor(
            api.search, q=searchTerm, lang="en").items(NoOfTerms)

        polarity = 0
        positive = 0
        negative = 0
        neutral = 0
        # The search may return fewer tweets than requested, so count what
        # was really analysed and base every statistic on that number.
        analysed = 0

        for tweet in self.tweets:
            # Keep the cleaned text as str; the file object handles encoding.
            self.tweetText.append(self.cleanTweet(tweet.text))
            score = TextBlob(tweet.text).sentiment.polarity
            polarity += score
            analysed += 1
            if score == 0:
                neutral += 1
            elif score > 0.0:
                positive += 1
            else:
                negative += 1

        # newline='' is what the csv module expects; the context manager
        # closes the file even if writing fails.
        with open('result.csv', 'a', newline='', encoding='utf-8') as csvFile:
            csv.writer(csvFile).writerow(self.tweetText)

        if analysed == 0:
            # Nothing to average and nothing to plot.
            print("No tweets found for " + searchTerm)
            return

        # share of each reaction, as formatted percentage strings
        positive = self.percentage(positive, analysed)
        negative = self.percentage(negative, analysed)
        neutral = self.percentage(neutral, analysed)

        # average polarity over the tweets that were actually analysed
        polarity = polarity / analysed

        print("How people are reacting on " + searchTerm +
              " by analyzing " + str(analysed) + " tweets.")
        print()
        print("General Report: ")
        if polarity == 0:
            print("Neutral")
        elif polarity > 0.0:
            print("Positive")
        else:
            print("Negative")
        print()
        print("Detailed Report: ")
        print(str(positive) + "% positive")
        print(str(negative) + "% negative")
        print(str(neutral) + "% neutral")

        self.plotPieChart(positive, negative, neutral, searchTerm, analysed)

    def cleanTweet(self, tweet):
        """Return ``tweet`` without @mentions, links and special characters.

        Every match of the noise pattern is replaced by a space and the
        remaining words are re-joined with single spaces.
        """
        return ' '.join(self._NOISE_PATTERN.sub(" ", tweet).split())

    def percentage(self, part, whole):
        """Return ``part`` as a percentage of ``whole`` with two decimals.

        The result is a string such as ``'25.00'``. A ``whole`` of zero
        yields ``'0.00'`` instead of raising ``ZeroDivisionError``.
        """
        if not float(whole):
            return format(0.0, '.2f')
        return format(100 * float(part) / float(whole), '.2f')

    def plotPieChart(self, positive, negative, neutral, searchTerm, noOfSearchTerms):
        """Show a pie chart of the positive/neutral/negative percentages.

        ``positive``, ``negative`` and ``neutral`` may be numbers or the
        formatted strings returned by ``percentage``.
        """
        labels = ['Positive [' + str(positive) + '%]', 'Neutral [' + str(neutral) + '%]',
                  'Negative [' + str(negative) + '%]']
        # Convert explicitly so the wedges are sized from numbers, not text.
        sizes = [float(positive), float(neutral), float(negative)]
        colors = ['yellowgreen', 'gold', 'red']
        patches, texts = plt.pie(sizes, colors=colors, startangle=90)
        plt.legend(patches, labels, loc="best")
        plt.title('How people are reacting on ' + searchTerm +
                  ' by analyzing ' + str(noOfSearchTerms) + ' Tweets.')
        plt.axis('equal')
        plt.tight_layout()
        plt.show()
if __name__ == "__main__":
    # Run the interactive analysis only when executed as a script, so that
    # importing this module has no side effects.
    sa = SentimentAnalysis()
    sa.DownloadData()
答案 0 :(得分:2)
您可以使用下面这一行代码(one-liner)删除重复项。
# Key on the tweet text so identical posts from different accounts collapse
# into one entry; a dict (unlike a set) also keeps first-seen order.
self.tweets = list({tweet.text: tweet for tweet in self.tweets}.values())
这将删除所有重复的推文。如果您想看看它是如何工作的,下面是一个简单的示例:
>>> tweets = ['this is a tweet', 'this is a tweet', 'Yet another Tweet', 'this is a tweet']
>>> print(tweets)
['this is a tweet', 'this is a tweet', 'Yet another Tweet', 'this is a tweet']
>>> tweets = list(set(tweets))
>>> print(tweets)
['this is a tweet', 'Yet another Tweet']
由于您已经删除了重复项,因此可以通过计算 NoOfTerms 与 len(self.tweets) 的差值来获取已删除的推文数量:
# self.tweets is a list at this point, so compare its length (not the list
# itself) with the requested count; ``int - list`` would raise TypeError.
tweets_to_further_scrape = NoOfTerms - len(self.tweets)
现在,您可以再抓取 tweets_to_further_scrape 条推文,并重复这一“去重—再抓取”的过程,直到获得所需数量的不重复推文。
迭代推文列表时,添加此行以删除外部链接。
# Rebuild the text from its whitespace-separated tokens, leaving out every
# token that contains "http" (i.e. external links).
tweet.text = ' '.join(filter(lambda word: 'http' not in word, tweet.text.split()))
希望这会对您有所帮助。编码愉快!
答案 1 :(得分:0)
您可以使用 defaultdict 简单地记录每条推文出现的次数。您可能还需要去掉网址,以防机器人每次都附上新生成的短链接。
from collections import defaultdict
def __init__(self):
    """Set up per-instance state, including the duplicate counter."""
    ...  # the rest of the original initialisation is elided in this sketch
    # Bind the counter to the instance: track_tweet reads self.tweet_count,
    # and a plain local would be discarded as soon as __init__ returns.
    # Maps normalized tweet text -> number of times it has been seen.
    self.tweet_count = defaultdict(int)
def track_tweet(self, tweet):
    """Record one occurrence of ``tweet`` and return its running count.

    Args:
        tweet: raw tweet text (a string); it is normalized with
            ``self.clean_tweet`` before being counted.

    Returns:
        How many times the normalized text has been seen so far, including
        this call. A value above 1 means the tweet is a repeat.
    """
    normalized = self.clean_tweet(tweet)
    self.tweet_count[normalized] += 1
    return self.tweet_count[normalized]
def clean_tweet(self, tweet):
    """Normalize tweet text so trivially different copies compare equal.

    Args:
        tweet: raw tweet text (a string).

    Returns:
        The text lower-cased, with http(s) URLs removed and runs of
        whitespace collapsed to single spaces.
    """
    # Drop URLs before comparing: reposts of the same text often carry a
    # freshly shortened link, which would otherwise make each copy unique.
    without_urls = re.sub(r"https?://\S+", " ", tweet)
    return " ".join(without_urls.lower().split())
def DownloadData(self):
    """Sketch of the download loop with duplicate filtering added.

    Authentication, searching and per-tweet analysis are elided (``...``)
    exactly as in the original sketch; only the duplicate check is shown.
    """
    ...  # authentication, search and CSV setup elided
    for tweet in self.tweets:
        # Count this tweet, then look up how often its normalized text has
        # been seen; anything above one is a repeat and is skipped.
        self.track_tweet(tweet.text)
        if self.tweet_count[self.clean_tweet(tweet.text)] > 1:
            continue
        ...  # per-tweet sentiment analysis elided