我编写了以下代码来创建包含术语“happy”的推文中使用的术语词云。
import tweepy
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
from time import sleep
import json
import os
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from scipy.misc import imread
import time
import pandas as pd
consumer_key =''
consumer_secret =''
access_token =''
access_token_secret =''
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)
searchTerm ='happy'
MAX_TWEETS = 10
search_data=[ ]
current_working_dir = os.path.dirname(os.path.realpath(__file__))
current_working_dir = "./"
log_tweets = current_working_dir + str(time.time()) + '_searchtweets.txt'
with open(log_tweets, 'w') as outfile:
for tweet in tweepy.Cursor(api.search,q=searchTerm).items(MAX_TWEETS):
search_data.append(json.loads(json.dumps(tweet._json)))
outfile.write(json.dumps(tweet._json))
outfile.write("\n")
tweets = pd.DataFrame()
tweets['created_at'] = list(map(lambda tweet: time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y')), search_data))
tweets['user'] = list(map(lambda tweet: tweet['user']['screen_name'], search_data))
tweets['text'] = list(map(lambda tweet: tweet['text'].encode('utf-8'), search_data))
tweets['lang'] = list(map(lambda tweet: tweet['lang'], search_data))
tweets['Location'] = list(map(lambda tweet: tweet['place']['country'] if tweet['place'] != None else None, search_data))
tweets['retweet_count'] = list(map(lambda tweet: tweet['retweet_count'], search_data))
tweets['favorite_count'] = list(map(lambda tweet: tweet['favorite_count'], search_data))
tweets['long'] = list(map(lambda tweet: tweet['coordinates']['coordinates'][0] if tweet['coordinates'] != None else 'NaN', search_data))
tweets['latt'] = list(map(lambda tweet: tweet['coordinates']['coordinates'][1] if tweet['coordinates'] != None else 'NaN', search_data))
words = " ".join(tweets['text'].values.astype(str))
no_urls_no_tags = " ".join([word for word in words.split()
if 'http' not in word
and not word.startswith('@')
and word != 'RT'
])
search_mask = imread('images/twitter_mask.png', flatten=True )
wc = WordCloud(background_color="white",
font_path="/Library/Fonts/Verdana.ttf",
stopwords=STOPWORDS,
width=1800,
height=140,
mask=search_mask)
wc.generate(no_urls_no_tags)
plt.imshow(wc)
plt.axis("off")
plt.savefig('search_term_wordcloud_print.png', dpi =300)
plt.show()
运行时,我收到以下错误消息,
words = " ".join(tweets['text'].values.astype(str))
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 139: ordinal not in range(128)
是否有一种方法或代码行可以更改以阻止此错误发生?