I took some code from the web to do sentiment analysis on a Twitter dataset. When I first tried to run it I got print errors, which I traced to newer Python versions having changed how print works. Now the behaviour suggests my data is not being filled into the arrays. If anyone who works with Python has eagle eyes, please help me spot where I went wrong.
import numpy as np
from copy import deepcopy
from string import punctuation
from random import shuffle
import chardet
from sklearn.manifold import TSNE
from sklearn.preprocessing import scale
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
import gensim
from gensim.models.word2vec import Word2Vec
LabeledSentence = gensim.models.doc2vec.LabeledSentence
import pandas as pd
pd.options.mode.chained_assignment = None
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential  # Sequential and Dense are used for the classifier in __main__
from keras.layers import Dense

def ingest(filename):
    with open(filename, 'rb') as f:
        result = chardet.detect(f.read())
    data = pd.read_csv(filename, encoding=result['encoding'])
    data.drop(['ItemID', 'Date', 'Blank', 'SentimentSource'], axis=1, inplace=True)
    data = data[data.Sentiment.isnull() == False]
    data['Sentiment'] = data['Sentiment'].map({4:1, 0:0})
    data = data[data['SentimentText'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    print('dataset loaded with shape {}', format(data.shape))
    return data

def tokenize(tweet):
    try:
        tweet = unicode(tweet.decode('utf-8').lower())
        tokens = tokenizer.tokenize(tweet)
        tokens = filter(lambda t: not t.startswith('@'), tokens)
        tokens = filter(lambda t: not t.startswith('#'), tokens)
        tokens = filter(lambda t: not t.startswith('http'), tokens)
        return tokens
    except:
        return 'NC'

def postprocess(data, n=100):
    data = data.head(n)
    data['tokens'] = data['SentimentText'].progress_map(tokenize)
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data

def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in enumerate(tweets):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
        print(":::::::::::::::::::::::::")
    return labelized

def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

if __name__ == '__main__':
    filename = './training.csv'
    #n = 1000000
    n = 100
    n_dim = 200
    data = ingest(filename)
    #data = data.head(5)
    data = postprocess(data, n)
    x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens), np.array(data.head(n).Sentiment), test_size=0.2)
    print("training length X", len(x_train))
    print("training length Y", len(y_train))
    x_train = labelizeTweets(x_train, 'TRAIN')
    x_test = labelizeTweets(x_test, 'TEST')
    print("jljkjkjlkjlj", len(x_train))
    tweet_w2v = Word2Vec(size=n_dim, min_count=10)
    #tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
    tweet_w2v.build_vocab([x.words for x in x_train])
    #tweet_w2v.train([x.words for x in tqdm(x_train)],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
    tweet_w2v.train([x.words for x in x_train],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
    print(tweet_w2v.most_similar('good'))
    if True:
        print('building tf-idf matrix ...')
        vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
        matrix = vectorizer.fit_transform([x.words for x in x_train])
        tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
        print('vocab size :', len(tfidf))
    train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
    train_vecs_w2v = scale(train_vecs_w2v)
    test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
    test_vecs_w2v = scale(test_vecs_w2v)
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=200))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(train_vecs_w2v, y_train, epochs=20, batch_size=32, verbose=2)
    score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
    print(score[1])
    output_notebook()
    plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
                           tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                           x_axis_type=None, y_axis_type=None, min_border=1)
    word_vectors = [tweet_w2v[w] for w in tweet_w2v.wv.vocab.keys()[:5000]]
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
    tsne_w2v = tsne_model.fit_transform(word_vectors)
    tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
    tsne_df['words'] = tweet_w2v.wv.vocab.keys()[:5000]
    plot_tfidf.scatter(x='x', y='y', source=tsne_df)
    hover = plot_tfidf.select(dict(type=HoverTool))
    hover.tooltips={"word": "@words"}
    show(plot_tfidf)
This is the error I get:
C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
dataset loaded with shape {} (505, 2)
progress-bar: 100%|##########################################################################| 505/505 [00:00<?, ?it/s]
training length X 0
training length Y 0
0it [00:00, ?it/s]
0it [00:00, ?it/s]
jljkjkjlkjlj 0
Traceback (most recent call last):
  File "Sentiment_Analysis.py", line 127, in <module>
    tweet_w2v.train([x.words for x in x_train],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
  File "C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\models\word2vec.py", line 951, in train
    raise RuntimeError("you must first build vocabulary before training the model")
RuntimeError: you must first build vocabulary before training the model
Answer (score: 1)
I had the same problem with the same code. There is nothing wrong with the code from the site itself, but no matter how you arrange it, it comes back with an empty vocabulary.
My workaround was to run the same code under Python 2.7 instead of 3.x, where it runs smoothly. However, if you do manage to port it to Python 3.x successfully, you get faster data/memory access.
EDIT: Found the problem, and it now works under Python 3 as well. Change the corresponding snippet as shown below and the vocabulary should build without any issues.
def tokenize(tweet):
    try:
        tweet = tweet.lower()  # Python 3 strings are already unicode, so no decode() call is needed
        tokens = tokenizer.tokenize(tweet)
        tokens = list(filter(lambda t: not t.startswith('@'), tokens))
        tokens = list(filter(lambda t: not t.startswith('#'), tokens))
        tokens = list(filter(lambda t: not t.startswith('http'), tokens))
        return tokens
    except:
        return 'NC'
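
For what it's worth, here is a minimal sketch (my own illustration, not part of the original answer) of why the list() wrapper matters: in Python 2, filter() returns a list, but in Python 3 it returns a lazy iterator that is exhausted after a single pass, so token sequences built with a bare filter() look empty the second time anything iterates over them.

# Illustration only: Python 3 filter() is lazy and single-use
tokens = ['@user', 'great', '#topic', 'movie']

lazy = filter(lambda t: not t.startswith('@'), tokens)
print(list(lazy))   # ['great', '#topic', 'movie']
print(list(lazy))   # [] -- the iterator is already consumed

eager = list(filter(lambda t: not t.startswith('@'), tokens))
print(eager)        # ['great', '#topic', 'movie'], every time it is read

Materialising the tokens into real lists up front means they can be iterated as many times as the training pipeline needs.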