Sentiment analysis code (word2vec) not working in my Python version (vocabulary not built)

Date: 2017-12-26 09:02:50

Tags: python twitter nltk sentiment-analysis word2vec

I took some code from the web to run sentiment analysis on a Twitter dataset. When I first tried to run it, it gave me print errors, which turned out to be because the newer Python version changed how print works. Now I get output showing that my data is not being filled into the arrays. If anyone who uses Python has an eagle eye and can spot where I'm going wrong, please help.

    import numpy as np 
    from copy import deepcopy
    from string import punctuation
    from random import shuffle
    import chardet
    from sklearn.manifold import TSNE
    from sklearn.preprocessing import scale


    import bokeh.plotting as bp
    from bokeh.models import HoverTool, BoxSelectTool
    from bokeh.plotting import figure, show, output_notebook

    import gensim
    from gensim.models.word2vec import Word2Vec 
    LabeledSentence = gensim.models.doc2vec.LabeledSentence 

    import pandas as pd 
    pd.options.mode.chained_assignment = None

    from tqdm import tqdm
    tqdm.pandas(desc="progress-bar")

    from nltk.tokenize import TweetTokenizer 
    tokenizer = TweetTokenizer()

    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer

    def ingest(filename):
        with open(filename, 'rb') as f:
            result = chardet.detect(f.read())
        data = pd.read_csv(filename, encoding=result['encoding'])
        data.drop(['ItemID', 'Date', 'Blank', 'SentimentSource'], axis=1, inplace=True)
        data = data[data.Sentiment.isnull() == False]
        data['Sentiment'] = data['Sentiment'].map({4:1, 0:0})
        data = data[data['SentimentText'].isnull() == False]
        data.reset_index(inplace=True)
        data.drop('index', axis=1, inplace=True)
        print('dataset loaded with shape {}', format(data.shape)) 

        return data

    def tokenize(tweet):
        try:
            tweet = unicode(tweet.decode('utf-8').lower())
            tokens = tokenizer.tokenize(tweet)
            tokens = filter(lambda t: not t.startswith('@'), tokens)
            tokens = filter(lambda t: not t.startswith('#'), tokens)
            tokens = filter(lambda t: not t.startswith('http'), tokens)
            return tokens
        except:
            return 'NC'

    def postprocess(data, n=100):
        data = data.head(n)
        data['tokens'] = data['SentimentText'].progress_map(tokenize)  
        data = data[data.tokens != 'NC']
        data.reset_index(inplace=True)
        data.drop('index', inplace=True, axis=1)
        return data


    def labelizeTweets(tweets, label_type):
        labelized = []
        for i,v in  enumerate(tweets):
            label = '%s_%s'%(label_type,i)
            labelized.append(LabeledSentence(v, [label]))
            print(":::::::::::::::::::::::::")
        return labelized


    def labelizeTweets(tweets, label_type):
        labelized = []
        for i,v in tqdm(enumerate(tweets)):
            label = '%s_%s'%(label_type,i)
            labelized.append(LabeledSentence(v, [label]))
        return labelized


    def buildWordVector(tokens, size):
        vec = np.zeros(size).reshape((1, size))
        count = 0.
        for word in tokens:
            try:
                vec += tweet_w2v[word].reshape((1, size)) * tfidf[word]
                count += 1.
            except KeyError: 

                continue
        if count != 0:
            vec /= count
        return vec



    if __name__ == '__main__':

        filename = './training.csv'

        #n = 1000000
        n = 100
        n_dim = 200

        data = ingest(filename)
        #data = data.head(5)
        data = postprocess(data, n)

        x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens), np.array(data.head(n).Sentiment), test_size=0.2)


        print("training length X", len(x_train))

        print("training length Y", len(y_train))


        x_train = labelizeTweets(x_train, 'TRAIN')
        x_test = labelizeTweets(x_test, 'TEST')

        print("jljkjkjlkjlj", len(x_train))

        tweet_w2v = Word2Vec(size=n_dim, min_count=10)
        #tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
        tweet_w2v.build_vocab([x.words for x in x_train])

        #tweet_w2v.train([x.words for x in tqdm(x_train)],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
        tweet_w2v.train([x.words for x in x_train],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)




        print(tweet_w2v.most_similar('good'))

        if True:
            print('building tf-idf matrix ...')
            vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
            matrix = vectorizer.fit_transform([x.words for x in x_train])
            tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
            print('vocab size :', len(tfidf))

            train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
            train_vecs_w2v = scale(train_vecs_w2v)

            test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
            test_vecs_w2v = scale(test_vecs_w2v)

            model = Sequential()
            model.add(Dense(32, activation='relu', input_dim=200))
            model.add(Dense(1, activation='sigmoid'))
            model.compile(optimizer='rmsprop',
                                        loss='binary_crossentropy',
                                        metrics=['accuracy'])

            model.fit(train_vecs_w2v, y_train, epochs=20, batch_size=32, verbose=2)

            score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
            print (score[1])

    output_notebook()
    plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
        x_axis_type=None, y_axis_type=None, min_border=1)

    word_vectors = [tweet_w2v[w] for w in tweet_w2v.wv.vocab.keys()[:5000]]

    tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
    tsne_w2v = tsne_model.fit_transform(word_vectors)

    tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
    tsne_df['words'] = tweet_w2v.wv.vocab.keys()[:5000]

    plot_tfidf.scatter(x='x', y='y', source=tsne_df)
    hover = plot_tfidf.select(dict(type=HoverTool))
    hover.tooltips={"word": "@words"}
    show(plot_tfidf)

Here is the error I get:

    C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
      warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
    dataset loaded with shape {} (505, 2)
    progress-bar: 100%|##########################################################################| 505/505 [00:00<?, ?it/s]
    training length X 0
    training length Y 0
    0it [00:00, ?it/s]
    0it [00:00, ?it/s]
    jljkjkjlkjlj 0
    Traceback (most recent call last):
      File "Sentiment_Analysis.py", line 127, in <module>
        tweet_w2v.train([x.words for x in x_train],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
      File "C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\models\word2vec.py", line 951, in train
        raise RuntimeError("you must first build vocabulary before training the model")
    RuntimeError: you must first build vocabulary before training the model

1 Answer:

Answer 0 (score: 1):

I had the same problem with the same code. There is nothing wrong with the code on the site itself, yet no matter how you arrange it, it returns an empty vocabulary.

My workaround was to run the same code in Python 2.7 instead of 3.x, where it runs smoothly. However, if you do manage to port it to Python 3.x successfully, you get faster data/memory access rates.

Edit: Found the problem, and it now works in Python 3 as well. In Python 3, `filter()` returns a lazy iterator rather than a list, and the `unicode()`/`.decode('utf-8')` call is Python 2 only (on Python 3 it raises an exception that the bare `except` silently turns into `'NC'` for every tweet). Edit the `tokenize` snippet as below and the vocabulary should build without any problem.

    def tokenize(tweet):
        try:
            # In Python 3 the tweet arrives from pandas as str, not bytes, so the
            # Python 2-only unicode()/decode() call is replaced with a plain lower().
            if isinstance(tweet, bytes):
                tweet = tweet.decode('utf-8')
            tweet = tweet.lower()
            tokens = tokenizer.tokenize(tweet)
            # filter() is lazy in Python 3; wrap it in list() so the token lists
            # are materialised before being stored in the DataFrame and re-iterated.
            tokens = list(filter(lambda t: not t.startswith('@'), tokens))
            tokens = list(filter(lambda t: not t.startswith('#'), tokens))
            tokens = list(filter(lambda t: not t.startswith('http'), tokens))
            return tokens
        except:
            return 'NC'
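
For reference, here is a minimal illustration (not part of the original answer) of the behaviour the `list()` wrapping guards against: in Python 2, `filter()` returns a list, while in Python 3 it returns a lazy, single-use iterator, so code that stores the result and iterates it more than once (as the DataFrame / `build_vocab` / `train` pipeline above does) ends up looking at exhausted iterators.

    tokens = ['@user', 'great', 'movie', '#fun']

    # Python 3: filter() yields a lazy iterator that can be consumed only once.
    lazy = filter(lambda t: not t.startswith('@'), tokens)
    print(list(lazy))   # ['great', 'movie', '#fun']
    print(list(lazy))   # []  -- already exhausted on the second pass

    # Wrapping it in list() materialises the result, matching Python 2's behaviour.
    eager = list(filter(lambda t: not t.startswith('@'), tokens))
    print(eager)        # ['great', 'movie', '#fun']
    print(eager)        # ['great', 'movie', '#fun']

With the token lists materialised (and the tokenizer no longer falling back to `'NC'` on every tweet), `postprocess` keeps its rows, `train_test_split` receives non-empty arrays, and `build_vocab` has actual sentences to count.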