Question

我有一个由新闻标题组成的 pandas 数据框（DataFrame）。我正在做一个简单的情感计算：先对标题进行分词，再将分词结果与正面和负面词表进行比较。我把每个标题的情感得分依次添加到一个列表中，然后将该列表作为新列追加到原始数据框，并保存为 Excel 文件。

生成的原始文件大约为 12 MB。虽然下面的代码可以运行，但速度很慢，花了几个小时才把文件完整读取并计算出得分。这是正常的吗？有什么办法可以加快这个过程吗？我知道在 pandas DataFrame 的列上做循环可能很慢——有哪些替代方案？

# -*- coding: utf-8 -*-
from nltk.tokenize import word_tokenize
import pandas as pd
from violencevocabulary import new_words as extended_neg_list
import unicodedata


# Sentiment scoring of headlines against positive/negative word lists.

# The text type produced for string cells: `unicode` on Python 2, `str` on
# Python 3.  `type(u'')` resolves to the right one on either interpreter.
_TEXT = type(u'')

# Number of leading lines dropped from each lexicon file.
# NOTE(review): presumably the files' comment header -- confirm against the
# actual word-list files.
_LEXICON_SKIPPED_LINES = 35


def _load_lexicon(path):
    """Return the lowercased lines of *path*, minus the leading skipped lines."""
    with open(path, 'r') as handle:
        # strip() also removes a trailing '\r', so CRLF files still match.
        words = [line.strip().lower() for line in handle]
    return words[_LEXICON_SKIPPED_LINES:]


def _to_ascii(text):
    """Return *text* NFKD-normalized with every non-ASCII character dropped."""
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    # Decode back to text so the tokenizer receives a string on Python 2 and 3.
    return ascii_bytes.decode('ascii')


def _score_tokens(tokens, positive_words, negative_words):
    """Return (# tokens in positive_words) - (# tokens in negative_words)."""
    positives = sum(1 for token in tokens if token in positive_words)
    negatives = sum(1 for token in tokens if token in negative_words)
    return positives - negatives


def sentimentanalyzer(country_name, text_type):
    """Score every headline of one sheet and write the result to Excel.

    Reads sheet ``country_name`` from the master workbook, tokenizes each
    text cell of column ``text_type`` with ``word_tokenize`` and scores it as
    the number of tokens found in the positive word list minus the number
    found in the negative word list (the negative list is extended with
    ``extended_neg_list``).  The scores are added to the sheet's data as a
    ``Sentiment`` column and the result is written to
    ``/Users/Desktop/Results/<country_name> <text_type>.xls``.

    Cells that are not text (e.g. empty cells) get a NaN score so that every
    score stays on the row it was computed from.

    Args:
        country_name: Name of the sheet to read; also used in the output
            file name.
        text_type: Name of the column holding the text to score; also used
            in the output file name.

    Returns:
        None.  Each score is also printed as it is computed.
    """
    # NOTE(review): '/UsersDesktop/...' looks like it is missing a '/' --
    # left unchanged, confirm the intended location.
    xls_file = pd.ExcelFile('/UsersDesktop/MasterData.xlsx')
    df = xls_file.parse(country_name)
    headlines = df[text_type].tolist()

    # Sets give O(1) membership tests; with lists every token lookup scanned
    # the whole lexicon, which dominated the running time.
    positive_words = set(_load_lexicon('/Users/positive-words.txt'))
    negative_words = set(_load_lexicon('/Users/Desktop/negative-words.txt'))
    negative_words.update(extended_neg_list)

    # NOTE(review): tokens are matched case-sensitively although the lexicon
    # files are lowercased on load -- confirm whether headlines should be
    # lowercased before scoring.
    senti_list = []
    for headline in headlines:
        if not isinstance(headline, _TEXT):
            # Keep a placeholder so later scores do not shift up a row.
            senti_list.append(float('nan'))
            continue
        tokens = word_tokenize(_to_ascii(headline))
        calc = _score_tokens(tokens, positive_words, negative_words)
        print(calc)
        senti_list.append(calc)

    # Reuse df's index so concat pairs each score with its own row.
    df2 = pd.Series(senti_list, index=df.index, name="Sentiment")
    new_data = pd.concat([df, df2], axis=1)
    new_data_name = '/Users/Desktop/Results/' + country_name + " " + text_type + ".xls"
    # Passing the path lets to_excel create, save and close the writer itself;
    # the hand-made ExcelWriter was never saved.
    new_data.to_excel(new_data_name, sheet_name='Sheet1', engine='xlsxwriter')

    return

使用Pandas计算情绪 - 循环计算缓慢

0 个答案: