我有一个由新闻标题组成的 pandas DataFrame。我正在做一个简单的情感计算:先对每个标题进行分词(tokenize),再把得到的词与正面词表和负面词表逐一比较。我把所有标题的情感得分追加到一个列表中,然后将其作为新列拼接到原始 DataFrame,并保存为 Excel 文件。
生成的原始文件大约为 12 MB。下面的代码虽然可以运行,但速度很慢:完整读取文件并计算得分要花好几个小时。这正常吗?有什么办法可以加快这个过程?我知道在 pandas DataFrame 的列上逐行循环可能很慢——有哪些替代方案?
# -*- coding: utf-8 -*-
from nltk.tokenize import word_tokenize
import pandas as pd
from violencevocabulary import new_words as extended_neg_list
import unicodedata
#function to calculate sentiment
def _load_lexicon(path, header_lines=35):
    """Return the set of lower-cased words stored one per line in *path*.

    The first ``header_lines`` lines are skipped; the original code dropped
    the first 35 lines of each word list.  Surrounding whitespace is stripped
    and blank lines are discarded.
    """
    with open(path, 'r') as handle:
        entries = handle.readlines()[header_lines:]
    words = set(entry.strip().lower() for entry in entries)
    words.discard('')
    return words


def _score_tokens(tokens, positive_words, negative_words):
    """Return (# tokens in positive_words) - (# tokens in negative_words)."""
    pos = sum(1 for token in tokens if token in positive_words)
    negs = sum(1 for token in tokens if token in negative_words)
    return pos - negs


def sentimentanalyzer(country_name, text_type):
    """Score every headline of one worksheet and save the result to Excel.

    Reads sheet ``country_name`` from the master workbook, tokenizes each
    value of column ``text_type`` with ``word_tokenize`` and scores it as the
    number of tokens found in the positive word list minus the number found
    in the negative word list (which is extended with ``extended_neg_list``).
    Each score is printed.  The sheet plus a new "Sentiment" column is then
    written to ``/Users/Desktop/Results/<country_name> <text_type>.xls`` as
    sheet ``Sheet1``.

    Args:
        country_name: Name of the worksheet to read; also used in the output
            file name.
        text_type: Name of the column holding the text to score; also used in
            the output file name.

    Returns:
        None.  The result is only written to disk.

    Notes:
        * Cells that do not hold text are scored 0 rather than being passed
          to the tokenizer.
        * A headline that produces no tokens is scored 0.
        * NOTE(review): tokens are compared case-sensitively against
          lower-cased word-list entries; confirm whether headlines should be
          lower-cased first.
        * NOTE(review): input and output locations are hard-coded absolute
          paths, and the output name ends in ".xls" while the writer engine
          is xlsxwriter; confirm both are intended.
    """
    xls_file = pd.ExcelFile('/UsersDesktop/MasterData.xlsx')
    df = xls_file.parse(country_name)

    # Sets give constant-time membership tests; the word lists are consulted
    # once per token, so list lookups dominated the running time.
    positive_words = _load_lexicon('/Users/positive-words.txt')
    negative_words = _load_lexicon('/Users/Desktop/negative-words.txt')
    negative_words.update(extended_neg_list)

    senti_list = []
    for headline in df[text_type].tolist():
        # ``unicode`` is the Python 2 text type: fold the text to plain ASCII
        # bytes, dropping anything that has no ASCII decomposition.
        if isinstance(headline, unicode):
            headline = unicodedata.normalize('NFKD', headline).encode('ascii', 'ignore')
        # Only strings can be tokenized; anything else counts as empty text.
        tokens = word_tokenize(headline) if isinstance(headline, str) else []
        # Score each headline exactly once (the counts do not depend on the
        # individual token, so there is no inner loop over tokens).
        calc = _score_tokens(tokens, positive_words, negative_words)
        print(calc)
        senti_list.append(calc)

    # One score per row, so reuse the frame's index to keep rows aligned.
    df2 = pd.Series(senti_list, index=df.index, name="Sentiment")
    new_data = pd.concat([df, df2], axis=1)
    new_data_name = '/Users/Desktop/Results/' + country_name + " " + text_type + ".xls"
    # The context manager saves and closes the workbook; without it the
    # writer was never saved.
    with pd.ExcelWriter(new_data_name, engine='xlsxwriter') as writer_new_data_name:
        new_data.to_excel(writer_new_data_name, sheet_name='Sheet1')
    return