我有一个 CSV 数据文件,其中包含“notes”(注释)列,该列的内容是用希伯来语填写的满意度回答。
我想使用情感分析为数据中的每个单词或二元词组(bigram)分配分数,并通过逻辑回归得到其正面/负面的概率。
到目前为止,我的代码:
# NOTE: this is a plain Python assignment; it only binds a variable with this
# name inside the script.
PYTHONIOENCODING="UTF-8"
# Read only the 'notes' column of keep.csv, decoded as UTF-8.
df= pd.read_csv('keep.csv', encoding='utf-8' , usecols=['notes'])
# Lower-case each note, replace the pattern r'\|' with a space, then join all
# rows into one space-separated string.
txt = df.notes.str.lower().str.replace(r'\|', ' ').str.cat(sep=' ')
words = nltk.tokenize.word_tokenize(txt)
# Keep only tokens made up entirely of alphabetic characters.
tokens=[word.lower() for word in words if word.isalpha()]
# Pairs of adjacent tokens.
bigrm = list(nltk.bigrams(tokens))
# Map each distinct token to an integer index in first-seen order.
# NOTE(review): the loop body below is not indented as posted.
word_index = {}
current_index = 0
for token in tokens:
if token not in word_index:
word_index[token] = current_index
current_index += 1
# Build a vector of length len(word_index) + 1: per-token counts divided by
# their sum, with `label` written into the last slot.
# NOTE(review): the function body below is not indented as posted.
def tokens_to_vector(tokens, label):
x = np.zeros(len(word_index) + 1)
for t in tokens:
# Raises KeyError if t is not a key of word_index.
i = word_index[t]
x[i] += 1
# The last slot is still 0 here, so the sum covers only the token counts.
x = x / x.sum()
x[-1] = label
return x
# Allocate one row per distinct token, with len(word_index) + 1 columns.
N= len(word_index)
data = np.zeros((N, len(word_index) + 1))
i = 0
# NOTE(review): this loop runs once per element of `tokens`, while `data` has
# only len(word_index) rows, and every call passes the full `tokens` list
# rather than the current `token`. The loop body is also not indented as posted.
for token in tokens:
xy = tokens_to_vector(tokens, 1)
data[i,:] = xy
i += 1
这个循环无法正常运行。我应该如何生成数据,然后得到每个二元词组(bigrm)的正面/负面概率?
答案 0 :(得分:1)
您贴出的代码片段与实际代码一致吗?所有 for 循环的循环体都需要缩进。
# Read only the 'notes' column of keep.csv, decoded as UTF-8.
df = pd.read_csv('keep.csv', encoding='utf-8', usecols=['notes'])

# Lower-case each note, turn literal '|' characters into spaces, and join all
# rows into one space-separated string. regex=True is passed explicitly
# because pandas 2.0 changed the default to literal matching, under which the
# pattern r'\|' would no longer match a bare '|'.
txt = (
    df.notes.str.lower()
    .str.replace(r'\|', ' ', regex=True)
    .str.cat(sep=' ')
)
words = nltk.tokenize.word_tokenize(txt)

# Keep only tokens made up entirely of alphabetic characters.
tokens = [word.lower() for word in words if word.isalpha()]

# Pairs of adjacent tokens.
bigrm = list(nltk.bigrams(tokens))

# Map each distinct token to an integer index in first-seen order.
word_index = {}
current_index = 0
for token in tokens:
    if token not in word_index:
        word_index[token] = current_index
        current_index += 1
def tokens_to_vector(tokens, label):
    """Build a normalised token-count vector with the label appended.

    Args:
        tokens: Iterable of tokens; each must be a key of ``word_index``.
        label: Value stored in the last slot of the returned vector.

    Returns:
        A 1-D float array of length ``len(word_index) + 1``: the relative
        frequency of each token, followed by ``label``.

    Raises:
        KeyError: If a token is not present in ``word_index``.
    """
    x = np.zeros(len(word_index) + 1)
    for t in tokens:
        x[word_index[t]] += 1
    # The label slot is still 0 here, so the sum covers only token counts.
    total = x.sum()
    # Guard the empty case: 0 / 0 would turn the whole row into NaN.
    if total > 0:
        x = x / total
    x[-1] = label
    return x
# One row per distinct token: N feature columns plus a trailing label column.
N = len(word_index)
data = np.zeros((N, N + 1))

# The loop as posted ran once per element of `tokens` while `data` has only N
# rows, so it overran the array as soon as a token repeated, and it vectorised
# the whole token list on every pass. Each row is now built from its own token
# and stored at that token's index.
# NOTE(review): every row gets label 1, as in the posted code; real
# positive/negative labels still have to be supplied - confirm with the author.
for token, i in word_index.items():
    data[i, :] = tokens_to_vector([token], 1)