import glob
import os
import nltk.data
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sentiment
from nltk import word_tokenize
sid = SentimentIntensityAnalyzer()
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
files = glob.glob(os.path.join(os.getcwd(), 'speeches', '*.txt'))
text = []
for file in files:
    # open the file and then call .read() to get the text
    with open(file) as f:
        text.append(f.read())
sentences = tokenizer.tokenize(text)
# print the polarity scores of each sentence
for sentence in sentences:
    print(sentence, file=open('sentiment1.txt', 'a'))
    scores = sid.polarity_scores(sentence)
    for key in sorted(scores):
        print('{0}: {1}, '.format(key, scores[key]), end='', file=open('sentiment1.txt', 'a'))
    print()
The five text files in my 'speeches' folder are read successfully, but the problem is that the variable passed to the tokenizer (text) is a list rather than a string, and tokenize needs a string or bytes as its argument. Please help me adjust this, as I've read this may be a Python 3 issue. Thanks
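For context, a minimal illustration of the mismatch (mine, not part of the original question): Punkt's tokenize() runs regular expressions over its argument, so it needs one string (or bytes), not a list of strings.

import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# A list fails -- uncommenting this raises a TypeError:
# tokenizer.tokenize(['File one text.', 'File two text.'])

# A single string works:
print(tokenizer.tokenize('File one text. File two text.'))
# -> ['File one text.', 'File two text.']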
Answer 0 (score: 0)
# below is the sentiment analysis code written for sentence-level analysis
import glob
import os
import nltk.data
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sentiment
from nltk import word_tokenize
# Next, VADER is initialized so I can use it within the Python script
sid = SentimentIntensityAnalyzer()
# I will also load the 'english.pickle' tokenizer and give it a short name
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Each of the text files is listed from the 'speeches' folder
files = glob.glob(os.path.join(os.getcwd(), 'speeches', '*.txt'))
text = []
# iterate over the list, getting each file
for file in files:
    # open the file and then call .read() to get the text
    with open(file) as f:
        text = f.read()
sentences = tokenizer.tokenize(text)
for sentence in sentences:
    print(sentence, file=open('sentiment1.txt', 'a'))
    scores = sid.polarity_scores(sentence)
    for key in sorted(scores):
        print('{0}: {1}, '.format(key, scores[key]), end='',
              file=open('sentiment1.txt', 'a'))
    print()
I have tested the code without the empty list, so no append is needed, but that approach only picks up the last text file in the folder. I'm posting the snippet again so you can see clearly what I have. Thanks
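One way around that (my suggestion, not from the answers below): keep text = f.read() but move the tokenize-and-score step inside the loop, so each file is processed before the next read overwrites its text. A minimal sketch:

with open('sentiment1.txt', 'a') as out:
    for file in files:
        # read this file's text
        with open(file) as f:
            text = f.read()
        # tokenize and score before the next file overwrites `text`
        for sentence in tokenizer.tokenize(text):
            scores = sid.polarity_scores(sentence)
            print(sentence, scores, file=out)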
Answer 1 (score: 0)
If you initialize text as an empty list and append the contents of the five input files to text, you can use the join() function to combine those files into a single string for tokenizer.tokenize() to consume:
text = []
for file in files:
    # open the file and then call .read() to get the text
    with open(file) as f:
        text.append(f.read())
text_str = "".join(text)  # or "\n".join(text)
sentences = tokenizer.tokenize(text_str)
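A note on the separator (my addition): "".join(text) will fuse the end of one file directly onto the start of the next if a file does not end with whitespace, which can merge two sentences into one. "\n".join(text) keeps the boundary visible to the tokenizer.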
Alternatively, you can initialize text as an empty string and concatenate each file's contents in turn:
text = ""
for file in files:
#open the file and then call .read() to get the text
with open(file) as f:
text += f.read()
sentences = tokenizer.tokenize(text)
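For completeness, here is a minimal end-to-end sketch putting the fix together with the original scoring loop (my own assembly, assuming the same 'speeches' folder and 'sentiment1.txt' output file, and that the punkt and vader_lexicon NLTK data are already downloaded). Opening the output file once rather than on every print is a tidy-up of mine, not part of the original code:

import glob
import os

import nltk.data
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

files = glob.glob(os.path.join(os.getcwd(), 'speeches', '*.txt'))

# read every file and join with newlines so sentences at file
# boundaries are not fused together
text = []
for file in files:
    with open(file) as f:
        text.append(f.read())
sentences = tokenizer.tokenize("\n".join(text))

# open the output once, instead of reopening it on every print
with open('sentiment1.txt', 'a') as out:
    for sentence in sentences:
        print(sentence, file=out)
        scores = sid.polarity_scores(sentence)
        for key in sorted(scores):
            print('{0}: {1}, '.format(key, scores[key]), end='', file=out)
        print(file=out)  # blank-line separator goes to the file, not stdout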