# Reading files with txt extension
import codecs
import os


def get_sentences(root_dir="/Users/Documents/test1"):
    """Yield every line from each ``.txt`` file found under *root_dir*.

    The directory tree is walked recursively; each text file is opened as
    UTF-8 with an optional BOM stripped (``utf-8-sig``).

    Args:
        root_dir: Directory to search. Defaults to the original
            hard-coded path so existing callers are unaffected.

    Yields:
        str: One line of text at a time (trailing newline included).
    """
    for root, _dirs, files in os.walk(root_dir):
        for name in files:
            if name.endswith(".txt"):
                # `with` guarantees the handle is closed; the original
                # never closed the files it opened (resource leak).
                with codecs.open(os.path.join(root, name), "r", "utf-8-sig") as handle:
                    # Iterate the file lazily instead of readlines(),
                    # which loads the entire file into memory at once.
                    for line in handle:
                        yield line


# Lazy generator over all lines; nothing is read until it is iterated.
formoreprocessing = get_sentences()
# Tokenizing sentences of the text files
from nltk.tokenize import sent_tokenize

raw_docs = []
tokenized_docs = []
for line in formoreprocessing:
    # Accumulate results: the original rebound raw_docs/tokenized_docs on
    # every iteration, so only the last line's sentences survived the loop.
    sentences = sent_tokenize(line)
    raw_docs.extend(sentences)
    # One entry per line of input. The original comprehension ignored its
    # loop variable (`sent`) and re-tokenized the whole line once per
    # sentence, producing duplicated, quadratic work.
    tokenized_docs.append(sentences)
'''Removing Stop Words'''
from nltk.corpus import stopwords

# NLTK's stopword corpus fileids are lowercase ("english"); the original
# capitalized "English" only happens to load on case-insensitive
# filesystems (e.g. default macOS), and fails elsewhere.
stopset = set(stopwords.words("english"))


def strip_stopwords(sentence):
    """Return *sentence* with every whitespace-delimited stopword removed."""
    return ' '.join(word for word in sentence.split() if word not in stopset)


# Build a real list, not a generator expression: printing a generator shows
# only its repr ("<generator object ...>"), which was the asker's bug.
stopword_removed_sentences = [strip_stopwords(sentence) for sentence in raw_docs]
print(stopword_removed_sentences)
上面的代码没有打印出预期的结果，而是输出了类似 `&lt;generator object get_sentences at 0x1193417d8&gt;` 的内容。这里哪里出错了？我使用的是 Python 3.5。
答案 0（得分：1）：
试试 `print(list(stopword_removed_sentences))`。这会在打印之前把生成器转换为列表。
答案 1（得分：0）：
这是最终答案，它对我在之前评论中提到的问题给出了最好的结果。
from nltk.tokenize import sent_tokenize

# Materialize the generator exactly ONCE. The original joined
# `formoreprocessing` a second time when building tokenized_docs, but by
# then the generator was already exhausted, so that join yielded "".
full_text = ''.join(formoreprocessing)
raw_docs = sent_tokenize(full_text)
#print(raw_docs)
# One sentence per entry. The original comprehension ignored its loop
# variable (`sent`) and re-tokenized the (now empty) joined text once per
# sentence — presumably word tokenization per sentence was intended;
# TODO confirm against the asker's goal.
tokenized_docs = list(raw_docs)

# Removing Stop Words
from nltk.corpus import stopwords

# Corpus fileids are lowercase ("english"); the capitalized "English"
# only loads on case-insensitive filesystems.
stopset = set(stopwords.words("english"))


def strip_stopwords(sentence):
    """Return *sentence* with every whitespace-delimited stopword removed."""
    return ' '.join(word for word in sentence.split() if word not in stopset)


# A list comprehension prints directly; wrapping a generator in list() at
# print time (as the answer suggested) is no longer needed.
stopword_removed_sentences = [strip_stopwords(sentence) for sentence in raw_docs]
print(stopword_removed_sentences)