我在周末使用 Python 2.7 大约一个月(新手)，我想学习更多并变得更好。我的任务是：从我收集的 3 个 csv 数据文件(推文)中，统计 Twitter 上使用频率最高的前 50 个单词。但请注意，我想用 from nltk.corpus import stopwords 来过滤掉停用词。到目前为止，我只尝试用 f = open 打开了其中 1 个 csv 文件。理想情况下，我想在同一个脚本中处理另外 2 个 csv 文件，但在那之前，我想先让这 1 个文件的处理正常工作。
from nltk.tokenize import TweetTokenizer
import csv
from nltk import word_tokenize
from nltk.tokenize import wordpunct_tokenize
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
from nltk.probability import *
# Count the 50 most common words across the tweet texts in the CSV,
# excluding English stopwords.
tknzr = TweetTokenizer()

# Use a set for O(1) stopword membership tests inside the loop
# (a list would be O(n) per word).
stop = set(stopwords.words("english"))
# print(sorted(stop))  # uncomment to inspect the stopword list itself

favoritecount = []

# ONE distribution accumulated across ALL rows.  The original code
# rebuilt an empty FreqDist() for every *character* of the last row,
# so fdist1.most_common(50) always printed [].
fdist = FreqDist()

# NOTE(review): assumes column index 2 holds the tweet text — confirm
# against the CSV layout.
with open('tweets_table_favoritecount_top 50_test.csv') as f:
    for row in csv.reader(f):
        favoritecount.append(row[2])
        txt = str(row[2])
        # Lowercase, split on whitespace, drop stopwords, and fold the
        # surviving words into the running frequency distribution.
        for word in txt.lower().split():
            if word not in stop:
                fdist[word] += 1

# Top 50 (word, count) pairs over the whole file.
print(fdist.most_common(50))