import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from collections import Counter
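Note that word_tokenize and the English stopword list depend on NLTK corpora that have to be downloaded once per environment; a minimal sketch using the standard resource names:

# One-time downloads for the tokenizer model and the stopword list used below
nltk.download('punkt')
nltk.download('stopwords')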
Get the review data from the CSV:
address = 'filepath'
example = pd.read_csv(address)
review_column = example.Reviews
Take the retrieved data and insert it into a list:
reviews = []
for w in review_column:
    reviews.append(w)
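The append loop works, but pandas exposes the same conversion directly; an equivalent one-liner, assuming review_column is the Series read above:

# Same result as the loop: turn the Reviews column into a plain Python list
reviews = review_column.tolist()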
Further clean the data in the list by removing stopwords and punctuation. The reviews are tokenised into lowercase words first so that the filtering works at word level rather than on whole review strings.
reviews = [word for review in reviews for word in word_tokenize(str(review).lower())]
reviews = [word for word in reviews if word not in stopwords.words('english')]
reviews = [word for word in reviews if word not in punctuation]
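As a side note, stopwords.words('english') is re-evaluated for every word when it sits inside the comprehension, so building a set once up front is the usual optimisation. A sketch of the same filtering step written that way:

# Build the stopword set once, then drop stopwords and punctuation in a single pass
stop_words = set(stopwords.words('english'))
reviews = [word for word in reviews if word not in stop_words and word not in punctuation]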
Get the keyword data from the CSV:
address = 'filepath'
example = pd.read_csv(address)
keywords_column = example.Keywords
Insert the keyword data into a list:
keywords = []
for w in keywords_column:
    keywords.append(w)
#keywords_tokenised = []
#keywords_tokenised = [word_tokenize(i) for i in keywords]
# Count how often each keyword occurs among the cleaned review words
review_counts = Counter(reviews)
frequency_list = []
for word in keywords:
    frequency_list.append((word, review_counts[str(word).lower()]))
print(frequency_list)
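As a quick sanity check, here is a minimal sketch that reuses the imports above with a small hypothetical in-memory DataFrame standing in for the CSV; the column names match the ones used above and the sample text is made up:

# Hypothetical data: two reviews and two keywords to look up
sample = pd.DataFrame({
    'Reviews': ['Great battery life', 'Battery died fast'],
    'Keywords': ['battery', 'screen'],
})
sample_words = [w for text in sample.Reviews for w in word_tokenize(str(text).lower())]
sample_counts = Counter(sample_words)
print([(kw, sample_counts[kw]) for kw in sample.Keywords])
# Prints [('battery', 2), ('screen', 0)]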