我正在基于用户评论(基于内容)开发旅游推荐系统。
我已经分析了数据集(515K 的 CSV 数据集)中的每条用户评论。现在的问题是:我需要按酒店来分析评论——如何先收集每家酒店的评论再进行分析,或者反过来,先逐条分析再按酒店汇总?
这是我的代码,您能告诉我出什么问题了吗,您是否建议我安装特定的库?
# Merge the two free-text fields into a single review string per row.
reviews_df["review"] = reviews_df["Negative_Review"].add(reviews_df["Positive_Review"])

# Binary label: 1 when the reviewer score is below 5, otherwise 0 (score >= 5).
reviews_df["is_bad_review"] = reviews_df["Reviewer_Score"].map(
    lambda score: int(score < 5)
)

# Keep only the text and the label.
# NOTE(review): every other column of the input frame is dropped here, so any
# column identifying the hotel (if the dataset has one) is no longer available
# for per-hotel grouping after this line.
reviews_df = reviews_df.loc[:, ["review", "is_bad_review"]]
print(reviews_df.head())

# Work on a reproducible 10% sample (drawn without replacement) to speed up
# the computations that follow.
reviews_df = reviews_df.sample(frac=0.1, random_state=42)

# Clean data: strip the "No Negative" / "No Positive" placeholder texts.
reviews_df["review"] = reviews_df["review"].map(
    lambda raw: raw.replace("No Negative", "").replace("No Positive", "")
)
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet
def get_wordnet_pos(pos_tag):
    """Translate a POS tag string into the matching ``wordnet`` POS constant.

    The tag is classified by its leading letter: ``J`` maps to
    ``wordnet.ADJ``, ``V`` to ``wordnet.VERB``, ``N`` to ``wordnet.NOUN`` and
    ``R`` to ``wordnet.ADV``.  Any other tag falls back to ``wordnet.NOUN``.
    """
    prefix_table = (
        ("J", wordnet.ADJ),
        ("V", wordnet.VERB),
        ("N", wordnet.NOUN),
        ("R", wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
def clean_text(text):
    """Normalise a raw review into a space-joined string of lemmas.

    Processing steps, in order:

    1. lower-case the text;
    2. split it on whitespace and strip leading/trailing punctuation from
       every token;
    3. drop tokens that contain a digit, English stop words and tokens left
       empty by the punctuation stripping;
    4. POS-tag the remaining tokens and lemmatise each one using the POS
       returned by ``get_wordnet_pos``;
    5. drop lemmas of a single character and join the rest with one space.

    Args:
        text: The raw review string.

    Returns:
        The cleaned review as a single string.
    """
    # A set gives constant-time membership tests; a list would be scanned
    # linearly for every token.
    stop_words = set(stopwords.words('english'))

    # split() without an argument splits on any run of whitespace, so tabs
    # and newlines separate words as well instead of staying inside tokens.
    tokens = [word.strip(string.punctuation) for word in text.lower().split()]
    tokens = [
        token
        for token in tokens
        if token
        and token not in stop_words
        and not any(char.isdigit() for char in token)
    ]

    # One lemmatizer instance is enough for the whole review.
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)
# Clean every review; the raw text stays in "review" for the features below.
reviews_df["review_clean"] = reviews_df["review"].apply(clean_text)

# Add sentiment analysis columns: one column per key of the dict returned by
# SentimentIntensityAnalyzer.polarity_scores().
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
# Build the score columns directly from the list of score dicts; expanding a
# column of dicts with Series.apply(pd.Series) creates one Series per row and
# is far slower on large frames.
sentiment_scores = [sid.polarity_scores(review) for review in reviews_df["review"]]
sentiments_df = pd.DataFrame(sentiment_scores, index=reviews_df.index)
reviews_df = pd.concat([reviews_df, sentiments_df], axis=1)

# Add number of characters column.
reviews_df["nb_chars"] = reviews_df["review"].apply(len)

# Add number of words column (pieces obtained by splitting on single spaces,
# so consecutive spaces contribute empty pieces to the count).
reviews_df["nb_words"] = reviews_df["review"].apply(lambda x: len(x.split(" ")))

# Add TF-IDF columns: one "word_<term>" column per term that appears in at
# least 10 cleaned reviews.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df=10)
# NOTE(review): toarray() materialises the full dense matrix, which can be
# very memory-hungry for a large sample / vocabulary.
tfidf_result = tfidf.fit_transform(reviews_df["review_clean"]).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
tfidf_df = pd.DataFrame(
    tfidf_result,
    columns=["word_" + str(x) for x in feature_names],
    index=reviews_df.index,  # align rows with the sampled reviews
)
reviews_df = pd.concat([reviews_df, tfidf_df], axis=1)