I am using the Naive Bayes algorithm. The data contains negative and positive reviews, and I am doing sentiment analysis on it.
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

df = pd.read_csv('Data.tsv', delimiter='\t', quoting=3, engine='python')
df = df.dropna()
df = df.reindex(np.random.permutation(df.index))

corpus = []
for i in range(len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)
The error must be in this part, because this is where I collect all the results into the corpus. I get:
File "pandas\hashtable.pyx", line 309, in pandas.hashtable.Int64HashTable.get_item (pandas\hashtable.c:6554)
KeyError: 102
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=135)
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:, 1].values
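For context, the KeyError: 102 most likely comes from the loop above rather than the vectorizer: dropna() and the random reindex() leave a shuffled integer index with gaps, while df['review'][i] does a label lookup, so some positional counter i (here 102) no longer exists as a label. A minimal sketch of two common fixes, using a tiny hypothetical frame in place of the real TSV data, is:

import numpy as np
import pandas as pd

# toy frame standing in for the TSV data above (hypothetical values)
df = pd.DataFrame({'review': ['good film', 'bad film', 'great', 'awful']})
df = df.dropna()
df = df.reindex(np.random.permutation(df.index))

# Option 1: reset the index so labels 0..len(df)-1 exist again
df = df.reset_index(drop=True)
print(df['review'][2])          # label lookup now succeeds

# Option 2: keep the shuffled index but use positional access
print(df['review'].iloc[2])     # .iloc never raises KeyError for a valid position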
Answer 0 (score: 0):
Here is a small demo that uses the movie review corpus shipped with the NLTK module.

Output:
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] MultinomNB__alpha=0.001, vect__analyzer=char_wb, vect__min_df=1, vect__ngram_range=(1, 1)
[CV] MultinomNB__alpha=0.001, vect__analyzer=char_wb, vect__min_df=1, vect__ngram_range=(1, 1)
[CV] MultinomNB__alpha=0.001, vect__analyzer=char_wb, vect__min_df=1, vect__ngram_range=(1, 1)
...
skipped
...
[Parallel(n_jobs=-1)]: Done 72 out of 72 | elapsed: 6.1min finished
Best parameters: {'MultinomNB__alpha': 1, 'vect__analyzer': 'char_wb', 'vect__min_df': 1, 'vect__ngram_range': (2, 5)}
             precision    recall  f1-score   support

        neg       0.77      0.85      0.81       146
        pos       0.84      0.75      0.79       154

avg / total       0.80      0.80      0.80       300
Full code:
from pathlib import Path
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, movie_reviews
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import joblib
import pandas as pd

stop = set(stopwords.words('english'))
ps = PorterStemmer()

# read the movie reviews data set from NLTK
df = pd.DataFrame([[category, Path(movie_reviews.abspath(fileid)).read_text().casefold()]
                   for category in movie_reviews.categories()
                   for fileid in movie_reviews.fileids(category)],
                  columns=['category', 'review'])
df['review'] = df['review'].str.replace(r'[^a-zA-Z]', ' ', regex=True)
# replace all words with their stemmed form, dropping English stop words
X = [' '.join([ps.stem(w) for w in word_tokenize(review) if w not in stop])
     for review in df['review'].values]
y = df['category'].map({'pos':1, 'neg':-1})
# split data set into train and test parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)
# build a pipeline: vectorize, then fit a `MultinomNB` classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('MultinomNB', MultinomialNB()),
])
# prepare a grid of hyperparameters to search over
param_grid = dict(vect__min_df=[1, 3],
                  vect__ngram_range=[(1, 1), (2, 4), (2, 5)],
                  vect__analyzer=['char_wb'],
                  MultinomNB__alpha=[0.001, 0.01, 0.1, 1])
# use 3 folds, use all available CPUs, be verbose
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
# find best hyperparameters
grid_search.fit(X_train, y_train)
# save trained model into Pickle file
joblib.dump(grid_search, r'c:/temp/grid_search_49192570.pkl')
# print best values for the hyperparameters
print('Best parameters: {}'.format(grid_search.best_params_))
# predict categories for the "unseen" test data set
y_pred = grid_search.best_estimator_.predict(X_test)
# show the main classification metrics
print(classification_report(y_test, y_pred, target_names=['neg','pos']))
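As a follow-up, here is a minimal sketch of how the persisted model could be reloaded and applied to new text. It assumes the pickle path used above, applies the same preprocessing as the training data, and the example review strings are purely hypothetical:

import re
import joblib
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stop = set(stopwords.words('english'))
ps = PorterStemmer()

# load the persisted GridSearchCV object (same path as in joblib.dump above)
model = joblib.load(r'c:/temp/grid_search_49192570.pkl')

# apply the same cleaning as in training: strip non-letters, lower-case, stem, drop stop words
new_reviews = ["a wonderful, heartfelt film", "dull plot and terrible acting"]
cleaned = [' '.join(ps.stem(w)
                    for w in word_tokenize(re.sub('[^a-zA-Z]', ' ', r).casefold())
                    if w not in stop)
           for r in new_reviews]

# predictions come back as the mapped labels: 1 for pos, -1 for neg
print(model.best_estimator_.predict(cleaned))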