我使用 SVM、Logistic 回归和朴素贝叶斯(NB)训练了一个多标签分类器。我的问题是:如何将未见过的新数据传递给分类器?以下是我的完整代码:
# Bring all the important libraries
%matplotlib inline
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
#from nltk.corpus import stopwords
#stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns
# Load the tab-separated movie/genre table, discard the language column and
# give the plot column the name used by the rest of the script.
df = (
    pd.read_csv("movies_genres_en.csv", delimiter='\t')
    .drop(columns=['plot_lang'])
    .rename(columns={'plot': 'plot_text'})
)
# Print a summary of the resulting columns and dtypes.
df.info()
# Count the movies per genre by summing each remaining (non-text) column.
df_genres = df.drop(columns=['plot_text', 'title'])
categories = df_genres.columns.tolist()
counts = [(genre, df_genres[genre].sum()) for genre in categories]
df_stats = pd.DataFrame(counts, columns=['genre', '#movies'])
# Bare expression: shown as the cell output when run in a notebook.
df_stats
# Create a function to clean the text

# Ordered (pattern, replacement) pairs applied to lower-cased text.
# Order matters: "'scuse" must be handled before the generic "'s" rule
# (otherwise "'s" consumes its prefix and "'scuse" can never match), and
# "can't" must be handled before the generic "n't" rule.
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"'scuse", " excuse "),
    (r"'s", " "),
    (r"'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
)


def clean_text(text):
    """Return a normalised, lower-cased version of *text*.

    English contractions are expanded, every non-word character is replaced
    by a space, runs of whitespace are collapsed to a single space and
    leading/trailing spaces are removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    # Raw strings keep "\W" and "\s" from being treated as (invalid)
    # string escape sequences.
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip(" ")
# Normalise every plot summary with clean_text.
df['plot_text'] = df['plot_text'].map(clean_text)

# Names of the genre columns used as classification targets.
genres = [
    'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Game-Show',
    'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
    'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show',
    'Thriller', 'War', 'Western',
]
将数据拆分为训练集和测试集
# Hold out 33% of the rows for testing; the fixed seed makes the shuffled
# split reproducible.
train, test = train_test_split(
    df,
    test_size=0.33,
    random_state=42,
    shuffle=True,
)
# The cleaned plot text is the only model input.
x_train = train['plot_text']
x_test = test['plot_text']
训练分类器:使用 SVM 并计算预测准确度
# TF-IDF features feeding a one-vs-rest linear SVM.
SVC_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
])

# Fit once on the whole label matrix (one column per genre) instead of
# refitting the same pipeline inside the genre loop: every fit() call
# discards the previous model, so a per-genre refit would leave
# SVC_pipeline able to predict only the last genre.
# Assumes each genre column holds 0/1 flags -- TODO confirm against the CSV.
SVC_pipeline.fit(x_train, train[genres].values)

# One prediction column per entry of `genres`, in the same order.
prediction = SVC_pipeline.predict(x_test)
for idx, genre in enumerate(genres):
    print('... Processing {}'.format(genre))
    # compute the testing accuracy for this genre
    print('Test accuracy is {}'.format(accuracy_score(test[genre], prediction[:, idx])))

# To label unseen plots, clean them exactly like the training text and call
# predict() on the fitted pipeline, e.g.
#   unseen_labels = SVC_pipeline.predict(unseen_df['plot'].map(clean_text))
# The result has one column per genre, ordered like `genres`.
执行上述代码后,我得到了各类别的准确度分数,并决定使用 SVM 分类器为未见过的数据打标签。应该如何把未见过的数据传给分类器?该数据集有两列:电影片名和剧情简介。有人可以帮忙吗?
答案 0 :(得分:0)
只需将未见过的数据集转换为一个新的数据框,并使其列名与训练数据框的特征列名相同。例如:
AttributeError:
'dict' object has no attribute 'format'
输出
from sklearn.svm import LinearSVC
import pandas as pd

model = LinearSVC()

# Toy training frame: two numeric feature columns (a, b) and a label column (c).
train = pd.DataFrame({'a':[1,2,3,4,5],'b':[21,22,23,24,25],'c':['c1','c0','c2','c1','c0']})
# Printed frame, as shown in the original answer:
#    a   b   c
# 0  1  21  c1
# 1  2  22  c0
# 2  3  23  c2
# 3  4  24  c1
# 4  5  25  c0
model.fit(train[['a','b']],train['c'])

# Unseen data: a new frame with the same feature column names as the
# training features.
unseen = pd.DataFrame({'a':[1,2,1,3,4],'b':[22,21,22,23,25]})
# Printed frame, as shown in the original answer:
#    a   b
# 0  1  22
# 1  2  21
# 2  1  22
# 3  3  23
# 4  4  25
model.predict(unseen)
# Output reported in the original answer:
# array(['c1', 'c1', 'c1', 'c0', 'c0'], dtype=object)

# Then use get_dummies to turn the predicted labels into one indicator
# column per predicted class.
pd.get_dummies(model.predict(unseen))
我不确定这是不是你想要的......