如何将未见过的数据传递给多标签分类器

时间:2018-05-21 18:52:28

标签: python-3.x scikit-learn classification svm multilabel-classification

我使用 SVM、逻辑回归(Logistic Regression)和朴素贝叶斯(NB)训练了一个多标签分类器。我的问题是:如何将未见过的(新的)数据传递给分类器?以下是我的完整代码:

# Bring all the important libraries

%matplotlib inline

import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
#from nltk.corpus import stopwords
#stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns


# Load the tab-separated movie/genre table, discard the plot-language column,
# and expose the plot column under the name the rest of the script uses.
df = pd.read_csv("movies_genres_en.csv", sep='\t')
df = df.drop(columns=['plot_lang'])
df = df.rename(columns={'plot': 'plot_text'})
# Print a summary of the resulting frame.
df.info()

# Count the movies per genre.  Every column other than the plot text and the
# title is treated as a genre column, and its column sum is reported as the
# number of movies (assumes 0/1 indicator columns -- TODO confirm).
df_genres = df.drop(['plot_text', 'title'], axis=1)
categories = list(df_genres.columns.values)
counts = [(genre, df_genres[genre].sum()) for genre in categories]
df_stats = pd.DataFrame(counts, columns=['genre', '#movies'])
# Bare expression: displays the table when run as a notebook cell.
df_stats

# Create a function to clean the text

# Substitutions applied, in order, by clean_text.  Order matters: the specific
# "what's", "'scuse" and "can't" rules must run before the generic "'s" and
# "n't" rules, which would otherwise consume part of their match.
_CLEAN_TEXT_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"what's", "what is "),
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r"\W", " "),   # every non-word character becomes a space
        (r"\s+", " "),  # collapse whitespace runs into a single space
    )
)


def clean_text(text):
    """Normalize a piece of English text.

    The text is lower-cased, a handful of English contractions are expanded,
    every non-word character is replaced by a space, runs of whitespace are
    collapsed to a single space, and surrounding spaces are stripped.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    text = text.lower()
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = pattern.sub(replacement, text)
    return text.strip(' ')

# Normalize every plot summary with clean_text.
df['plot_text'] = df['plot_text'].map(clean_text)

# Names of the genre columns used as classification targets.
genres = [
    'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Game-Show',
    'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
    'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show',
    'Thriller', 'War', 'Western',
]

将数据拆分为测试和训练

将数据拆分为训练和测试集

# Hold out 33% of the rows for testing; the fixed seed keeps the shuffled
# split reproducible.
train, test = train_test_split(df, test_size=0.33, shuffle=True, random_state=42)
# The model input is the cleaned plot text of each partition.
x_train = train['plot_text']
x_test = test['plot_text']

训练分类器(使用 SVM 计算预测准确度)

def _build_svc_pipeline():
    """Return a fresh, unfitted TF-IDF + one-vs-rest LinearSVC pipeline."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
    ])


# Fitted pipeline for every genre, keyed by genre name.  Refitting one shared
# pipeline inside the loop would overwrite the previous genre's model, leaving
# only the last genre available for labelling new plots.  To label unseen
# plots, clean them with clean_text and call
# genre_models[genre].predict(cleaned_plots) for each genre.
genre_models = {}
# Bound up front so the name exists even if `genres` is empty; after the loop
# it refers to the pipeline fitted for the last genre.
SVC_pipeline = _build_svc_pipeline()
for genre in genres:
    print('... Processing {}'.format(genre))
    # Train a dedicated binary model for this genre on the training plots.
    SVC_pipeline = _build_svc_pipeline()
    SVC_pipeline.fit(x_train, train[genre])
    genre_models[genre] = SVC_pipeline
    # Compute the accuracy on the held-out plots.
    prediction = SVC_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[genre], prediction)))

运行上述代码后,我得到了各个类型的准确度分数,并决定使用 SVM 分类器为未见过的数据打标签。应该如何传入未见过的数据?该数据集有两列:电影片名和剧情简介。有人可以帮忙吗?

1 个答案:

答案 0 :(得分:0)

只需将未见过的数据集转换为一个新的数据框,并使其特征列名与训练数据框的列名相同即可。例如:

AttributeError:
'dict' object has no attribute 'format'

输出

from sklearn.svm import LinearSVC
import pandas as pd
# Toy training frame: numeric columns 'a' and 'b', string class label 'c'.
train = pd.DataFrame({
    'a': [1, 2, 3, 4, 5],
    'b': [21, 22, 23, 24, 25],
    'c': ['c1', 'c0', 'c2', 'c1', 'c0'],
})
# Unfitted linear SVM classifier with default settings.
model = LinearSVC()

    a   b   c
0   1   21  c1
1   2   22  c0
2   3   23  c2
3   4   24  c1
4   5   25  c0

# Fit on the two feature columns, using column 'c' as the target.
feature_columns = ['a', 'b']
model.fit(train[feature_columns], train['c'])
# Unseen samples, built with the same feature column names as the training frame.
unseen = pd.DataFrame({
    'a': [1, 2, 1, 3, 4],
    'b': [22, 21, 22, 23, 25],
})

    a   b
0   1   22
1   2   21
2   1   22
3   3   23
4   4   25

# Predict the class label of every unseen row and display the result.
unseen_labels = model.predict(unseen)
unseen_labels

输出为 array(['c1', 'c1', 'c1', 'c0', 'c0'], dtype=object)。然后可以使用下面的代码获取独热编码(one-hot)形式的结果:

# One-hot encode the predicted labels: one indicator column per predicted class.
predicted_labels = model.predict(unseen)
pd.get_dummies(predicted_labels)

我不确定这是不是你想要的......