我试图让一个非常简单的scikit OneVsRest分类器工作,但遇到了一个奇怪的问题
这是代码
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
input_file = "small.csv"
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the sample data. The script reads a 'text' column and the label
# columns 'act_code1' through 'act_code33' from this file.
df = pd.read_csv(input_file, sep=',', quotechar='"', encoding='utf-8')

# Label-based column slice. `.ix` was removed in pandas 1.0; `.loc` performs
# the same inclusive label slice.
codes = df.loc[:, 'act_code1':'act_code33']

# Build one list of label strings per row: keep the cells that are not NaN
# and convert the remaining values to str.
y = []
for index, row in codes.iterrows():
    row = row[np.logical_not(np.isnan(row))].astype(str)
    row = row.tolist()
    y.append(row)

# Turn the per-row label lists into a binary indicator matrix.
lb = preprocessing.MultiLabelBinarizer()
Y = lb.fit_transform(y)

# Bag-of-words counts -> tf-idf weighting -> one LinearSVC per label.
classifier = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))])
classifier.fit(df['text'], Y)

# Predict for a single document and map the indicator row back to labels.
predicted = classifier.predict(["BASIC SOCIAL SERVICES AID IN ARARATECA VALLEY"])
all_labels = lb.inverse_transform(predicted)
# print() with a single argument works on both Python 2 and Python 3.
print(all_labels)
small.csv的内容如下:
https://drive.google.com/file/d/0Bzt48lX3efsQTnYySFdaTlZhZGc/view?usp=sharing
当尝试分类时,我收到以下警告,并且没有发生分类
UserWarning: indices array has non-integer dtype (float64)
% self.indices.dtype.name)
[()]
但是,如果删除开头的行(第6行):
61821559,LEATHER PROJECT SKILLS TRAININ
代码正常工作,产生正确的分类输出([(' 15150.07',)])。你也可以通过删除最后一行来修复这个问题。这是怎么回事?
编辑:只是为了确保我正确地传达了问题:这是一个文本标签分类问题,而不是数值回归或曲线拟合问题。标签中的“数字”应被视为文本字符串(它们本来就是字符串)。这是一个多标签分类问题。
答案 0 :(得分:0)
问题在于您的代码的以下部分:
# Collect, for every row of `codes`, the non-NaN cell values as strings.
y = []
for index, row in codes.iterrows():
    # Keep only the cells that are not NaN, then convert them to str.
    row = row[np.logical_not(np.isnan(row))].astype(str)
    row = row.tolist()
    y.append(row)
# Show the resulting per-row label lists.
print(y)
[['12105.01', '15150.07', '15130.06', '11105.01', '16010.07', '16020.05'], ['99810.01'], ['11430.02', '15140.01'], ['16010.05', '15150.07'], ['32120.08', '32181.01', '16010.01'], ['99810.01'], ['72020.01'], ['72010.01']]
`act_code` 列中的数值并不是标签……列名 `act_code` 本身才是标签。顺便问一下,你做的是分类任务吗?如果我理解正确,你是想根据 `text` 输入,将其归入 `act_code1` 到 `act_code33` 中的一个或多个类别。如果你的真正目的是预测某些数值(你帖子中的输出 `[('15150.07',)]` 确实让我感到困惑),那么你必须彻底重新设计整个项目,因为那是一个回归问题,而不是分类问题。
您应该使用
# One list per row of `y_codes`, holding the names of the columns whose
# cell in that row is not null.
y = [record.dropna().index.tolist() for _idx, record in y_codes.iterrows()]
[[u'act_code1', u'act_code2', u'act_code3', u'act_code4', u'act_code5', u'act_code6'], [u'act_code1'], [u'act_code1', u'act_code2'], [u'act_code1', u'act_code2'], [u'act_code1', u'act_code2', u'act_code3'], [u'act_code1'], [u'act_code1'], [u'act_code1']]
完整的工作代码:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
import pandas as pd
input_file = '/home/Jian/Downloads/small.csv'
df = pd.read_csv(input_file, sep=',', quotechar='"', encoding='utf-8')
# Label-based column slice. `.ix` was removed in pandas 1.0; `.loc` performs
# the same inclusive label slice.
y_codes = df.loc[:, 'act_code1':'act_code33']

# process your y-label
# ==============================
# One list per row, holding the names of the columns whose cell is not null.
y = [row.dropna().index.tolist() for _, row in y_codes.iterrows()]
# Turn the per-row label lists into a binary indicator matrix.
lb = preprocessing.MultiLabelBinarizer()
Y = lb.fit_transform(y)
print(Y)

# standard text classification with multi-label classes
# ======================================================
# CountVectorizer + TfidfTransformer is equivalent to TfidfVectorizer
classifier = make_pipeline(TfidfVectorizer(), OneVsRestClassifier(LinearSVC()))
X = df.text.values
# Fitting gives a warning msg: "Label 0 is present in all training examples."
# That is fine here since this is just a very small sample;
# in reality it is unlikely that all observations belong to class 0.
classifier.fit(X, Y)

# Predict for a single document and map the indicator row back to labels.
y_pred = classifier.predict(["BASIC SOCIAL SERVICES AID IN ARARATECA VALLEY"])
all_labels = lb.inverse_transform(y_pred)
print(all_labels)
[(u'act_code1',)]