在 scikit-learn 中,使用 class_weight='auto' 的 SVC 会失败吗?

时间:2015-06-25 22:54:09

标签: python numpy machine-learning scikit-learn svm

我有以下数据集。我用 SVC 对它进行分类(它有 5 个标签)。当我想要像这样使用 class_weight='auto' 时:

X = tfidf_vect.fit_transform(df['content'].values)
y = df['label'].values


from sklearn import cross_validation

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,
                                                y)



svm_1 = SVC(kernel='linear', class_weight='auto')
svm_1.fit(X, y)
svm_1_prediction = svm_1.predict(X_test)

然后我得到了这个异常:

Traceback (most recent call last):
  File "test.py", line 62, in <module>
    svm_1.fit(X, y)
  File "/usr/local/lib/python2.7/site-packages/sklearn/svm/base.py", line 140, in fit
    y = self._validate_targets(y)
  File "/usr/local/lib/python2.7/site-packages/sklearn/svm/base.py", line 474, in _validate_targets
    self.class_weight_ = compute_class_weight(self.class_weight, cls, y_)
  File "/usr/local/lib/python2.7/site-packages/sklearn/utils/class_weight.py", line 47, in compute_class_weight
    raise ValueError("classes should have valid labels that are in y")
ValueError: classes should have valid labels that are in y

然后,参照之前的一个问题(previous question),我尝试了以下方法:

svm_1 = SVC(kernel='linear', class_weight='auto')
svm_1.fit(X, y_encoded)
svm_1_prediction = le.inverse_transform(svm_1.predict(X))

这样做的问题在于我得到了这个异常:

  File "/usr/local/lib/python2.7/site-packages/sklearn/metrics/classification.py", line 179, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/usr/local/lib/python2.7/site-packages/sklearn/metrics/classification.py", line 74, in _check_targets
    check_consistent_length(y_true, y_pred)
  File "/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py", line 174, in check_consistent_length
    "%s" % str(uniques))
ValueError: Found arrays with inconsistent numbers of samples: [ 858 2598]

有人可以帮助我理解上述方法的问题吗?如何正确使用 SVC 的 class_weight='auto' 参数来自动平衡数据?

更新

当我执行 print(y) 时,这是输出:

0       5
1       4
2       5
3       4
4       4
5       5
6       4
7       4
8       3
9       5
10      4
11      4
12      1
13      4
14      4
15      5
16      4
17      4
18      5
19      5
20      4
21      4
22      5
23      5
24      3
25      3
26      4
27      5
28      4
29      4
       ..
2568    4
2569    4
2570    4
2571    3
2572    4
2573    5
2574    5
2575    5
2576    5
2577    3
2578    4
2579    4
2580    2
2581    4
2582    3
2583    4
2584    5
2585    4
2586    5
2587    4
2588    4
2589    3
2590    5
2591    5
2592    4
2593    4
2594    4
2595    2
2596    2
2597    5

更新

然后我执行以下操作:

mask = np.array(test)
print y[np.arange(len(y))[~mask]]

这是输出:

(本页未包含对应的输出内容。)

1 个答案:

答案 0 :(得分:1)

问题在于标签列(label)中含有 NaN(缺失值):

df.label.unique()
Out[50]: array([  5.,   4.,   3.,   1.,   2.,  nan])

示例代码:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# replace your own data file_path
df = pd.read_csv('data1.csv', header=0)
df[df.label.isnull()]

Out[52]: 
                               id content  label
900   Daewoo_DWD_M1051__Opinio...       5    NaN
1463  Indesit_IWC_5105_B_it__O...       1    NaN


# drop those two 
df = df[df.label.notnull()]

X = df.content.values
y = df.label.values

transformer = TfidfVectorizer()
X = transformer.fit_transform(X)

estimator = SVC(kernel='linear', class_weight='auto', probability=True)
estimator.fit(X, y)

estimator.predict(X)

Out[54]: array([ 4.,  4.,  4., ...,  2.,  2.,  3.])

estimator.predict_proba(X)

Out[55]: 
array([[ 0.0252,  0.0228,  0.0744,  0.3427,  0.535 ],
       [ 0.002 ,  0.0122,  0.0604,  0.4961,  0.4292],
       [ 0.0036,  0.0204,  0.1238,  0.5681,  0.2841],
       ..., 
       [ 0.1494,  0.3341,  0.1586,  0.1316,  0.2263],
       [ 0.0175,  0.1984,  0.0915,  0.3406,  0.3519],
       [ 0.049 ,  0.0264,  0.2087,  0.3267,  0.3891]])