我有如下数据集,并使用 SVC 对其进行分类(共有 5 个标签)。当我尝试执行以下代码时:
describe SkillQueryService do
# Provisioned fixtures: one list of :skill factory records per trait.
let(:skills) { create_list(:skill, 10, :skill) }
let(:languages) { create_list(:skill, 2, :language) }
let(:qualifications) { create_list(:skill, 3, :qualification) }
let(:roles) { create_list(:skill, 4, :role) }
let(:personal_attributes) { create_list(:skill, 5, :personal_attribute) }

# Unprovisioned counterparts: same traits and counts, with provisioned set to false.
let(:unprovisioned_skills) { create_list(:skill, 10, :skill, provisioned: false) }
let(:unprovisioned_languages) { create_list(:skill, 2, :language, provisioned: false) }
let(:unprovisioned_qualifications) { create_list(:skill, 3, :qualification, provisioned: false) }
let(:unprovisioned_roles) { create_list(:skill, 4, :role, provisioned: false) }
let(:unprovisioned_personal_attributes) { create_list(:skill, 5, :personal_attribute, provisioned: false) }
# Examples exercising SkillQueryService#suggest with the query string 'skill'.
# NOTE(review): the description 'sugguest' looks like a typo for 'suggest'.
context 'sugguest' do
# NOTE(review): unlike the next example, this one never calls full_data_set,
# so nothing in its body references the let fixtures declared in this spec —
# confirm that is intended given the "Build TEST data" remark below.
it 'returns 20 suggested provisioned skills' do
# Build TEST data
service = SkillQueryService.new
rows = service.suggest('skill')
# Debug output, left disabled: prints every returned row via display_skill.
# rows.each do |r|
# display_skill(r)
# end
# THIS CODE PRINTS OUT SKILLS 1-48
# Only the number of returned rows is asserted, not their content.
expect(rows.length).to eq(20)
end
it 'returns 20 suggested (UN)-provisioned skills' do
# Build TEST data
# Calls the full_data_set helper, which references every let fixture in turn.
full_data_set
service = SkillQueryService.new
rows = service.suggest('skill')
# Debug output, left disabled: prints every returned row via display_skill.
# rows.each do |r|
# display_skill(r)
# end
# THIS CODE PRINTS OUT SKILLS 49-96
# HOW do I get it to have the same data as above, SKILLS 41-48
# Only the number of returned rows is asserted, not their content.
expect(rows.length).to eq(20)
end
end
# Invokes every fixture helper of this spec once, provisioned lists first and
# unprovisioned lists second, in the order listed below.
#
# @return [Object] whatever the last helper (unprovisioned_personal_attributes) returns
def full_data_set
  fixture_helpers = %i[
    skills
    languages
    qualifications
    roles
    personal_attributes
    unprovisioned_skills
    unprovisioned_languages
    unprovisioned_qualifications
    unprovisioned_roles
    unprovisioned_personal_attributes
  ]

  # map (not each) so the value of the final helper is still the return value.
  fixture_helpers.map { |helper| send(helper) }.last
end
# Prints one record as key/value lines through the PL helper, followed by a
# separator line.
#
# @param skill [#name, #provisioned, #skill, #language, #qualification, #role, #personal_attribute]
#   the record to print
# @return [Object] whatever PL.line returns
def display_skill(skill)
  printed_attributes = %w[name provisioned skill language qualification role personal_attribute]

  # Each label doubles as the name of the reader that is called on the record.
  printed_attributes.each do |attribute|
    PL.kv attribute, skill.public_send(attribute)
  end

  PL.line
end
# Prints a separator line, then every record returned by Skill.all using
# display_skill.
#
# @return [Object] the collection returned by Skill.all's #each
def display_skills
  PL.line
  Skill.all.each(&method(:display_skill))
end
end
像这样:
class_weight='auto'
然后我得到了这个例外:
# Build the feature matrix from the 'content' column and the targets from the
# 'label' column (tfidf_vect is presumably a TfidfVectorizer — confirm).
X = tfidf_vect.fit_transform(df['content'].values)
y = df['label'].values
from sklearn import cross_validation
# Split features and labels into train/test parts; no split size is passed.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,
y)
# NOTE(review): the classifier is fitted on the full X / y rather than on
# X_train / y_train, so the rows in X_test have already been seen during
# fitting — confirm whether that is intended.
svm_1 = SVC(kernel='linear', class_weight='auto')
svm_1.fit(X, y)
svm_1_prediction = svm_1.predict(X_test)
然后,参照之前的一个问题(previous question),我尝试了以下方法:
Traceback (most recent call last):
File "test.py", line 62, in <module>
svm_1.fit(X, y)
File "/usr/local/lib/python2.7/site-packages/sklearn/svm/base.py", line 140, in fit
y = self._validate_targets(y)
File "/usr/local/lib/python2.7/site-packages/sklearn/svm/base.py", line 474, in _validate_targets
self.class_weight_ = compute_class_weight(self.class_weight, cls, y_)
File "/usr/local/lib/python2.7/site-packages/sklearn/utils/class_weight.py", line 47, in compute_class_weight
raise ValueError("classes should have valid labels that are in y")
ValueError: classes should have valid labels that are in y
问题在于我得到了这个例外:
# Same classifier settings, this time fitted against y_encoded (defined
# outside this snippet).
svm_1 = SVC(kernel='linear', class_weight='auto')
svm_1.fit(X, y_encoded)
# NOTE(review): predictions are made for every row of X and mapped back
# through `le` (presumably a LabelEncoder — confirm); scoring them against a
# held-out subset of the labels would compare arrays of different lengths.
svm_1_prediction = le.inverse_transform(svm_1.predict(X))
有人可以帮助我理解上述方法的问题吗?如何正确使用SVC的
File "/usr/local/lib/python2.7/site-packages/sklearn/metrics/classification.py", line 179, in accuracy_score
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
File "/usr/local/lib/python2.7/site-packages/sklearn/metrics/classification.py", line 74, in _check_targets
check_consistent_length(y_true, y_pred)
File "/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py", line 174, in check_consistent_length
"%s" % str(uniques))
ValueError: Found arrays with inconsistent numbers of samples: [ 858 2598]
参数来自动平衡数据?
更新
当我class_weight='auto'
时,这是输出:
print(y)
更新
然后我执行以下操作:
0 5
1 4
2 5
3 4
4 4
5 5
6 4
7 4
8 3
9 5
10 4
11 4
12 1
13 4
14 4
15 5
16 4
17 4
18 5
19 5
20 4
21 4
22 5
23 5
24 3
25 3
26 4
27 5
28 4
29 4
..
2568 4
2569 4
2570 4
2571 3
2572 4
2573 5
2574 5
2575 5
2576 5
2577 3
2578 4
2579 4
2580 2
2581 4
2582 3
2583 4
2584 5
2585 4
2586 5
2587 4
2588 4
2589 3
2590 5
2591 5
2592 4
2593 4
2594 4
2595 2
2596 2
2597 5
这是输出:
mask = np.array(test)
print y[np.arange(len(y))[~mask]]
答案 0 :(得分:1)
问题在于:
df.label.unique()
Out[50]: array([ 5., 4., 3., 1., 2., nan])
示例代码:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
# replace your own data file_path
df = pd.read_csv('data1.csv', header=0)
df[df.label.isnull()]
Out[52]:
id content label
900 Daewoo_DWD_M1051__Opinio... 5 NaN
1463 Indesit_IWC_5105_B_it__O... 1 NaN
# drop those two
df = df[df.label.notnull()]
X = df.content.values
y = df.label.values
transformer = TfidfVectorizer()
X = transformer.fit_transform(X)
estimator = SVC(kernel='linear', class_weight='auto', probability=True)
estimator.fit(X, y)
estimator.predict(X)
Out[54]: array([ 4., 4., 4., ..., 2., 2., 3.])
estimator.predict_proba(X)
Out[55]:
array([[ 0.0252, 0.0228, 0.0744, 0.3427, 0.535 ],
[ 0.002 , 0.0122, 0.0604, 0.4961, 0.4292],
[ 0.0036, 0.0204, 0.1238, 0.5681, 0.2841],
...,
[ 0.1494, 0.3341, 0.1586, 0.1316, 0.2263],
[ 0.0175, 0.1984, 0.0915, 0.3406, 0.3519],
[ 0.049 , 0.0264, 0.2087, 0.3267, 0.3891]])