我正在使用字符串和数字数据集的组合训练SVM,下面是代码和我的测试和训练集。当我运行代码时,它会抛出以下错误。
import numpy as np
import pandas as pd
import scipy.sparse as sp
import sklearn.preprocessing
import sklearn.decomposition
import sklearn.linear_model
import sklearn.pipeline
import sklearn.metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
class UniRank(object):
vect = CountVectorizer(ngram_range=(1, 3))
def __init__(self, trained_data_csv, sep=","):
df = pd.read_csv(trained_data_csv, sep=sep)
sample = df[['Name', 'Location']]
sample = sample.apply(lambda col: col.str.strip())
# convert the characters to matrix
train = sp.hstack(sample.apply(lambda col: self.vect.fit_transform(col)))
values = df[['score']]
values.to_records()
# feature selection and manipulation
self.clf = svm.SVC(gamma=0.001, C=100)
X,y = train, values
# applying the model
self.clf.fit(X,y)
def test(self, test_data_csv, sep=","):
df = pd.read_csv(test_data_csv, sep=sep)
sample = df[['Name', 'Location']]
sample = sample.apply(lambda col: col.str.strip())
test = sp.hstack(sample.apply(lambda col: self.vect.fit_transform(col)))
return self.clf.predict(test)
if __name__ == '__main__':
ur = UniRank('/home/maitreyee/Documents/Rdata/classifyUniWithR/version2/scored_collg1.csv')
print ur.test('/home/maitreyee/Documents/Rdata/classifyUniWithR/version2/test1uni.csv')
以下是运行上述脚本时的错误
/home/maitreyee/anaconda/lib/python2.7/site-packages/sklearn /svm/base.py:472: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y_ = column_or_1d(y, warn=True)
Traceback (most recent call last):
File "uni_rank2.py", line 40, in <module>
print ur.test('/home/maitreyee/Documents/Rdata/classifyUniWithR/version2/test1uni.csv')
File "uni_rank2.py", line 35, in test
return self.clf.predict(test)
File "/home/maitreyee/anaconda/lib/python2.7/site-packages/sklearn /svm/base.py", line 500, in predict
y = super(BaseSVC, self).predict(X)
File "/home/maitreyee/anaconda/lib/python2.7/site-packages/sklearn/svm/base.py", line 290, in predict
X = self._validate_for_predict(X)
File "/home/maitreyee/anaconda/lib/python2.7/site-packages/sklearn/svm/base.py", line 443, in _validate_for_predict
(n_features, self.shape_fit_[1]))
ValueError: X.shape[1] = 181 should be equal to 865, the number of features at training time
我的训练集
Index,Name,Location,loc_val,sal_val,mean_rank,mean_score,sum_score,score
0,Indian Institute of Technology (IITDelhi),Delhi,0.0128,1.028125,0.0162,0.352375,1.057125,100
1,Indian Institute of Technology (IITDelhi),Delhi,0.0128,0.990625,0.0162,0.339875,1.019625,100
2,Indian Institute of Technology (IITDelhi),Delhi,0.0128,0.959375,0.0162,0.3294583333,0.988375,100
3,Indian Institute of Technology (IITBombay),Bombay,0.008,1,0.02025,0.34275,1.02825,100
4,Indian Institute of Technology (IITBombay),Bombay,0.008,1,0.02025,0.34275,1.02825,100
5,Indian Institute of Technology (IITBombay),Bombay,0.008,1,0.02025,0.34275,1.02825,100
6,Indian Institute of Technology (IITKharagpur),Kharagpur,0.0176,0.991875,0.022275,0.3439166667,1.03175,100
7,Indian Institute of Technology (IITKharagpur),Kharagpur,0.0176,1.0125,0.022275,0.3507916667,1.052375,100
8,Indian Institute of Technology (IITKharagpur),Kharagpur,0.0176,0.95375,0.022275,0.3312083333,0.993625,100
9,Indian Institute of Technology (IITMadras),Madras,0.0224,0.9875,0.02835,0.3460833333,1.03825,100
测试集
Index,Name,Location,location_val,salary_val,mean_rank,mean_score,sum_score
254,Gandhi Institute of Technology and Management Engineering,Vishakapatnam,0.0096,0.4925,0.5508,0.3509666667,1.0529
255,Cochin University of science and technology Engineering,Cochin,0.0112,0.296875,0.62775,0.3119416667,0.935825
256,Cochin University of science and technology Engineering,Cochin,0.0112,0.443125,0.62775,0.3606916667,1.082075
257,Cochin University of science and technology Engineering,Cochin,0.0112,0.296875,0.62775,0.3119416667,0.935825
258,KC College of Arts Science & Commerce Arts,Lucknow,0.008,0.21875,0.32805,0.1849333333,0.5548
259,Faculty of Arts University of Lucknow Arts,Lucknow,0.0032,0.21875,0.3483,0.1900833333,0.57025
260,Scottish Church College Arts,Kolkata,0.0192,0.21875,0.3564,0.1981166667,0.59435
261,L.D. Arts College Arts,Ahmedabad,0.0112,0,0.3645,0.1252333333,0.3757
262,St. Francis College for Women Arts,Hyderabad,0.0112,0.125,0.2997,0.1453,0.4359
263,Wilson College Arts,Mumbai,0.008,0.125,0.3807,0.1712333333,0.5137
264,PSG College of Arts & Science Arts,Coimbatore,0.0064,0.125,0.3888,0.1734,0.5202
我在python中第一次使用SVM作为字符串数据。如果有更好的方法来训练字符串上的SVM,请告诉我。