如何修复此ValueError:标签数= 14与样本数= 56不匹配?

时间:2017-10-26 18:25:14

标签: python python-3.x csv scikit-learn

数据:

rid age     income  student credit_rating   class_buy_computer
1   young   high    no  fair    no
2   young   high    no  excellent   no
3   middle  high    no  fair    yes
4   senior  medium  no  fair    yes
5   senior  low yes fair    yes
6   senior  low yes excellent   no
7   middle  low yes excellent   yes
8   young   medium  no  fair    yes
9   young   low yes fair    yes
10  senior  medium  yes fair    yes
11  young   medium  yes excellent   yes
12  middle  medium  no  excellent   yes
13  middle  high    yes fair    yes
14  senior  medium  no  excellent   no

代码:

from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
from sklearn.externals.six import StringIO

myData = open(r'C:\Users\USER\Desktop\test.csv')
reader = csv.reader(myData)
headers=next(reader)  # first CSV row holds the column names
print (headers)
featuelist=[]  # feature dicts that are later fed to DictVectorizer
labeList=[]  # last column of every data row (the class label)
for row in reader:
    labeList.append(row[len(row)-1])
    rowDict={}
    for i in range(1,len(row)-1):  # skips column 0 and the last column
        rowDict[headers[i]]=row[i]
        featuelist.append(rowDict)  # NOTE: inside the inner loop, so the same dict is appended once per feature column (4x per row for the 6-column data shown -> 56 samples vs 14 labels)
print(featuelist)

vec=DictVectorizer()
dummyX=vec.fit_transform(featuelist).toarray()  # one row per entry in featuelist
print('dummyX:'+str(dummyX))
print(vec.get_feature_names())
print('labeList:'+str(labeList))

lb=preprocessing.LabelBinarizer()
dummyY=lb.fit_transform(labeList)  # one row per entry in labeList
print('dummyY:'+str(dummyY))

clf=tree.DecisionTreeClassifier(criterion='entropy')
clf=clf.fit(dummyX,dummyY)  # raises the ValueError quoted below: len(dummyX) != len(dummyY)
print('clf:'+str(clf))

我收到此错误:

  File "<ipython-input-20-eacaea56a8a9>", line 1, in <module>
    runfile('C:/Users/USER/Desktop/test.py', wdir='C:/Users/USER/Desktop')

  File "D:\tools\python\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile

  File "D:\tools\python\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile

  File "C:/Users/USER/Desktop/test.py", line 32, in <module>
    clf=clf.fit(dummyX,dummyY)

  File "D:\tools\python\lib\site-packages\sklearn\tree\tree.py", line 790, in fit
    X_idx_sorted=X_idx_sorted)

  File "D:\tools\python\lib\site-packages\sklearn\tree\tree.py", line 236, in fit
    "number of samples=%d" % (len(y), n_samples))

ValueError: Number of labels=14 does not match number of samples=56

1 个答案:

答案 0 :(得分:1)

这只是因为每一行都被添加到 featuelist 列表中 4 次(内层循环每处理一个特征列就追加一次)。下面这一行不应该放在第二个(内层)循环内:

featuelist.append(rowDict)

输出

from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
from sklearn.externals.six import StringIO

# Train a decision tree on a small categorical CSV data set.
#
# Expected CSV layout: first row = column names, column 0 = row id,
# last column = class label, everything in between = categorical features.

# Read the file inside a context manager so the handle is always closed;
# newline='' is the mode the csv module documents for files passed to reader().
with open('/home/kashif/test.csv', newline='') as myData:
    reader = csv.reader(myData)
    headers = next(reader)  # first row holds the column names
    print(headers)
    featuelist = []  # one {column name: value} dict per sample
    labeList = []    # class label of each sample, same order as featuelist
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; they carry no sample.
            continue
        labeList.append(row[-1])
        rowDict = {}
        # Skip column 0 (row id) and the last column (label).
        for i in range(1, len(row) - 1):
            rowDict[headers[i]] = row[i]

        # Make sure the line below is NOT inside the inner loop: appending
        # there adds the same dict once per feature column, so the number of
        # samples no longer matches the number of labels.
        featuelist.append(rowDict)  # <-- This was the typo.
print(featuelist)

# One-hot encode the categorical feature dicts into a dense array.
vec = DictVectorizer(sparse=False)
dummyX = vec.fit_transform(featuelist)
print('dummyX:' + str(dummyX))
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); kept as-is to match the sklearn version this
# script imports from (sklearn.externals.six) -- confirm before upgrading.
print(vec.get_feature_names())
print('labeList:' + str(labeList))

# Encode the string class labels as a binary column.
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labeList)
print('dummyY:' + str(dummyY))

# Fit the tree using information gain (entropy) as the split criterion.
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print('clf:' + str(clf))