Question

我有一份 CSV 数据，第一列是 'label'（标签），其后的 784 列是一幅 28 × 28 图像的像素表示。我用 load(filename) 函数创建了一个由 numpy 数组组成的元组，该函数把标签与图像一一对应起来。

现在我想把这些数据转换成下面 read(digits, path=".") 函数所生成的格式，这种格式用于 scikit 库的支持向量机分析。实际上，我是在模仿下面这个页面底部的第二个示例：http://cvxopt.org/applications/svm/index.html

当我尝试重新格式化数据时，遇到了下文贴出的错误。有没有办法把数据转换成所需的格式？

# Path of the training CSV that read() loads; `dir_path` is not defined in
# this snippet and must already exist when this line runs.
train_name=dir_path+'train8.csv'
def load(filename):
    """Load a label-first CSV file into a pair of numpy arrays.

    Every row is expected to hold the label in its first column followed by
    the feature values (e.g. the 784 pixel values of a 28x28 image).

    Parameters
    ----------
    filename : str
        Path of the comma-delimited file to read.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data, expected)`` where ``data`` is a float64 array of shape
        (n_rows, n_columns - 1) holding the feature columns and ``expected``
        is an int64 array of shape (n_rows,) holding the labels. An empty
        file yields empty arrays of shape (0, 0) and (0,).
    """
    import sys

    # The 'U' (universal newlines) flag is only meaningful on Python 2; it
    # was removed in Python 3.11, where the csv module instead expects the
    # file to be opened with newline=''.
    if sys.version_info[0] >= 3:
        csvfile = open(filename, 'r', newline='')
    else:
        csvfile = open(filename, 'rU')

    # read file into a list of rows
    with csvfile:
        rows = list(csv.reader(csvfile, delimiter=','))

    # an empty file has no first row to size the feature array from
    if not rows:
        return (np.empty((0, 0), dtype=np.float64),
                np.empty((0,), dtype=np.int64))

    # create empty numpy arrays of the required size
    data = np.empty((len(rows), len(rows[0]) - 1), dtype=np.float64)
    expected = np.empty((len(rows),), dtype=np.int64)

    # fill arrays with data from the csv rows (numpy converts the strings)
    for i, row in enumerate(rows):
        data[i, :] = row[1:]
        expected[i] = row[0]

    return data, expected

> Result:

    (array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
               [ 0.,  0.,  0., ...,  0.,  0.,  0.],
               [ 0.,  0.,  0., ...,  0.,  0.,  0.],
               ..., 
               [ 0.,  0.,  0., ...,  0.,  0.,  0.],
               [ 0.,  0.,  0., ...,  0.,  0.,  0.],
               [ 0.,  0.,  0., ...,  0.,  0.,  0.]]), array([1, 1, 1, ..., 1, 1, 1]))


def read(digits, path = "."):
    """Try to build (images, labels) matrices for the samples labelled in `digits`.

    The data comes from load(train_name). As posted, this function fails
    with the NotImplementedError shown in the traceback below; the
    NOTE(review) comments mark the statements involved.

    digits -- collection of label values to keep (tested with `in`).
    path   -- NOTE(review): accepted but never used; the file read is
              always the module-level `train_name`.
    """
    data= load(train_name)
    print "sizeImages = ",len(data[0]), "sizelabels = ", len(data[1])

    # data is the (features, labels) tuple returned by load()
    lbl=data[1]
    size=len(data[1])
    img=data[0] #print type(img) OUTPUT: <type 'numpy.ndarray'>

    # NOTE(review): l[0] keeps only the first element of every row, so `img`
    # becomes a list with one value per sample instead of the full rows.
    img =[l[0] for l in img] # print type(img) OUTPUT: <type 'list'>, 
    *This is used to unpack the numpy array from above.* 

    # positions of the samples whose label is one of the requested digits
    ind = [ k for k in xrange(size) if lbl[k] in digits ]

    # presumably cvxopt's `matrix` (not imported in this snippet -- confirm):
    # one row per selected sample, 28*28 columns for images, 1 for labels
    images =  matrix(0, (len(ind), 28*28))
    labels = matrix(0, (len(ind), 1))

    #images =  data[0]
    # NOTE(review): this rebinds `labels` to the list built above, discarding
    # the matrix allocated on the previous statement.
    labels = img

    for i in xrange(len(ind)):
        # NOTE(review): the slice treats `img` as a flat buffer holding 28*28
        # values per sample, but `img` holds a single value per sample; this
        # assignment is the statement that raises in the quoted traceback.
        images[i, :] = img[ ind[i]*28*28: (ind[i]+1)*28*28]
        labels[i] = lbl[ind[i]]
    return images,labels
# Python 2 print statement; `dir_path` is not defined in this snippet.
print read([8],  path = dir_path)

结果：

sizeImages =  5851 sizelabels =  5851
Traceback (most recent call last):
  File "svm.py", line 62, in <module>
    print read([8],  path = dir_path)
  File "svm.py", line 59, in read
    images[i, :] = img[ ind[i]*28*28: (ind[i]+1)*28*28]
NotImplementedError: invalid type in assignment

所需格式：

**(<5949x784 matrix, tc='i'>, <5949x1 matrix, tc='i'>)**

也就是说，上面的第一个矩阵是：{array-like，sparse matrix}，shape = [n_samples, n_features]，即训练向量，其中 n_samples 是样本数，n_features 是特征数。

sample data: 
'''1    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   38  254 109 0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   87  252 82  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   135 241 0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   45  244 150 0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   84  254 63  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   202 223 11  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   32  254 216 0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   95  254 195 0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   140 254 77  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   57  237 205 8   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   124 255 165 0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   171 254 81  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   24  232 215 0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   120 254 159 0   0   0   0   0   0   0   
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   151 254 142 0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   228 254 66  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   61  251 254 66  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   141 254 205 3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   10  215 254 121 0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   5   198 176 10  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0'''

Answer 1

一个可能的解决方案是：

from sklearn.datasets.base import Bunch

file=np.loadtxt('~/data/test3.txt', dtype=int)
target_index=0

dataset = file
data    = None
target  = None
target_names  = None
feature_names = None

# Target assumed to be either last or first row
if target_index == -1:
    data   = dataset[:, 0:-1]
    target = dataset[:, -1]
elif target_index == 0:
    data   = dataset[:, 1:]
    target = dataset[:, 0]
else:
    raise ValueError("Target index must be either -1 or 0")

reformatData=Bunch(data=data, target=target)

print reformatData['data'].shape, reformatData['target'].shape

NotImplementedError：赋值中的类型无效（invalid type in assignment）

1 个答案: