  # imports
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression

col=['Id','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion',
                            'Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']
# read data into a DataFrame
data = pd.read_csv("breast_cancer.txt",header=None, prefix="V")
data.columns = col
d = pd.DataFrame(data,columns=col)


list_of_means = d.mean()

# filling missing values with mean 
for i in range (2,10):
    for j in range(699):
        if d.iloc[j, i] == "?":
            d.iloc[j, i] = round(list_of_means[i],0)
d['Type'] = 'benign'
# map Type to 0 if class is 2 and 1 if class is 4 
d['Type'] = d.Class.map({2:0, 4:1})
X = d[['Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion',
                            'Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses']]


# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(X.reshape(X.shape[0], 1), y)
# check the accuracy on the training set
score = model.score(X, y)
#calculate correlation matrix
corMat = DataFrame(data.iloc[:,2:10].corr())
print 'correlation matrix'

print score

print X.head()

但是我收到了这个错误 逻辑回归ValueError:找到样本数不一致的输入变量: 在我做了一些搜索之后我发现sklearn需要数据形状(行号,列号)所以适合方法

model = model.fit(X.reshape(X.shape[0], 1), y) 


返回对象.__ getattribute __(self,name) AttributeError:'DataFrame'对象没有属性'reshape'


 #  Attribute                     Domain
   -- -----------------------------------------
   1. Sample code number            id number
   2. Clump Thickness               1 - 10
   3. Uniformity of Cell Size       1 - 10
   4. Uniformity of Cell Shape      1 - 10
   5. Marginal Adhesion             1 - 10
   6. Single Epithelial Cell Size   1 - 10
   7. Bare Nuclei                   1 - 10
   8. Bland Chromatin               1 - 10
   9. Normal Nucleoli               1 - 10
  10. Mitoses                       1 - 10
  11. Class:                        (2 for benign, 4 for malignant)


我的代码是正确的,只是当我试图分配时我有一个错字 我所以我改变了



还要注意scikit learn会抛出警告,因为我传递的是列向量而不是id数组,但你可以通过更改来解决这个问题

model.fit(X, y) to 
model.fit(X, y.values.ravel())

DataConversionWarning:当预期有1d数组时,传递了列向量y。请将y的形状更改为(n_samples,),例如使用ravel()。   y = column_or_1d(y,warn = True)