我有一个功能,它预先处理包含该信息的{csv}文件中的数据http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data
def loadAndPreprocess(filename):
# ------------------------------------------------------------------------------------------------
print("Activity 1a: ")
dataLabels = [
'age',
'sex',
'cp',
'trestbps',
'chol',
'fbs',
'restecg',
'thalach',
'exang',
'oldpeak',
'slope',
'ca',
'thal',
'class'
]
data=pd.read_csv(filename,sep=',',header=None,names=dataLabels,na_values=["?"])
print(data)
#print("Original data")
#print(data.ix[:,0:])
n=len(data)
#print ("Amount of instances: " + str(n))
#print ("Analyzing class distribution")
#print (list(data['class'].value_counts()))
#print("Rows with missing values")
#print(sum(numpy.isnan(data).any(axis=1)))
#print("Attributes with missing values")
#print(len(data.isnull().sum().loc[data.isnull().sum()> 0]))
#Remove rows with missing data
cleanData=data[~numpy.isnan(data).any(axis=1)]
cleanData=cleanData.reset_index(drop=True) #Required. Otherwise, the index of the rows dropped keep active
#print("Clean data")
#print(cleanData[:])
# Separating classes (Y) from values (X)
dataX=cleanData.ix[:,0:13]
dataY=cleanData.ix[:,13]
# Extract status and standardize product values
attributes = preprocessing.scale(dataX)
#print("Scaled data")
#print(attributes[:])
return attributes, dataY
def exercise1(attributes, classes):
print("Activity 1a")
# Apply PCA requesting all components (no argument)
pca = PCA(n_components=3)
pca.fit(attributes)
result = pca.transform(attributes)
fig = plt.figure(1, figsize=(10, 6))
sp = fig.gca(projection='3d')
sp.scatter(attributes[:, 0], attributes[:, 1], attributes[:, 2])
plt.show()
X, y = loadAndPreprocess('processedCleveland.csv')
exercise1(X, y)
但是当我显示结果时,会分为两组,一组靠近y轴的最大值,另一组靠近y轴的最小值,形成水平线。
我认为我没有以正确的方式传递数据,但我不确定是什么问题。
答案 0 :(得分:0)
请更改以下行
sp.scatter(attributes[:, 0], attributes[:, 1], attributes[:, 2])
到
sp.scatter(result[:, 0], result[:, 1], result[:, 2])