此脚本使用的数据集(可在此处获取)共有 100 列。
但是,PCA 的结果显示,要到 n_components = 153 时,剩余未解释的方差才低于 5%。
n_components如何大于列数?
谢谢
# Load the data, one-hot encode and standardize the features, reduce them with
# PCA to the smallest number of components that explains at least 95% of the
# variance, then split the reduced data into train and test sets.
df = pd.read_csv(path, delimiter=',', header='infer')

# Number of raw feature columns in the CSV (every column except the target).
# This is NOT the dimensionality PCA sees: pd.get_dummies below replaces each
# categorical column with one indicator column per category, so the encoded
# matrix can have more columns than the file does.
number_features = len(df.columns) - 1

y = df[[target]]
x = df.drop([target], axis=1)

# Encode categorical columns as indicator columns.
x = pd.get_dummies(x)
# Fill in NA values with zeros.
x = x.fillna(0)
# Standardize the scale (the result is already a NumPy array).
x = StandardScaler().fit_transform(x)
# Convert the target DataFrame to a NumPy array.
y = np.array(y)

# Find the smallest number of components that still retains at least 95% of
# the variability of the encoded dataset.
pca = PCA().fit(x)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# np.where yields zero-based indices, so the first index reaching the
# threshold must be incremented to obtain a component COUNT. Without the +1
# one component too few is kept and the retained variance falls below 95%.
n_pca = int(np.where(cumulative_variance >= 0.95)[0][0]) + 1

# NOTE(review): the scaler and PCA are fitted on the full dataset before the
# train/test split, so test rows influence the transformation. Consider
# splitting first and fitting on the training portion only.
pca = PCA(n_components=n_pca)
dataset = pca.fit_transform(x)

train_features, test_features, train_labels, test_labels = train_test_split(
    dataset, y, test_size=pct_data_test / 100
)