我想在一个管道(Pipeline)中对一个相对较大的数据集(200000 行、400 列)进行训练,而可用于执行此任务的只有一台性能较弱的笔记本电脑。该数据集包含 15 个互不相同的类别,并且同时含有分类特征和数值特征。应当选用类似 SVM 的算法。
我已经尝试拼凑出了下面这段代码:
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelBinarizer,StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV
from sklearn.multiclass import OneVsRestClassifier
# Reproduction script from the question: build a synthetic 15-class data set,
# preprocess it, then run recursive feature elimination around an
# RBF-approximated linear SVM.
# NOTE(review): `np` and `pd` are used below but are not imported in the
# visible snippet — presumably `import numpy as np` / `import pandas as pd`
# were executed elsewhere; confirm.
X, y= make_classification(n_samples=200000, n_features=130, n_informative=105,
n_redundant=25, n_classes=15, n_clusters_per_class=15)
# Turn the first two columns into pseudo-categorical features by taking the
# integer part of their absolute values.
X [:,:2]= np.abs(X[:,:2]).astype(int)
# Wrap the array in a DataFrame with column names F0, F1, ... so the
# ColumnTransformer below can select columns by name.
X = pd.DataFrame(X, columns=[f'F{i}' for i in range(X.shape[1])])
cols = X.columns.tolist()
# NOTE(review): LabelBinarizer expands y into one indicator column per class;
# the (200000, 15) shape reported in the ValueError after this script matches
# that output — this line looks like the origin of the error.
y = LabelBinarizer().fit_transform(y)
#%%Transformation
# Standardize every column except the first two; one-hot encode the first two.
full_pipeline = ColumnTransformer([
('numerical', StandardScaler(), cols[2:]),
('categorical', OneHotEncoder(categories='auto'), cols[:2])
])
# Original note says the result is a sparse matrix.
# NOTE(review): whether ColumnTransformer actually returns a sparse matrix
# depends on its sparse_threshold and the output density — verify.
X = full_pipeline.fit_transform(X)
# Model definition: random Fourier features (RBF kernel approximation)
# followed by a linear classifier trained with SGD on the hinge loss.
rbf = RBFSampler(gamma=0.1, random_state=42)
semi_svm = SGDClassifier(loss="hinge", penalty="l2", max_iter=50)
clf_pipe = Pipeline([
('rbf', rbf),
('svm', semi_svm)
])
# Five stratified shuffle splits are used for cross-validation.
cv = StratifiedShuffleSplit(n_splits=5)
# Despite the variable name, this is recursive feature elimination with
# cross-validation (RFECV), not a grid search; features are dropped three at a
# time and candidates are scored by accuracy.
grid_search = RFECV(estimator=OneVsRestClassifier(clf_pipe), step=3, cv=cv,
scoring='accuracy', n_jobs=-1, verbose=10)
# This call is followed in the question by the reported ValueError.
grid_search.fit(X, y)
运行后出现如下错误:ValueError: bad input shape (200000, 15)(输入形状错误)
在这种情况下,应该如何处理这个由多分类标签引起的错误?
答案 0 :(得分:1)
以下解决方案对我有用:
# Answer snippet: the `...` placeholders stand for the unchanged parts of the
# question's script (imports, data generation and preprocessing).
# The imports below cover the names this snippet uses that the question's
# import list does not provide.
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import LabelEncoder

...
# Encode the class labels as a single 1-D vector of integer codes instead of
# the (n_samples, n_classes) indicator matrix produced by LabelBinarizer.
y = LabelEncoder().fit_transform(y)
...
# Random Fourier features approximating an RBF kernel.
rbf = RBFSampler(gamma=0.1, random_state=42)
# Linear SVM trained with SGD, wrapped one-vs-one for the multiclass target.
semi_svm = OneVsOneClassifier(SGDClassifier(loss="hinge", penalty="l2", max_iter=5000))
# Univariate feature selection; k=1 is only a placeholder that the grid
# search below overrides via `features__k`.
selection = SelectKBest(k=1)
clf_pipe = Pipeline([
('rbf', rbf),
('features', selection ),
('svm', semi_svm)
])
cv = StratifiedShuffleSplit(n_splits=5)
# Candidate k values are 2**1 .. 2**6 on a log scale, rounded to integers
# (2, 5, 11, 27, 64); gamma is tuned over two values.
# NOTE(review): k must not exceed the number of features RBFSampler emits
# (its n_components setting) — confirm if n_components is changed.
param_grid = dict(features__k=np.logspace(1,6, num=5, base=2).round().astype(int),
rbf__gamma = [0.1,1])
# 'f1' is the binary-average F1 scorer and cannot score a multiclass target;
# 'f1_macro' averages the per-class F1 scores over all classes instead.
grid_search = GridSearchCV(estimator=clf_pipe, cv=cv, param_grid = param_grid,
scoring='f1_macro', n_jobs=-1, verbose=10)