下面是可以重现问题的代码和错误消息。我还打印了待拼接(使用 hstack)的原始数据内容和形状,看起来都没有问题,想知道错误出在哪里?
from sklearn.model_selection import train_test_split
import pandas as pd
from pandas import DataFrame
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
# Minimal reproduction: stack a numeric column ('Age') next to bag-of-words
# counts built from a text column ('Tags').
big_X = pd.DataFrame({'Tags':['tag_a tag_b tag_c', 'tag_b tag_c', 'tag_b tag_c tag_d', 'tag_e tag_b tag_b tag_a'], 'Age':[20, 21, 19, 18]})
big_Y = pd.DataFrame({'Label':[0, 1, 0, 1]})
# 4 rows with test_size=0.5 -> 2 training rows and 2 test rows.
X_train, X_test, y_train, y_test = train_test_split(big_X, big_Y, test_size=0.5)
# Selecting a single column yields a 1-D pandas Series, not an (n, 1) column.
result_matrix_train = X_train['Age']
result_matrix_test = X_test['Age']
sparse_columns = ['Tags']
for feature_colunm_name in sparse_columns:
    print('processing feature name: ', feature_colunm_name)
    cv = CountVectorizer(stop_words=None)
    X_train_cv = cv.fit_transform(X_train[feature_colunm_name])
    print ('X_train_cv: ', X_train_cv)
    print ('result_matrix_train: ', result_matrix_train)
    # Merge the vector with others
    if result_matrix_train is not None:
        print (result_matrix_train)
        print (X_train_cv)
        # NOTE: this call raises the ValueError reported below;
        # result_matrix_train is still the 1-D 'Age' Series at this point.
        result_matrix_train = hstack((result_matrix_train, X_train_cv))
    else:
        result_matrix_train = X_train_cv
    # Now transform the test data
    X_test_cv = cv.transform(X_test[feature_colunm_name])
    if result_matrix_test is not None:
        result_matrix_test = hstack((result_matrix_test, X_test_cv))
    else:
        result_matrix_test = X_test_cv
错误消息:
24 print (result_matrix_train)
25 print (X_train_cv)
---> 26 result_matrix_train = hstack((result_matrix_train, X_train_cv))
27 else:
28 result_matrix_train = X_train_cv
584 exp=brow_lengths[i],
585 got=A.shape[0]))
--> 586 raise ValueError(msg)
587
588 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 2, expected 1.
答案 0 :(得分:1)
result_matrix_train(以及 result_matrix_test)是形状为 (2,) 的一维 Series,hstack 在内部把它转换成稀疏矩阵时会变成 (1,2),因此行数与词频矩阵不一致。您需要使用 scipy.sparse.csr_matrix.reshape(spar_mat,(-1,1)) 将其重塑为 (2,1),然后才能按列拼接。
from sklearn.model_selection import train_test_split
import pandas as pd
from pandas import DataFrame
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
import scipy
# Worked fix: a 1-D pandas Series has to be turned into an (n, 1) sparse
# column before it can be hstack-ed with the CountVectorizer output.
big_X = pd.DataFrame({'Tags':['tag_a tag_b tag_c', 'tag_b tag_c', 'tag_b tag_c tag_d', 'tag_e tag_b tag_b tag_a'], 'Age':[20, 21, 19, 18]})
big_Y = pd.DataFrame({'Label':[0, 1, 0, 1]})
# No random_state is passed, so the rows chosen for train/test (and therefore
# the vocabulary size in the shape comments below) can differ between runs.
X_train, X_test, y_train, y_test = train_test_split(big_X, big_Y, test_size=0.5)
result_matrix_train = X_train['Age']
result_matrix_test = X_test['Age']
feature_colunm_name = "Tags"
cv = CountVectorizer(stop_words=None)
X_train_cv = cv.fit_transform(X_train[feature_colunm_name])
result_matrix_train.shape  # (2,)
# explicitly convert to csr matrix (your code did this implicitly when calling hstack)
spar_mat = scipy.sparse.csr_matrix(result_matrix_train.values)
# this now has the wrong shape: the 1-D input became a single row
spar_mat.shape  # (1, 2)
# reshape this to be (n x 1)
spar_mat_shape = spar_mat.reshape((-1, 1))
# this now has the right shape for hstack
spar_mat_shape.shape  # (2, 1)
X_train_cv.shape  # (2, n_terms), e.g. (2, 3)
# hstack succeeds because both blocks now have 2 rows
result_matrix_train = hstack((spar_mat_shape, X_train_cv))
result_matrix_train.shape  # (2, 1 + n_terms), e.g. (2, 4)
# you need to do the same for the "test" portion of your code
result_matrix_test.shape  # (2,)
X_test_cv = cv.transform(X_test[feature_colunm_name])
# result_matrix_test = hstack((result_matrix_test, X_test_cv)) ... this would fail
# this will succeed:
spar_mat_test = scipy.sparse.csr_matrix(result_matrix_test.values)
spar_mat_test_shape = spar_mat_test.reshape((-1, 1))
result_matrix_test = hstack((spar_mat_test_shape, X_test_cv))
# transform() reuses the vocabulary fitted on the training rows, so the test
# matrix has the same number of columns as the training matrix.
result_matrix_test.shape  # (2, 1 + n_terms), e.g. (2, 4)