消除高度相关的列,但保留我的非数字列

时间:2015-11-19 16:30:36

标签: r correlation

我有一个同时包含分类变量和数值变量的数据框,想剔除其中高度相关的变量。

所以,我先删除了所有分类列,计算出相关矩阵,并找出了应当剔除的数值列。现在我想回到原始数据框:在剔除这些数值列的同时,保留我的分类变量列。

我不确定最简单的做法是什么……也许可以用联接(join),从第二个表中选择要加回来的列……

# Load caret, which provides findCorrelation().
library(caret)
# Drop the categorical columns so that only numeric columns are passed to cor().
# "etc" is presumably a placeholder for the remaining categorical column names.
training_2 <- subset(training, select = -c(user_name ,timestamp, etc))
# Pairwise correlation matrix of the remaining columns.
corr_matrix <- cor(training_2)
# Column positions (within corr_matrix / training_2) flagged at a 0.75 cutoff.
highCorr <- findCorrelation(corr_matrix, .75)
# Remove the flagged columns by position.
# NOTE(review): these positions refer to training_2, not to the original
# `training`, so they cannot be reused on the full data frame; map them to
# column names first. Also, if highCorr is empty, `-highCorr` is empty too and
# this subsetting selects zero columns -- guard with length(highCorr) > 0.
training_2<- training_2[,-highCorr]

编辑:数据

structure(list(X = 1:15, yaw_belt = c(-94.4, -94.4, -94.4, -94.4, 
-94.4, -94.4, -94.4, -94.4, -94.4, -94.4, -94.4, -94.4, -94.4, 
-94.4, -94.4), gyros_belt_x = c(0, 0.02, 0, 0.02, 0.02, 0.02, 
0.02, 0.02, 0.02, 0.03, 0.03, 0.02, 0.02, 0.02, 0), gyros_belt_y = c(0, 
0, 0, 0, 0.02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), gyros_belt_z = c(-0.02, 
-0.02, -0.02, -0.03, -0.02, -0.02, -0.02, -0.02, -0.02, 0, -0.02, 
-0.02, 0, -0.02, 0), magnet_belt_x = c(-3L, -7L, -2L, -6L, -6L, 
0L, -4L, -2L, 1L, -3L, -5L, -2L, -3L, -8L, -1L), magnet_belt_y = c(599L, 
608L, 600L, 604L, 600L, 603L, 599L, 603L, 602L, 609L, 596L, 602L, 
606L, 598L, 597L), roll_arm = c(-128, -128, -128, -128, -128, 
-128, -128, -128, -128, -128, -128, -128, -128, -128, -129), 
    pitch_arm = c(22.5, 22.5, 22.5, 22.1, 22.1, 22, 21.9, 21.8, 
    21.7, 21.6, 21.5, 21.5, 21.4, 21.4, 21.4), yaw_arm = c(-161, 
    -161, -161, -161, -161, -161, -161, -161, -161, -161, -161, 
    -161, -161, -161, -161), total_accel_arm = c(34L, 34L, 34L, 
    34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L
    ), gyros_arm_y = c(0, -0.02, -0.02, -0.03, -0.03, -0.03, 
    -0.03, -0.02, -0.03, -0.03, -0.03, -0.03, -0.02, 0, 0), gyros_arm_z = c(-0.02, 
    -0.02, -0.02, 0.02, 0, 0, 0, 0, -0.02, -0.02, 0, 0, -0.02, 
    -0.03, -0.03), magnet_arm_x = c(-368L, -369L, -368L, -372L, 
    -374L, -369L, -373L, -372L, -369L, -376L, -366L, -363L, -372L, 
    -371L, -374L), magnet_arm_z = c(516L, 513L, 513L, 512L, 506L, 
    513L, 509L, 510L, 518L, 516L, 509L, 520L, 509L, 523L, 510L
    ), roll_dumbbell = c(13.05217456, 13.13073959, 12.85074981, 
    13.43119971, 13.37871611, 13.38245941, 13.12694911, 12.75083041, 
    13.15463353, 13.33033728, 13.13073959, 13.10320545, 13.38245941, 
    13.41047767, 13.07948887), pitch_dumbbell = c(-70.49400371, 
    -70.63750507, -70.27811982, -70.39379464, -70.42855971, -70.81758832, 
    -70.24756905, -70.34768359, -70.42520377, -70.85058796, -70.63750507, 
    -70.45974712, -70.81758832, -70.99594236, -70.67116245), 
    yaw_dumbbell = c(-84.87393888, -84.71064711, -85.14078134, 
    -84.87362553, -84.85305745, -84.46500278, -85.09961258, -85.09708174, 
    -84.91563379, -84.44601983, -84.71064711, -84.89472246, -84.46500278, 
    -84.28004856, -84.69053461), total_accel_dumbbell = c(37L, 
    37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 
    37L, 37L), gyros_dumbbell_y = c(-0.02, -0.02, -0.02, -0.02, 
    -0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0.02, 
    -0.02, -0.02), magnet_dumbbell_z = c(-65, -64, -63, -60, 
    -68, -66, -70, -74, -65, -69, -64, -65, -69, -68, -63), roll_forearm = c(28.4, 
    28.3, 28.3, 28.1, 28, 27.9, 27.9, 27.8, 27.7, 27.7, 27.6, 
    27.5, 27.2, 27.2, 27.2), pitch_forearm = c(-63.9, -63.9, 
    -63.9, -63.9, -63.9, -63.9, -63.9, -63.8, -63.8, -63.8, -63.8, 
    -63.8, -63.9, -63.9, -63.9), yaw_forearm = c(-153, -153, 
    -152, -152, -152, -152, -152, -152, -152, -152, -152, -152, 
    -151, -151, -151), total_accel_forearm = c(36L, 36L, 36L, 
    36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L
    ), gyros_forearm_x = c(0.03, 0.02, 0.03, 0.02, 0.02, 0.02, 
    0.02, 0.02, 0.03, 0.02, 0.02, 0.02, 0, 0, 0), gyros_forearm_z = c(-0.02, 
    -0.02, 0, 0, -0.02, -0.03, -0.02, 0, -0.02, -0.02, -0.02, 
    -0.03, -0.03, -0.03, -0.02), accel_forearm_x = c(192L, 192L, 
    196L, 189L, 189L, 193L, 195L, 193L, 193L, 190L, 193L, 191L, 
    193L, 193L, 192L), accel_forearm_z = c(-215L, -216L, -213L, 
    -214L, -214L, -215L, -215L, -213L, -214L, -215L, -214L, -215L, 
    -215L, -214L, -214L), magnet_forearm_x = c(-17L, -18L, -18L, 
    -16L, -17L, -9L, -18L, -9L, -16L, -22L, -17L, -11L, -15L, 
    -14L, -16L), magnet_forearm_y = c(654, 661, 658, 658, 655, 
    660, 659, 660, 653, 656, 657, 657, 655, 659, 656), magnet_forearm_z = c(476, 
    473, 469, 469, 473, 478, 470, 474, 476, 473, 465, 478, 472, 
    478, 472)), .Names = c("X", "yaw_belt", "gyros_belt_x", "gyros_belt_y", 
"gyros_belt_z", "magnet_belt_x", "magnet_belt_y", "roll_arm", 
"pitch_arm", "yaw_arm", "total_accel_arm", "gyros_arm_y", "gyros_arm_z", 
"magnet_arm_x", "magnet_arm_z", "roll_dumbbell", "pitch_dumbbell", 
"yaw_dumbbell", "total_accel_dumbbell", "gyros_dumbbell_y", "magnet_dumbbell_z", 
"roll_forearm", "pitch_forearm", "yaw_forearm", "total_accel_forearm", 
"gyros_forearm_x", "gyros_forearm_z", "accel_forearm_x", "accel_forearm_z", 
"magnet_forearm_x", "magnet_forearm_y", "magnet_forearm_z"), row.names = c(NA, 
15L), class = "data.frame")

谢谢

1 个答案:

答案 0 :(得分:2)

遗憾的是,@mclzc,我无法直接使用你的数据:其中有的列只有一个取值(方差为零),我在尝试计算相关性时会得到一个错误。因此我改用一个示例数据集来演示。

library(caret)

# Sample data: 4 numeric fields and 2 categorical ones.
df <- data.frame(
  a = runif(10), b = runif(10), c = 1:10, d = 1:10,
  e = letters[1:10], f = letters[1:10]
)

# Categorical (non-numeric) columns, detected by type rather than listed by
# hand so the same code works on any data frame. Here this is c("e", "f").
is_num <- vapply(df, is.numeric, logical(1))
cat_cols <- names(df)[!is_num]

# Numeric-only copy used for the correlation analysis.
df2 <- df[!names(df) %in% cat_cols]

# Leave constant columns out of the correlation analysis: a column with a
# single distinct value has zero variance, so its correlations are undefined
# and the correlation step fails. Such columns are not flagged for removal
# and therefore stay in the final result.
is_constant <- vapply(
  df2,
  function(x) length(unique(x[!is.na(x)])) < 2L,
  logical(1)
)
df2 <- df2[!is_constant]

# Pairwise correlations between the remaining numeric columns.
cor_mat <- cor(df2)

# Positions (within cor_mat, NOT within df) of the columns to drop because
# they are highly correlated with another column.
index <- findCorrelation(cor_mat, cutoff = 0.75)

# Translate positions into column names; names stay valid in the original df,
# whereas positions shift once the categorical columns are put back.
to_be_removed <- colnames(cor_mat)[index]

# Subset the original df by name: the flagged numeric columns are dropped and
# every categorical column is kept. If nothing was flagged, df is unchanged.
df[!names(df) %in% to_be_removed]

输出(findCorrelation 只标记了列 c 需要删除,分类列 e 和 f 仍然保留):

           a          b  d e f
1  0.8495639 0.52388246  1 a a
2  0.9478409 0.11466655  2 b b
3  0.9086955 0.12097760  3 c c
4  0.7724303 0.03160203  4 d d
5  0.1269406 0.10740182  5 e e
6  0.7454943 0.95892265  6 f f
7  0.2651961 0.15223472  7 g g
8  0.5714763 0.49808509  8 h h
9  0.1226760 0.06088693  9 i i
10 0.5526266 0.14715471 10 j j

请注意,findCorrelation 函数返回的是要删除的列的索引,而不是列名。您需要先得到列名,才能从原始数据集中排除这些列(索引是在排除分类列之后的矩阵上计算的,列数已经改变,因此不能直接用于原始数据集)。因此,在上面的代码中,我用 index 查出对应的列名,然后用列名删除不需要的列(同时保留分类列)。