Question

我想生成相关性，它可能是基本的，但我无法得到它。需要你的帮助!!

我正在尝试为用户指定的变量生成相关性（即需要生成相关性的变量不固定，在不同的场景中可能会有所不同，因此需要把它们输入并存储在向量 str_char 中）。

对于这些变量中的每一个，我需要生成与值变量的相关性，并且相关性应基于类型变量中的组。

以下是样本虚拟数据。我的实际数据有更多的列和行。

library("plyr")
library("data.table")

# Build reproducible dummy data: 100 rows of randomly sampled columns.
set.seed(1200)
id <- 1:100
bills <- sample(1:20,100,replace = T)
nos <- sample(1:80,100,replace = T)
# `stru` holds letters ("A".."D"), unlike the other candidate columns, which are integers.
stru <- sample(c("A","B","C","D"),100,replace = T)
var1 <- sample(1:80,100,replace = T)
var2 <- sample(1:80,100,replace = T)
v1 <- sample(1:80,100,replace = T)
v2 <- sample(1:80,100,replace = T)
a1 <- sample(1:80,100,replace = T)
b1 <- sample(1:80,100,replace = T)
# `type` (1..7) is the grouping variable; `value` is what every selected
# column is correlated against.
type <- sample(1:7,100,replace = T)
value <- sample(100:1000,100,replace = T)

# Combine all columns into a single data.table.
df1 <- as.data.table(data.frame(id,bills,nos,stru,var1,var2,v1,v2,a1,b1,type,value))

# Names of the columns to correlate with `value`. This list differs between
# scenarios and has to be updated by hand.
str_char <- c("bills","nos","stru","var2","v1","b1")

len <- length(str_char)

# The variables are not fixed, so a for loop walks over them; ddply is used to
# compute the correlation separately within each `type` group.

# Accumulator that each iteration's result is column-bound onto.
corr<-data.frame()
for (i in 1:len){
  # Overwrites the existing `var1` column of df1 with the result of looking up
  # the i-th requested column.
  # NOTE(review): df1 is a data.table, so `df1[, <expression>]` may evaluate the
  # expression instead of selecting the column -- confirm what is stored here.
  df1$var1 <- df1[,which(colnames(df1) == str_char[i])]
  # Rebinds the free-standing variable `var1` (a numeric vector above) to the
  # column *name*, i.e. a character string.
  var1 <- str_char[i]
  # Spearman correlation between `value` and `var1`, one row per `type`.
  # The result column is always called `var1`, whatever str_char[i] is.
  temp1 <- ddply(
    df1
    , .(type)
    , summarize
    , var1=cor(value,var1,method="spearman")
  )
  # Bind this iteration's result column-wise onto the accumulator.
  # NOTE(review): `corr` starts as a zero-row data.frame -- confirm how cbind
  # treats that against the per-type rows of temp1.
  corr <- as.data.frame(cbind(corr,temp1))
}

这会为 corr 生成一个空数据框。不知道我哪里错了。我希望将行中的类型和每个变量放在具有相关值的单元格的列中。

一旦我有了包含相关值的数据框，我想找出相关性 > 0.2 的变量，并将它们存储在向量中。

请你帮忙建议我出错的地方或建议一些更好的方法来满足这个要求。

谢谢！

Answer 1

使用 data.table 时，“复杂的技巧”并不是必须的。可以使用 by 参数（而不是 split()）和 .SDcols 参数来指定要在 cor() 调用中使用的列。所以，这是非常直接的 data.table 语法：

# Spearman correlation of `value` with each column named in `str_char`,
# computed separately for every `type`.
# `stru` is left out here because it is a factor, not numeric.
str_char <- c("bills", "nos", "var2", "v1", "b1")
# `.SDcols` (spelled out in full rather than relying on partial argument
# matching) restricts .SD to the requested columns; `keyby` groups by `type`
# and returns the groups in sorted order.
df1[, lapply(.SD, function(x) cor(value, x, method = "spearman")),
    keyby = type, .SDcols = str_char]

    type       bills         nos        var2          v1          b1
1:     1 -0.58026951  0.16493506 -0.07664827  0.11627152 -0.05595326
2:     2  0.02646100  0.22246750  0.40308468  0.38943918 -0.10121018
3:     3 -0.11389551  0.36446564 -0.16438528  0.00000000 -0.04100238
4:     4 -0.45645233 -0.21585955 -0.19560440  0.28351648 -0.08580863
5:     5 -0.18596606 -0.23776224 -0.06304738 -0.03508794  0.39860140
6:     6 -0.72346726 -0.04175824  0.24862501 -0.30583077 -0.31718139
7:     7 -0.02649032 -0.08810594  0.48398529  0.30143033  0.50165047

# Same per-`type` Spearman correlations, now including `stru`: every column is
# passed through as.numeric() so the factor `stru` is coerced to numeric
# before being handed to cor().
str_char <- c("bills", "nos", "stru", "var2", "v1", "b1")
# `.SDcols` is spelled out in full rather than relying on partial argument
# matching of `.SDcol`.
result <- df1[, lapply(.SD, function(x) cor(value, as.numeric(x), method = "spearman")),
    keyby = type, .SDcols = str_char]
# Print the wide table: one row per type, one column per variable.
result

    type       bills         nos        stru        var2          v1          b1
1:     1 -0.58026951  0.16493506  0.08202645 -0.07664827  0.11627152 -0.05595326
2:     2  0.02646100  0.22246750  0.21968328  0.40308468  0.38943918 -0.10121018
3:     3 -0.11389551  0.36446564 -0.11769798 -0.16438528  0.00000000 -0.04100238
4:     4 -0.45645233 -0.21585955 -0.37551547 -0.19560440  0.28351648 -0.08580863
5:     5 -0.18596606 -0.23776224  0.39444627 -0.06304738 -0.03508794  0.39860140
6:     6 -0.72346726 -0.04175824  0.28585837  0.24862501 -0.30583077 -0.31718139
7:     7 -0.02649032 -0.08810594 -0.05718863  0.48398529  0.30143033  0.50165047

请注意，keyby代替by使结果与LAP's answer中的结果相同，以便进行比较。

此外，OP 还要求在结果中附加一个新列，其中包含每个 type 中 cor() 值 > 0.2 且最高的前 3 个变量的名称。

从宽格式到长格式重塑result之后，可以最方便地查找前3个值：

# Step 1: wide -> long, giving one row per (type, variable) pair with the
# coefficient stored in column `value`.
long_result <- melt(result, id.vars = "type")
# Step 2: keep only coefficients above the 0.2 cut-off.
above_cutoff <- long_result[value > 0.2]
# Step 3: sort by descending coefficient, then for each type collapse the
# first three variable names (fewer if not available) into one string.
above_cutoff[order(-value), .(V1 = toString(head(variable, 3L))), keyby = type]

   type            V1
1:    2 var2, v1, nos
2:    3           nos
3:    4            v1
4:    5      b1, stru
5:    6    stru, var2
6:    7  b1, var2, v1

通过更新连接（update join）将其追加到 result：

# Per-type string naming up to three variables whose coefficient exceeds 0.2,
# highest first (column V1).
top_vars <- melt(result, id.vars = "type")[value > 0.2][
  order(-value), toString(head(variable, 3L)), keyby = type]
# Update join: add `selected` to `result` by reference, matching on `type`.
result[top_vars, on = "type", selected := V1]
# Types with no qualifying variable are left NA by the join; show "" instead.
result[is.na(selected), selected := ""]
# `:=` returns invisibly, so print the updated table explicitly.
result[]

   type       bills         nos        stru        var2          v1          b1      selected
1:    1 -0.58026951  0.16493506  0.08202645 -0.07664827  0.11627152 -0.05595326              
2:    2  0.02646100  0.22246750  0.21968328  0.40308468  0.38943918 -0.10121018 var2, v1, nos
3:    3 -0.11389551  0.36446564 -0.11769798 -0.16438528  0.00000000 -0.04100238           nos
4:    4 -0.45645233 -0.21585955 -0.37551547 -0.19560440  0.28351648 -0.08580863            v1
5:    5 -0.18596606 -0.23776224  0.39444627 -0.06304738 -0.03508794  0.39860140      b1, stru
6:    6 -0.72346726 -0.04175824  0.28585837  0.24862501 -0.30583077 -0.31718139    stru, var2
7:    7 -0.02649032 -0.08810594 -0.05718863  0.48398529  0.30143033  0.50165047  b1, var2, v1

Answer 2

我使用 split 得到了一个基础 R 的解决方案：先生成子集列表，按照您想要的方式计算相关性，再用 rbind 合并成 data.frame。我想使用 data.table 会有更复杂（更精巧）的方法，但现在这样就可以做到。

根据您提供的数据生成 df1：

df1 <- data.frame(id,bills,nos,stru,var1,var2,v1,v2,a1,b1,type,value)

> head(df1)
  id bills nos stru var1 var2 v1 v2 a1 b1 type value
1  1     4  74    A   36    1 54 75  9 31    2   139
2  2     8  36    D   75   73 10 72 43 55    6   743
3  3    10  12    B   64   60 39 22 62 40    4   574
4  4    11  33    B   11   73 69 33 29 38    1   409
5  5    10  32    B   73   66 37 34 29 58    6   620
6  6    12  39    D   38   39 40 56 68 29    6   539

使用 split 创建子集：

subsets <- split(df1, df1$type)

使用嵌套的 lapply 解决方案循环遍历 str_char 中的变量名称：

corlist <- lapply(subsets, function(x) lapply(str_char, function(y) cor(x[,"value"], as.numeric(x[,y]), method = "spearman")))

使用嵌套的 do.call 创建相关系数矩阵：

cormatrix <- do.call(rbind, lapply(corlist, function(x) do.call(c, x)))

为列分配名称：

colnames(cormatrix) <- str_char

输出：

> cormatrix
        bills         nos        var2          v1          b1
1 -0.58026951  0.16493506 -0.07664827  0.11627152 -0.05595326
2  0.02646100  0.22246750  0.40308468  0.38943918 -0.10121018
3 -0.11389551  0.36446564 -0.16438528  0.00000000 -0.04100238
4 -0.45645233 -0.21585955 -0.19560440  0.28351648 -0.08580863
5 -0.18596606 -0.23776224 -0.06304738 -0.03508794  0.39860140
6 -0.72346726 -0.04175824  0.24862501 -0.30583077 -0.31718139
7 -0.02649032 -0.08810594  0.48398529  0.30143033  0.50165047

要将类型以及相关系数 > 0.2 的最多三个变量的名称（按值排序）添加到 cormatrix，请使用：

maxvector <- apply(cormatrix, 1, function(x) sort(x[which(x > .2)], decreasing = T))
maxvector <- lapply(maxvector, function(x) names(x)[1:3])
maxvector <- lapply(maxvector, function(x) x[!is.na(x)])
maxvector <- lapply(maxvector, function(x) paste(x, collapse = ","))
cormatrix <- cbind(type = 1:7, cormatrix, maxvector)

结果：

> cormatrix
  type bills       nos         stru        var2        v1          b1          maxvector    
1 1    -0.5802695  0.1649351   0.08202645  -0.07664827 0.1162715   -0.05595326 ""           
2 2    0.026461    0.2224675   0.2196833   0.4030847   0.3894392   -0.1012102  "var2,v1,nos"
3 3    -0.1138955  0.3644656   -0.117698   -0.1643853  0           -0.04100238 "nos"        
4 4    -0.4564523  -0.2158596  -0.3755155  -0.1956044  0.2835165   -0.08580863 "v1"         
5 5    -0.1859661  -0.2377622  0.3944463   -0.06304738 -0.03508794 0.3986014   "b1,stru"    
6 6    -0.7234673  -0.04175824 0.2858584   0.248625    -0.3058308  -0.3171814  "stru,var2"  
7 7    -0.02649032 -0.08810594 -0.05718863 0.4839853   0.3014303   0.5016505   "b1,var2,v1"

stru

修改：我还通过转换为as.numeric（感谢@Uwe）重新加入new_col。

Answer 3

这是一个整齐的尝试：

library(tidyverse)
# Pairwise correlations among the selected variables, computed separately for
# each `type`, returned in long format with self-correlations removed.
df1 %>%
  # select needed variables; one can also do: select(str_char, type),
  # however `stru` is not numeric
  select(bills, nos, var2, v1, b1, type) %>%
  group_by(type) %>% # group by type
  # cor() returns a matrix whose row names identify the first variable of each
  # pair. Store them in an explicit `var` column: the filter() and arrange()
  # steps below refer to `var`, and no other step creates that column.
  do(correlation = rownames_to_column(as.data.frame(cor(.[1:5])), "var")) %>%
  unnest(correlation) %>% # convenient output
  gather(key, value, bills:b1) %>% # for easier pairwise removal
  filter(var != key) %>% # remove self correlation
  arrange(type, var, key)
  #output 
   # A tibble: 140 x 4
    type    var   key       value
   <int> <fctr> <chr>       <dbl>
 1     1     b1 bills  0.01978168
 2     1     b1   nos -0.40581082
 3     1     b1    v1 -0.08507922
 4     1     b1  var2  0.15430381
 5     1  bills    b1  0.01978168
 6     1  bills   nos  0.21208062
 7     1  bills    v1 -0.15127493
 8     1  bills  var2 -0.02983736
 9     1    nos    b1 -0.40581082
10     1    nos bills  0.21208062
# ... with 130 more rows

循环中的R相关与一个动态变量和使用ddply基于组变量

3 个答案: