R中的“反转”数据组织(不是转置)

时间:2012-06-16 21:38:25

标签: r

这是我的数据框的示例列,RR是标题:

RR
Cvv  
Cvv  
Caa 

我需要的是“反转”数据,使子串vv和aa成为数据框的列标题,而RR成为单元格中的值。得到的矩阵将是:

vv  | aa  
CRR |  
CRR |  
    | CRR  

因此我们在两个矩阵中都得到了相同的关系。在第一行和第二行,vv与RR耦合。在第三行,aa与RR结合。

这可以通过R实现吗?有什么想法吗?

先行致谢!

我在上面的示例中过度简化了我的数据。所以这是我的实际数据集的样本:

> dput(head(A1F[4:15],n=20))
structure(list(RR = structure(c(15L, 15L, 15L, 27L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", 
" ", "Caa", "Caj", "Cbb", "Cbb ", "Cbv", "Cja", "Cjr", "Crj", 
"Crr", "Crv", "Cvb", "Cvr", "Cvv", "Gaa", "Gaj", "Gbb", "Gbv", 
"Gja", "Gjr", "Grj", "Grr", "Grv", "Gvb", "Gvr", "Gvv"), class = "factor"), 
    AA = structure(c(13L, 13L, 13L, 1L, 1L, 1L, 1L, 15L, 27L, 
    27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 1L), .Label = c("", 
    "Caa", "Caj", "Car", "Cbb", "Cbv", "Cja", "Cjr", "Cjr ", 
    "Crj", "Crr", "Crv", "Cvb", "Cvr", "Cvv", "Gaa", "Gaj", "Gbb", 
    "Gbv", "Gja", "Gjr", "Grj", "Grr", "Grv", "Gvb", "Gvr", "Gvv"
    ), class = "factor"), BB = structure(c(9L, 9L, 9L, 9L, 9L, 
    9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L
    ), .Label = c("", "?", "Caa", "Caj", "Cbv", "Cja", "Cjr", 
    "Crj", "Crr", "Crv", "Cvb", "Cvr", "Cvv", "Gaa", "Gaj", "Gbv", 
    "Gja", "Gjr", "Grj", "Grr", "Grv", "Gvb", "Gvr", "Gvv"), class = "factor"), 
    VV = structure(c(8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
    8L, 1L, 1L, 16L, 16L, 16L, 16L, 16L, 16L, 16L), .Label = c("", 
    " ", "Caa", "Caj", "Caj+", "Cbb", "Cbv", "Cja", "Cjr", "Crv", 
    "Cvb", "Cvr", "Cvv", "Gaa", "Gbb", "Gja", "Gjr", "Grv", "Gvb", 
    "Gvr"), class = "factor"), RJ = structure(c(8L, 3L, 3L, 1L, 
    1L, 12L, 12L, 12L, 12L, 12L, 1L, 12L, 12L, 12L, 12L, 12L, 
    12L, 12L, 12L, 12L), .Label = c("", "Caa", "Caj", "Cbv", 
    "Ccrj", "Cja", "Cjr", "Crj", "Crj ", "Crr", "Crv", "Cvr", 
    "Cvv", "Gaa", "Gaj", "Gbv", "Gja", "Gjr", "Grj", "Grr", "Grv", 
    "Gvr", "Gvv"), class = "factor"), JR = structure(c(7L, 7L, 
    18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 
    18L, 18L, 18L, 18L, 18L, 18L), .Label = c("", "Caa", "Caj", 
    "Cbv", "Cja", "Cjr", "Crj", "Crr", "Crv", "Cvb", "Cvr", "Cvv", 
    "Gaa", "Gaj", "Gbv", "Gja", "Gjr", "Grj", "Grr", "Grv", "Grv ", 
    "Gvb", "Gvb ", "Gvr", "Gvv"), class = "factor"), BV = structure(c(4L, 
    4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
    4L, 4L, 4L, 4L), .Label = c("", "Caa", "Caj", "Cbb", "Cbv", 
    "Cja", "Cjr", "Crj", "Crr", "Crv", "Cvb", "Cvr", "Cvv", "Gaa", 
    "Gaj", "Gbb", "Gbv", "Gja", "Gjr", "Grj", "Grv", "Gvb", "Gvr", 
    "Gvv", "R"), class = "factor"), VB = structure(c(1L, 1L, 
7L, 7L, 18L, 18L, 1L, 1L, 10L, 10L, 21L, 21L, 21L, 1L, 21L, 
21L, 21L, 21L, 21L, 1L), .Label = c("", "Caa", "Caj", "Cbb", 
"Cbv", "Cja", "Cjr", "Crj", "Crr", "Crv", "Cvb", "Cvv", "Gaa", 
"Gaj", "Gbb", "Gbv", "Gja", "Gjr", "Grj", "Grr", "Grv", "Gvb", 
"Gvr", "Gvv"), class = "factor"), AJ = structure(c(2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 10L, 
1L, 10L, 10L), .Label = c("", "Caa", "Caj", "Cbb", "Cbv", 
"Cja", "Cjr", "Crj", "Crv", "Cvb", "Cvr", "Cvv", "Gaa", "Gaj", 
"Gbb", "Gbv", "Gja", "Gjr", "Grj", "Grj ", "Grr", "Grv", 
"Gvb", "Gvr", "Gvv"), class = "factor"), JA = structure(c(10L, 
10L, 10L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 6L, 6L, 6L, 6L), .Label = c("", "Caa", "Caj", "Cbv", 
"Cja", "Cjr", "Crr", "Crv", "Cvb", "Cvr", "Cvv", "Gaa", "Gaj", 
"Gbv", "Gja", "Gjr", "Grr", "Grv", "Gvb", "Gvv"), class = "factor"), 
VR = structure(c(1L, 5L, 5L, 5L, 16L, 16L, 16L, 16L, 16L, 
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L), .Label = c("", 
"Caa", "Caj", "Caj ", "Cbv", "Cja", "Cjr", "Crj", "Crr", 
"Crv", "Cvb", "Cvr", "Cvv", "Gaa", "Gaj", "Gbv", "Gja", "Gjr", 
"Grj", "Grr", "Grv", "Gvb", "Gvr", "Gvv"), class = "factor"), 
RV = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 1L, 1L), .Label = c("", 
"Caa", "Caj", "Cbb", "Cbv", "Cja", "Cjr", "Crj", "Crr", "Crv", 
"Cvr", "Cvv", "Cvv ", "Gaa", "Gaj", "Gbb", "Gbv", "Gja", 
"Gjr", "Grj", "Grr", "Grv", "Gvr", "Gvv"), class = "factor")), .Names = c("RR", 
"AA", "BB", "VV", "RJ", "JR", "BV", "VB", "AJ", "JA", "VR", "RV"
), row.names = c(NA, 20L), class = "data.frame")

如上所述,期望的矩阵将保持原有的对应关系和行顺序。GSee提供了一个我可以应用的答案,但它只能用于我的矩阵中的一列,因为`[[`只能选取单个元素,而用`[`选取多个元素又不起作用。我不确定我是否正朝着正确的方向前进......

根据实际数据集(如上所示),这是所需输出(前三行)的样子:

structure(list(vv = structure(c(1L, 1L, 1L), .Label = "CRR", class = "factor"), 
    rv = c(NA, NA, NA), ja = structure(c(1L, 1L, 1L), .Label = "CVV", class = "factor"), 
    aa = structure(c(1L, 1L, 1L), .Label = "CAJ", class = "factor"), 
    bv = structure(c(1L, 2L, 2L), .Label = c("", "CVR"), class = "factor"), 
    aj = structure(c(1L, 2L, 2L), .Label = c("", "CRJ"), class = "factor"), 
    vb = structure(c(1L, 1L, 1L), .Label = "CAA", class = "factor"), 
    rj = structure(c(2L, 1L, 1L), .Label = c("", "CRJ"), class = "factor"), 
    rr = structure(c(1L, 1L, 1L), .Label = "CBB", class = "factor"), 
    vr = structure(c(1L, 1L, 1L), .Label = "CJA", class = "factor"), 
    bb = structure(c(1L, 1L, 1L), .Label = "CBV", class = "factor"), 
    jr = c(NA, NA, NA)), .Names = c("vv", "rv", "ja", "aa", "bv", 
"aj", "vb", "rj", "rr", "vr", "bb", "jr"), class = "data.frame", row.names = c(NA, 
-3L))

我希望这更有意义。

2 个答案:

答案 0 :(得分:5)

这有点硬编码,但想法就在那里。

library(stringr)
library(plyr)

# Toy input: a single column (RR) of three-character codes. The column is made
# a factor explicitly because the steps below rely on levels(); since R 4.0
# data.frame() no longer converts strings to factors by default.
vect <- data.frame(RR = c("Cvv", "Cvv", "Caa"), stringsAsFactors = TRUE)
codes <- levels(vect$RR)

# One output column per distinct code: a cell keeps the code where that row of
# RR matches it and is NA elsewhere. adply() yields one row per code plus a
# leading index column, hence the transpose and the dropped first row;
# drop = FALSE keeps a matrix even when there is only one distinct code.
theMat <- t(adply(codes, .margins = 1, .fun = function(x) {
  str_extract(string = as.character(vect$RR), pattern = x)
}))[-1, , drop = FALSE]

# Column headers are characters 2-3 of each code ("Cvv" -> "vv").
colnames(theMat) <- str_sub(codes, start = 2, end = 3)

# Swap the two-letter suffix inside every cell for the source column name
# ("Cvv" -> "CRR"). Assigning through theMat[] keeps the matrix shape and the
# column names, which reassigning theMat to str_replace()'s plain vector
# result would discard.
theMat[] <- str_replace(
  string = theMat,
  pattern = paste(colnames(theMat), collapse = "|"),
  replacement = "RR"
)

答案 1 :(得分:3)

好的。我不知道密码4你。

# `dat` is the data.frame rebuilt from the `dput` output shown in the question.
# Work on a matrix copy of it so cells can be addressed as [row, column].
m <- as.matrix(dat)
# The AJ entry in row 10 is treated as a typo in the sample data: blank it out.
m[10L, "AJ"] <- ""

确定输出矩阵的列名,并创建该矩阵(暂时用NA填充)

# Output column names: characters 2-3 of every distinct non-empty code in `m`,
# in order of first appearance (scanning `m` column by column). Missing cells
# are skipped explicitly; wrapping the values in paste() would turn them into
# the literal string "NA" and create a bogus "A" column.
codes <- m[!is.na(m) & nzchar(m)]
ocn <- unique(substr(codes, 2, 3)) # out column names

# One row per input row, one column per suffix, every cell NA until filled.
# NA_character_ makes the matrix character from the start instead of relying
# on coercion at the first assignment.
out <- matrix(NA_character_, nrow = nrow(m), ncol = length(ocn))
colnames(out) <- ocn

循环遍历每列的每一行

# Scatter every non-empty code of `m` into `out` in one vectorised assignment.
# A code such as "Cvv" found in column "RR" becomes the value "CRR" in output
# column "vv" of the same row: the first character is kept, the old suffix
# selects the destination column, and the source column name is the new suffix.
hit <- which(nzchar(m)) # linear (column-major) positions of the filled cells
if (length(hit) > 0) {
  code <- m[hit]
  src_row <- row(m)[hit]
  src_col <- col(m)[hit]
  dest_col <- match(substr(code, 2, 3), colnames(out))
  # Cells are written in column-major order of `m`, so when two source columns
  # target the same output cell the right-most source column wins.
  out[cbind(src_row, dest_col)] <- paste0(
    substr(code, 1, 1),
    colnames(m)[src_col]
  )
}
out
#      vv    vb    rr    ja    rj    aj    vr    bb    jr    rv    aa    bv   
# [1,] "CRR" "CAA" "CBB" "CVV" "CJR" NA    "CJA" "CBV" NA    NA    "CAJ" NA   
# [2,] "CRR" "CAA" "CBB" "CVV" "CJR" "CRJ" "CJA" "CBV" NA    NA    "CAJ" "CVR"
# [3,] "CRR" "CAA" "CBB" "CVV" "GJR" "CRJ" "CJA" "CBV" "CVB" NA    "CAJ" "CVR"
# [4,] "GRR" NA    "CBB" "CVV" "GJR" NA    NA    "CBV" "CVB" NA    "CAJ" "CVR"
# [5,] NA    NA    "CBB" "CVV" "GJR" NA    NA    "CBV" "GVB" NA    "CAJ" "GVR"
# [6,] NA    NA    "CBB" "CVV" "GJR" NA    "CRJ" "CBV" "GVB" NA    "CAJ" "GVR"
# [7,] NA    NA    "CBB" "CVV" "GJR" NA    "CRJ" "CBV" NA    NA    "CAJ" "GVR"
# [8,] "CAA" NA    "CBB" "CVV" "GJR" NA    "CRJ" "CBV" NA    NA    "CAJ" "GVR"
# [9,] "GAA" NA    "CBB" "CVV" "GJR" "CRV" "CRJ" "CBV" NA    "CVB" NA    "GVR"
# [10,] "GAA" NA    "CBB" "CVV" "GJR" "CRV" "CRJ" "CBV" NA    "CVB" NA    "GVR"
# [11,] "GAA" NA    "CBB" "CVV" "GJR" "GRV" NA    "CBV" NA    "GVB" NA    "GVR"
# [12,] "GAA" NA    "CBB" NA    "GJR" "GRV" "CRJ" "CBV" NA    "GVB" NA    "GVR"
# [13,] "GAA" NA    "CBB" NA    "GJR" "GRV" "CRJ" "CBV" NA    "GVB" NA    "GVR"
# [14,] "GAA" NA    "CBB" "GVV" "GJR" "GRV" "CRJ" "CBV" NA    NA    NA    "GVR"
# [15,] "GAA" NA    "CBB" "GVV" "GJR" "GRV" "CRJ" "CBV" NA    "GVB" NA    "GVR"
# [16,] "GAA" NA    "CBB" "GVV" "GJR" "GRV" "CRJ" "CBV" NA    "GVB" NA    "GVR"
# [17,] "GAA" "CAJ" "CBB" "GVV" "GJR" "GRV" "CRJ" "CBV" "CJA" "GVB" NA    "GVR"
# [18,] "GAA" NA    "CBB" "GVV" "GJR" "GRV" "CRJ" "CBV" "CJA" "GVB" NA    "GVR"
# [19,] "GAA" "CAJ" "CBB" "GVV" "GJR" NA    "CRJ" "CBV" "CJA" "GVB" NA    "GVR"
# [20,] NA    "CAJ" "CBB" "GVV" "GJR" NA    "CRJ" "CBV" "CJA" NA    NA    "GVR"