转换数据框

时间:2015-05-06 22:04:06

标签: r dataframe

我的调查问卷中有一个开放式问题,例如“请说出十种动物的名字”,由此得到了如下数据框(每个字母代表一种动物):

# Simulate the survey: every respondent gives ten answers, each either an
# animal (one letter per animal) or NA for a slot that was left blank.
n_rows <- 1000

# Draw one respondent's answers. The pool is the 26 letters plus 1-10 NAs;
# sampling without replacement means no letter can appear twice within one
# respondent's answers.
draw_answers <- function() {
  blanks <- rep(NA, sample(1:10, 1))
  sample(c(letters, blanks), 10, replace = FALSE)
}

# lapply() builds the list in one step, and the variables no longer mask the
# base functions list() and nrow().
answers <- lapply(seq_len(n_rows), function(i) draw_answers())

# Stack the answers into a 1000 x 10 character matrix and convert it once;
# the columns receive the default names V1..V10.
df <- as.data.frame(do.call(rbind, answers), stringsAsFactors = FALSE)

head(df)
# V1   V2 V3 V4   V5 V6   V7 V8 V9  V10
# 1  r <NA>  a  j    w  e    i  h  u    z
# 2  t    o  e  x    d  v <NA>  z  n    c
# 3  f    y  e  s    n  c    z  i  u    k
# 4  y <NA>  v  j    h  z    p  i  c    q
# 5  w    s  v  f <NA>  c    g  b  x    e
# 6  p <NA>  a  h    v  x    k  z  o <NA>

如何将这个数据框转换为下面这种形式的数据框?请注意,我事先并不知道列名。

 # Example of the desired layout: one row per respondent and one 0/1
 # indicator column per animal (the values here are random placeholders).
 n_resp <- 1000
 n_animals <- length(letters)  # not called `c`, which would mask base::c()
 t1 <- matrix(
   rbinom(n_resp * n_animals, 1, 0.5),
   nrow = n_resp,
   ncol = n_animals,
   dimnames = list(NULL, letters)
 )
 head(t1)
 #      a b c d e f g h i j k l m n o p q r s t u v w x y z
 # [1,] 0 1 0 1 0 0 0 1 0 0 1 1 1 1 0 0 0 1 0 1 0 1 1 0 1 0
 # [2,] 1 1 1 1 0 1 0 1 1 1 1 0 1 0 0 0 1 1 1 0 0 1 0 1 0 1
 # [3,] 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
 # [4,] 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 1 0 1 1 0 0
 # [5,] 1 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 0 1 1 0 0 1 0 0
 # [6,] 1 1 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 1 1 0 0 0 1 1 0 1

3 个答案:

答案 0 :(得分:5)

# Build a 0/1 indicator table: one row per respondent, one column per animal.
# The set of animals is computed once instead of once per row, and NA is
# removed so that unanswered slots do not turn into a column of their own.
animals <- as.character(unique(unlist(df, use.names = FALSE)))
animals <- animals[!is.na(animals)]

# apply() passes each row as a character vector; `%in%` marks which of the
# known animals occur in that row.
td <- data.frame(t(apply(df, 1, function(x) as.numeric(animals %in% x))))
colnames(td) <- animals

`letters` 可以替换为包含动物名称的向量(即 `colnames(t1)`)。

答案 1 :(得分:3)

你可以使用tidyr执行以下操作,这可能比其他方法快得多,尽管我非常喜欢@germcd的方法。您可能需要修改选择,删除NA以及空白区域,这可能是您提供的模拟数据的工件:

/**
 * Entry point: generates one random password and prints it to standard output.
 */
public static void main(String[] args) {

    Random rand = new Random();
    // nextInt(13) yields 0..12, so the length is 8..20. The previous form,
    // nextInt((13) + 8), was nextInt(21) and yielded 0..20, which allowed
    // empty and very short passwords -- presumably not what was intended.
    int randomSize = rand.nextInt(13) + 8;

    String password = generatePassword(rand, "", 0, randomSize);
    System.out.println(password);
}


/** Alphabet the password characters are drawn from: a-z, 0-9, then the symbols !@$%^&amp;. */
public static char[] chars =
    "abcdefghijklmnopqrstuvwxyz0123456789!@$%^&".toCharArray();

/**
 * Recursively builds a random password, appending one character per call.
 *
 * The generated string is returned and the random number generator is passed
 * along, instead of being recreated at each recursive call.
 *
 * @param rand     source of randomness, shared by all recursive calls
 * @param password the characters generated so far
 * @param position number of characters already generated
 * @param size     requested total number of characters
 * @return {@code password} extended with random characters from {@code chars}
 *         until {@code position} reaches {@code size}; returned unchanged when
 *         {@code position >= size}
 */
public static String generatePassword(Random rand, String password, int position, int size)
{
    // Base case first: nothing is drawn once the password is complete, and
    // ">=" also stops the recursion if a caller passes position > size.
    if(position >= size)
        return password;

    // Draw order (boolean, then index) is kept from the original code.
    boolean isLowerCase = rand.nextBoolean();
    char c = chars[rand.nextInt(chars.length)];

    // Digits and symbols have no upper-case form; toUpperCase leaves them as is.
    char next = isLowerCase ? c : Character.toUpperCase(c);

    // The recursive result must be returned, otherwise it is lost.
    return generatePassword(rand, password + next, position + 1, size);
}

此代码将未命名的列收集为长格式,删除所有空列或缺少数据,然后按动物列的唯一值进行传播。这也具有保留动物命名的列顺序的潜在期望特性。如果不合适,那么您可以轻松地将生成的动物列转换为数字:

# library() stops with an error if a package is missing; require() would only
# return FALSE and let the script fail later with a less helpful message.
library(tidyr)
# filter() below is dplyr::filter(); without dplyr attached the name resolves
# to stats::filter() and the pipeline fails.
library(dplyr)

##  Add an ID for each record:
df$id <- seq_len(nrow(df))

##  Long format (one row per id/column pair), drop unanswered or blank slots,
##  then spread to one column per animal. Each cell holds the name of the
##  original column in which that animal was given.
out <- df %>%
  gather(column, animal, -id) %>%
  filter(!is.na(animal), animal != " ") %>%
  spread(animal, column)

head(out)

答案 2 :(得分:2)

您可以尝试 "qdapTools" 包中的 mtabulate 函数:

library(qdapTools)

# Transpose first so that every respondent (a row of df) becomes a column,
# then tabulate each of those columns and show the first rows of the result.
by_respondent <- as.data.frame(t(df))
counts <- mtabulate(by_respondent)
head(counts)
#   c d i l m o r v x y a f s t k p u b h j n q e g w z
# 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 2 0 1 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
# 3 0 0 1 0 0 0 1 0 1 1 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0
# 4 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0
# 5 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 1 1 0 1 1 0 0 0 0
# 6 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 1 0 1 0 1 0 0 0 0

当然还有很多其他选择。

例如,我写的 "splitstackshape" 包中的 cSplit_e 函数(缺点是效率较低,而且需要先把各个值粘贴在一起,然后才能拆分):

library(splitstackshape)
library(dplyr)

以 1 和 0 表示:

# Collapse each respondent's non-missing answers into one comma-separated
# string, let cSplit_e() expand that string into binary indicator columns,
# and show only the generated "combined_*" columns.
with_combined <- mutate(
  df,
  combined = apply(df, 1, function(x) paste(na.omit(x), collapse = ","))
)
expanded <- cSplit_e(
  with_combined, "combined", ",",
  mode = "binary", type = "character", fill = 0
)
head(select(expanded, starts_with("combined_")))
#   combined_a combined_b combined_c combined_d combined_e combined_f combined_g combined_h combined_i
# 1          0          0          1          1          0          0          0          0          1
# 2          1          0          0          1          0          1          0          0          0
# 3          1          0          0          0          0          0          0          0          1
# 4          0          1          1          0          0          0          0          1          1
# 5          0          1          0          1          0          0          0          1          0
# 6          0          1          0          0          0          0          0          0          0
#   combined_j combined_k combined_l combined_m combined_n combined_o combined_p combined_q combined_r
# 1          0          0          1          1          0          1          0          0          1
# 2          0          0          0          1          0          0          0          0          0
# 3          0          1          0          0          0          0          1          0          1
# 4          1          0          1          0          1          0          0          0          0
# 5          0          1          0          0          1          0          1          1          1
# 6          1          1          0          1          0          0          0          1          0
#   combined_s combined_t combined_u combined_v combined_w combined_x combined_y combined_z
# 1          0          0          0          1          0          1          1          0
# 2          1          1          0          0          0          0          0          0
# 3          0          1          1          0          0          1          1          0
# 4          0          0          1          0          0          0          1          0
# 5          1          0          0          0          0          0          0          0
# 6          1          1          1          0          0          0          0          0

作为原始值:

# Same steps as the binary variant, but cSplit_e() is asked for the original
# values (mode = "value"), with empty strings where an animal is absent.
with_combined <- mutate(
  df,
  combined = apply(df, 1, function(x) paste(na.omit(x), collapse = ","))
)
expanded <- cSplit_e(
  with_combined, "combined", ",",
  mode = "value", type = "character", fill = ""
)
head(select(expanded, starts_with("combined_")))
#   combined_a combined_b combined_c combined_d combined_e combined_f combined_g combined_h combined_i
# 1                                c          d                                                      i
# 2          a                                d                     f                                 
# 3          a                                                                                       i
# 4                     b          c                                                      h          i
# 5                     b                     d                                           h           
# 6                     b                                                                             
#   combined_j combined_k combined_l combined_m combined_n combined_o combined_p combined_q combined_r
# 1                                l          m                     o                                r
# 2                                           m                                                       
# 3                     k                                                      p                     r
# 4          j                     l                     n                                            
# 5                     k                                n                     p          q          r
# 6          j          k                     m                                           q           
#   combined_s combined_t combined_u combined_v combined_w combined_x combined_y combined_z
# 1                                           v                     x          y           
# 2          s          t                                                                  
# 3                     t          u                                x          y           
# 4                                u                                           y           
# 5          s                                                                             
# 6          s          t          u   

或者,您可以使用 "reshape2":

library(reshape2)

## Long format of the answers with the NA entries dropped; computed once and
## shared by both casts below
long <- melt(as.matrix(df), na.rm = TRUE)

## The values
dcast(long, Var1 ~ value, value.var = "value")

## ones and zeroes
dcast(long, Var1 ~ value, value.var = "value", fun.aggregate = length)