根据行名重新格式化数据表以在R中生成新列

时间:2015-03-10 12:13:09

标签: r dataframe data.table

我的数据框看起来像

            m1      m2     m3
 P001.st   60.00   2.0     1
 P003.nd   14.30   2.077   1
 P003.rt   29.60   2.077   1
 P006.st   10.30   2.077   1
 P006.nd   79.30   2.077   1
 P008.nd    9.16   2.077   1

我想重新格式化表格,以便只有行名称的第一部分(在句点之前,即P001,P003等)显示为行名称,并将具有相似名称的每个后续行追加到列。输出应该看起来像

         m1st   m2st  m3st  m1nd   m2nd  m3nd   m1rt   m2rt   m3rt
 P001   60.00   2.0     1   0       0      0     0       0      0
 P003   0       0       0   14.30   2.077  1     29.60   2.077  1
 P006   10.30   2.077   1   79.30   2.077  1     0       0      0
 P008   0       0       0    9.16   2.077  1     0       0      0

聚合函数如

# Attempted approach: group `value` by `name` and keep each group's values unchanged via I().
aggregate(value~name, df, I)

或来自data.table的方法,如

# Attempted approach: collect `value` into one list cell per `name` group.
setDT(df)[, list(value=list(value)), by=name] 

不起作用,因为row.names不完全相同。对于有数百行、并且有许多变量子类型(即句点之后的部分:.nd、.st等)需要匹配的数据,有什么建议吗?

5 个答案:

答案 0 :(得分:3)

# Convert to a data.table, keeping the row names in a column called `rn`.
dt <- as.data.table(your_df, keep.rownames = TRUE)

# split the row names into two id's:
#   id1 = everything before the first period (e.g. "P001"),
#   id2 = everything after the last period (e.g. "st");
# `rn` is dropped once it has been split.
dt[, `:=`(id1 = sub('\\..*', '', rn), id2 = sub('.*\\.', '', rn), rn = NULL)]

# melt and dcast (need latest 1.9.5 or have to load reshape2 and use dcast.data.table)
# Result: one row per id1, one column per variable/id2 pair; combinations that
# do not occur in the data are filled with 0.
dcast(melt(dt, id.vars = c('id1', 'id2')), id1 ~ variable + id2, fill = 0)
#    id1 m1_nd m1_rt m1_st m2_nd m2_rt m2_st m3_nd m3_rt m3_st
#1: P001  0.00   0.0  60.0 0.000 0.000 2.000     0     0     1
#2: P003 14.30  29.6   0.0 2.077 2.077 0.000     1     1     0
#3: P006 79.30   0.0  10.3 2.077 0.000 2.077     1     0     1
#4: P008  9.16   0.0   0.0 2.077 0.000 0.000     1     0     0

答案 1 :(得分:2)

这是另一种方法:

library(dplyr)
library(tidyr)
# Move the row names into a `rowname` column, split it into the sample
# (`rowname`) and the subtype (`id`), then let reshape() give every subtype
# its own set of columns (m1st, m2st, ...).
# The outer parentheses print the result as well as assigning it.
(wide <- df %>%
   add_rownames() %>%
   separate(rowname, c("rowname", "id")) %>%
   reshape(idvar = "rowname", timevar = "id", direction = "wide", sep = ""))
#   rowname m1st  m2st m3st  m1nd  m2nd m3nd m1rt  m2rt m3rt
# 1    P001 60.0 2.000    1    NA    NA   NA   NA    NA   NA
# 2    P003   NA    NA   NA 14.30 2.077    1 29.6 2.077    1
# 4    P006 10.3 2.077    1 79.30 2.077    1   NA    NA   NA
# 6    P008   NA    NA   NA  9.16 2.077    1   NA    NA   NA

# Replace the NA cells left by missing sample/subtype combinations with 0.
wide[is.na(wide)] <- 0
# Work on a plain data.frame: the input may still carry a tibble class, and
# tibbles do not support custom row names.
wide <- as.data.frame(wide)
# `[[` always yields a vector, whereas `wide[, 1]` stays a one-column table
# when `wide` is a tibble.
rownames(wide) <- wide[["rowname"]]
wide$rowname <- NULL
wide
#      m1st  m2st m3st  m1nd  m2nd m3nd m1rt  m2rt m3rt
# P001 60.0 2.000    1  0.00 0.000    0  0.0 0.000    0
# P003  0.0 0.000    0 14.30 2.077    1 29.6 2.077    1
# P006 10.3 2.077    1 79.30 2.077    1  0.0 0.000    0
# P008  0.0 0.000    0  9.16 2.077    1  0.0 0.000    0

答案 2 :(得分:1)

试试这个:

library(tidyr)
library(dplyr)
library(reshape2)
library(stringr)

# Example data from the question: three measurements per row, with row names
# of the form "<sample>.<subtype>".
data <- data.frame(
  m1 = c(60, 14.3, 29.6, 10.3, 79.3, 9.16),
  m2 = c(2, 2.077, 2.077, 2.077, 2.077, 2.077),
  m3 = rep(1L, 6),
  row.names = c("P001.st", "P003.nd", "P003.rt",
                "P006.st", "P006.nd", "P008.nd")
)

# Reshape `data` to one row per sample (var_01) and one column per
# measurement/subtype pair (e.g. m1st):
#   1. copy the row names into a column (col_01) and go to long format,
#   2. split col_01 at the period into sample (var_01) and subtype (var_02),
#   3. build the new column name from measurement + subtype,
#   4. spread back to wide format.
my_data <-
  as_data_frame(cbind(col_01 = rownames(data), data)) %>%
  melt() %>%
  separate(col_01, into = c("var_01", "var_02"), sep = "\\.") %>%
  mutate(my_var = str_c(variable, var_02)) %>%
  select(var_01, my_var, value) %>%
  arrange(var_01, my_var) %>%
  spread(my_var, value)

my_data

  var_01  m1nd m1rt m1st  m2nd  m2rt  m2st m3nd m3rt m3st
1   P001    NA   NA 60.0    NA    NA 2.000   NA   NA    1
2   P003 14.30 29.6   NA 2.077 2.077    NA    1    1   NA
3   P006 79.30   NA 10.3 2.077    NA 2.077    1   NA    1
4   P008  9.16   NA   NA 2.077    NA    NA    1   NA   NA

如果要将NAs替换为0,可以这样做:

# Overwrite every NA cell of my_data with 0.
my_data[is.na(my_data)] <- 0

  var_01  m1nd m1rt m1st  m2nd  m2rt  m2st m3nd m3rt m3st
1   P001  0.00  0.0 60.0 0.000 0.000 2.000    0    0    1
2   P003 14.30 29.6  0.0 2.077 2.077 0.000    1    1    0
3   P006 79.30  0.0 10.3 2.077 0.000 2.077    1    0    1
4   P008  9.16  0.0  0.0 2.077 0.000 0.000    1    0    0

答案 3 :(得分:1)

如果您的数据框名为"data":

library(reshape2)
# Split each row name at its last period: the text before it becomes `prefix`,
# the text after it becomes `suffix`.
data$prefix <- sub("(.*)\\..*", "\\1", row.names(data))
data$suffix <- sub(".*\\.(.*)", "\\1", row.names(data))
# Long format; melt() is left to pick prefix and suffix as the id variables.
data.melt <- melt(data)
data.melt
# One row per prefix, one column per variable/suffix pair.
data.cast <- dcast(data.melt, prefix ~ variable + suffix, fun.aggregate = mean)
# set the row names to prefix
row.names(data.cast) <- data.cast$prefix
# get rid of the prefix column
data.cast <- data.cast[, -1]
data.cast

给出

Using prefix, suffix as id variables
     m1_nd m1_rt m1_st m2_nd m2_rt m2_st m3_nd m3_rt m3_st
P001   NaN   NaN  60.0   NaN   NaN 2.000   NaN   NaN     1
P003 14.30  29.6   NaN 2.077 2.077   NaN     1     1   NaN
P006 79.30   NaN  10.3 2.077   NaN 2.077     1   NaN     1
P008  9.16   NaN   NaN 2.077   NaN   NaN     1   NaN   NaN

要更正列名和零而不是NaN,请执行

# Drop the "_" separating the variable name from the suffix (m1_nd -> m1nd).
names(data.cast) <- gsub("_","",names(data.cast))
# is.na() is TRUE for NaN as well as NA, so a single vectorised assignment
# zeroes every missing cell. Assigning in place means data.cast itself holds
# the zeros and stays a data.frame.
data.cast[is.na(data.cast)] <- 0
data.cast

获得

       m1nd m1rt m1st  m2nd  m2rt  m2st m3nd m3rt m3st
P001  0.00  0.0 60.0 0.000 0.000 2.000    0    0    1
P003 14.30 29.6  0.0 2.077 2.077 0.000    1    1    0
P006 79.30  0.0 10.3 2.077 0.000 2.077    1    0    1
P008  9.16  0.0  0.0 2.077 0.000 0.000    1    0    0

答案 4 :(得分:1)

使用tidyr和dplyr,并用extract()代替separate(),以便使用更灵活的正则表达式:

# NOTE(review): assumes the row names are already stored in a column called `id` - confirm.
df %>%
  # Split e.g. "P001.st" at the period: id2 = "P001", var = "st". The pattern
  # accepts any period-free prefix and any suffix, not only ids of the form
  # P00x with a two-character suffix.
  extract(id, c("id2", "var"), "([^.]+)\\.(.+)") %>%
  # Long format: one row per id2/var/measurement.
  gather(variable, value, c(m1, m2, m3)) %>%
  # New column name = measurement + "." + subtype, e.g. "m1.st".
  mutate(var = paste0(variable, ".", var)) %>%
  select(-variable) %>%
  # Back to wide format; combinations that do not occur are filled with 0.
  spread(var, value, fill = 0)

   id2 m1.nd m1.rt m1.st m2.nd m2.rt m2.st m3.nd m3.rt m3.st
1 P001  0.00   0.0  60.0 0.000 0.000 2.000     0     0     1
2 P003 14.30  29.6   0.0 2.077 2.077 0.000     1     1     0
3 P006 79.30   0.0  10.3 2.077 0.000 2.077     1     0     1
4 P008  9.16   0.0   0.0 2.077 0.000 0.000     1     0     0