用于将文件拆分为表的函数

时间:2013-10-28 22:11:44

标签: r

我的数据框(data frame)看起来像这样:

Col1       Col2
Name1      Attr1=10;Attr2=24; Attr3=5;Attr4=9;    
Name2      Attr1=1;Attr3=2.4; Attr4=16;Attr5=90;          
Name3      Attr1=2;Attr2=45; Attr4=122;Attr6=120; 

我想要以下输出:

Col1      Attr1    Attr2   Attr3   Attr4   Attr5   Attr6   
Name1       10       24       5      9       NA      NA     
Name2       1        NA      2.4     16      90      NA     
Name3       2        45      NA      122     NA      120    

有人可以帮我吗?

祝好

2 个答案:

答案 0 :(得分:4)

使用 @agstudy 的答案中的 `dat`(见本页另一个答案里的 `read.table` 调用),您可以尝试使用我的“splitstackshape”软件包以及“reshape2”软件包,如下所示:

library(splitstackshape)
library(reshape2)

## Reshape the ";"-delimited "AttrK=value" strings held in `dat` into a table
## with one row per ID and one column per attribute.
## `dat` is not created in this snippet: it must already exist, with columns
## Col1 and Col2 and with the record names stored as row names.
## NOTE(review): newer splitstackshape releases reportedly deprecate
## concat.split.multiple() in favour of cSplit() -- confirm before reuse.

## Copy the row names into an explicit "ID" column so they can be used as an
## identifier in the reshaping steps below
dat$ID <- rownames(dat)

## Split Col1 and Col2 on ";" and return the pieces in a "long" form
## (presumably one row per ID and piece position -- TODO confirm)
S1 <- concat.split.multiple(dat, c("Col1", "Col2"), ";", "long")

## Drop rows containing any NA, then melt so that every remaining
## "AttrK=value" string ends up in a single `value` column.
## `time` is presumably the position index added by the long split above
## -- TODO confirm against the splitstackshape documentation
S2 <- melt(S1[complete.cases(S1), ], id.vars=c("ID", "time"))

## Split the `value` column again, this time on the "="
## (the dcast() call below relies on the resulting columns being named
## value_1 and value_2)
S3 <- concat.split.multiple(S2, "value", "=")

## Use `dcast` to get the data into the right shape: one row per ID, one
## column per distinct value_1 entry, cells filled from value_2
dcast(S3, ID ~ value_1, value.var="value_2")
#      ID Attr1 Attr2 Attr3 Attr4 Attr5 Attr6
# 1 Name1    10    24   5.0     9    NA    NA
# 2 Name2     1    NA   2.4    16    90    NA
# 3 Name3     2    45    NA   122    NA   120

答案 1 :(得分:1)

下面的方法可行,但代码太长了。我很确定还有更简单的方法。

# Build the example data. The header line has one field fewer than the data
# lines, so read.table() takes the first field (Name1, Name2, ...) as row
# names; the two remaining whitespace-separated fields become Col1 and Col2.
dat <- read.table(text = 'Col1       Col2
Name1      Attr1=10;Attr2=24; Attr3=5;Attr4=9;    
Name2      Attr1=1;Attr3=2.4; Attr4=16;Attr5=90;          
Name3      Attr1=2;Attr2=45; Attr4=122;Attr6=120;',
                  header = TRUE, stringsAsFactors = FALSE)

# Split every column on ";" and bind the pieces side by side, giving a
# character matrix with one row per record and one "AttrK=value" token per
# cell. This relies on every row producing the same number of tokens within
# a column, which holds for the example data.
res <- do.call(cbind, lapply(dat, function(x) {
  do.call(rbind, strsplit(x, split = ";"))
}))

# indx: the attribute number K taken from "AttrK=...".
# vals: the text after "=". Everything up to the "=" is removed instead of
# matching leading digits only, so values such as "-5" or ".5" also reach
# as.numeric() intact rather than silently turning into NA.
indx <- gsub('Attr([0-9]+).*', '\\1', res)
vals <- sub('^.*=', '', res)

# One output column per attribute number from 1 up to the largest one seen;
# attributes that a record does not mention stay NA.
mm <- max(as.integer(indx))
M <- matrix(NA_real_, nrow = nrow(res), ncol = mm)

# Fill cell (record, K) for every token at once using a two-column
# (row, column) index matrix instead of looping over the rows.
M[cbind(as.vector(row(res)), as.integer(indx))] <- as.numeric(vals)

rownames(M) <- rownames(dat)
colnames(M) <- paste0('Attr', seq_len(mm))


     Attr1 Attr2 Attr3 Attr4 Attr5 Attr6
Name1    10    24   5.0     9    NA    NA
Name2     1    NA   2.4    16    90    NA
Name3     2    45    NA   122    NA   120