将项列表与data.frame的行项匹配

时间:2013-11-01 16:06:39

标签: r

大家好,我遇到了一个难以处理的情况:我有一个如下所示的 data.frame:

General_name
     a 
     b
     c 
     d
     m
     n

和另一个看起来像这样的data.frame:

First_names_list     a=34;b=4   
Second_names_list    d=2;m=98;n=32    
Third_names_list     c=1;d=12;m=0.1

我需要将第一个 data.frame 中的每个元素与第二个 data.frame 第二列([,2])中的每个元素进行匹配,最终得到下表:

Names                 a     b     c      d      m      n
First_names_list     34     4     NA    NA     NA     NA        
Second_names_list    NA     NA    NA    2      98     32     
Third_names_list     NA     NA    1     12     0.1    NA      

有什么建议吗?对我来说似乎太难了。

祝好,

E.

4 个答案:

答案 0 :(得分:2)

选项1

以下是使用“reshape2”中的dcast和“splitstackshape”包中的concat.split的方法:

library(splitstackshape)
library(reshape2)

## Two-stage split so that `dcast` receives a semi-long table.
## Stage 1: cut V2 at every ";" and stack the pieces (long form); the `time`
##   column records each piece's position within its source row.
stacked <- concat.split.multiple(df, "V2", ";", "long")
## Stage 2: cut every piece at "=", keeping the two halves side by side
##   (wide form) and dropping the unsplit V2 column.
out <- concat.split(stacked, "V2", "=", drop = TRUE)
out
#                  V1 time V2_1 V2_2
# 1  First_names_list    1    a 34.0
# 2 Second_names_list    1    d  2.0
# 3  Third_names_list    1    c  1.0
# 4  First_names_list    2    b  4.0
# 5 Second_names_list    2    m 98.0
# 6  Third_names_list    2    d 12.0
# 7  First_names_list    3 <NA>   NA
# 8 Second_names_list    3    n 32.0
# 9  Third_names_list    3    m  0.1

## Rows that are only NA padding (lists shorter than the longest one) are
## removed before casting to one row per V1 and one column per key.
is_filled <- complete.cases(out)
dcast(out[is_filled, ], V1 ~ V2_1, value.var = "V2_2")
#                  V1  a  b  c  d    m  n
# 1  First_names_list 34  4 NA NA   NA NA
# 2 Second_names_list NA NA NA  2 98.0 32
# 3  Third_names_list NA NA  1 12  0.1 NA

选项2

这是使用更新版data.table的另一个选项。这个概念与上面采用的方法非常相似。

library(data.table)
library(reshape2)
packageVersion("data.table")
# [1] ‘1.8.11’

# Break one or more "k=v;k=v" strings into a flat vector of "k=v" pieces.
split_pairs <- function(v) unlist(strsplit(as.character(v), ";"))

dt <- data.table(df)
# One row per key=value piece, still labelled with the list it came from.
S1 <- dt[, list(X = split_pairs(V2)), by = V1]
# Separate every piece into its key and its value, then attach both as the
# new columns A (key) and B (value).
key_value <- do.call(rbind.data.frame, strsplit(S1$X, "="))
S1[, c("A", "B") := key_value]
S1
#                   V1     X A   B
# 1:  First_names_list  a=34 a  34
# 2:  First_names_list   b=4 b   4
# 3: Second_names_list   d=2 d   2
# 4: Second_names_list  m=98 m  98
# 5: Second_names_list  n=32 n  32
# 6:  Third_names_list   c=1 c   1
# 7:  Third_names_list  d=12 d  12
# 8:  Third_names_list m=0.1 m 0.1

# Cast to wide form: one row per list, one column per key.
dcast.data.table(S1, V1 ~ A, value.var = "B")
#                   V1  a  b  c  d   m  n
# 1:  First_names_list 34  4 NA NA  NA NA
# 2: Second_names_list NA NA NA  2  98 32
# 3:  Third_names_list NA NA  1 12 0.1 NA

以上两个选项都假设我们从以下数据开始:

# Sample input: V1 holds the list names, V2 the ";"-separated key=value pairs.
df <- data.frame(
  V1 = c("First_names_list", "Second_names_list", "Third_names_list"),
  V2 = c("a=34;b=4", "d=2;m=98;n=32", "c=1;d=12;m=0.1"),
  stringsAsFactors = FALSE
)

答案 1 :(得分:1)

以下是在 sapply 中嵌套使用 sapply 的解决方案:

# Data frame 1: the names that become the columns of the result
df1 <- read.table(text =
"General_name
a
b
c
d
m
n", header = TRUE, as.is = TRUE)

# Data frame 2: one row per list; col2 holds ";"-separated key=value pairs
df2 <- read.table(text =
"col1     col2
First_names_list     a=34;b=4
Second_names_list    d=2;m=98;n=32
Third_names_list     c=1;d=12;m=0.1", header = TRUE, as.is = TRUE)

# One character vector of "key=value" pairs per row of df2
df2split <- strsplit(df2$col2, split = ";", fixed = TRUE)

# Look up the numeric value of every key in one row's "key=value" pairs.
#
# pairs: character vector such as c("a=34", "b=4").
# keys:  character vector of the names to look up.
# Returns a numeric vector named by `keys`; keys without a pair give NA.
#
# A pair belongs to a key only when it starts with "<key>=". The key is
# compared literally, so a key that is a substring of another key (or that
# contains regex metacharacters) cannot match the wrong pair. If a key occurs
# more than once, the first occurrence is used.
lookup_values <- function(pairs, keys) {
  vapply(keys, function(key) {
    prefix <- paste0(key, "=")
    hit <- pairs[startsWith(pairs, prefix)]
    if (length(hit) == 0L) {
      NA_real_
    } else {
      as.numeric(substring(hit[1L], nchar(prefix) + 1L))
    }
  }, numeric(1))
}

# Result: one row per list in df2, one column per name in df1
result <- t(vapply(
  df2split,
  lookup_values,
  numeric(nrow(df1)),
  keys = df1$General_name
))
# Label the rows with the list names, as in the requested table
rownames(result) <- df2$col1
result

答案 2 :(得分:0)

我觉得这是一种略显迂回的方式,所以我期待更好的解决方案。但它确实有效。

library(data.table)
library(reshape2)

# Creating datasets.
# `text =` lets read.csv open and close its own connection; a bare
# textConnection() passed as the file argument is never closed.
dt <- data.table(read.csv(text = '
"First_names_list","a=34;b=4"
"Second_names_list","d=2;m=98;n=32"
"Third_names_list","c=1;d=12;m=0.1"
', header = FALSE))

General_name <- c("a", "b", "c", "d", "m", "n")
# Master table: one row per name; each loop pass below adds one value column
TotalBreakup <- data.table(
  V1 = General_name
)

# Fixing datatypes (guards against factor columns from read.csv)
TotalBreakup <- TotalBreakup[, lapply(.SD, as.character)]
dt <- dt[, lapply(.SD, as.character)]

# Looping through each row and calculating its breakdown
for (i in seq_len(nrow(dt))) {
  # "a=34;b=4" -> c("a=34", "b=4")
  pairs <- strsplit(dt[i, V2], ";", fixed = TRUE)[[1L]]
  # 2 x k character matrix: row 1 holds the keys, row 2 the values
  fields <- matrix(unlist(strsplit(pairs, "=", fixed = TRUE)), nrow = 2)
  breakup <- data.table(V1 = fields[1L, ], value = fields[2L, ])
  # Name the value column after the list it came from, so that repeated merges
  # never produce clashing V2.x / V2.y column names
  setnames(breakup, "value", dt[i, V1])

  # Appending to the master dataset; names absent from this row become NA
  TotalBreakup <- merge(TotalBreakup, breakup, by = "V1", all.x = TRUE)
}

# Formatting results: names as columns, lists as rows
setnames(TotalBreakup, "V1", "Names")
TotalBreakup <- acast(melt(TotalBreakup, id.vars = "Names"), variable ~ Names)

输出 -

> TotalBreakup
                  a    b   c   d    m     n   
First_names_list  "34" "4" NA  NA   NA    NA  
Second_names_list NA   NA  NA  "2"  "98"  "32"
Third_names_list  NA   NA  "1" "12" "0.1" NA

答案 3 :(得分:0)

方法是:

# The second data frame from the question: V1 = list name, V2 = "k=v;k=v" pairs
DF2 <- read.table(text = '

   First_names_list     a=34;b=4
   Second_names_list    d=2;m=98;n=32
   Third_names_list     c=1;d=12;m=0.1

', header = FALSE, stringsAsFactors = FALSE)

# Empty data frame: one row per list in DF2, one all-NA column per name
col_names <- c("a", "b", "c", "d", "m", "n")
DF <- as.data.frame(
  matrix(NA, nrow = nrow(DF2), ncol = length(col_names),
         dimnames = list(DF2$V1, col_names))
)
DF
#                   a  b  c  d  m  n
#First_names_list  NA NA NA NA NA NA
#Second_names_list NA NA NA NA NA NA
#Third_names_list  NA NA NA NA NA NA

# Fill the data frame: for every "key=value" pair of row i, write the value
# (kept as a character string) into column `key`. A plain loop assigns to DF
# directly, so no `<<-` from inside a closure is needed.
myls <- strsplit(DF2$V2, split = ";", fixed = TRUE)
for (i in seq_along(myls)) {
  for (pair in myls[[i]]) {
    res <- strsplit(pair, "=", fixed = TRUE)[[1L]]
    DF[i, res[1L]] <- res[2L]
  }
}

DF
#                     a    b    c    d    m    n
#First_names_list    34    4 <NA> <NA> <NA> <NA>
#Second_names_list <NA> <NA> <NA>    2   98   32
#Third_names_list  <NA> <NA>    1   12  0.1 <NA>