Question

在我的数据框中，有些行具有相同的ID，但测试年份和年龄的值不同。我想折叠重复的行并为不同的值创建新列。

我是R的新人，并且已经和它挣扎了一段时间。

这是数据框：

>df
    id     project     testyr1      testyr2    age1     age2
1   16S       AS       2008         NA         29       NA 
2   32S       AS       2004         NA         30       NA 
3   37S       AS       NA           2011       NA       36 
4   50S       AS       2004         NA         23       NA 
5   50S       AS       1998         NA         16       NA 
6   55S       AS       2007         NA         28       NA

testyr1应该是最早的一年，testyr2是最近的一年。 age1应该是较年轻的年龄，age2年龄较大。

输出应为：

      id   project    testyr1   testyr2   age1   age2   
1     16S       AS    2008      NA        29     NA  
2     32S       AS    2004      NA        30     NA  
3     37S       AS    NA        2011      NA     36  
4     50S       AS    1998      2004      16     23  
6     55S       AS    2007      NA        28     NA

我试着写一个循环，但不知道如何结束它：

df.undup <- c()
df.undup <- c()    
for (i in 1:nrow(df)){   
  if i == i+1    
    df$testyr1 != NA {   

    testyr2 = max(testyr1)   
    testyr1 = min(testyr1)   
    nage2 = max(nage1)   
    nage1 = min(nage1)   
  }   
 else{   
    testyr2 = max(testyr2)   
    testyr1 = min(testyr2)   
    nage2 = max(nage2)   
    nage1 = min(nage2)   
  }   
}

非常感谢任何帮助。

Answer 1

library(plyr)

data <- read.csv(textConnection("id,project,testyr1,testyr2,age1,age2
16S,AS,2008,NA,29,NA
32S,AS,2004,NA,30,NA
37S,AS,NA,2011,NA,36
50S,AS,2004,NA,23,NA
50S,AS,1998,NA,16,NA
55S,AS,2007,NA,28,NA"))


new_data <- ddply(data, .(id), function(x) {
  return(data.frame(id = unique(x$id), project = unique(x$project), 
    testyr1 = min(x$testyr1), 
    testyr2 = max(x$testyr2), age1= min(x$age1), age2 = max(x$age2)))
    })

> new_data

    id project testyr1 testyr2 age1 age2
1 16S      AS    2008      NA   29   NA
2 32S      AS    2004      NA   30   NA
3 37S      AS      NA    2011   NA   36
4 50S      AS    2004      NA   23   NA
5 50S      AS    1998      NA   16   NA
6 55S      AS    2007      NA   28   NA

# But your result example suggests you want the lowest 
# of testyr to be in testyr1 and the highest of the combined
# testyrs to be in testyr2. Same logic for ages.
# If so, the one below should work:

new_data <- ddply(data, .(id), function(x) {
    if(dim(x)[1]>1) {
    years <- c(x$testyr1, x$testyr2)
    ages <-  c(x$age1, x$age2)
    return(data.frame(id = unique(x$id), project = unique(x$project), 
        testyr1 = min(years, na.rm=T), testyr2 = max(years , na.rm=T), 
        age1= min(ages, na.rm=T), age2 = max(ages, na.rm=T)))   
    } else {
    return(data.frame(id = unique(x$id), project = unique(x$project), 
        testyr1 = x$testyr1, testyr2 = x$testyr2, 
        age1= x$age1, age2 = x$age2)) 
    }       
    })

> new_data
   id project testyr1 testyr2 age1 age2
1 16S      AS    2008      NA   29   NA
2 32S      AS    2004      NA   30   NA
3 37S      AS      NA    2011   NA   36
4 50S      AS    1998    2004   16   23
5 55S      AS    2007      NA   28   NA

Answer 2

我真的怀疑这是最有效的方法，但我的大脑目前还不起作用。

temp = names(which(table(df$id) > 1))
temp1 = vector("list")
for (i in 1:length(temp)) {
  temp1[[i]] = df[df$id == temp[i], ]
  temp1[[i]] = data.frame(temp1[[i]][1, 1:2], 
                     testyr1 = min(temp1[[i]]$testyr1), 
                     testyr2 = max(temp1[[i]]$testyr1), 
                     age1 = min(temp1[[i]]$age1), 
                     age2 = max(temp1[[i]]$age1))
}

rbind(df[-c(which(df$id %in% temp)), ], do.call(rbind, temp1))
#    id project testyr1 testyr2 age1 age2
# 1 16S      AS    2008      NA   29   NA
# 2 32S      AS    2004      NA   30   NA
# 3 37S      AS      NA    2011   NA   36
# 6 55S      AS    2007      NA   28   NA
# 4 50S      AS    1998    2004   16   23

### rm(i, temp, temp1) ### Cleanup the workspace

使用R在几列中折叠具有不同值的重复行

2 个答案: