我没有找到解决方案,所以想知道是否有人可以提供帮助。对大多数人来说,这应该只是几秒钟就能解决的事情。
示例数据:
# Example data: 10 rows spread over 4 ids (4, 1, 2 and 3 rows respectively),
# a constant "Person" label and seven yearly value columns of random draws.
# The random draws are made in the order in which the columns are listed.
df <- data.frame(
  id = rep(c(1, 2, 3, 4), times = c(4, 1, 2, 3)),
  Person = rep("Name", 10),
  X2012 = rnorm(10, 2, 2),
  X2011 = rbinom(10, 2, 0.7),
  X2010 = rexp(10, 2),
  X2009 = rnorm(10, 0, 1),
  X2013 = rgamma(10, 2, 7),
  X2008 = rnorm(10, 3, 2),
  X2007 = rexp(10, 5)
)
# Blank out, per block of rows, the years that count as unavailable.
df[1:4, c("X2007", "X2008", "X2011")] <- NA
df[5, c("X2007", "X2008", "X2009", "X2011", "X2012", "X2013")] <- NA
df[6:7, c("X2009", "X2010", "X2011", "X2012", "X2013")] <- NA
df[8:10, c("X2007", "X2008", "X2009", "X2013")] <- NA
预览上述数据:
> df
id Person X2012 X2011 X2010 X2009 X2013 X2008 X2007
1 1 Name 1.0235950 NA 0.97326338 -0.3755807 0.7672341 NA NA
2 1 Name 0.1920131 NA 0.08904475 -0.5014424 0.2120472 NA NA
3 1 Name 1.2191632 NA 0.04186686 0.4966126 0.3395579 NA NA
4 1 Name 3.6281268 NA 1.63105614 1.5208809 0.2904623 NA NA
5 2 Name NA NA 0.34680106 NA NA NA NA
....
预览我想要的内容:
> df.ok
Year id Person X2012 X2011 X2010 X2009 X2013 X2008 X2007
1 X2009 1 Name 1.0235950 NA 0.97326338 -0.3755807 0.7672341 NA NA
2 X2010 1 Name 0.1920131 NA 0.08904475 -0.5014424 0.2120472 NA NA
3 X2012 1 Name 1.2191632 NA 0.04186686 0.4966126 0.3395579 NA NA
4 X2013 1 Name 3.6281268 NA 1.63105614 1.5208809 0.2904623 NA NA
5 X2010 2 Name NA NA 0.34680106 NA NA NA NA
...
基本上,我想根据列名创建一个新的列向量(Year):只有当某个年份列在对应的行中有数值时,该年份才可以被使用;值为 NA 则表示该年份不能用于给定的行。
编辑:数据集更新:
# Insert one extra row for id 1 directly after row 4. The row has no yearly
# values at all (every year column is NA).
#
# The new row is built as a typed one-row data frame instead of a character
# vector. rbind()-ing a character vector turns every column into character:
# the numeric values then go through a lossy string round-trip and `id`
# stays character afterwards.
year_cols <- 3:ncol(df)
my.vec <- df[1, ]
my.vec$id <- 1
my.vec$Person <- "Activa"
# Element-wise assignment keeps each column's type while clearing its value.
my.vec[1, year_cols] <- NA
df <- rbind(df[1:4, ], my.vec, df[5:nrow(df), ])
# Keep every yearly column as double, as the as.numeric() conversion did.
df[year_cols] <- lapply(df[year_cols], as.numeric)
# Every row gets the same label; this also overwrites "Activa".
df[, 2] <- rep("Name", nrow(df))
# Renumber the rows so the inserted row does not carry a de-duplicated name.
rownames(df) <- NULL
所以现在每个 id 之前还多出 1 行,该行的所有年份列都没有数值。这种情况下该如何处理?
答案 0 :(得分:1)
这是一种可行的方法:
## Locate every non-`NA` cell in the value columns (all columns except the
## first two, "id" and "Person"). `arr.ind = TRUE` yields a row/col matrix.
matches <- which(!is.na(df[-c(1, 2)]), arr.ind = TRUE)
## The column indices refer to the subset; shift them so they index `df`.
matches[, "col"] <- matches[, "col"] + 2
## One record per non-`NA` cell: its position, its year (the column name)
## and the id of the row it belongs to.
matches <- data.frame(matches,
                      Year = names(df)[matches[, "col"]],
                      ID = df$id[matches[, "row"]])
## Sorted distinct years per id, concatenated in increasing id order.
## `split()` always returns a list. `aggregate()` would simplify the result
## to a matrix whenever every id has the same number of years, and
## unlisting that matrix returns the years in column-major order rather
## than grouped by id.
years_by_id <- lapply(split(as.character(matches$Year), matches$ID),
                      function(x) sort(unique(x)))
Year <- unlist(years_by_id, use.names = FALSE)
## The labels are matched to the rows purely by position, so `df` must be
## ordered by id and each id needs exactly as many rows as usable years.
## Fail loudly instead of letting `cbind()` recycle a shorter vector.
if (length(Year) != nrow(df)) {
  stop("Number of year labels (", length(Year),
       ") does not match the number of rows (", nrow(df), ")",
       call. = FALSE)
}
## Bind that back with the original data
cbind(Year, df)
# Year id Person X2012 X2011 X2010 X2009 X2013 X2008 X2007
# 1 X2009 1 Name 2.724841 NA 0.57090153 0.2346739 0.1861127 NA NA
# 2 X2010 1 Name 1.604744 NA 0.54476273 1.4647495 0.1477001 NA NA
# 3 X2012 1 Name 0.887420 NA 0.03375393 -0.7514787 0.1252799 NA NA
# 4 X2013 1 Name 1.827136 NA 2.48897284 1.3697001 0.1026063 NA NA
# 5 X2010 2 Name NA NA 0.70943724 NA NA NA NA
# 6 X2007 3 Name NA NA NA NA NA -4.196199 0.00789531
# 7 X2008 3 Name NA NA NA NA NA 5.853701 0.16814434
# 8 X2010 4 Name 2.887311 1 0.64874998 NA NA NA NA
# 9 X2011 4 Name -0.610759 2 0.74106815 NA NA NA NA
# 10 X2012 4 Name 1.827282 2 0.25219805 NA NA NA NA
答案 1 :(得分:1)
# Year label for every row; rows without any value keep the empty string.
Year <- rep("", nrow(df))
# Names of the yearly value columns (everything after id and Person).
cols <- colnames(df)[-c(1, 2)]
# Rows holding at least one non-NA yearly value. This does not depend on the
# id, so it is computed once. `drop = FALSE` keeps a data frame even when
# there is only a single year column.
has_value <- rowSums(!is.na(df[, cols, drop = FALSE])) > 0
for (id in unique(df$id)) {
  # rows should match id AND should have at least a numeric value:
  rows <- df$id == id & has_value
  # A year is usable for this id when none of the selected rows is NA in
  # that column: the column sum becomes NA as soon as one entry is NA.
  is_year_ok <- !is.na(colSums(df[rows, cols, drop = FALSE]))
  # The sorted usable years are assigned to the selected rows by position.
  Year[rows] <- sort(cols[is_year_ok])
}
cbind(Year, df) # This is equal to df.ok
# Test:
all(Year == df.ok$Year)
# TRUE