我正在尝试重新格式化数据集my.data
以获取my.data2
语句下面显示的输出。具体来说,我想将my.data
的最后4列放在每record.id
行一行,其中最后四行
如果my.data
,group=1
的列将占据新数据矩阵的第2-5列,如果group=2
,则会占据第6-9列。
我在下面编写了繁琐的代码,但是双for循环导致了一个我根本无法找到的错误。
即使双循环工作,我怀疑有一种更有效的方法来完成
同样的事情 - (也许reshape
?)
感谢您帮助纠正双重for-loop或更高效的代码。
my.data <- "record.id group s1 s2 s3 s4
1 1 2 0 1 3
1 2 0 0 0 12
2 1 0 0 0 0
3 1 10 0 0 0
4 1 1 0 0 0
4 2 0 0 0 0
8 2 0 2 2 0
9 1 0 0 0 0
9 2 0 0 0 0"
my.data2 <- read.table(textConnection(my.data), header=T)
# desired output
#
# 1 2 0 1 3 0 0 0 12
# 2 0 0 0 0 0 0 0 0
# 3 10 0 0 0 0 0 0 0
# 4 1 0 0 0 0 0 0 0
# 8 0 0 0 0 0 2 2 0
# 9 0 0 0 0 0 0 0 0
代码:
dat_sorted <- sort(unique(my.data2[,1]))
my.seq <- match(my.data2[,1],dat_sorted)
my.data3 <- cbind(my.seq, my.data2)
group.min <- tapply(my.data3$group, my.data3$my.seq, min)
group.max <- tapply(my.data3$group, my.data3$my.seq, max)
# my.min <- group.min[my.data3[,1]]
# my.max <- group.max[my.data3[,1]]
my.records <- matrix(0, nrow=length(unique(my.data3$record.id)), ncol=9)
x <- 1
for(i in 1:max(my.data3$my.seq)) {
for(j in group.min[i]:group.max[i]) {
if(my.data3[x,1] == i) my.records[i,1] = i
# the two lines below seem to be causing an error
if((my.data3[x,1] == i) & (my.data3[x,3] == 1)) (my.records[i,2:5] = my.data3[x,4:7])
if((my.data3[x,1] == i) & (my.data3[x,3] == 2)) (my.records[i,6:9] = my.data3[x,4:7])
x <- x + 1
}
}
答案 0 :(得分:2)
你是对的,reshape
在这里有帮助。
library(reshape2)
m <- melt(my.data2, id.var = c("record.id", "group"))
dcast(m, record.id ~ group + variable, fill = 0)
record.id 1_s1 1_s2 1_s3 1_s4 2_s1 2_s2 2_s3 2_s4
1 1 2 0 1 3 0 0 0 12
2 2 0 0 0 0 0 0 0 0
3 3 10 0 0 0 0 0 0 0
4 4 1 0 0 0 0 0 0 0
5 8 0 0 0 0 0 2 2 0
6 9 0 0 0 0 0 0 0 0
比较:
dfTest <- data.frame(record.id = rep(1:10e5, each = 2), group = 1:2,
s1 = sample(1:10, 10e5 * 2, replace = TRUE),
s2 = sample(1:10, 10e5 * 2, replace = TRUE),
s3 = sample(1:10, 10e5 * 2, replace = TRUE),
s4 = sample(1:10, 10e5 * 2, replace = TRUE))
system.time({
...# Your code
})
Error in my.records[i, 1] = i : incorrect number of subscripts on matrix
Timing stopped at: 41.61 0.36 42.56
system.time({m <- melt(dfTest, id.var = c("record.id", "group"))
dcast(m, record.id ~ group + variable, fill = 0)})
user system elapsed
25.04 2.78 28.72
答案 1 :(得分:1)
Julius的答案更好,但为了完整起见,我认为我设法让以下for循环工作:
dat_x <- (unique(my.data2[,1]))
my.seq <- match(my.data2[,1],dat_x)
my.data3 <- as.data.frame(cbind(my.seq, my.data2))
my.records <- matrix(0, nrow=length(unique(my.data3$record.id)), ncol=9)
my.records <- as.data.frame(my.records)
my.records[,1] = unique(my.data3[,2])
for(i in 1:9) {
if(my.data3[i,3] == 1) (my.records[my.data3[i,1],c(2:5)] = my.data3[i,c(4:7)])
if(my.data3[i,3] == 2) (my.records[my.data3[i,1],c(6:9)] = my.data3[i,c(4:7)])
}