我有一个简化的数据集,如下所示:
# Example data: one row per (YEAR, FROM, TO) record, seven rows for each year.
YEAR <- rep(c(2009, 2010), each = 7)
FROM <- c(
  "A", "C", "B", "D", "B", "A", "C", # 2009
  "A", "C", "B", "A", "D", "B", "A"  # 2010
)
TO <- c(
  "B", "D", "C", "A", "D", "C", "B", # 2009
  "B", "A", "D", "D", "C", "A", "D"  # 2010
)
DATA <- data.frame(YEAR = YEAR, FROM = FROM, TO = TO)
YEAR FROM TO
2009 A B
2009 C D
2009 B C
2009 D A
2009 B D
2009 A C
2009 C B
2010 A B
2010 C A
2010 B D
2010 A D
2010 D C
2010 B A
2010 A D
我想要两个额外的列,比如 OCC_FROM 和 OCC_TO,分别表示该行的 FROM 值和 TO 值在同一 YEAR 内、之前各行的 FROM 或 TO 列中累计出现的次数。像这样:
YEAR FROM TO OCC_FROM OCC_TO
2009 A B 0 0
2009 C D 0 0
2009 B C 1 1
2009 C A 2 1
2009 B D 2 1
2009 A C 2 3
2009 C B 4 3
2010 A B 0 0
2010 C A 0 1
2010 B D 1 0
2010 A B 2 2
2010 D C 1 1
2010 B A 3 3
2010 A D 4 2
借助“Cumulative count in R”这个问题的回答,我写出了下面的代码,但它显然不是我想要的,因为它没有考虑 YEAR:
# For every row i, count how often the row's FROM (resp. TO) value appears in
# either the FROM or the TO column within rows 1..i; the trailing "- 1"
# removes the row's own cell from the count.
# NOTE: the counts run over the whole table -- YEAR is not taken into account.
DATA$OCC_FROM <- vapply(
  seq_len(nrow(DATA)),
  function(i) {
    seen <- seq_len(i)
    sum(DATA$FROM[i] == DATA$FROM[seen]) + sum(DATA$FROM[i] == DATA$TO[seen])
  },
  integer(1)
) - 1
DATA$OCC_TO <- vapply(
  seq_len(nrow(DATA)),
  function(i) {
    seen <- seq_len(i)
    sum(DATA$TO[i] == DATA$FROM[seen]) + sum(DATA$TO[i] == DATA$TO[seen])
  },
  integer(1)
) - 1
YEAR FROM TO OCC_FROM OCC_TO
2009 A B 0 0
2009 C D 0 0
2009 B C 1 1
2009 C A 2 1
2009 B D 2 1
2009 A C 2 3
2009 C B 4 3
2010 A B 3 4
2010 C A 5 4
2010 B D 5 2
2010 A B 5 6
2010 D C 3 6
2010 B A 7 6
2010 A D 7 4
编辑:我还希望能像之前那样,按 YEAR 和 FROM 一起对两列做累计求和。为简单起见,我将使用 OCC_FROM 和 OCC_TO 这两列。像这样:
YEAR FROM TO OCC_FROM OCC_TO TOTAL_FROM TOTAL_TO
2009 A B 0 0 0 0
2009 C D 0 0 0 0
2009 B C 1 1 0 0
2009 C A 2 1 1 0
2009 B D 2 1 1 0
2009 A C 2 3 1 3
2009 C B 4 3 6 3
2010 A B 0 0 0 0
2010 C A 0 1 0 0
2010 B D 1 0 0 0
2010 A B 2 2 1 1
2010 D C 1 1 0 0
2010 B A 3 3 3 3
2010 A D 4 2 6 1
答案 0 :(得分:1)
你可以尝试
# Count, for each element of `x`, how many earlier elements are equal to it.
#
# x: an atomic vector (e.g. character or numeric).
# Returns a numeric vector of the same length as `x`: the first occurrence of
# a value gets 0, the second gets 1, and so on.
prevCount <- function(x) {
  # eq[i, j] is TRUE when x[i] == x[j]. Keeping only the strict upper
  # triangle (i < j) leaves, in column j, the matches that precede x[j].
  eq <- outer(x, x, "==")
  colSums(eq & upper.tri(eq))
}
# Apply prevCount within each YEAR (this counts repeats inside the FROM column
# only). ave() writes FUN's result back into a copy of its first argument, so
# the row indices are grouped instead of the character/factor FROM column
# itself; that keeps the resulting counts numeric.
DATA$OCC_FROM <- ave(
  seq_along(DATA$FROM), DATA$YEAR,
  FUN = function(i) prevCount(DATA$FROM[i])
)
prevCount 是一个函数,它返回向量中每个元素之前与其相同的元素个数。然后,通过调用 ave 按年份分别应用该函数。
综合评论中提出的更正,我们得到:
# Interleave FROM and TO row by row (FROM[1], TO[1], FROM[2], TO[2], ...) so
# that occurrences in both columns are counted together, in row order.
targets <- c(rbind(as.character(DATA$FROM), as.character(DATA$TO)))
yr <- rep(DATA$YEAR, each = 2)
# Group the positions rather than `targets` itself: ave() stores FUN's result
# in a copy of its first argument, which would turn the counts into strings.
res <- ave(seq_along(targets), yr, FUN = function(i) prevCount(targets[i]))
# After interleaving, row 1 of this 2 x n matrix holds the FROM counts and
# row 2 the TO counts.
counts <- matrix(res, nrow = 2)
DATA$OCC_FROM <- counts[1, ]
DATA$OCC_TO <- counts[2, ]
此外,prevCount函数可以简化为:
# Number of earlier elements equal to each element of `x`. The running count
# inside each group of equal values starts at 1 (it includes the element
# itself), so 1 is subtracted to count only the preceding occurrences.
prevCount <- function(x) {
  ave(x == x, x, FUN = cumsum) - 1L
}
答案 1 :(得分:1)
# Data taken from the solution table in the question rather than from the
# code at the top of the question (the 'from' and 'to' values do not match
# exactly between the two).
YEAR <- rep(c(2009, 2010), each = 7)
FROM <- c(
  "A", "C", "B", "C", "B", "A", "C", # 2009
  "A", "C", "B", "A", "D", "B", "A"  # 2010
)
TO <- c(
  "B", "D", "C", "A", "D", "C", "B", # 2009
  "B", "A", "D", "B", "C", "A", "D"  # 2010
)
# Same three columns, with lower-case names.
mydf <- data.frame(year = YEAR, from = FROM, to = TO)
#---------------------------------------------------
# Cumulative count of earlier occurrences across the two columns.
#
# from, to: vectors of equal length (character or factor); row i of the
#           conceptual table is the pair (from[i], to[i]).
# Returns an integer matrix with one row per input element and the columns
# "occ_from" and "occ_to": how many times the row's from / to value has
# already appeared, in either column, when reading the pairs row by row
# (from before to within a row).
f <- function(from, to) {
  # Unequal lengths would be silently recycled by rbind() below.
  stopifnot(length(from) == length(to))
  # Combine the columns 'from' and 'to' alternately. Coerce to character
  # first: rbind() drops factor levels and keeps only the integer codes,
  # which are not comparable between factors with different level sets.
  l <- c(rbind(as.character(from), as.character(to)))
  # Every repeat of a value is flagged by duplicated(); the running sum of
  # those flags within each value is the number of earlier occurrences.
  dup <- duplicated(l)
  sums <- ave(dup, l, FUN = cumsum)
  # Undo the interleaving: one row per input pair.
  out <- matrix(sums, ncol = 2, byrow = TRUE)
  colnames(out) <- c("occ_from", "occ_to")
  out
}
# Not considering year
f(mydf$from, mydf$to)
# (data.frame(mydf, f(mydf$from, mydf$to)))
# Calculate by year. split() returns the groups ordered by year, so the
# stacked per-year results are in group order, not necessarily in the row
# order of mydf.
rows_by_year <- split(seq_len(nrow(mydf)), mydf$year)
d <- do.call(
  rbind,
  lapply(rows_by_year, function(i) f(mydf$from[i], mydf$to[i]))
)
# Put each result row back at the position of the mydf row it was computed
# from (the right-hand side is evaluated before the assignment happens).
d[unlist(rows_by_year, use.names = FALSE), ] <- d
(mydf <- data.frame(mydf, d, row.names = NULL))