我有一个指标,分布在四个类别a, b, c, d
中。
在一段时间内,我会跟踪每个类别的指标移动。这些移动的总和表示从其他地方离开或进入系统的数量('外部')。
# SETUP -------------------------------------------------------------------
categories <- letters[1:4]
set.seed(1)
movements <- lapply(categories, function(...) {round(runif(10, -10,10))*10})
names(movements) <- categories
movements[['external']] <- Reduce(`+`, movements)*-1
problem <- as.data.frame(movements)
problem
a b c d external
1 -50 -60 90 0 20
2 -30 -60 -60 20 130
3 10 40 30 0 -80
4 80 -20 -70 -60 70
5 -60 50 -50 70 -10
6 80 0 -20 30 -90
7 90 40 -100 60 -90
8 30 100 -20 -80 -30
9 30 -20 70 40 -120
10 -90 60 -30 -20 80
如果某些类别经历了积极的变动而其他类别经历了负面变动,我们可以推断出系统内的转移。
# ADD TRANSFER COLUMNS AND INITIALISE TO 0 --------------------------------
transfer_matrix <- combn(c(categories, 'external'), 2)
transfer_list <- combn(c(categories, 'external'), 2, simplify=F)
problem[,sapply(transfer_list, paste, collapse='.')] <- 0
paste(names(problem), collapse=', ')
[1] "a, b, c, d, external, a.b, a.c, a.d, a.external, b.c, b.d, b.external, c.d, c.external, d.external"
例如,a
减少了50,c
增加了90,因此我们可以推断从a
到c
的传输将被存储在变量a.c
。
计算转移的规则是成比例的。所以,当&#39; a&#39;减少了50,b
减少了60,然后c
的增加50 /(50 + 60)应该归因于'a'
和60 /(50 + 60) c
的增加应归因于b
。同样,对于进出系统的转移。
下面显示了我需要的所有变量的完整手动计算,第一行:
# MANUAL CALCULATION ------------------------------------------------------
row_limit <- 1 # change to e.g. 1:10
problem[row_limit, 'a.b'] <- 0
problem[row_limit, 'a.c'] <- 90*(-50/(-50+-60))
problem[row_limit, 'a.d'] <- 0
problem[row_limit, 'a.external'] <- 20 * -50/(-50+-60)
problem[row_limit, 'b.c'] <- 90*(-60/(-50+-60))
problem[row_limit, 'b.d'] <- 0
problem[row_limit, 'b.external'] <- 20 * -60/(-50+-60)
problem[row_limit, 'c.d'] <- 0
problem[row_limit, 'c.external'] <- 0
problem[row_limit, 'd.external'] <- 0
请注意,由于a.c = -c.a
只需要计算所有可能传输的子集。
我的问题是,如何以编程方式编写上述计算,以简洁有效的方式处理10-20个类别和大量行?
我通常使用data.table,但对要使用的包的任何建议都是开放的。
下面是一些检查输出的代码:
# CHECKING ----------------------------------------------------------------
check <- function(problem, category, categories, transfer_list, transfer_matrix) {
out_columns <- sapply(transfer_list[transfer_matrix[1,] == category], paste, collapse='.')
in_columns <- sapply(transfer_list[transfer_matrix[2,] == category], paste, collapse='.')
stopifnot(length(c(out_columns, in_columns)) == length(categories)-1)
out_sum <- 0
if(length(out_columns) == 1) {
out_sum <- problem[,out_columns]
} else if(length(out_columns) > 1) {
out_sum <- Reduce(`+`, problem[,out_columns])
}
in_sum <- 0
if(length(in_columns) == 1) {
in_sum <- problem[,in_columns]
}
else if(length(in_columns) > 1) {
in_sum <- Reduce(`+`, problem[,in_columns])
}
lhs <- out_sum - in_sum
rhs <- -problem[, category]
sprintf('%s vs %s',lhs, rhs)
}
# For each category, actual vs expected
sapply(c(categories,'external'), check, problem=problem, categories=c(categories,'external'), transfer_list=transfer_list,transfer_matrix=transfer_matrix)
a b c d
[1,] "50 vs 50" "60 vs 60" "-90 vs -90" "0 vs 0"
[2,] "0 vs 30" "0 vs 60" "0 vs 60" "0 vs -20"
[3,] "0 vs -10" "0 vs -40" "0 vs -30" "0 vs 0"
[4,] "0 vs -80" "0 vs 20" "0 vs 70" "0 vs 60"
[5,] "0 vs 60" "0 vs -50" "0 vs 50" "0 vs -70"
[6,] "0 vs -80" "0 vs 0" "0 vs 20" "0 vs -30"
[7,] "0 vs -90" "0 vs -40" "0 vs 100" "0 vs -60"
[8,] "0 vs -30" "0 vs -100" "0 vs 20" "0 vs 80"
[9,] "0 vs -30" "0 vs 20" "0 vs -70" "0 vs -40"
[10,] "0 vs 90" "0 vs -60" "0 vs 30" "0 vs 20"
答案 0 :(得分:1)
这是一个想法。我相信输出符合您的要求。
#x is a row from problem df
#y is a column from transfer_matrix
check_pairs <- function(x,y){
#split y into which columns are being compared . e.g. if col 1 is 'd' vs 'external', then ...
a <- y[1] #would be 'd'
b <- y[2] #would be 'external'
#if both pos, both neg, or one val is 0, then return 0
if( sign(x[a]) == sign(x[b]) | sign(x[[a]]) == 0){
return(0)
}else{ #else return formula from your manual calculation
return( x[[b]] * x[[a]] / sum( x[sign(x)==sign(x[[a]]) ] ) )
}
}
#for each row of the problem matrix, compare to each column of the transfer_matrix
check_matrix_cols <- function(x){
return( apply(transfer_matrix, 2, function(y) check_pairs(x,y)) )
}
problem[,-seq(length(c(categories, 'external')))] <- t( apply(problem, 1, check_matrix_cols) )
sapply(c(categories,'external'), check, problem=problem, categories=c(categories,'external'), transfer_list=transfer_list,transfer_matrix=transfer_matrix)
a b c d external
[1,] "50 vs 50" "60 vs 60" "-90 vs -90" "0 vs 0" "-20 vs -20"
[2,] "30 vs 30" "60 vs 60" "60 vs 60" "-20 vs -20" "-130 vs -130"
[3,] "-10 vs -10" "-40 vs -40" "-30 vs -30" "0 vs 0" "80 vs 80"
[4,] "-80 vs -80" "20 vs 20" "70 vs 70" "60 vs 60" "-70 vs -70"
[5,] "60 vs 60" "-50 vs -50" "50 vs 50" "-70 vs -70" "10 vs 10"
[6,] "-80 vs -80" "0 vs 0" "20 vs 20" "-30 vs -30" "90 vs 90"
[7,] "-90 vs -90" "-40 vs -40" "100 vs 100" "-60 vs -60" "90 vs 90"
[8,] "-30 vs -30" "-100 vs -100" "20 vs 20" "80 vs 80" "30 vs 30"
[9,] "-30 vs -30" "20 vs 20" "-70 vs -70" "-40 vs -40" "120 vs 120"
[10,] "90 vs 90" "-60 vs -60" "30 vs 30" "20 vs 20" "-80 vs -80"