我一直在尝试用data.table计算一列加权平均值,其中每一行的计算都需要排除某些行。
在以下示例中,FIPS是ID变量,STATE是分组变量。我想计算同一州内不包括邻近县的value加权平均值,以及其他州中不包括邻近县的value加权平均值。
我知道如何用循环实现,但我想应该有更高效的方法。我对data.table的按行操作不太熟悉。有什么想法吗?先谢谢大家。
library(data.table)
# Reproducible toy data: 21 counties (FIPS) alternating between two states.
# NOTE: the workspace is deliberately not cleared with rm(list = ls()); a
# script should not wipe the caller's global environment.
set.seed(920410)
n_counties <- 21L
# rep_len() recycles explicitly; 21 is not a multiple of 2 or 6, so relying on
# data.table()'s implicit recycling would recycle with a remainder.
DT <- data.table(
  FIPS = seq_len(n_counties),
  STATE = rep_len(LETTERS[1:2], n_counties),
  value = rep_len(1:3, n_counties),
  weight = rep_len(2:7, n_counties)
)
DT

# Give every county a list-column entry holding three randomly drawn
# neighbour FIPS codes (each row is its own group, so sample() runs per row).
DT[, nbs := list(list(sample(1:21, 3))), by = names(DT)]

# For each county, sum value * weight over its neighbours, split by whether
# the neighbour lies in the county's own state or in another state.
# Results are collected in preallocated vectors and attached once, instead of
# modifying DT element by element (which copies the table on every pass).
value_x_weight <- DT$value * DT$weight
same_state_sum <- integer(nrow(DT))
other_state_sum <- integer(nrow(DT))
for (i in seq_len(nrow(DT))) {
  is_neighbor <- DT$FIPS %in% DT$nbs[[i]]
  in_same_state <- DT$STATE == DT$STATE[i]
  same_state_sum[i] <- sum(value_x_weight[is_neighbor & in_same_state])
  other_state_sum[i] <- sum(value_x_weight[is_neighbor & !in_same_state])
}
DT[, neighbor_sum_in_the_same_state := same_state_sum]
DT[, neighbor_sum_in_other_states := other_state_sum]
答案 0 :(得分:0)
也许data.table中也有等效的写法,下面是使用tidyverse的一种方式:
library(dplyr)
library(purrr)
# val1: within each STATE group, sum value * weight over the rows whose FIPS
#       appears in the current row's neighbour list (same-state neighbours).
# val2: over the whole table, sum value * weight over neighbours whose STATE
#       differs from the current row's STATE (other-state neighbours).
DT %>%
  group_by(STATE) %>%
  mutate(
    val1 = map_dbl(nbs, function(nb_ids) {
      is_nb <- FIPS %in% nb_ids
      sum((value * weight)[is_nb])
    })
  ) %>%
  ungroup() %>%
  mutate(
    val2 = map2_dbl(nbs, STATE, function(nb_ids, own_state) {
      is_nb_elsewhere <- FIPS %in% nb_ids & STATE != own_state
      sum((value * weight)[is_nb_elsewhere])
    })
  )
# A tibble: 21 x 9
# FIPS STATE value weight nbs OP_val1 OP_val2 val1 val2
# <int> <chr> <int> <int> <list> <int> <int> <dbl> <dbl>
# 1 1 A 1 2 <int [3]> 14 21 14 21
# 2 2 B 2 3 <int [3]> 11 12 11 12
# 3 3 A 3 4 <int [3]> 0 17 0 17
# 4 4 B 1 5 <int [3]> 5 14 5 14
# 5 5 A 2 6 <int [3]> 16 0 16 0
# 6 6 B 3 7 <int [3]> 26 12 26 12
# 7 7 A 1 2 <int [3]> 14 5 14 5
# 8 8 B 2 3 <int [3]> 27 2 27 2
# 9 9 A 3 4 <int [3]> 2 42 2 42
#10 10 B 1 5 <int [3]> 6 14 6 14
# … with 11 more rows
其中OP_val1和OP_val2是运行提问者(OP)的for循环后得到的输出。
数据
# Rebuild the question's data and compute the reference columns OP_val1
# (same-state neighbour sum) and OP_val2 (other-state neighbour sum).
set.seed(920410)
n_counties <- 21L
# rep_len() recycles explicitly; 21 is not a multiple of 2 or 6, so relying on
# data.table()'s implicit recycling would recycle with a remainder.
DT <- data.table(
  FIPS = seq_len(n_counties),
  STATE = rep_len(LETTERS[1:2], n_counties),
  value = rep_len(1:3, n_counties),
  weight = rep_len(2:7, n_counties)
)

# One list-column entry per county with three randomly drawn neighbour FIPS.
DT[, nbs := list(list(sample(1:21, 3))), by = names(DT)]

# Collect the per-county sums in preallocated vectors and attach them once,
# rather than assigning into DT element by element inside the loop.
value_x_weight <- DT$value * DT$weight
op_val1 <- integer(nrow(DT))
op_val2 <- integer(nrow(DT))
for (i in seq_len(nrow(DT))) {
  is_neighbor <- DT$FIPS %in% DT$nbs[[i]]
  in_same_state <- DT$STATE == DT$STATE[i]
  op_val1[i] <- sum(value_x_weight[is_neighbor & in_same_state])
  op_val2[i] <- sum(value_x_weight[is_neighbor & !in_same_state])
}
DT[, OP_val1 := op_val1]
DT[, OP_val2 := op_val2]
答案 1 :(得分:0)
这是data.table中的一种做法:先将数据转换为长格式,再执行连接:
# Convert to long format: unlist the nbs list column so that DT_long holds one
# row per (county, neighbour) pair, with nbs now a single neighbour FIPS code.
# nm lists the per-county columns used as grouping keys here and again below.
nm <- c("FIPS","STATE","value", "weight")
DT_long <- DT[, .(nbs=unlist(nbs)), nm]
# Same-state neighbours: self-join DT_long to itself, matching rows whose FIPS
# equals this row's neighbour code (nbs) and whose STATE is the same.
# by=.EACHI evaluates the expression once per row of the inner (i) table.
# A county occurs once per neighbour in the long table, so a match can return
# repeated rows; [1L] keeps a single value * weight for the matched county.
# $V1 extracts the unnamed result column.
# NOTE(review): rows without a match presumably come back as NA; the final
# aggregation drops them through na.rm=TRUE -- confirm.
DT_long[, neighbor_sum_in_the_same_state :=
.SD[.SD, on=.(FIPS=nbs, STATE), sum(x.value[1L] * x.weight[1L]), by=.EACHI]$V1]
# Other-state neighbours: join on the neighbour's FIPS only, then keep just the
# matched rows whose STATE (x.STATE) differs from this row's STATE (i.STATE)
# before taking the first value * weight.
DT_long[, neighbor_sum_in_other_states :=
.SD[.SD, on=.(FIPS=nbs), sum(x.value[x.STATE!=i.STATE][1L] * x.weight[x.STATE!=i.STATE][1L]), by=.EACHI]$V1]
# Collapse back to one row per county by summing the per-neighbour
# contributions of both columns, ignoring NA entries.
DT_long[, lapply(.SD, sum, na.rm=TRUE), nm,
.SDcols=c("neighbor_sum_in_the_same_state", "neighbor_sum_in_other_states")]
输出:
FIPS STATE value weight neighbor_sum_in_the_same_state neighbor_sum_in_other_states
1: 1 A 1 2 14 21
2: 2 B 2 3 11 12
3: 3 A 3 4 0 17
4: 4 B 1 5 5 14
5: 5 A 2 6 16 0
6: 6 B 3 7 26 12
7: 7 A 1 2 14 5
8: 8 B 2 3 27 2
9: 9 A 3 4 2 42
10: 10 B 1 5 6 14
11: 11 A 2 6 12 26
12: 12 B 3 7 11 2
13: 13 A 1 2 12 11
14: 14 B 2 3 5 24
15: 15 A 3 4 12 26
16: 16 B 1 5 21 24
17: 17 A 2 6 4 5
18: 18 B 3 7 6 14
19: 19 A 1 2 14 5
20: 20 B 2 3 11 12
21: 21 A 3 4 12 27
FIPS STATE value weight neighbor_sum_in_the_same_state neighbor_sum_in_other_states
答案 2 :(得分:0)
谢谢你们 :) 这些回答很有帮助!
尝试了各种方法之后,我写出了下面的代码。它不使用循环,分别计算同一州以及其他州中不包括相邻县(也不包括本县)的value的加权平均值。
# For every county (one group per FIPS/STATE pair), compute the weighted mean
# of `value` (weights = `weight`) over all counties that are neither the county
# itself nor one of its neighbours, separately for
#   * counties in the same state, and
#   * counties in other states.
# Both columns are produced in a single grouped pass: the "not self, not a
# neighbour" mask is built once per group and reused, instead of re-subsetting
# the whole table four times per row. Full-table columns are read explicitly
# via DT$..., since inside a grouped j the bare column names only refer to the
# current group's rows.
DT[, c("weighted_avg_nonneighboring_counties_in_same_state",
       "weighted_avg_nonneighboring_counties_in_other_states") := {
  # .BY holds the current group's FIPS/STATE; .I holds its row number(s) in DT.
  eligible <- DT$FIPS != .BY$FIPS & !(DT$FIPS %in% unlist(DT$nbs[.I]))
  in_same_state <- DT$STATE == .BY$STATE
  # which() drops NA conditions, matching data.table's row-filter semantics.
  same_rows <- which(eligible & in_same_state)
  other_rows <- which(eligible & !in_same_state)
  list(
    weighted.mean(DT$value[same_rows], DT$weight[same_rows], na.rm = TRUE),
    weighted.mean(DT$value[other_rows], DT$weight[other_rows], na.rm = TRUE)
  )
}, by = .(FIPS, STATE)]