我的数据框包含key_var
和Amount
字段,如下所示:
在删除某些记录/元素之后,每个key_var的Amount总和应落在0到1之间(即0–0.99)。现在我需要识别出这些可删除的记录,并通过新建一个变量FLAG来标记它们。
满足条件的组合可能不止一个,但我只需要用R找出其中一组即可。
仅供参考,如果我们从以下数据集中删除最后12个记录/元素,则总和匹配为0.25。 现在这是手动完成的,我需要生成R代码来实现自动化。
df<-structure(list(key_var = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "XYZ_1234", class = "factor"), Amount = c(8200304.5, 8160830.25, -8035850.35, -7843855.06, -7638726.82, 7635197.95, 6947059.96, -6779376.16, -6659630.59, -6538178.03, 3890858.28, 3727088.57, 3440399.02, 2612664.47, 2147241.37, -1381553.09, -1307455.22, 1253244.05, -1077622.65, 1035065.78, 1020172.5, -1018263.84, 893138.6, -892595.1, -676137.21, 565106.18, -451752.19, -444984.92, -333922.62, -333922.39, 281748.19, -235644.35, -159120.68, 145970.31, 124236.96, 124160.23, -98276.99, -88602.23, -66468.98, 61162.81, 36316.05, 24832.04, 21011.73, 13469.54, -13143.08, -11365.96, 5528.03,3822.78, -3788.55, -1809.79, 995.66, -543.5, 511.52, -18.22, -1.81, 0.96, 0.87, 0.86, 1.66)), .Names = c("key_var", "Amount"), row.names = c(NA, -59L), class = "data.frame")
谢谢, 维纳亚克
答案 0 :(得分:2)
你可以尝试“自上而下”的做法:从第一行开始计算cumsum,并在累计和落入范围内时停止。
library(tidyverse)

# "Top-down" search: accumulate Amount from the first row and keep every row
# up to and including the first one whose running total lies in [0, 0.99].
# All rows after that point are dropped.
df %>%
  mutate(
    Sum = cumsum(Amount),         # running total of Amount in row order
    Flag = between(Sum, 0, 0.99)  # TRUE where the running total is in range
  ) %>%
  # match() yields the position of the first in-range row, or 0 when there is
  # none, so the filter stays valid for zero or several hits
  # (rep(T, which(Flag)) only works when exactly one row is flagged).
  filter(row_number() <= match(TRUE, Flag, nomatch = 0L))
key_var Amount Sum Flag
1 XYZ_1234 8200304.50 8200304.50 FALSE
2 XYZ_1234 8160830.25 16361134.75 FALSE
3 XYZ_1234 -8035850.35 8325284.40 FALSE
4 XYZ_1234 -7843855.06 481429.34 FALSE
5 XYZ_1234 -7638726.82 -7157297.48 FALSE
6 XYZ_1234 7635197.95 477900.47 FALSE
7 XYZ_1234 6947059.96 7424960.43 FALSE
8 XYZ_1234 -6779376.16 645584.27 FALSE
9 XYZ_1234 -6659630.59 -6014046.32 FALSE
10 XYZ_1234 -6538178.03 -12552224.35 FALSE
11 XYZ_1234 3890858.28 -8661366.07 FALSE
12 XYZ_1234 3727088.57 -4934277.50 FALSE
13 XYZ_1234 3440399.02 -1493878.48 FALSE
14 XYZ_1234 2612664.47 1118785.99 FALSE
15 XYZ_1234 2147241.37 3266027.36 FALSE
16 XYZ_1234 -1381553.09 1884474.27 FALSE
17 XYZ_1234 -1307455.22 577019.05 FALSE
18 XYZ_1234 1253244.05 1830263.10 FALSE
19 XYZ_1234 1020172.50 2850435.60 FALSE
20 XYZ_1234 -1018263.84 1832171.76 FALSE
21 XYZ_1234 893138.60 2725310.36 FALSE
22 XYZ_1234 -892595.10 1832715.26 FALSE
23 XYZ_1234 -676137.21 1156578.05 FALSE
24 XYZ_1234 565106.18 1721684.23 FALSE
25 XYZ_1234 -451752.19 1269932.04 FALSE
26 XYZ_1234 -444984.92 824947.12 FALSE
27 XYZ_1234 -333922.62 491024.50 FALSE
28 XYZ_1234 -333922.39 157102.11 FALSE
29 XYZ_1234 -235644.35 -78542.24 FALSE
30 XYZ_1234 -159120.68 -237662.92 FALSE
31 XYZ_1234 145970.31 -91692.61 FALSE
32 XYZ_1234 124236.96 32544.35 FALSE
33 XYZ_1234 124160.23 156704.58 FALSE
34 XYZ_1234 -98276.99 58427.59 FALSE
35 XYZ_1234 -88602.23 -30174.64 FALSE
36 XYZ_1234 -66468.98 -96643.62 FALSE
37 XYZ_1234 61162.81 -35480.81 FALSE
38 XYZ_1234 24832.04 -10648.77 FALSE
39 XYZ_1234 13469.54 2820.77 FALSE
40 XYZ_1234 -3788.55 -967.78 FALSE
41 XYZ_1234 995.66 27.88 FALSE
42 XYZ_1234 -543.50 -515.62 FALSE
43 XYZ_1234 511.52 -4.10 FALSE
44 XYZ_1234 0.96 -3.14 FALSE
45 XYZ_1234 0.87 -2.27 FALSE
46 XYZ_1234 0.86 -1.41 FALSE
47 XYZ_1234 1.66 0.25 TRUE
更通用的解决方案是自上而下地检查所有起点:先是1:nrow(df)行,然后是2:nrow(df)、3:nrow(df),依此类推。输出是一个data.frame,其中的起始值(Start)和结束值(Stop)给出了原data.frame中满足条件的行区间。因此sum(df$Amount[1:47])以及三个单行区间(例如sum(df$Amount[44:44]))都能给出预期的结果。然后,您可以添加一个包含TRUE/FALSE的列。
# For every possible start row, find the first stop row such that
# sum(df$Amount[Start:Stop]) lies in [0, 0.99]. Start rows for which no such
# stop row exists are dropped. `res` has one row per remaining start row,
# with numeric columns Start and Stop.
n_rows <- nrow(df)
res <- tibble(Start = as.numeric(seq_len(n_rows))) %>%  # seq_len(): safe for 0 rows
  mutate(
    Stop = map_dbl(Start, function(first_row) {
      running <- cumsum(df$Amount[first_row:n_rows])
      # Offset of the first in-range running total; NA when there is none.
      first_row + match(TRUE, between(running, 0, 0.99)) - 1
    })
  ) %>%
  filter(!is.na(Stop))
res
# A tibble: 4 x 2
Start Stop
<dbl> <dbl>
1 1 47
2 44 44
3 45 45
4 46 46
# Add a Flag column that is TRUE for the rows covered by the first
# Start/Stop pair in `res`, then show the first six rows.
df %>%
  # row_number() is the row position, so no helper `rowname` column ends up
  # in the result and nothing depends on the row names being numeric.
  mutate(Flag = between(row_number(), res$Start[1], res$Stop[1])) %>%
  head()
key_var Amount Flag
1 XYZ_1234 8200305 TRUE
2 XYZ_1234 8160830 TRUE
3 XYZ_1234 -8035850 TRUE
4 XYZ_1234 -7843855 TRUE
5 XYZ_1234 -7638727 TRUE
6 XYZ_1234 7635198 TRUE
数据
df <- structure(list(key_var = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "XYZ_1234", class = "factor"),
Amount = c(8200304.5, 8160830.25, -8035850.35, -7843855.06,
-7638726.82, 7635197.95, 6947059.96, -6779376.16, -6659630.59,
-6538178.03, 3890858.28, 3727088.57, 3440399.02, 2612664.47,
2147241.37, -1381553.09, -1307455.22, 1253244.05, 1020172.5,
-1018263.84, 893138.6, -892595.1, -676137.21, 565106.18,
-451752.19, -444984.92, -333922.62, -333922.39, -235644.35,
-159120.68, 145970.31, 124236.96, 124160.23, -98276.99, -88602.23,
-66468.98, 61162.81, 24832.04, 13469.54, -3788.55, 995.66,
-543.5, 511.52, 0.96, 0.87, 0.86, 1.66, -1077622.65, 1035065.78,
281748.19, 36316.05, 21011.73, -13143.08, -11365.96, 5528.03,
3822.78, -1809.79, -18.22, -1.81)), class = "data.frame", row.names = c(NA,
-59L), .Names = c("key_var", "Amount"))
答案 1 :(得分:0)
我假设您想要对“金额”值的一部分进行采样,直到满足条件,金额值的总和将介于0和1之间。我采用了这种方法,尽管可能需要花费很多时间:
dput(df)
structure(list(key_var = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "XYZ_1234", class = "factor"),
Amount = c(8200304.5, 8160830.25, -8035850.35, -7843855.06,
-7638726.82, 7635197.95, 6947059.96, -6779376.16, -6659630.59,
-6538178.03, 3890858.28, 3727088.57, 3440399.02, 2612664.47,
2147241.37, -1381553.09, -1307455.22, 1253244.05, 1020172.5,
-1018263.84, 893138.6, -892595.1, -676137.21, 565106.18,
-451752.19, -444984.92, -333922.62, -333922.39, -235644.35,
-159120.68, 145970.31, 124236.96, 124160.23, -98276.99, -88602.23,
-66468.98, 61162.81, 24832.04, 13469.54, -3788.55, 995.66,
-543.5, 511.52, 0.96, 0.87, 0.86, 1.66, -1077622.65, 1035065.78,
281748.19, 36316.05, 21011.73, -13143.08, -11365.96, 5528.03,
3822.78, -1809.79, -18.22, -1.81)), .Names = c("key_var",
"Amount"), row.names = c(NA, -59L), class = "data.frame")
# Random search: repeatedly draw a random subset of rows (at least 2) until
# the subset's Amount total lies strictly between 0 and 1, then mark exactly
# those rows with FLAG = TRUE.
# There is no iteration cap: the loop runs until a matching subset is drawn.
n_rows <- nrow(df)
# sample(2:n_rows, 1) needs n_rows > 2, because sample(2:2, 1) draws from 1:2.
stopifnot(n_rows > 2L)
repeat {
  # Draw row positions rather than Amount values, so that rows sharing the
  # same Amount are not flagged unless they were actually drawn.
  picked <- sample.int(n_rows, sample(2:n_rows, 1L))
  subset_sum <- sum(df$Amount[picked])
  if (subset_sum > 0 && subset_sum < 1) {
    break
  }
}
df$FLAG <- seq_len(n_rows) %in% picked
您可以修改第二个(内层)sample()调用中的2,来指定FLAG为TRUE的最少行数。如果改为1,则可能只抽到Amount本身介于0和1之间的单独一行。