Question

我的数据框包含key_var和Amount字段，如下所示：

在删除特定记录/元素后，特定key_var的Amount总和应该在0到1（0-0.99）之间。现在我需要识别那些可删除的记录，并为它们创建一个'FLAG'标记（通过创建一个新的变量FLAG）。可能存在多种组合，但我只需要使用R生成其中一组组合。

仅供参考，如果我们从以下数据集中删除最后12个记录/元素，则总和为0.25。目前这是手动完成的，我需要编写R代码来实现自动化。

# Sample data from the question: a data frame with 59 rows, a factor column
# key_var with the single level "XYZ_1234", and a numeric column Amount.
df<-structure(list(key_var = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "XYZ_1234", class = "factor"),  Amount = c(8200304.5, 8160830.25, -8035850.35, -7843855.06, -7638726.82, 7635197.95, 6947059.96, -6779376.16, -6659630.59, -6538178.03,  3890858.28, 3727088.57, 3440399.02, 2612664.47, 2147241.37, -1381553.09, -1307455.22, 1253244.05, -1077622.65, 1035065.78, 1020172.5, -1018263.84, 893138.6, -892595.1, -676137.21, 565106.18, -451752.19, -444984.92, -333922.62, -333922.39, 281748.19, -235644.35, -159120.68, 145970.31, 124236.96, 124160.23, -98276.99, -88602.23, -66468.98, 61162.81, 36316.05, 24832.04, 21011.73, 13469.54, -13143.08, -11365.96, 5528.03,3822.78, -3788.55, -1809.79, 995.66, -543.5, 511.52, -18.22, -1.81, 0.96, 0.87, 0.86, 1.66)), .Names = c("key_var", "Amount"), row.names = c(NA, -59L), class = "data.frame")

谢谢，维纳亚克

Answer 1

你可以尝试"自上而下"的做法：从第一行开始计算cumsum（累计和），并在累计总和落入范围内时停止。

library(tidyverse)
# Top-down approach: accumulate Amount from the first row and keep every row
# up to and including the first one whose running total falls in [0, 0.99].
# NOTE(review): Sum is a floating-point running total; if Amount is always
# in whole cents, rounding it (e.g. round(cumsum(Amount), 2)) may be safer --
# confirm against the real data.
df %>%
  mutate(
    Sum = cumsum(Amount),
    Flag = between(Sum, 0, 0.99)
  ) %>%
  # which(Flag)[1] is the first in-range row, or NA when there is none;
  # coalescing NA to 0 then keeps no rows instead of raising an error.
  # Taking only the first hit also avoids the error that rep() raises when
  # several running totals fall in range.
  filter(row_number() <= coalesce(which(Flag)[1], 0L))
    key_var      Amount          Sum  Flag
1  XYZ_1234  8200304.50   8200304.50 FALSE
2  XYZ_1234  8160830.25  16361134.75 FALSE
3  XYZ_1234 -8035850.35   8325284.40 FALSE
4  XYZ_1234 -7843855.06    481429.34 FALSE
5  XYZ_1234 -7638726.82  -7157297.48 FALSE
6  XYZ_1234  7635197.95    477900.47 FALSE
7  XYZ_1234  6947059.96   7424960.43 FALSE
8  XYZ_1234 -6779376.16    645584.27 FALSE
9  XYZ_1234 -6659630.59  -6014046.32 FALSE
10 XYZ_1234 -6538178.03 -12552224.35 FALSE
11 XYZ_1234  3890858.28  -8661366.07 FALSE
12 XYZ_1234  3727088.57  -4934277.50 FALSE
13 XYZ_1234  3440399.02  -1493878.48 FALSE
14 XYZ_1234  2612664.47   1118785.99 FALSE
15 XYZ_1234  2147241.37   3266027.36 FALSE
16 XYZ_1234 -1381553.09   1884474.27 FALSE
17 XYZ_1234 -1307455.22    577019.05 FALSE
18 XYZ_1234  1253244.05   1830263.10 FALSE
19 XYZ_1234  1020172.50   2850435.60 FALSE
20 XYZ_1234 -1018263.84   1832171.76 FALSE
21 XYZ_1234   893138.60   2725310.36 FALSE
22 XYZ_1234  -892595.10   1832715.26 FALSE
23 XYZ_1234  -676137.21   1156578.05 FALSE
24 XYZ_1234   565106.18   1721684.23 FALSE
25 XYZ_1234  -451752.19   1269932.04 FALSE
26 XYZ_1234  -444984.92    824947.12 FALSE
27 XYZ_1234  -333922.62    491024.50 FALSE
28 XYZ_1234  -333922.39    157102.11 FALSE
29 XYZ_1234  -235644.35    -78542.24 FALSE
30 XYZ_1234  -159120.68   -237662.92 FALSE
31 XYZ_1234   145970.31    -91692.61 FALSE
32 XYZ_1234   124236.96     32544.35 FALSE
33 XYZ_1234   124160.23    156704.58 FALSE
34 XYZ_1234   -98276.99     58427.59 FALSE
35 XYZ_1234   -88602.23    -30174.64 FALSE
36 XYZ_1234   -66468.98    -96643.62 FALSE
37 XYZ_1234    61162.81    -35480.81 FALSE
38 XYZ_1234    24832.04    -10648.77 FALSE
39 XYZ_1234    13469.54      2820.77 FALSE
40 XYZ_1234    -3788.55      -967.78 FALSE
41 XYZ_1234      995.66        27.88 FALSE
42 XYZ_1234     -543.50      -515.62 FALSE
43 XYZ_1234      511.52        -4.10 FALSE
44 XYZ_1234        0.96        -3.14 FALSE
45 XYZ_1234        0.87        -2.27 FALSE
46 XYZ_1234        0.86        -1.41 FALSE
47 XYZ_1234        1.66         0.25  TRUE

更通用的解决方案是从上到下获得所有组合：首先是1:nrow(df)行，然后是2:nrow(df)，3:nrow(df)等等……输出是一个data.frame，其中给出了每个符合条件的子集在df中的起始行和结束行。因此sum(df$Amount[1:47])以及三个单独的值（例如sum(df$Amount[44:44])）都会给出预期的结果。然后，您可以添加一个包含TRUE/FALSE的列。

# For every possible start row, accumulate Amount down to the last row and
# record the first row at which the running total falls in [0, 0.99].
# The result has one row per start position that reaches the range:
#   Start = first row of the subset, Stop = last row of the subset.
# seq_len() keeps this safe for an empty df (1:nrow(df) would yield c(1, 0)).
res <- tibble(Start = as.numeric(seq_len(nrow(df)))) %>%
  mutate(
    Stop = map_dbl(Start, function(start) {
      running_total <- cumsum(df$Amount[start:nrow(df)])
      # which(...)[1] is NA when no running total is in range, so Stop
      # becomes NA for that start row and is dropped below.
      start + which(between(running_total, 0, 0.99))[1] - 1
    })
  ) %>%
  filter(!is.na(Stop))
res
# A tibble: 4 x 2
Start  Stop
<dbl> <dbl>
1     1    47
2    44    44
3    45    45
4    46    46

# Add a Flag column marking the rows of the first matching subset in res.
df %>%
  mutate(
    # Compare by row position so the result does not depend on df's row
    # names and no helper rowname column is left behind; coalesce() turns
    # the NA produced when res has no rows (res$Start[1] is then NA) into
    # FALSE.
    Flag = coalesce(between(row_number(), res$Start[1], res$Stop[1]), FALSE)
  ) %>%
  head()
   key_var   Amount Flag
1 XYZ_1234  8200305 TRUE
2 XYZ_1234  8160830 TRUE
3 XYZ_1234 -8035850 TRUE
4 XYZ_1234 -7843855 TRUE
5 XYZ_1234 -7638727 TRUE
6 XYZ_1234  7635198 TRUE

数据

# Sample data used by this answer: a data frame with 59 rows, a factor
# column key_var with the single level "XYZ_1234", and a numeric column
# Amount.
df <- structure(list(key_var = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L), .Label = "XYZ_1234", class = "factor"), 
    Amount = c(8200304.5, 8160830.25, -8035850.35, -7843855.06, 
    -7638726.82, 7635197.95, 6947059.96, -6779376.16, -6659630.59, 
    -6538178.03, 3890858.28, 3727088.57, 3440399.02, 2612664.47, 
    2147241.37, -1381553.09, -1307455.22, 1253244.05, 1020172.5, 
    -1018263.84, 893138.6, -892595.1, -676137.21, 565106.18, 
    -451752.19, -444984.92, -333922.62, -333922.39, -235644.35, 
    -159120.68, 145970.31, 124236.96, 124160.23, -98276.99, -88602.23, 
    -66468.98, 61162.81, 24832.04, 13469.54, -3788.55, 995.66, 
    -543.5, 511.52, 0.96, 0.87, 0.86, 1.66, -1077622.65, 1035065.78, 
    281748.19, 36316.05, 21011.73, -13143.08, -11365.96, 5528.03, 
    3822.78, -1809.79, -18.22, -1.81)), class = "data.frame", row.names = c(NA, 
-59L), .Names = c("key_var", "Amount"))

Answer 2

我假设您想要对Amount值的一个子集进行随机采样，直到满足条件为止，即所采样的Amount值的总和介于0和1之间。我采用了下面这种方法，不过它可能需要花费很多时间：

# Print a reproducible definition of df; the structure(...) text that
# follows is the output of this call.
dput(df)
structure(list(key_var = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L), .Label = "XYZ_1234", class = "factor"), 
Amount = c(8200304.5, 8160830.25, -8035850.35, -7843855.06, 
-7638726.82, 7635197.95, 6947059.96, -6779376.16, -6659630.59, 
-6538178.03, 3890858.28, 3727088.57, 3440399.02, 2612664.47, 
2147241.37, -1381553.09, -1307455.22, 1253244.05, 1020172.5, 
-1018263.84, 893138.6, -892595.1, -676137.21, 565106.18, 
-451752.19, -444984.92, -333922.62, -333922.39, -235644.35, 
-159120.68, 145970.31, 124236.96, 124160.23, -98276.99, -88602.23, 
-66468.98, 61162.81, 24832.04, 13469.54, -3788.55, 995.66, 
-543.5, 511.52, 0.96, 0.87, 0.86, 1.66, -1077622.65, 1035065.78, 
281748.19, 36316.05, 21011.73, -13143.08, -11365.96, 5528.03, 
3822.78, -1809.79, -18.22, -1.81)), .Names = c("key_var", 
"Amount"), row.names = c(NA, -59L), class = "data.frame")

# Random search: repeatedly draw a random subset of rows (at least 2) until
# the Amount values of the subset sum to a value strictly between 0 and 1.
# NOTE: the loop has no iteration cap, so it runs until a match is drawn.
repeat {
  # Sample row positions rather than Amount values, so that rows sharing an
  # Amount with a sampled row are not flagged by accident.
  idx <- sample(seq_len(nrow(df)), sample(2:nrow(df), 1))
  x <- df$Amount[idx]
  total <- sum(x)
  if (total > 0 && total < 1) {
    break
  }
}

# Flag exactly the sampled rows.
df$FLAG <- seq_len(nrow(df)) %in% idx

您可以替换第二个sample()函数中的2，以指定FLAG为TRUE的最少行数。如果设为1，则可能只采样到Amount本身介于0和1之间的单独一行。

使用R找到与给定值相加的元素

2 个答案: