我有一个包含2018年若干个月数据的data.frame。当我按州(西班牙语为DEPARTAMENTO)对全部数据进行汇总时,一切正常:返回的df包含25个州,并保留了每个州的销售额。
但是,当我先按某一天过滤再汇总时,例如下面的例子,我只能得到15个州。州的总数为25:
unique(ventas$DEPARTAMENTO) # returns all 25 states when the data covers a long period
[1] "LIMA" "AREQUIPA" "LA LIBERTAD" "MOQUEGUA"
[5] "HUANUCO" "CUSCO" "PIURA" "CALLAO"
[9] "CAJAMARCA" "UCAYALI" "JUNIN" "ICA"
[13] "LAMBAYEQUE" "AMAZONAS" "ANCASH" "AYACUCHO"
[17] "LORETO" "SAN MARTIN" "TACNA" "MADRE DE DIOS"
[21] "PUNO" "TUMBES" "PASCO" "APURIMAC"
[25] "HUANCAVELICA"
我如何让其余的州也显示出来?它们在那个特定日期没有记录,因此没有出现在结果中。
*这种情况出现在2018年8月16日;相比之下,2018年8月15日包含每个州的数据。
# Returns only the states that have at least one row on 16/08/2018
# (14 of the 25 states for the sample data shown in this post).
# filter() drops every other row before grouping, so a state with no
# records on that date never forms a group and is absent from the result.
ventas_departamentos <- ventas %>%
filter(FECHA_PED == "16/08/2018") %>%
group_by(DEPARTAMENTO) %>%
summarise(VENTAS= sum(VENTAS))
样本数据:在"16/08/2018"这一天只有14个州的数据,但总共有25个州。
structure(list(FECHA_PED = c("15/08/2018", "15/08/2018", "15/08/2018",
"15/08/2018", "15/08/2018", "15/08/2018", "15/08/2018", "15/08/2018",
"15/08/2018", "15/08/2018", "15/08/2018", "15/08/2018", "15/08/2018",
"15/08/2018", "15/08/2018", "15/08/2018", "15/08/2018", "15/08/2018",
"15/08/2018", "15/08/2018", "15/08/2018", "15/08/2018", "15/08/2018",
"15/08/2018", "15/08/2018", "16/08/2018", "16/08/2018", "16/08/2018",
"16/08/2018", "16/08/2018", "16/08/2018", "16/08/2018", "16/08/2018",
"16/08/2018", "16/08/2018", "16/08/2018", "16/08/2018", "16/08/2018",
"16/08/2018", "17/08/2018", "17/08/2018", "17/08/2018", "17/08/2018",
"17/08/2018", "17/08/2018", "17/08/2018", "17/08/2018", "17/08/2018",
"17/08/2018", "17/08/2018", "17/08/2018", "17/08/2018"), DEPARTAMENTO = c("AMAZONAS",
"ANCASH", "APURIMAC", "AREQUIPA", "AYACUCHO", "CAJAMARCA", "CALLAO",
"CUSCO", "HUANCAVELICA", "HUANUCO", "ICA", "JUNIN", "LA LIBERTAD",
"LAMBAYEQUE", "LIMA", "LORETO", "MADRE DE DIOS", "MOQUEGUA",
"PASCO", "PIURA", "PUNO", "SAN MARTIN", "TACNA", "TUMBES", "UCAYALI",
"AMAZONAS", "ANCASH", "AREQUIPA", "AYACUCHO", "CALLAO", "CUSCO",
"ICA", "LAMBAYEQUE", "LIMA", "LORETO", "MOQUEGUA", "PIURA", "SAN MARTIN",
"TACNA", "AREQUIPA", "CAJAMARCA", "CALLAO", "CUSCO", "HUANUCO",
"ICA", "JUNIN", "LA LIBERTAD", "LAMBAYEQUE", "LIMA", "MOQUEGUA",
"PIURA", "UCAYALI"), VENTAS = c(1545, 1212, 2349.18483011377,
2349.18483011377, 448.655378691318, 250, 998.040346840917, 1235.83321024607,
878, 760.094289441334, 1262.85505782431, 474.702268205715, 2001.15771843356,
2245.18603040845, 51062.7935183381, 186.570326518456, 4548, 410.101788287779,
1548, 1779.77394863005, 78, 2245.18603040845, 9440.63315148726,
1500.37246329124, 998.040346840917, 241.361194343254, 751.944809713059,
3422.55258780341, 833.947218393584, 3270.73872294869, 1814.77433185166,
818.095057353685, 1737.0159616384, 33361.873446708, 710.900787004681,
1764.20759408142, 1864.98050395575, 857.36085379108, 588.900514206351,
2117.36171824294, 346.930386805224, 1648.91437144541, 965.971270003731,
2001.15771843356, 298.239316298776, 514.223203738548, 1611.43834003938,
940.833636649437, 22909.898424303, 2323.9439806791, 2337.35194677909,
888.715034174863)), .Names = c("FECHA_PED", "DEPARTAMENTO", "VENTAS"
), row.names = c(NA, -52L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), spec = structure(list(cols = structure(list(
FECHA_PED = structure(list(), class = c("collector_character",
"collector")), DEPARTAMENTO = structure(list(), class = c("collector_character",
"collector")), VENTAS = structure(list(), class = c("collector_double",
"collector"))), .Names = c("FECHA_PED", "DEPARTAMENTO", "VENTAS"
)), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"), vars = "FECHA_PED", drop = TRUE)
这是df的结构:
FECHA_PED DEPARTAMENTO VENTAS
16/08/2018 AMAZONAS 241.36
16/08/2018 ANCASH 751.94
16/08/2018 AREQUIPA 3422.55
16/08/2018 AYACUCHO 833.94
16/08/2018 CALLAO 3270.73
16/08/2018 CUSCO 1814.77
16/08/2018 ICA 818.09
16/08/2018 LAMBAYEQUE 1737.01
16/08/2018 LIMA 33361.87
16/08/2018 LORETO 710.90
16/08/2018 MOQUEGUA 1764.20
16/08/2018 PIURA 1864.98
16/08/2018 SAN MARTIN 857.36
16/08/2018 TACNA 588.90
答案 0 :(得分:2)
将group_by()应用于整个数据集,但只在summarise()中筛选想要的日期。
下面的代码应该会把其他10个州(当天没有记录的州)的VENTAS设为0:
# Group over the whole data set so that every state yields one output row,
# then sum only the sales whose date matches. Multiplying by the logical
# mask turns sales from other dates into 0, so a state with no rows on
# 16/08/2018 ends up with VENTAS == 0 instead of being dropped.
# The sales column in the posted data is VENTAS.
ventas_departamentos <- ventas %>%
  group_by(DEPARTAMENTO) %>%
  summarise(VENTAS = sum(VENTAS * (FECHA_PED == "16/08/2018"),
                         na.rm = TRUE))
要为它们分配NA而不是0,可以先统计每个州在该日期的记录数,再把记录数为0的州替换为NA:
# Same whole-data grouping as the zero-fill version, but also count how many
# rows each state has on the target date (n_obs). States with n_obs == 0 had
# no records that day, so their summed 0 is replaced by NA.
# The sales column in the posted data is VENTAS.
ventas_departamentos <- ventas %>%
  group_by(DEPARTAMENTO) %>%
  summarise(VENTAS = sum(VENTAS * (FECHA_PED == "16/08/2018"),
                         na.rm = TRUE),
            n_obs = sum(FECHA_PED == "16/08/2018", na.rm = TRUE)) %>%
  mutate(VENTAS = replace(VENTAS, n_obs == 0, NA))
要汇总一个日期范围,只需修改两处逻辑判断(把 == 换成 %in%):
# Build the target dates as dd/mm/yyyy strings. The day is zero-padded so
# that single-digit days (e.g. 9 -> "09/08/2018") still match FECHA_PED.
days <- 14:16
dates <- sprintf("%02d/08/2018", days)
dates
# [1] "14/08/2018" "15/08/2018" "16/08/2018"

# Sum sales per state over the selected dates. n_obs counts the matching
# rows per state; states with no rows in the range get NA instead of 0.
# The sales column in the posted data is VENTAS.
ventas_departamentos <- ventas %>%
  group_by(DEPARTAMENTO) %>%
  summarise(VENTAS = sum(VENTAS * (FECHA_PED %in% dates),
                         na.rm = TRUE),
            n_obs = sum(FECHA_PED %in% dates, na.rm = TRUE)) %>%
  mutate(VENTAS = replace(VENTAS, n_obs == 0, NA))
答案 1 :(得分:1)
以下解决方案适用于包含任意数量日期的ventas数据框:
步骤1:建立一个新的df,使每个日期都包含全部的州,并把所有VENTAS值初始化为0。
# Distinct states and dates present anywhere in the data.
all_states <- unique(ventas[["DEPARTAMENTO"]])
all_dates <- unique(ventas[["FECHA_PED"]])

# Full date x state grid: each date is repeated once per state, and the
# state list is cycled once per date. Every sale starts at 0.
new_ventas <- data.frame(
  FECHA_PED = rep(all_dates, each = length(all_states)),
  DEPARTAMENTO = rep(all_states, times = length(all_dates)),
  VENTAS = 0
)
步骤2:创建两个VENTAS值向量,一个来自new_ventas$VENTAS,另一个来自原始的ventas$VENTAS。用日期和州拼接而成的字符串为向量元素命名,这样每个值都应该有唯一的名称。
# Name each value with a "<date>_<state>" key so the two vectors can be
# matched by name.
new_ventas_values <- setNames(
  new_ventas[["VENTAS"]],
  paste(new_ventas[["FECHA_PED"]], new_ventas[["DEPARTAMENTO"]], sep = "_")
)
ventas_values <- setNames(
  ventas[["VENTAS"]],
  paste(ventas[["FECHA_PED"]], ventas[["DEPARTAMENTO"]], sep = "_")
)
步骤3:现在,您可以利用这些唯一名称,将new_ventas中的0替换为ventas中对应的值。
# Total the observed sales per "<date>_<state>" key before writing them in.
# Name-indexed assignment keeps only the last value for a repeated name, so
# without this step several rows for the same date and state would be
# silently reduced to one. With unique keys the result is unchanged.
ventas_totals <- tapply(ventas_values, names(ventas_values), sum)

# Overwrite the zero placeholders for every date/state pair that has data.
new_ventas_values[names(ventas_totals)] <- ventas_totals
new_ventas$VENTAS <- unname(new_ventas_values)
new_ventas
结果df为:
FECHA_PED DEPARTAMENTO VENTAS
1 15/08/2018 AMAZONAS 1545.0000
2 15/08/2018 ANCASH 1212.0000
3 15/08/2018 APURIMAC 2349.1848
4 15/08/2018 AREQUIPA 2349.1848
5 15/08/2018 AYACUCHO 448.6554
6 15/08/2018 CAJAMARCA 250.0000
7 15/08/2018 CALLAO 998.0403
8 15/08/2018 CUSCO 1235.8332
9 15/08/2018 HUANCAVELICA 878.0000
10 15/08/2018 HUANUCO 760.0943
11 15/08/2018 ICA 1262.8551
12 15/08/2018 JUNIN 474.7023
13 15/08/2018 LA LIBERTAD 2001.1577
14 15/08/2018 LAMBAYEQUE 2245.1860
15 15/08/2018 LIMA 51062.7935
16 15/08/2018 LORETO 186.5703
17 15/08/2018 MADRE DE DIOS 4548.0000
18 15/08/2018 MOQUEGUA 410.1018
19 15/08/2018 PASCO 1548.0000
20 15/08/2018 PIURA 1779.7739
21 15/08/2018 PUNO 78.0000
22 15/08/2018 SAN MARTIN 2245.1860
23 15/08/2018 TACNA 9440.6332
24 15/08/2018 TUMBES 1500.3725
25 15/08/2018 UCAYALI 998.0403
26 16/08/2018 AMAZONAS 241.3612
27 16/08/2018 ANCASH 751.9448
28 16/08/2018 APURIMAC 0.0000
29 16/08/2018 AREQUIPA 3422.5526
30 16/08/2018 AYACUCHO 833.9472
31 16/08/2018 CAJAMARCA 0.0000
32 16/08/2018 CALLAO 3270.7387
33 16/08/2018 CUSCO 1814.7743
34 16/08/2018 HUANCAVELICA 0.0000
35 16/08/2018 HUANUCO 0.0000
36 16/08/2018 ICA 818.0951
37 16/08/2018 JUNIN 0.0000
38 16/08/2018 LA LIBERTAD 0.0000
39 16/08/2018 LAMBAYEQUE 1737.0160
40 16/08/2018 LIMA 33361.8734
41 16/08/2018 LORETO 710.9008
42 16/08/2018 MADRE DE DIOS 0.0000
43 16/08/2018 MOQUEGUA 1764.2076
44 16/08/2018 PASCO 0.0000
45 16/08/2018 PIURA 1864.9805
46 16/08/2018 PUNO 0.0000
47 16/08/2018 SAN MARTIN 857.3609
48 16/08/2018 TACNA 588.9005
49 16/08/2018 TUMBES 0.0000
50 16/08/2018 UCAYALI 0.0000
51 17/08/2018 AMAZONAS 0.0000
52 17/08/2018 ANCASH 0.0000
53 17/08/2018 APURIMAC 0.0000
54 17/08/2018 AREQUIPA 2117.3617
55 17/08/2018 AYACUCHO 0.0000
56 17/08/2018 CAJAMARCA 346.9304
57 17/08/2018 CALLAO 1648.9144
58 17/08/2018 CUSCO 965.9713
59 17/08/2018 HUANCAVELICA 0.0000
60 17/08/2018 HUANUCO 2001.1577
61 17/08/2018 ICA 298.2393
62 17/08/2018 JUNIN 514.2232
63 17/08/2018 LA LIBERTAD 1611.4383
64 17/08/2018 LAMBAYEQUE 940.8336
65 17/08/2018 LIMA 22909.8984
66 17/08/2018 LORETO 0.0000
67 17/08/2018 MADRE DE DIOS 0.0000
68 17/08/2018 MOQUEGUA 2323.9440
69 17/08/2018 PASCO 0.0000
70 17/08/2018 PIURA 2337.3519
71 17/08/2018 PUNO 0.0000
72 17/08/2018 SAN MARTIN 0.0000
73 17/08/2018 TACNA 0.0000
74 17/08/2018 TUMBES 0.0000
75 17/08/2018 UCAYALI 888.7150