我知道有很多问题专门针对融化或更改dataframes
,但经过一些研究,我找不到与我的问题直接相关的任何问题。
每个月,我们公司都会上传一个数据文件,其中包含前24个月的销售数据。因此,在最近的三个上载中,每个文件都有一个07/2017
的数据点。问题是,有时这些数据点即使在理论上应该完全相同也有所不同。
我当前的表格按照产品品牌合并了最后三个上载内容,如下所示。列名称是数据框中的实际月份,各行各重复三个重复,以说明最近三个上载中的每一个。
我想更好地比较数据并更改表(我不知道是否需要融合,变异,转置等)。本质上,我需要最终表来显示每个月的每个品牌(我认为实际上最终将是26个,因为在三个上载之间,在第一个和第三个上载的背面和正面会有一个额外的月份)。连同它们的数据,理想情况下是上传之间的标准差。
任何提示或建议将不胜感激。
> current_table
Brand 04/2017 05/2017 06/2017 07/2017
x 10 11 9 7
x 11 11 8 9
x 17 13 8 9
y 10 12 4 9
y 14 12 6 9
y 15 18 8 9
> needed_table
Brand Upload1 Upload2 Upload3 SD Month
x 10 11 17 3.79 04/2017
x 11 11 13 1.15 05/2017
x 9 8 8 .58 06/2017
x 7 9 9 1.15 07/2017
y 10 14 15 2.65 04/2017
y 12 12 18 3.46 05/2017
y 4 6 8 2.00 06/2017
y 9 9 9 0.00 07/2017
> dput(head(final_merged))
structure(list(Brand = structure(1:6, .Label = c("BrandA", "BrandB",
"BrandC", "BrandD", "BrandE", "BrandF", "BrandG",
"BrandH", "BrandI", "BrandJ"), class = "factor"),
`2016-05-01` = c(542600.938987453, 121369.200906975, 194807.339528314,
26130.0960930597, 47328.5580057147, 878650.440203509), `2016-06-01` = c(427613.257159588,
154315.362993513, 293030.414270183, 35400.7075418873, 61389.4811620766,
1052195.59004308), `2016-07-01` = c(256692.139087568, 105309.022725613,
251793.570894072, 23245.2160336776, 45525.4213585196, 820502.422580008
), `2016-08-01` = c(473379.365918433, 100168.559639524, 186741.222637413,
31521.7915518086, 55076.6485031421, 1041982.37829743), `2016-09-01` = c(490091.694067029,
131598.231274657, 318420.372833846, 24057.2447362826, 56038.1064247602,
993577.092222938), `2016-10-01` = c(406522.567301472, 128957.624222936,
261565.423872617, 33422.3089757409, 57736.9698688817, 678066.590110671
), `2016-11-01` = c(477084.744242994, 133275.014021741, 284441.459082257,
30410.5664089452, 56822.1141268123, 1064886.69561047), `2016-12-01` = c(521829.584534476,
97983.5318536767, 208008.005137491, 31289.318, 52479.0371167326,
843310.325681422), `2017-01-01` = c(273501.342043654, 80012.2533117421,
195121.917662579, 22666.14, 54259.0136360474, 668189.841756365
), `2017-02-01` = c(685662.788776875, 127218.360798343, 242264.578372733,
25834.168, 53822.8322244458, 838142.3930596), `2017-03-01` = c(391139.806441577,
165270.729667372, 344475.891325452, 24324.698, 51447.0694269768,
1163066.49036161), `2017-04-01` = c(513219.288877507, 117680.874763073,
225660.11443535, 25882.934, 50427.8355378495, 705879.756352247
), `2017-05-01` = c(588841.848447216, 85962.4107041087, 205370.047642541,
24528.452, 50265.0630719988, 1172193.61025203), `2017-06-01` = c(592141.397064366,
186725.961678379, 239468.615831021, 31486.9, 48270.5745992247,
966639.315719368), `2017-07-01` = c(401320.540832368, 100871.630859688,
262648.979237381, 25590.72, 56131.8936476386, 531673.227676397
), `2017-08-01` = c(641511.11756313, 136289.088301959, 270709.662423172,
32773.69, 69511.1772105425, 1202000.7904115), `2017-09-01` = c(526890.631147415,
139847.451118091, 222951.630747679, 27480.974, 60645.622041419,
775667.840682767), `2017-10-01` = c(643121.78985178, 180025.223477481,
284514.628237396, 39847.288, 74247.6270395629, 1116881.49653723
), `2017-11-01` = c(641964.06004378, 170480.722936376, 236539.486734265,
33211.102, 65337.5560467566, 928064.977296857), `2017-12-01` = c(701220.126734004,
147016.439282975, 262372.515457345, 19004.936, 63303.3621122444,
886702.202033658), `2018-01-01` = c(522948.466272953, 94834.4834012872,
185624.076471366, 27964.772, 54902.1470296058, 394395.594443823
), `2018-02-01` = c(655763.722465697, 149547.490883877, 161965.604862219,
25338.488, 50546.0076345216, 734257.598467733), `2018-03-01` = c(614499.617793727,
179165.564404258, 380965.227731128, 32299.332, 52258.5402998062,
1048116.95924033), `2018-04-01` = c(587274.401556706, 158130.345063145,
130088.683628109, 32235.478, 51718.8379910518, 503405.505894426
), `2018-05-01` = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), `2018-06-01` = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_)), .Names = c("Brand",
"2016-05-01", "2016-06-01", "2016-07-01", "2016-08-01", "2016-09-01",
"2016-10-01", "2016-11-01", "2016-12-01", "2017-01-01", "2017-02-01",
"2017-03-01", "2017-04-01", "2017-05-01", "2017-06-01", "2017-07-01",
"2017-08-01", "2017-09-01", "2017-10-01", "2017-11-01", "2017-12-01",
"2018-01-01", "2018-02-01", "2018-03-01", "2018-04-01", "2018-05-01",
"2018-06-01"), row.names = c(NA, 6L), class = "data.frame")
>
答案 0 :(得分:2)
library(dplyr)
df %>% group_by(Brand) %>%
mutate(row=case_when(row_number()==1 ~ "Upload1", row_number()==2 ~ "Upload2",
row_number()==3 ~ "Upload3", TRUE ~ as.character(x))) %>%
gather(Month, value, -Brand, -row) %>% spread(row, value) %>% ungroup() %>%
mutate(SD=apply(.[(3:5)],1,sd), Month = gsub('X','',gsub('\\.','/',Month)))
# A tibble: 8 x 6
Brand Month Upload1 Upload2 Upload3 SD
<chr> <chr> <int> <int> <int> <dbl>
1 x 04/2017 10 11 17 3.79
2 x 05/2017 11 11 13 1.15
3 x 06/2017 9 8 8 0.577
4 x 07/2017 7 9 9 1.15
5 y 04/2017 10 14 15 2.65
6 y 05/2017 12 12 18 3.46
7 y 06/2017 4 6 8 2
8 y 07/2017 9 9 9 0
df <- read.table(text="
Brand 04/2017 05/2017 06/2017 07/2017
x 10 11 9 7
x 11 11 8 9
x 17 13 8 9
y 10 12 4 9
y 14 12 6 9
y 15 18 8 9
",header=T, stringsAsFactors = F)
答案 1 :(得分:2)
使用data.table
语法并使用melt()
和dcast()
,它将变成:
library(data.table)
melt(setDT(current_table), id.vars = "Brand", variable.name = "Month")[
, SD := round(sd(value), 2L), by = .(Brand, Month)][
, dcast(.SD, Brand + Month + SD ~ rowid(Brand, Month, prefix = "Upload"))]
请注意,SD
以长格式计算。因此,它与上传的数量无关。
Brand Month SD Upload1 Upload2 Upload3 1: x 04/2017 3.79 10 11 17 2: x 05/2017 1.15 11 11 13 3: x 06/2017 0.58 9 8 8 4: x 07/2017 1.15 7 9 9 5: y 04/2017 2.65 10 14 15 6: y 05/2017 3.46 12 12 18 7: y 06/2017 2.00 4 6 8 8: y 07/2017 0.00 9 9 9
library(data.table)
current_table <- fread(
"Brand 04/2017 05/2017 06/2017 07/2017
x 10 11 9 7
x 11 11 8 9
x 17 13 8 9
y 10 12 4 9
y 14 12 6 9
y 15 18 8 9 "
)