使用动态列名称按组计算rowSums

时间:2017-03-24 10:23:36

标签: r data.table reshape dcast

我想按生产所用的钻井类型,计算每种化石燃料的产量份额。起点是下面这个 data.table:

library(data.table)
# Sample data: five rows for company key 1380 in a single quarter, covering
# the drill types U, D, V and H (U appears twice). The object is assembled
# by hand with the data.table class and a `sorted` (key) attribute.
dt <- structure(
  list(
    Global.Company.Key = rep(1380L, 5L),
    Calendar.Data.Year.and.Quarter = structure(rep(2000, 5L), class = "yearqtr"),
    Current.Assets.Total = rep(2218, 5L),
    DRILL_TYPE = c("U", "D", "V", "H", "U"),
    DI.Oil.Prod.Quarter = c(
      18395.6792379842, 1301949.24041659, 235.311086392291,
      27261.8049684835, 4719.27956989249
    ),
    DI.Gas.Prod.Quarter = c(
      1600471.27107983, 4882347.22928982, 2611.60215053765,
      9634.76418242493, 27648.276603634
    )
  ),
  row.names = c(NA, -5L),
  class = c("data.table", "data.frame"),
  sorted = c("Global.Company.Key", "Calendar.Data.Year.and.Quarter")
)
然后,我可以根据钻井类型计算两种化石燃料类型中每种燃料的总产量。

# Oil production per drilling type, plus a Total.Sum column that adds up the
# four drill-type columns of each row.
dcast(
  dt,
  Global.Company.Key + Calendar.Data.Year.and.Quarter + Current.Assets.Total ~ DRILL_TYPE,
  value.var = "DI.Oil.Prod.Quarter",
  fun.aggregate = list(sum)
)[
  ,
  Total.Sum := rowSums(.SD, na.rm = TRUE),
  by = .(Global.Company.Key, Calendar.Data.Year.and.Quarter),
  .SDcols = c("U", "D", "V", "H")
][]

# Gas production per drilling type, plus a Total.Sum column that adds up the
# four drill-type columns of each row.
dcast(
  dt,
  Global.Company.Key + Calendar.Data.Year.and.Quarter + Current.Assets.Total ~ DRILL_TYPE,
  value.var = "DI.Gas.Prod.Quarter",
  fun.aggregate = list(sum)
)[
  ,
  Total.Sum := rowSums(.SD, na.rm = TRUE),
  by = .(Global.Company.Key, Calendar.Data.Year.and.Quarter),
  .SDcols = c("U", "D", "V", "H")
][]
# Combined cast of both fossil fuels; the new columns are named dynamically
# from the value variable and the drill type.
# NOTE(review): no .SDcols is given, so .SD holds every column that is not in
# `by` -- Total.Sum therefore mixes oil with gas and also appears to include
# Current.Assets.Total. This is the problem the question is asking about.
dcast(
  dt,
  Global.Company.Key + Calendar.Data.Year.and.Quarter + Current.Assets.Total ~ DRILL_TYPE,
  value.var = c("DI.Oil.Prod.Quarter", "DI.Gas.Prod.Quarter"),
  fun.aggregate = list(sum)
)[
  ,
  Total.Sum := rowSums(.SD, na.rm = TRUE),
  by = .(Global.Company.Key, Calendar.Data.Year.and.Quarter)
][]

有没有人知道如何计算不同化石燃料类型的总和?正如您在dcast命令的最后一种情况中所看到的,它会连接新列的名称,因此无法通过直接选择列来对列进行分组。

基本上,我想得到最后一个例子的输出,但要额外增加两列,分别给出石油和天然气的总产量。然后,我想用这些数据计算四种钻井类型各自在石油和天然气产量中所占的份额。

3 个答案:

答案 0 :(得分:1)

下面是另一种使用 data.table 的 dcast() 的方法,其速度大约是 OP 的 merge 方法的两倍。

从宽到长完成重塑

# Reshape wide -> long: every column whose name starts with "DI" is stacked
# into variable / value pairs; the remaining columns are carried along as ids.
molten <- melt(data = dt, measure.vars = patterns("^DI"))
molten
#    Global.Company.Key Calendar.Data.Year.and.Quarter Current.Assets.Total DRILL_TYPE            variable        value
# 1:               1380                           2000                 2218          U DI.Oil.Prod.Quarter   18395.6792
# 2:               1380                           2000                 2218          D DI.Oil.Prod.Quarter 1301949.2404
# 3:               1380                           2000                 2218          V DI.Oil.Prod.Quarter     235.3111
# 4:               1380                           2000                 2218          H DI.Oil.Prod.Quarter   27261.8050
# 5:               1380                           2000                 2218          U DI.Oil.Prod.Quarter    4719.2796
# 6:               1380                           2000                 2218          U DI.Gas.Prod.Quarter 1600471.2711
# 7:               1380                           2000                 2218          D DI.Gas.Prod.Quarter 4882347.2293
# 8:               1380                           2000                 2218          V DI.Gas.Prod.Quarter    2611.6022
# 9:               1380                           2000                 2218          H DI.Gas.Prod.Quarter    9634.7642
#10:               1380                           2000                 2218          U DI.Gas.Prod.Quarter   27648.2766

计算总计

# One total per company / quarter / assets / fuel variable. The constant
# DRILL_TYPE label "Total.Sum" marks these rows as totals.
totals <- molten[
  ,
  list(DRILL_TYPE = "Total.Sum", value = sum(value)),
  by = c(
    "Global.Company.Key",
    "Calendar.Data.Year.and.Quarter",
    "Current.Assets.Total",
    "variable"
  )
]
totals
#   Global.Company.Key Calendar.Data.Year.and.Quarter Current.Assets.Total            variable DRILL_TYPE   value
#1:               1380                           2000                 2218 DI.Oil.Prod.Quarter  Total.Sum 1352561
#2:               1380                           2000                 2218 DI.Gas.Prod.Quarter  Total.Sum 6522713

将总计追加到明细数据

# Stack the total rows underneath the detail rows. Columns are matched by
# name, because `totals` has its columns in a different order than `molten`.
molten <- rbind(molten, totals, use.names = TRUE)
molten
#    Global.Company.Key Calendar.Data.Year.and.Quarter Current.Assets.Total DRILL_TYPE            variable        value
# 1:               1380                           2000                 2218          U DI.Oil.Prod.Quarter   18395.6792
# 2:               1380                           2000                 2218          D DI.Oil.Prod.Quarter 1301949.2404
# 3:               1380                           2000                 2218          V DI.Oil.Prod.Quarter     235.3111
# 4:               1380                           2000                 2218          H DI.Oil.Prod.Quarter   27261.8050
# 5:               1380                           2000                 2218          U DI.Oil.Prod.Quarter    4719.2796
# 6:               1380                           2000                 2218          U DI.Gas.Prod.Quarter 1600471.2711
# 7:               1380                           2000                 2218          D DI.Gas.Prod.Quarter 4882347.2293
# 8:               1380                           2000                 2218          V DI.Gas.Prod.Quarter    2611.6022
# 9:               1380                           2000                 2218          H DI.Gas.Prod.Quarter    9634.7642
#10:               1380                           2000                 2218          U DI.Gas.Prod.Quarter   27648.2766
#11:               1380                           2000                 2218  Total.Sum DI.Oil.Prod.Quarter 1352561.3153
#12:               1380                           2000                 2218  Total.Sum DI.Gas.Prod.Quarter 6522713.1433

从长到宽重塑

# Convert DRILL_TYPE to a factor whose levels follow the order of first
# appearance in the rows, so the cast columns come out in row order
# (with the totals last).
molten[, DRILL_TYPE := factor(DRILL_TYPE, levels = unique(DRILL_TYPE))]
# Cast long -> wide: one column per variable / DRILL_TYPE combination,
# summing rows that share the same combination.
dcast(molten, ... ~ variable + DRILL_TYPE, fun.aggregate = sum, value.var = "value")
#   Global.Company.Key Calendar.Data.Year.and.Quarter Current.Assets.Total DI.Oil.Prod.Quarter_U DI.Oil.Prod.Quarter_D
#1:               1380                           2000                 2218              23114.96               1301949
#   DI.Oil.Prod.Quarter_V DI.Oil.Prod.Quarter_H DI.Oil.Prod.Quarter_Total.Sum DI.Gas.Prod.Quarter_U DI.Gas.Prod.Quarter_D
#1:              235.3111               27261.8                       1352561               1628120               4882347
#   DI.Gas.Prod.Quarter_V DI.Gas.Prod.Quarter_H DI.Gas.Prod.Quarter_Total.Sum
#1:              2611.602              9634.764                       6522713

结果类似于使用OP merge()方法创建的结果(列顺序除外)。

基准

# Benchmark the two ways of building the wide table with per-fuel totals.
#   merge - OP's approach: cast both fuels, drop the other fuel's columns,
#           add a total column, then merge the oil and gas halves on the
#           three id columns.
#   aggr  - melt to long format, append aggregated "Total.Sum" rows with
#           rbind(), then cast once.
# Each expression is evaluated 100 times.
mb <- microbenchmark::microbenchmark(
  merge = merge(
    x = dcast(
      dt,
      Global.Company.Key + Calendar.Data.Year.and.Quarter + Current.Assets.Total  ~ DRILL_TYPE ,
      value.var =  c("DI.Oil.Prod.Quarter", "DI.Gas.Prod.Quarter"),
      fun = list(sum)
    )[, -grepl(glob2rx("DI.Gas.Prod.Quarter_*"), colnames(
      dcast(
        dt,
        Global.Company.Key + Calendar.Data.Year.and.Quarter + Current.Assets.Total  ~ DRILL_TYPE ,
        value.var =  c("DI.Oil.Prod.Quarter", "DI.Gas.Prod.Quarter"),
        fun = list(sum)
      )
    )), with = FALSE][, DI.Oil.Prod.Total.Sum := rowSums(.SD, na.rm = TRUE), by =
                        .(Global.Company.Key, Calendar.Data.Year.and.Quarter)][]
    ,
    y = dcast(
      dt,
      Global.Company.Key + Calendar.Data.Year.and.Quarter + Current.Assets.Total  ~ DRILL_TYPE ,
      value.var =  c("DI.Oil.Prod.Quarter", "DI.Gas.Prod.Quarter"),
      fun = list(sum)
    )[, -grepl(glob2rx("DI.Oil.Prod.Quarter_*"), colnames(
      dcast(
        dt,
        Global.Company.Key + Calendar.Data.Year.and.Quarter + Current.Assets.Total  ~ DRILL_TYPE ,
        value.var =  c("DI.Oil.Prod.Quarter", "DI.Gas.Prod.Quarter"),
        fun = list(sum)
      )
    )), with = FALSE][, DI.Gas.Prod.Total.Sum := rowSums(.SD, na.rm = TRUE), by =
                        .(Global.Company.Key, Calendar.Data.Year.and.Quarter)][]
    ,
    all.x = TRUE
    ,
    by = c(
      "Global.Company.Key",
      "Calendar.Data.Year.and.Quarter",
      "Current.Assets.Total"
    )
  ),
  aggr = {
    # Only the melt -> rbind(totals) -> dcast pipeline is timed here. A
    # second melt / := / dcast pass whose result was thrown away has been
    # dropped, because it only inflated the measured time of this expression.
    molten <- melt(dt, measure.vars = patterns("^DI"))
    molten <- rbind(molten, molten[, .(DRILL_TYPE = "Total.Sum", value = sum(value)), 
                                   by = .(Global.Company.Key, Calendar.Data.Year.and.Quarter, 
                                          Current.Assets.Total, variable)])
    molten[, DRILL_TYPE := forcats::fct_inorder(DRILL_TYPE)]
    dcast(molten, ... ~ variable + DRILL_TYPE, sum, value.var = "value")
  },
  times = 100L
)

请注意,merge 方法需要大约三倍的代码行。此外,它的运行时间大约是“聚合后 rbind”方法的两倍。

Unit: milliseconds
  expr       min        lq     mean   median       uq      max neval
 merge 20.298773 21.181559 22.13640 21.77682 22.59126 26.22265   100
  aggr  9.393847  9.806165 10.33053 10.07595 10.35460 20.11112   100

答案 1 :(得分:0)

不确定这是否是你想要的,是这样吗?

# Per drill type: total oil (so), total gas (sg) and their sum (tot).
dt %>%
  group_by(DRILL_TYPE) %>%
  summarise(
    so = sum(DI.Oil.Prod.Quarter),
    sg = sum(DI.Gas.Prod.Quarter),
    tot = so + sg
  )

修改

现在对重复的条目求和,并使用 dcast 生成单行结果

# Build a wide result with one column per drill type / fuel combination and
# append overall totals.
#   1. gather(): stack every column outside the Global.Company.Key:DRILL_TYPE
#      range into variable / value pairs.
#   2. unite(): combine DRILL_TYPE and variable into a single key column `temp`.
#   3. dcast(): spread `temp` into columns, summing duplicated entries.
#   4. mutate(): `so` / `sg` are sums over the columns of the ORIGINAL `dt`
#      whose names contain "Oil" / "Gas" (not over the reshaped data);
#      `tot` is their sum.
# NOTE(review): the packages providing %>%, gather(), unite(), mutate() and
# select() are not loaded in the visible code -- presumably dplyr and tidyr.
dt %>% 
gather(variable, value, -(Global.Company.Key:DRILL_TYPE)) %>%
unite(temp, DRILL_TYPE, variable) %>% dcast(... ~ temp, fun=sum,drop=FALSE) %>%
mutate(so=sum(select(dt,contains("Oil"))),sg=sum(select(dt,contains("Gas"))),tot=so+sg)

答案 2 :(得分:0)

我想出了一个答案,虽然它可能不够优雅,但能给出所需的输出。

# Wide table with one column per fuel / drill type plus a total per fuel.
#
# The cast is done once and reused (it was previously evaluated four times).
# The totals are restricted to the fuel's own columns via .SDcols: when the
# grouping was only by company and quarter, .SD still contained
# Current.Assets.Total, so that value was added into both totals.
id_cols <- c(
  "Global.Company.Key",
  "Calendar.Data.Year.and.Quarter",
  "Current.Assets.Total"
)

wide <- dcast(
  dt,
  Global.Company.Key + Calendar.Data.Year.and.Quarter + Current.Assets.Total ~ DRILL_TYPE,
  value.var = c("DI.Oil.Prod.Quarter", "DI.Gas.Prod.Quarter"),
  fun.aggregate = list(sum)
)

# Column names are generated by dcast(), so they are looked up by prefix.
oil_cols <- grep(glob2rx("DI.Oil.Prod.Quarter_*"), names(wide), value = TRUE)
gas_cols <- grep(glob2rx("DI.Gas.Prod.Quarter_*"), names(wide), value = TRUE)

# Oil half: id columns + oil columns + row-wise oil total.
oil <- wide[, c(id_cols, oil_cols), with = FALSE]
oil[, DI.Oil.Prod.Total.Sum := rowSums(.SD, na.rm = TRUE), .SDcols = oil_cols]

# Gas half: id columns + gas columns + row-wise gas total.
gas <- wide[, c(id_cols, gas_cols), with = FALSE]
gas[, DI.Gas.Prod.Total.Sum := rowSums(.SD, na.rm = TRUE), .SDcols = gas_cols]

merge(x = oil, y = gas, all.x = TRUE, by = id_cols)