使用R

时间:2018-06-01 17:02:43

标签: r function loops

使用df,我正在创建一个新数据框(final.df),其中startdateenddate之间的每个日期都有一行df } datadframe。

df <- data.frame(claimid = c("123A", 
                             "125B", 
                             "151C", 
                             "124A", 
                             "325C"),
                 startdate = as.Date(c("2018-01-01", 
                                       "2017-05-20",
                                       "2017-12-15",
                                       "2017-11-05",
                                       "2018-02-06")),
                 enddate = as.Date(c("2018-01-06", 
                                     "2017-06-21",
                                     "2018-01-02",
                                     "2017-11-15",
                                     "2018-02-18")))

下面的嵌套函数是我目前用于创建final.df的函数,但是当循环数十万个声明时,这种创建final.df的方法需要数小时才能运行。我正在寻找可以更有效地创建final.df的替代方案。

claim_level <- function(a) {
  specific_row <- df[a, ]
  dates <- seq(specific_row$startdate, specific_row$enddate, by="days")
  day_level <- function(b) {
    day <- dates[b]
    data.frame(claimid = specific_row$claimid, date = day)
  }
  do.call("rbind", lapply(c(1:length(dates)), function(b) day_level(b))) 
}
final.df <- do.call("rbind", lapply(c(1:nrow(df)), function(a) claim_level(a))) 

print(subset(final.df, claimid == "123A"))

#claimid    date
#123A       2018-01-01
#123A       2018-01-02
#123A       2018-01-03
#123A       2018-01-04
#123A       2018-01-05
#123A       2018-01-06

3 个答案:

答案 0 :(得分:4)

您可以使用gather中的tidyr转换为长格式,然后使用pad中的padr在开始日期和结束日期之间创建新的日期行。 group = "claimid"参数允许您指定分组变量:

library(dplyr)
library(tidyr)
library(padr)

df %>%
  gather(var, date, -claimid) %>%
  pad(group = "claimid") %>%
  select(-var)

data.table提高效率:

library(data.table)
setDT(df)[,.(date = seq(startdate, enddate, "days")), claimid]

<强>结果:

   claimid       date
1     123A 2018-01-01
2     123A 2018-01-02
3     123A 2018-01-03
4     123A 2018-01-04
5     123A 2018-01-05
6     123A 2018-01-06
7     124A 2017-11-05
8     124A 2017-11-06
9     124A 2017-11-07
10    124A 2017-11-08
11    124A 2017-11-09
12    124A 2017-11-10
13    124A 2017-11-11
14    124A 2017-11-12
15    124A 2017-11-13
16    124A 2017-11-14
17    124A 2017-11-15
18    125B 2017-05-20
19    125B 2017-05-21
20    125B 2017-05-22
...

<强>基准:

初始化函数:

library(tidyverse)
library(padr)
library(data.table)

# OP's function
claim_level <- function(a) {
  specific_row <- df[a, ]
  dates <- seq(specific_row$startdate, specific_row$enddate, by="days")
  day_level <- function(b) {
    day <- dates[b]
    data.frame(claimid = specific_row$claimid, date = day)
  }
  do.call("rbind", lapply(c(1:length(dates)), function(b) day_level(b))) 
}

OP_f = function(){
  do.call("rbind", lapply(c(1:nrow(df)), function(a) claim_level(a))) 
}

# useR's tidyverse + padr
f1 = function(){
  df %>%
    gather(var, date, -claimid) %>%
    pad(interval = "day", group = "claimid") %>%
    select(-var)
}

# useR's data.table
DT = df
setDT(DT)

f2 = function(){
  DT[,.(date = seq(startdate, enddate, "days")), claimid]
}

# Moody_Mudskipper's Base R
f3 = function(){
  do.call(rbind,
          Map(function(claimid, startdate, enddate)
            data.frame(claimid, date=as.Date(startdate:enddate, origin = "1970-01-01")),
            df$claimid, df$startdate, df$enddate))
}

# Moody_Mudskipper's tidyverse
f4 = function(){
  df %>% 
    group_by(claimid) %>% 
    mutate(date = list(as.Date(startdate:enddate, origin = "1970-01-01"))) %>%
    select(1, 4) %>% 
    unnest %>%
    ungroup
}

# MKR's tidyr expand
f5 = function(){
  df %>% 
    group_by(claimid) %>%
    expand(date = seq(startdate, enddate, by="day"))
}

检查是否相同:

> identical(OP_f() %>% arrange(claimid), data.frame(f1()))
[1] TRUE
> identical(OP_f(), data.frame(f2()))
[1] TRUE
> identical(OP_f(), data.frame(f3()))
[1] TRUE
> identical(OP_f(), data.frame(f4()))
[1] TRUE
> identical(OP_f() %>% arrange(claimid), data.frame(f5()))
[1] TRUE

基准测试结果:

library(microbenchmark)
microbenchmark(OP_f(), f1(), f2(), f3(), f4(), f5())

Unit: milliseconds
   expr       min        lq      mean    median        uq        max neval
 OP_f() 26.421534 27.697194 30.342682 28.981143 31.537396  58.071238   100
   f1() 36.133364 38.179196 40.749812 39.870931 41.367655  58.428888   100
   f2()  1.005843  1.261449  1.450633  1.383232  1.559689   4.058900   100
   f3()  2.373679  2.534148  2.786888  2.633035  2.797452   6.941421   100
   f4() 22.659097 23.341435 25.275457 24.111411 26.499893  40.840061   100
   f5() 46.445622 48.148606 52.565480 51.185478 52.845829 176.912276   100

data.table是速度方面的赢家,@ Moody_Mudskipper的Base R解决方案是第二好的。虽然padr::padtidyr::expand似乎是最方便的,但它们也是最慢的(甚至比OP的原始程序还慢)。

答案 1 :(得分:2)

在基地people = int(input("Will you be travelling by yourself (1), or as a group of two (2)?: ")) while people: if people == 1: print("\nAh, a holiday for one! How adventurous.") break elif people == 2: print("\nOoh, bringing a friend! Sounds like fun!") break else: print("\nPlease enter either 1 or 2 to determine the number of travellers.") people = int(input("Will you be travelling by yourself (1), or as a group of two (2)?: "))

R

仅使用do.call(rbind, Map(function(claimid, startdate, enddate) data.frame(claimid, date=as.Date(startdate:enddate, origin = "1970-01-01")), df$claimid, df$startdate, df$enddate)) # claimid date # 1 123A 2018-01-01 # 2 123A 2018-01-02 # 3 123A 2018-01-03 # 4 123A 2018-01-04 # 5 123A 2018-01-05 # 6 123A 2018-01-06 #...

tidyverse

答案 2 :(得分:2)

一种选择是使用tidyr::expand功能在startdateenddate之间展开行。

library(tidyverse)
df %>% group_by(claimid) %>%
  expand(date = seq(startdate, enddate, by="day")) %>%
  as.data.frame()

#    claimid       date
# 1     123A 2018-01-01
# 2     123A 2018-01-02
# 3     123A 2018-01-03
# 4     123A 2018-01-04
# 5     123A 2018-01-05
# 6     123A 2018-01-06
# 7     124A 2017-11-05
# 8     124A 2017-11-06
# 9     124A 2017-11-07
# 10    124A 2017-11-08
# 11    124A 2017-11-09
# 12    124A 2017-11-10
#
#  70 more rows