从 dplyr 中调用的函数返回列表

时间:2018-04-05 11:13:35

标签: r dplyr plyr

以下是我的函数:

#' Resolve overlapping bookings into per-date occupancy
#'
#' Processes the rows of `testdf` in order. Each booking is expanded into the
#' calendar dates from `as.Date(check_in)` to `as.Date(check_out) - 1`, and
#' every date is looked up in a ledger of dates claimed by earlier rows:
#'
#' * unclaimed date: the current row claims it;
#' * claimed by the same `connection_source`, or by a booking whose `created`
#'   is earlier than the current row's: the date is dropped from the current
#'   booking;
#' * otherwise the current row takes the date over, the earlier booking's
#'   `check_out` is cut back, and the ledger entry's overwrite counter is
#'   incremented.
#'
#' Afterwards the row's `check_in` / `check_out` are rewritten to span the
#' dates it kept, or set to `NA` when it kept none.
#'
#' @param testdf Data frame with columns `check_in`, `check_out` (date-times),
#'   `connection_source` and `created`. May have zero rows.
#' @return A named list with two data frames: `testdf` (the adjusted
#'   bookings) and `occupied_dates_df` (one row per claimed date with columns
#'   `OccupiedDate`, `Source`, `Created`, `Overwritten_cnt`, `testdf_row`).
correct_occupancy <- function(testdf) {
  # Ledger of claimed dates. Column names and types match the rows appended
  # below, so an empty input yields a correctly shaped, zero-row result.
  occupied_dates_df <- data.frame(
    OccupiedDate = as.Date(character()),
    Source = integer(),
    Created = as.POSIXct(character()),
    Overwritten_cnt = integer(),
    testdf_row = integer()
  )

  # seq_len() keeps the loop from running when testdf has no rows.
  for (i in seq_len(nrow(testdf))) {
    first_night <- as.Date(testdf$check_in[i])
    last_night <- as.Date(testdf$check_out[i]) - 1

    # A booking with missing dates, or one that covers no night at all, gets
    # an empty span instead of making seq() fail with "wrong sign in 'by'".
    if (is.na(first_night) || is.na(last_night) || last_night < first_night) {
      nights <- as.Date(character())
    } else {
      nights <- seq(first_night, last_night, by = "days")
    }
    testspan <- data.frame(testspan = nights)

    # Whether the displaced booking's last night has already been appended to
    # this row's span (done at most once per row).
    span_extended <- FALSE

    # The index range is fixed here, so a row appended to `testspan` inside
    # the loop is not visited; it only takes part in min()/max() below.
    for (j in seq_len(nrow(testspan))) {
      lc <- match(testspan$testspan[j], occupied_dates_df$OccupiedDate)

      if (is.na(lc)) {
        # Date is free: claim it for the current row.
        occupied_dates_df <- rbind(
          occupied_dates_df,
          data.frame(
            OccupiedDate = testspan$testspan[j],
            Source = testdf$connection_source[i],
            Created = testdf$created[i],
            Overwritten_cnt = 0L,
            testdf_row = i
          )
        )
        next
      }

      same_source <- occupied_dates_df$Source[lc] == testdf$connection_source[i]
      if (same_source || occupied_dates_df$Created[lc] < testdf$created[i]) {
        # The existing claim wins: drop this date from the current booking.
        testspan[j, 1] <- NA
        next
      }

      # The current row takes the date over from the earlier booking.
      # NOTE(review): `testdf_row` of the ledger entry is left pointing at the
      # displaced booking, as in the original code -- confirm this is intended.
      prev_row <- occupied_dates_df$testdf_row[lc]
      if (!span_extended) {
        span_extended <- TRUE
        new_check_out <- data.frame(testspan = testdf$check_out[prev_row] - 1)
        testspan <- rbind(testspan, new_check_out)
      }
      testdf$check_out[prev_row] <- testspan$testspan[1]
      occupied_dates_df$Source[lc] <- testdf$connection_source[i]
      occupied_dates_df$Created[lc] <- testdf$created[i]
      # Count the overwrite on this ledger row only, not on the whole column.
      occupied_dates_df$Overwritten_cnt[lc] <-
        occupied_dates_df$Overwritten_cnt[lc] + 1L
    }

    testspan <- na.omit(testspan)
    if (nrow(testspan) != 0) {
      testdf$check_in[i] <- min(testspan$testspan)
      testdf$check_out[i] <- max(testspan$testspan) + 1
    } else {
      testdf$check_in[i] <- NA
      testdf$check_out[i] <- NA
    }
  }

  list(
    testdf = as.data.frame(testdf),
    occupied_dates_df = as.data.frame(occupied_dates_df)
  )
}

我按如下方式调用该函数:

# Split the bookings by FSID/PID combination (dropping unused combinations)
# and run correct_occupancy() on each piece; the result is a named list with
# one two-element list per group.
df_list1 <- testdf %>%
  split(list(.$FSID, .$PID), drop = TRUE) %>%
  map(correct_occupancy)

输入数据:

# Sample input as produced by dput(): a 10-row grouped tibble ("grouped_df")
# grouped by FSID and PID, containing a single group (FSID 10729, PID 52).
# Columns: FSID, RID, PID, connection_source (integer); check_in, check_out,
# created (POSIXct stored as seconds since the epoch, tzone ""); cancelled
# (numeric).
structure(list(FSID = c(10729L, 10729L, 10729L, 10729L, 10729L, 
10729L, 10729L, 10729L, 10729L, 10729L), RID = 246:255, PID = c(52L, 
52L, 52L, 52L, 52L, 52L, 52L, 52L, 52L, 52L), connection_source = c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), check_in = structure(c(1496477160, 
1518264360, 1500390360, 1500426360, 1500538020, 1521814020, 1500624360, 
1500710760, 1500797160, 1524139560), class = c("POSIXct", "POSIXt"
), tzone = ""), check_out = structure(c(1500091200, 1519362000, 
1500523200, 1500436800, 1500609600, 1522555200, 1500696000, 1500782400, 
1501473600, 1524369600), class = c("POSIXct", "POSIXt"), tzone = ""), 
    cancelled = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), created = structure(c(1500019607, 
    1500289617, 1500304017, 1500426413, 1500538021, 1500559624, 
    1500624418, 1500710819, 1500797219, 1501243611), class = c("POSIXct", 
    "POSIXt"), tzone = "")), .Names = c("FSID", "RID", "PID", 
"connection_source", "check_in", "check_out", "cancelled", "created"
), row.names = c(NA, -10L), class = c("grouped_df", "tbl_df", 
"tbl", "data.frame"), vars = c("FSID", "PID"), drop = TRUE, indices = list(
    0:9), group_sizes = 10L, biggest_group_size = 10L, labels = structure(list(
    FSID = 10729L, PID = 52L), row.names = c(NA, -1L), class = "data.frame", vars = c("FSID", 
"PID"), drop = TRUE, .Names = c("FSID", "PID")))

以下是我的问题:

1)如果函数只返回一个数据框,代码可以运行,但函数被执行的次数等于输入数据集中的组数。例如,输入数据有 28 行、分为 5 组时,结果有 140 行(正确结果应为 28 行)。

2)如何让在 dplyr 管道中调用的函数返回一个列表(即我想让函数同时返回 2 个数据框)?

请指出我哪里做错了。

0 个答案:

没有答案