以下是我的函数:
#' Resolve overlapping bookings into a per-night occupancy table
#'
#' Processes the rows of `testdf` in order. For each booking, every night
#' from the `check_in` date up to (but excluding) the `check_out` date is
#' checked against the nights already claimed by earlier rows:
#'
#' * an unclaimed night is recorded for the current booking;
#' * a night claimed by the same `connection_source`, or by a booking whose
#'   `created` timestamp is strictly earlier, is dropped from the current
#'   booking;
#' * otherwise the current booking takes the night over and the booking that
#'   originally claimed it has its `check_out` cut back.
#'
#' Afterwards the booking's `check_in` / `check_out` are rewritten from the
#' nights it kept (both become `NA` when it kept none).
#'
#' @param testdf Data frame (or tibble) with columns `check_in`, `check_out`,
#'   `created` (date-times) and `connection_source`.
#' @return A list with two data frames: `testdf` (the adjusted bookings) and
#'   `occupied_dates_df` (one row per claimed night with columns
#'   `OccupiedDate`, `Source`, `Created`, `Overwritten_cnt`, `testdf_row`).
correct_occupancy <- function(testdf) {
  # Zero-row template. Column names and types mirror the rows appended below
  # so that an empty input still yields a result with the same schema.
  occupied_dates_df <- data.frame(
    OccupiedDate = as.Date(character()),
    Source = integer(),
    Created = as.POSIXct(character()),
    Overwritten_cnt = integer(),
    testdf_row = integer()
  )

  # seq_len() keeps the loop from running when testdf has no rows.
  for (i in seq_len(nrow(testdf))) {
    first_night <- as.Date(testdf$check_in[i])
    last_night <- as.Date(testdf$check_out[i]) - 1

    # A booking without a valid, non-empty night range claims nothing and is
    # left untouched (seq() would otherwise stop with a "wrong sign" error).
    if (is.na(first_night) || is.na(last_night) || last_night < first_night) {
      next
    }

    nights <- seq(first_night, last_night, by = "days")
    # Set once this booking has been extended by a take-over (at most once).
    extended <- FALSE

    # The index range is fixed here; a night appended during a take-over is
    # not visited by this loop.
    for (j in seq_along(nights)) {
      night <- nights[j]
      lc <- match(night, occupied_dates_df$OccupiedDate)

      if (is.na(lc)) {
        # Night is free: claim it for the current booking.
        occupied_dates_df <- rbind(
          occupied_dates_df,
          data.frame(
            OccupiedDate = night,
            Source = testdf$connection_source[i],
            Created = testdf$created[i],
            Overwritten_cnt = 0L,
            testdf_row = i
          )
        )
        next
      }

      same_source <- occupied_dates_df$Source[lc] == testdf$connection_source[i]
      if (same_source || occupied_dates_df$Created[lc] < testdf$created[i]) {
        # The existing claim wins: drop this night from the current booking.
        nights[j] <- NA
        next
      }

      # Take-over: the current booking replaces the existing claim.
      # NOTE(review): testdf_row of the claim is not updated here, so it keeps
      # pointing at the booking that first claimed the night - confirm intent.
      holder_row <- occupied_dates_df$testdf_row[lc]
      if (!extended) {
        extended <- TRUE
        # Append the date one second before the displaced booking's check-out
        # so the current booking is stretched to cover it.
        nights <- c(nights, as.Date(testdf$check_out[holder_row] - 1))
      }
      testdf$check_out[holder_row] <- nights[1]
      occupied_dates_df$Source[lc] <- testdf$connection_source[i]
      occupied_dates_df$Created[lc] <- testdf$created[i]
      # Count the overwrite on this night only, not on the whole column.
      occupied_dates_df$Overwritten_cnt[lc] <-
        occupied_dates_df$Overwritten_cnt[lc] + 1L
    }

    kept <- nights[!is.na(nights)]
    if (length(kept) > 0) {
      testdf$check_in[i] <- min(kept)
      testdf$check_out[i] <- max(kept) + 1
    } else {
      testdf$check_in[i] <- NA
      testdf$check_out[i] <- NA
    }
  }

  list(
    testdf = as.data.frame(testdf),
    occupied_dates_df = as.data.frame(occupied_dates_df)
  )
}
我按如下方式调用该函数:
# Run correct_occupancy() once per (FSID, PID) combination. The input is a
# grouped tibble, so the grouping is dropped first: split() then hands plain
# per-group data frames to the function. The result is a named list holding
# one list(testdf, occupied_dates_df) per group.
df_list1 <- testdf %>%
  dplyr::ungroup() %>%
  split(list(.$FSID, .$PID), drop = TRUE) %>%
  map(correct_occupancy)
输入数据:
structure(list(FSID = c(10729L, 10729L, 10729L, 10729L, 10729L,
10729L, 10729L, 10729L, 10729L, 10729L), RID = 246:255, PID = c(52L,
52L, 52L, 52L, 52L, 52L, 52L, 52L, 52L, 52L), connection_source = c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), check_in = structure(c(1496477160,
1518264360, 1500390360, 1500426360, 1500538020, 1521814020, 1500624360,
1500710760, 1500797160, 1524139560), class = c("POSIXct", "POSIXt"
), tzone = ""), check_out = structure(c(1500091200, 1519362000,
1500523200, 1500436800, 1500609600, 1522555200, 1500696000, 1500782400,
1501473600, 1524369600), class = c("POSIXct", "POSIXt"), tzone = ""),
cancelled = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), created = structure(c(1500019607,
1500289617, 1500304017, 1500426413, 1500538021, 1500559624,
1500624418, 1500710819, 1500797219, 1501243611), class = c("POSIXct",
"POSIXt"), tzone = "")), .Names = c("FSID", "RID", "PID",
"connection_source", "check_in", "check_out", "cancelled", "created"
), row.names = c(NA, -10L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = c("FSID", "PID"), drop = TRUE, indices = list(
0:9), group_sizes = 10L, biggest_group_size = 10L, labels = structure(list(
FSID = 10729L, PID = 52L), row.names = c(NA, -1L), class = "data.frame", vars = c("FSID",
"PID"), drop = TRUE, .Names = c("FSID", "PID")))
以下是我的问题:
1)如果函数只返回一个数据框,代码可以运行,但函数被执行的次数等于输入数据集中的分组数。例如,输入数据有 28 行、5 个分组时,结果为 140 行(结果应为 28 行)。
2)如何让在 dplyr 管道中调用的函数返回一个列表(即我想让函数返回 2 个数据框)?
请指出我哪里出错了。