我的数据框如下:
## Please copy following text in your clipboard (do not copy this line)
hid ,mid ,aprps,astart ,aend ,ax ,ay ,exph
10001,1000101,3 ,2012-01-01 00:00:00,2012-01-01 00:00:00,475465.6,1272272,41.55607
10001,1000101,3 ,2012-01-01 00:00:00,2012-01-01 08:00:00,475465.6,1272272,41.55607
10001,1000101,4 ,2012-01-01 08:00:00,2012-01-01 08:15:00,475465.6,1272272,41.55607
10001,1000101,3 ,2012-01-01 08:15:00,2012-01-01 09:15:00,475465.6,1272272,41.55607
10001,1000101,4 ,2012-01-01 09:15:00,2012-01-01 09:30:00,475465.6,1272272,41.55607
10001,1000101,3 ,2012-01-01 09:30:00,2012-01-02 00:00:00,475465.6,1272272,41.55607
10001,1000102,3 ,2012-01-01 00:00:00,2012-01-01 00:00:00,475465.6,1272272,41.55607
10001,1000102,3 ,2012-01-01 00:00:00,2012-01-01 07:30:00,475465.6,1272272,41.55607
10001,1000102,4 ,2012-01-01 07:30:00,2012-01-01 07:50:00,475465.6,1272272,41.55607
10001,1000102,1 ,2012-01-01 07:50:00,2012-01-01 11:00:00,475465.6,1272272,41.55607
10001,1000102,4 ,2012-01-01 11:00:00,2012-01-01 11:20:00,475465.6,1272272,41.55607
10001,1000102,3 ,2012-01-01 11:20:00,2012-01-01 14:00:00,475465.6,1272272,41.55607
10001,1000102,4 ,2012-01-01 14:00:00,2012-01-01 14:20:00,475465.6,1272272,41.55607
10001,1000102,1 ,2012-01-01 14:20:00,2012-01-01 17:00:00,475465.6,1272272,41.55607
10001,1000102,4 ,2012-01-01 17:00:00,2012-01-01 17:20:00,475465.6,1272272,41.55607
10001,1000102,3 ,2012-01-01 17:20:00,2012-01-02 00:00:00,475465.6,1272272,41.55607
10001,1000103,3 ,2012-01-01 00:00:00,2012-01-01 00:00:00,475465.6,1272272,41.55607
10001,1000103,3 ,2012-01-01 00:00:00,2012-01-01 08:00:00,475465.6,1272272,41.55607
10001,1000103,4 ,2012-01-01 08:00:00,2012-01-01 12:00:00,475465.6,1272272,41.55607
10001,1000103,3 ,2012-01-01 12:00:00,2012-01-01 13:00:00,475465.6,1272272,41.55607
10001,1000103,4 ,2012-01-01 13:00:00,2012-01-01 19:00:00,475465.6,1272272,41.55607
10001,1000103,3 ,2012-01-01 19:00:00,2012-01-01 20:00:00,475465.6,1272272,41.55607
10001,1000103,4 ,2012-01-01 20:00:00,2012-01-01 23:00:00,475465.6,1272272,41.55607
10001,1000103,3 ,2012-01-01 23:00:00,2012-01-02 00:00:00,475465.6,1272272,41.55607
10001,1000104,3 ,2012-01-01 00:00:00,2012-01-01 00:00:00,475465.6,1272272,41.55607
10001,1000104,3 ,2012-01-01 00:00:00,2012-01-01 07:00:00,475465.6,1272272,41.55607
10001,1000104,4 ,2012-01-01 07:00:00,2012-01-01 07:30:00,473548.0,1279171,41.55607
10001,1000104,2 ,2012-01-01 07:30:00,2012-01-01 10:00:00,473548.0,1279171,41.55607
10001,1000104,4 ,2012-01-01 10:00:00,2012-01-01 10:30:00,475465.6,1272272,41.55607
10001,1000104,3 ,2012-01-01 10:30:00,2012-01-01 17:30:00,475465.6,1272272,41.55607
10001,1000104,4 ,2012-01-01 17:30:00,2012-01-01 17:45:00,484869.7,1270558,41.55607
10001,1000104,2 ,2012-01-01 17:45:00,2012-01-01 21:30:00,484869.7,1270558,41.55607
10001,1000104,4 ,2012-01-01 21:30:00,2012-01-01 21:45:00,475465.6,1272272,41.55607
10001,1000104,3 ,2012-01-01 21:45:00,2012-01-02 00:00:00,475465.6,1272272,41.55607
## Do not copy this line
您可以复制以上文本并使用df
包将其导入为{psych}
:
install.packages("psych")
library(psych)
# Please copy above text and run following
df <- read.clipboard(header=TRUE, sep=",")
我需要从df
获得的是:
exph
的总和,这些行在aprps==4
和上一行中提取aprps==4
有多行,请按mid
组重复此行exph
和对应的hid
的总和为了说明这一点,我目前正在基于两个循环使用以下脚本:
library(tidyverse)
calc <- function(i) {
## Extract records by "mid" excluding the first records
temp <<- df %>% filter(mid==i) %>% filter(row_number()>1)
## Extract row number of "aprps==4"
r.aprps <- which(temp$aprps==4)
## Repeat operation by two pairs of rows based on "r.aprps"
for (j in 1:length(r.aprps)) {
## Extract movement
temp2 <<- temp[c((r.aprps[j]-1):r.aprps[j]),]
## Other operations in actual data set (jsut put example)
exp <- data.frame(mid=unique(temp2$mid),expsum=sum(temp2$exph))
## Store PPA in list
if (lp==1 & j==1) {
df.exp <<- exp
} else {
df.exp <<- rbind(df.exp,exp)
}
}
}
## Set loop conditions
list.mid <- unique(df$mid)
nloop <- length(list.mid)
## Initialize df.exp
df.exp <- data.frame(matrix(vector(),0,2,
dimnames=list(c(),c("mid","expsum"))),
stringsAsFactors=F)
## Loop to store PPA in list
for (lp in 1:nloop) {
calc(list.mid[lp])
}
但是,由于实际数据帧df
包含大约40,000条记录,而实际操作包含更复杂的计算,因此需要30多个小时。我试图找到缩短操作的方法,现在尝试使用map
中的purrr
函数将每个操作存储在嵌套的数据帧中,而不是每次都在循环操作中替换变量。>
以下是我要构建的脚本,但是无法达到所需的输出。
## Store df by mid into list
nest <- df %>% group_by(mid) %>% nest()
## Extract row number with "aprps==4"
nest2 <- nest %>% mutate(row.aprps4=map(data,~which(.$aprps==4)))
## Obtain row numbers to extract by movement
nest3 <- nest2 %>% mutate(row.aprps4_1=map(data,~data.frame(rm1=which(.$aprps==4)-1)),
row.aprps4_2=map(data,~data.frame(rm1=which(.$aprps==4))))
## How to extract two pairs of records based on row.aprps4_1 and row.aprps4_1 and store sum of exph?
Some trials:
# It works but cannot extract records using two variables (row.aprps4_1 and .._2)
nest3 %>% mutate(move=map2(data,row.aprps4_1,~filter(.x,seq_len(nrow(.x))%in%.y)))
# Using pmap to specify range of filtering by two variables but does not work
nest4 %>% pmap(data,row.move1,row.move2,~filter(..1,seq_len(nrow(..1))%in%..2))
# Using double map function instead of double loop but does not work
pmap(nest4$data,nest4$row.move1,nest4$row.move2,~filter(..1,seq_len(nrow(..1))%in%c(..2:..3)))
您是否有建议使手术紧固?
我更喜欢使用map
函数来了解它,但是也欢迎其他选择。
我还发现this post与该问题类似,但无法解决如何基于动态变量r.aprpr4_1
和_2
提取两行的问题。
=====更新:已解决问题=====
我可以通过以下脚本解决问题:
## Convert df into nested data frame by `mid`
nest <- df %>% group_by(mid) %>% nest()
## Obtain row numbers to extract aprps==4
nest2 <- nest %>% mutate(r=map(data,~which(.$aprps==4)))
## Split r and expand record
nest3 <- nest2 %>% unnest(r,.drop=FALSE)
## Extract pairs of movement
nest4 <- nest3 %>% mutate(pair=map2(data,r,~filter(.x,seq_len(nrow(.x))%in%c((.y-1):.y)))) %>% dplyr::select(mid,pair)
要点是:
unnest()
通过从aprps==4
提取的向量扩展每个记录(不能应用.x%in%.y
,其中.y
具有两个以上的长度)mutate
是应用map2
所必需的(不接受诸如nest3 %>% map2(a,b,~f(.x,.y...))
之类的代码)非常感谢以下帖子以获取此解决方案:
答案 0 :(得分:1)
由于您提到也欢迎使用其他替代方法,请考虑以R为基数。初始设置(非购买要求)衍生出几个问题:
原始代码的最大问题之一是在循环中使用rbind
,这导致内存中的过度复制,如本SO线程Replace rbind in for-loop with lapply? (2nd circle of hell)和Patrick Burn的{{3}所解释的那样}。要解决此问题,请构建一个附加在循环外部的数据帧列表。
重复使用作用域分配<<-
从本地函数内部影响全局环境似乎是不必要的,尤其是因为 temp 对象已被每个循环替换因此只有最后一次迭代会保持。通常不鼓励使用此运算符,因为调整了全局变量后很难进行调试。返回一个对象时,最好处理函数。
您可以在调用df.exp
之前初始化一个空的数据帧calc()
,但是用<<-
在循环中覆盖它。通常,在分配一个空矩阵或数据帧之后,可以按循环内的行进行分配,但这没有完成。
通过unique()
或by()
替换循环split()
的值,这也避免了在函数内部使用dplyr::filter()
。顺便说一句,R Internal - Circle 2: Growing Objects使用管道,%>%
循环内。
而不是for
循环,而是使用 apply 族在迭代后构建对象列表,例如lapply
,这避免了记账{{1} }循环需要初始化一个空列表并为其分配元素(尽管使用此方法没有错)。另外,通过这种方式,可以避免在函数内使用for
。
基本R (使用<<-
,by
和lapply
)
do.call
由于calc <- function(sub) {
## Extract records by "mid" excluding the first records
temp <- sub[2:nrow(temp),]
## Extract row number of "aprps==4"
r.aprps <- which(temp$aprps==4)
## Store exp dataframes in list
subdf_list <- lapply(1:length(r.aprps), function(j) {
## Extract movement by two pairs of rows based on "r.aprps"
temp2 <- temp[c((r.aprps[j]-1):r.aprps[j]),]
## Other operations in actual data set (just put example)
exp <- data.frame(mid=unique(temp2$mid), expsum=sum(temp2$exph))
return(exp)
})
df.exp <- do.call(rbind, subdf_list)
return(df.exp)
}
## subset by mid and pass subsets to calc()
df_list <- by(df, df$mid, calc)
## append all in final object
final_df <- do.call(rbind, df_list)
有一些performance challenges,因此请考虑将第三方软件包替换为base::rbind.data.frame
,例如do.call(rbind, ...)
和dplyr::bind_rows()
。
data.table::rbindlist()