Question

我有以下数据集，其中包含大约64000行：

    Trial.time Recording.time X.center Y.center  Area    Areachange Elongation   Distance.moved Movement.Moving...Center.point.
2      300.030          0.000 -49.1651  31.9676 0.917085    0.65113   0.851349              -                               -
22     300.696          0.666 -48.4404  31.9945 0.816206   0.715326   0.831207       0.725139                               1
24     300.763          0.733  -47.996  32.0696 0.834547   0.412688   0.856234       0.450784                               1
33     301.063          1.033 -47.6583  32.0598  0.75201   0.137563   0.716028       0.337775                               1
41     301.330          1.299 -47.3385  32.0139 0.843718   0.302638   0.838526       0.323117                               1
98     303.230          3.199 -47.3914  31.6981 0.944598    1.26558   0.847969        0.32022                               1
113    303.730          3.699 -47.3807  31.0614  0.86206    1.24724   0.761099       0.636771                               1
114    303.763          3.733 -47.1308  30.3858  1.00879     1.1005   0.809162        0.72036                               1
116    303.830          3.799 -47.1914  30.0551  1.01796   0.440201   0.831924       0.336155                               1

通常，它描述了特定Recording.time中对象的移动（Distance.Moved）。如果两个连续行的Recording.time小于0.035，则两行都属于一个单一的移动。相反，如果它更大，则时间点代表两个单独的运动。我的工作是确定每个动作的长度，因此有多少连续的行给出一个动作，并且总距离在动作中移动。我写了下面的代码，它有效但很慢，我想问你是否知道如何提高速度。

    time <- c()
j.final <- c()

#Go through all rows of the data.frame
for(i in 1:length(data2[,1])){
  i <- 1
  j <- 1
  if (!is.na(data2$Recording.time[i+1])){

    # As long as the distance between two consecutive time points is smaller than 0.035, increase the counter by one
    while (data2$Recording.time[i+1]-data2$Recording.time[i] <= 0.035){
      j <- j+1
      i <- i+1
    }
    # Save the number of consecutive time points
    j.final <- rbind(j.final,j)
    # Save the time of the last movement frame 
    time <- rbind(time,data2$Recording.time[j])
    # Delete the amount of rows that gave one single movement 
    data2 <- data2[-(1:j),]
  }
}   
final <- cbind(j.final,time)

#Same as above... Continouslz rows out of the data.frame
data2 <- data1
for (i in 1:length(j.final)){
  Dtotal <- sum(data2$Distance.moved[1:j.final[i]])
  distance <- rbind(distance, Dtotal)
  data2 <- data2[-(1:j.final[i]),]
}
final <- cbind(final,distance)
dimnames(final) <- list(NULL,c("Frames","Time","Distance"))
epicfinal <- as.data.frame(final)

最终结果如下（请不要介意速度）

  Frames  Time  Distance    velocity
1      1 0.033 0.0407652 0.001386017
2     18 0.666 1.4887506 0.911115367
3      3 0.799 0.0912680 0.009309336
4      7 1.066 0.3703880 0.088152344
5      2 1.166 0.0371303 0.002524860
6      3 1.299 0.1013617 0.010338893

Answer 1

正如zx8754指出的那样，使用lag（或者更好的是，他在data.table：shift）和cumsum函数中的快速实现很容易实现这一点。
我使用data.table包来提高速度（请注意，语法与经典data.frames有很大不同，与data.table一样，您可以在对表进行子集化时将表达式放在j参数中，而不是简单地选择data.frame）中的列。

library(data.table)

## VARIABLE CREATION:
# Create a column which indicates the lag between two observations
data$lag <- data$Recording.time-shift(data$Recording.time) 
data$lag[1] <- 0 # The first value is always NA: fix it
data$newmovement <- data$lag<0.035 # Binary variable: T if there's a new movement, F otherwise
data$movement_index <- cumsum(data$newmovement) # Index to identify the movement

## COMPUTATIONS:
# Use the data.table package for fast computations
data <- data.table(data)
data[,.(length_movement=.N, # Length (nrows) for each movement
        total_distance=sum(Distance.moved,na.rm = T)), # Total distance: sum of distances for each movement
         by=movement_index] # Subset by=movement_index

#    movement_index length_movement total_distance
# 1:              1               7       2.793806
# 2:              2               2       1.056515

请注意，##VARIABLE CREATION部分也可以通过data.table包来实现这可能会导致额外的速度提升，您可以通过使用以下代码替换代码的第一部分来实现：

## VARIABLE CREATION:
data[,lag:=Recording.time-shift(Recording.time)][1,lag:=0L]
data[,newmovement:=lag<0.035]
data[,movement_index:=cumsum(newmovement)]

虽然在for循环中循环，是否有更简单快捷的方法？

1 个答案: