我有几个人的职位数据,每个人都在几个时间步骤注册。我想计算每只动物与同时登记的所有其他动物之间的距离。
以下是一个简化示例,其中三个日期('date')的数据分别在三个日期('日期')上注册,位于不同的位置('x','y'):
library(data.table)
dt1 <- data.table(animal_id = 1, date = as.POSIXct(c("2014-01-01", "2014-01-02", "2014-01-03")),
x = runif(3, -10, 10), y = runif(3, -10, 10))
dt2 <- data.table(animal_id = 2, date = as.POSIXct(c("2014-01-01", "2014-01-02", "2014-01-03")),
x = runif(3, -10, 10), y = runif(3, -10, 10))
dt3 <- data.table(animal_id = 3, date = as.POSIXct(c("2014-01-01", "2014-01-02", "2014-01-03")),
x = runif(3, -10, 10), y = runif(3, -10, 10))
dt <- rbindlist(list(dt1, dt2, dt3))
# Create dist function between two animals at same time
dist.between.animals <- function(collar_id1, x1, y1, collar_id2, x2, y2) {
if (collar_id1 == collar_id2) return(NA)
sqrt((x1 - x2)^2 + (y1 - y2)^2)
}
# Get unique collar id of each animal, create column name for all animals per animal
animal_ids <- dt[ , unique(animal_id)]
animal_ids_str <- dt[,paste0("dist_to_", unique(animal_id))]
datetimes <- dt[ , unique(date)]
# Calculate distance of each animal to all animals, at same time
for (i in 1:length(animal_ids)) {
for (j in 1:length(datetimes)) {
x1 <- dt[.(animal_ids[i], datetimes[j]), x, on = .(animal_id, date)]
y1 <- dt[.(animal_ids[i], datetimes[j]), y, on = .(animal_id, date)]
dt[date == datetimes[j], animal_ids_str[i] := mapply(function(c, x2, y2) dist.between.animals(animal_ids[i], x1, y1, c, x2, y2), animal_id, x, y)]
}
}
以下是输出结果的示例:
animal_id date x y dist_to_1 dist_to_2 dist_to_3
1: 1 2014-01-01 -7.0276047 4.7660664 NA 7.1354265 13.7962800
2: 1 2014-01-02 -6.6383802 7.0087919 NA 3.7003879 16.4294999
3: 1 2014-01-03 -0.9722872 -4.8638019 NA 11.6447645 11.8313410
4: 2 2014-01-01 0.1076661 4.8131960 7.135426 NA 7.7052205
5: 2 2014-01-02 -8.9042124 4.0832364 3.700388 NA 13.3225921
6: 2 2014-01-03 8.2858839 2.1992575 11.644764 NA 0.4569632
7: 3 2014-01-01 5.7519522 -0.4320359 13.796280 7.7052205 NA
8: 3 2014-01-02 -9.0805265 -9.2381889 16.429500 13.3225921 NA
9: 3 2014-01-03 8.6832729 1.9736531 11.831341 0.4569632 NA
然而,我的真实数据有大约30只动物,每只动物有20,000次观察,因此我目前的代码需要很长时间才能运行。有没有更有效的方法来做到这一点?
答案 0 :(得分:3)
dist
函数,它计算欧几里德距离(或任何其他,你的选择)。如果使用diag=T, upper=T
,它会生成一个矩阵,然后您可以将其分配给指定的行 - 列。创建列可能会使多个动物变得乏味,但paste
函数无法修复。
dt[, c("dist_to_1", "dist_to_2", "dist_to_3") := NA]
dt<- arrange(dt, date, animal_id) # order by date. here it turns into a data.frame
for (i in 1:length(unique(dt$date))){
sub<- subset(dt, dt$date == unique(dt$date)[i])
dt[which(dt$date == unique(sub$date)), c("dist_to_1", "dist_to_2", "dist_to_3")]<- as.matrix(dist(sub[, c("x","y")], diag=T, upper=T))
}
dt[dt==0]<- NA #assign NAs for 0s. Not necessary if the it's ok for diag==0.
setDT(dt) # back to datatable. Again this part is not really necessary.
dt<- dt[order(animal_id, date)] # order as initially ordered
使用此代码:
> proc.time()-ptm
user system elapsed
0.051 0.007 0.068
使用早期代码:
> proc.time()-ptm
user system elapsed
0.083 0.004 0.092
如果你找到了同时使用dist
和data.table
的方法,那么你就是金色的,但我无法理解。它非常快,因为它调用C,并且你添加的观察越多,它就会越快。
答案 1 :(得分:2)
这是另一种应该更快的方法:
library(data.table)
### CREATE A BIG DATASET
set.seed(123)
nSamples <- 20000
nAnimals <- 30
allDates <- as.POSIXct(c("2014-01-01")) + (1:nSamples)*24*3600
dts <- lapply(1:nAnimals, function(id){
data.table(animal_id=id,date=allDates,
x=runif(nSamples,-10,10), y=runif(nSamples,-10,10))
})
dt <- rbindlist(dts)
### ALTERNATIVE APPROACH (NO LOOP)
animal_ids_str <- dt[,paste0("dist_to_",sort(unique(animal_id)))]
# set keys
setkey(dt,animal_id,date)
# add the distance columns
dt[,(animal_ids_str):=as.double(NA)]
# custom function that computes animal distances for a subset of dt at the same date
distancesInSameDate <- function(subsetDT,animal_ids_str){
m <- as.matrix(dist(subsetDT[,.(x,y)]))
diag(m) <- NA
cols <- paste0("dist_to_",subsetDT$animal_id)
missingCols <- animal_ids_str[is.na(match(animal_ids_str,cols))]
m <- cbind(m,matrix(NA,nrow=nrow(m),ncol=length(missingCols)))
colnames(m) <- c(cols,missingCols)
DF <- as.data.frame(m,stringsAsFactors=F)
DF <- DF[,match(animal_ids_str,colnames(DF))]
return(DF)
}
# let's compute the distances
system.time( dt[,(animal_ids_str):=distancesInSameDate(.SD,animal_ids_str),by=date] )
在我的机器上需要:
user system elapsed
13.76 0.00 13.82