我有2个不同的数据框
str(drivenum)
'data.frame': 95841 obs. of 7 variables:
$ team: chr "SF" "ATL" "SF" "ATL" ...
$ year: int 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
$ opp : chr "ATL" "SF" "ATL" "SF" ...
$ drvn: int 1 2 3 4 5 6 7 8 9 10 ...
$ fpid: int 2 12 19 23 36 40 54 58 66 71 ...
$ lpid: num 9 17 22 34 39 52 57 64 70 75 ...
$ pts : num 6 3 0 3 0 3 0 3 0 6 ...
str(drivedata)
'data.frame': 669217 obs. of 7 variables:
$ team: chr "SF" "SF" "SF" "SF" ...
$ year: int 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
$ opp : chr "ATL" "ATL" "ATL" "ATL" ...
$ pid : int 1 2 3 4 5 6 7 8 9 10 ...
$ dwn : int 0 1 2 1 2 1 1 1 2 0 ...
$ ytg : int 0 10 9 10 6 10 10 6 4 0 ...
$ yfog: int 0 26 27 37 41 60 70 94 96 0 ...
我想在 drivedata$pid 落在 drivenum$fpid 与 drivenum$lpid 之间(含边界)时返回对应的 drivenum$drvn,但由于两个数据框的行数不同,我遇到了问题。有人有什么想法吗?
答案 0 :(得分:0)
您可以使用 which 在 drivenum 中查找与 drivedata$pid 的给定值相对应的行:
# Small reproducible example: each row of `drivenum` describes an inclusive
# range [fpid, lpid] together with the label `drvn` attached to that range.
drivenum <- data.frame(
  fpid = c(2, 12, 19, 23, 36),
  lpid = c(9, 17, 22, 34, 39),
  drvn = c(1, 2, 3, 4, 5)
)
drivedata <- data.frame(pid = 1:20)

# For every pid, collect the `drvn` value(s) of the rows whose range contains
# it. lapply() is used instead of sapply() so the result is always a list,
# even when every pid happens to match exactly one row; a pid that matches no
# row yields a zero-length vector.
drvn.list <- lapply(
  drivedata$pid,
  function(x) {
    drivenum$drvn[which((drivenum$fpid <= x) & (x <= drivenum$lpid))]
  }
)
> drvn.list
[[1]]
numeric(0)
[[2]]
[1] 1
[[3]]
[1] 1
[[4]]
[1] 1
[[5]]
[1] 1
[[6]]
[1] 1
[[7]]
[1] 1
[[8]]
[1] 1
[[9]]
[1] 1
[[10]]
numeric(0)
[[11]]
numeric(0)
[[12]]
[1] 2
[[13]]
[1] 2
[[14]]
[1] 2
[[15]]
[1] 2
[[16]]
[1] 2
[[17]]
[1] 2
[[18]]
numeric(0)
[[19]]
[1] 3
[[20]]
[1] 3
>
下面是一个替代解决方案,适用于以下情况:drivedata$pid 中的每个值最多对应一个 drivenum$drvn,并且 drivenum$fpid 和 drivenum$lpid 都按升序排列,例如当 i<j 时有 drivenum$fpid[i]<drivenum$fpid[j],drivenum$lpid 类似。它更快,但它包含一个循环。所以循环并不总是那么糟糕。
# Loop-based alternative: walk over the rows of `drivenum` once and write each
# row's `drvn` into the result slots of the pids covered by [fpid, lpid].
# Relies on `drivenum$fpid` being sorted in increasing order (see the early
# exit below) and on each pid being covered by at most one row, since a later
# row overwrites what an earlier one stored.

n_pid <- nrow(drivedata)

# One empty slot per row of `drivedata`. The empty vector is taken from
# `drivenum$drvn` itself so unmatched slots have the same type as matched ones
# (integer(0) for integer `drvn`, numeric(0) for double `drvn`).
drvn.list.2 <- rep(list(drivenum$drvn[0]), n_pid)

# pos[p] is the row index in `drivedata` whose pid equals p, or NA when no
# row has that pid.
pos <- rep(NA_integer_, max(drivenum$lpid))
pos[drivedata$pid] <- seq_len(n_pid)

# Hoisted out of the loop: the largest pid never changes.
max_pid <- max(drivedata$pid)

for (i in seq_len(nrow(drivenum))) {
  # With `fpid` sorted increasingly, no later range can contain any pid.
  if (max_pid < drivenum$fpid[i]) {
    break
  }
  # NA entries of `pos` (pids absent from `drivedata`) are ignored here
  # because the replacement value has length one.
  drvn.list.2[pos[drivenum$fpid[i]:drivenum$lpid[i]]] <- drivenum$drvn[i]
}
使用含 8000 行的 drivenum 和含 60000 行的 drivedata 的示例进行速度比较:
#---------------------------------------------------------
# Generate example data:
# Build n consecutive, non-overlapping inclusive ranges [fpid, lpid]:
#   d1[i] is the gap between the previous range's end and this range's start,
#   d2[i] is the distance from this range's start to its end.
set.seed(1)
n <- 8000
d1 <- sample(1:3, n, replace = TRUE)
d2 <- sample(1:10, n, replace = TRUE)
drivenum <- data.frame(
  # fpid[i] = lpid[i - 1] + d1[i]. The shifted widths must drop the LAST
  # element of d2 (c(0, d2[-n])); the previous c(0, d2)[-n] dropped d2[n - 1]
  # instead, which put the final range's start in the wrong place.
  fpid = cumsum(d1 + c(0, d2[-n])),
  lpid = cumsum(d1 + d2),
  # Labels are a random permutation of 1..n.
  drvn = sample(1:n)
)
# The pids to look up: 1..60000 in random order.
drivedata <- data.frame(pid = sample(1:60000))
#----------------------------------------------------------
# Speed comparison:
# Time 10 repetitions of the per-pid lookup: for each pid, scan all rows of
# `drivenum` for ranges [fpid, lpid] that contain it.
system.time(
  for (k in seq_len(10)) {
    # lapply() instead of sapply(): the result is always a list, so it stays
    # comparable with the list built by the loop-based approach even if every
    # pid matches exactly one row.
    drvn.list.1 <- lapply(
      drivedata$pid,
      function(x) {
        drivenum$drvn[which((drivenum$fpid <= x) & (x <= drivenum$lpid))]
      }
    )
  }
)
# Time 10 repetitions of the loop-based approach: one pass over the rows of
# `drivenum`, writing each row's `drvn` into the slots of the pids it covers.
# Relies on `drivenum$fpid` being sorted in increasing order.
system.time(
  for (k in seq_len(10)) {
    n_pid <- nrow(drivedata)

    # One empty slot per pid, typed like `drivenum$drvn` so unmatched and
    # matched slots share the same vector type.
    drvn.list.2 <- rep(list(drivenum$drvn[0]), n_pid)

    # pos[p] is the row index in `drivedata` whose pid equals p (NA if none).
    pos <- rep(NA_integer_, max(drivenum$lpid))
    pos[drivedata$pid] <- seq_len(n_pid)

    # Loop-invariant, so computed once per repetition.
    max_pid <- max(drivedata$pid)

    for (i in seq_len(nrow(drivenum))) {
      # With `fpid` sorted increasingly, no later range can contain any pid.
      if (max_pid < drivenum$fpid[i]) {
        break
      }
      # NA entries of `pos` are ignored because the replacement value has
      # length one.
      drvn.list.2[pos[drivenum$fpid[i]:drivenum$lpid[i]]] <- drivenum$drvn[i]
    }
  }
)
> system.time(
+ for ( k in 1:10 )
+ {
+ drvn.list.1 <- .... [TRUNCATED]
user system elapsed
432.12 0.46 436.73
> system.time(
+ for ( k in 1:10 )
+ {
+ drvn.list.2 <- lapply(as.list(as.integer(rep(0,nrow(drivedata)))),head,0)
+ pos <- rep(NA,max(dr .... [TRUNCATED]
user system elapsed
51.07 0.03 51.41
>
两种方法得到的结果完全一致:
> identical(drvn.list.1,drvn.list.2)
[1] TRUE
>