Question

如何根据一个单独的、较小的R数据框（12行）中定义的日期范围，对一个大型R数据框（超过200万行）中的每一行进行分类？

我的大型数据框captures，通过head(captures)查看时大致如下：

       id       date sex
1  160520 2016-11-22   1
2 1029735 2016-11-12   1
3 1885200 2016-11-05   1
4 2058366 2015-09-26   2
5 2058367 2015-09-26   1
6 2058368 2015-09-26   1

我的小数据框seasons大致如下：

Season Opening.Date Closing.Date
  2016   2016-09-24   2017-01-15
  2015   2015-09-26   2016-01-10
  2014   2014-09-27   2015-01-11
  2013   2013-09-28   2014-01-12
  2012   2012-09-22   2013-01-13
  2011   2011-09-24   2012-01-08
  2010   2010-09-25   2011-01-16
  2009   2009-09-26   2010-01-17
  2008   2008-09-27   2009-01-18
  2007   2007-09-22   2008-01-13
  2006   2006-09-23   2007-01-14
  2005   2005-09-24   2006-01-15

我需要在captures数据框中添加一个season列，其值取决于captures$date落在seasons中定义的哪一个日期范围内。

下面是我想出的一个逐行循环的笨办法，但由于我的数据框太大，它无法正常运行。

# Load packages: between() comes from dplyr, ymd() from lubridate.
library(dplyr)
library(lubridate)

# Start with an empty season column; captures that match no season stay NA.
captures$season <- NA

# Brute-force classification: every capture row is tested against every
# season, one scalar comparison at a time (rows x seasons iterations).
for (i in seq_along(seasons$Season)) {
  # Parse this season's bounds once rather than once per capture row.
  opening <- ymd(seasons$Opening.Date[i])
  closing <- ymd(seasons$Closing.Date[i])
  for (j in seq_along(captures$id)) {
    # between() is inclusive on both ends; isTRUE() treats a missing date
    # as "not in this season" so the existing value is left untouched.
    if (isTRUE(between(captures$date[j], opening, closing))) {
      captures$season[j] <- seasons$Season[i]
    }
  }
}

同样，这对我来说不起作用，因为R每次都会崩溃。我也意识到这没有利用R中的矢量化。任何帮助都值得赞赏！

Answer 1

这里使用data.table的非等值连接（non-equi join）：

library(data.table) # v1.10.4+
setDT(captures) # convert data.frames to data.tables
setDT(seasons)

# Non-equi join: for each row of captures, find the first row of seasons
# whose Opening.Date <= date and Closing.Date >= date, and return that
# row's Season.
ans <- seasons[captures, Season,
               on = .(Opening.Date <= date, Closing.Date >= date),
               mult = "first"]
# [1] 2016 2016 2016 2015 2015 2015

# ans holds one value per row of captures, so it is added (by reference)
# as a new column of captures, not of the 12-row seasons table.
captures[, season := ans]

对于captures中的每一行，根据on参数给出的条件，在seasons中找到第一个匹配的行（mult="first"）所对应的索引。然后返回该索引对应的Season值并保存在ans中。最后通过引用将其作为新列添加到captures（ans的长度与captures的行数相同）。

为了便于理解，我已经分两步展示了它。

您可以使用which=TRUE来查看第一个匹配的索引：

# Same join as before, but which = TRUE returns the row number in seasons
# of the first match for each row of captures instead of any columns.
seasons[
  captures,
  on = .(Opening.Date <= date, Closing.Date >= date),
  mult = "first",
  which = TRUE
]
# [1] 1 1 1 2 2 2

Answer 2

如果您能够根据范围值而非相等有效地执行join操作，那将会非常棒。不幸的是，我不知道是否存在一般解决方案。目前，我建议使用单个for循环。

向量化应当沿着数据量最大的方向进行，这样效率最高。也就是说，如果我们要在一个data.frame上循环、对另一个做向量化运算，那么更合理的做法是对较长的向量做向量化，而在较短的向量上循环。考虑到这一点，我们将在seasons数据框上循环，并对200万行数据做向量化运算。

您的数据：

# Sample season table: one row per season with its opening and closing date.
txt <- "Season Opening.Date Closing.Date
  2016   2016-09-24   2017-01-15
  2015   2015-09-26   2016-01-10
  2014   2014-09-27   2015-01-11
  2013   2013-09-28   2014-01-12
  2012   2012-09-22   2013-01-13
  2011   2011-09-24   2012-01-08
  2010   2010-09-25   2011-01-16
  2009   2009-09-26   2010-01-17
  2008   2008-09-27   2009-01-18
  2007   2007-09-22   2008-01-13
  2006   2006-09-23   2007-01-14
  2005   2005-09-24   2006-01-15"
seasons <- read.table(header = TRUE, text = txt)
# Both bounds are read as text; convert them to Date so they can be
# compared with the capture dates.
seasons$Opening.Date <- as.Date(seasons$Opening.Date)
seasons$Closing.Date <- as.Date(seasons$Closing.Date)

# Sample of the capture data. The header has one field fewer than the data
# rows, so read.table() uses the first field of each row as the row name.
txt <- "       id       date sex
1  160520 2016-11-22   1
2 1029735 2016-11-12   1
3 1885200 2016-11-05   1
4 2058366 2015-09-26   2
5 2058367 2015-09-26   1
6 2058368 2015-09-26   1"
dat <- read.table(header = TRUE, text = txt)
# Convert the capture date from text to Date.
dat[["date"]] <- as.Date(dat[["date"]])

在开始之前，我们先假设所有数据的season都尚未定义：

# Every row starts out unclassified (NA).
dat[["season"]] <- NA

循环遍历seasons的每一行：

# Loop over the short seasons table and vectorise over the long capture
# table: each pass classifies every still-unassigned row in one step.
for (i in seq_len(nrow(seasons))) {
  # Both bounds are inclusive, so a capture on the closing day itself still
  # belongs to that season.
  in_season <- is.na(dat$season) &
    dat$date >= seasons$Opening.Date[i] &
    dat$date <= seasons$Closing.Date[i]
  # which() drops NA comparisons (missing dates), leaving those rows as NA.
  dat$season[which(in_season)] <- seasons$Season[i]
}
dat
#        id       date sex season
# 1  160520 2016-11-22   1   2016
# 2 1029735 2016-11-12   1   2016
# 3 1885200 2016-11-05   1   2016
# 4 2058366 2015-09-26   2   2015
# 5 2058367 2015-09-26   1   2015
# 6 2058368 2015-09-26   1   2015

Answer 3

您可以尝试使用sqldf。注意，我必须将列名Opening.Date和Closing.Date中的点改为下划线"_"（即Opening_Date和Closing_Date）。

library(sqldf)

# Left join captures to seasons on the (inclusive) date range and return the
# whole table at once. Unlike assigning the result of an inner join into a
# column, this keeps every capture even when its date falls in no season
# (Season is then NA) and does not depend on the join result having the same
# number and order of rows as captures.
# NOTE: assumes the season ranges do not overlap; overlapping ranges would
# yield one output row per matching season.
captures <- sqldf("
  select c.*, s.Season
  from captures c
  left join seasons s
    on c.date >= s.Opening_Date and c.date <= s.Closing_Date
")
captures
       id       date sex Season
1  160520 2016-11-22   1   2016
2 1029735 2016-11-12   1   2016
3 1885200 2016-11-05   1   2016
4 2058366 2015-09-26   2   2015
5 2058367 2015-09-26   1   2015
6 2058368 2015-09-26   1   2015

数据

# Sample data for the sqldf answer. Column names use "_" instead of "."
# (Opening_Date, Closing_Date).
txt <- "Season Opening_Date Closing_Date
2016 2016-09-24 2017-01-15
2015 2015-09-26 2016-01-10
2014 2014-09-27 2015-01-11
2013 2013-09-28 2014-01-12
2012 2012-09-22 2013-01-13
2011 2011-09-24 2012-01-08
2010 2010-09-25 2011-01-16
2009 2009-09-26 2010-01-17
2008 2008-09-27 2009-01-18
2007 2007-09-22 2008-01-13
2006 2006-09-23 2007-01-14
2005 2005-09-24 2006-01-15"
seasons <- read.table(text = txt, header = TRUE)
# Convert the opening/closing columns (2 and 3) from text to Date.
seasons[2:3] <- lapply(seasons[2:3], as.Date)

# The header has one field fewer than the data rows, so read.table() uses
# the first field of each row as the row name.
txt <- " id date sex
1 160520 2016-11-22 1
2 1029735 2016-11-12 1
3 1885200 2016-11-05 1
4 2058366 2015-09-26 2
5 2058367 2015-09-26 1
6 2058368 2015-09-26 1"
captures <- read.table(text = txt, header = TRUE)
captures$date <- as.Date(captures$date)

根据R

3 个答案: