Question

我有2个数据框（DF），每个数据框都包含标识符和日期范围。在两个DF中，每个ID可能有很多日期范围。

我想做的是从第一个DF（DF.A）中选出与第二个DF（DF.B）中同一ID的任一日期范围存在重叠（重叠长度不限）的行。

# Toy data: one row per (ID, date range); dates are "YYYY-MM-DD" strings.
# df.A holds the candidate intervals to be filtered.
df.A <- data.frame(
  ID = c(1, 1, 1, 2, 3, 3),
  Start.A = c(
    "2019-01-01", "2019-03-15", "2019-06-10",
    "2017-01-01", "2015-05-10", "2015-05-15"
  ),
  End.A = c(
    "2019-01-31", "2019-04-15", "2019-07-09",
    "2017-01-31", "2015-06-10", "2015-06-02"
  )
)

# df.B holds the reference intervals that df.A rows are checked against.
df.B <- data.frame(
  ID = c(1, 1, 1, 3, 3),
  Start.B = c(
    "2019-01-01", "2019-02-01", "2019-03-01", "2015-06-01", "2015-07-01"
  ),
  End.B = c(
    "2019-01-31", "2019-02-28", "2019-03-31", "2015-06-30", "2015-07-31"
  )
)

数据框A：

ID       Start.A       End.A
1        2019-01-01    2019-01-31 
1        2019-03-15    2019-04-15 
1        2019-06-10    2019-07-09
2        2017-01-01    2017-01-31
3        2015-05-10    2015-06-10
3        2015-05-15    2015-06-02

数据框B：

ID       Start.B       End.B
1        2019-01-01    2019-01-31
1        2019-02-01    2019-02-28
1        2019-03-01    2019-03-31
3        2015-06-01    2015-06-30
3        2015-07-01    2015-07-31

我想要我的输出为：

ID       Start.A       End.A
1        2019-01-01    2019-01-31 
1        2019-03-15    2019-04-15 
3        2015-05-10    2015-06-10
3        2015-05-15    2015-06-02

我认为，如果是一对一匹配，我可以毫无问题地做到这一点；但是，正如我提到的，在两个DF中，每个ID都有很多观测值。我已经尝试过使用lubridate的interval，但在如何查找重叠上遇到了困难，同时还要处理这样的复杂性：为了查找潜在匹配，必须在DF.B中查看该ID对应的所有行。

这是一个非常大的数据集（DF.A中有500万个观测值，DF.B中有200万个观测值），因此速度至关重要。任何对数据进行转换以使其尽可能快地进行操作的建议也将不胜感激。

如果有帮助：对于给定的ID，DF.A的观察值可以与DF.A中的其他观察值重叠（例如，上面玩具示例中的ID 3）。相反，DF.B间隔之间没有重叠。

Answer 1

如何？

    /**
     * Probe a media file with ffprobe and extract basic stream attributes.
     *
     * Runs "$ffprobe $video 2>&1" and parses the combined output with three
     * regular expressions: the "Video:" stream line, the "Audio:" stream line
     * and the "Duration:" line. Any attribute whose pattern does not match
     * (or whose captured value is empty / "0") is reported as null.
     *
     * @param array  $video_attribute Output parameter. On success it is
     *                                overwritten with an array holding the keys
     *                                width, height, duration, hours, mins, secs,
     *                                ms, hz, v_codec, a_codec, bps, v_bps, a_bps
     *                                and tbn. Left untouched on failure.
     * @param string $video           Input passed to ffprobe.
     * @param string $ffprobe         ffprobe executable to invoke.
     *
     * @return mixed ERR_FILE when the command exits with a non-zero status
     *               (the command is logged to the 'queuelog' channel),
     *               SC_SUCCESS otherwise.
     */
    static function get_video_attributes(&$video_attribute, $video, $ffprobe="ffprobe")
    {
        // NOTE(review): $video and $ffprobe are interpolated into the shell
        // command unescaped. If either can come from untrusted input, or a path
        // can contain spaces, wrap them with escapeshellarg() -- confirm first
        // that callers are not already quoting the path.
        $command = "$ffprobe $video 2>&1";
        exec($command, $output, $return_var);
        if ($return_var) {
            Log::channel('queuelog')->error("ERR EXEC: " . $command);
            return ERR_FILE;
        }

        $output = implode("\n", $output);

        // Default every attribute to null so that a pattern which does not
        // match (e.g. an audio-only file) no longer leaves variables undefined.
        $v_codec = $width = $height = $v_bps = $tbn = null;
        $a_codec = $hz = $a_bps = null;
        $hours = $mins = $secs = $ms = $bps = null;

        // Video stream line: codec, frame size, stream bitrate, time base.
        $regex_sizes = "/Video: ([^,]*), (.*), ([0-9]{1,4})x([0-9]{1,4}).*, ([0-9]{1,4}) kb\/s, (.*) fps, (.*) tbr, (.*) tbn/";
        if (preg_match($regex_sizes, $output, $regs)) {
            $v_codec = $regs[1] ?: null;
            $width = $regs[3] ?: null;
            $height = $regs[4] ?: null;
            $v_bps = $regs[5] ?: null;
            $tbn = $regs[8] ?: null;
        }

        // Audio stream line: codec, sample rate, stream bitrate.
        $regex_sizes = "/Audio: ([^,]*), ([^,]*) Hz, (.*), ([0-9]{1,4}) kb\/s/";
        if (preg_match($regex_sizes, $output, $regs)) {
            $a_codec = $regs[1] ?: null;
            $hz = $regs[2] ?: null;
            $a_bps = $regs[4] ?: null;
        }

        // Container line: hh:mm:ss.xx duration and overall bitrate.
        $regex_duration = "/Duration: ([0-9]{1,2}):([0-9]{1,2}):([0-9]{1,2}).([0-9]{1,2}), (.*) ([0-9]{1,4}) kb\/s/";
        if (preg_match($regex_duration, $output, $regs)) {
            $hours = $regs[1] ?: null;
            $mins = $regs[2] ?: null;
            $secs = $regs[3] ?: null;
            $ms = $regs[4] ?: null;
            $bps = $regs[6] ?: null;
        }

        // Whole seconds only; a total of 0 is reported as null, as before.
        $duration = $hours * 3600 + $mins * 60 + $secs;

        $video_attribute = array(
            'width' => $width,
            'height' => $height,
            'duration' => $duration ? $duration : null,
            'hours' => $hours,
            'mins' => $mins,
            'secs' => $secs,
            'ms' => $ms,
            'hz' => $hz,
            'v_codec' => $v_codec,
            'a_codec' => $a_codec,
            'bps' => $bps,
            'v_bps' => $v_bps,
            'a_bps' => $a_bps,
            'tbn' => $tbn
        );

        return SC_SUCCESS;
    }

还有

library(data.table)
# Same toy intervals as in the question, built as data.tables.
# Dates stay as "YYYY-MM-DD" strings; they are parsed later with as.Date().
df.A <- data.table(
  ID = c(1, 1, 1, 2, 3, 3),
  Start.A = c(
    "2019-01-01", "2019-03-15", "2019-06-10",
    "2017-01-01", "2015-05-10", "2015-05-15"
  ),
  End.A = c(
    "2019-01-31", "2019-04-15", "2019-07-09",
    "2017-01-31", "2015-06-10", "2015-06-02"
  )
)

df.B <- data.table(
  ID = c(1, 1, 1, 3, 3),
  Start.B = c(
    "2019-01-01", "2019-02-01", "2019-03-01", "2015-06-01", "2015-07-01"
  ),
  End.B = c(
    "2019-01-31", "2019-02-28", "2019-03-31", "2015-06-30", "2015-07-31"
  )
)

然后按如下方式计算重叠：

# Pair every df.A interval with every df.B interval that shares the same ID.
# allow.cartesian = TRUE is needed because IDs repeat on both sides.
DF <- merge(df.A, df.B, by = "ID", allow.cartesian = TRUE)

# Expand one interval into its "|"-separated run of calendar days,
# e.g. "2019-01-01|2019-01-02|...". seq() errors if `to` is before `from`.
day_run <- function(from, to) {
  paste(seq(as.Date(from), as.Date(to), by = "day"), collapse = "|")
}

# SEQ_DATE.A / SEQ_DATE.B are kept so DF has the same columns as before.
# They are built straight from the endpoint columns instead of going through
# apply() + paste() + strsplit(); vapply() also pins the result type to
# character, even for a zero-row DF.
DF$SEQ_DATE.A <- vapply(
  seq_len(nrow(DF)),
  function(i) day_run(DF$Start.A[i], DF$End.A[i]),
  character(1)
)

DF$SEQ_DATE.B <- vapply(
  seq_len(nrow(DF)),
  function(i) day_run(DF$Start.B[i], DF$End.B[i]),
  character(1)
)

# Two closed intervals overlap exactly when each one starts no later than the
# other ends. This gives the same TRUE/FALSE as regex-matching the day runs,
# but is a single vectorised comparison on the parsed dates.
DF$Result <- as.Date(DF$Start.A) <= as.Date(DF$End.B) &
  as.Date(DF$End.A) >= as.Date(DF$Start.B)

如何检查数据框中的日期范围是否与另一个数据框中的任何（特定于ID的）范围重叠

1 个答案: