使用Foverlaps连接数据集

时间:2019-01-22 14:32:11

标签: r

我有两个数据集,想基于时间戳范围将它们连接起来。两个数据帧中的时间戳并不总是完全匹配,因此我想按范围进行连接。有人建议我使用 foverlaps。

我使用了下面的代码,但不确定该如何正确设置各个参数,代码无法正常工作:

                  ~timestamp,                                ~hostname,   ~gpuSerial,                                   ~gpuUUID, ~powerDrawWatt, ~gpuTempC, ~gpuUtilPerc, ~gpuMemUtilPerc,
  "2018-11-08T07:41:27.242Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d",          25.94,       32L,           0L,              0L,
  "2018-11-08T07:41:29.259Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d",          25.84,       32L,           0L,              0L,
  "2018-11-08T07:41:31.285Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d",          25.84,       32L,           0L,              0L,
  "2018-11-08T07:41:33.301Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d",          25.94,       32L,           0L,              0L,
  "2018-11-08T07:41:35.322Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d",          25.84,       32L,           0L,              0L
  )

## Second data set (render events per host). It is bound to `df2` because the
## join code further down refers to the table by that name.
df2 <- tibble::tribble(
                  ~timestamp,                                ~hostname,      ~eventName, ~eventType,                                            ~jobId,                                ~taskId,
  "2018-11-08T07:41:45.459Z", "04dc4e9647154250beeee51b866b0715000000",   "TotalRender",    "START", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
  "2018-11-08T07:41:45.459Z", "04dc4e9647154250beeee51b866b0715000000", "Saving Config",    "START", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
  "2018-11-08T07:41:32.461Z", "04dc4e9647154250beeee51b866b0715000000",        "Render",    "START", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
  "2018-11-08T07:41:32.461Z", "04dc4e9647154250beeee51b866b0715000000", "Saving Config",     "STOP", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
  "2018-11-08T07:42:09.344Z", "04dc4e9647154250beeee51b866b0715000000",        "Render",     "STOP", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67"
  )

     # Attempt at an overlap join of the two tables with data.table::foverlaps().
     require(data.table)
     simple example:
     # NOTE(review): `timestamp`, `taskId` and `gpuMemUtilPerc` are written as
     # bare names; they appear as column names in the tables above, not as
     # variables defined in this snippet - confirm what start/end should hold.
     x = data.table(df2,start=c(timestamp),end=c(taskId))
     y = data.table(df1,start=c(timestamp),end=c(gpuMemUtilPerc))
     # Only y is keyed here, on (start, end).
     setkey(y,start,end)
     # NOTE(review): by.x lists timestamp/hostname while y is keyed on
     # start/end - verify that these are the intended interval columns.
     foverlaps(x, y, by.x = c("timestamp","hostname"), type="within", nomatch = 0L)

我希望能够把落在某个时间范围内的时间戳匹配起来。谢谢您的帮助!

1 个答案:

答案 0 :(得分:0)

尝试2

看来 fuzzyjoin 给了您很多重复的匹配,所以让我们尝试使用 map 和嵌套的 tibble,把我们想要的数据从 df1 提取到 df2 中。

## load library and set data options
library(tidyverse)
options(digits.secs = 3)

## nest a filtered df1 in df2$df
 ## Nest in df2$df, for every event, the df1 samples recorded on the SAME host
 ## whose timestamp lies within 2 seconds either side of the event
 ## (a 4 second window). Matching on hostname as well as time keeps readings
 ## from other machines out of the window when several hosts are present.
 ## The grouping is kept so later steps receive the same grouped tibble.
 dat <-
  df2 %>%
  group_by(timestamp, hostname, eventName, eventType) %>%
  mutate(
    df = map2(
      timestamp,
      hostname,
      function(event_time, event_host) {
        df1 %>%
          filter(
            hostname == event_host,
            between(timestamp, event_time - 2, event_time + 2)
          )
      }
    )
  )


  ## Walk through list-column `df`: for each event keep the df1 sample(s)
  ## nearest in time and average them when several are equally near.
  ## Nearness is the ABSOLUTE gap in seconds. min() over the signed difference
  ## selects the earliest sample in the window instead (07:41:31.285 rather
  ## than 07:41:33.301 for the events at 07:41:32.461), so with this version
  ## those two events report powerDrawWatt 25.94.
  ## Events without any sample in the window get NA in both value columns.
  ## The result is returned ungrouped.
  dat %>%
    mutate(
      vals = map2(
        df,
        timestamp,
        function(candidates, event_time) {
          if (nrow(candidates) == 0) {
            ## same columns as the summarise() below, so extraction is uniform
            return(tibble(powerDrawWatt = NA_real_, gpuTempC = NA_real_))
          }
          candidates %>%
            mutate(
              gap = abs(
                as.numeric(difftime(timestamp, event_time, units = "secs"))
              )
            ) %>%
            filter(gap == min(gap)) %>%
            summarise(
              powerDrawWatt = mean(powerDrawWatt),
              gpuTempC = mean(gpuTempC)
            )
        }
      ),
      ## one value per row, independent of how the data are grouped
      powerDrawWatt = map_dbl(vals, "powerDrawWatt"),
      gpuTempC = map_dbl(vals, "gpuTempC")
    ) %>%
    ungroup() %>%
    select(-df, -vals)

输出

# A tibble: 5 x 8
# Groups:   timestamp, eventName, eventType [5]
  timestamp               hostname                               eventName     eventType jobId                                           taskId                               powerDrawWatt gpuTempC
  <dttm>                  <chr>                                  <chr>         <chr>     <chr>                                           <chr>                                        <dbl>    <dbl>
1 2018-11-08 07:41:45.459 04dc4e9647154250beeee51b866b0715000000 TotalRender   START     1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705 00390eee-c26c-41da-a02d-556bb7fcac67          29.9       34
2 2018-11-08 07:41:45.459 04dc4e9647154250beeee51b866b0715000000 Saving Config START     1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705 00390eee-c26c-41da-a02d-556bb7fcac67          29.9       34
3 2018-11-08 07:41:32.460 04dc4e9647154250beeee51b866b0715000000 Render        START     1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705 00390eee-c26c-41da-a02d-556bb7fcac67          25.8       32
4 2018-11-08 07:41:32.460 04dc4e9647154250beeee51b866b0715000000 Saving Config STOP      1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705 00390eee-c26c-41da-a02d-556bb7fcac67          25.8       32
5 2018-11-08 07:42:09.344 04dc4e9647154250beeee51b866b0715000000 Render        STOP      1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705 00390eee-c26c-41da-a02d-556bb7fcac67          NA         NA

数据

## create tibbles
## GPU readings table. The rows are entered with timestamps as text; the
## column is converted to POSIXct (UTC) once the table has been built.
df1 <- tibble::tribble(
  ~timestamp, ~hostname, ~gpuSerial, ~gpuUUID, ~powerDrawWatt, ~gpuTempC, ~gpuUtilPerc, ~gpuMemUtilPerc,
  "2018-11-08T07:41:27.242Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 25.94, 32L, 0L, 0L,
  "2018-11-08T07:41:29.259Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 25.84, 32L, 0L, 0L,
  "2018-11-08T07:41:31.285Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 25.84, 32L, 0L, 0L,
  "2018-11-08T07:41:33.301Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 25.94, 32L, 0L, 0L,
  "2018-11-08T07:41:44.600Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 29.94, 34L, 0L, 0L,
  "2018-11-08T07:42:11.500Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 21.94, 36L, 0L, 0L,
  "2018-11-08T07:41:35.322Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 25.84, 32L, 0L, 0L
)
df1 <- mutate(
  df1,
  timestamp = as.POSIXct(timestamp, format = "%Y-%m-%dT%H:%M:%OSZ", tz = "UTC")
)

## Render event table. The rows are entered with timestamps as text; the
## column is converted to POSIXct (UTC) once the table has been built.
df2 <- tibble::tribble(
  ~timestamp, ~hostname, ~eventName, ~eventType, ~jobId, ~taskId,
  "2018-11-08T07:41:45.459Z", "04dc4e9647154250beeee51b866b0715000000", "TotalRender", "START", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
  "2018-11-08T07:41:45.459Z", "04dc4e9647154250beeee51b866b0715000000", "Saving Config", "START", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
  "2018-11-08T07:41:32.461Z", "04dc4e9647154250beeee51b866b0715000000", "Render", "START", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
  "2018-11-08T07:41:32.461Z", "04dc4e9647154250beeee51b866b0715000000", "Saving Config", "STOP", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
  "2018-11-08T07:42:09.344Z", "04dc4e9647154250beeee51b866b0715000000", "Render", "STOP", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67"
)
df2 <- mutate(
  df2,
  timestamp = as.POSIXct(timestamp, format = "%Y-%m-%dT%H:%M:%OSZ", tz = "UTC")
)

要舍入数据,

library(lubridate)
## Round every df1 timestamp to a 10 second unit, then average all remaining
## columns within each (timestamp, hostname, gpuSerial, gpuUUID) group.
summarise_all(
  group_by(
    mutate(df1, timestamp = lubridate::round_date(timestamp, "10 seconds")),
    timestamp, hostname, gpuSerial, gpuUUID
  ),
  mean
)

汇总数据可能会产生多大影响的示例:

## Sequence of time stamps in 0.01 s steps between the two UTC instants.
sec_interval <- seq(
  as.POSIXct("2018-11-08 00:00:0.01", tz = "UTC"),
  as.POSIXct("2018-11-08 12:00:01", tz = "UTC"),
  by = 0.01
)
## Distinct values left after rounding to 10 second units. The input has to be
## `sec_interval`; the name `test` is not defined anywhere in this example.
sec_interval_rounded <- unique(
  lubridate::round_date(sec_interval, "10 seconds")
)
> length(sec_interval)
[1] 4320100
> length(sec_interval_rounded)
[1] 4321