我有两个数据集,想基于时间戳范围把它们连接(join)起来。两个数据帧中的时间戳并不总是完全匹配,因此我想基于范围进行连接。有人建议我使用 data.table 的 foverlaps。
我正在使用下面的代码,但不确定如何满足所有参数的要求,因此无法让它正常工作:
## question data: GPU readings, one row per sample
## (timestamp is still an ISO-8601 character string at this point)
df1 <-
tibble::tribble(
~timestamp, ~hostname, ~gpuSerial, ~gpuUUID, ~powerDrawWatt, ~gpuTempC, ~gpuUtilPerc, ~gpuMemUtilPerc,
"2018-11-08T07:41:27.242Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 25.94, 32L, 0L, 0L,
"2018-11-08T07:41:29.259Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 25.84, 32L, 0L, 0L,
"2018-11-08T07:41:31.285Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 25.84, 32L, 0L, 0L,
"2018-11-08T07:41:33.301Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 25.94, 32L, 0L, 0L,
"2018-11-08T07:41:35.322Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 25.84, 32L, 0L, 0L
)
## question data: application events, one row per event
## (assigned to df2 because the join code refers to it by that name)
df2 <-
tibble::tribble(
~timestamp, ~hostname, ~eventName, ~eventType, ~jobId, ~taskId,
"2018-11-08T07:41:45.459Z", "04dc4e9647154250beeee51b866b0715000000", "TotalRender", "START", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
"2018-11-08T07:41:45.459Z", "04dc4e9647154250beeee51b866b0715000000", "Saving Config", "START", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
"2018-11-08T07:41:32.461Z", "04dc4e9647154250beeee51b866b0715000000", "Render", "START", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
"2018-11-08T07:41:32.461Z", "04dc4e9647154250beeee51b866b0715000000", "Saving Config", "STOP", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
"2018-11-08T07:42:09.344Z", "04dc4e9647154250beeee51b866b0715000000", "Render", "STOP", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67"
)
library(data.table)

## simple example: match every event in df2 to the df1 readings from the same
## host that were taken within +/- 2 seconds of the event.

## Parse ISO-8601 strings into POSIXct (UTC); POSIXct input is returned as is.
parse_ts <- function(x) {
  as.POSIXct(x, format = "%Y-%m-%dT%H:%M:%OSZ", tz = "UTC")
}

## x: the events, each represented as a zero-width interval [timestamp, timestamp]
x <- as.data.table(df2)
x[, timestamp := parse_ts(timestamp)]
x[, `:=`(start = timestamp, end = timestamp)]

## y: the readings, each widened to the window [timestamp - 2s, timestamp + 2s]
y <- as.data.table(df1)
y[, timestamp := parse_ts(timestamp)]
y[, `:=`(start = timestamp - 2, end = timestamp + 2)]

## foverlaps() requires y to be keyed; the last two key columns are the
## interval bounds and the leading column(s) are matched exactly
setkey(y, hostname, start, end)

## type = "within": keep pairs where the event interval lies inside the window;
## nomatch = 0L drops events without any reading in range
foverlaps(x, y, by.x = c("hostname", "start", "end"), type = "within", nomatch = 0L)
我希望能够获得时间戳范围内的时间戳?谢谢您的帮助
答案 0 :(得分:0)
尝试2
看来 fuzzyjoin 给了您很多重复的匹配,所以让我们尝试使用 map 和嵌套的 tibble,把我们想要的数据从 df1 提取到 df2 中。
## load the tidyverse and set display options
library(tidyverse)
## print up to 3 digits of fractional seconds so millisecond timestamps show
options(digits.secs = 3)
## For every event row of df2, nest the matching df1 readings in the
## list-column df2$df. A reading matches when it comes from the same host and
## its timestamp lies within +/- 2 seconds of the event (a 4 second interval).
dat <-
  df2 %>%
  group_by(timestamp, hostname, eventName, eventType) %>%
  mutate(
    df = map2(
      timestamp,
      hostname,
      function(event_time, event_host) {
        ## inside filter(), `hostname` and `timestamp` are df1's columns
        df1 %>%
          filter(
            hostname == event_host,
            between(timestamp, event_time - 2, event_time + 2)
          )
      }
    )
  )
## Walk through the list-column df and, for each event, reduce the nested
## readings to a single powerDrawWatt / gpuTempC pair (NA when the window
## contains no reading). Readings that tie on the selected difference are
## averaged.
dat %>%
  mutate(
    vals = map2(
      df,
      timestamp,
      function(readings, event_time) {
        if (nrow(readings) == 0) {
          ## typed NA placeholders keep the extraction below uniform
          return(tibble::tibble(powerDrawWatt = NA_real_, gpuTempC = NA_real_))
        }
        ## NOTE: diff is signed, so min(diff) selects the earliest reading in
        ## the window, not the closest one; use abs(diff) if the nearest
        ## reading is wanted instead.
        readings %>%
          mutate(diff = timestamp - event_time) %>%
          filter(diff == min(diff)) %>%
          summarise(
            powerDrawWatt = mean(powerDrawWatt),
            gpuTempC = mean(gpuTempC)
          )
      }
    ),
    ## extract per row, so the result stays correct even if a group ever
    ## holds more than one event
    powerDrawWatt = map_dbl(vals, "powerDrawWatt"),
    gpuTempC = map_dbl(vals, "gpuTempC")
  ) %>%
  select(-df, -vals)
输出
# A tibble: 5 x 8
# Groups: timestamp, eventName, eventType [5]
timestamp hostname eventName eventType jobId taskId powerDrawWatt gpuTempC
<dttm> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
1 2018-11-08 07:41:45.459 04dc4e9647154250beeee51b866b0715000000 TotalRender START 1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705 00390eee-c26c-41da-a02d-556bb7fcac67 29.9 34
2 2018-11-08 07:41:45.459 04dc4e9647154250beeee51b866b0715000000 Saving Config START 1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705 00390eee-c26c-41da-a02d-556bb7fcac67 29.9 34
3 2018-11-08 07:41:32.460 04dc4e9647154250beeee51b866b0715000000 Render START 1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705 00390eee-c26c-41da-a02d-556bb7fcac67 25.8 32
4 2018-11-08 07:41:32.460 04dc4e9647154250beeee51b866b0715000000 Saving Config STOP 1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705 00390eee-c26c-41da-a02d-556bb7fcac67 25.8 32
5 2018-11-08 07:42:09.344 04dc4e9647154250beeee51b866b0715000000 Render STOP 1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705 00390eee-c26c-41da-a02d-556bb7fcac67 NA NA
数据
## create tibbles
## df1: GPU readings, one row per sample; rows are not in timestamp order
df1 <-
tibble::tribble(
~timestamp, ~hostname, ~gpuSerial, ~gpuUUID, ~powerDrawWatt, ~gpuTempC, ~gpuUtilPerc, ~gpuMemUtilPerc,
"2018-11-08T07:41:27.242Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 25.94, 32L, 0L, 0L,
"2018-11-08T07:41:29.259Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 25.84, 32L, 0L, 0L,
"2018-11-08T07:41:31.285Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 25.84, 32L, 0L, 0L,
"2018-11-08T07:41:33.301Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 25.94, 32L, 0L, 0L,
"2018-11-08T07:41:44.600Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 29.94, 34L, 0L, 0L,
"2018-11-08T07:42:11.500Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 21.94, 36L, 0L, 0L,
"2018-11-08T07:41:35.322Z", "04dc4e9647154250beeee51b866b0715000000", 323217056165, "GPU-a1119ee9-9cd1-919f-a479-b902142c717d", 25.84, 32L, 0L, 0L
) %>%
## convert the ISO-8601 strings to POSIXct in UTC; %OS keeps the fractional seconds
mutate( timestamp = as.POSIXct(timestamp, format = '%Y-%m-%dT%H:%M:%OSZ', tz = 'UTC') )
## df2: application events, one row per event
df2 <-
tibble::tribble(
~timestamp, ~hostname, ~eventName, ~eventType, ~jobId, ~taskId,
"2018-11-08T07:41:45.459Z", "04dc4e9647154250beeee51b866b0715000000", "TotalRender", "START", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
"2018-11-08T07:41:45.459Z", "04dc4e9647154250beeee51b866b0715000000", "Saving Config", "START", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
"2018-11-08T07:41:32.461Z", "04dc4e9647154250beeee51b866b0715000000", "Render", "START", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
"2018-11-08T07:41:32.461Z", "04dc4e9647154250beeee51b866b0715000000", "Saving Config", "STOP", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67",
"2018-11-08T07:42:09.344Z", "04dc4e9647154250beeee51b866b0715000000", "Render", "STOP", "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705", "00390eee-c26c-41da-a02d-556bb7fcac67"
) %>%
## convert the ISO-8601 strings to POSIXct in UTC; %OS keeps the fractional seconds
mutate( timestamp = as.POSIXct(timestamp, format = '%Y-%m-%dT%H:%M:%OSZ', tz = 'UTC') )
要按时间对数据进行舍入(汇总),可以这样做:
library(lubridate)
## Snap every reading to the nearest 10-second mark, then average all
## remaining columns within each (rounded timestamp, host, GPU) group.
df1 %>%
  mutate(timestamp = round_date(timestamp, unit = "10 seconds")) %>%
  group_by(timestamp, hostname, gpuSerial, gpuUUID) %>%
  summarise_all(.funs = mean)
汇总数据可能会产生多大影响的示例:
## A little over twelve hours of timestamps at 10 ms resolution.
sec_interval <- seq(
  as.POSIXct("2018-11-08 00:00:0.01", tz = "UTC"),
  as.POSIXct("2018-11-08 12:00:01", tz = "UTC"),
  by = 0.01
)
## Rounding to 10 seconds leaves only the distinct 10-second marks, which
## shows how much the aggregation shrinks the data.
sec_interval_rounded <- unique(lubridate::round_date(sec_interval, "10 seconds"))
> length(sec_interval)
[1] 4320100
> length(sec_interval_rounded)
[1] 4321