我有如下数据:
ex <- structure(list(timestamp = structure(c(1502480763.554, 1502480763.554,
1502480764.968, 1502480765.554, 1502480768.554, 1502480770.554,
1502480773.519, 1502480775.72, 1502480777.43, 1502480778.278,
1502480778.288, 1502480778.759, 1502480780.472, 1502480782.815,
1502480785.521, 1502480785.531, 1502480785.707, 1502480787.639,
1502480789.1, 1502480790.682, 1502480791.554, 1502480793.322,
1502480794.363, 1502480795.923, 1502480799.239, 1502480800.27,
1502480800.554, 1502480802.554, 1502480805.63, 1502480805.959,
1502480807.327, 1502480809.554, 1502480809.564, 1502480810.554,
1502480812.8, 1502480813.838, 1502480813.848, 1502480816.24,
1502480816.24, 1502480835.56, 1502480838.576, 1502480848.384,
1502480851.859, 1502480853.554, 1502480856.375, 1502480857.688,
1502480905.554, 1502480910.554, 1502480910.945, 1502480911.816
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), order = c(NA,
NA, 1L, 1L, 1L, 1L, 1L, 1L, NA, NA, 2L, 2L, 2L, 2L, NA, NA, NA,
3L, NA, 4L, 4L, 4L, 4L, 4L, NA, 5L, 5L, 5L, 6L, 6L, 6L, NA, NA,
NA, NA, NA, 7L, NA, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 10L,
10L), cat = c(0, 0, 1, 1, 1, 1, 1, 1, 1, 99, 99, 1, 1, 1, 99,
99, 21, 1, 1, 1, 94, 1, 1, 1, 1, 1, 1, 1, 94, 1, 1, 99, 99, 1,
61, 10, 3, 4, 4, 1, 1, 1, 1, 1, 1, 16, 1, 1, 13, 94), var1 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L,
0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L), var2 = c(NA, NA, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, NA, NA, 0.9,
0.9, 0.9, 0.9, NA, NA, NA, NA, NA, 5.3, 5.3, 5.3, 5.3, 5.3, NA,
8.6, 8.6, 8.6, 14.5, 14.5, 14.5, NA, NA, NA, NA, NA, 7.4, NA,
7.4, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 4.6, 4.6, -4.1, -4.1),
var3 = c(NA, NA, 35.8, 59.3, 51.3, 57.3, 77.5, 82.4, 41.6,
NA, NA, 66.8, 53, 77.1, NA, NA, 55.8, 81.4, 45.8, 37.9, NA,
38.5, 32, 72, 46.9, 76.4, 76.9, 88, NA, 11.7, 49.4, NA, NA,
64.1, NA, NA, NA, NA, NA, 72.5, 77.7, 83.3, 96.4, 83.3, 95.3,
NA, 69.8, 78.9, NA, NA), var4 = c(NA, NA, 26.6, 24, 9.7,
12.7, 21, 12.7, 9.7, NA, NA, 14, 20.3, 25.6, NA, NA, 18.6,
25.3, 15.7, 10.7, NA, 12.8, 8, 41.9, 12.8, 8.5, 10.2, 14.3,
NA, 19.3, 40, NA, NA, 1.2, NA, NA, NA, NA, NA, 10, 21.9,
19, 42, 11.8, 18.4, NA, 33.5, 3.7, NA, NA), var5 = c(NA,
NA, 2.8, 5.2, 2.3, 4.4, -0.9, 0.3, -0.8, NA, NA, 1.3, 1.5,
5.2, NA, NA, -0.7, -0.9, -0.3, 2.8, NA, 0.3, 1.8, 5.3, -0.9,
4.9, 0.9, 4.8, NA, 1.6, -0.8, NA, NA, -0.7, NA, NA, NA, NA,
NA, 0.4, 0.4, 2.2, 4.2, 1.5, -0.1, NA, 0.3, 1.8, NA, NA),
var6 = c(NA, NA, NA, NA, NA, TRUE, NA, NA, TRUE, NA, NA,
TRUE, TRUE, NA, NA, NA, NA, NA, TRUE, TRUE, NA, NA, NA, NA,
TRUE, TRUE, NA, NA, NA, NA, NA, NA, NA, TRUE, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -50L))
在列order
的相同值中,我需要为每行创建两个嵌套的子数据帧-一个在数据之前和之后在此行和之后。因此,让我们以order == 1
的数据块为例:
ex %>% filter(order == 1) %>% print()
# A tibble: 6 x 9
timestamp order cat var1 var2 var3 var4 var5 var6
<dttm> <int> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <lgl>
1 2017-08-11 19:46:04 1 1 1 2.5 35.8 26.6 2.8 NA
2 2017-08-11 19:46:05 1 1 1 2.5 59.3 24 5.20 NA
3 2017-08-11 19:46:08 1 1 1 2.5 51.3 9.7 2.3 NA
4 2017-08-11 19:46:10 1 1 1 2.5 57.3 12.7 4.40 TRUE
5 2017-08-11 19:46:13 1 1 1 2.5 77.5 21 -0.9 NA
6 2017-08-11 19:46:15 1 1 0 2.5 82.4 12.7 0.300 NA
我需要另外两个带有嵌套数据帧的列:data_before
和data_after
。对于第一行,data_before
将为空,而data_after
将包含所有行。对于第二行,data_before
仅包含第一行,data_after
将包含2至6的行。对于第三行,data_before
将包含前两行,而data_after
将包含前两行从3到6的行,依此类推...此类操作需要针对原始数据帧中的每个order
值执行。如何实现?
一个数据块(带有order == 1
)的预期输出为:
structure(list(order = c(1, 1, 1, 1, 1, 1), data_before = list(
structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame"),
structure(list(timestamp = structure(1502480764.968, class = c("POSIXct",
"POSIXt"), tzone = "UTC"), cat = 1, var1 = 1L, var2 = 2.5,
var3 = 35.8, var4 = 26.6, var5 = 2.8, var6 = NA), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -1L)), structure(list(
timestamp = structure(c(1502480764.968, 1502480765.554
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), cat = c(1,
1), var1 = c(1L, 1L), var2 = c(2.5, 2.5), var3 = c(35.8,
59.3), var4 = c(26.6, 24), var5 = c(2.8, 5.2), var6 = c(NA,
NA)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L)), structure(list(timestamp = structure(c(1502480764.968,
1502480765.554, 1502480768.554), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), cat = c(1, 1, 1), var1 = c(1L, 1L, 1L),
var2 = c(2.5, 2.5, 2.5), var3 = c(35.8, 59.3, 51.3),
var4 = c(26.6, 24, 9.7), var5 = c(2.8, 5.2, 2.3), var6 = c(NA,
NA, NA)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-3L)), structure(list(timestamp = structure(c(1502480764.968,
1502480765.554, 1502480768.554, 1502480770.554), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), cat = c(1, 1, 1, 1), var1 = c(1L,
1L, 1L, 1L), var2 = c(2.5, 2.5, 2.5, 2.5), var3 = c(35.8,
59.3, 51.3, 57.3), var4 = c(26.6, 24, 9.7, 12.7), var5 = c(2.8,
5.2, 2.3, 4.4), var6 = c(NA, NA, NA, TRUE)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -4L)), structure(list(
timestamp = structure(c(1502480764.968, 1502480765.554,
1502480768.554, 1502480770.554, 1502480773.519), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), cat = c(1, 1, 1, 1, 1), var1 = c(1L,
1L, 1L, 1L, 1L), var2 = c(2.5, 2.5, 2.5, 2.5, 2.5), var3 = c(35.8,
59.3, 51.3, 57.3, 77.5), var4 = c(26.6, 24, 9.7, 12.7,
21), var5 = c(2.8, 5.2, 2.3, 4.4, -0.9), var6 = c(NA,
NA, NA, TRUE, NA)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -5L))), data_after = list(structure(list(
timestamp = structure(c(1502480764.968, 1502480765.554, 1502480768.554,
1502480770.554, 1502480773.519, 1502480775.72), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), cat = c(1, 1, 1, 1, 1, 1), var1 = c(1L,
1L, 1L, 1L, 1L, 0L), var2 = c(2.5, 2.5, 2.5, 2.5, 2.5, 2.5
), var3 = c(35.8, 59.3, 51.3, 57.3, 77.5, 82.4), var4 = c(26.6,
24, 9.7, 12.7, 21, 12.7), var5 = c(2.8, 5.2, 2.3, 4.4, -0.9,
0.3), var6 = c(NA, NA, NA, TRUE, NA, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -6L)), structure(list(
timestamp = structure(c(1502480765.554, 1502480768.554, 1502480770.554,
1502480773.519, 1502480775.72), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), cat = c(1, 1, 1, 1, 1), var1 = c(1L, 1L,
1L, 1L, 0L), var2 = c(2.5, 2.5, 2.5, 2.5, 2.5), var3 = c(59.3,
51.3, 57.3, 77.5, 82.4), var4 = c(24, 9.7, 12.7, 21, 12.7
), var5 = c(5.2, 2.3, 4.4, -0.9, 0.3), var6 = c(NA, NA, TRUE,
NA, NA)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-5L)), structure(list(timestamp = structure(c(1502480768.554,
1502480770.554, 1502480773.519, 1502480775.72), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), cat = c(1, 1, 1, 1), var1 = c(1L,
1L, 1L, 0L), var2 = c(2.5, 2.5, 2.5, 2.5), var3 = c(51.3, 57.3,
77.5, 82.4), var4 = c(9.7, 12.7, 21, 12.7), var5 = c(2.3, 4.4,
-0.9, 0.3), var6 = c(NA, TRUE, NA, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -4L)), structure(list(
timestamp = structure(c(1502480770.554, 1502480773.519, 1502480775.72
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), cat = c(1,
1, 1), var1 = c(1L, 1L, 0L), var2 = c(2.5, 2.5, 2.5), var3 = c(57.3,
77.5, 82.4), var4 = c(12.7, 21, 12.7), var5 = c(4.4, -0.9,
0.3), var6 = c(TRUE, NA, NA)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -3L)), structure(list(timestamp = structure(c(1502480773.519,
1502480775.72), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
cat = c(1, 1), var1 = 1:0, var2 = c(2.5, 2.5), var3 = c(77.5,
82.4), var4 = c(21, 12.7), var5 = c(-0.9, 0.3), var6 = c(NA,
NA)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L)), structure(list(timestamp = structure(1502480775.72, class = c("POSIXct",
"POSIXt"), tzone = "UTC"), cat = 1, var1 = 0L, var2 = 2.5, var3 = 82.4,
var4 = 12.7, var5 = 0.3, var6 = NA), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -1L)))), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -6L))
答案 0 :(得分:1)
检查:
SELECT
*
FROM
(
SELECT
*
FROM
`example_table`
ORDER BY
ORDER_COLUMN
DESC
) AS TEMP_TABLE
GROUP BY GROUP_NAME
答案 1 :(得分:1)
或者这个:
ex.list <- lapply(split(ex, ex$order), function(x){
ex.x <- as.data.frame(do.call(rbind,
lapply(1:nrow(x), function(i){
c(x$order[i], ifelse(i==1, list(data.frame()), list(x[1:(i-1), ])), list(x[i:nrow(x), ]))
})
))
names(ex.x) <- c('order', 'data_before', 'data_after')
ex.x
})
编辑: 尝试对之前发布的代码进行更多说明:
# lapply() applies a function (input arg 2) to each element of a list (input arg 1)
# and returns a list of return values of the function applied on each input element
ex.list <- lapply(
# the split() function returns a list of data.frames, subsets of ex
# splitted by ex$order; these will be the input for the 1. lapply() call
split(ex, ex$order),
# the following function will be applied to each of these data.farmes
# to create the return values
function(x){ # 'x' will be a data.frame, subset ox 'ex' with one single value of ex$order
list.of.rows <- lapply(# we now loop over each row in the data.frame
# containing data with one single value of ex$order,
# 'i' is the row number
1:nrow(x),
# the functions will create 1 row for the resulting data.frame
function(i){
c(# the row is 1 vector containing the following 3 values
# the first column of the putput data.frame is the value of ex$order
x$order[i],
# the value for row i of data_before
ifelse(i==1, list(data.frame()), list(x[1:(i-1), ])), # for the first row we return an empty list, else the data.frame with previous (1:(i-1)) rows
# the values for row i of data_after
list(x[i:nrow(x), ]) # subset of rows as off row i
)
})
# now that we have a list (list.of.rows) that contains one row for the output data.frame
# we rbind these into one data.frame
ex.x <- as.data.frame(do.call(rbind, # do.call(rbind, ...) cobines elements of ... using rbind()
list.of.rows
))
names(ex.x) <- c('order', 'data_before', 'data_after') # give column names to the output data.frame
ex.x # define the return value of the function of the 1. lapply() call
})
答案 2 :(得分:0)
我们可以使用tidyverse
在order
上拆分,并为每个数据框创建两个新列data_before
和data_after
,其中将包含基于条件的数据框列表。
library(tidyverse)
ex %>%
group_split(order) %>%
map_dfr(. %>%
mutate(data_before = map(seq_len(nrow(.)), function(y) .[seq_len(y - 1), ]),
data_after = map(seq_len(nrow(.)), function(y)
if (y == nrow(.)) .[0,] else .[(y + 1):nrow(.), ]))) %>%
select(order, data_before, data_after)
# A tibble: 50 x 3
# order data_before data_after
# <int> <list> <list>
# 1 1 <tibble [0 × 9]> <tibble [5 × 9]>
# 2 1 <tibble [1 × 9]> <tibble [4 × 9]>
# 3 1 <tibble [2 × 9]> <tibble [3 × 9]>
# 4 1 <tibble [3 × 9]> <tibble [2 × 9]>
# 5 1 <tibble [4 × 9]> <tibble [1 × 9]>
# 6 1 <tibble [5 × 9]> <tibble [0 × 9]>
# 7 2 <tibble [0 × 9]> <tibble [3 × 9]>
# 8 2 <tibble [1 × 9]> <tibble [2 × 9]>
# 9 2 <tibble [2 × 9]> <tibble [1 × 9]>
#10 2 <tibble [3 × 9]> <tibble [0 × 9]>
# … with 40 more rows
这也可以通过以下方式在base R中翻译
do.call(rbind, lapply(split(ex, ex$order), function(x) {
x$data_before <- lapply(seq_len(nrow(x)), function(y) x[seq_len(y - 1), ])
x$data_after <- lapply(seq_len(nrow(x)), function(y)
if (y == nrow(x)) x[0,] else x[(y + 1):nrow(x), ])
x
}))