Question

我之前提过一个问题，询问如何基于 TimeStamp 对单个文件的数据取子集，并得到了很好的答案。现在我已将 29 个文件导入到一个 data.frame（l2）中，并给它们编了 1 到 29 的 id。我希望能够基于 data.frame m 中的间隔对 l2 中的数据取子集。

我的问题是：我需要根据 m 中的间隔、并按 l2$id 这一列来拆分 l2（因为各个实验并不同步）。

例如：对于 l2$id == 1 的所有 l2$SkinTemp、l2$RespirationRate 和 l2$HeartRate 值，我需要按 m$P1 来拆分；对于 l2$id == 2 的值，则按 m$P2 来拆分。

dput(head(l2))
structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1", 
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "2", 
"20", "21", "22", "23", "24", "25", "26", "27", "3", "4", "5", 
"6", "7", "8", "9"), class = "factor"), Time = c(0, 0, 0, 0, 
0, 0), SkinTemp = c(27.781, 27.78, 27.779, 27.779, 27.778, 27.777
), HeartRate = c(70, 70, 70, 70, 70, 70), RespirationRate = c(10, 
10, 10, 10, 10, 10)), .Names = c("id", "Time", "SkinTemp", "HeartRate", 
"RespirationRate"), row.names = c(NA, 6L), class = "data.frame")

我有一个包含时间戳的 data.frame（m），其中的间隔以秒为单位：

    dput(head(m))
structure(list(MARKER = c(NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_), P1 = c(18, 138, 438, 678, 798, 1278), P2 = c(1, 
121, 421, 541, 661, 1141), P3 = c(2, 122, 422, 542, 662, 1142
), P4 = c(70, 190, 490, 600, 730, 1170), P5 = c(76, 196, 496, 
616, 752, 1232), P6 = c(33, 153, 453, 595, 715, 1195), P7 = c(20, 
149, 449, 569, 777, 1257), P8 = c(100, 241, 541, 661, 819, 1319
), P9 = c(25, 145, 445, 583, 763, 1246), P10 = c(18, 141, 441, 
621, 801, 1281), P11 = c(70, 190, 490, 710, 830, 1310), P12 = c(35, 
155, 455, 635, 755, 1235), P13 = c(35, 155, 455, 575, 695, 1175
), P14 = c(37, 157, 517, 637, 774, 1254), P15 = c(18, 138, 378, 
498, 678, 1158), P16 = c(49, 169, 469, 589, 769, 1266), P17 = c(75, 
195, 520, 625, 805, 1295), P18 = c(20, 140, 440, 560, 740, 1227
), P19 = c(8, 144, 444, 564, 780, 1260), P20 = c(25, 147, 447, 
648, 768, 1248), P21 = c(47, 173, 467, 587, 707, 1187), P22 = c(28, 
148, 448, 568, 688, 1168), P23 = c(22, 142, 442, 562, 682, 1172
), P24 = c(52, 145, 452, 684, 804, 1284), P25 = c(11, 131, 431, 
618, 738, 1218), P26 = c(19, 139, 439, 619, 762, 1250), P27 = c(41, 
161, 465, 672, 792, 1272), P28 = c(63, 183, 487, 667, 787, 1267
), P29 = c(71, 195, 495, 675, 795, 1275), P30 = c(135, 255, 555, 
675, 795, 1275), P31 = c(561, 681, 981, 1101, 1303, 1701), P32 = c(15, 
135, 435, 555, 675, 1155), P33 = c(31, 151, 451, 571, 691, 1171
), P34 = c(10, 130, 430, 550, 670, 1150), P35 = c(35, 155, 455, 
695, 815, 1295)), .Names = c("MARKER", "P1", "P2", "P3", "P4", 
"P5", "P6", "P7", "P8", "P9", "P10", "P11", "P12", "P13", "P14", 
"P15", "P16", "P17", "P18", "P19", "P20", "P21", "P22", "P23", 
"P24", "P25", "P26", "P27", "P28", "P29", "P30", "P31", "P32", 
"P33", "P34", "P35"), row.names = c(NA, 6L), class = "data.frame")

如果我在一个文件上手动执行此操作

# Manual version for a single file: label every Time value with the m$P1
# interval it falls into, then split l2 by that label.
P1$Segment <- cut(x = l2$Time, breaks = c(-Inf, m$P1))
split(x = l2, f = P1$Segment)

Answer 1

Base R 解决方案

事先说明：你的数据似乎不够完整，不足以构成一个完全可重现的问题（例如，只有一个唯一的 id，m$MARKER 的所有值都是 NA，l2 中除一个变量外其余的值都不变）。我会创建一个结构相似的数据集，希望你可以把代码调整到自己的数据上。

# Reproducible stand-in data: two ids with five observations each, sharing
# the same Time grid; the three measurement columns are random draws.
set.seed(42)
n <- 10
l2 <- data.frame(
  id = rep(c(1L, 2L), each = 5),
  Time = rep(seq(from = 11, to = 99, by = 22), times = 2),
  SkinTemp = runif(n, 27.7, 27.9),
  HeartRate = sample.int(30, n, replace = TRUE) + 60,
  RespirationRate = sample.int(10, n, replace = TRUE) + 5
)
str(l2)
# Output as listed in the original answer (the random columns may differ
# depending on the R version's sampling algorithm):
# 'data.frame': 10 obs. of  5 variables:
#  $ id             : int  1 1 1 1 1 2 2 2 2 2
#  $ Time           : num  11 33 55 77 99 11 33 55 77 99
#  $ SkinTemp       : num  27.9 27.9 27.8 27.9 27.8 ...
#  $ HeartRate      : num  74 82 89 68 74 89 90 64 75 77
#  $ RespirationRate: num  15 7 15 15 6 11 9 15 10 14
# Example break points: a MARKER index plus one P<n> column of cut times
# per id.
m <- data.frame(
  MARKER = seq_len(3),
  P1 = c(18, 44, 135),
  P2 = c(1, 66, 105)
)
m
#   MARKER  P1  P2
# 1      1  18   1
# 2      2  44  66
# 3      3 135 105

分配Segment变量，按id分组：

# Add a Segment factor to l2, cutting each id's Time values at the break
# points stored in that id's own column of m, then stack the groups again.
l2a <- do.call(
  rbind,
  by(l2, l2$id, function(grp) {
    # The break column is named after the group's id: id 1 -> "P1", etc.
    break_col <- paste0("P", grp$id[1])
    grp$Segment <- cut(grp$Time, breaks = c(-Inf, m[, break_col]))
    grp
  })
)

（使用 do.call(rbind, ...) 是为了确保最终得到一个 data.frame；这并非严格必需，但似乎能让下一步更容易看清。）

现在进行拆分。如果你想按“两个或更多”变量拆分，第二个参数需要是一个列表。请注意 drop 参数：如果不指定（默认为 FALSE），那么在每个 id 组内都会得到整列所有可能的取值；这有时没问题，有时则不然。在本例中，如果不设置 drop = TRUE，列表中会出现若干空的（0 行）data.frame。（为了页面简洁，我删减了输出。）

# Split on id and Segment together; drop = TRUE leaves out id/Segment
# combinations that have no rows. (Output trimmed.)
str(split(l2a, f = with(l2a, list(id, Segment)), drop = TRUE))
# List of 5
#  $ 1.(-Inf,18]:'data.frame':  1 obs. of  6 variables:
#   ..$ id             : int 1
#   ..$ Time           : num 11
#   ..$ SkinTemp       : num 27.9
#   ..$ HeartRate      : num 74
#   ..$ RespirationRate: num 15
#   ..$ Segment        : Factor w/ 6 levels "(-Inf,18]","(18,44]",..: 1
#  $ 1.(18,44]  :'data.frame':  1 obs. of  6 variables:
#  $ 1.(44,135] :'data.frame':  3 obs. of  6 variables:
#  $ 2.(1,66]   :'data.frame':  3 obs. of  6 variables:
#  $ 2.(66,105] :'data.frame':  2 obs. of  6 variables:

关于 `cut` 返回的 `factor`

这完全属于个人偏好，但在做索引时，比起 `factor` 或 `character`，我更喜欢 `integer`。你可以用 `cut(..., labels = FALSE)` 得到 `integer`。请记住，落在切分范围之外的值会是 `NA`（这并不新鲜）。

“长”与“宽”标记

如果你的 data.frame m 确定是固定不变的，那么保持现状也能应付，但如果受试者更多，它肯定会变得非常“宽”。许多做数据的人更喜欢“长”格式。在这个人为构造的例子中，它不会给你带来太大的收获，但当你把所使用的数据结构正规化时（例如数据库、可变长度的分组等），使用“长”格式可能会让你受益。

library(tidyr)
# Reshape m to long form: one row per MARKER / P-column pair.
m2 <- gather(m, key = "id", value = "TimeCut", -MARKER)
# Strip the leading "P" so id holds just the number (still as text).
m2$id <- sub("^P", "", m2$id)
m2
#   MARKER id TimeCut
# 1      1  1      18
# 2      2  1      44
# 3      3  1     135
# 4      1  2       1
# 5      2  2      66
# 6      3  2     105

你的第一次拆分就像是：

# Same per-id segmentation as before, but reading the break points from the
# long-format table m2 instead of a per-id column of m.
l2b <- do.call(
  rbind,
  by(l2, l2$id, function(grp) {
    # Break points belonging to this group's id.
    id_breaks <- m2$TimeCut[which(m2$id == grp$id[1])]
    grp$Segment <- cut(grp$Time, breaks = c(-Inf, id_breaks))
    grp
  })
)

（这里并没有真正的改进，还记得我说过“如果你的数据变得更大/更多变/……”吗。）

举个例子说明我为什么会这样做：我经常使用别人以数据库形式提供的数据。这可能是正式的数据库，如 SQL Server 或 PostgreSQL，也可能更简单，如 SQLite。无论哪种情况，列数可变的表（例如你的 m）对数据库来说都很麻烦：虽然你可以添加列，但这肯定不是推荐的做法。懂一些 SQL 以及与它来回交换数据的方法，是灵活的数据科学工具箱中的必备技能。（抱歉，我就不在这里说教了。）

`dplyr`

有些人更喜欢 tidyverse 中各种工具带来的可读性。下面是同样的做法，只是换成了 dplyr 的写法：

library(dplyr)
l2 %>%
  group_by(id) %>%
  # Alternative left disabled by the author; presumably because inside
  # subset() both `id`s would resolve to m2's own column:
  # mutate( Segment = cut(Time, c(-Inf, subset(m2, id == id[1])$TimeCut))) %>%
  mutate(
    # Cut each id's Time values at that id's break points from m2.
    Segment = cut(Time, breaks = c(-Inf, m2$TimeCut[m2$id == id[1]]))
  ) %>%
  group_by(id, Segment) %>%
  do({
    dat <- .
    # do something with dat
    dat
  })
# Source: local data frame [10 x 6]
# Groups: id, Segment [5]
#       id  Time SkinTemp HeartRate RespirationRate   Segment
#    <int> <dbl>    <dbl>     <dbl>           <dbl>    <fctr>
# 1      1    11 27.88296        74              15 (-Inf,18]
# 2      1    33 27.88742        82               7   (18,44]
# 3      1    55 27.75723        89              15  (44,135]
# 4      1    77 27.86609        68              15  (44,135]
# 5      1    99 27.82835        74               6  (44,135]
# 6      2    11 27.80382        89              11    (1,66]
# 7      2    33 27.84732        90               9    (1,66]
# 8      2    55 27.72693        64              15    (1,66]
# 9      2    77 27.83140        75              10  (66,105]
# 10     2    99 27.84101        77              14  (66,105]

虽然在显示的结果中看不出已经拆分，但请注意 Groups: id, Segment [5]，它表示作用于该数据的大多数 dplyr 函数都会逐组执行。如果你把 # do something with dat 替换为 browser() 再运行，就可以一次查看一个组，了解 do() 块的工作方式。

请注意，您必须返回data.frame（使用do({...})）或将其分配给变量（使用do(newvar = {...})）。后一种选择可能会建议使用unnest()，具体取决于您的工作。

（顺便说一下：dplyr 也能很好地配合数据库使用。如果你读过 Hadley 的任何书籍、教程、vignette 或其他文档，你大概见过他推荐“长”格式优于“宽”格式，所以这几乎是被“强制执行”的。）

修改

有评论问，是否可以把每一行与它所匹配的 m 中的行（MARKER）关联起来。下面是对 dplyr 解决方案的修改，增加了行索引以及区间的下限/上限。

# For every row of l2, record which interval of its own id's break points
# the Time value falls into (Segment, as an integer index) together with
# that interval's lower and upper bound.
l2 %>%
  group_by(id) %>%
  mutate(
    # labels = FALSE makes cut() return the interval index, not a factor.
    Segment = cut(
      Time,
      c(-Inf, m2$TimeCut[m2[["id"]] == id[1]]),
      labels = FALSE
    ),
    # Look the bounds up in this id's break points only; indexing the whole
    # m2$TimeCut would look every id's Segment up in the first id's breaks.
    TimeLower = c(-Inf, m2$TimeCut[m2[["id"]] == id[1]])[Segment],
    TimeUpper = c(-Inf, m2$TimeCut[m2[["id"]] == id[1]])[1 + Segment]
  ) %>%
  group_by(id, Segment) %>%
  do({
    dat <- .
    # do something with dat
    dat
  })
# Expected result (id 2 rows use id 2's break points 1, 66, 105):
# Source: local data frame [10 x 8]
# Groups: id, Segment [5]
#       id  Time SkinTemp HeartRate RespirationRate Segment TimeLower TimeUpper
#    <int> <dbl>    <dbl>     <dbl>           <dbl>   <int>     <dbl>     <dbl>
# 1      1    11 27.88296        74              15       1      -Inf        18
# 2      1    33 27.88742        82               7       2        18        44
# 3      1    55 27.75723        89              15       3        44       135
# 4      1    77 27.86609        68              15       3        44       135
# 5      1    99 27.82835        74               6       3        44       135
# 6      2    11 27.80382        89              11       2         1        66
# 7      2    33 27.84732        90               9       2         1        66
# 8      2    55 27.72693        64              15       2         1        66
# 9      2    77 27.83140        75              10       3        66       105
# 10     2    99 27.84101        77              14       3        66       105

通过R中的时间戳或间隔和id对数据进行子集

1 个答案:

Base R 解决方案

`cut`
这完全属于个人偏好，但在做索引时，比起 `factor` 或 `character`，我更喜欢 `integer`。你可以用 `cut(..., labels = FALSE)` 得到 `integer`。请记住，落在切分范围之外的值会是 `NA`（这并不新鲜）。

“长”与“宽”标记

`dplyr`

通过R中的时间戳或间隔和id对数据进行子集

1 个答案:

Base R 解决方案

关于 cut：这完全属于个人偏好，但在做索引时，比起 factor 或 character，我更喜欢 integer。你可以用 cut(..., labels = FALSE) 得到 integer。请记住，落在切分范围之外的值会是 NA（这并不新鲜）。

“长”与“宽”标记

dplyr

`cut`
这完全属于个人偏好，但在做索引时，比起 `factor` 或 `character`，我更喜欢 `integer`。你可以用 `cut(..., labels = FALSE)` 得到 `integer`。请记住，落在切分范围之外的值会是 `NA`（这并不新鲜）。

`dplyr`