我有一个包含3个变量的数据集:ID、Date 和 Years_service。像这样:
# Example data for the question: a 150-row table with the columns ID, Date,
# Years_service and month_1 (see .Names and row.names at the end of the call).
# Date is stored as POSIXct in UTC, given as seconds since the epoch.
# The "data.table" class is attached directly through structure() (dput-style
# output) rather than by calling data.table().
# NOTE(review): a table built this way has no internal self-reference, so the
# first `:=` on it may emit a data.table warning; setDT(data) would presumably
# avoid that -- confirm.
library(data.table)
data <- structure(list(ID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), Date = structure(c(1230768000,
1233446400, 1235865600, 1238544000, 1241136000, 1243814400, 1246406400,
1249084800, 1251763200, 1254355200, 1257033600, 1259625600, 1262304000,
1264982400, 1267401600, 1270080000, 1272672000, 1275350400, 1277942400,
1280620800, 1283299200, 1285891200, 1288569600, 1291161600, 1293840000,
1296518400, 1298937600, 1301616000, 1304208000, 1306886400, 1309478400,
1312156800, 1314835200, 1317427200, 1320105600, 1322697600, 1325376000,
1328054400, 1330560000, 1333238400, 1335830400, 1338508800, 1341100800,
1343779200, 1346457600, 1349049600, 1351728000, 1354320000, 1356998400,
1359676800, 1362096000, 1364774400, 1367366400, 1370044800, 1372636800,
1375315200, 1377993600, 1380585600, 1383264000, 1385856000, 1388534400,
1391212800, 1393632000, 1396310400, 1398902400, 1401580800, 1404172800,
1406851200, 1409529600, 1412121600, 1414800000, 1417392000, 1420070400,
1422748800, 1425168000, 1427846400, 1430438400, 1433116800, 1435708800,
1438387200, 1441065600, 1443657600, 1446336000, 1448928000, 1451606400,
1454284800, 1456790400, 1459468800, 1462060800, 1464739200, 1467331200,
1470009600, 1472688000, 1475280000, 1330560000, 1333238400, 1335830400,
1338508800, 1341100800, 1343779200, 1346457600, 1349049600, 1351728000,
1354320000, 1356998400, 1359676800, 1362096000, 1364774400, 1367366400,
1370044800, 1372636800, 1375315200, 1377993600, 1380585600, 1383264000,
1385856000, 1388534400, 1391212800, 1393632000, 1396310400, 1398902400,
1401580800, 1404172800, 1406851200, 1409529600, 1412121600, 1414800000,
1417392000, 1420070400, 1422748800, 1425168000, 1427846400, 1430438400,
1433116800, 1435708800, 1438387200, 1441065600, 1443657600, 1446336000,
1448928000, 1451606400, 1454284800, 1456790400, 1459468800, 1462060800,
1464739200, 1467331200, 1470009600, 1472688000, 1475280000), class =
c("POSIXct",
"POSIXt"), tzone = "UTC"), Years_service = c(19, 19, 19, 19,
19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21,
21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22,
22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 8, 8, 8, 8, 8, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13),
month_1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)), .Names = c("ID", "Date",
"Years_service", "month_1"), row.names = c(NA, -150L), class =
c("data.table",
"data.frame"))
我想要一个新变量:对每个ID,在服务年限(Years_service)最大的那些行中,取月份(month_1)最小的那一行所对应的日期。像这样:
ID Date Years_service Date_1
1: 1 2009-01-01 19 2016-06-01
2: 1 2009-02-01 19 2016-06-01
3: 1 2009-03-01 19 2016-06-01
4: 1 2009-04-01 19 2016-06-01
5: 1 2009-05-01 19 2016-06-01
---
146: 2 2016-06-01 12 2016-08-01
147: 2 2016-07-01 12 2016-08-01
148: 2 2016-08-01 13 2016-08-01
149: 2 2016-09-01 13 2016-08-01
150: 2 2016-10-01 13 2016-08-01
我想要的输出是 Date_1 这一列。
我尝试过:
# Asker's attempt; it does not produce the wanted column. which.max() and
# which.min() each return one position, and `&` collapses those two non-zero
# integers into a single logical TRUE, so Date[TRUE] selects every Date in the
# group and Date_1 ends up as a copy of Date.
data[,Date_1 := Date[which.max(Years_service) & which.min(month_1)], by = ID]
但是这样行不通。我该如何实现?
答案 0 :(得分:1)
一个选择是:按 'ID' 分组,获取 'Years_service' 等于最大值(max)的那些行的行索引(.I);然后利用这些行,按 'ID' 分组取出 'month_1' 最小(which.min)处所对应的 'Date';最后以 'ID' 列为键(on)与原始数据连接,创建 'Date_1' 列:
# Global row numbers of the rows that hold each ID's highest Years_service.
i1 <- data[, .I[Years_service == max(Years_service)], by = ID][["V1"]]
# Among those rows, take per ID the Date at the smallest month_1, then write
# it onto every row of the matching ID with an update join.
data[
  data[i1, .(V1 = Date[which.min(month_1)]), by = ID],
  Date_1 := i.V1,
  on = "ID"
]
data
# ID Date Years_service month_1 Date_1
# 1: 1 2009-01-01 19 1 2016-06-01
# 2: 1 2009-02-01 19 2 2016-06-01
# 3: 1 2009-03-01 19 3 2016-06-01
# 4: 1 2009-04-01 19 4 2016-06-01
# 5: 1 2009-05-01 19 5 2016-06-01
# ---
#146: 2 2016-06-01 12 6 2016-08-01
#147: 2 2016-07-01 12 7 2016-08-01
#148: 2 2016-08-01 13 8 2016-08-01
#149: 2 2016-09-01 13 9 2016-08-01
#150: 2 2016-10-01 13 10 2016-08-01
或者从 data.table 的子集(.SD)中提取与最小 'month_1' 相对应的 'Date':
# Per ID: locate the rows at the maximum Years_service, then take the Date
# sitting at the smallest month_1 among them. The single resulting value is
# recycled over all rows of the group.
data[, Date_1 := {
  top <- which(Years_service == max(Years_service))
  Date[top][which.min(month_1[top])]
}, by = ID]
另一种选择是先对行排序(order),再把按 'ID' 分组后的第一个(first)'Date' 赋给 'Date_1':
# Visit rows with the highest Years_service first (ties broken by the smallest
# month_1) and assign each ID's leading Date to all of its rows.
data[
  order(-Years_service, month_1),
  Date_1 := Date[1L],
  by = ID
]
或使用tidyverse
library(tidyverse)
# For each ID, take the Date on the row with the highest Years_service,
# breaking ties by the smallest month_1.
# The ordering is done inside mutate() instead of with arrange(): arrange()
# ignores the grouping by default and would re-sort the whole table, so the
# result would no longer be in the original row order shown in the requested
# output. ungroup() drops the grouping so later verbs are not silently
# applied per ID.
data %>%
  group_by(ID) %>%
  mutate(Date_1 = Date[order(desc(Years_service), month_1)][1]) %>%
  ungroup()