先为下面的大段数据致歉——只是想完整展示数据(包含 dput() 输出)。总的来说,这篇帖子并不算太长。
我正在努力寻找一种更快速的方法来计算数据帧中列的值。我目前有以下〜40000行的数据帧,我必须(我现在)逐行循环:
# ignore row numbers being multiples of 5
head(mydata, 50)
iduser node_id insert_timestamp last_10_timestamps
5 175094 171078 2018-02-11 18:33:38 0
10 175094 171078 2018-02-22 18:33:48 0
15 175094 171078 2018-02-26 18:33:47 0
20 175094 171078 2018-03-02 18:33:51 0
25 175094 171078 2018-03-07 18:33:56 0
30 175094 171080 2018-03-15 00:47:28 0
35 175094 171080 2018-04-07 00:46:23 0
40 175094 171080 2018-04-15 00:46:03 0
45 175094 171080 2018-04-21 00:46:00 0
50 175094 171080 2018-04-29 00:46:01 0
55 563240 171080 2017-12-08 04:44:57 0
60 563240 171078 2017-12-10 21:32:44 0
65 563240 171078 2017-12-13 21:32:58 0
70 563240 171080 2017-12-16 04:57:35 0
75 563240 171078 2017-12-18 21:33:04 0
80 563240 171080 2017-12-21 04:47:52 0
85 563240 171078 2017-12-23 21:32:49 0
90 563240 171080 2017-12-26 04:47:24 0
95 563240 171078 2017-12-28 21:33:11 0
100 563240 171080 2017-12-31 04:44:59 0
105 563240 171078 2018-01-02 21:32:58 0
110 563240 171080 2018-01-05 04:59:26 0
115 563240 171078 2018-01-07 21:33:12 0
120 563240 171080 2018-01-10 04:45:18 0
125 563240 171080 2018-01-13 04:45:21 0
130 563240 171078 2018-01-15 21:33:23 0
135 563240 171078 2018-01-18 21:33:03 0
140 563240 171080 2018-01-21 05:06:57 0
145 563240 171080 2018-01-24 04:45:26 0
150 563240 171078 2018-01-26 21:33:19 0
155 563240 171080 2018-01-29 05:20:39 0
160 563240 171078 2018-01-31 21:34:33 0
165 563240 171080 2018-02-03 05:06:58 0
170 563240 171078 2018-02-05 21:34:40 0
175 563240 171080 2018-02-08 05:38:11 0
180 563240 171078 2018-02-10 21:34:28 0
185 563240 171080 2018-02-13 04:45:54 0
190 563240 171080 2018-04-03 03:47:03 0
195 563240 171080 2018-04-08 03:46:41 0
200 563240 171080 2018-04-13 03:47:31 0
205 563240 171080 2018-04-18 03:47:28 0
210 563240 171080 2018-04-23 03:47:02 0
215 563240 171080 2018-04-29 03:46:55 0
220 1018597 171080 2017-10-11 03:45:35 0
225 1018597 171078 2017-10-13 20:33:16 0
230 1018597 171080 2017-10-16 03:45:30 0
235 1018597 171078 2017-10-18 20:33:15 0
240 1020046 171078 2017-10-12 18:32:56 0
245 1020046 171078 2017-10-15 18:32:55 0
250 1020046 171078 2017-10-18 18:32:53 0
# and the dput to reconstruct
dput(mydata)
structure(list(iduser = c(175094L, 175094L, 175094L, 175094L,
175094L, 175094L, 175094L, 175094L, 175094L, 175094L, 563240L,
563240L, 563240L, 563240L, 563240L, 563240L, 563240L, 563240L,
563240L, 563240L, 563240L, 563240L, 563240L, 563240L, 563240L,
563240L, 563240L, 563240L, 563240L, 563240L, 563240L, 563240L,
563240L, 563240L, 563240L, 563240L, 563240L, 563240L, 563240L,
563240L, 563240L, 563240L, 563240L, 1018597L, 1018597L, 1018597L,
1018597L, 1020046L, 1020046L, 1020046L), node_id = c(171078L,
171078L, 171078L, 171078L, 171078L, 171080L, 171080L, 171080L,
171080L, 171080L, 171080L, 171078L, 171078L, 171080L, 171078L,
171080L, 171078L, 171080L, 171078L, 171080L, 171078L, 171080L,
171078L, 171080L, 171080L, 171078L, 171078L, 171080L, 171080L,
171078L, 171080L, 171078L, 171080L, 171078L, 171080L, 171078L,
171080L, 171080L, 171080L, 171080L, 171080L, 171080L, 171080L,
171080L, 171078L, 171080L, 171078L, 171078L, 171078L, 171078L
), insert_timestamp = structure(c(1518374018, 1519324428, 1519670027,
1520015631, 1520447636, 1521074848, 1523061983, 1523753163, 1524271560,
1524962761, 1512708297, 1512941564, 1513200778, 1513400255, 1513632784,
1513831672, 1514064769, 1514263644, 1514496791, 1514695499, 1514928778,
1515128366, 1515360792, 1515559518, 1515818721, 1516052003, 1516311183,
1516511217, 1516769126, 1517002399, 1517203239, 1517434473, 1517634418,
1517866480, 1518068291, 1518298468, 1518497154, 1522727223, 1523159201,
1523591251, 1524023248, 1524455222, 1524973615, 1507693535, 1507926796,
1508125530, 1508358795, 1507833176, 1508092375, 1508351573), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), last_10_timestamps = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0)), .Names = c("iduser", "node_id", "insert_timestamp",
"last_10_timestamps"), row.names = c(5L, 10L, 15L, 20L, 25L,
30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L,
95L, 100L, 105L, 110L, 115L, 120L, 125L, 130L, 135L, 140L, 145L,
150L, 155L, 160L, 165L, 170L, 175L, 180L, 185L, 190L, 195L, 200L,
205L, 210L, 215L, 220L, 225L, 230L, 235L, 240L, 245L, 250L), class = "data.frame")
last_10_timestamps 是根据第二个数据帧计算出的指标。要计算任意一行的 last_10_timestamps,我必须按 iduser、node_id 和时间戳这三个条件同时过滤第二个数据帧(第二个数据帧包含这些列,稍后展示)。下面我先展示第二个数据帧,然后给出我的慢速循环,它也能说明我想做的事情。
head(mydata2, 50)
iduser meal_type log_meal_GMT log_meal_hr
1 175094 "snack" 2018-02-06 00:12:33 19
2 175094 "snack" 2018-02-06 00:57:41 19
3 175094 "breakfast" 2018-02-06 12:19:19 7
4 175094 "lunch" 2018-02-06 20:25:18 15
5 175094 "dinner" 2018-02-06 23:48:35 18
6 175094 "breakfast" 2018-02-08 23:09:42 18
7 175094 "lunch" 2018-02-07 18:57:46 13
8 175094 "dinner" 2018-02-07 23:47:09 18
9 175094 "breakfast" 2018-02-03 16:41:20 11
10 175094 "lunch" 2018-02-03 18:56:16 13
11 175094 "dinner" 2018-02-03 23:33:15 18
12 175094 "breakfast" 2018-02-08 14:02:34 9
13 175094 "lunch" 2018-02-08 23:08:04 18
14 175094 "dinner" 2018-02-08 23:09:13 18
15 175094 "snack" 2018-02-09 03:16:02 22
16 175094 "breakfast" 2018-02-09 13:53:57 8
17 175094 "lunch" 2018-02-10 02:29:40 21
18 175094 "snack" 2018-02-05 01:37:46 20
19 175094 "breakfast" 2018-02-05 13:19:42 8
20 175094 "snack" 2018-02-05 13:55:06 8
21 175094 "lunch" 2018-02-05 16:24:44 11
22 175094 "dinner" 2018-02-05 20:58:21 15
23 175094 "dinner" 2018-02-05 20:58:41 15
24 175094 "snack" 2018-02-04 04:12:42 23
25 175094 "breakfast" 2018-02-04 13:17:59 8
dput(mydata2)
structure(list(iduser = c(175094L, 175094L, 175094L, 175094L,
175094L, 175094L, 175094L, 175094L, 175094L, 175094L, 175094L,
175094L, 175094L, 175094L, 175094L, 175094L, 175094L, 175094L,
175094L, 175094L, 175094L, 175094L, 175094L, 175094L, 175094L
), meal_type = c("\"snack\"", "\"snack\"", "\"breakfast\"", "\"lunch\"",
"\"dinner\"", "\"breakfast\"", "\"lunch\"", "\"dinner\"", "\"breakfast\"",
"\"lunch\"", "\"dinner\"", "\"breakfast\"", "\"lunch\"", "\"dinner\"",
"\"snack\"", "\"breakfast\"", "\"lunch\"", "\"snack\"", "\"breakfast\"",
"\"snack\"", "\"lunch\"", "\"dinner\"", "\"dinner\"", "\"snack\"",
"\"breakfast\""), log_meal_GMT = c("2018-02-06 00:12:33", "2018-02-06 00:57:41",
"2018-02-06 12:19:19", "2018-02-06 20:25:18", "2018-02-06 23:48:35",
"2018-02-08 23:09:42", "2018-02-07 18:57:46", "2018-02-07 23:47:09",
"2018-02-03 16:41:20", "2018-02-03 18:56:16", "2018-02-03 23:33:15",
"2018-02-08 14:02:34", "2018-02-08 23:08:04", "2018-02-08 23:09:13",
"2018-02-09 03:16:02", "2018-02-09 13:53:57", "2018-02-10 02:29:40",
"2018-02-05 01:37:46", "2018-02-05 13:19:42", "2018-02-05 13:55:06",
"2018-02-05 16:24:44", "2018-02-05 20:58:21", "2018-02-05 20:58:41",
"2018-02-04 04:12:42", "2018-02-04 13:17:59"), log_meal_hr = c(19,
19, 7, 15, 18, 18, 13, 18, 11, 13, 18, 9, 18, 18, 22, 8, 21,
20, 8, 8, 11, 15, 15, 23, 8)), .Names = c("iduser", "meal_type",
"log_meal_GMT", "log_meal_hr"), row.names = c(NA, 25L), class = "data.frame")
最后,我有以下 for 循环,它运行得非常慢。我在代码中循环遍历一个约 4 万行的数据帧,并且在每次循环迭代中,都要过滤一个本身有 10 万行的数据帧:
# nrow(mydata) is nearly 40,000
# For each row of mydata whose node_id is 171078, compute the mean of the
# last 10 "lunch" log_meal_hr values logged by the same user strictly
# before that row's insert_timestamp. Rows with any other node_id get NA.
#
# Fixes over the original loop:
#  - lastTenTimes was only assigned inside the if-branch, but
#    mean(lastTenTimes) ran for EVERY row, so non-matching rows silently
#    reused a stale value from an earlier iteration (and the first row
#    would error if it did not match). Now the assignment happens only
#    in the matching branch and other rows stay NA.
#  - seq_len() instead of 1:nrow() (safe for zero-row data frames),
#    <- instead of =, and the Sys.time() timing scaffolding removed.
mydata$last_10_timestamps <- NA_real_
for (i in seq_len(nrow(mydata))) {
  this_user <- mydata$iduser[i]
  this_time <- mydata$insert_timestamp[i]
  this_node <- mydata$node_id[i]
  if (this_node == 171078) {
    # One full scan of mydata2 per matching row — still the hot spot
    # (~2s per scan on the 100K-row frame); the data.table join below
    # is the real cure, this just makes the loop correct.
    lastTenTimes <- tail(
      mydata2$log_meal_hr[mydata2$iduser == this_user &
                            mydata2$log_meal_GMT < this_time &
                            mydata2$meal_type == "\"lunch\""],
      10
    )
    # fill value for timestamps at correct (ith) row
    mydata$last_10_timestamps[i] <- mean(lastTenTimes)
  }
}
任何有关如何提高速度的想法都会受到赞赏,谢谢!
编辑:我想提前说明,对于一个 10 万行的数据帧,只分享其中 25-50 行可能不足以完全反映数据的特点。如有需要,我很乐意分享更多。

答案 0:(得分:2)
查看 for 循环以及运行 for 循环之后的 mydata,我怀疑其输出并不是你想要的。mydata 的第 11 行可以很好地说明这一点:该行的 iduser 在 mydata2 中并不存在,但仍然为它计算出了一个值。for 循环中另一个可能的问题是,对 node_id 不等于 171078 的行也计算了值。
可能的解决方案是使用 data.table -package的连接功能。在以下解决方案中,我仅计算满足以下四个条件的行的值:
mydata$iduser == mydata2$iduser
mydata$node_id == 171078
mydata2$log_meal_GMT < mydata$insert_timestamp
mydata2$meal_type == '"lunch"'
代码:
# Convert 'log_meal_GMT' to a date-time. The column name and the data say
# these stamps are GMT, and mydata$insert_timestamp carries tzone = "GMT",
# so parse with tz = "GMT" explicitly: the non-equi join below compares
# the underlying epoch seconds, and the original call (no tz argument)
# parsed in the session's LOCAL timezone, silently shifting every
# log_meal_GMT < insert_timestamp comparison by the UTC offset.
mydata2$log_meal_GMT <- as.POSIXct(mydata2$log_meal_GMT, tz = "GMT")
# load the 'data.table'-package and convert the data.frames to data.tables
# (setDT converts by reference — no copy of the 40K/100K-row frames)
library(data.table)
setDT(mydata)
setDT(mydata2)
# use several nested joins to get the result
# Reading inside-out:
#  1. mydata2[meal_type == '"lunch"']        — keep only lunch logs
#     (the literal value includes embedded quotes, hence '"lunch"').
#  2. [...][mydata[node_id == 171078], on = .(iduser, log_meal_GMT <
#     insert_timestamp), nomatch = 0, allow.cartesian = TRUE]
#     — non-equi inner join: every lunch log strictly BEFORE each
#     171078-row's timestamp, per user; allow.cartesian permits the
#     one-row-to-many-logs fan-out, nomatch = 0 drops rows with no logs.
#  3. [, .(last_10_ts = mean(tail(log_meal_hr, 10))), by = .(iduser,
#     log_meal_GMT)] — per (user, insert time) take the mean of the last
#     10 matching log hours. NOTE(review): after the non-equi join the
#     log_meal_GMT column holds the insert_timestamp bound, so this
#     grouping is effectively by (iduser, insert_timestamp).
#  4. Outer mydata[...] joins those means back and last_10_ts :=
#     i.last_10_ts adds them by reference (i. = the joined table);
#     unmatched rows (wrong node_id / no prior lunches) stay NA.
#  5. Trailing [] forces the result to print.
mydata[mydata2[meal_type == '"lunch"'
][mydata[node_id == 171078]
, on = .(iduser, log_meal_GMT < insert_timestamp)
, nomatch = 0
, allow.cartesian = TRUE
][, .(last_10_ts = mean(tail(log_meal_hr, 10)))
, by = .(iduser, log_meal_GMT)]
, on = .(iduser, insert_timestamp = log_meal_GMT)
, last_10_ts := i.last_10_ts][]
给出(我还包括for循环的输出):
iduser node_id insert_timestamp last_10_timestamps last_10_ts 1: 175094 171078 2018-02-11 18:33:38 15.16667 15.16667 2: 175094 171078 2018-02-22 18:33:48 15.16667 15.16667 3: 175094 171078 2018-02-26 18:33:47 15.16667 15.16667 4: 175094 171078 2018-03-02 18:33:51 15.16667 15.16667 5: 175094 171078 2018-03-07 18:33:56 15.16667 15.16667 6: 175094 171080 2018-03-15 00:47:28 15.16667 NA 7: 175094 171080 2018-04-07 00:46:23 15.16667 NA 8: 175094 171080 2018-04-15 00:46:03 15.16667 NA 9: 175094 171080 2018-04-21 00:46:00 15.16667 NA 10: 175094 171080 2018-04-29 00:46:01 15.16667 NA 11: 563240 171080 2017-12-08 04:44:57 15.16667 NA 12: 563240 171078 2017-12-10 21:32:44 NaN NA 13: 563240 171078 2017-12-13 21:32:58 NaN NA 14: 563240 171080 2017-12-16 04:57:35 NaN NA 15: 563240 171078 2017-12-18 21:33:04 NaN NA 16: 563240 171080 2017-12-21 04:47:52 NaN NA 17: 563240 171078 2017-12-23 21:32:49 NaN NA 18: 563240 171080 2017-12-26 04:47:24 NaN NA 19: 563240 171078 2017-12-28 21:33:11 NaN NA 20: 563240 171080 2017-12-31 04:44:59 NaN NA 21: 563240 171078 2018-01-02 21:32:58 NaN NA 22: 563240 171080 2018-01-05 04:59:26 NaN NA 23: 563240 171078 2018-01-07 21:33:12 NaN NA 24: 563240 171080 2018-01-10 04:45:18 NaN NA 25: 563240 171080 2018-01-13 04:45:21 NaN NA 26: 563240 171078 2018-01-15 21:33:23 NaN NA 27: 563240 171078 2018-01-18 21:33:03 NaN NA 28: 563240 171080 2018-01-21 05:06:57 NaN NA 29: 563240 171080 2018-01-24 04:45:26 NaN NA 30: 563240 171078 2018-01-26 21:33:19 NaN NA 31: 563240 171080 2018-01-29 05:20:39 NaN NA 32: 563240 171078 2018-01-31 21:34:33 NaN NA 33: 563240 171080 2018-02-03 05:06:58 NaN NA 34: 563240 171078 2018-02-05 21:34:40 NaN NA 35: 563240 171080 2018-02-08 05:38:11 NaN NA 36: 563240 171078 2018-02-10 21:34:28 NaN NA 37: 563240 171080 2018-02-13 04:45:54 NaN NA 38: 563240 171080 2018-04-03 03:47:03 NaN NA 39: 563240 171080 2018-04-08 03:46:41 NaN NA 40: 563240 171080 2018-04-13 03:47:31 NaN NA 41: 563240 171080 2018-04-18 03:47:28 NaN NA 42: 563240 
171080 2018-04-23 03:47:02 NaN NA 43: 563240 171080 2018-04-29 03:46:55 NaN NA 44: 1018597 171080 2017-10-11 03:45:35 NaN NA 45: 1018597 171078 2017-10-13 20:33:16 NaN NA 46: 1018597 171080 2017-10-16 03:45:30 NaN NA 47: 1018597 171078 2017-10-18 20:33:15 NaN NA 48: 1020046 171078 2017-10-12 18:32:56 NaN NA 49: 1020046 171078 2017-10-15 18:32:55 NaN NA 50: 1020046 171078 2017-10-18 18:32:53 NaN NA
如果您不想按 node_id 进行过滤,则可以使用:
# Same nested-join approach as above, but WITHOUT the node_id == 171078
# filter on mydata: the non-equi join now runs against ALL mydata rows,
# so every row whose user has prior '"lunch"' logs gets a last_10_ts
# value regardless of node_id.
#  - inner:  lunch-only mydata2, non-equi inner join (log_meal_GMT <
#    insert_timestamp) per iduser, cartesian fan-out allowed.
#  - middle: mean of the last 10 log_meal_hr per (iduser, joined bound).
#  - outer:  join back on (iduser, insert_timestamp) and assign
#    last_10_ts by reference; trailing [] prints the result.
mydata[mydata2[meal_type == '"lunch"'
][mydata
, on = .(iduser, log_meal_GMT < insert_timestamp)
, nomatch = 0
, allow.cartesian = TRUE
][, .(last_10_ts = mean(tail(log_meal_hr, 10)))
, by = .(iduser, log_meal_GMT)]
, on = .(iduser, insert_timestamp = log_meal_GMT)
, last_10_ts := i.last_10_ts][]
给出:
iduser node_id insert_timestamp last_10_timestamps last_10_ts 1: 175094 171078 2018-02-11 18:33:38 15.16667 15.16667 2: 175094 171078 2018-02-22 18:33:48 15.16667 15.16667 3: 175094 171078 2018-02-26 18:33:47 15.16667 15.16667 4: 175094 171078 2018-03-02 18:33:51 15.16667 15.16667 5: 175094 171078 2018-03-07 18:33:56 15.16667 15.16667 6: 175094 171080 2018-03-15 00:47:28 15.16667 15.16667 7: 175094 171080 2018-04-07 00:46:23 15.16667 15.16667 8: 175094 171080 2018-04-15 00:46:03 15.16667 15.16667 9: 175094 171080 2018-04-21 00:46:00 15.16667 15.16667 10: 175094 171080 2018-04-29 00:46:01 15.16667 15.16667 11: 563240 171080 2017-12-08 04:44:57 15.16667 NA 12: 563240 171078 2017-12-10 21:32:44 NaN NA 13: 563240 171078 2017-12-13 21:32:58 NaN NA 14: 563240 171080 2017-12-16 04:57:35 NaN NA 15: 563240 171078 2017-12-18 21:33:04 NaN NA .....