我有一个数据框,其中包含以下字段:userX,Time1,Time2,Time3。观察的数量是2000年。
我有一个函数,其中包含userX,Time1,Time2,Time3作为输入,并返回一个包含1个观察值和19个变量的数据框。
我想将该函数应用于第一个数据帧的所有观察,以创建具有2000个观测值和19个变量的新数据帧。
我想过使用lapply,但如果我理解正确,它只需要一个变量。
有人能指出我正确的方向吗?
这是我现在的代码:
# Make Data Frame for video actions between given times for user X
DataVideoActionT <- function (userX, Time1, Time2, Time3){
#Get data for user X
videoActionsX<-subset(videoLectureActions, username==userX)
#Time1 = before first attempt
videoActionsX_T1<-subset(videoActionsX, eventTimestamp<Time1)
#Time2 = before best attemp
videoActionsX_T2<-subset(videoActionsX, eventTimestamp<Time2 & eventTimestamp>Time1)
#Time3= before last attemp
videoActionsX_T3<-subset(videoActionsX, eventTimestamp<Time3 & eventTimestamp>Time1)
error1 = sum(videoActionsX_T1$type==" error ")
pause1 = sum(videoActionsX_T1$type==" pause ")
play1 = sum(videoActionsX_T1$type==" play ")
ratechange1 = sum(videoActionsX_T1$type==" ratechange ")
seeked1 = sum(videoActionsX_T1$type==" seeked ")
stalled1 = sum(videoActionsX_T1$type==" stalled ")
error2 = sum(videoActionsX_T2$type==" error ")
pause2 = sum(videoActionsX_T2$type==" pause ")
play2 = sum(videoActionsX_T2$type==" play ")
ratechange2 = sum(videoActionsX_T2$type==" ratechange ")
seeked2 = sum(videoActionsX_T2$type==" seeked ")
stalled2 = sum(videoActionsX_T2$type==" stalled ")
error3 = sum(videoActionsX_T3$type==" error ")
pause3 = sum(videoActionsX_T3$type==" pause ")
play3 = sum(videoActionsX_T3$type==" play ")
ratechange3 = sum(videoActionsX_T3$type==" ratechange ")
seeked3 = sum(videoActionsX_T3$type==" seeked ")
stalled3 = sum(videoActionsX_T3$type==" stalled ")
data<-data.frame(anon_ID=userX,
error1 = error1,
pause1 = pause1,
play1 = play1,
ratechange1 = ratechange1,
seeked1=seeked1,
stalled1=stalled1,
error2 = error2,
pause2 = pause2,
play2 = play2,
ratechange2 = ratechange2,
seeked2 =seeked2,
stalled2 = stalled2,
error3 = error3,
pause3 = pause3,
play3 = play3,
ratechange3 = ratechange3,
seeked3 = seeked3,
stalled3 = stalled3)
return(data)
}
videoLectureActions<-structure(list(username = c("exampleID1", "exampleID1", "exampleID1",
"exampleID2", "exampleID2", "exampleID2", "exampleID3", "exampleID3",
"exampleID3", "exampleID3"), currentTime = c("103.701247", "103.701247",
"107.543877", "107.543877", "116.456507", "116.456507", "119.987188",
"177.816693", "183.417124", "183.417124"), playbackRate = c("null",
"null", "null", "null", "null", "null", "null", "null", "null",
"null"), pause = c("true", "false", "true", "false", "true",
"false", "true", "false", "true", "false"), error = c("null",
"null", "null", "null", "null", "null", "null", "null", "null",
"null"), networkState = c("1", "1", "1", "1", "1", "1", "1",
"1", "1", "1"), readyState = c("4", "4", "4", "4", "4", "4",
"4", "4", "4", "4"), lectureID = c("exampleLectureID1", "exampleLectureID1",
"exampleLectureID1", "exampleLectureID1", "exampleLectureID1",
"exampleLectureID1", "exampleLectureID1", "exampleLectureID1",
"exampleLectureID1", "exampleLectureID1"), eventTimestamp = c("2013-03-04 18:51:49",
"2013-03-04 18:51:50", "2013-03-04 18:51:54", "2013-03-04 18:51:56",
"2013-03-04 18:52:05", "2013-03-04 18:52:07", "2013-03-04 18:52:11",
"2013-03-04 18:59:17", "2013-03-04 18:59:23", "2013-03-04 18:59:31"
), initTimestamp = c("2013-03-04 18:44:15", "2013-03-04 18:44:15",
"2013-03-04 18:44:15", "2013-03-04 18:44:15", "2013-03-04 18:44:15",
"2013-03-04 18:44:15", "2013-03-04 18:44:15", "2013-03-04 18:44:15",
"2013-03-04 18:44:15", "2013-03-04 18:44:15"), type = c(" pause ",
" play ", " pause ", " play ", " pause ", " play ", " pause ",
" play ", " pause ", " play "), prevTime = c("103.701247 ", "103.701247 ",
"107.543877 ", "107.543877 ", "116.456507 ", "116.456507 ", "119.987188 ",
"177.816693 ", "183.417124 ", "183.417124 ")), .Names = c("username",
"currentTime", "playbackRate", "pause", "error", "networkState",
"readyState", "lectureID", "eventTimestamp", "initTimestamp",
"type", "prevTime"), row.names = c(1L, 2L, 5L, 6L, 17L, 21L,
28L, 936L, 957L, 988L), class = "data.frame")
data<-structure(list(anon_ID = c("exampleID1", "exampleID2", "exampleID3" ), maxGrade = c(10, 5, 10), firstGrade = c(10, 5, 8), lastGrade = c(10, 5, 10), total_submissions = c(1L, 1L, 3L), Time1 = structure(c(1361993741, 1362356090, 1362357401), class = c("POSIXct", "POSIXt"), tzone = ""), TimeM = structure(c(1361993741, 1362356090, 1362492744), class = c("POSIXct", "POSIXt"), tzone = ""), TimeL = structure(c(1361993741, 1362356090, 1362492744), class = c("POSIXct", "POSIXt"), tzone = "")), .Names = c("anon_ID", "maxGrade", "firstGrade", "lastGrade", "total_submissions", "Time1", "TimeM", "TimeL"), row.names = c(NA, 3L), class = "data.frame")
library(foreach)
library(doMC)
registerDoMC(2) #change the 2 to your number of CPU cores
n <- nrow(data)
res <- list("vector", n)
foreach(i=1:n, .verbose=FALSE, .combine=rbind) %do% {
res[[i]] <- with(data, DataVideoActionT(anon_ID[i], Time1[i], TimeM[i], TimeL[i]))
}
test<-do.call(rbind, res)
我有3个问题。
如何让foreach不打印到控制台?这就是我运行它时的样子
foreach(i=1:n, .verbose=FALSE, .combine=rbind) %do% {
+ res[[i]] <- with(data, DataVideoActionT(anon_ID[i], Time1[i], TimeM[i], TimeL[i]))
+ }
anon_ID error1 pause1 play1 ratechange1 seeked1 stalled1
1 exampleID1 0 0 0 0 0 0
2 exampleID2 0 0 0 0 0 0
3 exampleID3 0 0 0 0 0 0
error2 pause2 play2 ratechange2 seeked2 stalled2 error3 pause3
1 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0
3 0 2 2 0 0 0 0 2
play3 ratechange3 seeked3 stalled3
1 0 0 0 0
2 0 0 0 0
3 2 0 0 0
我不希望在控制台中有数以千计的观察结果。
我想并行运行,我更改%do%%%dopar%代码停止工作。我没有用3个观察值和19个变量进行测试,而是得到了一个2x1字符矩阵
有更好的方法吗?如果是这样,你能解释为什么更好吗?
谢谢!
答案 0 :(得分:3)
mapply
是专为您的需求而设计的,因为它允许您组合每个案例的值,进行计算并返回更大的矩阵。
请注意,我只是将参数“user”,“time1”和“time2”作为一个小例子。
# This is a matrix of 3 columns
data <- replicate(3, 1:5)
# Your function takes some args, and returns extra info
your_function <- function(user, time1, time2) {
c(user, time1, time2, time1*time2, time1+time2, time1/time2)
}
# Here it comes together:
t(mapply(your_function, data[,1], data[,2], data[,3]))
# Output:
# [,1] [,2] [,3] [,4] [,5] [,6]
# [1,] 1 1 1 1 2 1
# [2,] 2 2 2 4 4 1
# [3,] 3 3 3 9 6 1
# [4,] 4 4 4 16 8 1
# [5,] 5 5 5 25 10 1
确认自己有效:)
作为奖励,我为您的输入和输出做了基准测试,基于100次评估所花费的中位时间 24毫秒。当然,这取决于your_function
实际做了什么。
使用的代码:
data <- cbind(1:2000, replicate(18, rnorm(2000)))
f <- function(user, time1, time2, time3) {
c(user, time1, time2, time3,
time1+time2, time2+time3, time1+time3, time1+time2+time3,
time1+time2, time2+time3, time1+time3, time1+time2+time3,
time1+time2, time2+time3, time1+time3, time1+time2+time3,
time1+time2, time2+time3, time1+time3)
}
result <- t(mapply(f, data[,1], data[,2], data[,3], data[,4]))
result # dim(result) 2000 by 19
更改
data<-data.frame(anon_ID=userX,
...
...
...)
来自您的函数:
data<-c(error1, pause1, play1, ratechange1, seeked1, stalled1, error2,
pause2, play2, ratechange2, seeked2, stalled2, error3, pause3,
play3, ratechange3, seeked3, stalled3)
然后执行以下操作:
test<-t(mapply(DataVideoActionT, userX=data$anon_ID,
Time1=data$Time1, Time2=data$TimeM, Time3=data$TimeL))
colnames(test) <- c("error1", "pause1", "play1", "ratechange1", "seeked1",
"stalled1", "error2", "pause2", "play2", "ratechange2",
"seeked2", "stalled2", "error3", "pause3", "play3",
"ratechange3", "seeked3", "stalled3")
test
答案 1 :(得分:0)
您可以使用APPLY 下面是代码示例,可以帮助您!
dane_evaluations<-data.frame(dane_evaluations,time_spent=apply(dane_evaluations[,c('documentevaluation_start','documentevaluation_end')],1,function(x) time_spent(x[1], x[2])))
正在应用的函数的名称:time_spent
该函数有两个参数:documentevaluation_start
,documentevaluation_end
是数据框dane_evaluations
的列
应用数据框的结果由列time_spent
扩展,并为每行的函数计算适当的值。
示例数据:
subseting data_frame:
head(dane_evaluations[,c('documentevaluation_start','documentevaluation_end')])
documentevaluation_start documentevaluation_end
1 2013-02-07 13:53:57.073760 2013-02-07 14:10:29.445997
2 2013-02-07 14:28:29.463068 2013-02-07 14:34:56.867517
宣传功能:
time_spent <- function(from,to) {
op <- options(digits.secs = 3)
as.numeric((strptime(to, "%Y-%m-%d %H:%M:%OS")-strptime(from, "%Y-%m-%d %H:%M:%OS")),units="secs")
}
应用函数后的外观:
head(dane_evaluations[,c('documentevaluation_start','documentevaluation_end','time_spent')])
documentevaluation_start documentevaluation_end time_spent
1 2013-02-07 13:53:57.073760 2013-02-07 14:10:29.445997 992.3722
2 2013-02-07 14:28:29.463068 2013-02-07 14:34:56.867517 387.4044
答案 2 :(得分:0)
您可以使用do.call
和Vectorize
:
t(do.call(Vectorize(f), DF[,c("userX", "Time1", "Time2", "Time3")]))
其中f
是您的功能,DF
是您的数据帧。您必须进行转置才能保留列数。