在 R 中,根据两个存在时间先后关系的数据框中的分类数据计算二维频率表

时间:2014-09-10 12:50:56

标签: r probability

我有两个数据框:一个包含设备运行期间的故障信息,另一个包含维护操作。有没有办法确定在先前发生过某种故障的条件下,进行某项维护操作的概率?

# Sample problem
# Determine frequency table from two data sources to determine
# if events are sequential

# Generate N random date-times in a range.
#
# Args:
#   N:  number of date-times to draw.
#   st: start of the range, in a form as.Date() accepts.
#   et: end of the range, in a form as.Date() accepts.
#
# Returns:
#   A POSIXct vector of length N, sorted ascending, whose offsets from st
#   are drawn with runif() between 0 and the width of the range.
randDates <- function(N, st="2014/01/01", et="2014/06/30") {
  st <- as.POSIXct(as.Date(st))
  et <- as.POSIXct(as.Date(et))
  # Width of the range in seconds. Spell out `units = "secs"` instead of
  # relying on partial matching of the argument name and its value.
  dt <- as.numeric(difftime(et, st, units = "secs"))
  ev <- sort(runif(N, 0, dt))
  # Return the value visibly; ending on an assignment would return it
  # invisibly, so calling randDates() at the prompt would print nothing.
  st + ev
}

# Generate two data sets
# oper_df contains operational data that has events:
#   the event time, the fault (event) name, and the serial number
# maint_df contains maintenance data that has actions:
#   the maintenance time, the maintenance action, and the serial number

set.seed(1234)
nSerial <- 10
nEventTypes <- 10
nOpersPerSerial <- 100
nOps <- nOpersPerSerial * nSerial
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned; paste0() is the direct equivalent of paste(..., sep = "").
oper_df <- data.frame(
  time = randDates(nOps),
  fault = paste0("Event_", sample(1:nEventTypes, nOps, replace = TRUE)),
  serialNo = sample(1:nSerial, nOps, replace = TRUE)
)

nMaintenancePerSerial <- 30
nMaintenanceActions <- 20
nMaint <- nMaintenancePerSerial * nSerial
# Size every column by nMaint (maintenance actions), not nOps (operational
# events); otherwise nMaint is computed but never used and the maintenance
# data gets as many rows as the operational data.
maint_df <- data.frame(
  time = randDates(nMaint),
  action = paste0(
    "Action_",
    sample(1:nMaintenanceActions, nMaint, replace = TRUE)
  ),
  serialNo = sample(1:nSerial, nMaint, replace = TRUE)
)

# Cross-tabulate fault names and action names against serial number

oper_table <- table(fault = oper_df$fault, serialNo = oper_df$serialNo)
maint_table <- table(action = maint_df$action, serialNo = maint_df$serialNo)

# Goal: the frequency of maintenance actions that follow earlier operational
# faults on the same serial number.
#
# First layout: stack both sources into one frame with separate `fault` and
# `action` columns. Each row fills only the column that belongs to its
# source and leaves the other one NA.

temp1_df <- data.frame(
  serialNo = oper_df$serialNo,
  time = oper_df$time,
  fault = oper_df$fault,
  action = NA,
  source = "OPS"
)
temp2_df <- data.frame(
  serialNo = maint_df$serialNo,
  time = maint_df$time,
  action = maint_df$action,
  fault = NA,
  source = "MAINT"
)
opsMaint_df <- rbind(temp1_df, temp2_df)
# Sort by serial number, then chronologically within each serial number
opsMaint_df <- opsMaint_df[order(opsMaint_df$serialNo, opsMaint_df$time), ]
head(opsMaint_df)

# Second layout: stack both sources into one frame with a single `info`
# column holding either the fault name or the action name; `source` tells
# the two kinds of row apart.
temp1_df <- data.frame(
  serialNo = oper_df$serialNo,
  time = oper_df$time,
  info = oper_df$fault,
  source = "OPS"
)
temp2_df <- data.frame(
  serialNo = maint_df$serialNo,
  time = maint_df$time,
  info = maint_df$action,
  source = "MAINT"
)
opsMaint_df2 <- rbind(temp1_df, temp2_df)
# Sort by serial number, then chronologically within each serial number
opsMaint_df2 <- opsMaint_df2[order(opsMaint_df2$serialNo, opsMaint_df2$time), ]
# Was head(opsMaint2_df), which names an object that is never created.
head(opsMaint_df2)

# Cross-tabulating the first layout directly does not answer the question:
# every row of opsMaint_df has NA in either `action` or `fault`, and table()
# drops rows containing NA by default, so no (action, fault) pair is counted.
with(opsMaint_df, table(action, fault))

现在,我想要一个表:列标题是运行事件(故障),行标题是维护操作,行/列交叉处是在同一序列号上,该事件在该操作之前发生的次数。

查看前10个项目

     serialNo                time      info source
1001        1 2013-12-31 22:53:07  Action_4  Maint
5           1 2014-01-01 04:16:25   Event_8    Ops
8           1 2014-01-01 09:10:46   Event_2    Ops
1005        1 2014-01-01 23:06:27 Action_10  Maint
1009        1 2014-01-02 08:48:12  Action_1  Maint
11          1 2014-01-02 12:01:18   Event_5    Ops
1011        1 2014-01-02 15:10:40  Action_3  Maint
1031        1 2014-01-05 11:06:17 Action_17  Maint
24          1 2014-01-05 11:43:07  Event_10    Ops
1041        1 2014-01-06 15:57:40 Action_17  Maint

计算操作之前发生的事件

          event
action    Event_2 Event_5 Event_8 Event_10
Action_1        1       0       1        0
Action_3        1       1       1        0
Action_4        0       0       0        0
Action_10       1       0       1        0
Action_17       2       2       2        1

这是我用循环解决该问题的尝试。它似乎产生了正确的结果,但可能耗时太长。

# For each group of rows that corresponds to a serial number
#   For each row
#     if the source is MAINT then count the prior faults
#     total the number of faults in an (Action, Fault) array
#
# Args:
#   df: data frame with columns serialNo, info and source. Rows whose source
#       is "OPS" carry a fault name in info; rows whose source is "MAINT"
#       carry an action name. Rows are processed in the order given, so df
#       should already be sorted by time within each serial number.
#
# Returns:
#   A data frame with one row per action and one column per fault holding
#   the counts, plus a final "colSums" row and a final "rowSums" column.
#
# Counting rule: for every MAINT row, the earlier rows of the same serial
# number are scanned from the first one forward. Each OPS row adds one to
# the (action, fault) cell. The scan stops at the first earlier MAINT row
# carrying the same action.
# NOTE(review): because the scan runs forward, a repeated action only counts
# faults that precede its FIRST occurrence. Confirm this is the intended
# rule, rather than "all prior faults" or "faults since the last occurrence".
freqCount <- function(df) {
  # Work on plain character vectors so the result does not depend on
  # whether the columns arrive as factors or as strings.
  info <- as.character(df$info)
  src <- as.character(df$source)
  is_ops <- src == "OPS"
  is_maint <- src == "MAINT"

  uFault <- sort(unique(info[is_ops]))
  uAction <- sort(unique(info[is_maint]))

  # Accumulate in a numeric matrix; updating single cells of a data frame
  # inside the loops is far slower.
  counts <- matrix(0, nrow = length(uAction), ncol = length(uFault),
                   dimnames = list(uAction, uFault))

  # Row positions of df grouped by serial number, in original row order.
  rows_by_serial <- split(seq_len(nrow(df)), df$serialNo)

  for (serialRows in rows_by_serial) {
    # seq_along()/seq_len() instead of seq(): seq(x) on a length-one x
    # expands to 1:x, which walked past the end of serialRows and failed
    # with an NA condition when a serial number had a single row.
    for (jRow in seq_along(serialRows)) {
      j <- serialRows[jRow]
      if (!is_maint[j]) {
        next
      }
      actn <- info[j]
      for (kRow in seq_len(jRow - 1L)) {
        k <- serialRows[kRow]
        if (is_ops[k]) {
          counts[actn, info[k]] <- counts[actn, info[k]] + 1
        } else if (actn == info[k]) {
          break
        }
      }
    }
  }

  freq <- as.data.frame(counts, stringsAsFactors = FALSE)
  freq <- rbind(freq, colSums(freq))
  freq <- cbind(freq, rowSums(freq))
  names(freq)[ncol(freq)] <- "rowSums"
  row.names(freq)[nrow(freq)] <- "colSums"
  freq
}

# Run on the first ten rows only, to match the worked example
#
freqCount(opsMaint_df2[seq_len(10), ])
# Run on the complete data set
#
freqCount(opsMaint_df2)

这是使用 zoo 包的 merge/aggregate 的尝试。但为什么 merge 生成的对象在前 10 行中出现了 NA?

# zoo merge / aggregate frequency counting
# library() stops with an error when zoo is not installed; require() would
# only return FALSE and let the script fail later at the first zoo() call.
library(zoo)
# NOTE(review): each series mixes a numeric column (serialNo) with a text
# column; zoo presumably coerces both to character - confirm.
maint_z <- zoo(maint_df[, c("serialNo", "action")], order.by = maint_df$time)
oper_z <- zoo(oper_df[, c("serialNo", "fault")], order.by = oper_df$time)
# merge() on zoo objects keeps the union of both time indexes by default and
# fills the columns of the series that has no observation at a given time
# with NA. The two sets of timestamps are independent random draws, so
# nearly every merged row has NA on one side.
zdf <- merge(oper_z, maint_z)
head(zdf)

如何按序列号(serialNo)分组,根据先前发生的故障,按维护操作对 zdf 进行聚合?

0 个答案:

没有答案