我有上面这张表,想填充 TransactionID 列中的缺失值。填充规则如下(原文的规则说明在此处缺失;从下面的答案来看,是用同一用户中时间上最接近的、已有交易ID的事件来填充):
答案 0 :(得分:0)
可能有一个更好的解决方案,但我用data.table编写了这个解决方案:
library(data.table)

# Example data. In practice load it with fread(), read.csv(), read.xlsx(), etc.
raw <- data.table(
  Event = paste0("e", 1:10),
  TransactionID = c("t1", NA, NA, "t4", NA, "t5", "t6", NA, NA, "t8"),
  UserId = c(rep("kenn1", 4), rep("kenn2", 6)),
  EventTime = as.POSIXct(
    c(
      "2017-05-20 9:00", "2017-05-20 9:30", "2017-05-20 9:45",
      "2017-05-20 9:50", "2017-05-20 10:01", "2017-05-20 10:02",
      "2017-05-20 10:03", "2017-05-20 10:04", "2017-05-20 10:05",
      "2017-05-20 10:06"
    ),
    format = "%Y-%m-%d %H:%M"
  )
)

# Fill each missing TransactionID with the ID of the closest (in time) event
# of the same user that has a known ID.
#
# For every row we locate, within the same user, the nearest row above and the
# nearest row below that carries a TransactionID. The lookup works on row
# positions, so it relies on the rows of each user being in EventTime order
# (as in the example data).
#
# nafill() only handles numeric vectors, so the carry-forward / carry-backward
# is applied to the row positions of the known IDs, and those positions are
# then used to pick both the ID and its timestamp. This needs no merge, which
# keeps the original row order and stays correct even if an ID is reused by
# several users.
raw[, c("Above", "Below", "AboveDiff", "BelowDiff") := {
  known_pos <- replace(seq_len(.N), is.na(TransactionID), NA_integer_)
  above_pos <- nafill(known_pos, type = "locf")
  below_pos <- nafill(known_pos, type = "nocb")
  list(
    TransactionID[above_pos],
    TransactionID[below_pos],
    # Explicit units so both distances are always comparable numbers.
    as.numeric(difftime(EventTime, EventTime[above_pos], units = "secs")),
    as.numeric(difftime(EventTime[below_pos], EventTime, units = "secs"))
  )
}, by = UserId]

# No known ID above (start of a user's events): take the one below.
raw[is.na(TransactionID) & is.na(AboveDiff), TransactionID := Below]
# No known ID below (end of a user's events): take the one above.
raw[is.na(TransactionID) & is.na(BelowDiff), TransactionID := Above]
# Known IDs on both sides: take the closer one; ties go to the one above.
raw[
  is.na(TransactionID),
  TransactionID := fifelse(AboveDiff <= BelowDiff, Above, Below)
]

# Drop the helper columns so only the original four columns remain.
raw[, c("Above", "Below", "AboveDiff", "BelowDiff") := NULL]
答案 1 :(得分:0)
使用 data.table 的另一种解决方案。
library(data.table)

# Example data. In practice load it with fread(), read.csv(), read.xlsx(), etc.
raw <- data.table(
  Event = paste0("e", 1:10),
  TransactionID = c("t1", NA, NA, "t4", NA, "t5", "t6", NA, NA, "t8"),
  UserId = c(rep("kenn1", 4), rep("kenn2", 6)),
  EventTime = as.POSIXct(
    c(
      "2017-05-20 9:00", "2017-05-20 9:30", "2017-05-20 9:45",
      "2017-05-20 9:50", "2017-05-20 10:01", "2017-05-20 10:02",
      "2017-05-20 10:03", "2017-05-20 10:04", "2017-05-20 10:05",
      "2017-05-20 10:06"
    ),
    format = "%Y-%m-%d %H:%M"
  )
)

# Candidate rows: the events that already have a TransactionID.
raw_notNA <- raw[!is.na(TransactionID)]

# Pair every event with every candidate of the same user. Each original row
# is repeated once per candidate; columns from `raw` get the suffix ".x" and
# columns from the candidates get ".y".
merged <- merge(
  raw, raw_notNA,
  all.x = TRUE, by = "UserId", allow.cartesian = TRUE
)

# Absolute time distance between an event and each of its candidates, in
# seconds so the values are plain comparable numbers.
merged[
  ,
  DiffTime := abs(as.numeric(difftime(EventTime.x, EventTime.y, units = "secs")))
]

# Collapse back to one row per event (Event is assumed to be unique).
# - An event that already has an ID keeps it.
# - Otherwise take the ID of the closest candidate. which.min() returns a
#   single position even when two candidates are equally close (the first one
#   wins), and the trailing [1L] turns "no candidate at all" into NA instead
#   of a zero-length value.
output <- merged[
  ,
  .(
    TransactionID = if (is.na(TransactionID.x[1L])) {
      TransactionID.y[which.min(DiffTime)][1L]
    } else {
      TransactionID.x[1L]
    },
    UserId = UserId[1L],
    EventTime = EventTime.x[1L]
  ),
  by = .(Event = Event.x)
]

print(output)
受到这个问题答案的启发(你的问题不重复,只是类似)