我正在分析双极化雷达数据,我希望将Marshall Palmer关系的结果添加为my data中的ID级变量。
我可以找到没有CRAN功能,但是另一个R用户有脚本,其中他将关系应用为数据中预期值的估计值:
# From Troy W. (thanks!)
# A few small changes by hack-r
## Someone better in R than me could probably clean up/refactor the code a bit.
library(dplyr)
library(data.table)
test <- fread('../input/test.csv')
mpalmer <- function(ref, minutes_past) {
# order reflectivity values and minutes_past
sort_min_index = order(minutes_past)
minutes_past <- minutes_past[sort_min_index]
ref <- ref[sort_min_index]
# calculate the length of time for which each reflectivity value is valid
valid_time <- rep(0, length(minutes_past))
valid_time[1] <- minutes_past[1]
if (length(valid_time) > 1) {
for (i in seq(2, length(minutes_past))) {
valid_time[i] <- minutes_past[i] - minutes_past[i-1]
}
valid_time[length(valid_time)] = valid_time[length(valid_time)] + 60 - sum(valid_time)
} else {
# if only 1 observation, make it valid for the entire hour
valid_time <- 60
}
valid_time = valid_time / 60
# calculate hourly rain rates using marshall-palmer weighted by valid times
sum <- 0
for (i in seq(length(ref))) {
if (!is.na(ref[i])) {
mmperhr <- ((10^(ref[i]/10))/200) ^ 0.625
sum <- sum + mmperhr * valid_time[i]
}
}
return(sum)
}
results <- test %>% group_by(Id) %>% summarize(Expected=sum)
write.csv(results, file='sample_solution.csv', row.names=FALSE)
除了大数据速度极慢之外,上面代码的问题在于它没有在原始数据中创建一列结果,这将允许它在创建的ETL管道中生成。 ID级别的这种关系作为数据集中的1个预测变量。
我尝试重写这个函数:
mpalmer <- function(ref, minutes_past) {
# Credit to Troy for this
# edits by Jason Miller, hack-r.com
# order reflectivity values and minutes_past
sort_min_index = order(minutes_past)
minutes_past <- minutes_past[sort_min_index]
ref <- ref[sort_min_index]
# calculate the length of time for which each reflectivity value is valid
valid_time <- rep(0, length(minutes_past))
valid_time[1] <- minutes_past[1]
if (length(valid_time) > 1) {
for (i in seq(2, length(minutes_past))) {
valid_time[i] <- minutes_past[i] - minutes_past[i-1]
}
valid_time[length(valid_time)] = valid_time[length(valid_time)] + 60 - sum(valid_time)
} else {
# if only 1 observation, make it valid for the entire hour
valid_time <- 60
}
valid_time = valid_time / 60
# calculate hourly rain rates using marshall-palmer weighted by valid times
sum <- 0
for (i in seq(length(ref))) {
if (!is.na(ref[i])) {
mmperhr <- ((10^(ref[i]/10))/200) ^ 0.625
sum <- sum + mmperhr * valid_time[i]
}
}
return(sum)
}
然后像这样应用它:
train.samp$mp <- aggregate(train.samp$Ref, by=list(train.samp$Id), FUN = mpalmer, minutes_past = train.samp$minutes_past)
我认为这种方法很有用,但是在运行了很长时间后,它返回了一个错误:
Error in `$<-.data.frame`(`*tmp*`, "mp", value = list(Group.1 = c(10L, :
replacement has 9765 rows, data has 10000
我已经在不同的数据样本上尝试了它,并且错误消息始终采用该格式,但具体数字可能会发生变化。数据集中没有丢失的数据。
知道如何修复此功能(和/或加快速度)?
更新:我已经使用了for循环,但它 SO 慢......
答案 0 :(得分:0)
这就是我现在要做的事情,但我仍然愿意接受其他解决方案。
基本上,我回到原来的功能,然后将过大的数据集拆分成可管理的块,并在每个块上运行循环:
train.samp <- train.samp[order(train.samp$Id),]
train.samp1 <- train.samp1[order(train.samp1$Id),]
train.samp.id <- unique(train.samp$Id)
train.samp.id.1 <- train.samp.id[1:25000]
train.samp.id.2 <- train.samp.id[25001:50000]
train.samp.id.3 <- train.samp.id[50001:75000]
train.samp.id.4 <- train.samp.id[75001:100000]
train.samp.id.6 <- train.samp.id[100001:125000]
train.samp.id.5 <- train.samp.id[125001:150000]
train.samp.id.7 <- train.samp.id[150001:175000]
train.samp.id.8 <- train.samp.id[175001:200000]
train.samp.id.9 <- train.samp.id[200001:length(train.samp.id)]
train.samp.1 <- train.samp[train.samp$Id %in% train.samp.id.1,]
train.samp.2 <- train.samp[train.samp$Id %in% train.samp.id.2,]
train.samp.3 <- train.samp[train.samp$Id %in% train.samp.id.3,]
train.samp.4 <- train.samp[train.samp$Id %in% train.samp.id.4,]
train.samp.5 <- train.samp[train.samp$Id %in% train.samp.id.5,]
train.samp.6 <- train.samp[train.samp$Id %in% train.samp.id.6,]
train.samp.7 <- train.samp[train.samp$Id %in% train.samp.id.7,]
train.samp.8 <- train.samp[train.samp$Id %in% train.samp.id.8,]
train.samp.9 <- train.samp[train.samp$Id %in% train.samp.id.9,]
system.time(
for(i in unique(train.samp.1$Id)){
train.samp.1$mp[train.samp.1$Id == i] <- mpalmer(train.samp.1$Ref[train.samp.1$Id == i], minutes_past = train.samp.1$minutes_past[train.samp.1$Id == i])
} )
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.2])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.3])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.4])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.5])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.6])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.7])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.8])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.9])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
system.time(
for(i in unique(train.samp1$Id)){
train.samp1$mp[train.samp1$Id == i] <- mpalmer(train.samp1$Ref[train.samp1$Id == i], minutes_past = train.samp1$minutes_past[train.samp1$Id == i])
}
此处未显示此功能,但我将在评论中利用@Gregor的建议。