我有一个复杂的问题。我的数据中有需要插补(impute)的缺失值,但插补必须在数据框的各个分组内分别进行(因为各组数据是在不同的时间收集的),而且我不想用均值来插补。如果某条观测没有记录该值,则需要使用先前记录的值。
以下是一些示例数据:
# Example data: 10 classes, each split into an "inside" and an "outside" group
# of 50 samples. Vectors shorter than 1000 are recycled by data.frame().
sample_data <- data.frame(
  Class = rep(x = letters[1:10], each = 100),
  group = rep(x = c("inside", "outside"), each = 50),
  Sample_number = seq(from = 1, to = 50, by = 1),
  x1 = rnorm(1000, mean = 0, sd = 0.5),
  x2 = 0
)

# Grouping key: one factor level per Class/group combination (e.g. "a_inside").
sample_data$Class_group <- as.factor(
  paste0(sample_data$Class, "_", sample_data$group)
)

# Simulate missing observations: every negative draw is turned into NA.
sample_data$x1 <- replace(sample_data$x1, sample_data$x1 < 0, NA)
这是我徒劳的尝试:
library(data.table)
# Attempted approach: walk every Class_group level, then every row position of
# that group's subset, copying x1 into x2 when a value is present and otherwise
# lagging x2 by one position.
# NOTE(review): nothing inside the loops assigns back to sample_data, so any
# values filled in variety_subset are discarded on the next outer iteration.
varieties=levels(sample_data$Class_group)
for (i in 1:length(levels(sample_data$Class_group))){
# Rows belonging to the i-th Class_group level.
variety_subset <- subset(sample_data, sample_data$Class_group==varieties[i])
for (ii in 1:nrow(variety_subset)){
# Row(s) of the current group whose Sample_number equals the loop counter.
temp_df <- subset(variety_subset, variety_subset$Sample_number==ii)
# NOTE(review): is.number() is not provided by base R; unless another package
# supplies it, this call errors. Presumably the intent was "x1 is recorded",
# i.e. !is.na(temp_df$x1) -- TODO confirm.
if(is.number(temp_df$x1)){
# Overwrites the whole x2 column of the subset with x1, not just this row.
variety_subset$x2 <- variety_subset$x1
} else {
# NOTE(review): `:=` and shift() are data.table syntax; variety_subset comes
# from subset() on sample_data, so this only works if sample_data has been
# converted to a data.table beforehand -- TODO confirm.
variety_subset[ , x2 := shift(x2, n=1L, type="lag")]
}}}
我不确定如何继续。我认为rolling join是一个很好的选择,它可以选择最接近sample_number的x1值在x2中进行插值,但是我没有两个数据框,而我只有一个。
我确实需要像 this question 中那样,用 dplyr 的 mutate
逐行执行操作,但由于数据是分组的,用于插补的值必须来自各组自身。这就是我尝试 for
循环的原因。
必须有一种更优雅的方式来做到这一点!?
注意:我可以在MS Excel中做到这一点(尽管需要大量的反复核对),而且很容易。我在B2单元格中放了一个公式,例如=IF(ISNUMBER(A2), A2, B1)
,我的真实数据集中的第一个值始终有记录,因此这种方法可行;但是我确实需要在R中按我的分组变量 Class_group 以可重复的方式完成这一操作。
以下是我希望它工作的方式……
Class,group,Sample_number,x1,x2,Class_group
a inside 1 NA 0 a_inside
a inside 2 NA 0 a_inside
a inside 3 NA 0 a_inside
a inside 4 NA 0 a_inside
a inside 5 0.57 0 a_inside
a inside 6 NA 0 a_inside
a inside 7 NA 0 a_inside
a inside 8 NA 0 a_inside
a inside 9 0.43 0 a_inside
a inside 10 0.19 0 a_inside
a inside 11 0.09 0 a_inside
a inside 12 0.13 0 a_inside
a inside 13 0.68 0 a_inside
a inside 14 0.50 0 a_inside
a inside 15 0.57 0 a_inside
以及之后...
Class,group,Sample_number,x1,x2,Class_group
a inside 1 NA 0.57 a_inside
a inside 2 NA 0.57 a_inside
a inside 3 NA 0.57 a_inside
a inside 4 NA 0.57 a_inside
a inside 5 0.57 0.57 a_inside
a inside 6 NA 0.57 a_inside
a inside 7 NA 0.57 a_inside
a inside 8 NA 0.43 a_inside
a inside 9 0.43 0.43 a_inside
a inside 10 0.19 0.19 a_inside
a inside 11 0.09 0.09 a_inside
a inside 12 0.13 0.13 a_inside
a inside 13 0.68 0.68 a_inside
a inside 14 0.50 0.50 a_inside
a inside 15 0.57 0.57 a_inside
答案 0 :(得分:3)
您可能需要?zoo::na.locf
。
library(zoo)
library(data.table)

# Fix the RNG seed so the example and its printed output are reproducible.
set.seed(1)

# Example data: 10 classes, each split into an "inside" and an "outside" group
# of 50 samples. Vectors shorter than 1000 are recycled by data.frame().
sample_data <- data.frame(
  Class = rep(x = letters[1:10], each = 100),
  group = rep(x = c("inside", "outside"), each = 50),
  Sample_number = seq(1, 50, by = 1),
  x1 = rnorm(1000, mean = 0, sd = 0.5),
  x2 = 0
)
sample_data$Class_group <- paste0(sample_data$Class, "_", sample_data$group)
sample_data$Class_group <- as.factor(sample_data$Class_group)

# Simulate missing observations: every negative draw is turned into NA.
sample_data$x1[sample_data$x1 < 0] <- NA

# Convert to a data.table by reference so `:=` and `by` can be used below.
setDT(sample_data)

# Within each Class_group, fill x2 from x1:
#   1. inner call: carry the last observation forward (leading NAs stay NA);
#   2. outer call: carry the next observation backward, which only affects the
#      leading NAs left over from step 1.
# A group whose x1 is entirely NA keeps NA in x2.
sample_data[
  , x2 := na.locf(
    na.locf(x1, na.rm = FALSE),
    na.rm = FALSE,
    fromLast = TRUE
  ),
  by = "Class_group"
]
> sample_data[Class_group == 'a_inside'][1:10]
Class group Sample_number x1 x2 Class_group
1: a inside 1 NA 0.09182166 a_inside
2: a inside 2 0.09182166 0.09182166 a_inside
3: a inside 3 NA 0.09182166 a_inside
4: a inside 4 0.79764040 0.79764040 a_inside
5: a inside 5 0.16475389 0.16475389 a_inside
6: a inside 6 NA 0.16475389 a_inside
7: a inside 7 0.24371453 0.24371453 a_inside
8: a inside 8 0.36916235 0.36916235 a_inside
9: a inside 9 0.28789068 0.28789068 a_inside
10: a inside 10 NA 0.28789068 a_inside
顺便说一句,不需要创建中间变量Class_group
。您可以使用by = c('Class', 'group')
获得相同的结果。
答案 1 :(得分:2)
使用 roll="nearest" 的滚动连接(rolling join),
可以使存在连续 NA 的情况下的结果更接近提问者(OP)的要求:
# Rolling self-join: the rows that have a recorded x1 act as the lookup table,
# and every row of sample_data is matched to one of them -- exactly on
# Class_group, and by nearest Sample_number (roll = "nearest").
# The matched x1 is written into x2 by reference; the trailing [] prints the
# updated table.
sample_data[
  ,
  x2 := sample_data[!is.na(x1)][
    sample_data,
    j = x1,
    on = .(Class_group, Sample_number),
    roll = "nearest"
  ]
][]
输出:
Class group Sample_number x1 x2 Class_group
1: a inside 1 NA 0.57 a_inside
2: a inside 2 NA 0.57 a_inside
3: a inside 3 NA 0.57 a_inside
4: a inside 4 NA 0.57 a_inside
5: a inside 5 0.57 0.57 a_inside
6: a inside 6 NA 0.57 a_inside
7: a inside 7 NA 0.57 a_inside
8: a inside 8 NA 0.43 a_inside
9: a inside 9 0.43 0.43 a_inside
10: a inside 10 0.19 0.19 a_inside
11: a inside 11 0.09 0.09 a_inside
12: a inside 12 0.13 0.13 a_inside
13: a inside 13 0.68 0.68 a_inside
14: a inside 14 0.50 0.50 a_inside
15: a inside 15 0.57 0.57 a_inside
样本数据:
library(data.table)

# Small example table, parsed from whitespace-separated text by fread().
sample_data <- fread(input = "Class group Sample_number x1 x2 Class_group
a inside 1 NA 0 a_inside
a inside 2 NA 0 a_inside
a inside 3 NA 0 a_inside
a inside 4 NA 0 a_inside
a inside 5 0.57 0 a_inside
a inside 6 NA 0 a_inside
a inside 7 NA 0 a_inside
a inside 8 NA 0 a_inside
a inside 9 0.43 0 a_inside
a inside 10 0.19 0 a_inside
a inside 11 0.09 0 a_inside
a inside 12 0.13 0 a_inside
a inside 13 0.68 0 a_inside
a inside 14 0.50 0 a_inside
a inside 15 0.57 0 a_inside")

# Store x2 as double, the same type as the x1 values that will be copied in.
sample_data[, x2 := as.double(x2)]