如何计算R中列中不规则选择的数据的平均值

时间:2018-07-10 14:27:31

标签: r

我有一个非常大的数据文件(16列,超过600,000行),需要计算每个空白行段之后的前两个值的平均值,以及每个空白行段之前的最后两个值的平均值(空白行是仪器内部校准造成的缺失值)——数据框中基本上包含多个由空白行分隔的独立表。但是,数据有点不规则:在绝大多数情况下,是21行数据后接4个空白行(即计算第1和第2个值的平均值,跳过17行,再计算第20和第21个值的平均值……4个空白行……然后是第26和第27个值的平均值,跳过17行,再是第45和第46个值的平均值……4个空白行……依此类推),但空白行的数量偶尔会有所不同。因此我需要一种以空白行位置为依据的方法,因为它们是关键:紧邻空白行的就是各列每一段中的前两个值和最后两个值(示例中只针对Conc列求平均值)。但我一直未能成功调整脚本。我还无法发布图片,因此附上了相关数据示例的链接。有什么想法吗?

enter image description here

2 个答案:

答案 0 :(得分:0)

这可以完成一列的工作,您可以使用lapply将其应用于所有列

首先创建结构相似的数据框:

# Build a reproducible example: 100 rows of random concentrations in which
# four consecutive rows are blanked out at regular intervals.
set.seed(1)
df <- data.frame(
  start_time = seq_len(100L),
  end_time = seq_len(100L) + 1L,
  conc370 = runif(100)
)
# First row of every blank run (rows 7, 31, 55 and 79).
m <- seq(from = 7, to = 95, by = 24)
# Blank out each of those rows together with the three rows that follow it.
df$conc370[as.vector(outer(m, 0:3, `+`))] <- NA

df现在应该具有与您的数据类似的结构,因此我们可以开始:

# For every contiguous block of non-missing values in df$conc370, compute the
# mean of its first two values and the mean of its last two values.
# The first-pair mean is stored in df$mean on the block's first row; the
# last-pair mean is stored on the block's second-to-last row. All other rows
# of df$mean stay NA.

# detect empty lines:
na_idx <- is.na(df$conc370)
# Pad the mask with TRUE on both sides so that a block touching the very first
# or very last row still produces a start/end transition.
edges <- diff(c(TRUE, na_idx, TRUE))
# the first line of each block of numbers:
start_idx <- which(edges == -1)
# the last line of each block of numbers:
end_idx <- which(edges == 1) - 1
# A block needs at least two values to form a pair; shorter blocks are skipped
# so that no mean spans a gap or indexes row 0.
has_pair <- end_idx > start_idx
# get the positions where we want to compute the average (the pair is always
# pos and pos + 1)
pos <- sort(unique(c(start_idx[has_pair], end_idx[has_pair] - 1)))
# compute the average at the positions; vapply guarantees a numeric vector
# even when pos is empty
mean_vals <- vapply(
  pos,
  function(idx, x) mean(x[c(idx, idx + 1)]),
  numeric(1),
  x = df$conc370
)
# add to df
df$mean <- NA_real_
df$mean[pos] <- mean_vals

答案 1 :(得分:0)

我的解决方案不是特别优雅,但取得了部分成功:它的前提是,除了已经考虑到的空白行之外,数据中没有其他缺失值(空白单元格)。不过,细节部分还需要再花些功夫完善:

# Flag rows of `data` in which any concentration channel is missing and, for
# every contiguous block of complete rows, store per-channel pair means:
#   * on the block's first row: mean of the first two rows of the block
#   * on the block's last row:  mean of the last two rows of the block
# Columns added to `data`, in this order: index, missing (1 = complete row,
# 0 = at least one channel is NA), then one "<channel>.mean" column per
# channel (NA everywhere except on the block edge rows).
#
# Block edges are found with a vectorised scan, so blocks that touch the first
# or the last row are handled too, and data that ends with blank rows no
# longer runs past the end of the table.

conc_cols <- c(
  "Conc370", "Conc470", "Conc520", "Conc590",
  "Conc660", "Conc880", "Conc950"
)
n_rows <- nrow(data)

data$index <- seq_len(n_rows)
head(data)

# A row counts as a gap as soon as one channel is NA.
is_gap <- Reduce(`|`, lapply(data[conc_cols], is.na))
data$missing <- as.numeric(!is_gap)

# Pad with TRUE on both sides so the first and last block also get a
# gap -> data and data -> gap transition.
edges <- diff(c(TRUE, is_gap, TRUE))
block_first <- which(edges == -1)
block_last <- which(edges == 1) - 1

# A pair needs a neighbouring row inside the table.
first_rows <- block_first[block_first < n_rows]
last_rows <- block_last[block_last > 1]

# Mean of x[r] and x[r + offset] for every r in rows.
pair_mean <- function(x, rows, offset) {
  vapply(rows, function(r) mean(x[c(r, r + offset)]), numeric(1))
}

for (col in conc_cols) {
  values <- data[[col]]
  col_mean <- rep(NA_real_, n_rows)
  col_mean[first_rows] <- pair_mean(values, first_rows, 1)
  # Written second on purpose: for a one-row block the last-pair mean wins,
  # matching the order in which the values used to be written.
  col_mean[last_rows] <- pair_mean(values, last_rows, -1)
  data[[paste0(col, ".mean")]] <- col_mean
}