如何从标准的回归线获得并应用R中测量值的梯度

时间:2018-03-08 13:46:34

标签: r atmosphere

我正在尝试将标准回归线应用于气体数据。数据看起来像下面的data.frame。阀门编号表示CO2是样品= out_air(1)还是标准(2)。在我的标准中,我只使用每个标准测量行(10 x)的最后5次测量的平均值。我想计算这两种方法之间的回归线。之后,应将其梯度应用于同时测量的out_air样本。有任何想法吗?我有更多的数据和标准行,为什么我需要一个可以应用于重复测量的脚本。

              Date    CO2     CH4 CO_LGR N2O_LGR Ventil      date_unconv
1     31.01.2018 00:02 410.87 1927.30 119.63  328.23      1 31.01.2018 00:02
2     31.01.2018 00:03 410.99 1925.54 119.47  328.23      1 31.01.2018 00:03
3     31.01.2018 00:04 410.91 1926.99 119.59  328.23      1 31.01.2018 00:04
4     31.01.2018 00:05 410.91 1926.90 119.64  328.22      1 31.01.2018 00:05
5     31.01.2018 00:06 410.88 1927.38 119.70  328.22      1 31.01.2018 00:06
6     31.01.2018 00:07 410.97 1925.69 119.51  328.25      1 31.01.2018 00:07
7     31.01.2018 00:08 410.95 1926.71 119.61  328.24      1 31.01.2018 00:08
8     31.01.2018 00:09 410.93 1926.80 119.65  328.21      1 31.01.2018 00:09
9     31.01.2018 00:10 410.90 1926.91 119.61  328.21      1 31.01.2018 00:10
10    31.01.2018 00:11 410.92 1926.72 119.61  328.24      1 31.01.2018 00:11
11    31.01.2018 00:12 410.96 1926.40 119.57  328.22      1 31.01.2018 00:12
12    31.01.2018 00:13 410.86 1927.34 119.70  328.21      1 31.01.2018 00:13
13    31.01.2018 00:14 410.92 1926.67 119.67  328.22      1 31.01.2018 00:14
14    31.01.2018 00:15 410.96 1926.23 119.65  328.23      1 31.01.2018 00:15
15    31.01.2018 00:16 410.91 1927.37 119.74  328.24      1 31.01.2018 00:16
16    31.01.2018 00:17 410.92 1927.19 119.68  328.21      1 31.01.2018 00:17
17    31.01.2018 00:18 410.95 1927.18 119.71  328.21      1 31.01.2018 00:18
18    31.01.2018 00:19 410.90 1927.76 119.77  328.23      1 31.01.2018 00:19
19    31.01.2018 00:20 410.99 1926.46 119.66  328.24      1 31.01.2018 00:20
20    31.01.2018 00:21 411.01 1925.92 119.59  328.23      1 31.01.2018 00:21
21    31.01.2018 00:22 410.89 1927.16 119.68  328.22      1 31.01.2018 00:22
22    31.01.2018 00:23 410.90 1927.05 119.72  328.22      1 31.01.2018 00:23
23    31.01.2018 00:24 410.89 1927.20 119.72  328.22      1 31.01.2018 00:24
                  Date    CO2     CH4 CO_LGR N2O_LGR Ventil      date_unconv
300   31.01.2018 05:01 404.88 1864.68 113.73  332.79      2 31.01.2018 05:01
301   31.01.2018 05:02 403.02 1849.81 118.14  335.29      2 31.01.2018 05:02
302   31.01.2018 05:03 403.01 1849.74 117.92  335.32      2 31.01.2018 05:03
303   31.01.2018 05:04 403.02 1849.80 117.87  335.33      2 31.01.2018 05:04
304   31.01.2018 05:05 403.00 1849.79 117.86  335.32      2 31.01.2018 05:05
305   31.01.2018 05:06 403.02 1849.88 117.85  335.34      2 31.01.2018 05:06
306   31.01.2018 05:07 403.02 1849.73 117.87  335.34      2 31.01.2018 05:07
307   31.01.2018 05:08 403.01 1849.77 117.84  335.35      2 31.01.2018 05:08
308   31.01.2018 05:09 403.01 1849.76 117.83  335.32      2 31.01.2018 05:09
309   31.01.2018 05:10 403.01 1849.75 117.84  335.31      2 31.01.2018 05:10

......

这是我到目前为止所写的:

    Dat <- data.frame
# declare time 
Dat$Date <- as.POSIXct(strptime(Dat$Date, format = "%d.%m.%Y %H:%M", tz = "GMT"))

#different valve datasets
out_air <- Dat[Dat$valve==1,]
work_std <- Dat[Dat$valve==2,]

# mean Std, out of last 5 values of the 10 measurements each
MeansCO2 <- apply(as.data.frame(matrix(work_std$CO2, nrow=10)[6:10,]),2,mean) 

# Date of last Std measurement time
Date4means <- work_std$Date[seq(10,nrow(work_std),10)]

# dataframe for Std means and date
StdMeans <- data.frame(Date4means, MeansCO2)


# using one minute later than last std cal in out_air dataset; This is not working for all std.’s since there were also other numbers in Ventile column which I have not mentioned because they are not important for my question 

stdcal <- which(is.element( out_air$Date, StdMeans$Date4means+60))
out_air$std_m <- ifelse(is.element( out_air$Date, StdMeans$Date4means+60), StdMeans$MeansCO2, NA) # some means are missing 
out_air$std <- out_air[stdcal,]

# dataset without outlier
out_air_small <- out_air[out_air$CO2<500,]

library(lattice)
library(latticeExtra)
plot_CO2 <- xyplot(CO2 ~ Date, data=out_air_small, ylim=400:430)
plot_CO2 + as.layer(xyplot(std_m ~ Date, data=out_air_small,col="red",pch = 4, cex = 1))

# plot is made out of all data I have with red X indicating the standards:

xyplot of all CO2 data (blue) and standards (red)

I also tried to calculate the gradient by using this function:

gradient <- function(data, stdcal,parameter, Date){
  x       <- seq(1,length(stdcal),1)
  y       <- data$parameter[stdcal]
  y1      <- y[which(data$parameter)]
  pos_y1  <- stdcal[x]
  x1      <- as.numeric(data$Date[pos_y1])
  y2      <- y[x+1]
  pos_y2  <- stdcal[x+1]
  x2      <- as.numeric(data$Date[pos_y2])
  m       <- (y2 - y1)/(x2 - x1)
  b       <- y1 - m*x1 
  grad    <- m*(as.numeric(data$Date[stdcal])) + b
  grad
  }
gradient(data=out_air, stdcal, parameter = CO2, Date)
# without success as I cannot use this date format and also no success when I use the unformated date version.

我非常感谢您的帮助,因为我现在正在研究这个问题。

谢谢!

2 个答案:

答案 0 :(得分:0)

您似乎想要的计算对您共享的数据不起作用。无论如何,这似乎有用:

数据

library(data.table) # makes it easier to query/subset datasets
dt <- as.data.table(read.delim("clipboard", header = T, sep = "|"))
setnames(dt, c("Date", "CO2"), c("date", "co2"))
dt$date <- as.POSIXct(dt$date, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")

代码

# Function to divide data.table into chunks of "n" observations and return mean value of 
# co2 for n/2 observations. 

retVals <- function(z, chunk = 10){
  n <- nrow(z)
  r <- rep(1:ceiling(n/chunk), each = chunk)[1:n]

  # Split data.table into chunks - set to 10 based on the question
  dat_list <- split(z, r)
  ret_list <- lapply(dat_list, function(q){

    # averaging second half of an observation set set midval <- 5  if you only want the
    # last 5 observations to be averaged, regardless of the chunk size 

    midval <- ceiling(nrow(q)/2)
    return(data.table(date = q$date[nrow(q)], 
                      co2 = mean(q$co2[midval:nrow(q)], na.rm = T) ))
  })
  return(rbindlist(ret_list, use.names = T, fill = T))
}

这是一些输出:

> retVals(dt[valve == 1])
                  date    co2
1: 2018-01-31 00:13:00 410.92
2: 2018-01-31 00:33:00 410.90
> retVals(dt[valve == 2])
                  date      co2
1: 2018-01-31 00:23:00 410.9400
2: 2018-01-31 00:43:00 410.9133

从给定数据集获取当前时间co2的值加上某个恒定时间间隔(time_interval)的函数:

out_air <- dt[valve == 1, ]
work_std <- dt[valve == 2, ]

nextObv <- function(tme, dat = out_air, time_interval = 60){
  dat_sub <- dat[date == tme + time_interval, ]
  retval <- NA
  if(nrow(dat_sub) > 0){
    retval <- mean(dat_sub$co2)
  }
  return(retval)
}

一些输出:

> q <- retVals(dt[valve == 1])
> q
                  date    co2
1: 2018-01-31 00:13:00 410.92
2: 2018-01-31 00:33:00 410.90
> q[, next_min_co2 := nextObv(date, dat = work_std), by = date]
> q
                  date    co2 next_min_co2
1: 2018-01-31 00:13:00 410.92       410.92
2: 2018-01-31 00:33:00 410.90       410.90

此处q是外部空气的数据集(valve == 1),其中,date是每个10个观察块的最后日期,co2是平均值这10个观察的后半部分co2。您发布的数据(dt)有valve == 1的20个观察值,这意味着两个大小为10的块,这就是我们在q中有2个观察值的原因。

在第二步中,对于每次观察,我从co2获得work_std的值,该值是一分钟后记录的。我使用了work_std,但您只需使用相同的结果调用dt[valve == 2]

> q[, next_min_co2 := nextObv(date, dat = dt[valve == 2]), by = date]
> q
                  date    co2 next_min_co2
1: 2018-01-31 00:13:00 410.92       410.92
2: 2018-01-31 00:33:00 410.90       410.90

根据问题中提供的逻辑,上述值是准确的。以上所有代码都适用于整个数据集。

根据观察co2 < 500的异常值的逻辑,整个数据集是一个异常值。我不确定你想要计算什么“渐变”。如果您对查找co2 vs date的线性方程感兴趣,可以使用:

lm(co2 ~ as.numeric(date), data = q)

最好使用经过时间而不是绝对时间,但我不知道你的最终目标。如果您想要不同的内容,请在您的问题中明确指出您要查找的内容,例如:对于给定的瞬间,您想要找出(next_min_co2 - co2)/as.numeric(date)或其他内容的值。在示例中使用数值来显示您想要的计算,以便其他人理解相同的内容。

希望这会有所帮助。

修改

使用approxfun获取所有时间的插值:

outModel <- approxfun(x = out_air$date, y = out_air$co2, method = "linear")
workModel <- approxfun(x = work_std$date, y = work_std$co2, method = "linear")

nextObv <- function(tme, model = outModel, time_interval = 60){
  return(model(tme + time_interval))
}

> q <- retVals(dt[valve == 2])
> q
                  date      co2
1: 2018-01-31 00:23:00 410.9400
2: 2018-01-31 00:43:00 410.9133

approxfun在已知数据点之间进行插值以输出预期值。在您的情况下,由于数据可能是不连续的,您可以使用插值来代替。在下面的示例中,我们尝试两个time_intervals - 60和30.使用60应该返回一个精确值但是30返回q中第一行的插值。第二行将在两种情况下显示NA,因为它超出了数据范围。我们正在使用outModel,因此使用下面的out_air数据集(与dt[valve == 1,]相同)对值进行插值:

> dt[valve == 1]
                   date    co2 valve
 1: 2018-01-31 00:04:00 410.91     1
 2: 2018-01-31 00:05:00 410.91     1
 3: 2018-01-31 00:06:00 410.88     1
 4: 2018-01-31 00:07:00 410.97     1
 5: 2018-01-31 00:08:00 410.95     1
 6: 2018-01-31 00:09:00 410.93     1
 7: 2018-01-31 00:10:00 410.90     1
 8: 2018-01-31 00:11:00 410.92     1
 9: 2018-01-31 00:12:00 410.96     1
10: 2018-01-31 00:13:00 410.86     1
11: 2018-01-31 00:24:00 410.89     1
12: 2018-01-31 00:25:00 410.88     1
13: 2018-01-31 00:26:00 410.90     1
14: 2018-01-31 00:27:00 410.91     1
15: 2018-01-31 00:28:00 410.93     1
16: 2018-01-31 00:29:00 410.94     1
17: 2018-01-31 00:30:00 410.94     1
18: 2018-01-31 00:31:00 410.88     1
19: 2018-01-31 00:32:00 410.87     1
20: 2018-01-31 00:33:00 410.84     1

> q$newvals <- nextObv(q$date)
> q
                  date      co2 newvals
1: 2018-01-31 00:23:00 410.9400  410.89
2: 2018-01-31 00:43:00 410.9133      NA

> q$newvals2 <- nextObv(q$date, time_interval = 30)
> q
                  date      co2 newvals newvals2
1: 2018-01-31 00:23:00 410.9400  410.89 410.8886
2: 2018-01-31 00:43:00 410.9133      NA       NA

如果您不想要插值,请先创建一个函数来获取精确值,如果返回NA,则使用approxfun获取插值。正如我之前提到的,使用经过时间比使用确切的时间瞬间(查找difftime以获得日期的时间间隔b / w)更好。

答案 1 :(得分:0)

我上面改变了所有功能和数据显示。好像它终于有效了。我甚至不需要插值。谢谢你,@ Gautam!我从你的帮助中学到了很多东西!

    # declare time 
Dat$Date <- as.POSIXct(strptime(Dat$Date, format = "%d.%m.%Y %H:%M", tz = "GMT"))

#different valve datasets
out_air <- Dat[Dat$Ventil==1,]
work_std <- Dat[Dat$Ventil==2,]
std <- Dat[Dat$Ventil==3 | Dat$Ventil==4 ,] # not visible in the uploaded data
tgt <- Dat[Dat$Ventil==5,] # not visible in the uploaded data

# mean Std's, using last 5 values of the 10 measurements each
MeansCO2 <- apply(as.data.frame(matrix(work_std$CO2, nrow=10)[6:10,]),2,mean) 
MeansCH4 <- apply(as.data.frame(matrix(work_std$CH4, nrow=10)[6:10,]),2,mean) 
MeansCO <- apply(as.data.frame(matrix(work_std$CO_LGR, nrow=10)[6:10,]),2,mean) 
MeansN2O <- apply(as.data.frame(matrix(work_std$N2O_LGR, nrow=10)[6:10,]),2,mean) 

# Date of last Std measurement time
Date4means <- work_std$Date[seq(10,nrow(work_std),10)]

# dataframe for Std means and date
StdMeans <- data.frame(Date4means, MeansCO2, MeansCH4,MeansCO,MeansN2O)

# location of std mean in dataframe
which(is.element( Dat$Date, StdMeans$Date4means))

#### CO2 ######
Dat$CO2_Std_MEAN<-ifelse(is.element( Dat$Date, StdMeans$Date4means), StdMeans$MeansCO2, NA) 
pos_Std_Dat <- which(!is.na(Dat$CO2_Std_MEAN))

# function to get gradients between the CO2 calibrations

xyplot(MeansCO2~Date4means,data=StdMeans, typ=c("l","p","r"))

CO2_gradients <- function(x,data=StdMeans){
  LM <- lm(MeansCO2[x:(x+1)]~Date4means[x:(x+1)])
  grad <- coef(LM)[2] # gives you the slope
  names(grad) <- names(Date4means)
  grad
}

CO2_slopes <- sapply(1:nrow(StdMeans), CO2_gradients) # apply on all cal std pairs
CO2_Dat_slope <- data.frame(StdMeans$Date4means, CO2_slopes) # make data.frame
CO2_Dat_slope

which(is.element( Dat$Date, CO2_Dat_slope$StdMeans.Date4means))

#add CO2_slope to big data.frame
Dat$CO2_slope<-ifelse(is.element( Dat$Date, CO2_Dat_slope$StdMeans.Date4means), CO2_Dat_slope$CO2_slopes, NA)

# make new data.frame including only out_air and work_std data
Dat_calCO2 <- Dat[Dat$Ventil==1| Dat$Ventil==2,]
pos_Std_mean <- which(!is.na(Dat_calCO2$CO2_Std_MEAN)) # location where mean std is


# apply the gradient on the measured CO2 data
CO2_cal <- function(x,data=Dat_calCO2){
  pos <- pos_Std_mean[x:(x+1)]                # use a position and the following 
  CO2_pos <- data$CO2[pos[1]:pos[2]]          # use a position and the following --> std and next std 
  CO2_corr <- CO2_pos * data$CO2_slope[pos_Std_mean[x]] + CO2_pos # apply gradient on CO2 data
  dat_new <- data.frame(data$Date[pos[1]:pos[2]],CO2_corr)# make data.frame
  colnames(dat_new) <- c("Date", "CO2_corr") # give names
  dat_new
}

# apply function on all CO2 sections, result is a list
CO2_corr_all <- sapply(1:(nrow(StdMeans)-1), CO2_cal) # including the last value would add NA that's why -1

# to get Date and Value pairs: in CO2_corr_all list the odd nr's are Dates, the equal nr's are the corrected CO2 values 
Data_Frame <-function(x, data=CO2_corr_all){
  trans <-t(data.frame(data[[x]], data[[x+1]] ))
  trans
}

# apply only on the odd nr's in the list
CO2_new <- sapply(seq(1,length(cbind(CO2_corr_all)),2),Data_Frame)
CO2_corr_data_t<-as.data.frame(CO2_new) # make data.frame
CO2_corr_data<-data.frame(t(CO2_corr_data_t))
colnames(CO2_corr_data)<- c("Date", "CO2_corr")

# convert into correct formats
CO2_corr_data$Date <- as.POSIXct(CO2_corr_data$Date, tz="GMT")
CO2_corr_data$CO2_corr <- as.numeric(levels(CO2_corr_data$CO2_corr))[CO2_corr_data$CO2_corr]

# add corr CO2 data to data.frame
Dat_cal$CO2_corr <-ifelse(is.element(Dat_cal$Date, CO2_corr_data$Date),CO2_corr_data$CO2_corr , NA)