我正在尝试将标准回归线应用于气体数据。数据看起来像下面的data.frame。阀门编号表示CO2是样品= out_air(1)还是标准(2)。在我的标准中,我只使用每个标准测量行(10 x)的最后5次测量的平均值。我想计算这两种方法之间的回归线。之后,应将其梯度应用于同时测量的out_air样本。有任何想法吗?我有更多的数据和标准行,为什么我需要一个可以应用于重复测量的脚本。
Date CO2 CH4 CO_LGR N2O_LGR Ventil date_unconv
1 31.01.2018 00:02 410.87 1927.30 119.63 328.23 1 31.01.2018 00:02
2 31.01.2018 00:03 410.99 1925.54 119.47 328.23 1 31.01.2018 00:03
3 31.01.2018 00:04 410.91 1926.99 119.59 328.23 1 31.01.2018 00:04
4 31.01.2018 00:05 410.91 1926.90 119.64 328.22 1 31.01.2018 00:05
5 31.01.2018 00:06 410.88 1927.38 119.70 328.22 1 31.01.2018 00:06
6 31.01.2018 00:07 410.97 1925.69 119.51 328.25 1 31.01.2018 00:07
7 31.01.2018 00:08 410.95 1926.71 119.61 328.24 1 31.01.2018 00:08
8 31.01.2018 00:09 410.93 1926.80 119.65 328.21 1 31.01.2018 00:09
9 31.01.2018 00:10 410.90 1926.91 119.61 328.21 1 31.01.2018 00:10
10 31.01.2018 00:11 410.92 1926.72 119.61 328.24 1 31.01.2018 00:11
11 31.01.2018 00:12 410.96 1926.40 119.57 328.22 1 31.01.2018 00:12
12 31.01.2018 00:13 410.86 1927.34 119.70 328.21 1 31.01.2018 00:13
13 31.01.2018 00:14 410.92 1926.67 119.67 328.22 1 31.01.2018 00:14
14 31.01.2018 00:15 410.96 1926.23 119.65 328.23 1 31.01.2018 00:15
15 31.01.2018 00:16 410.91 1927.37 119.74 328.24 1 31.01.2018 00:16
16 31.01.2018 00:17 410.92 1927.19 119.68 328.21 1 31.01.2018 00:17
17 31.01.2018 00:18 410.95 1927.18 119.71 328.21 1 31.01.2018 00:18
18 31.01.2018 00:19 410.90 1927.76 119.77 328.23 1 31.01.2018 00:19
19 31.01.2018 00:20 410.99 1926.46 119.66 328.24 1 31.01.2018 00:20
20 31.01.2018 00:21 411.01 1925.92 119.59 328.23 1 31.01.2018 00:21
21 31.01.2018 00:22 410.89 1927.16 119.68 328.22 1 31.01.2018 00:22
22 31.01.2018 00:23 410.90 1927.05 119.72 328.22 1 31.01.2018 00:23
23 31.01.2018 00:24 410.89 1927.20 119.72 328.22 1 31.01.2018 00:24
Date CO2 CH4 CO_LGR N2O_LGR Ventil date_unconv
300 31.01.2018 05:01 404.88 1864.68 113.73 332.79 2 31.01.2018 05:01
301 31.01.2018 05:02 403.02 1849.81 118.14 335.29 2 31.01.2018 05:02
302 31.01.2018 05:03 403.01 1849.74 117.92 335.32 2 31.01.2018 05:03
303 31.01.2018 05:04 403.02 1849.80 117.87 335.33 2 31.01.2018 05:04
304 31.01.2018 05:05 403.00 1849.79 117.86 335.32 2 31.01.2018 05:05
305 31.01.2018 05:06 403.02 1849.88 117.85 335.34 2 31.01.2018 05:06
306 31.01.2018 05:07 403.02 1849.73 117.87 335.34 2 31.01.2018 05:07
307 31.01.2018 05:08 403.01 1849.77 117.84 335.35 2 31.01.2018 05:08
308 31.01.2018 05:09 403.01 1849.76 117.83 335.32 2 31.01.2018 05:09
309 31.01.2018 05:10 403.01 1849.75 117.84 335.31 2 31.01.2018 05:10
......
这是我到目前为止所写的:
Dat <- data.frame
# declare time
Dat$Date <- as.POSIXct(strptime(Dat$Date, format = "%d.%m.%Y %H:%M", tz = "GMT"))
#different valve datasets
out_air <- Dat[Dat$valve==1,]
work_std <- Dat[Dat$valve==2,]
# mean Std, out of last 5 values of the 10 measurements each
MeansCO2 <- apply(as.data.frame(matrix(work_std$CO2, nrow=10)[6:10,]),2,mean)
# Date of last Std measurement time
Date4means <- work_std$Date[seq(10,nrow(work_std),10)]
# dataframe for Std means and date
StdMeans <- data.frame(Date4means, MeansCO2)
# using one minute later than last std cal in out_air dataset; This is not working for all std.’s since there were also other numbers in Ventile column which I have not mentioned because they are not important for my question
stdcal <- which(is.element( out_air$Date, StdMeans$Date4means+60))
out_air$std_m <- ifelse(is.element( out_air$Date, StdMeans$Date4means+60), StdMeans$MeansCO2, NA) # some means are missing
out_air$std <- out_air[stdcal,]
# dataset without outlier
out_air_small <- out_air[out_air$CO2<500,]
library(lattice)
library(latticeExtra)
plot_CO2 <- xyplot(CO2 ~ Date, data=out_air_small, ylim=400:430)
plot_CO2 + as.layer(xyplot(std_m ~ Date, data=out_air_small,col="red",pch = 4, cex = 1))
# plot is made out of all data I have with red X indicating the standards:
xyplot of all CO2 data (blue) and standards (red)
I also tried to calculate the gradient by using this function:
gradient <- function(data, stdcal,parameter, Date){
x <- seq(1,length(stdcal),1)
y <- data$parameter[stdcal]
y1 <- y[which(data$parameter)]
pos_y1 <- stdcal[x]
x1 <- as.numeric(data$Date[pos_y1])
y2 <- y[x+1]
pos_y2 <- stdcal[x+1]
x2 <- as.numeric(data$Date[pos_y2])
m <- (y2 - y1)/(x2 - x1)
b <- y1 - m*x1
grad <- m*(as.numeric(data$Date[stdcal])) + b
grad
}
gradient(data=out_air, stdcal, parameter = CO2, Date)
# without success as I cannot use this date format and also no success when I use the unformated date version.
我非常感谢您的帮助,因为我现在正在研究这个问题。
谢谢!
答案 0 :(得分:0)
您似乎想要的计算对您共享的数据不起作用。无论如何,这似乎有用:
数据
library(data.table) # makes it easier to query/subset datasets
dt <- as.data.table(read.delim("clipboard", header = T, sep = "|"))
setnames(dt, c("Date", "CO2"), c("date", "co2"))
dt$date <- as.POSIXct(dt$date, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
代码
# Function to divide data.table into chunks of "n" observations and return mean value of
# co2 for n/2 observations.
retVals <- function(z, chunk = 10){
n <- nrow(z)
r <- rep(1:ceiling(n/chunk), each = chunk)[1:n]
# Split data.table into chunks - set to 10 based on the question
dat_list <- split(z, r)
ret_list <- lapply(dat_list, function(q){
# averaging second half of an observation set set midval <- 5 if you only want the
# last 5 observations to be averaged, regardless of the chunk size
midval <- ceiling(nrow(q)/2)
return(data.table(date = q$date[nrow(q)],
co2 = mean(q$co2[midval:nrow(q)], na.rm = T) ))
})
return(rbindlist(ret_list, use.names = T, fill = T))
}
这是一些输出:
> retVals(dt[valve == 1])
date co2
1: 2018-01-31 00:13:00 410.92
2: 2018-01-31 00:33:00 410.90
> retVals(dt[valve == 2])
date co2
1: 2018-01-31 00:23:00 410.9400
2: 2018-01-31 00:43:00 410.9133
从给定数据集获取当前时间co2
的值加上某个恒定时间间隔(time_interval
)的函数:
out_air <- dt[valve == 1, ]
work_std <- dt[valve == 2, ]
nextObv <- function(tme, dat = out_air, time_interval = 60){
dat_sub <- dat[date == tme + time_interval, ]
retval <- NA
if(nrow(dat_sub) > 0){
retval <- mean(dat_sub$co2)
}
return(retval)
}
一些输出:
> q <- retVals(dt[valve == 1])
> q
date co2
1: 2018-01-31 00:13:00 410.92
2: 2018-01-31 00:33:00 410.90
> q[, next_min_co2 := nextObv(date, dat = work_std), by = date]
> q
date co2 next_min_co2
1: 2018-01-31 00:13:00 410.92 410.92
2: 2018-01-31 00:33:00 410.90 410.90
此处q
是外部空气的数据集(valve == 1
),其中,date
是每个10个观察块的最后日期,co2
是平均值这10个观察的后半部分co2
。您发布的数据(dt
)有valve == 1
的20个观察值,这意味着两个大小为10的块,这就是我们在q
中有2个观察值的原因。
在第二步中,对于每次观察,我从co2
获得work_std
的值,该值是一分钟后记录的。我使用了work_std
,但您只需使用相同的结果调用dt[valve == 2]
:
> q[, next_min_co2 := nextObv(date, dat = dt[valve == 2]), by = date]
> q
date co2 next_min_co2
1: 2018-01-31 00:13:00 410.92 410.92
2: 2018-01-31 00:33:00 410.90 410.90
根据问题中提供的逻辑,上述值是准确的。以上所有代码都适用于整个数据集。
根据观察co2 < 500
的异常值的逻辑,整个数据集是一个异常值。我不确定你想要计算什么“渐变”。如果您对查找co2
vs date
的线性方程感兴趣,可以使用:
lm(co2 ~ as.numeric(date), data = q)
最好使用经过时间而不是绝对时间,但我不知道你的最终目标。如果您想要不同的内容,请在您的问题中明确指出您要查找的内容,例如:对于给定的瞬间,您想要找出(next_min_co2 - co2)/as.numeric(date)
或其他内容的值。在示例中使用数值来显示您想要的计算,以便其他人理解相同的内容。
希望这会有所帮助。
修改强>
使用approxfun
获取所有时间的插值:
outModel <- approxfun(x = out_air$date, y = out_air$co2, method = "linear")
workModel <- approxfun(x = work_std$date, y = work_std$co2, method = "linear")
nextObv <- function(tme, model = outModel, time_interval = 60){
return(model(tme + time_interval))
}
> q <- retVals(dt[valve == 2])
> q
date co2
1: 2018-01-31 00:23:00 410.9400
2: 2018-01-31 00:43:00 410.9133
approxfun
在已知数据点之间进行插值以输出预期值。在您的情况下,由于数据可能是不连续的,您可以使用插值来代替。在下面的示例中,我们尝试两个time_intervals
- 60和30.使用60应该返回一个精确值但是30返回q
中第一行的插值。第二行将在两种情况下显示NA
,因为它超出了数据范围。我们正在使用outModel
,因此使用下面的out_air
数据集(与dt[valve == 1,]
相同)对值进行插值:
> dt[valve == 1]
date co2 valve
1: 2018-01-31 00:04:00 410.91 1
2: 2018-01-31 00:05:00 410.91 1
3: 2018-01-31 00:06:00 410.88 1
4: 2018-01-31 00:07:00 410.97 1
5: 2018-01-31 00:08:00 410.95 1
6: 2018-01-31 00:09:00 410.93 1
7: 2018-01-31 00:10:00 410.90 1
8: 2018-01-31 00:11:00 410.92 1
9: 2018-01-31 00:12:00 410.96 1
10: 2018-01-31 00:13:00 410.86 1
11: 2018-01-31 00:24:00 410.89 1
12: 2018-01-31 00:25:00 410.88 1
13: 2018-01-31 00:26:00 410.90 1
14: 2018-01-31 00:27:00 410.91 1
15: 2018-01-31 00:28:00 410.93 1
16: 2018-01-31 00:29:00 410.94 1
17: 2018-01-31 00:30:00 410.94 1
18: 2018-01-31 00:31:00 410.88 1
19: 2018-01-31 00:32:00 410.87 1
20: 2018-01-31 00:33:00 410.84 1
> q$newvals <- nextObv(q$date)
> q
date co2 newvals
1: 2018-01-31 00:23:00 410.9400 410.89
2: 2018-01-31 00:43:00 410.9133 NA
> q$newvals2 <- nextObv(q$date, time_interval = 30)
> q
date co2 newvals newvals2
1: 2018-01-31 00:23:00 410.9400 410.89 410.8886
2: 2018-01-31 00:43:00 410.9133 NA NA
如果您不想要插值,请先创建一个函数来获取精确值,如果返回NA
,则使用approxfun
获取插值。正如我之前提到的,使用经过时间比使用确切的时间瞬间(查找difftime
以获得日期的时间间隔b / w)更好。
答案 1 :(得分:0)
我上面改变了所有功能和数据显示。好像它终于有效了。我甚至不需要插值。谢谢你,@ Gautam!我从你的帮助中学到了很多东西!
# declare time
Dat$Date <- as.POSIXct(strptime(Dat$Date, format = "%d.%m.%Y %H:%M", tz = "GMT"))
#different valve datasets
out_air <- Dat[Dat$Ventil==1,]
work_std <- Dat[Dat$Ventil==2,]
std <- Dat[Dat$Ventil==3 | Dat$Ventil==4 ,] # not visible in the uploaded data
tgt <- Dat[Dat$Ventil==5,] # not visible in the uploaded data
# mean Std's, using last 5 values of the 10 measurements each
MeansCO2 <- apply(as.data.frame(matrix(work_std$CO2, nrow=10)[6:10,]),2,mean)
MeansCH4 <- apply(as.data.frame(matrix(work_std$CH4, nrow=10)[6:10,]),2,mean)
MeansCO <- apply(as.data.frame(matrix(work_std$CO_LGR, nrow=10)[6:10,]),2,mean)
MeansN2O <- apply(as.data.frame(matrix(work_std$N2O_LGR, nrow=10)[6:10,]),2,mean)
# Date of last Std measurement time
Date4means <- work_std$Date[seq(10,nrow(work_std),10)]
# dataframe for Std means and date
StdMeans <- data.frame(Date4means, MeansCO2, MeansCH4,MeansCO,MeansN2O)
# location of std mean in dataframe
which(is.element( Dat$Date, StdMeans$Date4means))
#### CO2 ######
Dat$CO2_Std_MEAN<-ifelse(is.element( Dat$Date, StdMeans$Date4means), StdMeans$MeansCO2, NA)
pos_Std_Dat <- which(!is.na(Dat$CO2_Std_MEAN))
# function to get gradients between the CO2 calibrations
xyplot(MeansCO2~Date4means,data=StdMeans, typ=c("l","p","r"))
CO2_gradients <- function(x,data=StdMeans){
LM <- lm(MeansCO2[x:(x+1)]~Date4means[x:(x+1)])
grad <- coef(LM)[2] # gives you the slope
names(grad) <- names(Date4means)
grad
}
CO2_slopes <- sapply(1:nrow(StdMeans), CO2_gradients) # apply on all cal std pairs
CO2_Dat_slope <- data.frame(StdMeans$Date4means, CO2_slopes) # make data.frame
CO2_Dat_slope
which(is.element( Dat$Date, CO2_Dat_slope$StdMeans.Date4means))
#add CO2_slope to big data.frame
Dat$CO2_slope<-ifelse(is.element( Dat$Date, CO2_Dat_slope$StdMeans.Date4means), CO2_Dat_slope$CO2_slopes, NA)
# make new data.frame including only out_air and work_std data
Dat_calCO2 <- Dat[Dat$Ventil==1| Dat$Ventil==2,]
pos_Std_mean <- which(!is.na(Dat_calCO2$CO2_Std_MEAN)) # location where mean std is
# apply the gradient on the measured CO2 data
CO2_cal <- function(x,data=Dat_calCO2){
pos <- pos_Std_mean[x:(x+1)] # use a position and the following
CO2_pos <- data$CO2[pos[1]:pos[2]] # use a position and the following --> std and next std
CO2_corr <- CO2_pos * data$CO2_slope[pos_Std_mean[x]] + CO2_pos # apply gradient on CO2 data
dat_new <- data.frame(data$Date[pos[1]:pos[2]],CO2_corr)# make data.frame
colnames(dat_new) <- c("Date", "CO2_corr") # give names
dat_new
}
# apply function on all CO2 sections, result is a list
CO2_corr_all <- sapply(1:(nrow(StdMeans)-1), CO2_cal) # including the last value would add NA that's why -1
# to get Date and Value pairs: in CO2_corr_all list the odd nr's are Dates, the equal nr's are the corrected CO2 values
Data_Frame <-function(x, data=CO2_corr_all){
trans <-t(data.frame(data[[x]], data[[x+1]] ))
trans
}
# apply only on the odd nr's in the list
CO2_new <- sapply(seq(1,length(cbind(CO2_corr_all)),2),Data_Frame)
CO2_corr_data_t<-as.data.frame(CO2_new) # make data.frame
CO2_corr_data<-data.frame(t(CO2_corr_data_t))
colnames(CO2_corr_data)<- c("Date", "CO2_corr")
# convert into correct formats
CO2_corr_data$Date <- as.POSIXct(CO2_corr_data$Date, tz="GMT")
CO2_corr_data$CO2_corr <- as.numeric(levels(CO2_corr_data$CO2_corr))[CO2_corr_data$CO2_corr]
# add corr CO2 data to data.frame
Dat_cal$CO2_corr <-ifelse(is.element(Dat_cal$Date, CO2_corr_data$Date),CO2_corr_data$CO2_corr , NA)