插值数据:如何在不同组的一个数据集中插值数据,并将其应用于具有相同组的另一数据集?

时间:2019-01-23 19:27:45

标签: r dplyr interpolation linear-regression

我有两个数据集: 首先是水分数据:按日期/地点/处理/树木进行组织,并包含百分比(水分百分比)

我还有另一个数据集(SCD),其中包含日期/地点/处理/树,然后有采集量。 首先,日期不排队(按天关闭)。每个处理方法有8棵树,每个站点有2种处理方法

我想使用水分数据在给定的采样日内对每棵树的数据进行插值(我已经使用平均值进行了此操作)。

我希望根据每个给定地点/处理/树木的水分来回归日期,然后从样本收集数据中获取日期以获取水分百分比。

水分数据片段:

structure(list(date = structure(c(17681, 17681, 
17681, 17681, 
17681, 17681, 17681, 17681, 17681, 17681, 17681, 
17681, 17681, 
17681, 17681, 17681, 17681, 17681, 17681, 17681, 
17681, 17681, 
17681, 17681, 17681, 17681, 17681, 17681, 17681, 
17681, 17681, 
17681, 17681, 17685, 17685, 17685, 17685, 17685, 
17685, 17685, 
17685, 17685, 17685, 17685, 17685, 17685, 17685, 
17685, 17685, 
17685), class = "Date"), site = structure(c(2L, 4L, 
2L, 4L, 6L, 
2L, 4L, 4L, 2L, 4L, 4L, 6L, 4L, 6L, 4L, 6L, 4L, 6L, 
4L, 2L, 4L, 
6L, 4L, 6L, 4L, 6L, 4L, 6L, 4L, 6L, 2L, 4L, 6L, 2L, 
4L, 2L, 4L, 
6L, 2L, 4L, 2L, 4L, 2L, 4L, 4L, 6L, 2L, 4L, 6L, 4L), 
.Label = c("hydric", 
"Hydric", "mesic", "Mesic", "xeric", "Xeric"), class 
= "factor"), 
trt = structure(c(1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 
1L, 
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 
2L, 1L, 
1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 
1L, 1L, 
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L), .Label = 
c("c", 
"s"), class = "factor"), tree = c(1L, 1L, 1L, 1L, 1L, 
2L, 
2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 
6L, 6L, 
6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 1L, 
1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 
4L), 
percent = c(55.19, 54.22, 56.8, 42.05, 45, 67.53, 50, 
41.85, 
58.39, 51.76, 38.2, 43.6, 60.27, 32.25, 42.1, 49.25, 
47.85, 
25.25, 53.1, 44.25, 46.15, 42.65, 52.18, 40.4, 37.8, 
47, 
44.25, 43.75, 31.8, 37.05, 47, 62.34, 48.55, 64.86, 
44.45, 
61.9, 47.1, 47.25, 68.05, 46.77, 59.93, 33.5, 68.11, 
52.61, 
41.05, 44.2, 63.76, 58.42, 34.8, 62.92), year = 
c(2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 
2018L, 2018L, 2018L, 2018L)), .Names = c("date", 
"site", 
"trt", "tree", "percent", "year"), row.names = c(NA, 
50L), class = "data.frame")

样品采集数据:

structure(list(Date = structure(c(2L, 2L, 2L, 2L, 2L, 
2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 
3L), .Label = c("43333", "5/31/2018", "6/1/2018", 
"6/10/2018", 
"6/11/2018", "6/14/2018", "6/15/2018", "6/16/2018", 
"6/2/2018", 
"6/20/2018", "6/21/2018", "6/24/2018", "6/25/2018", 
"6/26/2018", 
"6/27/2018", "6/28/2018", "6/29/2018", "6/9/2018", 
"7/14/2018", 
"7/15/2018", "7/16/2018", "7/20/2018", "7/21/2018", 
"7/23/2018", 
"7/24/2018", "7/25/2018", "7/28/2018", "7/29/2018", 
"7/30/2018", 
"7/6/2018", "7/7/2018", "7/9/2018", "8/11/2018", 
"8/12/2018", 
"8/16/2018", "8/17/2018", "8/18/2018", "8/2/2018", 
"8/20/2018", 
"8/21/2018", "8/22/2018", "8/24/2018", "8/26/2018", 
"8/27/2018", 
"8/3/2018", "8/4/2018", "8/9/2018"), class = 
"factor"), Site = structure(c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Hydric", 
"Mesic", "Xeric"
), class = "factor"), treatment = structure(c(5L, 5L, 
5L, 5L, 
5L, 5L, 5L, 5L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 5L, 
5L, 5L, 5L, 
5L, 5L, 5L, 5L), .Label = c("Ancillary", "Control", 
"Fertalized", 
"Lysim", "Snowfence"), class = "factor"), Plot = 
c(1L, 2L, 3L, 
4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
1L, 2L, 3L, 
4L, 5L, 6L, 7L, 8L), Fill = structure(c(3L, 7L, 7L, 
3L, 5L, 5L, 
5L, 4L, 5L, 5L, 3L, 5L, 5L, 5L, 4L, 7L, 3L, 7L, 7L, 
4L, 5L, 4L, 
7L, 5L), .Label = c("", "Broken", "D", "F", "M", 
"reinstall", 
"S", "Sp", "SP", "VF"), class = "factor"), Mass = 
c(12.2, 7.2, 
12, 8.1, 11.4, 8.3, 12.5, 12.8, 12.6, 12.9, 7.4, 
12.6, 12.8, 
9.8, 12.8, 9.3, 8, 9.8, 11.6, 12.8, 10.4, 13.1, 10.6, 
12.9)), .Names = c("Date", 
"Site", "treatment", "Plot", "Fill", "Mass"), 
row.names = c(14L, 
15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 
25L, 26L, 27L, 
28L, 29L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L), 
class = "data.frame")

Moisture$date<-as.Date(Moisture$date, 
format="%m/%d/%Y")
SCD$Date<-as.Date(SCD$Date, format="%m/%d/%Y")

因此,目标是获取与SCD数据集一致的水分百分比数据,以进行进一步分析。我想对每棵树进行回归,然后输入SCD日期/树/站点并返回百分比,并将其添加为SCD中的列。

我知道这远非正确,但这是我目前的尝试。

library(dplyr)
MoistForSCD <- Moisture %>% 
group_by(site,trt,tree) %>% 
arrange(date,tree,site,trt) %>% 
mutate(Loess = predict(loess(percent ~ date, span = 
.5, data=.),SCD))
SCD$M<-MoistForSCD

1 个答案:

答案 0 :(得分:1)

您可以这样做:

library(tidyverse)
library(lubridate)

Moisture %>%
  mutate(set = 'train') %>%
  select(-year) %>%
  bind_rows(
    SCD %>%
      select(Date, Site, treatment, Plot, Mass) %>%
      rename_at(1:4, ~colnames(Moisture)[1:4]) %>%
      mutate(
        trt = trt %>% str_sub(1, 1) %>% str_to_lower(),
        set = 'test'
      )
  ) %>%
  mutate(month = date %>% month()) %>%
  group_by(tree) %>%
  nest() %>%
  mutate(
    train = map(data, ~filter(.x, set == 'train')),
    test = map(data, ~filter(.x, set == 'test') %>% select(-percent)),
    model = map(train, ~lm(percent ~ date + trt, data = .x)),
    pred = map2(model, test, ~predict(.x, newdata = .y))
  ) %>%
  select(test, pred, tree) %>%
  unnest() %>%
  select(-set)

如果您有更多数据,还可以在公式中包含site变量:

percent ~ date + site + trt