Question

我想知道 dplyr 是否提供了有用的工具，可以对地表温度时间序列进行快速数据汇总。我已经从 E-OBS 数据集（E-OBS grid data）中提取了德国的网格化数据，并将提取出的栅格网格以表格形式导出为 Excel 格式。在新导出的数据中，每一行对应一个地理坐标对及其 15 年的温度观测值（1012 行，15x365 / 366 列）。请快速查看数据：time series data。

以下是我想要做的事情：对于数据 time series data，我希望按年进行数据汇总，因为原始观测是按日记录的。具体来说，对于每个地理坐标对，我打算计算每年的年平均温度，并对全部 15 年都执行此操作。更具体地说，在完成聚合之后，我想把结果放在一个保留原始地理坐标对的新 data.frame 中，并添加新列，例如 `1980_avg_temp`、`1981_avg_temp`、`1982_avg_temp` 等等。也就是说，我希望通过引入存放年平均温度的新聚合列来减少数据的列维度。

如何使用 dplyr 或 data.table 对这份 Excel 数据完成此操作？有没有更简单的方法，可以动态地对所附数据 time series data 进行数据聚合操作？有什么想法？

Answer 1

我试过了：

library(tidyverse)
library(readxl)
# Read the exported grid table. The reshaping below relies on two coordinate
# columns named `x` and `y`; every other column is treated as one daily
# temperature reading.
df <- read_excel("YOUR_XLSX_FILE")

df %>%
  # Wide -> long: one row per grid cell and daily column.
  pivot_longer(cols = -c(x, y), names_to = "date", values_to = "temp") %>%
  # Pull the four-digit year out of the column name. The earlier two-step
  # separate() implied names shaped like "X<year><sep><month><sep><day>"
  # -- TODO confirm against the actual sheet.
  mutate(year = str_extract(date, "\\d{4}")) %>%
  group_by(year, x, y) %>%
  # na.rm = TRUE keeps a single missing day from turning the whole yearly
  # mean into NA.
  summarise(avg_temp = mean(temp, na.rm = TRUE)) %>%
  # Long -> wide again: one column per year holding that year's mean.
  pivot_wider(names_from = year, values_from = avg_temp)

输出是：

# A tibble: 19 x 17
# Groups: x [11]
       x     y `1980` `1981` `1982` `1983` `1984` `1985` `1986` `1987` `1988` `1989` `1990` `1991`
 * <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
 1  8.88  54.4   7.79   8.02   8.76   9.20   8.32   7.51   7.88   7.43   9.20   9.63   9.76   8.55
 2  8.88  54.9   7.54   7.61   8.41   8.84   8.15   7.15   7.53   7.15   8.97   9.51   9.55   8.42
 3  9.12  54.4   7.65   7.86   8.62   9.05   8.17   7.34   7.70   7.28   9.01   9.46   9.60   8.37
 4  9.12  54.6   7.44   7.59   8.38   8.81   8.02   7.11   7.50   7.13   8.88   9.36   9.47   8.31
 5  9.12  54.9   7.33   7.36   8.25   8.67   8.02   7.05   7.49   7.10   8.91   9.48   9.55   8.41
 6  9.38  54.4   7.69   7.91   8.61   9.02   8.15   7.31   7.69   7.24   8.98   9.49   9.64   8.35
 7  9.38  54.6   7.45   7.62   8.46   8.85   8.05   7.16   7.59   7.18   8.92   9.48   9.61   8.41
 8  9.38  54.9   7.24   7.29   8.21   8.62   7.95   7.04   7.56   7.15   8.94   9.57   9.66   8.53
 9  9.62  54.4   7.65   7.90   8.60   9.01   8.14   7.24   7.64   7.16   8.93   9.52   9.65   8.33
10  9.62  54.6   7.39   7.60   8.45   8.82   8.01   7.10   7.56   7.12   8.86   9.46   9.55   8.34
11  9.62  54.9   7.28   7.38   8.28   8.69   7.98   7.07   7.61   7.18   8.96   9.60   9.68   8.54
12  9.88  54.4   7.70   8.00   8.69   9.14   8.23   7.36   7.76   7.23   9.03   9.63   9.73   8.41
13  9.88  54.6   7.40   7.65   8.46   8.87   8.05   7.11   7.58   7.12   8.87   9.47   9.50   8.30
14 10.1   54.4   7.76   8.12   8.78   9.21   8.30   7.49   7.90   7.34   9.08   9.69   9.79   8.52
15 10.4   54.4   7.66   8.09   8.70   9.17   8.23   7.41   7.87   7.29   9.03   9.70   9.82   8.60
16 11.1   54.9   7.61   8.14   8.74   9.14   8.33   7.32   7.92   7.22   9.17   9.93  10.1    8.86
17 11.4   54.9   7.59   8.17   8.74   9.14   8.32   7.29   7.92   7.20   9.17   9.95  10.1    8.87
18 11.9   54.9   7.54   8.15   8.71   9.10   8.28   7.19   7.85   7.15   9.10   9.92  10.1    8.84
19 12.1   54.9   7.52   8.12   8.69   9.08   8.27   7.12   7.80   7.11   9.05   9.91  10.0    8.82
# ... with 3 more variables: `1992` <dbl>, `1993` <dbl>, `1994` <dbl>

为了说明地理坐标在 tibble 中并没有被更改（只是显示时做了四舍五入），可以在管道的末尾添加 as.data.frame() 并查看您的数据。示例：

df %>%
  # Wide -> long: one row per grid cell and daily column.
  pivot_longer(cols = -c(x, y), names_to = "date", values_to = "temp") %>%
  # Pull the four-digit year out of the column name. The earlier two-step
  # separate() implied names shaped like "X<year><sep><month><sep><day>"
  # -- TODO confirm against the actual sheet.
  mutate(year = str_extract(date, "\\d{4}")) %>%
  group_by(year, x, y) %>%
  # na.rm = TRUE keeps a single missing day from turning the whole yearly
  # mean into NA.
  summarise(avg_temp = mean(temp, na.rm = TRUE)) %>%
  # Long -> wide again: one column per year holding that year's mean.
  pivot_wider(names_from = year, values_from = avg_temp) %>%
  # A plain data.frame prints the coordinates without tibble's rounding.
  as.data.frame() %>% # add this
  head()

输出是：

#       x      y     1980     1981     1982     1983     1984     1985     1986     1987     1988
# 1 8.875 54.375 7.792978 8.021342 8.762274 9.203424 8.317131 7.505370 7.879068 7.427260 9.197431
# 2 8.875 54.875 7.536229 7.607507 8.414877 8.841260 8.154945 7.151890 7.532164 7.147945 8.969781
# 3 9.125 54.375 7.651393 7.862466 8.620904 9.052630 8.169262 7.337589 7.701205 7.282657 9.014590
# 4 9.125 54.625 7.435983 7.590548 8.381753 8.808904 8.019399 7.109096 7.499589 7.127370 8.875656
# 5 9.125 54.875 7.332978 7.363370 8.247205 8.669370 8.024645 7.045425 7.487424 7.098849 8.911776
# 6 9.375 54.375 7.693907 7.914630 8.612438 9.022055 8.150164 7.305068 7.688164 7.242274 8.984207
#       1989     1990     1991     1992     1993     1994
# 1 9.625781 9.760931 8.550356 9.678907 8.208109 9.390904
# 2 9.513863 9.552767 8.420109 9.425328 8.010082 9.134466
# 3 9.462959 9.602876 8.374575 9.465164 8.052794 9.207041
# 4 9.358986 9.473178 8.305863 9.353743 7.935507 9.050109
# 5 9.478192 9.545781 8.412329 9.403005 7.998877 9.074740
# 6 9.493205 9.635561 8.352740 9.385819 8.017260 9.184959

Answer 2

这适用于您提供的数据。

library(tidyverse)
library(lubridate)
# Yearly mean temperature per grid cell, returned with one column per year.
# Relies on coordinate columns named `x` and `y`; every other column of
# `demo_data` is treated as one daily temperature reading.
demo_data %>%
  # Wide -> long: one row per grid cell and daily column.
  pivot_longer(cols = -c(x, y), names_to = "date", values_to = "temp") %>%
  # Strip the "X" from the column name, parse what is left as a
  # year-month-day date, and keep only the calendar year.
  mutate(year = year(ymd(str_remove(date, "X")))) %>%
  group_by(x, y, year) %>%
  # Missing daily readings are skipped rather than propagated.
  summarise(temp = mean(temp, na.rm = TRUE)) %>%
  # Long -> wide again: one column per year holding that year's mean.
  pivot_wider(names_from = year, values_from = temp)

# # A tibble: 19 x 17
# # Groups:   x, y [19]
#        x     y `1980` `1981` `1982` `1983` `1984` `1985` `1986` `1987` `1988`
#    <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#  1  8.88  54.4   7.79   8.02   8.76   9.20   8.32   7.51   7.88   7.43   9.20
#  2  8.88  54.9   7.54   7.61   8.41   8.84   8.15   7.15   7.53   7.15   8.97
#  3  9.12  54.4   7.65   7.86   8.62   9.05   8.17   7.34   7.70   7.28   9.01
#  4  9.12  54.6   7.44   7.59   8.38   8.81   8.02   7.11   7.50   7.13   8.88
#  5  9.12  54.9   7.33   7.36   8.25   8.67   8.02   7.05   7.49   7.10   8.91
#  6  9.38  54.4   7.69   7.91   8.61   9.02   8.15   7.31   7.69   7.24   8.98
#  7  9.38  54.6   7.45   7.62   8.46   8.85   8.05   7.16   7.59   7.18   8.92
#  8  9.38  54.9   7.24   7.29   8.21   8.62   7.95   7.04   7.56   7.15   8.94
#  9  9.62  54.4   7.65   7.90   8.60   9.01   8.14   7.24   7.64   7.16   8.93
# 10  9.62  54.6   7.39   7.60   8.45   8.82   8.01   7.10   7.56   7.12   8.86
# 11  9.62  54.9   7.28   7.38   8.28   8.69   7.98   7.07   7.61   7.18   8.96
# 12  9.88  54.4   7.70   8.00   8.69   9.14   8.23   7.36   7.76   7.23   9.03
# 13  9.88  54.6   7.40   7.65   8.46   8.87   8.05   7.11   7.58   7.12   8.87
# 14 10.1   54.4   7.76   8.12   8.78   9.21   8.30   7.49   7.90   7.34   9.08
# 15 10.4   54.4   7.66   8.09   8.70   9.17   8.23   7.41   7.87   7.29   9.03
# 16 11.1   54.9   7.61   8.14   8.74   9.14   8.33   7.32   7.92   7.22   9.17
# 17 11.4   54.9   7.59   8.17   8.74   9.14   8.32   7.29   7.92   7.20   9.17
# 18 11.9   54.9   7.54   8.15   8.71   9.10   8.28   7.19   7.85   7.15   9.10
# 19 12.1   54.9   7.52   8.12   8.69   9.08   8.27   7.12   7.80   7.11   9.05
# # ... with 6 more variables: `1989` <dbl>, `1990` <dbl>, `1991` <dbl>,
# #   `1992` <dbl>, `1993` <dbl>, `1994` <dbl>

如何用dplyr总结所有给定的每日观测时间序列的年平均温度？

2 个答案: