Question

我在R中有一个大型数据框，其中包含许多位置的每日降雨时间序列数据（每个位置均在其自己的列中）。我想知道每个月以及每年每个位置的降雨量小于或大于阈值的次数。

我的数据框很大，因此我在此处提供了示例数据：

# Daily date index covering 2000-01-01 through 2005-01-31
Date_ex <- seq(as.Date("2000-01-01"), as.Date("2005-01-31"), by = "day")

# One simulated rainfall value per day; the sample size is derived from the
# date index so the columns always line up with Date_ex.
n_days <- length(Date_ex)
A <- sample(x = c(1, 3, 5), size = n_days, replace = TRUE)
B <- sample(x = c(1, 2, 10), size = n_days, replace = TRUE)
C <- sample(x = c(1, 3, 5), size = n_days, replace = TRUE)
D <- sample(x = c(1, 3, 4), size = n_days, replace = TRUE)

# One row per day, one column per location
df <- data.frame(Date_ex, A, B, C, D)

我如何找出A、B、C和D中的值在每个月以及每年分别大于4的次数？

我认为我应该能够将其总结为两个新表。

一个是像这样的按月汇总表（仅为示例，请忽略具体数字）：

           A     B     C     D
 2000-01   1     0     5     0
 2000-02   2     16    25    0
 2000-03   1     5     26    0

另一个是像这样的按年汇总表（仅为示例，请忽略具体数字）：

       A   B     C    D
2000   44  221   67  0
2001   67  231   4   132
2002   99  111   66  4
2003   33  45    45  4

我认为我应该为此使用dplyr？但是我不确定如何处理日期才能让它正常工作。

Answer 1

使用dplyr和lubridate软件包的解决方案。关键是创建Year和Month列，按这些列分组，然后使用summarise_all汇总数据。

# Create the example data frame; the seed is fixed so the results are
# reproducible.
set.seed(199)

# Daily date index covering 2000-01-01 through 2005-01-31
Date_ex <- seq(as.Date("2000-01-01"), as.Date("2005-01-31"), by = "day")

# The sample size is derived from the date index instead of being hard-coded,
# so the columns always line up with Date_ex. The draw order (A, B, C, D) is
# kept, so the same seed yields the same data.
n_days <- length(Date_ex)
A <- sample(x = c(1, 3, 5), size = n_days, replace = TRUE)
B <- sample(x = c(1, 2, 10), size = n_days, replace = TRUE)
C <- sample(x = c(1, 3, 5), size = n_days, replace = TRUE)
D <- sample(x = c(1, 3, 4), size = n_days, replace = TRUE)

df <- data.frame(Date_ex, A, B, C, D)

library(dplyr)
library(lubridate)

# Summarise for each month: count the daily values above 4 in every column.
# Date_ex is dropped once Year and Month are derived, so only the location
# columns remain to be summarised.
df2 <- df %>%
  mutate(Year = year(Date_ex), Month = month(Date_ex)) %>%
  select(-Date_ex) %>%
  group_by(Year, Month) %>%
  # across() replaces summarise_all(funs(...)); funs() is deprecated in dplyr.
  # Grouping columns are skipped by across(), and .groups = "drop" returns an
  # ungrouped tibble.
  summarise(across(everything(), ~ sum(.x > 4)), .groups = "drop")
df2
# # A tibble: 61 x 6
#     Year Month     A     B     C     D
#    <dbl> <dbl> <int> <int> <int> <int>
#  1  2000     1    13     8    13     0
#  2  2000     2    12     7     8     0
#  3  2000     3     7     9     9     0
#  4  2000     4     9    12    10     0
#  5  2000     5    11    12     8     0
#  6  2000     6    12     9    16     0
#  7  2000     7    10    11    10     0
#  8  2000     8     8    12    14     0
#  9  2000     9    12    12    12     0
# 10  2000    10     9     9     7     0
# # ... with 51 more rows

# Summarise for each year: count the daily values above 4 in every column
df3 <- df %>%
  mutate(Year = year(Date_ex)) %>%
  select(-Date_ex) %>%
  group_by(Year) %>%
  # across() replaces summarise_all(funs(...)); funs() is deprecated in dplyr.
  # Summarising the single grouping level leaves the result ungrouped.
  summarise(across(everything(), ~ sum(.x > 4)))
df3
# # A tibble: 6 x 5
#    Year     A     B     C     D
#   <dbl> <int> <int> <int> <int>
# 1  2000   120   119   125     0
# 2  2001   119   123   113     0
# 3  2002   135   122   105     0
# 4  2003   114   112   104     0
# 5  2004   115   125   124     0
# 6  2005     9    14    11     0

Answer 2

以下是一些解决方案。

1）aggregate。此解决方案仅使用基础R（base R）。结果中新的Date列是每个月或每年第一天的日期。

# Flag every value above 4 (all columns except the first, which holds the
# dates), then total the flags per period. cut() assigns each date to its
# month/year, and as.Date() converts those labels back into Date values.

# Counts per month
aggregate(
  x = df[-1] > 4,
  by = list(Date = as.Date(cut(df[[1]], breaks = "month"))),
  FUN = sum
)

# Counts per year
aggregate(
  x = df[-1] > 4,
  by = list(Date = as.Date(cut(df[[1]], breaks = "year"))),
  FUN = sum
)

1a）使用zoo包中的yearmon类和（3）中定义的toyear函数，我们可以这样写：

library(zoo)

# Counts per month, grouped by the yearmon value of each date
aggregate(
  x = df[-1] > 4,
  by = list(Date = as.yearmon(df[[1]])),
  FUN = sum
)

# Counts per year; toyear() is the helper defined in solution (3) and must
# exist before this call is run
aggregate(
  x = df[-1] > 4,
  by = list(Date = toyear(df[[1]])),
  FUN = sum
)

2）rowsum。这是另一个基础R解决方案。年-月或年份由结果的行名给出。

# Adding 0 converts the logical "above 4" matrix to numeric 0/1 values, which
# rowsum() then totals within each group label.

# Counts per month ("YYYY-MM" labels)
rowsum(x = (df[-1] > 4) + 0, group = format(df[[1]], "%Y-%m"))

# Counts per year ("YYYY" labels)
rowsum(x = (df[-1] > 4) + 0, group = format(df[[1]], "%Y"))

2a）使用zoo包中的yearmon类和（3）中定义的toyear函数，我们可以这样写：

library(zoo)

# Counts per month, grouped by the yearmon value of each date
rowsum(x = (df[-1] > 4) + 0, group = as.yearmon(df[[1]]))

# Counts per year; toyear() is the helper defined in solution (3) and must
# exist before this call is run
rowsum(x = (df[-1] > 4) + 0, group = toyear(df[[1]]))

3）aggregate.zoo。先转换为zoo对象，再使用aggregate.zoo。请注意，yearmon类在内部把年月表示为“年份加上一个小数”：小数部分为0表示1月，1/12表示2月，2/12表示3月，依此类推，因此取其整数部分即可得到年份。

library(zoo)

# Convert the data frame into a zoo series
z <- read.zoo(df)

# Counts per month: regroup the "above 4" flags by yearmon and add them up
aggregate(z > 4, by = as.yearmon, FUN = sum)

# Year of a date, taken as the integer part of its yearmon value
toyear <- function(x) {
  as.integer(as.yearmon(x))
}

# Counts per year
aggregate(z > 4, by = toyear, FUN = sum)

结果是一个zoo时间序列：第一种情况下带有yearmon索引，第二种情况下带有整数索引。如果需要数据框，请使用fortify.zoo(ag)，其中ag是aggregate的结果。

4）dplyr。这里用到的toyear来自（3）。

library(dplyr)
library(zoo)

# Counts per month of values above 4. The threshold has to be applied inside
# the summary; a plain sum() would total the rainfall instead of counting
# exceedances. Date_ex only feeds the grouping key, so it is excluded from
# the summarised columns (sum() is not defined for Date objects).
df %>%
  group_by(YearMonth = as.yearmon(Date_ex)) %>%
  summarise(across(-Date_ex, ~ sum(.x > 4))) %>%
  ungroup()

# Counts per year; toyear() is the helper defined in solution (3) and must
# exist before this pipeline is run
df %>%
  group_by(Year = toyear(Date_ex)) %>%
  summarise(across(-Date_ex, ~ sum(.x > 4))) %>%
  ungroup()

Answer 3

前面的答案还缺少data.table的解法，所以我来补充一个。说明写在代码注释中。我用set.seed(1)生成了样本数据。

library(data.table)

# Turn df into a data.table
setDT(df)

# Append the calendar month and year of each date as new columns
df[, c("month", "year") := list(month(Date_ex), year(Date_ex))]

# Monthly counts: for each (year, month) group, count the values above 4 in
# every remaining column. Date_ex is excluded through .SDcols.
monthly_dt <- df[,
  lapply(.SD, function(x) sum(x > 4)),
  by = .(year, month),
  .SDcols = -c("Date_ex")
]
monthly_dt
#    year month  A  B  C D
# 1: 2000     1 10 10 11 0
# 2: 2000     2 10 11  8 0
# 3: 2000     3 11 11 11 0
# 4: 2000     4 10 11  8 0
# 5: 2000     5  7 10  8 0
# 6: 2000     6  9  6  7 0
# .....

# Yearly counts: for each year, count the values above 4 in every remaining
# column. Date_ex and the helper month column are excluded through .SDcols.
yearly_dt <- df[,
  lapply(.SD, function(x) sum(x > 4)),
  by = .(year),
  .SDcols = -c("Date_ex", "month")
]
yearly_dt
#    year   A   B   C D
# 1: 2000 114 118 113 0
# 2: 2001 127 129 120 0
# 3: 2002 122 108 126 0
# 4: 2003 123 128 125 0
# 5: 2004 123 132 131 0
# 6: 2005  14  15  15 0

计算一个月和一年中时间序列数据超过阈值的次数

3 个答案: