Question

我有4个数据帧，每个数据帧对应一年。每个数据框包含五个地点的每日降雨量。

生成样本数据

    location <- c("A","B","C","D","E")
    mat <- round(as.data.frame(matrix(runif(1825),nrow=5,ncol=365)), digits=2)
    dat.1981 <-as.data.frame(cbind(location,mat)) # rainfall for 1981
    dat.1981$year <- 1981

    mat <- round(as.data.frame(matrix(runif(1825),nrow=5,ncol=365)), digits = 2)
    dat.1982 <- as.data.frame(cbind(location,mat)) # rainfall for 1982
    dat.1982$year <- 1982

    mat <- round(as.data.frame(matrix(runif(1825),nrow=5,ncol=365)), digits = 2)
    dat.1983 <-as.data.frame(cbind(location,mat)) # rainfall for 1983
    dat.1983$year <- 1983

    mat <- round(as.data.frame(matrix(runif(1825),nrow=5,ncol=365)), digits = 2)
    dat.1984 <-as.data.frame(cbind(location,mat)) # rainfall for 1984
    dat.1984$year <- 1984

    dat <- as.data.frame(rbind(dat.1981,dat.1982,dat.1983,dat.1984))

对于每一年，我想分类一天是否是一个极端潮湿的日子

以下是我的计算方法：

1）对于每个地点，生成1981至1984年期间每周的降雨量的平均值和降雨量。例如，在位置A，第一周的平均降雨量为：

（1981年第一周雨A + 1982年第一周雨A + 1983年第一周雨A + 1984年第一周雨A）/ 4

可以用R写成

    mean.week1.loc1 <- mean(rowSums(dat[dat$location=="A",2:8])) # 2:8 selects the first 7 days in each year
    sd.week1.loc1 <- sd(rowSums(dat[dat$location=="A",2:8])) 

    wet.cr <- mean.week1 + sd.week1 # this is my threshold for defining a wet day

如果位置A中1981年至1984年的第1周的每日降雨量大于wet.cr，那天是潮湿的一天，因此价值为1

作为一个例子，为了检查1981年到1984年A区的第1周降雨量是否是湿天，我可以做以下事情：

   lapply(dat[, 2:8], function(x) ifelse(x > wet.cr, 1, 0))

我想为每周和每个地点重复这一点。

但是，我无法将这些个别功能拼接在一起我的最终结果应该是与dat相同的数据框，但不是降雨量值，我将有1或0来定义它是否是湿天。

修改

以下解决方案为我提供了以下内容：

mean(c(rainfall 1981 day 1 week 1, ...., rainfall 1981 day 7 week 1, rainfall 1982 day 1 week 1,....,rainfall 1982 day 7 week 1,....,rainfall 1984 day 1 week 1,....,rainfall 1984 day 7 week 1))

我想要的是：

mean(c(mean(total rainfall week 1 1981), mean(total rainfall week 1 1982), mean(total rainfall week 1 1983), mean(total rainfall week 1 1984)))

我希望现在很清楚。

Answer 1

tidyverse解决方案

    library(magrittr)
    library(tidyverse)

    dat_m <- gather(dat, day, rainfall, -location, -year)
    str(dat_m)

    dat_m %<>%
      mutate(day = gsub("V", "", day)) %>%
      mutate(day = as.numeric(day)) %>% 
      mutate(week = as.integer(ceiling(day/7))) %>% 
      group_by(location, week) %>% 
      mutate(wet.cr = mean(rainfall, na.rm = TRUE) + sd(rainfall, na.rm = TRUE) ) %>% 
      mutate(indication = ifelse(rainfall > wet.cr, 1, 0)) %>% 
      ungroup()
    dat_m 

    # A tibble: 7,300 x 7
       location  year   day rainfall  week wet.cr indication
       <fctr>   <dbl> <dbl>    <dbl> <int>  <dbl>      <dbl>
     1 A         1981  1.00    0.880     1  0.845       1.00
     2 B         1981  1.00    0.850     1  0.829       1.00
     3 C         1981  1.00    1.00      1  0.877       1.00
     4 D         1981  1.00    0.100     1  0.755       0   
     5 E         1981  1.00    0.190     1  0.750       0   
     6 A         1982  1.00    0.380     1  0.845       0   
     7 B         1982  1.00    0.760     1  0.829       0   
     8 C         1982  1.00    0.940     1  0.877       1.00
     9 D         1982  1.00    0.900     1  0.755       1.00
    10 E         1982  1.00    0.600     1  0.750       0   
    # ... with 7,290 more rows

修改：对于降雨量，我认为使用sum（总计）比mean

更好

因此，我们首先计算每年的每周总降雨量。然后我们计算长期平均值＆amp;每周总降雨量的stdev。

    dat_m %<>%
      mutate(day = as.numeric(gsub("V", "", day)),
             week = as.integer(ceiling(day/7))) %>%
      group_by(location, week, year) %>% 
      mutate(total_weekly_rainfall = sum(rainfall, na.rm = TRUE)) %>% 
      ungroup() %>% 
      group_by(location, week) %>% 
      mutate(mean_weekly_rainfall = sum(rainfall, na.rm = TRUE)/length(unique(year)),
             stddev_weekly_rainfall = sd(rainfall, na.rm = TRUE),
             wet.cr =  mean_weekly_rainfall + stddev_weekly_rainfall,
             indication = ifelse(total_weekly_rainfall > wet.cr, 1, 0)) %>% 
      arrange(location, year, day) %>% 
      ungroup() %>% 
      distinct(location, year, week, .keep_all = TRUE)
    dat_m 

    # A tibble: 1,060 x 10
       location  year   day rainfall  week total_wee~ mean_wee~ stddev_w~ wet.~ indic~
       <fctr>   <dbl> <dbl>    <dbl> <int>      <dbl>     <dbl>     <dbl> <dbl>  <dbl>
     1 A         1981  1.00   0.880      1     0.880      0.630     0.277 0.907      0
     2 A         1981  8.00   0.190      2     0.190      0.330     0.431 0.761      0
     3 A         1981 15.0    0.630      3     0.630      0.548     0.331 0.878      0
     4 A         1981 22.0    0.0300     4     0.0300     0.290     0.259 0.549      0
     5 A         1981 29.0    0.360      5     0.360      0.308     0.196 0.504      0
     6 A         1981 36.0    0.540      6     0.540      0.500     0.225 0.725      0
     7 A         1981 43.0    0.0300     7     0.0300     0.375     0.289 0.664      0
     8 A         1981 50.0    0.170      8     0.170      0.332     0.375 0.708      0
     9 A         1981 57.0    0.260      9     0.260      0.652     0.272 0.924      0
    10 A         1981 64.0    0.590     10     0.590      0.512     0.202 0.715      0
    # ... with 1,050 more rows

Answer 2

使用data.table：

library(data.table)
dat <- setDT(dat)
newdat <- melt(dat, measure.vars = patterns("^V"),variable.name = "day",value.name = "rain")
newdat[,day := as.character(day)]
newdat[,day := as.numeric(unlist(lapply(newdat$day,function(x){strsplit(x,"V")[[1]][2]})))]
newdat[,Week := day %/% 7]
newdat[,threshold := mean(rain) + sd(rain),  by = .(location,Week)]
newdat[,wet := ifelse(rain > threshold,1,0)]
print(newdat,topn = 100)


      location year day rain Week threshold wet
   1:        A 1981   1 0.73    0 0.7630065   0
   2:        B 1981   1 0.69    0 0.8599243   0
   3:        C 1981   1 0.45    0 0.8145956   0
   4:        D 1981   1 0.51    0 0.8935058   0
   5:        E 1981   1 0.77    0 0.6992752   1
   6:        A 1982   1 0.47    0 0.7630065   0
   7:        B 1982   1 0.70    0 0.8599243   0
   8:        C 1982   1 0.48    0 0.8145956   0
   9:        D 1982   1 0.92    0 0.8935058   1

逐步说明：首先，您需要更改数据格式以简化计算。长格式更合适，因为每列V ##实际上是一个变量，即数字日。这是使用熔化

完成的

melt(dat, measure.vars = patterns("^V"),variable.name = "day",value.name = "rain")

     location year  day rain
   1:        A 1981   V1 0.73
   2:        B 1981   V1 0.69
   3:        C 1981   V1 0.45
   4:        D 1981   V1 0.51
   5:        E 1981   V1 0.77
  ---                        
7296:        A 1984 V365 0.31
7297:        B 1984 V365 0.99
7298:        C 1984 V365 0.25
7299:        D 1984 V365 0.24
7300:        E 1984 V365 0.87

然后您将一天转换为实数，以便能够计算一周

newdat[,day := as.character(day)]
newdat[,day := as.numeric(unlist(lapply(newdat$day,function(x){strsplit(x,"V")[[1]][2]})))]
> newdat[,.(day,year)]
      day year
   1:   1 1981
   2:   1 1981
   3:   1 1981
   4:   1 1981
   5:   1 1981

然后计算与你一样的周数

newdat[,Week := day %/% 7]

sthreshold演算的统计数据是通过对周数和地点进行分组来完成的（因此每个地方的统计数据都是如此）

newdat[,threshold := mean(rain) + sd(rain), by = .(location,Week)]

并将雨天定义为降雨高于阈值的日子

newdat[,wet := ifelse(rain > threshold,1,0)]

但我同意评论说初始数据肯定比你提出的格式更好。

Answer 3

对于data.table和tidyverse解决方案，您可以很好地将其视为缩放练习（许多学科中的z分数），因为平均值+ n标准差是众所周知的基准。

对于data.table解决方案，您可以：

newdat[,zrain := scale(rain),  by = .(location,Week)]
newdat[,zwet := ifelse(zrain > 1.0,1,0)]

你依赖于基础的scale并与1.0比较

对于变得：tidyverse：

mutate(zrain = scale(rainfall)) %>% 
mutate(indication = ifelse(zrain > 1.0, 1, 0)) %>%

将来，如果您对“湿”的标准发生变化，您只需在代码中更改一个数字

R：计算一年中每周的平均值和标准差

3 个答案: