我的数据框包含大约200列,代表1999年至2015年的每月干旱测量值。每列中的值可以是正数也可以是负数。数据框中的每一行代表一个焦点年,我有兴趣计算参考的指标。焦点年份在另一栏中有所体现。如果它们代表来自不同站点的测量值(FIPS列),则可以有多个具有相同焦点年份的行。这是数据框的玩具版本(更新版本!):
structure(list(FIPS = c(19045, 48157, 20045, 20027), Year = c(2003,
2004, 2005, 2005), pdsi_2002.01.15 = c(1.46, 4.38, 0.38, -1.41
), pdsi_2002.02.15 = c(1.6, 3.63, -0.05, -1.66), pdsi_2002.03.15 = c(1.32,
3, -0.62, -1.93), pdsi_2002.04.15 = c(1.81, 2.68, 0.66, -1.88
), pdsi_2002.05.15 = c(2.03, 1.86, 1.26, -1.7), pdsi_2002.06.15 = c(2.51,
1.74, -0.5, -2.94), pdsi_2002.07.15 = c(2.79, 1.94, -1.47, -3.82
), pdsi_2002.08.15 = c(3.06, 2.64, -1.99, -4.09), pdsi_2002.09.15 = c(2.08,
3.02, -2.82, -4.87), pdsi_2002.10.15 = c(2.68, 4.73, -2.02, -3.01
), pdsi_2002.11.15 = c(2, 5.28, -2.55, -3.22), pdsi_2002.12.15 = c(1.55,
5.94, -3.23, -3.52), pdsi_2003.01.15 = c(0.96, 5.39, -3.58, -3.51
), pdsi_2003.02.15 = c(0.29, 5.24, -3.54, -3.29), pdsi_2003.03.15 = c(-0.15,
4.41, -3.77, -3.15), pdsi_2003.04.15 = c(-1.13, 3.39, -3.33,
-2.46), pdsi_2003.05.15 = c(-1.05, 1.91, -3.47, -2.63), pdsi_2003.06.15 = c(-1.5,
1.45, -2.94, -2.34), pdsi_2003.07.15 = c(-0.85, 1.69, -3.42,
-3.02), pdsi_2003.08.15 = c(-1.78, 1.48, -2.75, -3.13), pdsi_2003.09.15 = c(-1.55,
2.31, -2.66, -2.85), pdsi_2003.10.15 = c(-1.87, 2.5, -2.99, -3.16
), pdsi_2003.11.15 = c(-1.19, 2.72, -3.39, -2.73), pdsi_2003.12.15 = c(0.09,
2.67, -2.96, -2.63), pdsi_2004.01.15 = c(-0.2, 3.2, -2.83, -2.42
), pdsi_2004.02.15 = c(0.07, 3.73, -2.78, -2.21), pdsi_2004.03.15 = c(1.58,
3.04, -1.66, -0.77), pdsi_2004.04.15 = c(0.37, 3.19, -2, -1.25
), pdsi_2004.05.15 = c(1.7, 3.71, -1.35, -1.41), pdsi_2004.06.15 = c(1.53,
5.21, -0.84, -1.04), pdsi_2004.07.15 = c(1.14, 4.84, 2.08, 0.93
), pdsi_2004.08.15 = c(1.4, 4.41, 3.22, 0.24), pdsi_2004.09.15 = c(-0.43,
3.27, 2.39, -0.44), pdsi_2004.10.15 = c(0.77, 2.77, 2.49, -1.11
), pdsi_2004.11.15 = c(0.94, 4.95, 2.94, -1.03), pdsi_2004.12.15 = c(0.62,
4.41, 2.67, -1.43), pdsi_2005.01.15 = c(1.51, 3.93, 3.55, -1.05
), pdsi_2005.02.15 = c(1.45, 4.54, 3.83, 0.71), pdsi_2005.03.15 = c(0.58,
4.31, 3.01, 0.24), pdsi_2005.04.15 = c(-0.97, 3.36, 1.97, 0.94
), pdsi_2005.05.15 = c(-1.57, 3.12, 1.54, -0.33), pdsi_2005.06.15 = c(-2.65,
2.02, 2.33, 1.16), pdsi_2005.07.15 = c(-3.58, 2.07, 2.31, 1.08
), pdsi_2005.08.15 = c(-3.51, 1.56, 3.7, 1.72), pdsi_2005.09.15 = c(-3.96,
-0.71, 3.62, 0.74), pdsi_2005.10.15 = c(-4.77, -2.13, 3.79, 0.96
), pdsi_2005.11.15 = c(-5.08, -2.32, 3.4, 0.53), pdsi_2005.12.15 = c(-5.63,
-2.57, 3.27, -0.22)), .Names = c("FIPS", "Year", "pdsi_2002.01.15",
"pdsi_2002.02.15", "pdsi_2002.03.15", "pdsi_2002.04.15", "pdsi_2002.05.15",
"pdsi_2002.06.15", "pdsi_2002.07.15", "pdsi_2002.08.15", "pdsi_2002.09.15",
"pdsi_2002.10.15", "pdsi_2002.11.15", "pdsi_2002.12.15", "pdsi_2003.01.15",
"pdsi_2003.02.15", "pdsi_2003.03.15", "pdsi_2003.04.15", "pdsi_2003.05.15",
"pdsi_2003.06.15", "pdsi_2003.07.15", "pdsi_2003.08.15", "pdsi_2003.09.15",
"pdsi_2003.10.15", "pdsi_2003.11.15", "pdsi_2003.12.15", "pdsi_2004.01.15",
"pdsi_2004.02.15", "pdsi_2004.03.15", "pdsi_2004.04.15", "pdsi_2004.05.15",
"pdsi_2004.06.15", "pdsi_2004.07.15", "pdsi_2004.08.15", "pdsi_2004.09.15",
"pdsi_2004.10.15", "pdsi_2004.11.15", "pdsi_2004.12.15", "pdsi_2005.01.15",
"pdsi_2005.02.15", "pdsi_2005.03.15", "pdsi_2005.04.15", "pdsi_2005.05.15",
"pdsi_2005.06.15", "pdsi_2005.07.15", "pdsi_2005.08.15", "pdsi_2005.09.15",
"pdsi_2005.10.15", "pdsi_2005.11.15", "pdsi_2005.12.15"), row.names = c(13222L,
18125L, 19543L, 19534L), class = "data.frame")
我想要做的是计算焦点年份中每次负值运行的长度和总和(因此寻找同一行中的运行,跨列),然后计算平均运行长度,平均运行总和,每个运行总和的平均值除以每行的每个运行长度。增加另一层难度,如果焦点年份的1月份测量值为负值,那么我想回顾一下焦点年份之前的年份,以解释前一年开始出现负数的情况。可以想象,这次运行一直延续到1999年1月。
我已经能够使用rle()计算游程长度指标,但无法弄清楚如何获得游程总和。
答案 0 :(得分:1)
我认为这可能适用于您要查找的内容,这将生成指定年份的3个必需值,如果1月份有负值,则会继续向下运行,直到达到正值为止去年。
library(tidyr)
library(dplyr)
select.order <- colnames(drought_data)[3:length(colnames(drought_data))]
drought_data <- drought_data %>%
# Gather data by date
gather(key = date, value = value, -Year, -FIPS) %>%
# Separate date into separate columns
separate(date, into = c("yr","month", "day"), sep = "\\.") %>%
# Extract year
mutate(yr = substr(yr, 6, 9)) %>%
# Sort data by FIPS number, year, month
arrange(FIPS, yr, month) %>%
# Group data by FIPS number, focal year, and data year
group_by(FIPS, Year, yr) %>%
# Generate a run number for each run of negative numbers for the focal year
mutate(run.num = ifelse(Year == yr,
{run.num = rle(ifelse(value < 0, 1, 0))
rep(ifelse(run.num$values == 1, cumsum(run.num$values), 0), run.num$lengths)}, NA),
# Set run.num to -1 for positive values
run.num = ifelse(value >= 0, -1, run.num)) %>%
# Sort data by FIPS number, descending year, and descending month
arrange(FIPS, desc(yr), desc(month)) %>%
# Group data by FIPS number and focal year
group_by(FIPS, Year) %>%
# Fill out the run numbers for each run to cross data years
fill(run.num, .direction = "down") %>%
# Convert all -1 run numbers (Which indicate positive values) to zero
mutate(run.num = ifelse(run.num == -1, 0, run.num),
# Set run.num for negative values that did not qualify as a run for the specified year to 0
run.num = ifelse(is.na(run.num), 0, run.num)) %>%
ungroup %>%
# mutate(run.num = ifelse(is.na(run.num, 0, run.num))) %>%
# Group data by FIPS number, focal year, and run number
group_by(FIPS, Year, run.num) %>%
# Calculate the length, sum, and rate of each run
mutate(run.length = ifelse(run.num == 0, 0, n()),
run.sum = ifelse(run.num == 0, 0, sum(value)),
run.rate = ifelse(run.num == 0, 0, run.sum/run.length)) %>%
# Group by FIPS number and focal year
group_by(FIPS, Year) %>%
# Calculate the mean run length, and mean run sum for the focal year of each FIPS number
mutate(mean.run.length = sum(ifelse(run.num == 0, 0, 1)) / max(run.num),
mean.run.length = ifelse(is.nan(mean.run.length), 0, mean.run.length),
mean.run.sum = sum(ifelse(run.num == 0, 0, value) / max(run.num)),
mean.run.sum = ifelse(is.nan(mean.run.sum), 0, mean.run.sum)) %>%
# Combine date parts back to single column
unite(dt, yr:day, sep = ".") %>%
# Recreate the pdsi_ label format on the date column
mutate(dt = paste0("pdsi_", dt)) %>%
# Drop the run.sum column
select(-run.sum) %>%
# Spread the data back to a wide view to eliminate duplicate run.rate values
spread(dt, value) %>%
# Group data by FIPS number and focal year
group_by(FIPS, Year) %>%
# Calculate the mean of the sum of run rates over the number of runs
mutate(mean.run.sum.length = sum(run.rate) / max(run.num),
mean.run.sum.length = ifelse(is.nan(mean.run.sum.length), 0, mean.run.sum.length)) %>%
# Remove grouping
ungroup %>%
# Drop the run.num, run.length, and run.rate columns
select(-run.num, -run.length, -run.rate) %>%
# Gather the data into tall view to remove duplicates and NA values
gather_("dt", "value", select.order, na.rm = TRUE) %>%
# Spread data back to wide view
spread(dt, value)
# Change the column order
drought_data <- drought_data[,c("FIPS","Year","mean.run.length","mean.run.sum","mean.run.sum.length", select.order)]
最终输出将是包含三个附加计算列的原始数据帧。下面是提供的测试数据集的计算列的输出。
> drought_data[,c("FIPS","Year","mean.run.length","mean.run.sum","mean.run.sum.length")]
# A tibble: 4 x 5
FIPS Year mean.run.length mean.run.sum mean.run.sum.length
<dbl> <dbl> <dbl> <dbl> <dbl>
1 19045 2003 9.000000 -11.07 -1.2300000
2 20027 2005 2.333333 -1.87 -0.5206667
3 20045 2005 0.000000 0.00 0.0000000
4 48157 2004 0.000000 0.00 0.0000000
答案 1 :(得分:0)
这是问题的tidyverse
方法,至少是第一部分。但我相信它也会解决你问题的第二部分。
在我看来,以不同的,整洁的格式提供数据是有帮助的,其中每一行是在单独的焦点年度中的每月观察。 (顺便说一下,我不确定为什么focal_ years
列都是2001
。根据您的数据描述,它不应该是单独的年份吗?)
library(tidyverse)
drought_data_tidy <- drought_data %>%
gather(key, value, -FIPS, -Year) %>%
arrange(FIPS)
这为我们提供了以下数据结构(请注意,我添加了一个行ID,因为焦点年份始终是相同的。如果这是您的错误,代码会稍微简化一下):
> head(drought_data_tidy)
FIPS Year key value
1 8019 2005 pdsi_2002.01.15 -1.73
2 8019 2005 pdsi_2002.02.15 -2.04
3 8019 2005 pdsi_2002.03.15 -2.44
4 8019 2005 pdsi_2002.04.15 -3.55
5 8019 2005 pdsi_2002.05.15 -3.84
6 8019 2005 pdsi_2002.06.15 -4.42
唯一的“难点”是为每次负面运行创建一个唯一的run_id
,我们就是这样:
drought_data_tidy <- drought_data_tidy %>%
group_by(FIPS) %>%
mutate(run_id = cumsum(c(TRUE, diff(value < 0) != 0)),
run_id = ifelse(value < 0, run_id, NA))
现在剩下的就是适当的group_by
和mean
命令。请注意,我通过[!is.na(run_id)]
选择了我们进行计算的值的相关部分;我们还可以为此创建另一个分组变量,这可能更优雅。
鉴于您更新的数据和unique()
的问题,我决定通过summarize()
命令进行计算,并将结果存储在我最后加入的单独数据框中。它也可以在一个巨大的dplyr链中完成,但我认为这种方法更易于阅读,更适合修复错误。
# run length
drought_data_run_length <- drought_data_tidy %>%
group_by(FIPS, run_id) %>%
summarize(run_length = n()) %>%
mutate(mean_run_length = mean(run_length[!is.na(run_id)]))
# mean run length for join
drought_data_mean_run_length <- drought_data_run_length %>%
group_by(FIPS) %>%
summarise(mean_run_length = unique(mean_run_length))
# run sum
drought_data_tidy <- drought_data_tidy %>%
group_by(FIPS, run_id) %>%
mutate(run_sum = sum(value))
# mean run sum
drought_data_mean_run_sum <- drought_data_tidy %>%
group_by(FIPS) %>%
summarise(mean_run_sum = mean(run_sum[!is.na(run_id)]))
# mean run sum by mean run length
drought_data_mrs_by_mrl <- left_join(drought_data_mean_run_sum,
drought_data_mean_run_length,
by = "FIPS") %>%
mutate(mrs_by_mrl = mean(mean_run_sum / mean_run_length))
# join run length, mean run length, mean run sum, mrs_by_mrl
drought_data_tidy <- left_join(drought_data_tidy,
drought_data_run_length %>% select(-mean_run_length),
by = c("FIPS", "run_id"))
drought_data_tidy <- left_join(drought_data_tidy,
drought_data_mean_run_length %>% select(FIPS, mean_run_length),
by = "FIPS")
drought_data_tidy <- left_join(drought_data_tidy,
drought_data_mean_run_sum %>% select(FIPS, mean_run_sum),
by = "FIPS")
drought_data_tidy <- left_join(drought_data_tidy,
drought_data_mrs_by_mrl %>% select(FIPS, mrs_by_mrl),
by = "FIPS")
这为您提供以下输出:
> head(drought_data_tidy)
Source: local data frame [6 x 10]
Groups: FIPS, run_id [1]
FIPS Year key value run_id run_sum run_length mean_run_length mean_run_sum mrs_by_mrl
<dbl> <dbl> <chr> <dbl> <int> <dbl> <int> <dbl> <dbl> <dbl>
1 8019 2005 pdsi_2002.01.15 -1.73 1 -49.33 14 9 -33.54481 -3.542602
2 8019 2005 pdsi_2002.02.15 -2.04 1 -49.33 14 9 -33.54481 -3.542602
3 8019 2005 pdsi_2002.03.15 -2.44 1 -49.33 14 9 -33.54481 -3.542602
4 8019 2005 pdsi_2002.04.15 -3.55 1 -49.33 14 9 -33.54481 -3.542602
5 8019 2005 pdsi_2002.05.15 -3.84 1 -49.33 14 9 -33.54481 -3.542602
6 8019 2005 pdsi_2002.06.15 -4.42 1 -49.33 14 9 -33.54481 -3.542602
现在,您可以轻松地将其传播回原始数据格式。但是,我认为以长格式进行的月度观察的tidy
数据框对于通过巧妙地使用first()
来获得问题的第二部分更有用。但要解决这个问题,我们需要更多的数据(或玩具数据)扩展到不同的focal_years
。
希望有所帮助。