我有一个数据框,其中包含1964年至2013年期间76个车站的每日降雨量值。对于特定站,每行是不同的月份。以下是dataframe-
的片段 Station Year Month Days 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
USC00020750 1964 1 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 25 0 23 51 36 0 0 0 0 0 0 0 0
USC00020750 1964 2 29 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 48 0 0 0 3 0 0 0 0 0 0 Inf Inf
USC00020750 1964 3 31 0 46 51 0 0 36 41 46 0 0 0 0 43 0 0 0 0 0 0 0 0 53 99 140 36 0 0 0 0 0 0
USC00020750 1964 4 30 5 69 23 30 0 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 33 13 0 0 0 15 0 Inf
USC00020750 1964 5 31 0 0 0 0 0 0 43 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 51 8 0 0 0 0
USC00020750 1964 6 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 38 0 0 0 Inf
USC00020750 1964 7 31 0 0 0 0 0 0 0 0 0 0 0 0 41 0 13 13 0 0 0 0 8 51 0 71 0 10 0 0 20 165 25
USC00020750 1964 8 31 8 30 137 0 0 5 89 0 0 0 18 64 5 0 0 0 0 0 0 0 0 0 0 0 0 76 0 0 0 0 0
USC00020750 1964 9 30 0 0 0 0 0 119 0 0 0 0 0 0 0 41 25 0 0 0 0 0 25 0 0 0 0 0 0 0 0 0 Inf
USC00020750 1964 10 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
USC00020750 1964 11 30 0 5 0 0 0 0 0 0 0 0 91 0 0 0 36 94 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Inf
USC00020750 1964 12 31 0 107 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 79 152 0 0 0 0 0 0 0 0 0 0 0 0
...
Station Year Month Days 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
USW00093129 2013 10 31 0 0 0 0 0 0 0 0 43 15 0 0 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 41 3 8 0
USW00093129 2013 11 30 0 0 0 23 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 79 18 20 0 0 0 0 0 0 0 Inf
USW00093129 2013 12 31 0 0 175 33 0 0 3 0 0 0 0 0 0 0 0 0 0 0 5 15 0 0 0 0 0 0 0 0 0 0 0
我试图找出每一行中最长的非零降雨值的长度以及该段的总降雨量。找到最长拉伸长度的最简单方法是将数据帧转换为0和1,使用rle并沿每行应用max(y$lengths[y$values!=0])
。但是如何找到值的总和?
感谢您提前帮忙!
答案 0 :(得分:3)
不完全是单行,但这有效:
df <- read.table(header=TRUE,stringsAsFactors=FALSE,check.names=FALSE,text=
"Station Year Month Days 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
USC00020750 1964 1 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 25 0 23 51 36 0 0 0 0 0 0 0 0
USC00020750 1964 2 29 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 48 0 0 0 3 0 0 0 0 0 0 Inf Inf
USC00020750 1964 3 31 0 46 51 0 0 36 41 46 0 0 0 0 43 0 0 0 0 0 0 0 0 53 99 140 36 0 0 0 0 0 0
USC00020750 1964 4 30 5 69 23 30 0 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 33 13 0 0 0 15 0 Inf
USC00020750 1964 5 31 0 0 0 0 0 0 43 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 51 8 0 0 0 0
USC00020750 1964 6 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 38 0 0 0 Inf
USC00020750 1964 7 31 0 0 0 0 0 0 0 0 0 0 0 0 41 0 13 13 0 0 0 0 8 51 0 71 0 10 0 0 20 165 25
USC00020750 1964 8 31 8 30 137 0 0 5 89 0 0 0 18 64 5 0 0 0 0 0 0 0 0 0 0 0 0 76 0 0 0 0 0
USC00020750 1964 9 30 0 0 0 0 0 119 0 0 0 0 0 0 0 41 25 0 0 0 0 0 25 0 0 0 0 0 0 0 0 0 Inf
USC00020750 1964 10 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
USC00020750 1964 11 30 0 5 0 0 0 0 0 0 0 0 91 0 0 0 36 94 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Inf
USC00020750 1964 12 31 0 107 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 79 152 0 0 0 0 0 0 0 0 0 0 0 0")
res <- lapply(1:nrow(df), function(r){
monthDays <- df[r,'Days']
rain <- as.numeric(df[r,(1:monthDays) + 4])
enc <- rle(rain > 0)
if(all(!enc$values))
return(c(0,0))
len <- enc$lengths
len[!enc$values] <- 0
max.idx <- which.max(len)
lastIdx <- cumsum(enc$lengths)[max.idx]
firstIdx <- lastIdx - enc$lengths[max.idx] + 1
tot <- sum(rain[firstIdx:lastIdx])
stretch <- lastIdx - firstIdx + 1
return(c(stretch,tot))
})
columnsToAdd <- do.call(rbind,res)
colnames(columnsToAdd) <- c('StretchLen','StretchRain')
df2 <- cbind(df,columnsToAdd)
结果:
# We print the result without months values for better readability
> df2[,-(5:35)]
Station Year Month Days StretchLen StretchRain
1 USC00020750 1964 1 31 3 110
2 USC00020750 1964 2 29 1 48
3 USC00020750 1964 3 31 4 328
4 USC00020750 1964 4 30 4 127
5 USC00020750 1964 5 31 2 59
6 USC00020750 1964 6 30 1 38
7 USC00020750 1964 7 31 3 210
8 USC00020750 1964 8 31 3 175
9 USC00020750 1964 9 30 2 66
10 USC00020750 1964 10 31 0 0
11 USC00020750 1964 11 30 2 130
12 USC00020750 1964 12 31 2 127
顺便说一下,如果你想坚持申请,就会这样:
columnsToAdd <-
t(apply(df[,-(1:3)],MARGIN=1,function(r){
monthDays <- r[1]
rain <- as.numeric(r[-1])
enc <- rle(rain > 0)
if(all(!enc$values))
return(c(0,0))
len <- enc$lengths
len[!enc$values] <- 0
max.idx <- which.max(len)
lastIdx <- cumsum(enc$lengths)[max.idx]
firstIdx <- lastIdx - enc$lengths[max.idx] + 1
tot <- sum(rain[firstIdx:lastIdx])
stretch <- lastIdx - firstIdx + 1
return(c(stretch,tot))
}))
colnames(columnsToAdd) <- c('StretchLen','StretchRain')
df2 <- cbind(df,columnsToAdd)
我不喜欢在data.frame上使用apply
,因为它是为矩阵创建的,因此它在调用函数之前将列强制转换为相同类型(因此,如果您处理不同类型的列,需要小心)。
答案 1 :(得分:2)
这是dplyr / tidyr
的另一个解决方案data %>%
gather(day, rain, -Station, -Year, -Month, -Days) %>%
arrange(Station, Year, Month, day) %>%
group_by(Station, Year, Month) %>%
mutate(previous_rain = lag(rain)) %>%
filter(!(rain %in% c(0, Inf))) %>%
mutate(storm = cumsum(previous_rain %in% c(0, NA))) %>%
group_by(Station, Year, Month, storm) %>%
summarize(total_rain = sum(rain),
number_of_days = n(),
start_day = first(day),
end_day = last(day)) %>%
arrange(desc(number_of_days)) %>%
slice(1)
答案 2 :(得分:0)
这是我的另一个观点,我已经使用rle()
函数来查找游程长度。它旷日持久,但主要是为了清楚地说明发生了什么 - 你可以轻松地缩短它。
raindf <-
tmp <- read.table(textConnection(" Station Year Month Days 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
USC00020750 1964 1 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 25 0 23 51 36 0 0 0 0 0 0 0 0
USC00020750 1964 2 29 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 48 0 0 0 3 0 0 0 0 0 0 Inf Inf
USC00020750 1964 3 31 0 46 51 0 0 36 41 46 0 0 0 0 43 0 0 0 0 0 0 0 0 53 99 140 36 0 0 0 0 0 0
USC00020750 1964 4 30 5 69 23 30 0 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 33 13 0 0 0 15 0 Inf
USC00020750 1964 5 31 0 0 0 0 0 0 43 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 51 8 0 0 0 0
USC00020750 1964 6 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 38 0 0 0 Inf
USC00020750 1964 7 31 0 0 0 0 0 0 0 0 0 0 0 0 41 0 13 13 0 0 0 0 8 51 0 71 0 10 0 0 20 165 25
USC00020750 1964 8 31 8 30 137 0 0 5 89 0 0 0 18 64 5 0 0 0 0 0 0 0 0 0 0 0 0 76 0 0 0 0 0
USC00020750 1964 9 30 0 0 0 0 0 119 0 0 0 0 0 0 0 41 25 0 0 0 0 0 25 0 0 0 0 0 0 0 0 0 Inf
USC00020750 1964 10 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
USC00020750 1964 11 30 0 5 0 0 0 0 0 0 0 0 91 0 0 0 36 94 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Inf
USC00020750 1964 12 31 0 107 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 79 152 0 0 0 0 0 0 0 0 0 0 0 0"), header = TRUE)
rainfall <- unlist(as.data.frame(t(raindf[1:3, -c(1:4)])), use.names = FALSE)
rainfall <- rainfall[!is.infinite(rainfall)]
rainfall[rainfall > 0] <- 1
rainyruns <- rle(rainfall)
rainyrunsDf <- data.frame(lengths = rainyruns$lengths, values = rainyruns$values)
rainyrunsDf <- subset(rainyrunsDf, values != 0)
rainyrunsDf <- rainyrunsDf[order(rainyrunsDf$lengths, decreasing = TRUE), ]
rainyrunsDf[1,1]
## [1] 4