我有一个CSV文件,存储了80天的逐日降水数据(单位:毫米),如下所示:
19980720;0
19980721;5.1
19980722;2
19980723;21.1
19980724;3.4
19980725;0
19980726;0
19980727;1.2
19980728;0.2
19980729;0
19980730;0
19980731;0
19980801;2
19980802;5.6
19980803;10
19980804;15
19980805;14.6
19980806;13
19980807;0
19980808;0
我已经在R中加载了数据。现在我想找出持续降雨的时期。为此,我想到把被至少连续两天无降雨隔开的各段日子的降雨量分别求和。
如何查找这些期间?
编辑:对于示例数据,输出可能是这样的:
60.2
31.6
1.4
这是我的示例中三个持续降雨时期各自的降雨量总和(已按大小排序)。也许还可以为每个时期附上开始和/或结束日期:
19980801 19980806 60.2
19980721 19980724 31.6
19980727 19980728 1.4
答案 0 :(得分:7)
为了在持续降雨期间只获得累积雨量,你需要创建一个向量,指示一天是否属于这样一个时期。
注意:为了说明这一点,我创建了一个略有不同的数据集,在最后一段雨中,有一天没有下雨(见下文)。
使用 data.table ,您可以:
# Flag the days that belong to a sustained-rainfall period: every rainy day,
# plus any dry day that has rain on both the day before and the day after.
setDT(mydf)
mydf[, sus.rf := rf != 0 |
       (shift(rf, 1L, type = "lag", fill = 0) != 0 &
          shift(rf, 1L, type = "lead", fill = 0) != 0)]
# Number the runs of consecutive equal flags so each period gets its own id.
mydf[, id := rleid(sus.rf)]
# One row per flagged run: first date, last date and total rainfall.
mydf[(sus.rf), .(start = dates[1L], end = dates[.N], sum.rf = sum(rf)), by = id]
给出:
id start end sum.rf
1: 2 19980721 19980724 31.6
2: 4 19980727 19980728 1.4
3: 6 19980801 19980806 45.6
假设您希望只有在至少连续3天没有下雨时才把降雨期分开(即把被至少3个无雨日隔开的各段降雨量分别求和),您可以这样做:
# Flag every rainy day, plus any dry day with rain on both neighbouring days.
setDT(mydf)
mydf[, sus.rf := rf != 0 |
       (shift(rf, 1L, type = "lag", fill = 0) != 0 &
          shift(rf, 1L, type = "lead", fill = 0) != 0)]
# Length (in days) of each run of consecutive equal flags.
mydf[, days := .N, by = rleid(sus.rf)]
# Unflagged runs shorter than 3 days do not end a period, so they are
# re-flagged and merge with the neighbouring flagged runs. This also applies
# to short unflagged runs at the very start or end of the series.
mydf[!sus.rf & days < 3, sus.rf := TRUE]
# Summarise every run, then keep only the runs that actually contain rain.
mydf[, .(start = dates[1L], end = dates[.N], sum.rf = sum(rf)),
     by = rleid(sus.rf)][sum.rf != 0][]
给出:
rleid start end sum.rf
1: 1 19980720 19980728 33.0
2: 3 19980801 19980808 45.6
使用过的数据:
# Example data: 20 consecutive days (dates stored as yyyymmdd integers) and
# the rainfall amount recorded for each day.
mydf <- data.frame(
  dates = c(19980720:19980731, 19980801:19980808),
  rf = c(0, 5.1, 2, 21.1, 3.4, 0, 0, 1.2, 0.2, 0,
         0, 0, 2, 5.6, 10, 15, 0, 13, 0, 0)
)
答案 1 :(得分:4)
我们可以使用 data.table。先将 'data.frame' 转换为 'data.table'(setDT(d1)),再用 rleid 根据 'v2' 的逻辑索引(!v2)创建分组列 'i1'。去掉 'v2' 中值为 0 的行,然后按 'i1' 分组,计算 'v2' 的总和(sum),并取 'v1' 的第一个(v1[1])和最后一个(v1[.N])元素。如有需要,可将 'i1' 赋值为 NULL 以删除该列。
答案 2 :(得分:3)
这可能是另一个选项(先用 read.table 并指定 sep = ';' 和 header = FALSE 加载您的文件):
library(data.table)
# Start a new group at every dry day that directly follows another dry day, so
# periods are split only by two or more consecutive dry days. Testing both
# neighbours against 0 (rather than diff(dat$V2) == 0) keeps two rainy days
# with identical amounts inside the same period.
dat$V3 <- cumsum(c(TRUE, head(dat$V2, -1L) == 0 & tail(dat$V2, -1L) == 0))
# Per group: first rainy date (a), last rainy date (b) and rainfall total (c).
# The dates are taken from the rainy rows themselves, so a series that starts
# or ends with rain is handled too. Groups without any rain yield NA dates and
# a total of 0. The helper column V3 is dropped from the result.
out <- setDT(dat)[, {
  wet <- V1[V2 != 0]
  .(a = wet[1L], b = wet[max(length(wet), 1L)], c = sum(V2))
}, by = V3][, !"V3", with = FALSE]
这将给出
# a b c
#1: 19980721 19980724 31.6
#2: 19980727 19980728 1.4
#3: NA NA 0.0
#4: 19980801 19980806 60.2
#5: NA NA 0.0
稍后您可以删除NA的行 像这样
# Keep only the rows whose row sum is finite; a row containing NA has an NA
# row sum and is therefore dropped.
out[which(is.finite(rowSums(out)))]
使用基础R lapply
# Start a new group at every dry day that directly follows another dry day, so
# periods are split only by two or more consecutive dry days. Testing both
# neighbours against 0 (rather than diff(dat$V2) == 0) keeps two rainy days
# with identical amounts inside the same period.
dat$V3 <- cumsum(c(TRUE, head(dat$V2, -1L) == 0 & tail(dat$V2, -1L) == 0))
d1 <- split(dat, f = dat$V3)
# Summarise only the groups that contain rain: first rainy date (a), last
# rainy date (b) and rainfall total (c). Row names of the result are the
# group numbers.
do.call(
  rbind,
  lapply(
    d1[vapply(d1, function(x) any(x$V2 != 0), logical(1))],
    function(x) {
      wet <- subset(x, x$V2 != 0)
      data.frame(a = wet$V1[1L], b = wet$V1[nrow(wet)], c = sum(wet$V2))
    }
  )
)
# a b c
#1 19980721 19980724 31.6
#2 19980727 19980728 1.4
#4 19980801 19980806 60.2
答案 3 :(得分:2)
基础R解决方案:
# Add a row index and a binary "rain code" column (1 = rain, 0 = dry).
d$index <- seq_len(nrow(d))
d$rain <- ifelse(d$V2 == 0, 0, 1)
# A single dry day does not end a rainy period (periods are "interrupted by at
# least two days without any rainfall"), so the first dry day of every dry
# spell is provisionally recoded as rainy. c(2, diff(...)) != 1 is TRUE for
# each dry day that is not directly preceded by another dry day.
d$rain[which(d$rain == 0)[c(2, diff(which(d$rain == 0), 1)) != 1]] <- 1
# Drop the remaining dry days.
b <- d[d$rain != 0, ]
# Rows with consecutive index values belong to the same period; every jump in
# the index starts a new one. A running count of the jumps labels each period
# correctly regardless of how long the dry gaps between them are.
b$group <- cumsum(c(TRUE, diff(b$index) != 1))
# Remove the dry days that were provisionally recoded as rainy above.
b <- b[b$V2 != 0, ]
# Final output: first and last value of column 1 and the sum of column 2 per
# period (presumably the date and rainfall columns V1 and V2 - confirm against
# how `d` was loaded).
a1 <- aggregate(b[, 1], list(b$group), function(x) cbind(min(x), max(x)))
a2 <- aggregate(b[, 2], list(b$group), sum)
cbind(a1, volume = a2$x)
Group.1 x.1 x.2 volume
1 1 19980721 19980724 31.6
2 2 19980727 19980728 1.4
3 3 19980801 19980806 60.2