最长字符串非零值的总和

时间:2015-11-01 17:13:34

标签: r

我有一个数据框,其中包含1964年至2013年期间76个车站的每日降雨量值。对于特定站,每行是不同的月份。以下是dataframe-

的片段
 Station     Year Month Days 1   2   3  4 5   6  7  8 9 10 11 12 13 14 15 16 17 18  19 20 21 22 23  24 25 26 27 28 29  30  31
USC00020750 1964     1   31 0   0   0  0 0   0  0  0 0  0  0  0  0  0  0  0  0  0  25  0 23 51 36   0  0  0  0  0  0   0   0
USC00020750 1964     2   29 0   0   0  0 0   0  0  0 0  0  0  0  0  0  0  0  0  0  48  0  0  0  3   0  0  0  0  0  0 Inf Inf
USC00020750 1964     3   31 0  46  51  0 0  36 41 46 0  0  0  0 43  0  0  0  0  0   0  0  0 53 99 140 36  0  0  0  0   0   0
USC00020750 1964     4   30 5  69  23 30 0  18  0  0 0  0  0  0  0  0  0  0  0  0   0  0  0  0  0  33 13  0  0  0 15   0 Inf
USC00020750 1964     5   31 0   0   0  0 0   0 43  0 0  0  0  0  0  0  0  0  0  0   0  0  0  0  0   0  0 51  8  0  0   0   0
USC00020750 1964     6   30 0   0   0  0 0   0  0  0 0  0  0  0  0  0  0  0  0  0   0  0  0  0  0   0  0  0 38  0  0   0 Inf
USC00020750 1964     7   31 0   0   0  0 0   0  0  0 0  0  0  0 41  0 13 13  0  0   0  0  8 51  0  71  0 10  0  0 20 165  25
USC00020750 1964     8   31 8  30 137  0 0   5 89  0 0  0 18 64  5  0  0  0  0  0   0  0  0  0  0   0  0 76  0  0  0   0   0
USC00020750 1964     9   30 0   0   0  0 0 119  0  0 0  0  0  0  0 41 25  0  0  0   0  0 25  0  0   0  0  0  0  0  0   0 Inf
USC00020750 1964    10   31 0   0   0  0 0   0  0  0 0  0  0  0  0  0  0  0  0  0   0  0  0  0  0   0  0  0  0  0  0   0   0
USC00020750 1964    11   30 0   5   0  0 0   0  0  0 0  0 91  0  0  0 36 94  0  0   0  0  0  0  0   0  0  0  0  0  0   0 Inf
USC00020750 1964    12   31 0 107  20  0 0   0  0  0 0  0  0  0  0  0  0  0  0 79 152  0  0  0  0   0  0  0  0  0  0   0   0

...

Station Year Month Days  1  2   3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28  29  30  31
USW00093129 2013    10   31  0  0   0  0  0  0  0  0 43 15  0  0 10  0  0  0  0  0  0  0  0  0  0  0  0  0  0 41   3   8   0
USW00093129 2013    11   30  0  0   0 23  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  3 79 18 20  0  0  0  0  0   0   0 Inf
USW00093129 2013    12   31  0  0 175 33  0  0  3  0  0  0  0  0  0  0  0  0  0  0  5 15  0  0  0  0  0  0  0  0   0   0   0

我试图找出每一行中最长的非零降雨值的长度以及该段的总降雨量。找到最长拉伸长度的最简单方法是将数据帧转换为0和1,使用rle并沿每行应用max(y$lengths[y$values!=0])。但是如何找到值的总和? 感谢您提前帮忙!

3 个答案:

答案 0 :(得分:3)

不完全是单行,但这有效:

df <- read.table(header=TRUE,stringsAsFactors=FALSE,check.names=FALSE,text=
"Station     Year Month Days 1   2   3  4 5   6  7  8 9 10 11 12 13 14 15 16 17 18  19 20 21 22 23  24 25 26 27 28 29  30  31
USC00020750 1964     1   31 0   0   0  0 0   0  0  0 0  0  0  0  0  0  0  0  0  0  25  0 23 51 36   0  0  0  0  0  0   0   0
USC00020750 1964     2   29 0   0   0  0 0   0  0  0 0  0  0  0  0  0  0  0  0  0  48  0  0  0  3   0  0  0  0  0  0 Inf Inf
USC00020750 1964     3   31 0  46  51  0 0  36 41 46 0  0  0  0 43  0  0  0  0  0   0  0  0 53 99 140 36  0  0  0  0   0   0
USC00020750 1964     4   30 5  69  23 30 0  18  0  0 0  0  0  0  0  0  0  0  0  0   0  0  0  0  0  33 13  0  0  0 15   0 Inf
USC00020750 1964     5   31 0   0   0  0 0   0 43  0 0  0  0  0  0  0  0  0  0  0   0  0  0  0  0   0  0 51  8  0  0   0   0
USC00020750 1964     6   30 0   0   0  0 0   0  0  0 0  0  0  0  0  0  0  0  0  0   0  0  0  0  0   0  0  0 38  0  0   0 Inf
USC00020750 1964     7   31 0   0   0  0 0   0  0  0 0  0  0  0 41  0 13 13  0  0   0  0  8 51  0  71  0 10  0  0 20 165  25
USC00020750 1964     8   31 8  30 137  0 0   5 89  0 0  0 18 64  5  0  0  0  0  0   0  0  0  0  0   0  0 76  0  0  0   0   0
USC00020750 1964     9   30 0   0   0  0 0 119  0  0 0  0  0  0  0 41 25  0  0  0   0  0 25  0  0   0  0  0  0  0  0   0 Inf
USC00020750 1964    10   31 0   0   0  0 0   0  0  0 0  0  0  0  0  0  0  0  0  0   0  0  0  0  0   0  0  0  0  0  0   0   0
USC00020750 1964    11   30 0   5   0  0 0   0  0  0 0  0 91  0  0  0 36 94  0  0   0  0  0  0  0   0  0  0  0  0  0   0 Inf
USC00020750 1964    12   31 0 107  20  0 0   0  0  0 0  0  0  0  0  0  0  0  0 79 152  0  0  0  0   0  0  0  0  0  0   0   0")

res <- lapply(1:nrow(df), function(r){
  monthDays <- df[r,'Days']
  rain <- as.numeric(df[r,(1:monthDays) + 4])
  enc <- rle(rain > 0)
  if(all(!enc$values))
    return(c(0,0))
  len <- enc$lengths
  len[!enc$values] <- 0
  max.idx <- which.max(len)
  lastIdx <- cumsum(enc$lengths)[max.idx]
  firstIdx <- lastIdx - enc$lengths[max.idx] + 1
  tot <- sum(rain[firstIdx:lastIdx])
  stretch <- lastIdx - firstIdx + 1
  return(c(stretch,tot))  
})
columnsToAdd <- do.call(rbind,res)
colnames(columnsToAdd) <- c('StretchLen','StretchRain')

df2 <- cbind(df,columnsToAdd)

结果:

# We print the result without months values for better readability
> df2[,-(5:35)]
       Station Year Month Days StretchLen StretchRain
1  USC00020750 1964     1   31          3         110
2  USC00020750 1964     2   29          1          48
3  USC00020750 1964     3   31          4         328
4  USC00020750 1964     4   30          4         127
5  USC00020750 1964     5   31          2          59
6  USC00020750 1964     6   30          1          38
7  USC00020750 1964     7   31          3         210
8  USC00020750 1964     8   31          3         175
9  USC00020750 1964     9   30          2          66
10 USC00020750 1964    10   31          0           0
11 USC00020750 1964    11   30          2         130
12 USC00020750 1964    12   31          2         127
顺便说一下,如果你想坚持申请,就会这样:

columnsToAdd <- 
t(apply(df[,-(1:3)],MARGIN=1,function(r){
  monthDays <- r[1]
  rain <- as.numeric(r[-1])
  enc <- rle(rain > 0)
  if(all(!enc$values))
    return(c(0,0))
  len <- enc$lengths
  len[!enc$values] <- 0
  max.idx <- which.max(len)
  lastIdx <- cumsum(enc$lengths)[max.idx]
  firstIdx <- lastIdx - enc$lengths[max.idx] + 1
  tot <- sum(rain[firstIdx:lastIdx])
  stretch <- lastIdx - firstIdx + 1
  return(c(stretch,tot))  
}))

colnames(columnsToAdd) <- c('StretchLen','StretchRain')

df2 <- cbind(df,columnsToAdd)

我不喜欢在data.frame上使用apply,因为它是为矩阵创建的,因此它在调用函数之前将列强制转换为相同类型(因此,如果您处理不同类型的列,需要小心)。

答案 1 :(得分:2)

这是dplyr / tidyr

的另一个解决方案
data %>%
  gather(day, rain, -Station, -Year, -Month, -Days) %>%
  arrange(Station, Year, Month, day) %>%
  group_by(Station, Year, Month) %>%
  mutate(previous_rain = lag(rain)) %>%
  filter(!(rain %in% c(0, Inf))) %>%
  mutate(storm = cumsum(previous_rain %in% c(0, NA))) %>%
  group_by(Station, Year, Month, storm) %>%
  summarize(total_rain = sum(rain),
            number_of_days = n(),
            start_day = first(day),
            end_day = last(day)) %>%
  arrange(desc(number_of_days)) %>%
  slice(1)

答案 2 :(得分:0)

这是我的另一个观点,我已经使用rle()函数来查找游程长度。它旷日持久,但主要是为了清楚地说明发生了什么 - 你可以轻松地缩短它。

raindf <- 
    tmp <- read.table(textConnection(" Station     Year Month Days 1   2   3  4 5   6  7  8 9 10 11 12 13 14 15 16 17 18  19 20 21 22 23  24 25 26 27 28 29  30  31
USC00020750 1964     1   31 0   0   0  0 0   0  0  0 0  0  0  0  0  0  0  0  0  0  25  0 23 51 36   0  0  0  0  0  0   0   0
           USC00020750 1964     2   29 0   0   0  0 0   0  0  0 0  0  0  0  0  0  0  0  0  0  48  0  0  0  3   0  0  0  0  0  0 Inf Inf
           USC00020750 1964     3   31 0  46  51  0 0  36 41 46 0  0  0  0 43  0  0  0  0  0   0  0  0 53 99 140 36  0  0  0  0   0   0
           USC00020750 1964     4   30 5  69  23 30 0  18  0  0 0  0  0  0  0  0  0  0  0  0   0  0  0  0  0  33 13  0  0  0 15   0 Inf
           USC00020750 1964     5   31 0   0   0  0 0   0 43  0 0  0  0  0  0  0  0  0  0  0   0  0  0  0  0   0  0 51  8  0  0   0   0
           USC00020750 1964     6   30 0   0   0  0 0   0  0  0 0  0  0  0  0  0  0  0  0  0   0  0  0  0  0   0  0  0 38  0  0   0 Inf
           USC00020750 1964     7   31 0   0   0  0 0   0  0  0 0  0  0  0 41  0 13 13  0  0   0  0  8 51  0  71  0 10  0  0 20 165  25
           USC00020750 1964     8   31 8  30 137  0 0   5 89  0 0  0 18 64  5  0  0  0  0  0   0  0  0  0  0   0  0 76  0  0  0   0   0
           USC00020750 1964     9   30 0   0   0  0 0 119  0  0 0  0  0  0  0 41 25  0  0  0   0  0 25  0  0   0  0  0  0  0  0   0 Inf
           USC00020750 1964    10   31 0   0   0  0 0   0  0  0 0  0  0  0  0  0  0  0  0  0   0  0  0  0  0   0  0  0  0  0  0   0   0
           USC00020750 1964    11   30 0   5   0  0 0   0  0  0 0  0 91  0  0  0 36 94  0  0   0  0  0  0  0   0  0  0  0  0  0   0 Inf
           USC00020750 1964    12   31 0 107  20  0 0   0  0  0 0  0  0  0  0  0  0  0  0 79 152  0  0  0  0   0  0  0  0  0  0   0   0"), header = TRUE)

rainfall <- unlist(as.data.frame(t(raindf[1:3, -c(1:4)])), use.names = FALSE)
rainfall <- rainfall[!is.infinite(rainfall)]
rainfall[rainfall > 0] <- 1
rainyruns <- rle(rainfall)
rainyrunsDf <- data.frame(lengths = rainyruns$lengths, values = rainyruns$values)
rainyrunsDf <- subset(rainyrunsDf, values != 0)
rainyrunsDf <- rainyrunsDf[order(rainyrunsDf$lengths, decreasing = TRUE), ]
rainyrunsDf[1,1]
## [1] 4