如何获取具有两个连续负值的数据帧的行号?
# Example data: one numeric column holding two separate pairs of consecutive negatives.
df <- data.frame(val = c(0.3, 0.1, -0.1, -0.2, 0.01, 0.2, -0.9, -0.7))
> df
val
1 0.30
2 0.10
3 -0.10
4 -0.20
5 0.01
6 0.20
7 -0.90
8 -0.70
目标:返回第4行和第8行
注意:如果连续出现2个以上的负值,我希望返回其中第二个负值的行号。例如,如果
val
1 -0.30 #added to show just one negative value should be ignored
2 0.10
3 -0.10
4 -0.20
5 0.01
6 0.20
7 -0.90
8 -0.70
9 -0.09 #added to show the case of more than 2 neg values
目标:仍然返回第4行和第8行
答案 0 :(得分:5)
您可以尝试使用 `rle`:
# Run-length encode the negativity flag: every run is a stretch of consecutive
# rows that are all negative (TRUE) or all non-negative (FALSE).
rle_val <- rle(df$val < 0)
# cumsum() of the run lengths is the row number on which each run ends, so the
# negative runs of exactly two rows end on the second negative row.
with(rle_val, cumsum(lengths)[which(values & lengths == 2)])
#[1] 4 8
**编辑**
如果在连续负值超过2个的情况下,也想得到第二个负值的索引,可以这样做:
# Accept negative runs of two or more rows: take the row on which the preceding
# run ends and step two rows forward. Caveat: when the very first run qualifies,
# its index minus one is 0, which `[` silently drops, so that run is missed.
with(rle_val, cumsum(lengths)[which(values & lengths >= 2) - 1] + 2)
**编辑 2**
如果数据以2个或更多连续负值开头,则上述方法无效。为了解决这个问题,可以这样做:
# Row number of the second value in every run of two or more consecutive
# negatives. `cumsum(lengths) - lengths` is the number of rows that precede each
# run, so adding 2 lands on the run's second row. A qualifying run at the very
# start (0 rows before it) needs no special case, and an empty encoding yields
# an empty result instead of failing inside an `if` condition.
with(rle_val, (cumsum(lengths) - lengths)[which(values & lengths >= 2)] + 2)
例如:
# Copy the column and make row 5 negative as well, so that (assuming `df` still
# holds the original example data) rows 3-5 form a run of three negatives.
df$val2 <- df$val
df$val2[5] <- -0.05
rle_val2 <- rle(df$val2 < 0)
# Second row of every negative run of length >= 2: rows preceding the run
# (`cumsum(lengths) - lengths`) plus 2. No special case is needed for a run at
# the start, and no scalar `if` that would fail on an empty encoding.
with(rle_val2, (cumsum(lengths) - lengths)[which(values & lengths >= 2)] + 2)
#[1] 4 8
使用 bgoldst 的示例数据:
# Example whose first two rows are already a negative run, and whose last three
# rows form a run of three negatives.
df <- data.frame(val = c(-0.4, -0.3, 0.1, -0.1, -0.2, 0.01, 0.2, -0.9, -0.7, -0.09))
rle_val <- rle(df$val < 0)
# Second row of every negative run of length >= 2: rows preceding the run
# (`cumsum(lengths) - lengths`) plus 2. The leading run is handled like any
# other, so no `if`/`else` on the first run is required.
with(rle_val, (cumsum(lengths) - lengths)[which(values & lengths >= 2)] + 2)
#[1] 2 5 9
答案 1 :(得分:3)
另一个选项是使用 data.table 的 `rleid`:
library(data.table)
# Group rows by run of negative / non-negative values (rleid) and, for every
# negative run of two or more rows, keep the global row index (.I) of the run's
# second row. `.N >= 2` with head(..., 2) replaces `.N == 2`, which skipped
# runs longer than two. NOTE: setDT() turns `df` into a data.table by reference.
setDT(df)[, tail(head(.I[val < 0 & .N >= 2], 2), 1), by = rleid(val < 0)]$V1
#[1] 4 8
答案 2 :(得分:2)
# Each case reports row i when row i and row i - 1 are negative and row i - 2
# is either absent or non-negative, i.e. the second row of every negative run.
df <- data.frame(val = c(0.3, 0.1, -0.1, -0.2, 0.01, 0.2, -0.9, -0.7)) ## OP's first test case
nr <- nrow(df)
which(
  c(FALSE, TRUE, df$val[-c(nr, nr - 1L)] >= 0) &
    c(FALSE, df$val[-nr] < 0) &
    df$val < 0
)
## [1] 4 8
df <- data.frame(val = c(-0.3, 0.1, -0.1, -0.2, 0.01, 0.2, -0.9, -0.7, -0.09)) ## OP's second case
nr <- nrow(df)
which(
  c(FALSE, TRUE, df$val[-c(nr, nr - 1L)] >= 0) &
    c(FALSE, df$val[-nr] < 0) &
    df$val < 0
)
## [1] 4 8
df <- data.frame(val = c(-0.4, -0.3, 0.1, -0.1, -0.2, 0.01, 0.2, -0.9, -0.7, -0.09)) ## leading case
nr <- nrow(df)
which(
  c(FALSE, TRUE, df$val[-c(nr, nr - 1L)] >= 0) &
    c(FALSE, df$val[-nr] < 0) &
    df$val < 0
)
## [1] 2 5 9
library(data.table);
library(microbenchmark);
# Vectorised search for the second row of every run of consecutive negatives in
# `df$val`. Row i is reported when row i is negative, row i - 1 is negative, and
# row i - 2 is either absent or non-negative. Returns integer row numbers.
bgoldst <- function(df) {
  x <- df$val
  last <- nrow(df)
  # Shifted by two: TRUE where the value two rows back is >= 0 (row 2 has none).
  two_back_ok <- c(FALSE, TRUE, x[-c(last, last - 1L)] >= 0)
  # Shifted by one: TRUE where the previous row is negative.
  prev_is_neg <- c(FALSE, x[-last] < 0)
  which(two_back_ok & prev_is_neg & x < 0)
}
# data.table approach: group rows by run of negative / non-negative values
# (rleid) and, for negative runs of exactly two rows, return the global row
# index (.I) of the run's last row. Runs longer than two are not reported.
akrun <- function(dt) {
  per_run <- dt[, tail(.I[val < 0 & .N == 2], 1), by = rleid(val < 0)]
  per_run$V1
}
# Row numbers of the second value in every run of two or more consecutive
# negative values of `df$val`, found via run-length encoding.
#
# df: data frame with a numeric column `val`.
# Returns a double vector of row numbers (empty when no run qualifies).
cath <- function(df) {
  runs <- rle(df$val < 0)
  # Number of rows that precede each run; the run's second row is two past
  # that. A qualifying run at the start is handled like any other, which
  # removes the `if (... & ...)` guard that errored on zero-row input and
  # relied on partial matching of `$value`.
  rows_before <- cumsum(runs$lengths) - runs$lengths
  rows_before[which(runs$values & runs$lengths >= 2)] + 2
}
# Small-input check: confirm the three solutions agree, then time them.
df <- data.frame(val = c(0.3, 0.1, -0.1, -0.2, 0.01, 0.2, -0.9, -0.7)) ## OP's first test case
dt <- as.data.table(df)
ex <- bgoldst(df)
identical(ex, akrun(dt))
## [1] TRUE
identical(ex, as.integer(cath(df))) ## cath returns double
## [1] TRUE
microbenchmark(bgoldst(df), akrun(dt), cath(df))
## Unit: microseconds
## expr min lq mean median uq max neval
## bgoldst(df) 26.515 32.5025 40.05455 35.4955 45.7595 86.814 100
## akrun(dt) 940.409 979.9665 1136.94108 1001.7770 1074.4780 2340.116 100
## cath(df) 37.634 44.9040 56.64326 53.6715 62.4385 144.547 100
# Large-input check on 1e5 uniform random values in [-1, 1], seeded so the
# comparison is reproducible.
set.seed(1L)
N <- 1e5L
df <- data.frame(val = runif(N, -1, 1))
dt <- as.data.table(df)
ex <- bgoldst(df)
identical(ex, akrun(dt)) ## akrun currently doesn't handle 3 or more consecutive
## [1] FALSE
identical(ex, as.integer(cath(df))) ## cath returns double
## [1] TRUE
microbenchmark(bgoldst(df), akrun(dt), cath(df))
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst(df) 7.247004 8.363818 12.27691 8.884486 10.49567 57.03775 100
## akrun(dt) 618.399502 640.290830 664.46935 649.069256 685.36564 769.65257 100
## cath(df) 7.895753 8.412999 13.85755 9.595672 11.34092 64.75346 100