获取满足条件的行号

时间:2016-05-23 06:41:35

标签: r dataframe

如何获取数据框(data frame)中出现两个连续负值的位置的行号?

# Sample data: negative pairs sit at rows 3-4 and rows 7-8.
df <- data.frame(val = c(0.3, 0.1, -0.1, -0.2, 0.01, 0.2, -0.9, -0.7))
> df
    val
1  0.30
2  0.10
3 -0.10
4 -0.20
5  0.01
6  0.20
7 -0.90
8 -0.70

目标:返回第4行和第8行

注意:如果连续出现多于2个负值,我希望只返回该连续段中第二个负值的行号。例如,如果

    val
1 -0.30 #added to show just one negative value should be ignored
2  0.10
3 -0.10
4 -0.20
5  0.01
6  0.20
7 -0.90
8 -0.70
9 -0.09  #added to show the case of more than 2 neg values

目标:仍然返回第4行和第8行

3 个答案:

答案 0 :(得分:5)

您可以尝试使用rle

# Run-length encode the "is negative" indicator: one entry per maximal run.
rle_val <- rle(df[["val"]] < 0)
# cumsum() of the run lengths is each run's last row number; keep the runs
# that are negative and exactly two rows long.
with(rle_val, cumsum(lengths)[which(values & lengths == 2)])
#[1] 4 8

修改

如果你想要发现第二个负值的索引,即使连续的值超过2个,你也可以

# Last row of the preceding run + 2 = second row of each negative run of
# length >= 2. A qualifying run at the very start has no preceding run
# (index 0), so R's indexing silently drops it here.
with(rle_val, cumsum(lengths)[which(values & lengths >= 2) - 1] + 2)

**EDIT2**

如果您在开始时有2个或更多负值,则上述操作无效。为了解决这个问题,你可以这样做:

# Second row of every negative run of length >= 2, including a run that starts
# at row 1. A run's first row is (last row - length + 1), so its second row is
# (last row - length + 2). This needs no special case for a leading run and
# returns numeric(0) for empty input instead of failing in a scalar `if`.
with(rle_val, (cumsum(lengths) - lengths)[which(values & lengths >= 2)] + 2)

例如:

# Variant of the data where row 5 is also negative, producing a run of three
# negatives (rows 3-5) followed later by a run of two (rows 7-8).
df$val2 <- df$val
df$val2[5] <- -0.05
rle_val2 <- rle(df$val2 < 0)
# Second row of each negative run of length >= 2:
# (run's last row - run length + 2). Works for a leading run and empty input.
with(rle_val2, (cumsum(lengths) - lengths)[which(values & lengths >= 2)] + 2)
#[1] 4 8

使用bgoldst给出的示例数据(开头即为连续负值):

# Data starting with a negative run, so the leading-run case is exercised.
df <- data.frame(
  val = c(-0.4, -0.3, 0.1, -0.1, -0.2, 0.01, 0.2, -0.9, -0.7, -0.09)
)
rle_val <- rle(df$val < 0)
# Second row of each negative run of length >= 2:
# (run's last row - run length + 2). No branch is needed for a leading run,
# and empty input gives numeric(0) rather than an error.
with(rle_val, (cumsum(lengths) - lengths)[which(values & lengths >= 2)] + 2)
#[1] 2 5 9

答案 1 :(得分:3)

另一个选项是rleid

library(data.table) 
# Group rows into runs with rleid(). For a negative run of at least two rows,
# .I[...] holds the run's global row indices; head(, 2) keeps the first two and
# tail(, 1) picks the second. Other runs contribute integer(0).
# Using .N >= 2 (not == 2) also reports runs of three or more negatives.
setDT(df)[, tail(head(.I[val < 0 & .N >= 2], 2), 1), by = rleid(val < 0)]$V1
#[1] 4 8

答案 2 :(得分:2)

## Row i is reported when rows i and i-1 are negative and row i-2 is either
## non-negative or does not exist (i == 2).

## OP's first test case
df <- data.frame(val = c(0.3, 0.1, -0.1, -0.2, 0.01, 0.2, -0.9, -0.7))
nr <- nrow(df)
which(
  c(FALSE, TRUE, df$val[-c(nr, nr - 1L)] >= 0) &
    c(FALSE, df$val[-nr] < 0) &
    df$val < 0
)
## [1] 4 8

## OP's second case
df <- data.frame(val = c(-0.3, 0.1, -0.1, -0.2, 0.01, 0.2, -0.9, -0.7, -0.09))
nr <- nrow(df)
which(
  c(FALSE, TRUE, df$val[-c(nr, nr - 1L)] >= 0) &
    c(FALSE, df$val[-nr] < 0) &
    df$val < 0
)
## [1] 4 8

## leading case
df <- data.frame(
  val = c(-0.4, -0.3, 0.1, -0.1, -0.2, 0.01, 0.2, -0.9, -0.7, -0.09)
)
nr <- nrow(df)
which(
  c(FALSE, TRUE, df$val[-c(nr, nr - 1L)] >= 0) &
    c(FALSE, df$val[-nr] < 0) &
    df$val < 0
)
## [1] 2 5 9

基准

library(data.table);
library(microbenchmark);

# Row numbers of the second element of each run of consecutive negatives in
# df$val. Row i qualifies when rows i and i-1 are negative and row i-2 is
# non-negative or absent (i == 2). Returns an integer vector.
bgoldst <- function(df) {
  n <- nrow(df)
  v <- df$val
  is_neg <- v < 0
  # Shifted by two: TRUE at position 2 stands in for the missing row 0.
  two_back_ok <- c(FALSE, TRUE, v[-c(n, n - 1L)] >= 0)
  # Shifted by one: row 1 has no predecessor.
  prev_neg <- c(FALSE, is_neg[-n])
  which(two_back_ok & prev_neg & is_neg)
}
# For each run of rows sharing the same `val < 0` outcome (rleid), return the
# global row index (.I) of the last row when the run is negative and exactly
# two rows long.
# NOTE: runs of three or more consecutive negatives are not reported by this
# version.
akrun <- function(dt) {
  per_run <- dt[, tail(.I[val < 0 & .N == 2], 1), by = rleid(val < 0)]
  per_run[["V1"]]
}
# Row numbers of the second element of every run of two or more consecutive
# negative values in df$val.
#
# df: data frame with a numeric column `val`.
# Returns a double vector of row numbers; numeric(0) when no run qualifies or
# when df has no rows.
cath <- function(df) {
  runs <- rle(df$val < 0)
  run_end <- cumsum(runs$lengths)
  keep <- which(runs$values & runs$lengths >= 2)
  # A run's first row is (last row - length + 1), so its second row is
  # (last row - length + 2). This covers a run starting at row 1 without a
  # special case. The literal 2 is a double on purpose, so the return type
  # stays double.
  (run_end - runs$lengths)[keep] + 2
}
## Small case: OP's first test case, as a data.frame and as a data.table.
df <- data.frame(val = c(0.3, 0.1, -0.1, -0.2, 0.01, 0.2, -0.9, -0.7))
dt <- as.data.table(df)

## Use bgoldst() as the reference result and compare the other two against it.
ex <- bgoldst(df)
identical(ex, akrun(dt))
## [1] TRUE
identical(ex, as.integer(cath(df))) ## cath returns double
## [1] TRUE

microbenchmark(bgoldst(df), akrun(dt), cath(df))
## Unit: microseconds
##         expr     min       lq       mean    median        uq      max neval
##  bgoldst(df)  26.515  32.5025   40.05455   35.4955   45.7595   86.814   100
##    akrun(dt) 940.409 979.9665 1136.94108 1001.7770 1074.4780 2340.116   100
##     cath(df)  37.634  44.9040   56.64326   53.6715   62.4385  144.547   100
## Large case: 1e5 uniform draws on [-1, 1], seeded for reproducibility.
set.seed(1L)
N <- 1e5L
df <- data.frame(val = runif(N, -1, 1))
dt <- as.data.table(df)

## Use bgoldst() as the reference result and compare the other two against it.
ex <- bgoldst(df)
identical(ex, akrun(dt)) ## akrun currently doesn't handle 3 or more consecutive
## [1] FALSE
identical(ex, as.integer(cath(df))) ## cath returns double
## [1] TRUE

microbenchmark(bgoldst(df), akrun(dt), cath(df))
## Unit: milliseconds
##         expr        min         lq      mean     median        uq       max neval
##  bgoldst(df)   7.247004   8.363818  12.27691   8.884486  10.49567  57.03775   100
##    akrun(dt) 618.399502 640.290830 664.46935 649.069256 685.36564 769.65257   100
##     cath(df)   7.895753   8.412999  13.85755   9.595672  11.34092  64.75346   100