Question

我有一个巨大的文本文件。我想从中提取若干行块，这些行块的起止行索引定义在另一个数据框（例如sub）中。我已经有一个使用循环的脚本，但我想为这个任务找到更高效的方法（最好不使用循环）。这是我的玩具示例：

# Toy data: 10,000 random values plus an explicit row-index column `idx`.
df <- data.frame(value=runif(10000, 0, 10^3))
df$idx <- 1:nrow(df)
# Each row of `sub` describes one block to extract: the inclusive
# start/end bounds on `idx`.
sub <- data.frame(start=c(20,50,130,2000),end=c(25,60,150,2030))
# Accumulator for the extracted rows; it is grown one block at a time.
sub_data <- data.frame()
for (j in 1:nrow(sub)){
  # Rows of df whose idx lies within the j-th [start, end] range.
  dt <- df[df$idx >= sub$start[j] & df$idx <= sub$end[j],]
  # Append this block to everything collected so far.
  sub_data <- rbind(sub_data,dt)
}

sub_data

Answer 1

以下是使用data.table非等值连接（non-equi join，自v1.9.8起可用）的解决方案：

library(data.table)
# Same toy data as in the question, built directly as a data.table.
dt <- data.table(value=runif(10000, 0, 10^3))
# add index column
dt[, idx := seq_len(.N)]
# create subset table
sub <- data.table(start=c(20,50,130,2000),end=c(25,60,150,2030))
# use data.table non equijoin
# Keeps every row of dt whose idx lies within a [start, end] range of sub.
dt1 <- dt[sub, on = .(idx >= start, idx <= end)]
head(dt1)
#>        value idx idx.1
#> 1: 820.38637  20    25
#> 2: 262.51398  20    25
#> 3: 900.37408  20    25
#> 4:  74.91815  20    25
#> 5: 507.87825  20    25
#> 6: 547.45235  20    25
# Note: in the output above, idx and idx.1 hold the range bounds taken from
# sub (20 and 25), not the idx of each matched row of dt.
# use data.table non equi join but just keep column from dt
# x.idx selects dt's own idx column, so each row shows its real index.
dt2 <- dt[sub, .(value, idx = x.idx), on = .(idx >= start, idx <= end)]
head(dt2)
#>        value idx
#> 1: 820.38637  20
#> 2: 262.51398  21
#> 3: 900.37408  22
#> 4:  74.91815  23
#> 5: 507.87825  24
#> 6: 547.45235  25

Answer 2

这是一个解决方案：先创建所有id的序列，然后根据这个id序列对df进行子集化。df2是最终输出。

IDs <- unlist(lapply(1:nrow(sub), function(i) {sub$start[i]:sub$end[i]}))
df2 <- df[df$idx %in% IDs, ]
df2

或者我们可以使用tidyverse中的函数。

Answer 3

针对sub的每一行，从df中取出对应的部分，把这些子组放入一个列表，然后用rbind将各子组合并在一起：

# Slice df once per row of sub, then stack the slices with rbind.
# Rows are picked by position (start:end), which equals idx in this example
# because idx was created as the row number.
# seq_len() yields an empty loop when sub has no rows, whereas 1:NROW(sub)
# would iterate over c(1, 0) and fail on the missing bounds.
output <- do.call(
  rbind,
  lapply(seq_len(NROW(sub)), function(i) df[sub$start[i]:sub$end[i], ])
)
identical(sub_data, output)
#[1] TRUE

Answer 4

如你所说，你有一个巨大的文本文件，

我建议使用data.table的fread和rbindlist函数：

#' Read only the rows of a delimited text file whose ID is in a given set
#'
#' Reads the ID column alone, finds the runs of consecutive matching rows,
#' and then reads each run from the file as a separate chunk, so the whole
#' file never has to be loaded at once.
#'
#' Assumes the file starts with exactly one header line that contains
#' `id_name`; row positions are computed relative to that header.
#'
#' @param loc File location.
#' @param id_name Name of the ID column in the file, used for filtering.
#' @param subset_id Vector of ID values to keep.
#' @return A data.table with the file's columns and only the rows whose ID is
#'   in `subset_id`, in file order. Zero rows if nothing matches.
dt_div_conquer <- function(loc, id_name, subset_id) {
  # Header-only read: gives the column names that the chunk reads below
  # cannot see, because they start past the header line.
  header <- data.table::fread(loc, nrows = 0L)
  col_names <- names(header)

  # Read just the ID column from the text file.
  ids <- data.table::fread(loc, select = id_name)[[id_name]]

  # Collapse the match mask into runs of consecutive rows.
  runs <- rle(ids %in% subset_id)
  keep <- which(runs$values)
  if (length(keep) == 0L) {
    return(header)
  }

  # Number of data rows before each kept run, plus one for the header line,
  # is the number of lines to skip before the run starts.
  skip_lines <- c(0L, cumsum(runs$lengths))[keep] + 1L
  run_lengths <- runs$lengths[keep]

  # header = FALSE stops fread from treating the first row of a chunk as
  # column names; col.names restores the real names.
  chunks <- Map(
    function(skip, n) {
      data.table::fread(
        loc,
        skip = skip,
        nrows = n,
        header = FALSE,
        col.names = col_names
      )
    },
    skip_lines,
    run_lengths
  )

  # rbindlist() takes the list of chunks as a single argument.
  data.table::rbindlist(chunks)
}

如何在没有循环的情况下提取行块

4 个答案: