Question

我使用read.table()将文本文件读入R，从而生成包含一列未解析数据的数据框。

我还有一个数据字典，其中包含文本文件每一行的列名及其开头和结尾字符位置。

这是我用一个小例子解析文本文件的方法：

library(data.table)

# Raw input: one unparsed fixed-width record per row.
df <- data.frame(
  parse = c("123qweASD", "234werSDF", "345ertDFG"),
  stringsAsFactors = FALSE
)

# Data dictionary: field name plus first and last character position.
guide <- data.frame(
  name = c("c1", "c2", "c3"),
  begin = c(1, 3, 6),
  end = c(2, 5, 9)
)

# Placeholder table: one row per record, one column per field, with every
# cell holding an empty string.
emptyDF <- data.frame(matrix(NA, nrow = nrow(df), ncol = nrow(guide)))
names(emptyDF) <- as.character(guide[[1]])
emptyDF[is.na(emptyDF)] <- ""

# Convert the placeholder to a data.table so rows can be filled with `:=`.
setDT(emptyDF)

# Fill the placeholder one record at a time. seq_len() turns both loops into
# no-ops on empty input, where 1:nrow(x) would iterate over c(1, 0).
n_fields <- nrow(guide)
for (y in seq_len(nrow(df))) {
  # Pre-sized buffer instead of growing a vector with c(); the name also
  # avoids masking base::split().
  fields <- character(n_fields)
  for (z in seq_len(n_fields)) {
    # Columns 2 and 3 of the guide are the begin/end character positions.
    fields[z] <- substr(df[y, ], guide[z, 2], guide[z, 3])
  }
  emptyDF <- emptyDF[y, names(emptyDF) := as.list(fields)]
}

结果如下：

> emptyDF
   c1  c2   c3
1: 12 3qw eASD
2: 23 4we rSDF
3: 34 5er tDFG

为了避免逐行追加并加快速度，我先创建了一个与未解析数据行数相同的空 data.table，然后用解析后的结果逐行替换其中的行。

该方法有效，但运行包含200,000行和90列的文件需要一段时间。

我还能做些什么来加快速度吗？

Answer 1

如果您知道文本文件中每个字段的开始和结束字符位置，那么肯定应该使用基础 R 中的 read.fwf（参见 ?read.fwf）。fread 没有此类功能，但您可以在读入之后再把结果转换为 data.table……

# Write the three sample records to a fixed-width text file.
writeLines(
  c("123qweASD", "234werSDF", "345ertDFG"),
  con = "tmpfwf.txt"
)

# Data dictionary: field name plus first and last character position.
guide <- data.frame(
  name = c("c1", "c2", "c3"),
  begin = c(1, 3, 6),
  end = c(2, 5, 9)
)

# read.fwf() takes field widths, so derive them from the inclusive
# begin/end positions, then show the result as a data.table.
df <- read.fwf("tmpfwf.txt", widths = guide$end - guide$begin + 1)
data.table::data.table(df)
##    V1  V2   V3
## 1: 12 3qw eASD
## 2: 23 4we rSDF
## 3: 34 5er tDFG

Answer 2

试试 readr::read_fwf 怎么样？它速度更快，对错误也更健壮。

以 Warner（华纳）回答中的大数据集为例：

# Write the single column of df1 to disk, one record per line, then time
# read_fwf() on that file using field widths derived from guide1.
writeLines(as.character(unlist(df1[1])), con="df1.txt")
system.time({
  a4 <- read_fwf("df1.txt", fwf_widths(guide1$end+1-guide1$begin))
})

  user  system elapsed 
4.398    0.390   4.975

# Time read.fwf() on the same file with the same field widths.
system.time({
  a2 <- read.fwf("df1.txt", guide1$end+1-guide1$begin)
})

   user  system elapsed 
101.583  83.111 190.462

# Time the substring() approach: split every string in df1$v1 at the guide1
# begin/end positions, row-bind the pieces, name the columns from
# guide1$names and convert to a data.table. This works on df1 directly
# rather than on df1.txt.
system.time({
a3 <- setDT(setNames(do.call(rbind.data.frame, 
                     Map(substring, df1$v1, list(guide1$begin), list(guide1$end))), guide1$names))[]
})

   user  system elapsed 
140.660   2.790 147.322

最后这个计时并不具有可比性，因为它不是从文件中读取数据，而是直接从内存中读取。要做真正公平的比较，还需要把 readLines 之类的读取时间也加进去。

Answer 3

编辑以包含第3种方法

下面比较我的方法与 @Ben Bolker 和 @akrun 的方法的耗时。我平时不常做速度比较，所以也许有更好的方式来设置这个测试，欢迎提出建议。

我创建了一个包含 500 行的示例和一个包含 250,000 行的示例，并分别考察将它们拆分成 10 列和 100 列所需的时间。

方法1：使用for循环填充空data.table()。

方法2： read.fwf()

方法3： setDT() substring

以下是这些方法的耗时（单位：分钟）：

> results
                   DataSize   Approach1   Approach2   Approach3
1      500 Rows, 10 Columns  0.01934705 0.002605216 0.001200851
2     500 rows, 100 Columns  0.07892265 0.028603617 0.014927268
3  250,000 Rows, 10 Columns  6.84735728 1.527935565 1.585325948
4 250,000 rows, 100 Columns 37.34443290 8.075678647 4.172232886

read.fwf() 显然比我的方法中使用的 for 循环更快，而 substring 方法总体上最快。有趣的是，随着列数和行数的增加，这三种方法耗时的增长比例各不相同：

> # Time factor increase with column and row increases
> scaling
                                     Increase  Approach1  Approach2   Approach3
1   500 Rows: Increase from 10 to 100 Columns   4.079311  10.979366   12.430577
2  250k Rows: Increase from 10 to 100 Columns   5.453846   5.285353    2.631782
3  10 Columns: Increase from 500 to 250k Rows 353.922518 586.490999 1320.168952
4 100 Columns: Increase from 500 to 250k Rows 473.177640 282.330677  279.504118

当列数或行数较少时，空 data.table 方法的耗时增长倍数似乎比 read.fwf() 和 substring 方法更小（即扩展性更好）。有人知道这是为什么吗？

另一个想法：我的数据集比这里最大的例子少了一些列和行。但解析需要将近一个小时。我的数据集中的每一行有700-800个字符，结果列的大小各不相同。这是性能和速度的另一个方面，值得考虑。

以下是我如何设置它。

使用随机字符串和指南表设置表格

library(stringi)

# 250,000 random alphanumeric strings of 200 characters each, one per row.
df1 <- data.frame(
  v1 = stri_rand_strings(n = 250000, length = 200, pattern = "[A-Za-z0-9]"),
  stringsAsFactors = FALSE
)

# First 500 records. drop = FALSE keeps the one-column data frame and its
# column name "v1"; without it the subset collapses to a bare vector and
# as.data.frame() has to invent a column name for it.
df2 <- df1[1:500, , drop = FALSE]

# 100 fields of two characters each, covering positions 1-200.
guide1 <- data.frame(
  names = paste0("c", 1:100),
  begin = 2 * (1:100) - 1,
  end = 2 * (1:100),
  stringsAsFactors = FALSE
)

# 10 fields of twenty characters each, covering positions 1-200.
guide2 <- data.frame(
  names = paste0("c", 1:10),
  begin = 20 * (0:9) + 1,
  end = 20 * (1:10),
  stringsAsFactors = FALSE
)

为三种方法分别定义函数

# Approach 1: fill a pre-allocated data.table one record at a time.
#
# emptydf: data.table with one row per record of `df` and one column per
#          row of `guide`; row y is overwritten with the fields of record y.
# df:      data frame holding the unparsed strings; each record is read as
#          df[y, ] (assumes a single column -- TODO confirm for other input).
# guide:   data frame whose 2nd and 3rd columns are the first and last
#          character position of each field.
# Returns `emptydf` after all rows have been filled.
approach1 <- function(emptydf, df, guide) {
  n_fields <- nrow(guide)
  # seq_len() makes the loops no-ops on empty input, where 1:nrow(x) would
  # iterate over c(1, 0).
  for (y in seq_len(nrow(df))) {
    # Pre-sized buffer instead of growing a vector with c(); the name also
    # avoids masking base::split().
    fields <- character(n_fields)
    for (z in seq_len(n_fields)) {
      fields[z] <- substr(df[y, ], guide[z, 2], guide[z, 3])
    }
    emptydf <- emptydf[y, names(emptydf) := as.list(fields)]
  }
  emptydf
}


# Approach 2: parse a fixed-width file with read.fwf().
#
# path:  path of the text file, one record per line.
# guide: data frame with numeric `begin` and `end` columns giving the first
#        and last character position of each field.
# Returns the data frame produced by read.fwf(), one column per guide row.
approach2 <- function(path, guide) {
  # Positions are inclusive, so width = end - begin + 1. The result is
  # returned directly; assigning it to an unused local as the last statement
  # made the function return invisibly.
  read.fwf(path, widths = guide$end + 1 - guide$begin)
}


# Approach 3: split every string with substring() and bind the pieces.
#
# df:    data frame whose `v1` column holds the unparsed strings.
# guide: data frame with `begin`, `end` and `names` columns describing the
#        fields.
# Returns a data.table with one row per string and one column per field.
approach3 <- function(df, guide) {
  # One character vector of fields per input string.
  pieces <- Map(substring, df$v1, list(guide$begin), list(guide$end))
  # Stack the per-string vectors into rows of a data frame.
  parsed <- do.call(rbind.data.frame, pieces)
  setDT(setNames(parsed, guide$names))[]
}

方法1：使用 for 循环填充空 data.table：

# Placeholder for the largest case: one row per record of df1, one column
# per field of guide1, every cell set to an empty string.
emptydf1 <- data.frame(matrix(ncol = nrow(guide1), 
                              nrow = nrow(df1)))
colnames(emptydf1) <- as.character(unlist(guide1[1]))
emptydf1[is.na(emptydf1)] <- ""

# Smaller placeholders cut from the full one: first 10 columns, first 500
# rows, and first 500 rows by first 10 columns.
emptydf2 <- as.data.frame(emptydf1[, 1:10])
emptydf3 <- as.data.frame(emptydf1[1:500,])
emptydf4 <- as.data.frame(emptydf1[1:500,1:10])

# Convert all four placeholders to data.tables so approach1() can fill
# them with `:=`.
# NOTE(review): emptydf2 is a column-only subset of emptydf1 -- confirm the
# two do not end up sharing column memory, since `:=` updates in place.
setDT(emptydf1)
setDT(emptydf2)
setDT(emptydf3)
setDT(emptydf4)

# Time approach1() on each data size by taking a Sys.time() stamp
# immediately before and after the call (a0/a1 ... d0/d1).
## 500 rows and 10 columns
a0 <- Sys.time()
app1Out1 <- approach1(emptydf4, df2, guide2)
a1 <- Sys.time()
## 500 rows and 100 columns
b0 <- Sys.time()
app1Out2 <- approach1(emptydf3, df2, guide1)
b1 <- Sys.time()
## 250,000 rows and 10 columns
c0 <- Sys.time()
app1Out3 <- approach1(emptydf2, df1, guide2)
c1 <- Sys.time()
## 250,000 rows and 100 columns
d0 <- Sys.time()
app1Out4 <- approach1(emptydf1, df1, guide1)
d1 <- Sys.time()

方法2： read.fwf()

# Write both data sets to disk, one record per line, because approach2()
# reads from a file path.
writeLines(as.character(unlist(df1[1])), con="df1.txt")
writeLines(as.character(unlist(df2[1])), con="df2.txt")

# Time approach2() on each data size with Sys.time() stamps (e0/e1 ... h0/h1).
## 500 rows and 10 columns
e0 <- Sys.time()
app2Out1 <- approach2("df2.txt", guide2)
e1 <- Sys.time()
## 500 rows and 100 columns
f0 <- Sys.time()
app2Out2 <- approach2("df2.txt", guide1)
f1 <- Sys.time()
## 250,000 rows and 10 columns
g0 <- Sys.time()
app2Out3 <- approach2("df1.txt", guide2)
g1 <- Sys.time()
## 250,000 rows and 100 columns
h0 <- Sys.time()
app2Out4 <- approach2("df1.txt", guide1)
h1 <- Sys.time()

方法3： setDT() substring

# approach3() reads the strings from df$v1, so make sure df2's single
# column carries that name.
names(df2) <- "v1"

# Time approach3() on each data size with Sys.time() stamps (i0/i1 ... l0/l1).
## 500 rows and 10 columns
i0 <- Sys.time()
app3Out1 <- approach3(df2, guide2)
i1 <- Sys.time()
## 500 rows and 100 columns
j0 <- Sys.time()
app3Out2 <- approach3(df2, guide1)
j1 <- Sys.time()
## 250,000 rows and 10 columns
k0 <- Sys.time()
app3Out3 <- approach3(df1, guide2)
k1 <- Sys.time()
## 250,000 rows and 100 columns
l0 <- Sys.time()
app3Out4 <- approach3(df1, guide1)
l1 <- Sys.time()

设置结果表

# Labels for the four benchmark sizes, in the order the timings were taken.
tests <- c(
  "500 Rows, 10 Columns", "500 rows, 100 Columns",
  "250,000 Rows, 10 Columns", "250,000 rows, 100 Columns"
)

# Elapsed minutes between two Sys.time() stamps. Subtracting the stamps
# directly yields a difftime whose unit (secs, mins, ...) is chosen
# automatically from the magnitude, so dividing some differences by 60 and
# not others only works if each run happens to land in the expected unit.
# Asking for minutes explicitly keeps every entry on the same scale.
elapsed_mins <- function(start, end) {
  as.numeric(difftime(end, start, units = "mins"))
}

app1 <- c(elapsed_mins(a0, a1), elapsed_mins(b0, b1),
          elapsed_mins(c0, c1), elapsed_mins(d0, d1))
app2 <- c(elapsed_mins(e0, e1), elapsed_mins(f0, f1),
          elapsed_mins(g0, g1), elapsed_mins(h0, h1))
app3 <- c(elapsed_mins(i0, i1), elapsed_mins(j0, j1),
          elapsed_mins(k0, k1), elapsed_mins(l0, l1))

# One row per data size, one timing column (in minutes) per approach.
results <- data.frame(
  "DataSize" = tests,
  "Approach1" = app1,
  "Approach2" = app2,
  "Approach3" = app3
)

# Growth factors: how many times longer each approach takes when the column
# count (10 -> 100) or the row count (500 -> 250k) goes up. Every entry is
# results[numerator row] / results[denominator row] for that approach, with
# numerator rows 2, 4, 3, 4 and denominator rows 1, 3, 1, 2.
scaling <- data.frame(
  "Increase" = c(
    "500 Rows: Increase from 10 to 100 Columns",
    "250k Rows: Increase from 10 to 100 Columns",
    "10 Columns: Increase from 500 to 250k Rows",
    "100 Columns: Increase from 500 to 250k Rows"
  ),
  "Approach1" = results[c(2, 4, 3, 4), 2] / results[c(1, 3, 1, 2), 2],
  "Approach2" = results[c(2, 4, 3, 4), 3] / results[c(1, 3, 1, 2), 3],
  "Approach3" = results[c(2, 4, 3, 4), 4] / results[c(1, 3, 1, 2), 4]
)

Answer 4

以下是使用 substring 的另一个选项：

library(data.table)
# Split every string in df$parse at the guide's begin/end positions with
# substring(), row-bind the per-string pieces into a data frame, name the
# columns from guide$name, and convert the result to a data.table.
setDT(setNames(do.call(rbind.data.frame, Map(substring, df$parse, 
         list(guide$begin), list(guide$end))), guide$name))[] 
#   c1  c2   c3
#1: 12 3qw eASD
#2: 23 4we rSDF
#3: 34 5er tDFG

加速解析文本到R中的data.table

4 个答案: