Question

我正在用 R 把一些大型文本文件读入数据库，但这些文件包含对数据库软件而言非法的字段名称。大文本文件的列名只出现在第一行——是否可以只编辑第一行，而不必循环遍历文件中的每一行（那样似乎很浪费资源）？

以下是我用一些示例数据尝试的两种方法。第一种把所有内容读入内存（RAM），因此对我的大型数据表不适用。第二种可行，但很慢，因为它要处理文件中的每一行。

解决方案必须能够跨平台工作，并且不需要安装外部软件（R 软件包除外），这一点很重要，因为我要与其他人共享此脚本，不想让他们执行不必要的额外步骤。我正在寻找最简单的方法 :)

# set up a pair of scratch files: one for input, one for output
tf <- tempfile()
tf2 <- tempfile()

# dump the mtcars data frame to disk as csv
write.csv(mtcars, tf)

# peek at the header plus the first two records
readLines(tf, n = 3)

# slurp every line of the file into memory
all_lines <- readLines(tf)

# rename the one column of interest, touching the header line only
all_lines[1] <- gsub("disp", "newvar", all_lines[1])

# dump the full set of lines into the second file
writeLines(all_lines, tf2)

# check that the header now carries the new name
readLines(tf2, 2)
# done!

# # # # # # # OR

# start from a clean slate by deleting the previous output file
file.remove(tf2)

# open the source file for reading and the destination file for writing
incon <- file(tf, "r")
outcon <- file(tf2, "w")

# pull lines off the input one by one until the file is exhausted
repeat {
  one.line <- readLines(incon, 1)
  if (length(one.line) == 0) {
    break
  }

  # apply the replacement to the current line (this runs for every line)
  # and pass the result on to the output file
  writeLines(gsub("disp", "newvar", one.line), outcon)
}

# release both connections
close(incon)
close(outcon)

# check that the header now carries the new name
readLines(tf2, 2)
# done!

Answer 1

你用错了工具，请改用命令行工具。例如使用 sed，类似 sed -i '1 s/disp/newvar/' file 这样的命令应该就可以了。如果您必须在 R 中执行此操作，请使用：

# Rename the column in the header (line 1 only) by editing the file in place
# with the external `sed` tool.
filename <- "myfile"

# shQuote() keeps paths containing spaces or shell metacharacters intact;
# system() runs the command and returns its exit status.
# NOTE(review): `-i` without a suffix is GNU sed syntax; BSD/macOS sed
# expects `-i ''` -- confirm for the target platform.
system(paste("sed -i '1 s/disp/newvar/'", shQuote(filename)))

这是一个特定于Windows的版本：

# path of the file to fix, plus two scratch files for the pieces
filename <- "myfile"
tf1 <- tempfile()
tf2 <- tempfile()

# grab the header line, rename the column and store the result in tf1
header <- gsub("disp", "newvar", readLines(filename, n = 1))
writeLines(header, tf1)

# have `more` send the remainder of the file to tf2
# NOTE(review): `+1` is presumably there to skip the header line -- confirm
skip_cmd <- paste("more ", filename, " +1 > ", tf2)
scan(pipe(skip_cmd))

# tack the remainder (tf2) onto the end of the new header (tf1)
file.append(tf1, tf2)

# tf1 now has what you want

Answer 2

为什么不只编辑标题行，然后分块读取其余部分？我不知道这个文件有多大，但或许可以按若干行一块来读取（我猜了个 10000 行）。根据您拥有的内存量，可以把这个值调大或调小。

## setup: write the mtcars data to a temporary file to act as the input
tf <- tempfile()
tf2 <- tempfile()
write.csv(mtcars, tf)

# number of lines copied per iteration; raise or lower to suit available memory
chunk_size <- 10000L

fr <- file(tf, open = "rt") # read connection to the input file
fw <- file(tf2, open = "wt") # write connection to the output file

# Read the header from the open read connection `fr`, so that the loop
# below continues from the second line of the file.
header <- readLines(fr, n = 1)
header <- gsub("disp", "newvar", header) # rename the column in the header
writeLines(header, con = fw)

# Copy the rest of the file unchanged, one chunk at a time. The variable is
# named `chunk` so that it does not mask base::body().
while (length(chunk <- readLines(fr, n = chunk_size)) > 0) {
  writeLines(chunk, fw)
}

close(fr)
close(fw)
# unlink(tf); unlink(tf2) # delete temporary files

它应该更快，因为 R 每 10000 行才执行一次 while 循环，而不是每一行执行一次。此外，R 只会在您需要的那一行上调用 gsub，而不是对每一行都调用，从而节省运行时间。可以这么说，R 无法“就地”编辑文件，因此无法避免读取并复制整个文件。如果必须在 R 中完成，那就在内存允许的范围内把块设得尽可能大，然后把文件完整地过一遍。

我发现两种方式之间的性能差异是3倍：

# test file creation: n_copies back-to-back copies of the mtcars CSV, each
# one 33 lines long (1 header line + 32 data rows), i.e. ~33M lines for 1e6
n_copies <- 1e6
tf <- tempfile()
tf2 <- tempfile()
fw <- file(tf, open = "wt")
# a plain loop, since write.csv() is called only for its side effect; sapply()
# would collect (and auto-print at top level) a million NULL results
for (i in seq_len(n_copies)) {
  write.csv(mtcars, fw)
}
close(fw)

# my way: edit the header once, then copy the remainder in large chunks
system.time({
  fr <- file(tf, open = "rt") # read connection to the input file
  fw <- file(tf2, open = "wt") # write connection to the output file
  # read the header from the open connection `fr` so the loop below resumes
  # at the second line
  header <- readLines(fr, n = 1)
  header <- gsub("disp", "newvar", header) # rename the column in the header
  writeLines(header, con = fw)
  # copy everything else unchanged, 10000 lines at a time
  while (length(chunk <- readLines(fr, n = 10000)) > 0) {
    writeLines(chunk, fw)
  }
  close(fr)
  close(fw)
})
#   user  system elapsed 
#  32.96    1.69   34.85 

# OP's way: pull one line at a time and run the substitution on each of them
system.time({
  incon <- file(tf, "r")
  outcon <- file(tf2, "w")
  repeat {
    one.line <- readLines(incon, 1)
    if (length(one.line) == 0) {
      break
    }
    writeLines(gsub("disp", "newvar", one.line), outcon)
  }
  close(incon)
  close(outcon)
})
#   user  system elapsed 
# 104.36    1.92  107.03

Answer 3

你试过了吗？

dplyr

这只会覆盖第一行，而且根据它管理系统资源的方式，可能非常高效。请务必先备份。

如何使用R编辑或修改或更改大文本文件中的单行

3 个答案: