Question

所有R开发人员，

我维护着GENEAread软件包，并且最近在软件包中发现了一个错误，该错误来自于header.info函数。此功能旨在从Actigraphy手表GENEActiv读取存储在GENEActiv二进制文件中的标头信息。此信息存储在二进制文件的前100行中。

该函数中读取出错的部分使用了 scan() 函数。它直到最近一直工作正常，但由于 scan() 的输出现在会发生变化，header.info 读到的频率值的形式也与以前不同了。

下面是一些演示此问题的示例代码：

# Load the package and pick the example binary file that ships with it.
install.packages("GENEAread")
library(GENEAread)
binfile <- system.file("binfile/TESTfile.bin", package = "GENEAread")[1]

# Number of leading lines of the file to read when looking for the header.
nobs <- 300

info <- vector("list", 15)
#    index <- c(2, 20:22, 26:29)
tmpd <- readLines(binfile, nobs)

# Try to find the index positions of the section headings, so that multiple
# lines in the notes sections are accommodated.
# Change when a new version of binfile is produced.
ind.subinfo <- min(which(tmpd == "Subject Info" & seq_along(tmpd) >= 37))
ind.memstatus <- max(which(tmpd == "Memory Status"))
ind.recdata <- which(tmpd == "Recorded Data")
# Keep only the first two "Recorded Data" headings after "Memory Status".
ind.recdata <- ind.recdata[ind.recdata > ind.memstatus][1:2]
ind.calibdata <- max(which(tmpd == "Calibration Data"))
ind.devid <- min(which(tmpd == "Device Identity"))
ind.config <- min(which(tmpd == "Configuration Info"))
ind.trial <- min(which(tmpd == "Trial Info"))

# Line numbers of the header entries, given as offsets from the headings.
index <- c(ind.devid + 1, ind.recdata[1] + 8, ind.config + 2:3,
           ind.trial + 1:4, ind.subinfo + 1:7, ind.memstatus + 1)

# A missing heading yields Inf (from min), -Inf (from max) or NA (from
# indexing past the end), so reject anything that is not a finite position.
if (!all(is.finite(index))) {
  stop("Corrupt headers or not Geneactiv file!", call. = FALSE)
}

 # Read in header info.
 # Each indexed line is split on ":"; the first piece becomes the row name
 # (spaces replaced by "_") and everything after the first ":" is re-joined
 # as the value, so values that contain ":" themselves are kept intact.
 nm <- character(length(index))

 for (i in seq_along(index)) {
    line <- strsplit(tmpd[index[i]], split = ":")[[1]]
    el <- ""
    if (length(line) > 1) {
       el <- paste(line[-1], collapse = ":")
    }
    info[[i]] <- el
    nm[i] <- paste(strsplit(line[1], split = " ")[[1]], collapse = "_")
 }

 # One-column table of header values, with the entry names as row names.
 info <- as.data.frame(matrix(info), row.names = nm)
 colnames(info) <- "Value"

 # Default to "."; switch to "," if a comma appears in either of the two
 # lines located 8 and 9 lines after the "Memory Status" heading.
 Decimal_Separator <- "."

 if (grepl(",", paste(tmpd[ind.memstatus + 8:9], collapse = ""), fixed = TRUE)) {
   Decimal_Separator <- ","
 }

 info <- rbind(info,
               Decimal_Separator = Decimal_Separator)
 # More header details follow: calibration data etc.
 # (the original comments mark this part as guarded by `if (more){`).
 calibration <- list()

 # The connection is opened immediately in text-read mode ("rt"), so it stays
 # open and successive reads continue from the current position.
 fc = file(binfile, "rt")

 # Line positions to read next, as offsets from the section headings located
 # earlier, arranged in ascending order.
 index <- c(ind.config + 4,
            ind.calibdata + 1:8,
            ind.memstatus + 1,
            ind.recdata + 3,
            ind.recdata[1] + c(2, 8))
 index <- sort(index)


 #### First appearance in the function header.info of the function scan. ####
 # Original call, kept commented out for reference:
 # tmp <- substring(scan(fc,
 #                       skip = index[1] - 1,
 #                       what = "",
 #                       n = 3,
 #                       sep = " ",
 #                       quiet = TRUE)[3],
 #                  c(1,2,5),
 #                  c(1, 3, 6))

 # Isolating scan and running multiple times #
 # Each call skips index[1] - 1 lines, reads at most three space-separated
 # character fields and keeps the third one.
 # NOTE: fc is an already-open connection, so every call carries on reading
 # from where the previous call stopped rather than from the top of the file.
 # The identical-looking calls below therefore read different parts of the
 # file and return different values.

scan(fc,
     skip = index[1] - 1,
     what = "",
     n = 3,
     sep = " ",
     quiet = TRUE)[3]

scan(fc,
     skip = index[1] - 1,
     what = "",
     n = 3,
     sep = " ",
     quiet = TRUE)[3]

scan(fc,
     skip = (index[1] - 1),
     what = "",
     n = 3,
     sep = " ",
     quiet = TRUE)[3]

#### Checking the same thing happens with the substring ####
# Checking by using 3.4.3 possibly
# NOTE(review): "3.4.3" presumably refers to an R version - confirm.
# Same scan() call as above, wrapped in substring() to extract characters
# 1, 2-3 and 5-6 of the third field. The read position of fc keeps advancing
# here as well, so the three results differ from one another.
substring(scan(fc,
               skip = index[1] - 1,
               what = "",
               n = 3,
               sep = " ",
               quiet = TRUE)[3],
          c(1,2,5),
          c(1, 3, 6))

substring(scan(fc,
               skip = index[1] - 1,
               what = "",
               n = 3,
               sep = " ",
               quiet = TRUE)[3],
          c(1,2,5),
          c(1, 3, 6))

substring(scan(fc,
               skip = index[1] - 1,
               what = "",
               n = 3,
               sep = " ",
               quiet = TRUE)[3],
          c(1,2,5),
          c(1, 3, 6))

为什么 scan() 函数的输出会变化？我已经运行了 scan 帮助页面上给出的示例，多次运行该代码时输出都是相同的。在运行该函数之前的这些步骤中，是什么导致了输出发生变化？

任何帮助将不胜感激。

Answer 1

您使用以下方式打开了fc连接

# "rt" opens the connection right away, for reading in text mode.
fc = file(binfile, "rt")

这意味着 scan() 从该连接读取之后，连接仍保持打开状态，文件指针停留在上一次读取结束的位置。因此每次调用 scan() 时，读取的都是文件中更靠后的部分。这就是结果不同的原因。

如果您希望始终读取文件的同一部分，则可以执行以下操作：

# Rewind the open connection to the start of the file before every read.
# `...` stands for the remaining scan() arguments.
seek(fc, 0)
scan(fc, ...)

seek(fc, 0)
scan(fc, ...)

或者，在创建 fc 时不要打开它，这样 scan() 每次都会自行打开并关闭该连接。您可以这样写：

# Created but not opened: scan() opens and closes the connection on each call.
fc <- file(binfile)  # No open specified

或更简单（但效率略有降低）

# Pass the file path itself; a new connection is created on every call.
fc <- binfile

每次都会创建一个新的连接。

scan() 函数的输出有所不同

1 个答案: