如何正确读取固定宽度格式的文件

时间:2019-03-26 19:18:47

标签: r fixed-width read.fwf

我对 R 和 RStudio 都还比较陌生,正在尝试重新整理一个文本文件的格式,以便对其中的数据做一些分析。目前我想用 read.fwf 来整理数据,但似乎哪里做错了,导致各种报错。我已经参考过一些资料和指南,但仍然没能解决。有什么建议吗?(下面依次是代码、文本文件中的数据示例以及所需的格式。)

当前代码:

library(readr)
library(tidyr)
read.fwf("AK_JAN_2017_TMAS_", widths =c(1,2,6,1,1,2,2,2,2,2,3,4,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3), header = false, sep = "", row.names = c("Record Type", "FIPS", "Station ID", "Direction of Travel code", "Lane of Travel", "Year of Data", "Month of Data", "Day of Data", "Hour of Data", "Vehicle Class", "Open", "Total Weight of Vehicle", "Number of Axles", "A-axle Weight", "A-B Axle Spacing", "B-axle Weight", "B-C Axle Spacing", "C-axle Weight", "C-D Axle Spacing", "D-axle Spacing", "D-E Axle Spacing", "E-axle Weight", "E-F Axle Spacing", "F-axle Weight", "F-G Axle Spacing", "G-axle Weight", "G-H Axle Spacing", "H-axle Weight", "H-I Axle Spacing", "I-axle Weight", "I-J Axle Spacing", "J-axle Weight", "J-K Axle Spacing", "K-axle Weight", "K-L Axle Spacing", "L-axle Weight", "L-M Axle Spacing", "M-axle Weight"), col.names = NULL, n = -1, buffersize = 2000, fileencoding = "" )

文本文件示例:

  

W02000103311701021610061031206057054056013054096054054053053015038   W02000103311701021606055024403039038084005121   W02000103311701021609067028505040038054013065104062012064   W02000103311701021705073004302024043019   W02000103311701021710066045606055070075015088094085018086018067   W02000103311701021710080044706052069075015087096083018085018065   W02000103311701021805076007402034056040   W02000103311701021805076004802025043023   W02000103311701021905077002402010051014   W02000103311701021905072004702026042021   W020001033117010219060440203003068053067015068   W02000103311701022006066014803057045049014042   W02000103311701022006053012903058041038014033   W02000103311701022005060003702020043017   W02000103311701022006063009503046047023014026   W020001033117010221050720062006602036060030   W02000103311701022206068017703045050059015073   W0200010331170102230506500500202033037036   W02000103311701030005066008802032038056   W02000103311701030305066008202037063045

所需的数据格式:

(原帖此处为一张展示所需数据格式的图片,本页未包含该图片。)

问题:

 library(readr)
 library(tidyr)
 AK_JAN_2017_TMAS_ <- read.table("~/R Studio Sessions/AK_JAN_2017_TMAS_.txt", quote="\"", comment.char="")
   View(AK_JAN_2017_TMAS_)
 left<-c(2,4,10,11,12,14,16,18,20,22,25,29,31,34,37,40,43,46,49,52,55,58,61,64,67,70,73,76,79,82,85,88,91,94,97,100,103)
 right<-c(3,9,10,11,13,15,17,19,21,24,28,30,33,36,39,42,45,48,51,54,57,60,63,66,69,72,75,78,81,84,87,90,93,96,99,102,105)
 df <- data.frame(matrix(numeric(length(x)*length(left)),ncol=length(left)))
Error in numeric(length(x) * length(left)) : object 'x' not found
 for (i in 1:length(input.set)) {
     stop <- nchar(x[i])
     for (j in 1:length(left)) {
         df[i,j] <- as.numeric(substr(input.set[i], left[j], right[j]))
         if (right[j] ==  stop) break
     }
 }
Error in length(input.set) : object 'input.set' not found
 df <- data.frame(AK_JAN_2017_TMAS_(numeric(length(x)*length(left)),ncol=length(left)))
Error in AK_JAN_2017_TMAS_(numeric(length(x) * length(left)), ncol = length(left)) : 
  could not find function "AK_JAN_2017_TMAS_"
 for (i in 1:length(input.set)) {
     stop <- nchar(x[i])
     for (j in 1:length(left)) {
         df[i,j] <- as.numeric(substr(input.set[i], left[j], right[j]))
         if (right[j] ==  stop) break
     }
 }

 df <- data.frame(matrix(numeric(length(x)*length(left)),ncol=length(left)))
Error in numeric(length(x) * length(left)) : object 'x' not found
 for (i in 1:length('AK_JAN_2017_TMAS_'.set)) {
Error: unexpected symbol in "for (i in 1:length('AK_JAN_2017_TMAS_'.set"
     stop <- nchar(x[i])
Error in nchar(x[i]) : object 'x' not found
     for (j in 1:length(left)) {
         df[i,j] <- as.numeric(substr(input.set[i], left[j], right[j]))
         if (right[j] ==  stop) break
     }
Error in substr(input.set[i], left[j], right[j]) : 
  object 'input.set' not found
 }
Error: unexpected '}' in "}"


以及我尝试做的一些修改:

     df <- data.frame(matrix(numeric(length(x)*length(left)),ncol=length(left)))
    for (i in 1:length('AK_JAN_2017_TMAS_'.set)) {
        stop <- nchar(x[i])
        for (j in 1:length(left)) {
            df[i,j] <- as.numeric(substr(input.set[i], left[j], right[j]))
            if (right[j] ==  stop) break
        }
    }

1 个答案:

答案 0 :(得分:1)

各行长度不一致可能是问题所在。您可以按如下方式逐行处理:

构造两个向量,分别给出每个字段的起始列和结束列(跳过行首的“W”)。它们将用于截取各变量对应的子字符串。

# Column layout of one fixed-width record. Column 1 (the leading "W") is
# skipped, so the first extracted field starts at column 2. The widths below
# are in record order: twelve leading fields followed by 25 three-character
# fields, 37 fields in total.

# Last column of each field: running total of the field widths, shifted by
# the one skipped leading column.
right <- 1 + cumsum(c(2, 6, 1, 1, 2, 2, 2, 2, 2, 3, 4, 2, rep(3, 25)))

# First column of each field: column 2 for the first field, and one past the
# previous field's last column for every later field.
left <- c(2, right[-length(right)] + 1)

对输入的每一行进行循环,把截取出的各子字符串转换为数字。为避免产生 NA,只应处理到该行实际包含的最后一个字段为止,因此根据每行字符串的字符数设置一个哨兵变量(stop)。这里的 input.set 是按行读入的字符向量(每个元素对应一条记录),例如可以用 readLines() 读取数据文件得到;下面的代码不会自行创建它。

# Parse every fixed-width record in `input.set` into one row of numeric fields.
#
# Inputs (must already exist; this snippet does not create them):
#   input.set   - character vector with one raw record per element, e.g. the
#                 result of readLines() on the data file.
#   left, right - first and last column of each field, same length.
# Output:
#   df - data frame with one row per record and one numeric column per field
#        (default column names X1, X2, ...). Fields that a record is too
#        short to contain stay 0.

# Fill a plain numeric matrix and convert it once at the end; assigning single
# cells of a data frame inside a loop is far slower.
vals <- matrix(0, nrow = length(input.set), ncol = length(left))

# seq_along() yields an empty sequence for empty input, whereas
# 1:length(input.set) would iterate over c(1, 0).
for (i in seq_along(input.set)) {
    # Sentinel: number of characters actually present in this record.
    # (The variable only shadows the name of base::stop(); calls to stop()
    # are unaffected.)
    stop <- nchar(input.set[i])

    # A missing record has no length; leave its row at 0.
    if (is.na(stop)) next

    # Only fields lying completely inside the record are extracted. Comparing
    # with `<=` rather than testing for an exact end-of-field match means a
    # line that is empty or cut off mid-field never produces NA or a value
    # parsed from a partial field.
    present <- right <= stop
    if (any(present)) {
        # substring() is vectorised over the start/end positions, so all
        # fields of the record are cut out in a single call.
        vals[i, present] <- as.numeric(
            substring(input.set[i], left[present], right[present])
        )
    }
}

df <- data.frame(vals)

然后可以为列添加名称。

# Column labels for the 37 extracted fields, in record order.
nvals <- c(
    "FIPS", "StaID", "Dir", "Lane", "Year", "Month", "Day", "Hour", "Class",
    "Open", "TotW", "Axles",
    # Axles A..L each contribute a "W"/"Sp" pair (AW, ASp, BW, BSp, ...).
    # c() reads the two-row matrix column by column, which interleaves the
    # two label sets.
    c(rbind(paste0(LETTERS[1:12], "W"), paste0(LETTERS[1:12], "Sp"))),
    # The final axle (M) has a "W" column only.
    "MW"
)

# Attach the labels to the parsed data frame.
names(df) <- nvals

以下是结果数据框中的几行:

  FIPS StaID Dir Lane Year Month Day Hour Class Open TotW Axles AW ASp BW BSp  CW
1    2   103   3    1   17     1   2   16    10   61  312     6 57  54 56  13  54
2    2   103   3    1   17     1   2   16     6   55  244     3 39  38 84   5 121
3    2   103   3    1   17     1   2   16     9   67  285     5 40  38 54  13  65
  CSp DW DSp EW ESp FW FSp GW GSp HW HSp IW ISp JW JSp KW KSp LW LSp MW
1  96 54  15 53  15 38   0  0   0  0   0  0   0  0   0  0   0  0   0  0
2   0  0   0  0   0  0   0  0   0  0   0  0   0  0   0  0   0  0   0  0
3 104 62  12 64   0  0   0  0   0  0   0  0   0  0   0  0   0  0   0  0