提取pdf数据并在R中创建多个数据框

时间:2019-09-18 14:29:43

标签: r

我有几个pdf文件,例如波纹管...

enter image description here

我每个pdf文件平均有24页。 我正在使用子字符串函数提取数据,因为您可以看到一个模式。 提取并转换每个变量后,我想为每个页面创建1个数据框。 乍一看,这似乎不是一个整洁的代码,但是由于我没有编码技能,所以我认为它很好。 我正在使用代码...

library(pdftools)
library(chron)

pdf_file<-pdf_text()
all_pages<-strsplit(pdf_file,"\n")

#Cleaning the first page(the first is standardized)

all_pages[[1]] <- all_pages[[1]][-c(1,2,3,4,5,6,7)] 

for( i in 2:length(all_pages)){
  all_pages[[i]] <- all_pages[[i]][-c(1,2,3,4,5,6,7,8)]
}

all_pages

tentativa <- as.data.frame()
while(j != length(all_pages)){

Remessa <- vector()
for ( i in 1:length(all_pages[[j]])){
  Remessa[i] <- substring(all_pages[[j]][i],3,8)
}
Remessa <- stripWhitespace(Remessa)
Remessa <- as.numeric(Remessa)

Conta <- vector()
for ( i in 1:length(all_pages[[j]])){
 Conta[i] <- substring(all_pages[[j]][i],13,19)
}
Conta <- stripWhitespace(Conta)
Conta <- as.numeric(Conta)

Atendimento_Paciente <- vector()
for ( i in 1:length(all_pages[[j]])){
  Atendimento_Paciente[i] <- substring(all_pages[[j]][i],27,33)
  Atendimento_Paciente <- stripWhitespace(Atendimento_Paciente)
}
Atendimento_Paciente <- as.numeric(Atendimento_Paciente)

Paciente <- vector()
for ( i in 1:length(all_pages[[j]])){
  Paciente[i] <- substring(all_pages[[j]][i],35,55)
  Paciente <- stripWhitespace(Paciente)
}

Data <- vector()
for ( i in 1:length(all_pages[[j]])){
  Data[i] <- substring(all_pages[[j]][i],64,73)
}
Data <- as.Date(Data, format = "%d/%m/%Y")

Hora <- vector()
for ( i in 1:length(all_pages[[j]])){
  Hora[i] <- substring(all_pages[[j]][i],75,79)
}
Hora <- times(format(as.POSIXct(Hora, format = "%H:%M"),"%H:%M:%S"))

HE <- vector()
for ( i in 1:length(all_pages[[j]])){
  HE[i] <- substring(all_pages[[j]][i],82,84)
  HE <- stripWhitespace(HE)
}

Procedimento <- vector()
for ( i in 1:length(all_pages[[j]])){
  Procedimento[i] <- substring(all_pages[[j]][i],99,106)
  Procedimento <- stripWhitespace(Procedimento)
} 
  Procedimento <- as.numeric(Procedimento)


Cod_AMB <- vector()
for ( i in 1:length(all_pages[[j]])){
  Cod_AMB[i] <- substring(all_pages[[j]][i],128,137)
  Cod_AMB <- stripWhitespace(Cod_AMB)
}  
  Cod_AMB <- as.numeric(Cod_AMB)


Vl_repasse <- vector()
for ( i in 1:length(all_pages[[j]])){
  Vl_repasse[i] <- substring(all_pages[[j]][i],163,168)
  Vl_repasse <- stripWhitespace(Vl_repasse)
}  
  Vl_repasse <- str_replace(Vl_repasse,",",".")
  Vl_repasse <- as.numeric(Vl_repasse)

tentativa[j] <- data.frame(Remessa = Remessa, Conta = Conta, Atendimento_Paciente = Atendimento_Paciente, Paciente = Paciente, Data = Data, Hora = Hora, HE = HE, Procedimento = Procedimento, Cod_AMB = Cod_AMB, Vl_repasse = Vl_repasse)

j <- j +1
}

但是我失败了!有什么建议么? 预先,我真的很感谢您的时间。

0 个答案:

没有答案