我有几个pdf文件,例如波纹管...
我每个pdf文件平均有24页。 我正在使用子字符串函数提取数据,因为您可以看到一个模式。 提取并转换每个变量后,我想为每个页面创建1个数据框。 乍一看,这似乎不是一个整洁的代码,但是由于我没有编码技能,所以我认为它很好。 我正在使用代码...
library(pdftools)
library(chron)
pdf_file<-pdf_text()
all_pages<-strsplit(pdf_file,"\n")
#Cleaning the first page(the first is standardized)
all_pages[[1]] <- all_pages[[1]][-c(1,2,3,4,5,6,7)]
for( i in 2:length(all_pages)){
all_pages[[i]] <- all_pages[[i]][-c(1,2,3,4,5,6,7,8)]
}
all_pages
tentativa <- as.data.frame()
while(j != length(all_pages)){
Remessa <- vector()
for ( i in 1:length(all_pages[[j]])){
Remessa[i] <- substring(all_pages[[j]][i],3,8)
}
Remessa <- stripWhitespace(Remessa)
Remessa <- as.numeric(Remessa)
Conta <- vector()
for ( i in 1:length(all_pages[[j]])){
Conta[i] <- substring(all_pages[[j]][i],13,19)
}
Conta <- stripWhitespace(Conta)
Conta <- as.numeric(Conta)
Atendimento_Paciente <- vector()
for ( i in 1:length(all_pages[[j]])){
Atendimento_Paciente[i] <- substring(all_pages[[j]][i],27,33)
Atendimento_Paciente <- stripWhitespace(Atendimento_Paciente)
}
Atendimento_Paciente <- as.numeric(Atendimento_Paciente)
Paciente <- vector()
for ( i in 1:length(all_pages[[j]])){
Paciente[i] <- substring(all_pages[[j]][i],35,55)
Paciente <- stripWhitespace(Paciente)
}
Data <- vector()
for ( i in 1:length(all_pages[[j]])){
Data[i] <- substring(all_pages[[j]][i],64,73)
}
Data <- as.Date(Data, format = "%d/%m/%Y")
Hora <- vector()
for ( i in 1:length(all_pages[[j]])){
Hora[i] <- substring(all_pages[[j]][i],75,79)
}
Hora <- times(format(as.POSIXct(Hora, format = "%H:%M"),"%H:%M:%S"))
HE <- vector()
for ( i in 1:length(all_pages[[j]])){
HE[i] <- substring(all_pages[[j]][i],82,84)
HE <- stripWhitespace(HE)
}
Procedimento <- vector()
for ( i in 1:length(all_pages[[j]])){
Procedimento[i] <- substring(all_pages[[j]][i],99,106)
Procedimento <- stripWhitespace(Procedimento)
}
Procedimento <- as.numeric(Procedimento)
Cod_AMB <- vector()
for ( i in 1:length(all_pages[[j]])){
Cod_AMB[i] <- substring(all_pages[[j]][i],128,137)
Cod_AMB <- stripWhitespace(Cod_AMB)
}
Cod_AMB <- as.numeric(Cod_AMB)
Vl_repasse <- vector()
for ( i in 1:length(all_pages[[j]])){
Vl_repasse[i] <- substring(all_pages[[j]][i],163,168)
Vl_repasse <- stripWhitespace(Vl_repasse)
}
Vl_repasse <- str_replace(Vl_repasse,",",".")
Vl_repasse <- as.numeric(Vl_repasse)
tentativa[j] <- data.frame(Remessa = Remessa, Conta = Conta, Atendimento_Paciente = Atendimento_Paciente, Paciente = Paciente, Data = Data, Hora = Hora, HE = HE, Procedimento = Procedimento, Cod_AMB = Cod_AMB, Vl_repasse = Vl_repasse)
j <- j +1
}
但是我失败了!有什么建议么? 预先,我真的很感谢您的时间。