我已经使用 PDFBox 开源库完成了 PDF 文件的 OCR,目前将其数据写入文本文件中,但我想将所有数据存入数据库表中。
/**
 * Extracts the text of a PDF document with PDFBox and writes it to a text file.
 *
 * <p>A JDBC connection and statement are opened as well, but no SQL is executed
 * yet; the intended INSERT is only sketched in a comment inside {@link #main}.
 */
public class Pdfocr {
    private static final String DB_CONNECTION = "jdbc:oracle:thin:@host:port:db";
    private static final String DB_USER = "user";
    private static final String DB_PASSWORD = "pwd";

    /**
     * Opens the database connection, then dumps the text of every page of the
     * input PDF into the output text file.
     *
     * <p>All resources (connection, statement, PDF document, writer) are managed
     * with try-with-resources so they are released even when a step fails.
     * Any failure is reported by printing the stack trace.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        try (Connection dbConnection =
                DriverManager.getConnection(DB_CONNECTION, DB_USER, DB_PASSWORD)) {
            System.out.println("Connection");

            try (Statement statement = dbConnection.createStatement()) {
                // Statement is not used yet; example of the intended insert:
                //statement.execute("INSERT INTO xx_lockbox_receipts(BATCH, TRX, CHECK_DATE, LOCKBOX_NO, ACCT_NO, CHECK_NO, AMOUNT) VALUES (1,1,SYSDATE,123,123,321,10)");
                File input = new File("C:/ocr/MTBC lockbox deposits Feb 01, 2019 ($5,185.85).pdf");
                File output = new File("C:/ocr/SampleText.txt");

                try (PDDocument pd = PDDocument.load(input)) {
                    // Extract from the first page through the last page.
                    int pageCount = pd.getNumberOfPages();
                    PDFTextStripper stripper = new PDFTextStripper();
                    stripper.setStartPage(1);
                    stripper.setEndPage(pageCount);

                    // The output file is created only after the stripper is ready,
                    // matching the original order of side effects.
                    try (BufferedWriter wr = new BufferedWriter(
                            new OutputStreamWriter(new FileOutputStream(output)))) {
                        stripper.writeText(pd, wr);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
上面代码的注释中给出了 SQL INSERT 语句;提取出的文本文件内容示例如下:
Batch: 1, Tran: 1, Date: 20190201, Amount: 73.88, Lockbox#: 83495, Acc#: 1200653200, Check#: 1242
Batch: 1, Tran: 1, Date: 20190201, Amount: 0.00, Lockbox#: 83495, Acc#: -
Batch: 1, Tran: 2, Date: 20190201, Amount: 3440.91, Lockbox#: 83495, Acc#: 5020607192, Check#: 23322
Batch: 1, Tran: 2, Date: 20190201, Amount: 0.00, Lockbox#: 83495, Acc#: -
Batch: 1, Tran: 2, Date: 20190201, Amount: 0.00, Lockbox#: 83495, Acc#: -
Batch: 1, Tran: 2, Date: 20190201, Amount: 0.00, Lockbox#: 83495, Acc#: -
Batch: 1, Tran: 3, Date: 20190201, Amount: 1671.06, Lockbox#: 83495, Acc#: 3601085602, Check#: 8723
Batch: 1, Tran: 3, Date: 20190201, Amount: 0.00, Lockbox#: 83495, Acc#: -
Batch: 1, Tran: 3, Date: 20190201, Amount: 0.00, Lockbox#: 83495, Acc#: -
Batch: 1, Tran: 3, Date: 20190201, Amount: 0.00, Lockbox#: 83495, Acc#: -