我创建了一个程序来读取和提取PDF文件中的文本......但它在执行期间产生了这个异常..
java.io.IOException: Error: Expected a long type, actual='930[299'
at org.apache.pdfbox.pdfparser.BaseParser.readLong(BaseParser.java:1669)
at org.apache.pdfbox.pdfparser.PDFObjectStreamParser.parse(PDFObjectStreamParser.java:100)
at org.apache.pdfbox.cos.COSDocument.dereferenceObjectStreams(COSDocument.java:632)
at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:244)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1205)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1172)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1097)
at PatentAdder.main(PatentAdder.java:60)
这是我的代码:
import java.awt.Rectangle;
import java.io.File;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.io.filefilter.WildcardFileFilter;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.util.PDFTextStripperByArea;
public class PatentAdder {
/**
* @param args
*/
public static String patno,patit,patdate,patfilled,appno;
private static int File;
public static void main(String[] args) {
// TODO Auto-generated method stub
try {
int cnt=0;
if( args.length == 1 )
{
// usage();
}
else
{
PDDocument document = null;
try
{
File dataDir = new File("F:/patents/test/tittest/USP2002w17/06/378/pdfs");
File[] files = dataDir.listFiles();
// String[] files = dataDir.list();
int count=0;
// System.out.println ("Satrt1");
for (File file : files) {
// System.out.println ("Satrt2");
File f = file;
if (!f.isDirectory()) {
document = PDDocument.load(f.getAbsolutePath());
if( document.isEncrypted() )
{
try
{
document.decrypt( "" );
}
catch( InvalidPasswordException e )
{
System.err.println( "Error: Document is encrypted with a password." );
System.exit( 1 );
}
} }
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
stripper.setSortByPosition( true );
// Rectangle rectt = new Rectangle( 590, 108, 600, 100 ); // enlarge title
Rectangle rectt = new Rectangle( 288, 60, 222, 40 );
Rectangle rect = new Rectangle( 55, 108, 230, 600 ); // US-Patent title h40
// Rectangle rect = new Rectangle( 108, 210, 480, 499 ); //full enlarge
stripper.addRegion( "class1", rect );
stripper.addRegion("class2", rectt);
List allPages = document.getDocumentCatalog().getAllPages();
PDPage firstPage = (PDPage)allPages.get( 0 );
stripper.extractRegions( firstPage );
String title = "(?s)\\(54\\)\\s*([\\w\\s,-]+)|(?s)\\[54\\]\\s*([\\w\\s,-]+)";
String in ="((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\[\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))";
String as ="((?s)\\(\\d\\d\\)\\s+Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Notice:))|((?s)\\(\\d\\d\\)\\s+Assignee:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Notice:))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+)(?=Notice:))";
String app_no ="(?s)\\(21\\)\\s*([\\w\\s,.://-]+)|(?s)\\[21\\]\\s*([\\w\\s,.://-]+)";
String filed ="((?s)\\(22\\)\\s*([\\w\\s,.://-]+))|((?s)\\(22\\)\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))";
String term ="((?s)\\s*Term\\s*([\\w\\s,.://-]+))|((?s)\\s*Term\\s*([\\w\\s,.://-]+))";
String pat_no = "(?s)\\s*Patent No\\.\\:\\s*([\\w\\d\\s,.://-]+)|(?s)\\s*Patent Number\\:\\s*([\\w\\d\\s,.://-]+)";
String pat_dt = "(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventor:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\[\\d*\\]\\s+Inventor:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)";
// System.out.println(rg);
String region = stripper.getTextForRegion( "class1" );
// System.out.println(region);
String regiont = stripper.getTextForRegion( "class2" );
Pattern p = Pattern.compile(in);
Matcher m = p.matcher(region);
Pattern p2 = Pattern.compile(as);
Matcher m2 = p2.matcher(region);
Pattern p3 = Pattern.compile(title);
Matcher m3 = p3.matcher(region);
Pattern p4 = Pattern.compile(pat_no);
Matcher m4 = p4.matcher(regiont);
Pattern p5 = Pattern.compile(app_no);
Matcher m5 = p5.matcher(region);
Pattern p6 = Pattern.compile(filed);
Matcher m6 = p6.matcher(region);
Pattern p7 = Pattern.compile(pat_dt);
Matcher m7 = p7.matcher(regiont);
while(m.find())
{
// System.out.println(m.group());
}
while(m2.find())
{
// System.out.println(m2.group());
}
while(m3.find())
{
// System.out.println(m3.group());
patit = m3.group().replace("(54)", " ");
patit = patit.trim();
}
while(m4.find())
{
// System.out.println(m4.group());
patno = m4.group().replace("Patent No.: ", " ");
patno = patno.replace("Patent No: ", " ");
patno = patno.replace("Patent", " ");
patno = patno.replace("No.:", " ");
patno = patno.replace("No:", " ");
patno = patno.replace("Number: ", " ");
patno = patno.replace("Number.: ", " ");
patno = patno.trim();
}
while(m5.find())
{
// System.out.println(m5.group());
appno = m5.group().replace("(21)", " ");
appno = appno.replace("Appl. No.: ", " ");
appno = appno.replace("Appl.", " ");
appno = appno.replace("No.", " ");
appno = appno.replace(":"," ");
appno = appno.trim();
}
while(m6.find())
{
// System.out.println(m6.group());
patfilled = m6.group().replace("(22)", " ");
patfilled = patfilled.replace("Filed", " ");
patfilled= patfilled.replace("PCT", " ");
patfilled = patfilled.replace(":", " ");
patfilled = patfilled.replace("\n", "");
patfilled= patfilled.trim();
}
while (m7.find())
{
patdate = m7.group().replace("(45) Date of Patent: ", " ");
patdate = patdate.replace("(45) Date of Patent.: ", " ");
patdate = patdate.replace("(45)", " ");
patdate = patdate.replace("Date", " ");
patdate = patdate.replace("of", " ");
patdate = patdate.replace("Patent.: ", " ");
patdate = patdate.replace("Patent: ", " ");
patdate = patdate.replace("Reissued", " ");
patdate = patdate.replace(":", " ");
patdate = patdate.replace("Patent", " ");
patdate = patdate.replace("*", " ");
patdate = patdate.trim();
}
System.out.println("File name:"+f.getName());
System.out.println(patno +"\n"+patit+"\n"+patdate+"\n"+patfilled+"\n"+appno+"\n-------");
// boolean st = addPatent (patno,patit,patdate,patfilled,appno);
// if ( st == true ) System.out.println(patno+" added");
// else System.out.println(patno+" not added");
count++;
}
System.out.print("-----Finised "+count+" Files------ \n");
}
finally
{
if( document != null )
{
document.close();
}
}
}
}
catch (Exception e)
{
System.out.println(e.getStackTrace());
//System.out.println(e.getLocalizedMessage());
System.out.println(e.getMessage());
System.out.println(e.getCause());
//System.out.println(e.getClass());
e.printStackTrace();
}
}
static boolean addPatent(String pno,String ptitle,String pat_date ,String filed_date , String appl_no )
{
int i=0;
boolean status =false;
try {
Class.forName("com.mysql.jdbc.Driver").newInstance();
Connection con = DriverManager.getConnection("jdbc:mysql://localhost:3306/patent", "root","ragesh");
PreparedStatement st = con.prepareStatement("insert into patents_info values (?,?,?,?,?,?)");
st.setString(1, pno);
st.setString(2, ptitle);
st.setString(3,pat_date);
st.setString(4,filed_date);
st.setString(5,appl_no);
st.setInt(6,0);
i=st.executeUpdate();
if (i > 0) status= true;
}
catch (Exception e)
{
e.printStackTrace();
}
return status;
}
public static List<File> getAllChildFiles(File[] dir)
{
List<File> result = new ArrayList<File>();
for (File file : dir)
{
if (file.isDirectory())
{
File[] children = file.listFiles();
List<File> grandChildren = getAllChildFiles(children);
result.addAll(grandChildren);
}
else
{
result.add(file);
}
}
return result;
}
}
这个程序提供了一些迭代的输出,但是像上面指定的那样停止和Thorw异常..
带有异常的示例输出:
File name:06019327.pdf
Number: 6,019,327
[54] INSTALLATION STRUCTURE OF OUTDOOR
COMMUNICATION DRIVE
[45] Feb. 1, 2000
[22] Aug. 30, 1996
Related U.S. Application Data
[21] 08/704,920
-------
File name:06019328.pdf
Number: 6,019,328
[54] STAY-PUT PEGBOARD ACCESSORY
[45] Feb. 1, 2000
[22] Jan. 27, 1999
[21] 09/238,242
-------
File name:06019329.pdf
Number: 6,019,329
[54] CLAMPS
[45] Feb. 1, 2000
[22] Oct. 30, 1997
[21] 08/961,310
-------
File name:06019330.pdf
Number: 6,019,330
[54] ROOF GUARD DEVICE FOR LIFTING
OBJECTS ON TO A ROOF
[45] Feb. 1, 2000
[22] Nov. 20, 1997
[21] 08/974,866
-------
File name:06019331.pdf
Number: 6,019,331
[54] CANTILEVER BRACKET ASSEMBLY
[45] Feb. 1, 2000
[22] May 28, 1997
Related U.S. Application Data
[21] 08/865,587
-------
[Ljava.lang.StackTraceElement;@43a6684f
Error: Expected a long type, actual='930[299'
java.io.IOException: Error: Expected a long type, actual='930[299'
at org.apache.pdfbox.pdfparser.BaseParser.readLong(BaseParser.java:1669)
at org.apache.pdfbox.pdfparser.PDFObjectStreamParser.parse(PDFObjectStreamParser.java:100)
at org.apache.pdfbox.cos.COSDocument.dereferenceObjectStreams(COSDocument.java:632)
at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:244)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1205)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1172)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1097)
at PatentAdder.main(PatentAdder.java:60)
第二个问题
有时执行会冻结..这只是在经过一些迭代后显示闪烁的光标....为什么......?
File name:06019329.pdf
Number: 6,019,329
[54] CLAMPS
[45] Feb. 1, 2000
[22] Oct. 30, 1997
[21] 08/961,310
-------
File name:06019330.pdf
Number: 6,019,330
[54] ROOF GUARD DEVICE FOR LIFTING
OBJECTS ON TO A ROOF
[45] Feb. 1, 2000
[22] Nov. 20, 1997
[21] 08/974,866
-------
File name:06019331.pdf
Number: 6,019,331
[54] CANTILEVER BRACKET ASSEMBLY
[45] Feb. 1, 2000
[22] May 28, 1997
Related U.S. Application Data
[21] 08/865,587
-------
(__ cursor blinks on... and execution freezes )
请帮我解决这两个问题:
JDK版本:1.6 PDF格式1.8.3
答案 0 :(得分:4)
这是由PDFBox不遵循PDF参考信函引起的:)
PDF标记流中的标记可以由空格分隔(对于大多数编程语言通常是这样),但也是隐式的:因为下一个字符是它自己的分隔符,因为它引入了一个特殊的函数。因此,遇到诸如
之类的结构是完全有效的 - 当然也不罕见/A[123/B(C)]
完全等同于稍长的
/A [ 123 /B (C) ]
来自ISO“PDF 32000-1:2008”, 7.2.2字符集:
PDF字符集分为三个类,分别称为常规,分隔符和空白字符。此分类确定将字符分组为标记。本子条款中定义的规则适用于文件中的所有字符,但字符串,流和注释除外。
空白字符显示[...]
分隔符字符(,),&lt;,&gt;,[,],{,},/和%是特殊的[..]
原始代码显示当前的实现(取自http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java):
/**
1681 * This method is used to read a token by the {@linkplain #readInt()} method and the {@linkplain #readLong()} method.
1682 *
1683 * @return the token to parse as integer or long by the calling method.
1684 * @throws IOException throws by the {@link #pdfSource} methods.
1685 */
1686 protected final StringBuilder readStringNumber() throws IOException
1687 {
1688 int lastByte = 0;
1689 StringBuilder buffer = new StringBuilder();
1690 while( (lastByte = pdfSource.read() ) != 32 &&
1691 lastByte != 10 &&
1692 lastByte != 13 &&
1693 lastByte != 60 && //see sourceforge bug 1714707
1694 lastByte != 0 && //See sourceforge bug 853328
1695 lastByte != -1 )
1696 {
1697 buffer.append( (char)lastByte );
1698 }
1699 if( lastByte != -1 )
1700 {
1701 pdfSource.unread( lastByte );
1702 }
1703 return buffer;
1704 }
“下一个字符”是针对7.2.2中表1中的空白字符(从上到下,“空格”,“换行”,“回车”和Nul字符 - 进行测试的 - 尽管它们仍然是缺少“Form Feed”代码0x0C
,而且很奇怪,常见的“Tab”0x09
。但是,他们会测试文件结尾(-1)和{{1 (<
),后者可能是因为有人之前遇到过类似的错误。(我无法找到原始错误报告#1714707,但我可以推断它一定与您的问题类似。)
此列表必须通过添加以下字符来完成,从7.2.2中的表2中复制 verbatim :
Table 2 – Delimiter characters Glyph Decimal Hexadecimal Octal Name ( 40 28 50 LEFT PARENTHESIS ) 41 29 51 RIGHT PARENTHESIS [1] < 60 3C 60 LESS-THAN SIGN > 62 3E 62 GREATER-THAN SIGN [ 91 5B 133 LEFT SQUARE BRACKET ] 93 5D 135 RIGHT SQUARE BRACKET { 123 7B 173 LEFT CURLY BRACKET } 125 7D 175 RIGHT CURLY BRACKET / 47 2F 57 SOLIDUS % 37 25 45 PERCENT SIGN
奇怪的是60
和{
,因为目前它们只出现在PostScript片段中,而且它们不是基础对象,而是包含在}
中。但也许它们在历史上“保留用于未来扩展”(这应该不再是一个问题,现在PDF格式已被冻结为ISO规范)。
此外,字符stream
本身就是一个分隔符,但它需要一些特殊处理,并引入注释:
评论由PERCENT SIGN之后的所有字符组成,但最多但不包括行的结尾[...]( 7.2.3评论)
(注意那里有一点含糊之处:
符合标准的读者应忽略评论,并将其视为单个空白字符。也就是说,注释将它前面的标记与它后面的标记分开。
哪个应该不必要,因为前一行已经说明评论在行尾之前结束;所以行尾本身应该保留在输入流中,因此充当分隔符。也许只不过是一个腰带和吊带方法的例子。)
[1]在审核时:实际上,右括号是多余的。它只能在匹配的左括号后出现,并引入一个字符串。一次只查看一个令牌,您永远不会遇到迷路%
- 如果这样做,则表示格式错误的PDF流。
答案 1 :(得分:2)
readLong方法从底层流中读取一个long。正如PDFBox API所述,该方法抛出的IOException是由用作输入源的PushBackInputStream生成的(pdfSource)。
在你的情况下,日志是非常明显的,似乎你的流中有一个方括号'[',这使得长时间转换成为不可能。
您有两种选择:
PDDocument.load
之前执行完整性检查)关于冻结问题
你确定代码没有卡在你的一个:
while(mX.find())
{
...
}
块?我发现设计非常容易出错,特别是对于X = 1和2.我没有时间进入逻辑但你可能想要重构while条件如下:
long TIMEOUT = 15000l; // 15 seconds
long now = System.currentTimeMillis(); // init the long just above the while
while(mX.find() && (System.currentTimeMillis() - now) < TIMEOUT)
{
...
}