我有一个程序,使用PDF文档中的正则表达式模式提取文本。
我的问题是matcher块需要很长时间才能执行某些PDF文件......
这是代码:
String title = "(?s)\\(54\\)\\s*([\\w\\s,-]+)|(?s)\\[54\\]\\s*([\\w\\s,-]+)";
String in ="((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\[\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))";
String as ="((?s)\\(\\d\\d\\)\\s+Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Notice:))|((?s)\\(\\d\\d\\)\\s+Assignee:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Notice:))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+)(?=Notice:))";
String app_no ="(?s)\\(21\\)\\s*([\\w\\s,.://-]+)|(?s)\\[21\\]\\s*([\\w\\s,.://-]+)";
String filed ="((?s)\\(22\\)\\s*([\\w\\s,.://-]+))|((?s)\\(22\\)\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))";
String term ="((?s)\\s*Term\\s*([\\w\\s,.://-]+))|((?s)\\s*Term\\s*([\\w\\s,.://-]+))";
String pat_no = "(?s)\\s*Patent No\\.\\:\\s*([\\w\\d\\s,.://-]+)|(?s)\\s*Patent Number\\:\\s*([\\w\\d\\s,.://-]+)";
String pat_dt = "(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventor:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\[\\d*\\]\\s+Inventor:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)";
String region = stripper.getTextForRegion( "class1" );
String regiont = stripper.getTextForRegion( "class2" );
Pattern p = Pattern.compile(in);
Matcher m = p.matcher(region);
Pattern p2 = Pattern.compile(as);
Matcher m2 = p2.matcher(region);
Pattern p3 = Pattern.compile(title);
Matcher m3 = p3.matcher(region);
Pattern p4 = Pattern.compile(pat_no);
Matcher m4 = p4.matcher(regiont);
Pattern p5 = Pattern.compile(app_no);
Matcher m5 = p5.matcher(region);
Pattern p6 = Pattern.compile(filed);
Matcher m6 = p6.matcher(region);
Pattern p7 = Pattern.compile(pat_dt);
Matcher m7 = p7.matcher(regiont);
long TIMEOUT = 15000l; // 15 seconds
long now = System.currentTimeMillis(); // init the long just above the while
System.out.println("find start");
while(m.find())
{
// System.out.println(m.group());
}
Long nowtime = System.currentTimeMillis() ;
while(m2.find())
{
// System.out.println(m2.group());
}
while(m3.find() && (System.currentTimeMillis() - now) < TIMEOUT)
{
// System.out.println(m3.group());
patit = m3.group().replace("(54)", " ");
patit = patit.trim();
// System.out.println("m3");
}
while(m4.find() && (System.currentTimeMillis() - now) < TIMEOUT)
{
// System.out.println(m4.group());
patno = m4.group().replace("Patent No.: ", " ");
patno = patno.replace("Patent No: ", " ");
patno = patno.replace("Patent", " ");
patno = patno.replace("No.:", " ");
patno = patno.replace("No:", " ");
patno = patno.replace("Number: ", " ");
patno = patno.replace("Number.: ", " ");
patno = patno.trim();
// System.out.println("m4");
}
while(m5.find() && (System.currentTimeMillis() - now) < TIMEOUT)
{
// System.out.println(m5.group());
appno = m5.group().replace("(21)", " ");
appno = appno.replace("Appl. No.: ", " ");
appno = appno.replace("Appl.", " ");
appno = appno.replace("No.", " ");
appno = appno.replace(":"," ");
appno = appno.trim();
// System.out.println("m5");
}
while(m6.find() && (System.currentTimeMillis() - now) < TIMEOUT)
{
// System.out.println(m6.group());
patfilled = m6.group().replace("(22)", " ");
patfilled = patfilled.replace("Filed", " ");
patfilled= patfilled.replace("PCT", " ");
patfilled = patfilled.replace(":", " ");
patfilled = patfilled.replace("\n", "");
patfilled= patfilled.trim();
// System.out.println("m6");
}
while (m7.find() && (System.currentTimeMillis() - now) < TIMEOUT)
{
patdate = m7.group().replace("(45) Date of Patent: ", " ");
patdate = patdate.replace("(45) Date of Patent.: ", " ");
patdate = patdate.replace("(45)", " ");
patdate = patdate.replace("Date", " ");
patdate = patdate.replace("of", " ");
patdate = patdate.replace("Patent.: ", " ");
patdate = patdate.replace("Patent: ", " ");
patdate = patdate.replace("Reissued", " ");
patdate = patdate.replace(":", " ");
patdate = patdate.replace("Patent", " ");
patdate = patdate.replace("*", " ");
patdate = patdate.trim();
// System.out.println("m7");
}
System.out.println("find end");
在上面的代码中,对于某些文件的某些迭代,mX.find()需要很长时间才能执行...这就是在某些迭代时System.out.println("find start");
处的执行冻结。
这是示例输出:(滚动并查看)
-------
find start
1ms Elasped
1841
File name:06377334.pdf
US 6,377,334 B2
METHOD FOR CONTROLLING IMAGE
SIZE OF INTEGRATED CIRCUITS ON
WAFERS SUPPORTED ON HOT PLATES
DURING POST EXPOSURE BAKING OF THE
WAFERS
Apr. 23, 2002
Jan. 24, 2001
Related U.S. Application Data
09/768,973
-------
find start
1ms Elasped
1842
File name:06377337.pdf
US 6,377,337 B1
PROJECTION EXPOSURE APPARATUS
Apr. 23, 2002
Apr. 27, 1999
09/299,558
-------
find start
1843
File name:06377338.pdf
US 6,377,338 B1
EXPOSURE APPARATUS AND METHOD
Apr. 23, 2002
Oct. 13, 2000
Related U.S. Application Data
09/299,558
-------
find start
1844
File name:06377339.pdf
US 6,377,339 B1
DOCUMENT IMAGING SYSTEM
INCORPORATING A SELECTIVELY
OPAQUE
Apr. 23, 2002
Mar. 29, 1999
09/280,186
-------
find start
1845
File name:06377340.pdf
US 6,377,340 B1
METHOD OF DETECTION OF NATURAL
DIAMONDS THAT HAVE BEEN PROCESSED
AT HIGH PRESSURE AND HIGH
TEMPERATURES
Apr. 23, 2002
Oct. 29, 1999
09/430,477
-------
find start
1846
File name:06377341.pdf
US 6,377,341 B1
REFRACTIVE INDEX BASED DETECTOR
SYSTEM FOR LIQUID CHROMATOGRAPHY
Apr. 23, 2002
Aug. 3, 1999
09/368,310
-------
find start
(execution freezes here )
为什么会这样?为什么正则表达式匹配需要很长时间?
整个计划:
import java.awt.Rectangle;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.io.filefilter.WildcardFileFilter;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.util.PDFTextStripperByArea;
public class PatentAdder {
/**
* @param args
*/
public static String patno,patit,patdate,patfilled,appno;
private static int File;
public static void main(String[] args) {
try {
int cnt=0;
if( args.length == 1 )
{
// usage();
}
else
{
PDDocument document = null;
try
{
File dataDir = new File("F:/patents/test/tittest/USP2002w17/06/378/pdfs");
File[] files = dataDir.listFiles();
int count=0;
long TIMEOUT1 = 60000l; // 15 seconds
long now1 = System.currentTimeMillis();
for (File file : files) {
try {
// System.out.println ("Satrt2");
File f = file;
if (!f.isDirectory()) {
document = PDDocument.load(f.getAbsolutePath());
if( document.isEncrypted() )
{
try
{
document.decrypt( "" );
}
catch( InvalidPasswordException e )
{
System.err.println( "Error: Document is encrypted with a password." );
System.exit( 1 );
}
} }
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
stripper.setSortByPosition( true );
Rectangle rectt = new Rectangle( 288, 60, 222, 40 );
Rectangle rect = new Rectangle( 55, 108, 230, 600 ); // US-Patent title h40
stripper.addRegion( "class1", rect );
stripper.addRegion("class2", rectt);
List allPages = document.getDocumentCatalog().getAllPages();
PDPage firstPage = (PDPage)allPages.get( 0 );
stripper.extractRegions( firstPage );
String title = "(?s)\\(54\\)\\s*([\\w\\s,-]+)|(?s)\\[54\\]\\s*([\\w\\s,-]+)";
String in ="((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\[\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))";
String as ="((?s)\\(\\d\\d\\)\\s+Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Notice:))|((?s)\\(\\d\\d\\)\\s+Assignee:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Notice:))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+)(?=Notice:))";
String app_no ="(?s)\\(21\\)\\s*([\\w\\s,.://-]+)|(?s)\\[21\\]\\s*([\\w\\s,.://-]+)";
String filed ="((?s)\\(22\\)\\s*([\\w\\s,.://-]+))|((?s)\\(22\\)\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))";
String term ="((?s)\\s*Term\\s*([\\w\\s,.://-]+))|((?s)\\s*Term\\s*([\\w\\s,.://-]+))";
String pat_no = "(?s)\\s*Patent No\\.\\:\\s*([\\w\\d\\s,.://-]+)|(?s)\\s*Patent Number\\:\\s*([\\w\\d\\s,.://-]+)";
String pat_dt = "(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventor:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\[\\d*\\]\\s+Inventor:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)";
String region = stripper.getTextForRegion( "class1" );
String regiont = stripper.getTextForRegion( "class2" );
Pattern p = Pattern.compile(in);
Matcher m = p.matcher(region);
Pattern p2 = Pattern.compile(as);
Matcher m2 = p2.matcher(region);
Pattern p3 = Pattern.compile(title);
Matcher m3 = p3.matcher(region);
Pattern p4 = Pattern.compile(pat_no);
Matcher m4 = p4.matcher(regiont);
Pattern p5 = Pattern.compile(app_no);
Matcher m5 = p5.matcher(region);
Pattern p6 = Pattern.compile(filed);
Matcher m6 = p6.matcher(region);
Pattern p7 = Pattern.compile(pat_dt);
Matcher m7 = p7.matcher(regiont);
System.out.println("find start");
Long nowtime = System.currentTimeMillis() ;
while(m3.find())
{
patit = m3.group().replace("(54)", " ");
patit = patit.trim();
}
while(m4.find() )
{
patno = m4.group().replace("Patent No.: ", " ");
patno = patno.replace("Patent No: ", " ");
patno = patno.replace("Patent", " ");
patno = patno.replace("No.:", " ");
patno = patno.replace("No:", " ");
patno = patno.replace("Number: ", " ");
patno = patno.replace("Number.: ", " ");
patno = patno.trim();
}
while(m5.find() )
{
appno = m5.group().replace("(21)", " ");
appno = appno.replace("Appl. No.: ", " ");
appno = appno.replace("Appl.", " ");
appno = appno.replace("No.", " ");
appno = appno.replace(":"," ");
appno = appno.trim();
}
while(m6.find())
{
patfilled = m6.group().replace("(22)", " ");
patfilled = patfilled.replace("Filed", " ");
patfilled= patfilled.replace("PCT", " ");
patfilled = patfilled.replace(":", " ");
patfilled = patfilled.replace("\n", "");
patfilled= patfilled.trim();
}
while (m7.find() )
{
patdate = m7.group().replace("(45) Date of Patent: ", " ");
patdate = patdate.replace("(45) Date of Patent.: ", " ");
patdate = patdate.replace("(45)", " ");
patdate = patdate.replace("Date", " ");
patdate = patdate.replace("of", " ");
patdate = patdate.replace("Patent.: ", " ");
patdate = patdate.replace("Patent: ", " ");
patdate = patdate.replace("Reissued", " ");
patdate = patdate.replace(":", " ");
patdate = patdate.replace("Patent", " ");
patdate = patdate.replace("*", " ");
patdate = patdate.trim();
}
PrintWriter out = new PrintWriter (new File("F:/patents/test/tittest/USP2002w17/06/378/pdfs/output.txt"));
System.out.println(count);
out.println(count);
System.out.println("File name:"+f.getName());
out.println("File name:"+f.getName());
System.out.println(patno +"\n"+patit+"\n"+patdate+"\n"+patfilled+"\n"+appno+"\n-------");
out.println(patno +"\n"+patit+"\n"+patdate+"\n"+patfilled+"\n"+appno+"\n-------");
Long endtime = System.currentTimeMillis()-nowtime;
System.out.println(endtime+"ms Elasped") ;
out.println(endtime+"ms Elasped") ;
count++;
}
catch (IOException e)
{
continue;
}
}
System.out.print("-----Finised "+count+" Files------ \n");
}
finally
{
if( document != null )
{
document.close();
}
}
}
}
catch (Exception e)
{
System.out.println(e.getStackTrace());
//System.out.println(e.getLocalizedMessage());
System.out.println(e.getMessage());
System.out.println(e.getCause());
//System.out.println(e.getClass());
e.printStackTrace();
}
}
请告诉我如何优化正则表达式并解决此执行冻结问题......
答案 0 :(得分:0)
我喜欢自己使用正则表达式,但看起来这并不是你想要做的事情的理想方法。正则表达式适用于从文本中提取特定的信息。但是在文本上反复应用几个regexp表明解析方法会更好。
您的方法存在的一个问题是,每个while循环都会再次读取整个文本。这可以通过编写自己的解析器并让它在文档中处理一次来避免。
你的正则表达式的另一个问题是它们有很多可选部分(\s*
等)。可选部件使正则表达式的评估成本更高。使用非常简单的正则表达式并重新检查其匹配位置以获得误报可能是一种很好的方法。例如,而不是你的正则表达式
String term ="((?s)\\s*Term\\s*([\\w\\s,.://-]+))|((?s)\\s*Term\\s*([\\w\\s,.://-]+))";
你可以寻找
String simple_term ="Term";
然后检查Term
的每次出现是否确实是您要找的部分。
顺便看一下我从你的代码中随机挑选的字符串,我注意到它比它必须更复杂。只需删除替代|
,因为第一个和第二个替代方案是相同的。
答案 1 :(得分:0)
你遇到了回溯地狱,因为你有一个类似于\s*\s*c
的结构 - 连续重复的字符类,它具有非空交集,后跟一个不相交的续集。
让我们看看字符串in
(由引擎看到):
((?s)\(\d\d\)\s+Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=\(\d*\)\s+Assignee:))
|
((?s)\[\d\d\)\s+Inventor:\s*([\-\w\d\s,\.\(\)-]+)*[\w\']*(?=\n))
|
(Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=Assignee:))
|
((?s)\(\d\d\)\s+Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=\(\d*\)\s+Assignee:))
|
((?s)\(\d\d\)\s+Inventor:\s*([\-\w\d\s,\.\(\)-]+)*[\w\']*(?=\n))
|
(Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=Assignee:))
你的正则表达式中有很多这样的模式:
((?s)\(\d\d\)\s+Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=\(\d*\)\s+Assignee:))
^^^^^^^^^^^^^^^^^^^^
((?s)\[\d\d\)\s+Inventor:\s*([\-\w\d\s,\.\(\)-]+)*[\w\']*(?=\n))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=Assignee:))
^^^^^^^^^^^^^^^^^^^^
这并未提及您在正则表达式中包含相同的3个子模式两次,这完全是多余的。