我正在做一项研究:对论文所引用的参考文献进行排名。我从谷歌学术(Google Scholar)下载了这些论文,并把 PDF 文件转换成了文本文件(.txt)。现在我想只提取标题为 "Abstract" 的那一段(摘要)。我已经写了代码,但问题是它输出的是整篇文档,而不是摘要部分。请帮我看看下面的代码并给予指导。
// Step 1: extract the text of the PDF and save it to a plain-text file.
PDFManager pdfManager = new PDFManager();
pdfManager.setFilePath("D:\\paper.pdf");
// try-with-resources guarantees the writer is flushed and closed even when
// ToText() throws while the file is already open.
try (PrintWriter outputfile = new PrintWriter("D:\\test.txt")) {
    outputfile.print(pdfManager.ToText());
}

// Step 2: read the saved text back and echo it line by line.
// NOTE: every line is printed unconditionally, so this outputs the whole
// document; no filtering for the "Abstract" section happens here.
try (BufferedReader br = new BufferedReader(
        new InputStreamReader(new FileInputStream("D:\\test.txt")))) {
    String strLine;
    while ((strLine = br.readLine()) != null) {
        System.out.println(strLine);
    }
} catch (Exception e) {
    // Best-effort reporting, as before: print the message and carry on.
    System.err.println("Error: " + e.getMessage());
}
}
}
这是我的 PDFManager 类:
package pdftext;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Extracts plain text from a PDF file using PDFBox 2.0.
 *
 * <p>Set the file to read with {@link #setFilePath(String)} and then call
 * {@link #ToText()}.
 */
public class PDFManager {
    /** Path read when no file path has been set; kept so existing callers keep working. */
    private static final String DEFAULT_FILE_PATH = "D:\\information.pdf";
    /** First page (1-based) passed to the text stripper. */
    private static final int START_PAGE = 1;
    /** Last page (1-based) passed to the text stripper. */
    private static final int END_PAGE = 10;

    private String filePath;

    public PDFManager() {
    }

    /**
     * Reads the configured PDF and returns the text of pages 1 to 10.
     *
     * <p>The path given to {@link #setFilePath(String)} is used; when none has
     * been set, the previous hard-coded default path is read instead.
     *
     * @return the extracted text
     * @throws IOException if the file cannot be opened, parsed, or read
     */
    public String ToText() throws IOException {
        File file = new File(filePath != null ? filePath : DEFAULT_FILE_PATH);
        // Both the random-access source and the document hold resources, so
        // they are closed deterministically once the text has been extracted.
        try (RandomAccessFile source = new RandomAccessFile(file, "r")) {
            PDFParser parser = new PDFParser(source); // PDFBox 2.0 constructor
            parser.parse();
            try (PDDocument pdDoc = new PDDocument(parser.getDocument())) {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                pdfStripper.setStartPage(START_PAGE);
                pdfStripper.setEndPage(END_PAGE);
                // To extract the whole file instead, use:
                // pdfStripper.setEndPage(pdDoc.getNumberOfPages());
                return pdfStripper.getText(pdDoc);
            }
        }
    }

    /**
     * Sets the path of the PDF file that {@link #ToText()} reads.
     *
     * @param filePath path to the PDF file
     */
    public void setFilePath(String filePath) {
        this.filePath = filePath;
    }
}
答案 0 :(得分:0)
// Prints the text that lies between the first line containing "Abstract" and
// the next line containing the end token.
File fileName = new File("D:\\testa.txt");
// try-with-resources closes the Scanner (and the underlying file handle),
// which was previously left open.
try (Scanner scan = new Scanner(fileName)) {
    String startToken = "Abstract";
    // NOTE(review): "1" looks intended to match the numbered heading that
    // follows the abstract (e.g. "1 Introduction"), but it also matches any
    // line containing the digit 1 - confirm this is acceptable for the inputs.
    String endToken = "1";
    boolean output = false; // true while we are inside the section to print
    while (scan.hasNextLine()) {
        String line = scan.nextLine();
        if (!output) {
            int startAt = line.indexOf(startToken);
            if (startAt > -1) {
                output = true;
                // Keep only what follows the start token on this line.
                line = line.substring(startAt + startToken.length());
            }
        } else {
            int endAt = line.indexOf(endToken);
            if (endAt > -1) {
                output = false;
                // Print the part of the closing line that precedes the end token.
                System.out.println(line.substring(0, endAt));
            }
        }
        if (output) {
            System.out.println(line);
        }
    }
} catch (FileNotFoundException e) {
    e.printStackTrace();
}
}
我最终用 Scanner 配合起始标记(start token)和结束标记(end token)解决了这个问题!