我需要从PDF文件中提取一些数据。
我使用 iTextSharp
来做到这一点。
我使用的是我在网上找到的代码:
using System;
using System.IO;
using iTextSharp.text.pdf;
namespace PdfToText
{
/// <summary>
/// Parses a PDF file and extracts the text from it.
/// </summary>
public class PDFParser
{
/// BT = Beginning of a text object operator
/// ET = End of a text object operator
/// Td move to the start of next line
/// 5 Ts = superscript
/// -5 Ts = subscript
#region Fields
#region _numberOfCharsToKeep
/// <summary>
/// Size of the sliding window of most recently read characters that
/// ExtractTextFromPDFBytes maintains while scanning a page's content.
/// The window is handed to CheckToken to recognise PDF operators
/// such as BT and ET.
/// </summary>
private static int _numberOfCharsToKeep = 15;
#endregion
#endregion
#region ExtractText
/// <summary>
/// Extracts the text of every page of a PDF file and appends it,
/// UTF-8 encoded, to a text file while drawing a '#' progress bar
/// on the console.
/// </summary>
/// <param name="inFileName">the full path to the pdf file.</param>
/// <param name="outFileName">
/// the output file name. When null or empty, "test.txt" in the
/// application's base directory is used instead.
/// </param>
/// <returns>
/// true when all pages were processed; false when an exception occurred
/// (the exception message is written to Console.Error).
/// </returns>
public bool ExtractText(string inFileName, string outFileName)
{
    PdfReader reader = null;
    StreamWriter outFile = null;
    try
    {
        // Honour the caller's output file; previously the argument was
        // discarded and the output always went to a hard-coded test.txt.
        if (String.IsNullOrEmpty(outFileName))
        {
            outFileName = Path.Combine(System.AppDomain.CurrentDomain.BaseDirectory, "test.txt");
        }

        // Create a reader for the given PDF file
        reader = new PdfReader(inFileName);

        // 'true' = append: existing content of the output file is kept.
        outFile = new StreamWriter(outFileName, true, System.Text.Encoding.UTF8);

        Console.Write("Processing: ");

        // The progress bar is totalLen characters wide; charUnit is the
        // (possibly fractional) number of '#' each page contributes.
        int totalLen = 68;
        float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
        int totalWritten = 0;
        float curUnit = 0;

        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

            // Write the progress.
            if (charUnit >= 1.0f)
            {
                int marks = (int)charUnit;
                Console.Write(new string('#', marks));
                totalWritten += marks;
            }
            else
            {
                // Less than one '#' per page: accumulate until at least
                // one whole character is due.
                curUnit += charUnit;
                if (curUnit >= 1.0f)
                {
                    int marks = (int)curUnit;
                    Console.Write(new string('#', marks));
                    totalWritten += marks;
                    curUnit = 0;
                }
            }
        }

        // Rounding may leave the bar short; fill it up to full width.
        if (totalWritten < totalLen)
        {
            Console.Write(new string('#', totalLen - totalWritten));
        }
        return true;
    }
    catch (Exception ex)
    {
        // Keep the bool contract, but do not hide why extraction failed.
        Console.Error.WriteLine("PDF text extraction failed: " + ex.Message);
        return false;
    }
    finally
    {
        if (outFile != null)
        {
            outFile.Close();
        }
        if (reader != null)
        {
            // Releases the underlying PDF file handle.
            reader.Close();
        }
    }
}
#endregion
#region ExtractTextFromPDFBytes
/// <summary>
/// Scans the raw bytes of an uncompressed PDF content stream and collects
/// the characters of literal strings -- the text between '(' and ')' --
/// that occur inside text objects (BT ... ET).
/// </summary>
/// <remarks>
/// Bytes are appended as characters one-to-one; no font encoding or
/// ToUnicode mapping is applied, and hex strings (&lt;...&gt;) are not
/// read at all. Pages that show their text that way therefore produce
/// little or no output.
/// </remarks>
/// <param name="input">the uncompressed content bytes of one page</param>
/// <returns>
/// the extracted text; an empty string when the input is null or empty,
/// or when an exception occurred while scanning (the exception message
/// is written to Console.Error).
/// </returns>
private string ExtractTextFromPDFBytes(byte[] input)
{
    if (input == null || input.Length == 0)
    {
        return "";
    }

    try
    {
        // Operator groups, allocated once instead of once per input byte.
        string[] moveToLineStart = new string[] { "TD", "Td" };
        string[] nextLine = new string[] { "'", "T*", "\"" };
        string[] showText = new string[] { "Tj" };
        string[] endOfText = new string[] { "ET" };
        string[] beginOfText = new string[] { "BT" };

        // StringBuilder avoids re-copying the whole result on every append.
        System.Text.StringBuilder result = new System.Text.StringBuilder();

        // Flag showing if we are currently inside a text object
        bool inTextObject = false;

        // Flag showing if the next character is literal
        // e.g. '\\' to get a '\' character or '\(' to get '('
        bool nextLiteral = false;

        // () Bracket nesting level. Text appears inside ()
        int bracketDepth = 0;

        // Sliding window of the most recent characters, used to spot operators.
        int windowSize = _numberOfCharsToKeep;
        char[] previousCharacters = new char[windowSize];
        for (int j = 0; j < windowSize; j++)
        {
            previousCharacters[j] = ' ';
        }

        for (int i = 0; i < input.Length; i++)
        {
            char c = (char)input[i];

            if (inTextObject)
            {
                // Positioning operators seen outside of a string become whitespace.
                if (bracketDepth == 0)
                {
                    if (CheckToken(moveToLineStart, previousCharacters))
                    {
                        // NOTE(review): "\n\r" (LF CR) is kept exactly as before;
                        // confirm whether "\r\n" was intended.
                        result.Append("\n\r");
                    }
                    else if (CheckToken(nextLine, previousCharacters))
                    {
                        result.Append("\n");
                    }
                    else if (CheckToken(showText, previousCharacters))
                    {
                        result.Append(" ");
                    }
                }

                if (bracketDepth == 0 && CheckToken(endOfText, previousCharacters))
                {
                    // End of a text object: separate it from what follows.
                    inTextObject = false;
                    result.Append(" ");
                }
                else if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                {
                    // Start outputting text
                    bracketDepth = 1;
                }
                else if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                {
                    // Stop outputting text
                    bracketDepth = 0;
                }
                else if (bracketDepth == 1)
                {
                    if (c == '\\' && !nextLiteral)
                    {
                        // Backslash: emit the next character without interpreting it.
                        nextLiteral = true;
                    }
                    else
                    {
                        // Keep printable ASCII and the 128..254 range only.
                        if (((c >= ' ') && (c <= '~')) ||
                            ((c >= 128) && (c < 255)))
                        {
                            result.Append(c);
                        }
                        nextLiteral = false;
                    }
                }
            }

            // Slide the window one position to the left and store the current
            // character at its end (Array.Copy handles the overlapping ranges).
            Array.Copy(previousCharacters, 1, previousCharacters, 0, windowSize - 1);
            previousCharacters[windowSize - 1] = c;

            // Start of a text object
            if (!inTextObject && CheckToken(beginOfText, previousCharacters))
            {
                inTextObject = true;
            }
        }

        return result.ToString();
    }
    catch (Exception ex)
    {
        // Best effort: the page yields no text, but the reason is reported.
        Console.Error.WriteLine("Could not extract text from page content: " + ex.Message);
        return "";
    }
}
#endregion
#region CheckToken
/// <summary>
/// Checks whether one of the given operator tokens (e.g. "BT", "T*" or
/// the one-character "'") has just been read, i.e. whether the recent
/// character window ends with: delimiter, token, delimiter.
/// A delimiter is a space, carriage return or line feed.
/// </summary>
/// <remarks>
/// Tokens may have any length. The previous implementation always read
/// token[1] and so threw an IndexOutOfRangeException for one-character
/// tokens as soon as their first character showed up in the window.
/// </remarks>
/// <param name="tokens">the searched tokens; null or empty entries are ignored</param>
/// <param name="recent">the recent character array, newest character last</param>
/// <returns>true if any token is found at the end of the window, otherwise false</returns>
private static bool CheckToken(string[] tokens, char[] recent)
{
    if (tokens == null || recent == null || recent.Length == 0)
    {
        return false;
    }

    // The newest character must be the delimiter that terminates the token.
    int last = recent.Length - 1;
    if (!IsTokenDelimiter(recent[last]))
    {
        return false;
    }

    foreach (string token in tokens)
    {
        if (String.IsNullOrEmpty(token))
        {
            continue;
        }

        // Index of the token's first character; the slot before it must
        // exist and hold the leading delimiter.
        int start = last - token.Length;
        if (start < 1 || !IsTokenDelimiter(recent[start - 1]))
        {
            continue;
        }

        bool matches = true;
        for (int k = 0; k < token.Length; k++)
        {
            if (recent[start + k] != token[k])
            {
                matches = false;
                break;
            }
        }

        if (matches)
        {
            return true;
        }
    }
    return false;
}

/// <summary>
/// Tells whether a character separates operators: space, CR or LF.
/// </summary>
private static bool IsTokenDelimiter(char value)
{
    return value == ' ' || value == 0x0d || value == 0x0a;
}
#endregion
}
}
我用这种方式:
// Create the parser and extract the text of pdfFile; the second argument is
// the requested output name: the PDF's file name with a ".txt" extension.
PDFParser pdfParser = new PDFParser();
pdfParser.ExtractText(pdfFile,Path.GetFileNameWithoutExtension(pdfFile) + ".txt");
这样pdf的内容就会被写入一个txt文件。它对某些pdf有效,但对于我真正需要处理的那个pdf文件,txt文件始终是空的。我没有收到任何错误,但不知为何它什么都没有写入,尽管从这张截图中可以看到它识别出了该pdf,并且知道它有2页……
这是我需要的pdf但是txt始终保持为空。(黑线是我添加的,所以当我想在txt中写入时不存在)
这是另一个pdf。为此,程序工作正常,它写的是一个txt文件。它比其他pdf大得多,仍然为此我可以提取文本而另一方面我不能。
你知道可能出现什么问题吗?
答案 0 :(得分:1)
评论太长,也许你不喜欢的答案:
在PDF中,“文本看起来是什么样子”(即字体的外观)和“字形代表什么含义”(即哪个字形映射到哪个UTF-8字符)是两件相互独立的事情。
它们存储在pdf的不同部分——完全有可能pdf看起来一切正常,但当你尝试提取文本时却什么也得不到,因为它只包含文本字形的形状,而不包含这些字形的“含义”。
尝试打开pdf,选中并复制你想要的文本;如果把它粘贴到编辑器后得到的是空白或乱码,就说明你的pdf缺少“此字形对应哪个UTF-8字符”的信息。
OR:
也可能是你的pdf只包含文字的图像——比如说一张照片。你能读懂它,但iTextSharp只看到一张“图片”,其中没有任何文本。
以上是对你“为什么”这个问题的可能解释。至于如何解决它:
关于SO的损坏PDF有几个问题:
How to repair a PDF file and embed missing fonts
Embedded fonts in PDF: copy and paste problems(this answer)
复制和粘贴与文本解析有关,因此可能会帮助您解决问题。
你编辑后的问题展示了解析的细节——既然如此,为什么不直接利用iTextSharp自带的文本提取功能?
using System.Text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;

/// <summary>
/// Returns the text of all pages of a PDF file, using iTextSharp's own
/// text extraction instead of hand-parsing the page content bytes.
/// </summary>
/// <param name="path">path of the PDF file to read</param>
/// <returns>the concatenated text of pages 1..NumberOfPages</returns>
public static string ExtractTextFromPdf(string path)
{
    // The using block closes the reader even if extraction throws.
    using (PdfReader reader = new PdfReader(path))
    {
        StringBuilder text = new StringBuilder();

        // iTextSharp page numbers are 1-based.
        for (int i = 1; i <= reader.NumberOfPages; i++)
        {
            text.Append(PdfTextExtractor.GetTextFromPage(reader, i));
        }

        return text.ToString();
    }
}
来自:http://www.squarepdf.net/parsing-pdf-files-using-itextsharp
或者像这里:parse-pdf-with-itextsharp-and-then-extract-specific-text-to-the-screen?