有没有办法将PDF文件转换为DataTable? PDF文件主要由表格组成,任何帮助将受到高度赞赏。
答案 0 :(得分:1)
如果PDF包含标记内容(您可以在我的博客文章http://www.jpedal.org/PDFblog/2010/09/the-easy-way-to-discover-if-a-pdf-file-contains-structured-content/中查看如何找到它),则可以从PDF文件中提取它。否则,您需要提取文本并尝试猜测结构。
答案 1 :(得分:1)
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
public DataTable ImportPDF(string Filename)
{
string strText = string.Empty;
List<string[]> list = new List<string[]>();
string[] PdfData = null;
try
{
PdfReader reader = new PdfReader((string)Filename);
for (int page = 1; page <= reader.NumberOfPages; page++)
{
ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.LocationTextExtractionStrategy();
String cipherText = PdfTextExtractor.GetTextFromPage(reader, page, its);
cipherText = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(cipherText)));
strText = strText + "\n" + cipherText;
PdfData = strText.Split('\n');
}
reader.Close();
}
catch (Exception ex)
{
}
List<string> temp = PdfData.ToList();
temp.RemoveAt(0);
list = temp.ConvertAll<string[]>(x => x.Split(' ').ToArray());
List<string> columns = list.FirstOrDefault().ToList();
DataTable dtTemp = new DataTable();
columns.All(x => { dtTemp.Columns.Add(new DataColumn(x)); return true; });
list.All(x => { dtTemp.Rows.Add(dtTemp.NewRow().ItemArray = x); return true; });
return dtTemp;
}