如何将pdf文件转换为datatable

时间:2011-05-11 06:28:58

标签: asp.net pdf datatable

有没有办法将 PDF 文件转换为 DataTable?该 PDF 文件的内容主要是表格。非常感谢任何帮助。

2 个答案:

答案 0 :(得分:1)

如果 PDF 包含标记(结构化)内容(判断方法可参见我的博客文章 http://www.jpedal.org/PDFblog/2010/09/the-easy-way-to-discover-if-a-pdf-file-contains-structured-content/ ),则可以直接从 PDF 文件中提取这些结构化内容。否则,您需要先提取文本,再尝试推断其中的表格结构。

答案 1 :(得分:1)

using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;

 /// <summary>
 /// Extracts the text of every page of a PDF file and loads it into a <see cref="DataTable"/>.
 /// </summary>
 /// <remarks>
 /// Each non-blank extracted line is split on single space characters. The first such line
 /// supplies the column names and every following line becomes one row. This is a purely
 /// textual heuristic: a cell whose value itself contains a space is split into several cells.
 /// Errors raised while opening or parsing the PDF are not swallowed; they propagate to the caller.
 /// </remarks>
 /// <param name="Filename">Path of the PDF file to read.</param>
 /// <returns>
 /// A table built from the extracted text. The table has no columns and no rows when the
 /// PDF yields no non-blank text.
 /// </returns>
 /// <exception cref="ArgumentException"><paramref name="Filename"/> is null or empty.</exception>
 public DataTable ImportPDF(string Filename)
    {
        if (string.IsNullOrEmpty(Filename))
        {
            throw new ArgumentException("A PDF file path is required.", "Filename");
        }

        List<string> lines = new List<string>();
        PdfReader reader = new PdfReader(Filename);
        try
        {
            for (int page = 1; page <= reader.NumberOfPages; page++)
            {
                // A fresh strategy per page: the strategy object accumulates the text it is fed.
                ITextExtractionStrategy strategy = new LocationTextExtractionStrategy();
                string pageText = PdfTextExtractor.GetTextFromPage(reader, page, strategy);
                if (pageText == null)
                {
                    continue;
                }

                // The extracted text is already a Unicode string, so no re-encoding is needed
                // (a round-trip through Encoding.Default would only lose characters).
                foreach (string line in pageText.Split('\n'))
                {
                    // Blank lines carry no cells; skipping them also keeps a leading
                    // blank line from being taken as the header.
                    if (line.Trim().Length > 0)
                    {
                        lines.Add(line);
                    }
                }
            }
        }
        finally
        {
            // Release the reader even when extraction fails part-way through.
            reader.Close();
        }

        DataTable dtTemp = new DataTable();
        if (lines.Count == 0)
        {
            return dtTemp;
        }

        // First line: column headers.
        foreach (string header in lines[0].Split(' '))
        {
            if (header.Length == 0)
            {
                // Let the DataTable assign a default name to an unnamed column.
                dtTemp.Columns.Add();
                continue;
            }

            // Column names must be unique within a table; suffix repeats instead of throwing.
            string columnName = header;
            int suffix = 1;
            while (dtTemp.Columns.Contains(columnName))
            {
                columnName = header + "_" + suffix;
                suffix++;
            }
            dtTemp.Columns.Add(columnName);
        }

        // Remaining lines: data rows (the header line is not repeated as a row).
        for (int i = 1; i < lines.Count; i++)
        {
            string[] cells = lines[i].Split(' ');

            // A row wider than the header would be rejected; grow the table instead
            // so that no extracted value is dropped.
            while (dtTemp.Columns.Count < cells.Length)
            {
                dtTemp.Columns.Add();
            }

            object[] values = cells;
            dtTemp.Rows.Add(values);
        }

        return dtTemp;
    }