如何将提取的文本从pdf映射到excel单元格?

时间:2017-12-17 08:01:56

标签: c# excel pdf

我试图从pdf中提取文本,并用这些文本创建excel。我已经使用pdfbox从pdf中提取了文本,现在正尝试按照pdf中的版面结构把它们粘贴到excel单元格中。我有每段文本的x和y坐标,但如何将它们映射到excel单元格? 首先我尝试把一个pdf页面对应到一个excel工作表,然后获取该pdf页面中的所有对象,按y坐标排序,并按如下方式计算排名:

/// <summary>
/// Returns a copy of <paramref name="dt"/> sorted ascending by the float column
/// <paramref name="fld"/>, with an extra int column named <c>fld + "Rank"</c>
/// holding a dense rank (1, 2, 3, ...): consecutive sorted rows whose
/// <paramref name="fld"/> values have the same string representation share a rank.
/// </summary>
/// <param name="dt">Source table; it is not modified.</param>
/// <param name="fld">Name of the float column to sort and rank by.</param>
/// <returns>
/// A new, sorted table with the rank column appended. For an empty source table,
/// an empty table with the same schema plus the rank column.
/// </returns>
private DataTable RankDt(DataTable dt, string fld)
{
    string rankColumn = fld + "Rank";

    // CopyToDataTable throws InvalidOperationException when the sequence has no
    // rows, so an empty table is handled up front instead of crashing.
    if (dt.Rows.Count == 0)
    {
        DataTable empty = dt.Clone();
        empty.Columns.Add(rankColumn, typeof(int));
        return empty;
    }

    DataTable rankDt = dt.AsEnumerable()
                         .OrderBy(row => row.Field<float>(fld))
                         .CopyToDataTable();
    rankDt.Columns.Add(rankColumn, typeof(int));

    int rank = 1;
    for (int i = 0; i < rankDt.Rows.Count; i++)
    {
        // A new rank starts whenever the value differs from the previous sorted row.
        // Values are compared through their string form, as the ranking always has been.
        if (i > 0 && rankDt.Rows[i][fld].ToString() != rankDt.Rows[i - 1][fld].ToString())
        {
            rank++;
        }

        rankDt.Rows[i][rankColumn] = rank;
    }

    return rankDt;
}

然后按行数循环,并按如下方式粘贴到excel中:
   /// <summary>
   /// Writes the ranked text fragments in <paramref name="dt"/> to a new Excel workbook:
   /// every distinct "yRank" value becomes one sheet row, and within a row the
   /// fragments are placed left to right in ascending "x" order.
   /// </summary>
   /// <param name="dt">
   /// Table with an int "yRank" column (1-based, consecutive), a float "x" column
   /// and a "text" column.
   /// </param>
   /// <param name="destinationFileName">File name of the workbook to save.</param>
   /// <param name="destinationFilePath">Directory the workbook is saved into.</param>
   public void CreateExcelOutputFile(System.Data.DataTable dt, string destinationFileName, string destinationFilePath)
        {
            Microsoft.Office.Interop.Excel.Application xlApp = new Microsoft.Office.Interop.Excel.Application();
            Workbook wb = null;

            // No catch block: exceptions propagate with their original stack trace
            // (the previous "throw ex;" reset it). The finally block guarantees cleanup.
            try
            {
                wb = xlApp.Workbooks.Add(XlWBATemplate.xlWBATWorksheet);
                Worksheet ws = (Worksheet)wb.Worksheets[1];

                // Index the rows by rank once instead of re-querying the table per sheet row.
                var rowsByRank = dt.AsEnumerable().ToLookup(row => row.Field<int>("yRank"));

                // Walk ranks 1, 2, 3, ... until the first missing rank. The old loop was
                // capped at Rows.Count - 1 iterations, which dropped the last rank when
                // every row had its own rank (and wrote nothing for a single-row table).
                for (int rowNo = 1; rowsByRank.Contains(rowNo); rowNo++)
                {
                    int colNo = 1;
                    foreach (var row in rowsByRank[rowNo].OrderBy(r => r.Field<float>("x")))
                    {
                        ws.Cells[rowNo, colNo] = row["text"].ToString();
                        colNo++;
                    }
                }

                wb.SaveAs(System.IO.Path.Combine(destinationFilePath, destinationFileName), Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing);
            }
            finally
            {
                if (wb != null)
                {
                    // Already saved on success; on failure discard changes without prompting.
                    wb.Close(false);
                }

                // Without Quit the hidden EXCEL.EXE process stays alive after this method returns.
                xlApp.Quit();
            }
        }

大约70%的数据能够按照pdf中的版面正确排列,但仍有一些文本的位置与pdf不一致

这个pdftoexcel网站做得很好,他们如何进行此次转换? 是否有任何人可以给出提示或解决方案? 提前谢谢

1 个答案:

答案 0 :(得分:0)

Try using this.

   /// <summary>
   /// Exports <paramref name="dt"/> to a new Excel workbook (column names in row 1,
   /// data from row 2), adds a clustered column chart over columns A and B, and
   /// leaves Excel visible for the user.
   /// </summary>
   /// <param name="dt">Table to export; null (DBNull) cells are written as empty strings.</param>
   void ExportTOExcel(System.Data.DataTable dt)
                {
                    object misValue = System.Reflection.Missing.Value;

                    // Create exactly one Excel instance; creating a second one leaked the first.
                    Microsoft.Office.Interop.Excel.Application xlApp = new Microsoft.Office.Interop.Excel.Application();
                    Microsoft.Office.Interop.Excel.Workbook xlWorkBook = xlApp.Workbooks.Add(misValue);
                    Microsoft.Office.Interop.Excel.Worksheet xlWorkSheet = (Microsoft.Office.Interop.Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1);

                    //add data
                    int StartCol = 1;
                    int StartRow = 1;

                    //Write Headers
                    for (int j = 0; j < dt.Columns.Count; j++)
                    {
                        Microsoft.Office.Interop.Excel.Range myRange = (Microsoft.Office.Interop.Excel.Range)xlWorkSheet.Cells[StartRow, StartCol + j];
                        // DataColumn exposes ColumnName; HeaderText belongs to DataGridView columns.
                        myRange.Value2 = dt.Columns[j].ColumnName;
                    }

                    StartRow++;

                    //Write dt content
                    for (int i = 0; i < dt.Rows.Count; i++)
                    {
                        for (int j = 0; j < dt.Columns.Count; j++)
                        {
                            try
                            {
                                Microsoft.Office.Interop.Excel.Range myRange = (Microsoft.Office.Interop.Excel.Range)xlWorkSheet.Cells[StartRow + i, StartCol + j];
                                // A DataTable is indexed as Rows[row][column]; missing values are DBNull, not null.
                                myRange.Value2 = dt.Rows[i].IsNull(j) ? "" : dt.Rows[i][j];
                            }
                            catch (System.Runtime.InteropServices.COMException ex)
                            {
                                // Best effort: skip a cell Excel rejects, but leave a trace
                                // instead of silently swallowing every exception.
                                System.Diagnostics.Debug.WriteLine("ExportTOExcel: could not write cell [" + i + ", " + j + "]: " + ex.Message);
                            }
                        }
                    }

                    // A chart needs at least one data row; "B0" is not a valid range address.
                    if (dt.Rows.Count > 0)
                    {
                        Microsoft.Office.Interop.Excel.ChartObjects xlCharts = (Microsoft.Office.Interop.Excel.ChartObjects)xlWorkSheet.ChartObjects(Type.Missing);
                        Microsoft.Office.Interop.Excel.ChartObject myChart = (Microsoft.Office.Interop.Excel.ChartObject)xlCharts.Add(10, 80, 300, 250);
                        Microsoft.Office.Interop.Excel.Chart chartPage = myChart.Chart;

                        // Row 1 is the header, so the data ends at row Rows.Count + 1.
                        Microsoft.Office.Interop.Excel.Range chartRange = xlWorkSheet.get_Range("A1", "B" + (dt.Rows.Count + 1));
                        chartPage.SetSourceData(chartRange, misValue);
                        chartPage.ChartType = Microsoft.Office.Interop.Excel.XlChartType.xlColumnClustered;
                    }

                    xlApp.Visible = true;

                }