我试图从pdf中提取文本,并用这些文本创建excel。
我已经使用pdfbox从pdf中提取了文本,
但现在我想按照pdf中的版面结构把它们粘贴到excel单元格中。我有每段文本的x和y坐标,但我该如何把它们映射到excel单元格?
首先,我尝试把一个pdf页面对应到一个excel工作表:获取pdf页面中的所有文本对象,按y坐标排序,并按如下方式计算排名:
/// <summary>
/// Returns a copy of <paramref name="dt"/> sorted ascending by the float column
/// <paramref name="fld"/>, with an extra int column named <c>fld + "Rank"</c>.
/// Ranks start at 1 and increase by one each time the value of
/// <paramref name="fld"/> changes between consecutive sorted rows, so rows with
/// equal values share a rank.
/// </summary>
/// <param name="dt">Source table; it is not modified.</param>
/// <param name="fld">Name of the float column to sort and rank by.</param>
/// <returns>
/// A new sorted table with the rank column appended. For an empty source this
/// is an empty table with the same schema plus the rank column.
/// </returns>
private DataTable RankDt(DataTable dt, string fld)
{
    string rankColumn = fld + "Rank";
    var sourceRows = dt.AsEnumerable();

    // CopyToDataTable throws InvalidOperationException when the source has no
    // rows, so an empty input is answered with an empty table of the right shape.
    if (!sourceRows.Any())
    {
        DataTable emptyResult = dt.Clone();
        emptyResult.Columns.Add(rankColumn, typeof(int));
        return emptyResult;
    }

    DataTable rankDt = sourceRows
        .OrderBy(row => row.Field<float>(fld))
        .CopyToDataTable();
    rankDt.Columns.Add(rankColumn, typeof(int));

    int rank = 1;
    for (int i = 0; i < rankDt.Rows.Count; i++)
    {
        // Values are compared through their string form, exactly as before,
        // so the grouping of near-equal floats is unchanged.
        if (i > 0 && rankDt.Rows[i][fld].ToString() != rankDt.Rows[i - 1][fld].ToString())
        {
            rank++;
        }

        rankDt.Rows[i][rankColumn] = rank;
    }

    return rankDt;
}
然后循环到行计数并在excel
中按如下方式过去
/// <summary>
/// Writes the ranked text rows of <paramref name="dt"/> into the first worksheet
/// of a new Excel workbook and saves it. Worksheet row <c>r</c> receives the
/// "text" values of all rows whose "yRank" equals <c>r</c>, ordered by "x",
/// in columns 1, 2, 3, ...
/// </summary>
/// <param name="dt">
/// Table with a float "x" column, an int "yRank" column and a "text" column.
/// </param>
/// <param name="destinationFileName">File name of the workbook to save.</param>
/// <param name="destinationFilePath">Directory the workbook is saved into.</param>
public void CreateExcelOutputFile(System.Data.DataTable dt, string destinationFileName, string destinationFilePath)
{
    Microsoft.Office.Interop.Excel.Application xlApp = null;
    Workbook wb = null;
    try
    {
        xlApp = new Microsoft.Office.Interop.Excel.Application();
        wb = xlApp.Workbooks.Add(XlWBATemplate.xlWBATWorksheet);
        Worksheet ws = (Worksheet)wb.Worksheets[1];

        // A rank can never exceed the number of rows, so Rows.Count is the
        // upper bound. The previous bound (Rows.Count - 1) skipped the last
        // rank when every row had its own rank, and skipped everything for a
        // single-row table.
        for (int rowNo = 1; rowNo <= dt.Rows.Count; rowNo++)
        {
            int currentRank = rowNo;
            var cellsInRow = dt.AsEnumerable()
                .Where(row => row.Field<int>("yRank") == currentRank)
                .OrderBy(row => row.Field<float>("x"))
                .ToList();

            // Stop at the first rank with no rows, as before.
            if (cellsInRow.Count == 0)
            {
                break;
            }

            for (int col = 0; col < cellsInRow.Count; col++)
            {
                ws.Cells[rowNo, col + 1] = cellsInRow[col]["text"].ToString();
            }
        }

        wb.SaveAs((destinationFilePath + '\\' + destinationFileName), Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing);
    }
    finally
    {
        // Always release Excel, even on failure; otherwise an invisible
        // EXCEL.EXE process is left running. Close(false) avoids a hidden
        // "save changes?" prompt when SaveAs did not succeed.
        if (wb != null)
        {
            wb.Close(false);
        }

        if (xlApp != null)
        {
            xlApp.Quit();
        }
    }
}
这样大约70%的数据能按pdf中的版面排列,但有些文字的位置与pdf不一致。
这个pdftoexcel网站做得很好,他们是如何完成这种转换的?有没有人可以给出提示或解决方案?提前谢谢。
答案 0 :(得分:0)
Try using this.
/// <summary>
/// Exports <paramref name="dt"/> to the first worksheet of a new Excel workbook:
/// column names go into row 1 and the data rows follow from row 2. A clustered
/// column chart over columns A and B is added, and Excel is then made visible.
/// </summary>
/// <param name="dt">Table whose columns and rows are written to the worksheet.</param>
void ExportTOExcel(System.Data.DataTable dt)
{
    object misValue = System.Reflection.Missing.Value;

    // Create exactly one Excel instance; creating a second one orphans the
    // first as a hidden EXCEL.EXE process.
    Microsoft.Office.Interop.Excel.Application xlApp = new Microsoft.Office.Interop.Excel.Application();
    try
    {
        Microsoft.Office.Interop.Excel.Workbook xlWorkBook = xlApp.Workbooks.Add(misValue);
        Microsoft.Office.Interop.Excel.Worksheet xlWorkSheet = (Microsoft.Office.Interop.Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1);

        int startCol = 1;
        int startRow = 1;

        // Write headers. A DataTable column exposes its caption via ColumnName
        // (HeaderText belongs to DataGridView columns).
        for (int j = 0; j < dt.Columns.Count; j++)
        {
            Microsoft.Office.Interop.Excel.Range headerCell = (Microsoft.Office.Interop.Excel.Range)xlWorkSheet.Cells[startRow, startCol + j];
            headerCell.Value2 = dt.Columns[j].ColumnName;
        }
        startRow++;

        // Write the table content; null (DBNull) cells become empty strings.
        for (int i = 0; i < dt.Rows.Count; i++)
        {
            System.Data.DataRow row = dt.Rows[i];
            for (int j = 0; j < dt.Columns.Count; j++)
            {
                Microsoft.Office.Interop.Excel.Range dataCell = (Microsoft.Office.Interop.Excel.Range)xlWorkSheet.Cells[startRow + i, startCol + j];
                dataCell.Value2 = row.IsNull(j) ? (object)"" : row[j];
            }
        }

        Microsoft.Office.Interop.Excel.ChartObjects xlCharts = (Microsoft.Office.Interop.Excel.ChartObjects)xlWorkSheet.ChartObjects(Type.Missing);
        Microsoft.Office.Interop.Excel.ChartObject myChart = (Microsoft.Office.Interop.Excel.ChartObject)xlCharts.Add(10, 80, 300, 250);
        Microsoft.Office.Interop.Excel.Chart chartPage = myChart.Chart;

        // Row 1 is the header, so the data ends at row Rows.Count + 1.
        int lastRow = dt.Rows.Count + 1;
        Microsoft.Office.Interop.Excel.Range chartRange = xlWorkSheet.get_Range("A1", "B" + lastRow);
        chartPage.SetSourceData(chartRange, misValue);
        chartPage.ChartType = Microsoft.Office.Interop.Excel.XlChartType.xlColumnClustered;

        xlApp.Visible = true;
    }
    catch
    {
        // Excel is still hidden at this point; quit it so a failed export
        // does not leave an invisible process behind, then rethrow.
        xlApp.Quit();
        throw;
    }
}