我正在编写一个方法将文件和/或PdfReader转换为XMLDocument
当我尝试使用文件时,它完美无缺。 但是,我使用PdfReader作为参数调用该方法,当通过' reader.ConvertToXml(pdfReader,ms); '时,它会抛出错误“已经关闭”。任何想法
public static XmlDocument PDF_Tag_Extract(string filesource)
{
PdfReader pdfReader = new PdfReader(filesource);
return PDF_Tag_Extract(pdfReader);
}
public static XmlDocument PDF_Tag_Extract(PdfReader pdfReader)
{
TaggedPdfReaderTool reader = new TaggedPdfReaderTool();
XmlDocument XMLized_PDF = new XmlDocument();
using (MemoryStream ms = new MemoryStream())
{
reader.ConvertToXml(pdfReader, ms);
StringBuilder sb = new StringBuilder();
foreach (byte b in ms.ToArray())
{
sb.Append((char)b);
}
var doc = XDocument.Parse(sb.ToString());
var emptyElements = from descendant in doc.Descendants()
where descendant.IsEmpty || string.IsNullOrWhiteSpace(descendant.Value)
select descendant;
emptyElements.Remove();
XMLized_PDF.LoadXml(doc.ToString());
}
return XMLized_PDF;
}
实际上,这是我从以下方法调用方法的代码:
public static void Process_Trademarks(string filesource)
{
PdfReader pdfReader = new PdfReader(filesource);
string[] PDF_TXT_page = new string[pdfReader.NumberOfPages];
for (int i = 1; i <= pdfReader.NumberOfPages; i++)
{
// Separa la página en un nuevo PdfReader
MemoryStream output = new MemoryStream();
Document document = new Document();
PdfCopy copy = new PdfCopy(document, output);
copy.SetTagged();
copy.CloseStream = false;
document.Open();
copy.AddPage(copy.GetImportedPage(pdfReader, i, true));
document.Close();
PdfReader page = new PdfReader(output.ToArray());
output.Close();
copy.Close();
// Página a XML
try
{
// Identifies Section
PDF_TXT_page[i - 1] = Common.ExtractText(page);
// HERE IT ENTERS THE TROUBLESOME CODE.
var xmlnodes = Common.PDF_Tag_Extract(page).DocumentElement.SelectNodes("/Sect/Table");
}
catch (Exception ex)
{
}
}
}
但是,我发现如果放“PDF_TXT_page [i-1] = Common.ExtractText(page);”在“var xmlnodes = Common.PDF_Tag_Extract(page).DocumentElement.SelectNodes(”/ Sect / Table“);”之后,代码可以正常工作。