我正在尝试编写一个解析器,使用 NPOI 从 Word 文档中提取详细信息。我已经能够读取文档中每个表格的详细信息,但还需要识别每个表格来自文档的哪个部分,以便区分它们。虽然我可以找出所有具有我所需的特定标题类型的行,但无法确定哪个标题位于哪个表格之前。
有人可以提供一些建议吗?如果用 NPOI 无法做到,有人可以推荐另一种方法吗?
答案 0 :(得分:0)
如果要解析 Word 文档,我建议你使用 Eric White 的 OpenXmlPowerTools,可以通过 NuGet 包管理器安装,也可以直接从网上下载。
这是我用来解析文档的代码片段,它非常短小、干净而且稳定。您需要先调试一遍以了解它的工作方式,这将有助于您按自己的需要进行定制。它会读取所有文本、段落、项目符号和内容等。更多细节请阅读 Eric White 的文档,不过下面的代码片段已经涵盖了解析所需的大部分内容,您可以在此基础上构建自己的功能。
using DocumentFormat.OpenXml.Packaging;
using OpenXmlPowerTools;
// Shared handle to the Word document being parsed.
private static WordprocessingDocument _wordDocument;
// Open the document from wordFileStream without edit access
// (per the original note, wordFileStream is a stream supplied in the constructor).
_wordDocument = WordprocessingDocument.Open(wordFileStream, isEditable: false);
// The header and footer parts are reached through the main document part.
List<HeaderPart> headerList = _wordDocument.MainDocumentPart.HeaderParts.ToList();
List<FooterPart> footerList = _wordDocument.MainDocumentPart.FooterParts.ToList();
/// <summary>
/// Entry point for reading the main document: locates the first logical child
/// of the main part's root element (treated as the body) and hands it to
/// <c>OutputBlockLevelContent</c> for traversal.
/// </summary>
/// <remarks>
/// This is a best-effort operation: any failure is reported through
/// <see cref="System.Diagnostics.Trace"/> instead of being thrown to the caller.
/// </remarks>
private void GetDocumentBodyContents()
{
    try
    {
        // The original snippet kept this call disabled; left here as a hint only.
        //RevisionAccepter.AcceptRevisions(_wordDocument);
        XElement root = _wordDocument.MainDocumentPart.GetXDocument().Root;

        // FirstOrDefault instead of First: a root without logical content is
        // simply "nothing to read", not an exceptional condition.
        XElement body = root.LogicalChildrenContent().FirstOrDefault();
        if (body == null)
        {
            return;
        }

        OutputBlockLevelContent(_wordDocument, body);
    }
    catch (Exception ex)
    {
        // Previously swallowed silently; keep the method non-throwing but leave a trace.
        System.Diagnostics.Trace.TraceError("Failed to read the document body: {0}", ex);
    }
}
/// <summary>
/// Walks the logical block-level children of <paramref name="blockLevelContentContainer"/>.
/// For each paragraph it retrieves the list-item text and the concatenated run text;
/// every other element is treated as a table whose cells are visited recursively.
/// </summary>
/// <param name="wordDoc">The open document, passed on to <c>ListItemRetriever.RetrieveListItem</c>.</param>
/// <param name="blockLevelContentContainer">The element whose block-level content is walked (the body or a table cell).</param>
/// <remarks>
/// Best-effort: a failure is reported through <see cref="System.Diagnostics.Trace"/>
/// and ends the walk of this container only, so the caller keeps going.
/// Note that line-break elements inside runs are rewritten in the in-memory XML
/// so that they show up as "&lt;br /&gt;" in the extracted text.
/// </remarks>
private void OutputBlockLevelContent(WordprocessingDocument wordDoc, XElement blockLevelContentContainer)
{
    try
    {
        foreach (XElement blockLevelContentElement in
            blockLevelContentContainer.LogicalChildrenContent())
        {
            if (blockLevelContentElement.Name == W.p)
            {
                string currentItem = ListItemRetriever.RetrieveListItem(wordDoc, blockLevelContentElement);

                string currentItemText = blockLevelContentElement
                    .LogicalChildrenContent(W.r)
                    .Select(run =>
                    {
                        // A break has no text of its own, so casting the run to string
                        // would lose it. Give every break in the run a textual marker
                        // (the original only handled the first one). ToList() snapshots
                        // the matches before the elements are modified.
                        foreach (XElement lineBreak in run.LogicalChildrenContent(W.br).ToList())
                        {
                            lineBreak.Value = "<br />";
                        }

                        return (string)run;
                    })
                    .StringConcatenate();

                // TODO: consume currentItem / currentItemText here; the snippet only
                // computes them and leaves the output step to the reader.
                continue;
            }

            // If element is not a paragraph, it must be a table.
            foreach (XElement row in blockLevelContentElement.LogicalChildrenContent())
            {
                foreach (XElement cell in row.LogicalChildrenContent())
                {
                    // Cells are a block-level content container, so can call this method recursively.
                    OutputBlockLevelContent(wordDoc, cell);
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Previously swallowed silently; keep the method non-throwing but leave a trace.
        System.Diagnostics.Trace.TraceError("Failed to read block-level content: {0}", ex);
    }
}