我正在尝试使用 Aspose.Words for .NET,通过以下代码获取我的 Word 文件的内容:
// Load the Word document from disk.
var sourceDocument = new Document(@"D:\a.docx");

// Walk the whole document tree with a custom DocumentVisitor that collects plain text.
var textCollector = new MyDocToTxtWriter();
sourceDocument.Accept(textCollector);

// Persist the collected text, then keep the console window open until Enter is pressed.
string extractedText = textCollector.GetText();
System.IO.File.WriteAllText(@"c:/a.txt", extractedText);
Console.ReadLine();
上面代码中用到的 MyDocToTxtWriter 类定义如下:
/// <summary>
/// Document visitor that accumulates a plain-text rendering of the nodes it visits.
/// Run text and paragraph breaks are emitted, field codes are suppressed (only field
/// results are kept), body boundaries are marked with banner lines, and headers/footers
/// are skipped entirely.
/// </summary>
public class MyDocToTxtWriter : DocumentVisitor
{
    public MyDocToTxtWriter()
    {
        mBuilder = new StringBuilder();
        mOpenFields = new System.Collections.Generic.Stack<bool>();
        mFieldCodeDepth = 0;
    }

    /// <summary>
    /// Gets the plain text of the document that was accumulated by the visitor.
    /// </summary>
    /// <returns>Everything appended to the internal buffer so far.</returns>
    public string GetText()
    {
        return mBuilder.ToString();
    }

    /// <summary>
    /// Called when a Run node is encountered in the document.
    /// </summary>
    public override VisitorAction VisitRun(Run run)
    {
        AppendText(run.Text);

        // Let the visitor continue visiting other nodes.
        return VisitorAction.Continue;
    }

    /// <summary>
    /// Called when a FieldStart node is encountered in the document.
    /// </summary>
    public override VisitorAction VisitFieldStart(FieldStart fieldStart)
    {
        // In Microsoft Word, a field code (such as "MERGEFIELD FieldName") follows
        // after a field start character. We want to skip field codes and output field
        // results only, so every newly opened field starts out in its "code" portion.
        //
        // Each open field gets its own entry on the stack so that nested fields are
        // tracked independently: closing an inner field must not re-enable output
        // while an enclosing field is still inside its code.
        mOpenFields.Push(true);
        mFieldCodeDepth++;
        return VisitorAction.Continue;
    }

    /// <summary>
    /// Called when a FieldSeparator node is encountered in the document.
    /// </summary>
    public override VisitorAction VisitFieldSeparator(FieldSeparator fieldSeparator)
    {
        // The innermost open field moves from its code portion to its result portion.
        // The emptiness check keeps a stray separator (no matching start seen) harmless.
        if (mOpenFields.Count > 0 && mOpenFields.Peek())
        {
            mOpenFields.Pop();
            mOpenFields.Push(false);
            mFieldCodeDepth--;
        }

        return VisitorAction.Continue;
    }

    /// <summary>
    /// Called when a FieldEnd node is encountered in the document.
    /// </summary>
    public override VisitorAction VisitFieldEnd(FieldEnd fieldEnd)
    {
        // Close the innermost open field. Some fields have no separator and no result,
        // in which case the field is still flagged as "in code" here and its
        // contribution to the suppression counter has to be released now.
        if (mOpenFields.Count > 0 && mOpenFields.Pop())
        {
            mFieldCodeDepth--;
        }

        return VisitorAction.Continue;
    }

    /// <summary>
    /// Called when visiting of a Paragraph node is ended in the document.
    /// </summary>
    public override VisitorAction VisitParagraphEnd(Paragraph paragraph)
    {
        // When outputting to plain text we output Cr+Lf characters.
        AppendText(ControlChar.CrLf);
        return VisitorAction.Continue;
    }

    /// <summary>
    /// Called when visiting of a Body node is started; writes a start banner line.
    /// </summary>
    public override VisitorAction VisitBodyStart(Body body)
    {
        // We can detect beginning and end of all composite nodes such as Section, Body,
        // Table, Paragraph etc and provide custom handling for them.
        // The banner is written directly to the buffer, bypassing field suppression.
        mBuilder.Append("*** Body Started ***\r\n");
        return VisitorAction.Continue;
    }

    /// <summary>
    /// Called when visiting of a Body node is ended; writes an end banner line.
    /// </summary>
    public override VisitorAction VisitBodyEnd(Body body)
    {
        mBuilder.Append("*** Body Ended ***\r\n");
        return VisitorAction.Continue;
    }

    /// <summary>
    /// Called when a HeaderFooter node is encountered in the document.
    /// </summary>
    public override VisitorAction VisitHeaderFooterStart(HeaderFooter headerFooter)
    {
        // Returning this value from a visitor method causes visiting of this
        // node to stop and move on to visiting the next sibling node.
        // The net effect in this example is that the text of headers and footers
        // is not included in the resulting output.
        return VisitorAction.SkipThisNode;
    }

    /// <summary>
    /// Adds text to the current output unless the visitor is currently inside the
    /// code portion of at least one open field.
    /// </summary>
    private void AppendText(string text)
    {
        if (mFieldCodeDepth == 0)
        {
            mBuilder.Append(text);
        }
    }

    // Accumulated plain-text output.
    private readonly StringBuilder mBuilder;

    // One entry per currently open field, innermost on top:
    // true while that field is still in its code portion, false once its separator was seen.
    private readonly System.Collections.Generic.Stack<bool> mOpenFields;

    // Number of open fields that are still in their code portion; output is enabled only at 0.
    private int mFieldCodeDepth;
}
当我运行此代码时,只提取了部分内容,而不是全部内容。为什么?
答案 0 :(得分:0)
尝试像这样遍历每个段落:
// Load the Word document from disk.
Document doc = new Document(@"D:\a.docx");
var mBuilder = new StringBuilder();

// Enumerate every paragraph in the document (the second argument makes the search deep)
// directly from the document; no DocumentBuilder or intermediate array/list copies are needed.
foreach (Paragraph paragraph in doc.GetChildNodes(NodeType.Paragraph, true))
{
    // Concatenate the text of the runs that are direct children of this paragraph.
    foreach (Run run in paragraph.Runs)
    {
        mBuilder.Append(run.Text);
    }

    // Terminate each paragraph with a line break.
    mBuilder.Append(Environment.NewLine);
}

// Write the collected text, then keep the console window open until Enter is pressed.
System.IO.File.WriteAllText(@"c:/a.txt", mBuilder.ToString());
Console.ReadLine();
答案 1 :(得分:0)
使用 Aspose.Words for .NET API,您可以通过以下简单代码轻松地将 Word 文档转换为 TXT 格式:
// Load the source document, then save it through the TXT exporter with default options.
var wordDocument = new Document(MyDir + @"a.docx");
var textSaveOptions = new TxtSaveOptions();
wordDocument.Save(MyDir + @"a.txt", textSaveOptions);
另一种可用于获取整个Word文档的文本表示的方法如下:
// Load the source document and render it to a plain-text string.
var wordDocument = new Document(MyDir + @"a.docx");
string plainText = wordDocument.ToString(SaveFormat.Text);

// Write the rendered text next to the source file.
System.IO.File.WriteAllText(MyDir + @"a.txt", plainText);
我在 Aspose 担任开发者布道师(Developer Evangelist)。