Question

我正在尝试提取Word文档中的所有单词。像下面这样一次性处理整个文档是可以做到的：

// Starts Word through COM interop, opens the sample document and walks every
// word range in it in a single pass.
// NOTE(review): doc, wordPosition, index and charLocation are not declared in
// this snippet; their types are presumably defined by the surrounding code.
Word.Application word = new Word.Application();
doc = word.Documents.Open(@"C:\SampleText.doc");
doc.Activate();

foreach (Word.Range docRange in doc.Words) // loads all words in document
{
    // Builds one suffix of the range text per start offset, then orders them.
    // The number of offsets comes from the trimmed text, while Substring runs
    // on the untrimmed text. The sort key is the suffix itself when it is
    // shorter than 3 characters, otherwise the suffix with up to two
    // characters removed starting at index 2.
    IEnumerable<string> sortedSubstrings = Enumerable.Range(0, docRange.Text.Trim().Length)
        .Select(i => docRange.Text.Substring(i))
        .OrderBy(s => s.Length < 3 ? s : s.Remove(2, Math.Min(s.Length - 2, 2)));

    // Asks Word for the wdFirstCharacterColumnNumber value of this range.
    wordPosition =
        (int)
        docRange.get_Information(
            Microsoft.Office.Interop.Word.WdInformation.wdFirstCharacterColumnNumber);

    foreach (var substring in sortedSubstrings)
    {
        // Key = first occurrence of the suffix inside the range text, shifted
        // by the value obtained above; value = the suffix itself.
        index = docRange.Text.IndexOf(substring) + wordPosition;
        charLocation[index] = substring;
    }
}

但是我希望一次只加载文档中的一行……是否可以这样做？

我可以按段落加载文档，但是无法遍历段落来提取其中的所有单词。

// Attempted per-paragraph variant of the word walk: the outer loop visits
// each paragraph, the inner loop is meant to visit the words of that paragraph.
foreach (Word.Paragraph para in doc.Paragraphs)
{
    // Compile error: 'para' (a Word.Paragraph) is not enumerable.
    // NOTE(review): enumerating the paragraph's range (e.g. para.Range.Words)
    // is presumably what was intended here; confirm against the Word interop API.
    foreach (Word.Range docRange in para)
    {
        // Builds one suffix of the range text per start offset, then orders
        // them. The number of offsets comes from the trimmed text, while
        // Substring runs on the untrimmed text.
        IEnumerable<string> sortedSubstrings = Enumerable.Range(0, docRange.Text.Trim().Length)
            .Select(i => docRange.Text.Substring(i))
            .OrderBy(s => s.Length < 3 ? s : s.Remove(2, Math.Min(s.Length - 2, 2)));

        // Asks Word for the wdFirstCharacterColumnNumber value of this range.
        wordPosition =
            (int)
            docRange.get_Information(
                Microsoft.Office.Interop.Word.WdInformation.wdFirstCharacterColumnNumber);

        foreach (var substring in sortedSubstrings)
        {
            // Key = first occurrence of the suffix inside the range text,
            // shifted by the value obtained above; value = the suffix itself.
            index = docRange.Text.IndexOf(substring) + wordPosition;
            charLocation[index] = substring;
        }

    }
}

Answer 1

这有助于您逐行获取字符串。

    // Reads Answer.doc (located next to the executable) one line at a time by
    // growing and collapsing Word's Selection, appending each numbered line to
    // richTextBox1. The document and the Word instance are closed in finally
    // blocks so that a failing interop call does not leave a hidden Word
    // process running.
    object file = Path.GetDirectoryName(Application.ExecutablePath) + @"\Answer.doc";

    Word.Application wordObject = new Word.ApplicationClass();
    Word.Document docs = null;

    // Placeholder passed for every optional COM argument.
    object nullobject = Missing.Value;

    String strLine;
    bool bolEOF = false;
    int index = 0;

    try
    {
        wordObject.Visible = false;

        docs = wordObject.Documents.Open
            (ref file, ref nullobject, ref nullobject, ref nullobject,
            ref nullobject, ref nullobject, ref nullobject, ref nullobject,
            ref nullobject, ref nullobject, ref nullobject, ref nullobject,
            ref nullobject, ref nullobject, ref nullobject, ref nullobject);

        // Start with the first character of the document selected.
        docs.Characters[1].Select();

        do
        {
            // Extend the end of the selection by one wdLine unit.
            object unit = Word.WdUnits.wdLine;
            object count = 1;
            wordObject.Selection.MoveEnd(ref unit, ref count);

            strLine = wordObject.Selection.Text;
            richTextBox1.Text += ++index + " - " + strLine + "\r\n"; //for our understanding

            // Collapse to the end of the selection so that the next pass
            // starts where this one stopped.
            object direction = Word.WdCollapseDirection.wdCollapseEnd;
            wordObject.Selection.Collapse(ref direction);

            // Stop once the collapsed selection reports the \EndOfDoc bookmark.
            bolEOF = wordObject.Selection.Bookmarks.Exists(@"\EndOfDoc");
        } while (!bolEOF);
    }
    finally
    {
        try
        {
            // docs stays null when Documents.Open itself failed.
            if (docs != null)
            {
                docs.Close(ref nullobject, ref nullobject, ref nullobject);
            }
        }
        finally
        {
            // Quit even if closing the document threw.
            wordObject.Quit(ref nullobject, ref nullobject, ref nullobject);
        }
    }

    docs = null;
    wordObject = null;

这段代码的原作者见此处（Here，原文为链接）。请点击链接获取有关其工作原理的更多说明。

Answer 2

我建议您按照此页面（here）上的代码进行操作。

它的关键在于使用Word.ApplicationClass（Microsoft.Office.Interop.Word）对象来读取文档，不过我不清楚他是从哪里获得“Doc”对象的。我猜是通过ApplicationClass创建的。

编辑：文档对象是通过如下调用获取的：

// Opens the document through wordApp.Documents.Open and keeps the returned
// Word.Document in doc. wordApp, file and nullobj are declared outside this snippet.
// NOTE(review): this call passes 12 arguments, whereas the other snippets on
// this page pass 16 to Documents.Open; presumably it targets an older interop
// version. Confirm against the referenced assembly.
Word.Document doc = wordApp.Documents.Open(ref file, ref nullobj, ref nullobj,
                                      ref nullobj, ref nullobj, ref nullobj,
                                      ref nullobj, ref nullobj, ref nullobj,
                                      ref nullobj, ref nullobj, ref nullobj);

遗憾的是，我链接的页面上的代码排版不佳，不太容易阅读。

EDIT2：从那里你可以循环访问doc段落，但是据我所知，没有办法循环遍历行。我建议使用一些模式匹配来查找换行符。

要从段落中提取文本，请使用Word.Paragraph.Range.Text，它会返回段落中的所有文本。然后，您必须自行搜索换行符。我会使用string.IndexOf()。

或者，如果您想逐句提取文本，则可以直接遍历Range.Sentences。

Answer 3

        // Opens D:\viewstate.docx read-only, extends Word's Selection by
        // `count` wdLine units and shows the selected text in TextBox1.
        // The document and the Word instance are closed in finally blocks so
        // that a failing interop call does not leave a Word process running.
        Microsoft.Office.Interop.Word.Application word = new Microsoft.Office.Interop.Word.Application();
        Microsoft.Office.Interop.Word.Document docs = null;

        // Placeholder passed for every optional COM argument.
        object miss = System.Reflection.Missing.Value;
        object path = @"D:\viewstate.docx";
        object readOnly = true;
        string totaltext = "";

        // Number of wdLine units the selection end is moved by.
        object unit = Microsoft.Office.Interop.Word.WdUnits.wdLine;
        object count = 1;

        try
        {
            docs = word.Documents.Open(
                ref path, ref miss, ref readOnly, ref miss,
                ref miss, ref miss, ref miss, ref miss,
                ref miss, ref miss, ref miss, ref miss,
                ref miss, ref miss, ref miss, ref miss);

            word.Selection.MoveEnd(ref unit, ref count);
            totaltext = word.Selection.Text;

            TextBox1.Text = totaltext;
        }
        finally
        {
            try
            {
                // docs stays null when Documents.Open itself failed.
                if (docs != null)
                {
                    docs.Close(ref miss, ref miss, ref miss);
                }
            }
            finally
            {
                // Quit even if closing the document threw.
                word.Quit(ref miss, ref miss, ref miss);
            }
        }

        docs = null;
        word = null;

每多读取一行，就将count的值加一。

Answer 4

我建议使用DocX库。它很轻巧，不需要在机器上安装Word。以下是用于逐行获取文本的代码：

// Loads sample.docx with DocX and prints the text of every paragraph, split
// on "\n" so that each non-empty piece is written on its own console line.
using (DocX document = DocX.Load("sample.docx"))
{
    string[] lineBreak = new string[] { "\n" };

    foreach (var paragraph in document.Paragraphs)
    {
        // Empty pieces (e.g. from consecutive breaks) are dropped.
        string[] pieces = paragraph.Text.Split(lineBreak, StringSplitOptions.RemoveEmptyEntries);

        foreach (string piece in pieces)
        {
            Console.WriteLine(piece);
        }
    }
}

有没有办法逐行读取Word文档？

4 个答案: