word ifilter for docx parser error

时间:2009-12-21 09:51:04

标签: ifilter

.Docx文档似乎没有被编入索引。

我在.docx中使用了一个唯一的字符串,但是当我搜索“one”时,不会返回.docx。

例如,这里是以下文字:

“Here is text of line one
and here is text of the second line.”

将通过iFilter提取为:

“Here is text of line oneand here is text of the second line.”

因此,IFilter在解析.docx时会删除换行分隔符,并尝试把“oneand here……”当作连续的文本来解析。

所以似乎.docx的Word ifilter将一行的最后一个单词与下一行的第一个单词连接起来。

有人能提出一些如何解决这个问题的想法吗?

提前致谢。

2 个答案:

答案 0 :(得分:2)

好的,我现在想出来了。基本上64位IFilter无法正常工作:它会把由换行符分隔的单词合并在一起,而不保留换行符。我使用Ionic.Zip访问docx的zip存档,并使用稍作修改的DocxToText解析其中关键的xml文件。现在一切工作正常。

以下是修改后的代码,原始版本由Jevgenij Pankov编写:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Ionic.Zip;
using System.IO;
using System.Xml;

/// <summary>
/// Extracts the plain text of a .docx file by reading the package's XML parts
/// directly from the zip archive (via Ionic.Zip) instead of going through an IFilter.
/// </summary>
public class DocxToText
{
    private const string ContentTypeNamespace =
        @"http://schemas.openxmlformats.org/package/2006/content-types";

    private const string WordprocessingMlNamespace =
        @"http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    // Selects the <Override> element of "[Content_Types].xml" that declares
    // the main document part.
    private const string DocumentXmlXPath =
        "/t:Types/t:Override[@ContentType=\"" +
        "application/vnd.openxmlformats-officedocument." +
        "wordprocessingml.document.main+xml\"]";

    private const string BodyXPath = "/w:document/w:body";

    private const string ContentTypesEntryName = "[Content_Types].xml";

    // Path of the .docx file on disk, as given to the constructor.
    private readonly string docxFile;

    // Zip entry name of the main document part; resolved by ExtractText().
    private string docxFileLocation = "";

    /// <summary>
    /// Creates an extractor for the given .docx file. The file is not opened
    /// until <see cref="ExtractText"/> is called.
    /// </summary>
    /// <param name="fileName">Path of the .docx file to read.</param>
    public DocxToText(string fileName)
    {
        docxFile = fileName;
    }

    #region ExtractText()
    /// <summary>
    /// Extracts text from the Docx file.
    /// </summary>
    /// <returns>Extracted text.</returns>
    /// <exception cref="InvalidOperationException">
    /// No file name was supplied to the constructor.
    /// </exception>
    /// <exception cref="InvalidDataException">
    /// The archive does not declare a main document part.
    /// </exception>
    public string ExtractText()
    {
        if (string.IsNullOrEmpty(docxFile))
        {
            throw new InvalidOperationException("Input file not specified.");
        }

        // Usually it is "word/document.xml", but the authoritative location is
        // whatever "[Content_Types].xml" declares.
        docxFileLocation = FindDocumentXmlLocation();

        if (string.IsNullOrEmpty(docxFileLocation))
        {
            throw new InvalidDataException("It is not a valid Docx file.");
        }

        return ReadDocumentXml();
    }
    #endregion

    #region FindDocumentXmlLocation()
    /// <summary>
    /// Gets location of the "document.xml" zip entry.
    /// </summary>
    /// <returns>
    /// Location of the "document.xml" without a leading '/', or <c>null</c> when
    /// the archive has no "[Content_Types].xml" entry or that entry does not
    /// declare a main document part.
    /// </returns>
    private string FindDocumentXmlLocation()
    {
        using (ZipFile zip = new ZipFile(docxFile))
        {
            ZipEntry contentTypes = FindEntry(zip, ContentTypesEntryName);
            if (contentTypes == null)
            {
                return null;
            }

            XmlDocument xmlDoc = LoadXml(contentTypes);

            // Create an XmlNamespaceManager for resolving namespaces.
            XmlNamespaceManager nsmgr = new XmlNamespaceManager(xmlDoc.NameTable);
            nsmgr.AddNamespace("t", ContentTypeNamespace);

            // Find location of "document.xml".
            XmlElement overrideElement =
                xmlDoc.DocumentElement.SelectSingleNode(DocumentXmlXPath, nsmgr) as XmlElement;
            if (overrideElement == null)
            {
                return null;
            }

            // PartName is written with a leading '/'; the zip entry names this
            // value is later compared against are matched without it.
            return overrideElement.GetAttribute("PartName").TrimStart('/');
        }
    }
    #endregion

    #region ReadDocumentXml()
    /// <summary>
    /// Reads "document.xml" zip entry.
    /// </summary>
    /// <returns>
    /// Text contained in the document, or an empty string when the entry or
    /// its body element cannot be found.
    /// </returns>
    private string ReadDocumentXml()
    {
        using (ZipFile zip = new ZipFile(docxFile))
        {
            ZipEntry documentEntry = FindEntry(zip, docxFileLocation);
            if (documentEntry == null)
            {
                return string.Empty;
            }

            XmlDocument xmlDoc = LoadXml(documentEntry);

            XmlNamespaceManager nsmgr = new XmlNamespaceManager(xmlDoc.NameTable);
            nsmgr.AddNamespace("w", WordprocessingMlNamespace);

            XmlNode body = xmlDoc.DocumentElement.SelectSingleNode(BodyXPath, nsmgr);

            // ReadNode returns an empty string for a missing body.
            return ReadNode(body);
        }
    }
    #endregion

    #region ReadNode()
    /// <summary>
    /// Reads content of the node and its nested children.
    /// </summary>
    /// <param name="node">XmlNode.</param>
    /// <returns>Text contained in the node.</returns>
    private string ReadNode(XmlNode node)
    {
        StringBuilder sb = new StringBuilder();
        AppendNodeText(node, sb);
        return sb.ToString();
    }

    /// <summary>
    /// Appends the text of <paramref name="node"/>'s descendant elements to
    /// <paramref name="sb"/>. A single builder is shared by the whole recursion
    /// so that nested elements do not create and copy intermediate strings.
    /// </summary>
    /// <param name="node">Element whose children are read; anything else is ignored.</param>
    /// <param name="sb">Builder that receives the text.</param>
    private static void AppendNodeText(XmlNode node, StringBuilder sb)
    {
        if (node == null || node.NodeType != XmlNodeType.Element)
        {
            return;
        }

        foreach (XmlNode child in node.ChildNodes)
        {
            if (child.NodeType != XmlNodeType.Element)
            {
                continue;
            }

            switch (child.LocalName)
            {
                case "t":                           // Text
                {
                    string space = ((XmlElement)child).GetAttribute("xml:space");
                    if (space == "preserve")
                    {
                        // Whitespace is significant here, so copy the run verbatim.
                        // Trimming and then unconditionally adding a space would put
                        // a blank inside a word that is split across two runs, and
                        // would collapse real trailing whitespace.
                        sb.Append(child.InnerText);
                    }
                    else
                    {
                        sb.Append(child.InnerText.TrimEnd());
                    }
                    break;
                }

                case "cr":                          // Carriage return
                case "br":                          // Break (line, page or column)
                    sb.Append(Environment.NewLine);
                    break;

                case "tab":                         // Tab
                    sb.Append("\t");
                    break;

                case "p":                           // Paragraph
                    AppendNodeText(child, sb);
                    // A blank line separates paragraphs in the output.
                    sb.Append(Environment.NewLine);
                    sb.Append(Environment.NewLine);
                    break;

                default:
                    AppendNodeText(child, sb);
                    break;
            }
        }
    }
    #endregion

    #region Helpers
    /// <summary>
    /// Finds the first entry of <paramref name="zip"/> whose name equals
    /// <paramref name="entryName"/>, ignoring case.
    /// </summary>
    /// <param name="zip">Open archive to search.</param>
    /// <param name="entryName">Entry name to look for.</param>
    /// <returns>The matching entry, or <c>null</c> when there is none.</returns>
    private static ZipEntry FindEntry(ZipFile zip, string entryName)
    {
        foreach (ZipEntry entry in zip)
        {
            // Entry names are identifiers, not linguistic text, so compare
            // ordinally rather than with the current culture.
            if (string.Equals(entry.FileName, entryName, StringComparison.OrdinalIgnoreCase))
            {
                return entry;
            }
        }
        return null;
    }

    /// <summary>
    /// Extracts a zip entry into memory and parses it as XML, keeping whitespace.
    /// </summary>
    /// <param name="entry">Zip entry containing an XML part.</param>
    /// <returns>The parsed document.</returns>
    private static XmlDocument LoadXml(ZipEntry entry)
    {
        XmlDocument xmlDoc = new XmlDocument();
        xmlDoc.PreserveWhitespace = true;
        // The file may come from an untrusted source; do not let the parser
        // resolve external DTDs or entities.
        xmlDoc.XmlResolver = null;

        using (MemoryStream stream = new MemoryStream())
        {
            entry.Extract(stream);
            stream.Position = 0;    // rewind before parsing what was just written
            xmlDoc.Load(stream);
        }

        return xmlDoc;
    }
    #endregion
}

以下是此代码的用法...

// Create an extractor for the .docx at `filepath`, then pull out its plain text.
var extractor = new DocxToText(filepath);
string docxText = extractor.ExtractText();

答案 1 :(得分:1)

将光标放在某个单词中间并保存文档,会使该单词被拆分到两个XML标记中,二者之间带有一个“_GoBack”书签。结果是,用此例程解析后,这两个字符串片段之间会多出一个空格,而不是被合并回同一个单词。处理“_GoBack”的情况很容易,但可能还有其他类似的情况,例如“修订”(Track Changes),以及其他未知的场景。

DOCX是否存在更详细的解析算法?