.Docx文档似乎没有被编入索引。
我在.docx中使用了一个唯一的字符串,但是当我搜索“one”时,不会返回.docx。
例如,这里是以下文字:
“Here is the text on line one
and here is the text on line two.”(两行文字,中间以换行符分隔)
将通过iFilter提取为:
“Here is the text on line oneand here is the text on line two.”
因此,当IFilter解析.docx时,它会删除换行符分隔符,并尝试解析“oneand here”……
所以似乎.docx的Word ifilter将一行的最后一个单词与下一行的第一个单词连接起来。
有人能提出一些如何解决这个问题的想法吗?
提前致谢。
答案 0 :(得分:2)
好的,我现在想出来了。基本上,64位IFilter无法正常工作:它会把由换行符分隔的单词合并在一起,而不会保留换行符。我使用Ionic.Zip访问docx的zip存档,并使用稍作修改的DocxToText版本解析其中重要的xml文件。现在一切运行正常。
以下是最初由Jevgenij Pankov创建的修改后的代码
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Ionic.Zip;
using System.IO;
using System.Xml;
/// <summary>
/// Extracts the plain text of a .docx file by opening the file as a zip
/// archive and reading its main document XML part directly.
/// </summary>
public class DocxToText
{
    private const string ContentTypeNamespace =
        @"http://schemas.openxmlformats.org/package/2006/content-types";
    private const string WordprocessingMlNamespace =
        @"http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    // Selects the <Override> entry of "[Content_Types].xml" that declares the
    // main WordprocessingML document part.
    private const string DocumentXmlXPath =
        "/t:Types/t:Override[@ContentType=\"" +
        "application/vnd.openxmlformats-officedocument." +
        "wordprocessingml.document.main+xml\"]";
    private const string BodyXPath = "/w:document/w:body";

    private readonly string docxFile = "";

    // Zip entry name of the main document part; set by ExtractText().
    private string docxFileLocation = "";

    /// <summary>
    /// Creates an extractor for the given .docx file.
    /// </summary>
    /// <param name="fileName">Path of the .docx file to read.</param>
    public DocxToText(string fileName)
    {
        docxFile = fileName;
    }

    #region ExtractText()
    /// <summary>
    /// Extracts text from the Docx file.
    /// </summary>
    /// <returns>Extracted text.</returns>
    /// <exception cref="InvalidOperationException">
    /// No input file name was supplied to the constructor.
    /// </exception>
    /// <exception cref="InvalidDataException">
    /// The package does not declare a main document part.
    /// </exception>
    public string ExtractText()
    {
        if (string.IsNullOrEmpty(docxFile))
            throw new InvalidOperationException("Input file not specified.");

        // Usually it is "/word/document.xml"
        docxFileLocation = FindDocumentXmlLocation();
        if (string.IsNullOrEmpty(docxFileLocation))
            throw new InvalidDataException("It is not a valid Docx file.");

        return ReadDocumentXml();
    }
    #endregion

    #region FindDocumentXmlLocation()
    /// <summary>
    /// Gets location of the "document.xml" zip entry.
    /// </summary>
    /// <returns>
    /// Location of the "document.xml" without a leading '/', or null when
    /// "[Content_Types].xml" is missing or does not declare it.
    /// </returns>
    private string FindDocumentXmlLocation()
    {
        using (ZipFile zip = new ZipFile(docxFile))
        {
            ZipEntry entry = FindEntry(zip, "[Content_Types].xml");
            if (entry == null)
                return null;

            XmlDocument xmlDoc = LoadEntryXml(entry);

            // Create an XmlNamespaceManager for resolving namespaces
            XmlNamespaceManager nsmgr = new XmlNamespaceManager(xmlDoc.NameTable);
            nsmgr.AddNamespace("t", ContentTypeNamespace);

            // Find location of "document.xml"
            XmlElement node = xmlDoc.DocumentElement.SelectSingleNode(
                DocumentXmlXPath, nsmgr) as XmlElement;
            if (node == null)
                return null;

            // PartName is written with a leading '/', zip entry names are not.
            return node.GetAttribute("PartName").TrimStart('/');
        }
    }
    #endregion

    #region ReadDocumentXml()
    /// <summary>
    /// Reads "document.xml" zip entry.
    /// </summary>
    /// <returns>
    /// Text contained in the document body, or an empty string when the
    /// entry or its body element cannot be found.
    /// </returns>
    private string ReadDocumentXml()
    {
        using (ZipFile zip = new ZipFile(docxFile))
        {
            ZipEntry entry = FindEntry(zip, docxFileLocation);
            if (entry == null)
                return string.Empty;

            XmlDocument xmlDoc = LoadEntryXml(entry);

            XmlNamespaceManager nsmgr = new XmlNamespaceManager(xmlDoc.NameTable);
            nsmgr.AddNamespace("w", WordprocessingMlNamespace);

            XmlNode body = xmlDoc.DocumentElement.SelectSingleNode(BodyXPath, nsmgr);
            return ReadNode(body);
        }
    }
    #endregion

    #region ReadNode()
    /// <summary>
    /// Reads content of the node and its nested children.
    /// </summary>
    /// <param name="node">XmlNode.</param>
    /// <returns>
    /// Text contained in the node; empty when the node is null or not an element.
    /// </returns>
    private string ReadNode(XmlNode node)
    {
        if (node == null || node.NodeType != XmlNodeType.Element)
            return string.Empty;

        StringBuilder sb = new StringBuilder();
        AppendNodeText(node, sb);
        return sb.ToString();
    }

    /// <summary>
    /// Appends the text of every element below <paramref name="node"/> to
    /// <paramref name="sb"/>, walking the tree depth-first.
    /// </summary>
    private static void AppendNodeText(XmlNode node, StringBuilder sb)
    {
        foreach (XmlNode child in node.ChildNodes)
        {
            if (child.NodeType != XmlNodeType.Element) continue;

            switch (child.LocalName)
            {
                case "t": // Text
                    string space = ((XmlElement)child).GetAttribute("xml:space");
                    if (string.Equals(space, "preserve", StringComparison.Ordinal))
                    {
                        // Emit the run exactly as stored. Trimming it and then
                        // adding a space put a space after runs that only had
                        // leading whitespace, which split a word whenever it
                        // was stored as two adjacent runs.
                        sb.Append(child.InnerText);
                    }
                    else
                    {
                        sb.Append(child.InnerText.TrimEnd());
                    }
                    break;
                case "cr": // Carriage return
                case "br": // Break (line break unless a type attribute says otherwise)
                    sb.Append(Environment.NewLine);
                    break;
                case "tab": // Tab
                    sb.Append("\t");
                    break;
                case "p": // Paragraph: contents followed by a blank line
                    AppendNodeText(child, sb);
                    sb.Append(Environment.NewLine);
                    sb.Append(Environment.NewLine);
                    break;
                default:
                    AppendNodeText(child, sb);
                    break;
            }
        }
    }
    #endregion

    #region Helpers
    /// <summary>
    /// Returns the zip entry whose name equals <paramref name="entryName"/>
    /// ignoring case, or null when there is none.
    /// </summary>
    private static ZipEntry FindEntry(ZipFile zip, string entryName)
    {
        foreach (ZipEntry entry in zip)
        {
            // Entry names are identifiers, not linguistic text: compare ordinally.
            if (string.Equals(entry.FileName, entryName,
                    StringComparison.OrdinalIgnoreCase))
                return entry;
        }
        return null;
    }

    /// <summary>
    /// Extracts a zip entry into memory and parses it as XML, keeping all
    /// whitespace so that spaces inside text runs survive.
    /// </summary>
    private static XmlDocument LoadEntryXml(ZipEntry entry)
    {
        XmlDocument xmlDoc = new XmlDocument();
        xmlDoc.PreserveWhitespace = true;
        // The document may come from an untrusted source; never fetch
        // external DTDs or entities while parsing it.
        xmlDoc.XmlResolver = null;

        using (MemoryStream stream = new MemoryStream())
        {
            entry.Extract(stream);
            stream.Position = 0;
            xmlDoc.Load(stream);
        }
        return xmlDoc;
    }
    #endregion
}
以下是此代码的用法...
// Create an extractor for the .docx file at 'filepath' (a path string supplied by the caller).
DocxToText dtt = new DocxToText(filepath);
// Returns the document text; throws if the path is empty or the file is not a valid .docx package.
string docxText = dtt.ExtractText();
答案 1 :(得分:1)
将光标放在单词的中间并保存文档,会导致该单词被拆分到两个XML标记中,中间夹着一个“_GoBack”书签。结果是,使用此例程解析后,这两个字符串片段之间会被插入一个空格,而不是合并回一个字符串。处理“_GoBack”这种情况很容易,但可能还存在其他情况,例如“修订”(Track Changes),谁知道还有什么。
DOCX是否存在更详细的解析算法?