对于C#Web应用程序,我想索引存储在数据库中的PDF,DOC等文件的文本。
我一直在试验 Code Project 上的一个 IFilter 示例,它适用于文件系统中的文件,但我的文件存储在 MS-SQL 数据库中。
任何人都可以帮我找一个示例来从存储在数据库中的文件中提取文本,或者知道如何修改Code Project代码以使用数据库而不是文件系统?
答案 0 :(得分:13)
经过几个小时的尝试,我终于想出了如何完成这项工作!我需要在存储在数据库中的PDF内容上运行IFilter,并且希望避免将数据保存到临时文件中。
首先,我尝试使用BindIFilterFromStream API为存储在Stream中的内容创建IFilter,但似乎它无法正常工作(至少不适用于此方案)。所以不要这样做。
相反,您需要为文件扩展名创建一个IFilter(或以其他方式获取它)。然后,您可以获取其IPersistStream COM接口,并使用它将PDF内容加载到IFilter中。其余的工作与处理文件时相同。但请注意,并非每个IFilter都实现了IPersistStream接口。它适用于Adobe PDF IFilter。
代码应如下所示(我删除了一些返回代码检查以使代码更具可读性,但是,您应该检查所有可能的返回代码)。
/// <summary>
/// Extracts the text of a PDF document held in a stream without writing a
/// temporary file: the content is copied into a COM stream and handed to the
/// registered ".pdf" IFilter through its IPersistStream interface.
/// </summary>
/// <param name="s">Readable stream; everything from its current position to the end is parsed.</param>
/// <returns>The text produced by <c>ExtractTextFromIFilter</c> for the loaded document.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
/// <exception cref="InvalidCastException">The loaded filter does not implement IPersistStream.</exception>
/// <remarks>Failing HRESULTs returned by the native calls are raised via Marshal.ThrowExceptionForHR.</remarks>
private string ParseIFilter(Stream s)
{
    if (s == null)
    {
        throw new ArgumentNullException("s");
    }

    IFilter filter = null;
    System.Runtime.InteropServices.ComTypes.IStream comStream = null;
    try
    {
        // Get an IFilter for a file or file extension
        FilterReturnCodes result = NativeMethods.LoadIFilter(".pdf", null, ref filter);
        if (result != FilterReturnCodes.S_OK)
        {
            Marshal.ThrowExceptionForHR((int)result);
        }
        if (filter == null)
        {
            throw new InvalidOperationException("LoadIFilter did not return a filter for '.pdf'.");
        }

        // A single Stream.Read may return fewer bytes than requested, so drain the stream completely.
        byte[] buffer;
        using (var managedCopy = new MemoryStream())
        {
            s.CopyTo(managedCopy);
            buffer = managedCopy.ToArray();
        }

        // Create a COM stream. Passing IntPtr.Zero lets COM allocate the backing memory itself,
        // and fDeleteOnRelease = true frees it when the stream is released.
        Marshal.ThrowExceptionForHR(
            NativeMethods.CreateStreamOnHGlobal(IntPtr.Zero, true, out comStream));
        comStream.Write(buffer, buffer.Length, IntPtr.Zero);
        comStream.Seek(0, 0 /* STREAM_SEEK_SET */, IntPtr.Zero); // rewind so the filter reads from the start

        // Load the contents to the iFilter using IPersistStream interface
        var persistStream = (IPersistStream)filter;
        persistStream.Load(comStream);

        // Initialize iFilter
        FilterFlags filterFlags;
        result = filter.Init(
            FilterInit.IFILTER_INIT_INDEXING_ONLY, 0, IntPtr.Zero, out filterFlags);
        if (result != FilterReturnCodes.S_OK)
        {
            Marshal.ThrowExceptionForHR((int)result);
        }

        return ExtractTextFromIFilter(filter);
    }
    finally
    {
        // Release the filter before the stream it may still be reading from.
        if (filter != null)
        {
            Marshal.ReleaseComObject(filter);
        }
        if (comStream != null)
        {
            Marshal.ReleaseComObject(comStream);
        }
    }
}
过滤器中的文本提取在我的代码中看起来像这样。网上有很多这样的例子,它可以根据你的需要以多种方式实现。
/// <summary>
/// Walks every chunk of an initialized filter and concatenates the text of
/// the chunks that carry text.
/// </summary>
/// <param name="filter">Filter that has already been loaded and initialized.</param>
/// <returns>All text collected until the filter reports FILTER_E_END_OF_CHUNKS.</returns>
/// <remarks>Any other failing HRESULT from GetChunk is raised via Marshal.ThrowExceptionForHR.</remarks>
private string ExtractTextFromIFilter(IFilter filter)
{
    var sb = new StringBuilder();
    while (true)
    {
        StatChunk chunk;
        FilterReturnCodes result = filter.GetChunk(out chunk);
        if (result == FilterReturnCodes.S_OK)
        {
            // flags is a bit set, so test the text bit instead of comparing for equality
            if ((chunk.flags & ChunkState.CHUNK_TEXT) != 0)
            {
                sb.Append(ExtractTextFromChunk(filter, chunk));
            }
            continue;
        }
        if (result == FilterReturnCodes.FILTER_E_END_OF_CHUNKS)
        {
            return sb.ToString();
        }
        if (result == FilterReturnCodes.FILTER_E_EMBEDDING_UNAVAILABLE
            || result == FilterReturnCodes.FILTER_E_LINK_UNAVAILABLE)
        {
            // Only this embedded/linked chunk is unavailable; keep going with the next one.
            continue;
        }
        Marshal.ThrowExceptionForHR((int)result);
        // ThrowExceptionForHR returns for non-failure codes; fail loudly rather than spin forever.
        throw new InvalidOperationException("Unexpected IFilter.GetChunk result: " + result);
    }
}
/// <summary>
/// Reads all text of the filter's current chunk by calling GetText repeatedly.
/// </summary>
/// <param name="filter">Filter positioned on a text chunk.</param>
/// <param name="chunk">Descriptor of the current chunk (not inspected here).</param>
/// <returns>The chunk text; empty when the chunk has none.</returns>
private string ExtractTextFromChunk(IFilter filter, StatChunk chunk)
{
    const int BufferCapacity = 16384;
    var sb = new StringBuilder();
    while (true)
    {
        int sizeBuffer = BufferCapacity;
        var buffer = new StringBuilder(sizeBuffer);
        FilterReturnCodes result = filter.GetText(ref sizeBuffer, buffer);

        bool deliveredText = result == FilterReturnCodes.S_OK
            || result == FilterReturnCodes.FILTER_S_LAST_TEXT;
        if (deliveredText && sizeBuffer > 0 && buffer.Length > 0)
        {
            // The reported count can exceed the marshalled builder length
            // (e.g. an embedded NUL); clamp so ToString cannot throw.
            sb.Append(buffer.ToString(0, Math.Min(sizeBuffer, buffer.Length)));
        }

        // Only S_OK means more text may follow. FILTER_S_LAST_TEXT, FILTER_E_NO_MORE_TEXT,
        // FILTER_E_NO_TEXT and any other code end the chunk; text gathered so far is kept.
        if (result != FilterReturnCodes.S_OK)
        {
            return sb.ToString();
        }
    }
}
以下是本机方法的定义及其使用的结构。
/// <summary>
/// P/Invoke declarations for the two native entry points used by this sample.
/// </summary>
internal static class NativeMethods
{
// Binding for LoadIFilter in query.dll. In this file it is called with a file
// extension (".pdf") as pwcsPath and null for pUnkOuter; the filter comes back in ppIUnk.
[DllImport("query.dll", SetLastError = true, CharSet = CharSet.Unicode)]
public static extern FilterReturnCodes LoadIFilter(
string pwcsPath,
[MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter,
ref IFilter ppIUnk);
// Binding for CreateStreamOnHGlobal in ole32.dll. The int return value is the raw
// HRESULT; callers have to check it themselves.
[DllImport("ole32.dll")]
public static extern int CreateStreamOnHGlobal(IntPtr hGlobal, bool fDeleteOnRelease, out IStream ppstm);
}
/// <summary>
/// Managed declaration of the COM IFilter interface (IID 89BCB740-6119-101A-BCB7-00DD010655AF).
/// Every method is [PreserveSig], so results come back as FilterReturnCodes values
/// instead of exceptions. Do not reorder the methods: COM interop maps them to
/// vtable slots by declaration order.
/// </summary>
[ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IFilter
{
// Called once after the content has been loaded and before the first GetChunk.
[PreserveSig]
FilterReturnCodes Init(FilterInit grfFlags, int cAttributes, IntPtr aAttributes, out FilterFlags pdwFlags);
// Advances to the next chunk and describes it in pStat.
[PreserveSig]
FilterReturnCodes GetChunk(out StatChunk pStat);
// pcwcBuffer is passed by ref: callers in this file set it to the buffer capacity
// and read it back as the number of characters returned.
[PreserveSig]
FilterReturnCodes GetText(
ref int pcwcBuffer,
[Out, MarshalAs(UnmanagedType.LPWStr)] StringBuilder awcBuffer);
// Not used by the code in this file.
[PreserveSig]
FilterReturnCodes GetValue(ref IntPtr propVal);
// Not used by the code in this file.
[PreserveSig]
FilterReturnCodes BindRegion(ref FilterRegion origPos, ref Guid riid, ref object ppunk);
}
/// <summary>
/// Managed declaration of the COM IPersist interface (IID 0000010c-0000-0000-C000-000000000046),
/// the base interface of IPersistStream below.
/// </summary>
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown), Guid("0000010c-0000-0000-C000-000000000046")]
public interface IPersist
{
// No [PreserveSig]: a failing HRESULT surfaces as an exception.
void GetClassID(out Guid pClassID);
}
/// <summary>
/// Managed declaration of the COM IPersistStream interface (IID 00000109-0000-0000-C000-000000000046).
/// ParseIFilter casts the loaded filter to this interface and calls Load to feed it
/// the document content. Keep the declaration order: it defines the vtable layout.
/// </summary>
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown), Guid("00000109-0000-0000-C000-000000000046")]
public interface IPersistStream : IPersist
{
// Re-declared with 'new' so the inherited method occupies the first slot of this
// interface's own method list (presumably required for the COM vtable layout).
new void GetClassID(out Guid pClassID);
// [PreserveSig]: the raw HRESULT is returned as an int.
[PreserveSig]
int IsDirty();
// Not [PreserveSig]: a failing HRESULT surfaces as an exception.
void Load([In] IStream pStm);
void Save(
[In] IStream pStm,
[In, MarshalAs(UnmanagedType.Bool)] bool fClearDirty);
void GetSizeMax(out long pcbSize);
}
/// <summary>
/// Chunk descriptor filled in by IFilter.GetChunk. Field order and sizes matter
/// because the struct is marshalled to and from native code.
/// </summary>
public struct StatChunk
{
// Chunk identifier reported by the filter.
public int idChunk;
// Kind of break separating this chunk from the previous one (see ChunkBreaktype).
[MarshalAs(UnmanagedType.U4)]
public ChunkBreaktype breakType;
// Whether the chunk carries text and/or a value (see ChunkState).
[MarshalAs(UnmanagedType.U4)]
public ChunkState flags;
// Presumably the locale (LCID) of the chunk text - not read by the code in this file.
public int locale;
// Property the chunk belongs to.
public FullPropSpec attribute;
// The remaining three fields are not read by the code in this file.
public int idChunkSource;
public int cwcStartSource;
public int cwcLenSource;
}
/// <summary>
/// Break type reported in StatChunk.breakType. The EOW/EOS/EOP/EOC suffixes
/// presumably stand for end of word, sentence, paragraph and chapter.
/// </summary>
public enum ChunkBreaktype
{
    /// <summary>No break.</summary>
    CHUNK_NO_BREAK = 0x0,
    CHUNK_EOW = 0x1,
    CHUNK_EOS = 0x2,
    CHUNK_EOP = 0x3,
    CHUNK_EOC = 0x4,
}
/// <summary>
/// Content kind reported in StatChunk.flags. The values are independent bits,
/// so the enum is marked [Flags] and should be tested with a bitwise AND.
/// </summary>
[Flags]
public enum ChunkState
{
    /// <summary>The chunk carries text.</summary>
    CHUNK_TEXT = 0x1,
    /// <summary>The chunk carries a property value.</summary>
    CHUNK_VALUE = 0x2,
    CHUNK_FILTER_OWNED_VALUE = 0x4
}
/// <summary>
/// Flags written to the pdwFlags output of IFilter.Init.
/// </summary>
[Flags]
public enum FilterFlags
{
    IFILTER_FLAGS_OLE_PROPERTIES = 0x1,
}
/// <summary>
/// Bit flags passed as grfFlags to IFilter.Init. Members are listed in
/// ascending bit order.
/// </summary>
[Flags]
public enum FilterInit
{
    IFILTER_INIT_CANON_PARAGRAPHS = 0x001,
    IFILTER_INIT_HARD_LINE_BREAKS = 0x002,
    IFILTER_INIT_CANON_HYPHENS = 0x004,
    IFILTER_INIT_CANON_SPACES = 0x008,
    IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 0x010,
    IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 0x020,
    IFILTER_INIT_INDEXING_ONLY = 0x040,
    IFILTER_INIT_SEARCH_LINKS = 0x080,
    IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 0x100,
    IFILTER_INIT_FILTER_OWNED_VALUE_OK = 0x200,
}
/// <summary>
/// Region descriptor used only as the origPos parameter of IFilter.BindRegion.
/// Three 32-bit fields; the order matters because the struct crosses the interop boundary.
/// </summary>
public struct FilterRegion
{
// Chunk the region refers to.
public int idChunk;
// Presumably the start offset within the chunk, in characters - TODO confirm.
public int cwcStart;
// Presumably the length of the region, in characters - TODO confirm.
public int cwcExtent;
}
/// <summary>
/// HRESULT values returned by LoadIFilter and the IFilter methods.
/// Values with the high bit set (0x8xxxxxxx) are failures; FILTER_S_* and
/// FILTER_W_* are success codes that carry extra information.
/// </summary>
public enum FilterReturnCodes : uint
{
    S_OK = 0,
    E_ACCESSDENIED = 0x80070005,
    E_HANDLE = 0x80070006,
    E_INVALIDARG = 0x80070057,
    E_OUTOFMEMORY = 0x8007000E,
    E_NOTIMPL = 0x80004001,
    // Win32 E_FAIL is 0x80004005; 0x80000008 is the legacy 16-bit value and is never returned here.
    E_FAIL = 0x80004005,
    FILTER_E_PASSWORD = 0x8004170B,
    FILTER_E_UNKNOWNFORMAT = 0x8004170C,
    FILTER_E_NO_TEXT = 0x80041705,
    FILTER_E_NO_VALUES = 0x80041706,
    FILTER_E_END_OF_CHUNKS = 0x80041700,
    FILTER_E_NO_MORE_TEXT = 0x80041701,
    FILTER_E_NO_MORE_VALUES = 0x80041702,
    FILTER_E_ACCESS = 0x80041703,
    FILTER_W_MONIKER_CLIPPED = 0x00041704,
    FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
    FILTER_E_LINK_UNAVAILABLE = 0x80041708,
    FILTER_S_LAST_TEXT = 0x00041709,
    FILTER_S_LAST_VALUES = 0x0004170A
}
/// <summary>
/// Property identifier embedded in StatChunk.attribute: a property-set GUID
/// plus the property specifier within that set.
/// </summary>
public struct FullPropSpec
{
// GUID of the property set.
public Guid guidPropSet;
// Property within the set (see PropSpec).
public PropSpec psProperty;
}
/// <summary>
/// Managed mirror of the native PROPSPEC: a 32-bit kind followed by a union of
/// a 32-bit property id and a string pointer.
/// </summary>
/// <remarks>
/// The union contains a pointer, so it is pointer-aligned: offset 4 in a 32-bit
/// process but offset 8 in a 64-bit one. A hard-coded FieldOffset(4) is therefore
/// wrong on 64-bit and shifts every field that follows inside StatChunk. Sequential
/// layout with a pointer-sized slot gives the right offset on both platforms.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
public struct PropSpec
{
    /// <summary>Discriminator that tells which union member is valid.</summary>
    public int ulKind;

    // Pointer-sized storage for the union; pointer alignment is what places it correctly.
    private IntPtr _value;

    /// <summary>Union member: numeric property id (low 32 bits of the slot).</summary>
    public int propid
    {
        get { return unchecked((int)_value.ToInt64()); }
        set { _value = new IntPtr(value); }
    }

    /// <summary>Union member: pointer to the property name string.</summary>
    public IntPtr lpwstr
    {
        get { return _value; }
        set { _value = value; }
    }
}
答案 1 :(得分:1)
我过去曾致力于提供一个iFilter,旨在为AutoCad dwg文件中的文本内容提供任何搜索/索引工具访问。你可以在这里阅读我的一些冒险经历:http://blogs.msdn.com/b/ifilter/archive/2006/12/25/chronicles-of-an-ifilter-development-inception-to-deployment.aspx
您所指的代码是旧的,但仍然有效。但是,除了GetTextFromFile之外,现在还有更多接口可用。您需要使用流读取器,通过IPersistStream读取内容(参见我上面提到的链接)。如果我正确理解了您要做的事情,您需要将数据库中的文件作为流打开,并将此流提供给搜索/索引器或您选择的iFilter。
祝你好运, 马可
答案 2 :(得分:1)
我希望做同样的事情,但我最终在TextContent的数据库表中添加了另一列。我将BinaryContent保存到一个临时文件中,使用CodeProject库Epocalisde.IFilter dll查找Text,并将其添加到TextContent列。
答案 3 :(得分:0)
在Mareks示例的基础上,我采用了IStream接口的托管实现来创建COM流,而不是通过Marshal.AllocHGlobal分配内存。
它可以与Adobe PDF iFilter 64 11.0.01以及大量其他格式一起使用,例如.doc、.docx、.html、.odt、.rtf,等等。
完整示例:
using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
namespace TextExtraction
{
class Program
{
/// <summary>
/// Demo entry point: loads a filter for a hard-coded document, prints the
/// extracted text and waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    var source = new FileInfo(@"C:\Path\To\Some.doc");
    using (var content = source.OpenRead())
    {
        // Load returns null when no usable filter could be set up; then nothing is printed.
        IFilter documentFilter = Load(content, source.Extension);
        if (documentFilter != null)
        {
            Console.WriteLine(GetText(documentFilter));
        }
    }

    Console.WriteLine("Press your favorite key to exit");
    Console.ReadKey();
}
/// <summary>
/// Creates the IFilter registered for <paramref name="extension"/>, feeds it the
/// content of <paramref name="stream"/> through IPersistStream and initializes it.
/// </summary>
/// <param name="stream">Document content; wrapped in a ManagedStream for the filter.</param>
/// <param name="extension">File extension used to look up the filter, e.g. ".doc".</param>
/// <returns>
/// The initialized filter, or null when no filter could be loaded, the filter does
/// not implement IPersistStream, or Init did not return S_OK.
/// </returns>
private static IFilter Load(Stream stream, string extension)
{
    IFilter filter = null;
    var initialized = false;
    try
    {
        if (NativeMethods.LoadIFilter(extension, null, ref filter) != HRESULT.S_OK)
        {
            return null;
        }

        // Not every filter supports loading from a stream.
        if (!(filter is IPersistStream persistStream))
        {
            return null;
        }

        persistStream.Load(new ManagedStream(stream));
        initialized = filter.Init(
            IFILTER_INIT.IFILTER_INIT_APPLY_INDEX_ATTRIBUTES, 0, IntPtr.Zero, out IFILTER_FLAGS _)
            == IFilterReturnCodes.S_OK;
        return initialized ? filter : null;
    }
    finally
    {
        // On every failure path (including an exception from IPersistStream.Load) the
        // caller never sees the filter, so release the COM object here instead of leaking it.
        if (!initialized && filter != null)
        {
            Marshal.ReleaseComObject(filter);
        }
    }
}
/// <summary>
/// Collects the text of every text chunk the filter produces.
/// </summary>
/// <param name="filter">Filter that has been loaded and initialized.</param>
/// <returns>
/// The text gathered until the filter reports the end of its chunks or an error;
/// errors are not thrown, the text collected so far is returned (best effort).
/// </returns>
private static string GetText(IFilter filter)
{
    var text = new StringBuilder();
    while (true)
    {
        var status = filter.GetChunk(out var chunk);
        if (status == IFilterReturnCodes.S_OK)
        {
            // Value-only chunks have no text to read.
            if ((chunk.flags & CHUNKSTATE.CHUNK_TEXT) != 0)
            {
                ReadChunk(filter, chunk, text);
            }
            continue;
        }

        // These two codes only mean that one embedded/linked chunk is unavailable;
        // the rest of the document can still be read.
        if (status == IFilterReturnCodes.FILTER_E_EMBEDDING_UNAVAILABLE
            || status == IFilterReturnCodes.FILTER_E_LINK_UNAVAILABLE)
        {
            continue;
        }

        // FILTER_E_END_OF_CHUNKS or any other code ends the extraction.
        break;
    }

    return text.ToString();
}
/// <summary>
/// Appends the text of the filter's current chunk to <paramref name="text"/>,
/// preceded by a single newline when the chunk starts a new paragraph.
/// </summary>
/// <param name="filter">Filter positioned on the chunk to read.</param>
/// <param name="chunk">Descriptor of that chunk; only breakType is inspected.</param>
/// <param name="text">Builder that receives the text.</param>
private static void ReadChunk(IFilter filter, STAT_CHUNK chunk, StringBuilder text)
{
    const uint BufferLength = 4096U;
    var buffer = new char[BufferLength]; // reused for every GetText call of this chunk

    // The break belongs to the chunk, not to each buffer: emit it once, before the
    // first text, so chunks longer than the buffer do not get newlines in the middle.
    var breakPending = chunk.breakType == CHUNK_BREAKTYPE.CHUNK_EOP;

    IFilterReturnCodes textResult;
    do
    {
        var charCount = BufferLength;
        textResult = filter.GetText(ref charCount, buffer);

        var deliveredText = textResult == IFilterReturnCodes.S_OK
            || textResult == IFilterReturnCodes.FILTER_S_LAST_TEXT;
        if (deliveredText && charCount > 0)
        {
            if (breakPending)
            {
                text.Append('\n');
                breakPending = false;
            }

            // Clamp defensively in case a filter reports more than the buffer holds.
            text.Append(buffer, 0, (int)Math.Min(charCount, BufferLength));
        }
    }
    while (textResult == IFilterReturnCodes.S_OK);
}
/// <summary>
/// Managed declaration of the COM IFilter interface (IID 89BCB740-6119-101A-BCB7-00DD010655AF).
/// Every method is [PreserveSig], so results come back as IFilterReturnCodes values
/// instead of exceptions. Do not reorder the methods: COM interop maps them to
/// vtable slots by declaration order.
/// </summary>
[Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IFilter
{
// Called by Load after the content has been handed to the filter.
[PreserveSig]
IFilterReturnCodes Init(IFILTER_INIT grfFlags, int cAttributes, IntPtr aAttributes,
out IFILTER_FLAGS pdwFlags);
// Advances to the next chunk and describes it in pStat.
[PreserveSig]
IFilterReturnCodes GetChunk(out STAT_CHUNK pStat);
// pcwcBuffer is passed by ref: ReadChunk sets it to the buffer length and reads it
// back as the number of characters written to awcBuffer.
[PreserveSig]
IFilterReturnCodes GetText(ref uint pcwcBuffer, [Out, MarshalAs(UnmanagedType.LPArray)]
char[] awcBuffer);
// Not used by the code in this file.
[PreserveSig]
IFilterReturnCodes GetValue(ref IntPtr propVal);
// Not used by the code in this file.
[PreserveSig]
IFilterReturnCodes BindRegion(ref FILTERREGION origPos, ref Guid riid, ref object ppunk);
}
/// <summary>
/// Managed declaration of the COM IPersist interface (IID 0000010C-0000-0000-C000-000000000046),
/// the base interface of IPersistStream.
/// </summary>
[Guid("0000010C-0000-0000-C000-000000000046")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IPersist
{
// No [PreserveSig]: a failing HRESULT surfaces as an exception.
void GetClassID(out Guid pClassID);
}
/// <summary>
/// Managed declaration of the COM IPersistStream interface (IID 00000109-0000-0000-C000-000000000046).
/// Load checks whether the filter implements it and uses Load(IStream) to supply
/// the document content. Keep the declaration order: it defines the vtable layout.
/// </summary>
[Guid("00000109-0000-0000-C000-000000000046")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IPersistStream : IPersist
{
// Re-declared with 'new' so the inherited method occupies the first slot of this
// interface's own method list (presumably required for the COM vtable layout).
new void GetClassID(out Guid pClassID);
// [PreserveSig]: the raw HRESULT is returned as an int.
[PreserveSig]
int IsDirty();
// Not [PreserveSig]: a failing HRESULT surfaces as an exception.
void Load([In] IStream pStm);
void Save([In] IStream pStm, [In, MarshalAs(UnmanagedType.Bool)] bool fClearDirty);
void GetSizeMax(out long pcbSize);
}
/// <summary>
/// Managed declaration of the COM IStream interface (IID 0000000C-0000-0000-C000-000000000046),
/// implemented by ManagedStream so a filter can read from a .NET Stream.
/// </summary>
/// <remarks>
/// Every method must be [PreserveSig]: each one returns its HRESULT directly. A method
/// without the attribute is exposed to COM with an extra hidden [out, retval] parameter,
/// which no longer matches the native signature. Do not reorder the methods either:
/// declaration order defines the vtable layout.
/// </remarks>
[Guid("0000000C-0000-0000-C000-000000000046")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IStream
{
    /// <summary>Reads up to <paramref name="cb"/> bytes into <paramref name="pv"/>; the count goes to <paramref name="pcbRead"/> when non-null.</summary>
    [PreserveSig]
    HRESULT Read([MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)] [Out]
        byte[] pv, int cb, IntPtr pcbRead);

    [PreserveSig]
    HRESULT Write([MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)]
        byte[] pv, int cb, IntPtr pcbWritten);

    /// <summary>Moves the position; <paramref name="dwOrigin"/> is a STREAM_SEEK value.</summary>
    [PreserveSig]
    HRESULT Seek(long dlibMove, int dwOrigin, IntPtr plibNewPosition);

    [PreserveSig]
    HRESULT SetSize(long libNewSize);

    // Was missing [PreserveSig], unlike every other method of the interface.
    [PreserveSig]
    HRESULT CopyTo(IStream pstm, long cb, IntPtr pcbRead, IntPtr pcbWritten);

    [PreserveSig]
    HRESULT Commit(int grfCommitFlags);

    [PreserveSig]
    HRESULT Revert();

    [PreserveSig]
    HRESULT LockRegion(long libOffset, long cb, int dwLockType);

    [PreserveSig]
    HRESULT UnlockRegion(long libOffset, long cb, int dwLockType);

    [PreserveSig]
    HRESULT Stat(out STATSTG pstatstg, int grfStatFlag);

    [PreserveSig]
    HRESULT Clone(out IStream ppstm);
}
/// <summary>
/// Read-only adapter that exposes a .NET <see cref="Stream"/> through the COM
/// IStream interface. Only Read, Seek and Stat are implemented; every other
/// method returns E_NOTIMPL.
/// </summary>
public class ManagedStream : IStream
{
    private readonly Stream _stream;

    /// <summary>Wraps <paramref name="stream"/>; the wrapper does not dispose it.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
    public ManagedStream(Stream stream)
    {
        _stream = stream ?? throw new ArgumentNullException(nameof(stream));
    }

    /// <summary>Not supported.</summary>
    public HRESULT Clone(out IStream ppstm)
    {
        ppstm = null;
        return HRESULT.E_NOTIMPL;
    }

    /// <summary>Not supported.</summary>
    public HRESULT Commit(int grfCommitFlags)
    {
        return HRESULT.E_NOTIMPL;
    }

    /// <summary>Not supported.</summary>
    public HRESULT CopyTo(IStream pstm, long cb, IntPtr pcbRead, IntPtr pcbWritten)
    {
        return HRESULT.E_NOTIMPL;
    }

    /// <summary>Not supported.</summary>
    public HRESULT LockRegion(long libOffset, long cb, int dwLockType)
    {
        return HRESULT.E_NOTIMPL;
    }

    /// <summary>
    /// Reads up to <paramref name="cb"/> bytes from the wrapped stream into <paramref name="pv"/>.
    /// </summary>
    /// <param name="pv">Destination buffer.</param>
    /// <param name="cb">Number of bytes requested.</param>
    /// <param name="pcbRead">Optional pointer that receives the number of bytes actually read.</param>
    /// <returns>S_OK, or E_INVALIDARG when the buffer arguments are inconsistent.</returns>
    public HRESULT Read(byte[] pv, int cb, IntPtr pcbRead)
    {
        if (pv == null || cb < 0 || cb > pv.Length)
        {
            if (pcbRead != IntPtr.Zero)
            {
                Marshal.WriteInt32(pcbRead, 0);
            }
            return HRESULT.E_INVALIDARG;
        }

        // Stream.Read may return fewer bytes than requested even before the end of the
        // stream. COM callers treat a short read as end of data (and may pass no pcbRead
        // at all), so keep reading until the request is satisfied or the stream ends.
        var totalRead = 0;
        while (totalRead < cb)
        {
            var read = _stream.Read(pv, totalRead, cb - totalRead);
            if (read == 0)
            {
                break;
            }
            totalRead += read;
        }

        if (pcbRead != IntPtr.Zero)
        {
            Marshal.WriteInt32(pcbRead, totalRead);
        }
        return HRESULT.S_OK;
    }

    /// <summary>Not supported.</summary>
    public HRESULT Revert()
    {
        return HRESULT.E_NOTIMPL;
    }

    /// <summary>
    /// Moves the position of the wrapped stream.
    /// </summary>
    /// <param name="dlibMove">Offset relative to <paramref name="dwOrigin"/>.</param>
    /// <param name="dwOrigin">One of the STREAM_SEEK values.</param>
    /// <param name="plibNewPosition">Optional pointer that receives the new absolute position.</param>
    /// <returns>S_OK, or E_FAIL for an unknown origin.</returns>
    public HRESULT Seek(long dlibMove, int dwOrigin, IntPtr plibNewPosition)
    {
        SeekOrigin seekOrigin;
        switch (dwOrigin)
        {
            case (int) STREAM_SEEK.STREAM_SEEK_SET:
                seekOrigin = SeekOrigin.Begin;
                break;
            case (int) STREAM_SEEK.STREAM_SEEK_CUR:
                seekOrigin = SeekOrigin.Current;
                break;
            case (int) STREAM_SEEK.STREAM_SEEK_END:
                seekOrigin = SeekOrigin.End;
                break;
            default:
                return HRESULT.E_FAIL;
        }

        var position = _stream.Seek(dlibMove, seekOrigin);
        if (plibNewPosition != IntPtr.Zero)
        {
            Marshal.WriteInt64(plibNewPosition, position);
        }
        return HRESULT.S_OK;
    }

    /// <summary>Not supported.</summary>
    public HRESULT SetSize(long libNewSize)
    {
        return HRESULT.E_NOTIMPL;
    }

    /// <summary>
    /// Reports type, size and access mode of the wrapped stream.
    /// <paramref name="grfStatFlag"/> is ignored and no name is returned.
    /// </summary>
    /// <returns>S_OK, or E_ACCESSDENIED when the stream can be neither read nor written.</returns>
    public HRESULT Stat(out STATSTG pstatstg, int grfStatFlag)
    {
        pstatstg = new STATSTG
        {
            type = (int) STGTY.STGTY_STREAM,
            cbSize = _stream.Length,
            grfMode = (int) STGM.STGM_READ
        };

        if (_stream.CanRead && _stream.CanWrite)
        {
            pstatstg.grfMode |= (int) STGM.STGM_READWRITE;
        }
        else if (_stream.CanRead)
        {
            pstatstg.grfMode |= (int) STGM.STGM_READ;
        }
        else if (_stream.CanWrite)
        {
            pstatstg.grfMode |= (int) STGM.STGM_WRITE;
        }
        else
        {
            return HRESULT.E_ACCESSDENIED;
        }

        return HRESULT.S_OK;
    }

    /// <summary>Not supported.</summary>
    public HRESULT UnlockRegion(long libOffset, long cb, int dwLockType)
    {
        return HRESULT.E_NOTIMPL;
    }

    /// <summary>Not supported: the adapter is read-only.</summary>
    public HRESULT Write(byte[] pv, int cb, IntPtr pcbWritten)
    {
        return HRESULT.E_NOTIMPL;
    }
}
/// <summary>
/// P/Invoke declaration for the native entry point used by this sample.
/// </summary>
public class NativeMethods
{
// Binding for LoadIFilter in query.dll. Load calls it with a file extension as
// pwcsPath and null for pUnkOuter; the filter comes back in ppIUnk.
[DllImport("query.dll", SetLastError = true, CharSet = CharSet.Unicode)]
public static extern HRESULT LoadIFilter(string pwcsPath, [MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter, ref IFilter ppIUnk);
}
/// <summary>
/// 64-bit timestamp split into two 32-bit halves; used for the mtime, ctime and
/// atime fields of STATSTG. ManagedStream.Stat leaves them at zero.
/// </summary>
public struct FILETIME
{
// Low-order 32 bits.
public uint DateTimeLow;
// High-order 32 bits.
public uint DateTimeHigh;
}
/// <summary>
/// Region descriptor passed to IFilter.BindRegion.
/// </summary>
/// <remarks>
/// The native FILTERREGION consists of three 32-bit ULONGs (12 bytes). C# <c>ulong</c>
/// is 64 bits wide, which would double the struct size and misplace every field,
/// so the fields are declared as <c>uint</c>.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
public struct FILTERREGION
{
    /// <summary>Chunk the region refers to.</summary>
    public uint idChunk;
    /// <summary>Start of the region within the chunk.</summary>
    public uint cwcStart;
    /// <summary>Extent of the region.</summary>
    public uint cwcExtent;
}
/// <summary>
/// Property identifier embedded in STAT_CHUNK.attribute: a property-set GUID
/// plus the property specifier within that set.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct FULLPROPSPEC
{
// GUID of the property set.
public Guid guidPropSet;
// Property within the set (see PROPSPEC).
public PROPSPEC psProperty;
}
/// <summary>
/// Managed mirror of the native PROPSPEC: a 32-bit kind followed by a union of
/// a 32-bit property id and a string pointer.
/// </summary>
/// <remarks>
/// Two layout problems are avoided here. The kind is stored as a 32-bit value
/// regardless of the enum's underlying type, so it cannot overlap the union. The
/// union is a pointer-sized field in a sequential layout, so it lands at offset 4
/// in a 32-bit process and offset 8 in a 64-bit one, as pointer alignment requires;
/// a fixed FieldOffset(4) is wrong on 64-bit and shifts the fields that follow
/// inside STAT_CHUNK.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
public struct PROPSPEC
{
    // 32-bit discriminator as laid out in native memory.
    private uint _kind;

    // Pointer-sized storage for the union.
    private IntPtr _value;

    /// <summary>Tells which union member is valid.</summary>
    public PROPSPECKIND ulKind
    {
        get => (PROPSPECKIND)_kind;
        set => _kind = (uint)value;
    }

    /// <summary>Union member: numeric property id (low 32 bits of the slot).</summary>
    public uint propid
    {
        get => unchecked((uint)_value.ToInt64());
        // Go through int so the conversion cannot overflow a 32-bit IntPtr.
        set => _value = new IntPtr(unchecked((int)value));
    }

    /// <summary>Union member: pointer to the property name string.</summary>
    public IntPtr lpwstr
    {
        get => _value;
        set => _value = value;
    }
}
/// <summary>
/// Chunk descriptor filled in by IFilter.GetChunk. Field order and sizes matter
/// because the struct is marshalled to and from native code.
/// </summary>
public struct STAT_CHUNK
{
// Chunk identifier reported by the filter.
public int idChunk;
// Kind of break preceding this chunk; ReadChunk inserts a newline for CHUNK_EOP.
[MarshalAs(UnmanagedType.U4)]
public CHUNK_BREAKTYPE breakType;
// Whether the chunk carries text and/or a value (see CHUNKSTATE).
[MarshalAs(UnmanagedType.U4)]
public CHUNKSTATE flags;
// Presumably the locale (LCID) of the chunk text - not read by the code in this file.
public int locale;
// Property the chunk belongs to.
public FULLPROPSPEC attribute;
// The remaining three fields are not read by the code in this file.
public int idChunkSource;
public int cwcStartSource;
public int cwcLenSource;
}
/// <summary>
/// Stream/storage statistics returned through IStream.Stat.
/// </summary>
/// <remarks>
/// The native pwcsName is a wide-character (LPOLESTR) string. LPTStr follows the
/// struct's character set, which defaults to ANSI, so the name is marshalled
/// explicitly as LPWStr. ManagedStream.Stat leaves the name null.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
public struct STATSTG
{
    /// <summary>Name of the object, or null.</summary>
    [MarshalAs(UnmanagedType.LPWStr)]
    public string pwcsName;
    /// <summary>One of the STGTY values.</summary>
    public int type;
    /// <summary>Size of the stream in bytes.</summary>
    public long cbSize;
    public FILETIME mtime;
    public FILETIME ctime;
    public FILETIME atime;
    /// <summary>Access mode as a combination of STGM values.</summary>
    public int grfMode;
    public int grfLocksSupported;
    public Guid clsid;
    public int grfStateBits;
    public int reserved;
}
/// <summary>
/// HRESULT values returned by the IFilter methods. Values with the high bit set
/// (0x8xxxxxxx) are failures; FILTER_S_* and FILTER_W_* are success codes that
/// carry extra information.
/// </summary>
/// <remarks>
/// Not a [Flags] enum: the members are distinct status codes, not combinable bits,
/// and [Flags] makes ToString() render unknown codes as meaningless combinations.
/// </remarks>
public enum IFilterReturnCodes : uint
{
    S_OK = 0,
    E_ACCESSDENIED = 0x80070005,
    E_HANDLE = 0x80070006,
    E_INVALIDARG = 0x80070057,
    E_OUTOFMEMORY = 0x8007000E,
    E_NOTIMPL = 0x80004001,
    // Win32 E_FAIL is 0x80004005 (as in the HRESULT enum of this file);
    // 0x80000008 is the legacy 16-bit value.
    E_FAIL = 0x80004005,
    FILTER_E_PASSWORD = 0x8004170B,
    FILTER_E_UNKNOWNFORMAT = 0x8004170C,
    FILTER_E_NO_TEXT = 0x80041705,
    FILTER_E_NO_VALUES = 0x80041706,
    FILTER_E_END_OF_CHUNKS = 0x80041700,
    FILTER_E_NO_MORE_TEXT = 0x80041701,
    FILTER_E_NO_MORE_VALUES = 0x80041702,
    FILTER_E_ACCESS = 0x80041703,
    FILTER_W_MONIKER_CLIPPED = 0x00041704,
    FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
    FILTER_E_LINK_UNAVAILABLE = 0x80041708,
    FILTER_S_LAST_TEXT = 0x00041709,
    FILTER_S_LAST_VALUES = 0x0004170A
}
/// <summary>
/// Break type reported in STAT_CHUNK.breakType. The values are consecutive
/// (0..4), not bits, so the enum is not [Flags]; CHUNK_EOP (3) in particular is
/// not "CHUNK_EOW | CHUNK_EOS".
/// </summary>
public enum CHUNK_BREAKTYPE : uint
{
    /// <summary>No break.</summary>
    CHUNK_NO_BREAK = 0,
    CHUNK_EOW = 1,
    CHUNK_EOS = 2,
    /// <summary>ReadChunk inserts a newline for this value.</summary>
    CHUNK_EOP = 3,
    CHUNK_EOC = 4
}
/// <summary>
/// Content kind reported in STAT_CHUNK.flags; each member is a single bit.
/// </summary>
[Flags]
public enum CHUNKSTATE : uint
{
    /// <summary>The chunk carries text.</summary>
    CHUNK_TEXT = 1 << 0,
    /// <summary>The chunk carries a property value.</summary>
    CHUNK_VALUE = 1 << 1,
    CHUNK_FILTER_OWNED_VALUE = 1 << 2,
}
/// <summary>
/// Common COM status codes used by LoadIFilter and the ManagedStream implementation.
/// </summary>
/// <remarks>
/// Not a [Flags] enum: the members are distinct status codes, not combinable bits.
/// S_FALSE is included because it is the standard "succeeded, but negative/partial
/// result" code.
/// </remarks>
public enum HRESULT : uint
{
    S_OK = 0x00000000,
    S_FALSE = 0x00000001,
    E_NOTIMPL = 0x80004001,
    E_NOINTERFACE = 0x80004002,
    E_POINTER = 0x80004003,
    E_ABORT = 0x80004004,
    E_FAIL = 0x80004005,
    E_UNEXPECTED = 0x8000FFFF,
    E_ACCESSDENIED = 0x80070005,
    E_HANDLE = 0x80070006,
    E_OUTOFMEMORY = 0x8007000E,
    E_INVALIDARG = 0x80070057
}
/// <summary>
/// Flags written to the pdwFlags output of IFilter.Init.
/// </summary>
[Flags]
public enum IFILTER_FLAGS
{
    IFILTER_FLAGS_OLE_PROPERTIES = 0x1,
}
/// <summary>
/// Bit flags passed as grfFlags to IFilter.Init. Members are listed in
/// ascending bit order.
/// </summary>
[Flags]
public enum IFILTER_INIT
{
    IFILTER_INIT_CANON_PARAGRAPHS = 0x0001,
    IFILTER_INIT_HARD_LINE_BREAKS = 0x0002,
    IFILTER_INIT_CANON_HYPHENS = 0x0004,
    IFILTER_INIT_CANON_SPACES = 0x0008,
    IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 0x0010,
    IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 0x0020,
    IFILTER_INIT_INDEXING_ONLY = 0x0040,
    IFILTER_INIT_SEARCH_LINKS = 0x0080,
    IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 0x0100,
    IFILTER_INIT_FILTER_OWNED_VALUE_OK = 0x0200,
    IFILTER_INIT_FILTER_AGGRESSIVE_BREAK = 0x0400,
    IFILTER_INIT_DISABLED_EMBEDDED = 0x0800,
    IFILTER_INIT_EMIT_FORMATTING = 0x1000,
}
/// <summary>
/// Discriminator stored in PROPSPEC.ulKind.
/// </summary>
/// <remarks>
/// The native field is a 32-bit ULONG, so the underlying type is <c>uint</c>; with
/// <c>ulong</c> the 8-byte value would overlap the union that follows it inside
/// PROPSPEC. The two values are alternatives, not bits, hence no [Flags].
/// </remarks>
public enum PROPSPECKIND : uint
{
    /// <summary>The property is identified by a string (lpwstr).</summary>
    PRSPEC_LPWSTR = 0,
    /// <summary>The property is identified by a numeric id (propid).</summary>
    PRSPEC_PROPID = 1
}
/// <summary>
/// Storage access-mode constants; ManagedStream.Stat casts them to int to fill
/// STATSTG.grfMode. Members are grouped by purpose. Several members share the
/// value 0, so they cannot be told apart once stored.
/// </summary>
[Flags]
public enum STGM : ulong
{
    // Access
    STGM_READ = 0x00000000,
    STGM_WRITE = 0x00000001,
    STGM_READWRITE = 0x00000002,

    // Sharing
    STGM_SHARE_EXCLUSIVE = 0x00000010,
    STGM_SHARE_DENY_WRITE = 0x00000020,
    STGM_SHARE_DENY_READ = 0x00000030,
    STGM_SHARE_DENY_NONE = 0x00000040,
    STGM_PRIORITY = 0x00040000,

    // Creation
    STGM_FAILIFTHERE = 0x00000000,
    STGM_CREATE = 0x00001000,
    STGM_CONVERT = 0x00020000,

    // Transactioning
    STGM_DIRECT = 0x00000000,
    STGM_TRANSACTED = 0x00010000,

    // Performance and miscellaneous
    STGM_NOSCRATCH = 0x00100000,
    STGM_NOSNAPSHOT = 0x00200000,
    STGM_DIRECT_SWMR = 0x00400000,
    STGM_DELETEONRELEASE = 0x04000000,
    STGM_SIMPLE = 0x08000000,
}
/// <summary>
/// Storage element type stored in STATSTG.type. The values are consecutive
/// (1..4), not bits, so the enum is not [Flags]; STGTY_LOCKBYTES (3) is not
/// "STGTY_STORAGE | STGTY_STREAM".
/// </summary>
public enum STGTY : int
{
    STGTY_STORAGE = 1,
    /// <summary>Value reported by ManagedStream.Stat.</summary>
    STGTY_STREAM = 2,
    STGTY_LOCKBYTES = 3,
    STGTY_PROPERTY = 4
}
/// <summary>
/// Origin for IStream.Seek; ManagedStream.Seek maps the members to
/// SeekOrigin.Begin, Current and End. The values are alternatives (0, 1, 2),
/// not bits, so the enum is not [Flags].
/// </summary>
public enum STREAM_SEEK : int
{
    /// <summary>Relative to the start of the stream.</summary>
    STREAM_SEEK_SET = 0,
    /// <summary>Relative to the current position.</summary>
    STREAM_SEEK_CUR = 1,
    /// <summary>Relative to the end of the stream.</summary>
    STREAM_SEEK_END = 2
}
}
}