我有一个IE BHO插件,可以通过COM调用发送窗口中加载的页面的HTML。
// Note all error handling removed for readability :)
STDMETHODIMP CPlugin::get_HTML(long lMaxSize, BSTR *pbstrHTML)
{
CComPtr<IDispatch> pDispatch;
MSHTML::IHTMLDocument2Ptr pDocument2 = NULL;
MSHTML::IHTMLDocument3Ptr pDocument3 = NULL;
hr = m_spWebBrowser->get_Document(&pDispatch);
hr = pDispatch->QueryInterface(IID_IHTMLDocument3, (void**)&pDocument3);
MSHTML::IHTMLElementPtr pRoot = pDocument3->documentElement;
wstring strHTML = pRoot->outerHTML;
CComBSTR bstrHTML = strOutput.c_str();
bstrHTML.CopyTo(pbstrHTML);
}
但是当遇到非常大的页面(例如“http://sitemap.zillow.com/uncompressed/ForSale_Hood_MedPri_1.xml”)时,从DOM创建HTML需要3分钟。
有没有办法访问原始HTML / XML?
当您在IE中执行“查看页面源”时,它几乎立即弹出,因此内部IE必须使用一些可以做我想要的API。
谢谢,
沙恩。
答案 0 :(得分:2)
似乎在旧版本的MSHTML中,outerHTML具有O(n ^ 2)性能。但是,在较新的版本(IE8)中,这个问题已经消失。如果您有选择,请使用IE8或更高版本。
否则,使用IPersistStream :: Save是一个选项。但CreateStreamOnHGlobal不会帮助你,因为它的实现也是O(n ^ 2)。您必须使用自定义IStream。
包含一个IStream实现,它是为此目的而实现的,并支持快速写入:
#include <atlbase.h>
#include <atlcom.h>
#include <vector>
// an implementation of a write-only IStream.
// needed because the CreateStreamOnHGlobal implementation doesn't handle
// resizes well (N writes seem to take O(N^2) time)
class MyStream :
public CComObjectRootEx<CComSingleThreadModel>,
public CComCoClass<MyStream>,
public IStreamImpl
{
public:
std::vector<char> buf;
BEGIN_COM_MAP(MyStream)
COM_INTERFACE_ENTRY(IStream)
END_COM_MAP()
STDMETHOD(Write) (const void * pv, ULONG cb, ULONG *pcbWritten);
};
/*
Usage:
CComPtr<IStream> stream;
hr = MyStream::CreateInstance(&stream);
// streamObj will be valid as long as IStream smart pointer lives
MyStream *streamObj = (MyStream*)stream.p;
*/
STDMETHODIMP MyStream::Write(const void * pv, ULONG cb, ULONG *pcbWritten)
{
buf.insert(buf.end(), (char*)pv, (char*)pv+cb);
return S_OK;
}
答案 1 :(得分:1)
是的,您可以为IPersistStream进行QI并保存到CreateStreamOnHGlobal创建的内存流中 请注意,文档必须完成下载(准备状态需要完成)。
答案 2 :(得分:0)
感谢Amnon,以下代码主要是为我工作。
// an implementation of a write-only IStream.
// needed because the CreateStreamOnHGlobal implementation doesn't handle
// resizes well (N writes seem to take O(N^2) time)
class MyStream :
public CComObjectRootEx<CComSingleThreadModel>,
public CComCoClass<MyStream>,
public IStream
{
public:
std::vector<char> buf;
BEGIN_COM_MAP(MyStream)
COM_INTERFACE_ENTRY(IStream)
END_COM_MAP()
STDMETHOD(Write) (const void * pv, ULONG cb, ULONG *pcbWritten);
// Implement IStream abstract functions
STDMETHOD(Read) (void *pv, ULONG cb, ULONG *pcbRead) { return S_OK; };
STDMETHOD(Seek) (LARGE_INTEGER dlibMove,DWORD dwOrigin,ULARGE_INTEGER *plibNewPosition) { return S_OK; };
STDMETHOD(SetSize) (ULARGE_INTEGER libNewSize) { return S_OK; };
STDMETHOD(CopyTo) (IStream *pstm,ULARGE_INTEGER cb,ULARGE_INTEGER *pcbRead,ULARGE_INTEGER *pcbWritten) { return S_OK; };
STDMETHOD(Commit) (DWORD grfCommitFlags) { return S_OK; };
STDMETHOD(Revert) () { return S_OK; };
STDMETHOD(LockRegion) (ULARGE_INTEGER libOffset,ULARGE_INTEGER cb,DWORD dwLockType) { return S_OK; };
STDMETHOD(UnlockRegion) (ULARGE_INTEGER libOffset,ULARGE_INTEGER cb,DWORD dwLockType) { return S_OK; };
STDMETHOD(Stat) (__RPC__out STATSTG *pstatstg,DWORD grfStatFlag) { return S_OK; };
STDMETHOD(Clone) (__RPC__deref_out_opt IStream **ppstm) { return S_OK; };
};
STDMETHODIMP MyStream::Write(const void * pv, ULONG cb, ULONG *pcbWritten)
{
buf.insert(buf.end(), (char*)pv, (char*)pv+cb);
return S_OK;
}
// Retrieves the HTML of the current page
STDMETHODIMP CPlugin::get_HTML(long lMaxSize, BSTR *pbstrHTML)
{
HRESULT hr = S_OK;
try
{
CComPtr<IDispatch> pDispatch;
MSHTML::IHTMLDocumentPtr pDocument = NULL;
CComPtr<IStream> mystream;
hr = MyStream::CreateInstance(&mystream);
// streamObj will be valid as long as IStream smart pointer lives
MyStream *streamObj = (MyStream*)mystream.p;
hr = m_spWebBrowser->get_Document(&pDispatch);
hr = pDispatch->QueryInterface(IID_IHTMLDocument, (void**)&pDocument);
IPersistStreamInitPtr persistStream = pDocument;
hr = CreateStreamOnHGlobal(NULL, TRUE, &stream);
hr = persistStream->Save(mystream, FALSE);
}
catch(...)
{
TRACE_FN("Got exception somewhere");
}
return hr;
}
现在唯一的问题是如何解释为什么有些它会在大多数情况下返回单字节字符,而在其他时候则返回双字节字符。有什么想法吗?
感谢您的帮助。