递归枚举IHTMLDocument2的所有元素
http://blog.csdn.net/fishmai/article/details/52388843
void EnumHTMLDocument( MSHTML::IHTMLDocument2* pDoc )
{
if( pDoc == NULL )return;
//遍历搜索子框架,递归处理子框架的文档
CComPtr<MSHTML::IHTMLFramesCollection2> spFramesCollection;
pDoc->get_frames( &spFramesCollection );
long lCount = 0;
HRESULT hr = spFramesCollection->get_length( &lCount );
if( FAILED( hr ) )return;
for ( long lIndex = 0; lIndex < lCount; lIndex++ )
{
CComVariant vDispWin;
vDispWin = spFramesCollection->item( &CComVariant( lIndex ) );
CComQIPtr<MSHTML::IHTMLWindow2> spWin = vDispWin.pdispVal;
if( spWin == NULL )continue;
CComPtr<MSHTML::IHTMLDocument2> spSubDoc;
spWin->get_document( &spSubDoc );
EnumHTMLDocument( spSubDoc );
}
CComQIPtr<MSHTML::IHTMLElementCollection> spElementCollection;
hr = pDoc->get_forms( &spElementCollection );
if( FAILED( hr ) )return;
long lFormCount = 0;
hr = spElementCollection->get_length( &lFormCount );
if( FAILED( hr ) )return;
for ( long lIndex = 0; lIndex < lFormCount; lIndex++ )
{
CComQIPtr<MSHTML::IHTMLFormElement> spFormElement = spElementCollection->item( &CComVariant( lIndex ) );
if( spFormElement == NULL )continue;
long lElemCount = 0;
hr = spFormElement->get_length( &lElemCount );
if( FAILED( hr ) )continue;
for ( long lElemIndex = 0; lElemIndex < lElemCount; lElemIndex++ )
{
CComDispatchDriver spInputElement;
spInputElement = spFormElement->item( &CComVariant( lElemIndex ) );
if( spInputElement == NULL )continue;
CComVariant varName, varValue, varType;
hr = spInputElement.GetPropertyByName( L"name", &varName );
if( SUCCEEDED( hr ) )
{
LPCTSTR lpszName = varName.bstrVal ? COLE2CT( varName.bstrVal ) : _T("NULL");
AtlMessageBox( NULL, lpszName );
}
hr = spInputElement.GetPropertyByName( L"value", &varValue );
if( SUCCEEDED( hr ) )
{
LPCTSTR lpszValue = varValue.bstrVal ? COLE2CT( varValue.bstrVal ) : _T("NULL");
AtlMessageBox( NULL, lpszValue );
}
hr = spInputElement.GetPropertyByName( L"type", &varType );
if( SUCCEEDED( hr ) )
{
LPCTSTR lpszType = varType.bstrVal ? COLE2CT( varType.bstrVal ) : _T("NULL");
AtlMessageBox( NULL, lpszType );
}
}
}
}
解析html程序(C#版)——遍历各个节点(mshtml)
http://blog.csdn.net/hanjieson/article/details/8576150
/*
在项目里引用了mshtml.dll,并且引用命名空间:using mshtml;
首先,参数html就是html文本内容(里面有markup标记和显示文本等等)。
其次,getHtmlDisplayContent这个函数用于获取html里浏览器上可看到的内容,即从源码中取出显示文本。
最后,traverseNodes是个人写的一个遍历各个节点的小递归程序,没考虑效率,只是想了解怎么使用IHTMLDocument2和IHTMLDocument3接口。
Note:当html文档不规范时,比如在<!DOCTYPE ...>之前还有别的标记或者符号,加载工作会受到严重影响,此时很可能解析不出来。我一开始不知道为什么解析某些html时会卡住,后来发现是因为这些html文档在html标记前有\n\n\n等多余字符。
*/
/// <summary>
/// Loads <paramref name="html"/> into an in-memory MSHTML document and returns
/// the text gathered by <c>traverseNodes</c>, starting at the document's root
/// element.
/// </summary>
/// <param name="html">HTML source text that is written into the document.</param>
/// <returns>The accumulated text; an empty string if nothing was collected.</returns>
private static string getHtmlDisplayContent(string html)
{
    // Create the document object and feed it the markup.
    mshtml.HTMLDocumentClass documentObject = new mshtml.HTMLDocumentClass();
    mshtml.IHTMLDocument2 writer = documentObject;
    writer.write(html);

    // IHTMLDocument3 is the interface that exposes the root element.
    mshtml.IHTMLDocument3 rootAccess = (mshtml.IHTMLDocument3)writer;
    string collected = string.Empty;
    traverseNodes(rootAccess.documentElement, ref collected);

    // Alternative kept from the original author (title plus body text):
    //mshtml.IHTMLTitleElement title = (mshtml.IHTMLTitleElement)doc2.title;
    /* cont += doc2.title.ToString();
    mshtml.IHTMLBodyElement body = (mshtml.IHTMLBodyElement)doc2.body;
    if (body.text!=null)
    cont += body.text.ToString();
    * */

    writer.close();
    return collected;
}
/// <summary>
/// Appends the <c>innerText</c> of <paramref name="parentNode"/> to
/// <paramref name="cont"/>, then does the same recursively for each of its
/// child elements.
/// </summary>
/// <param name="parentNode">Element to start from; a null element is ignored.</param>
/// <param name="cont">Accumulator that receives the collected text.</param>
/// <remarks>
/// NOTE(review): an element's innerText normally already includes the text of
/// its descendants, so nested text is appended once per ancestor level.
/// Confirm whether that repetition is intended before relying on the result.
/// </remarks>
private static void traverseNodes(mshtml.IHTMLElement parentNode,ref string cont)
{
    // documentElement (or a child entry) may be missing; nothing to collect then.
    if (parentNode == null)
        return;

    // Read the property once instead of making two COM calls.
    string text = parentNode.innerText;
    if (text != null)
        cont += text;

    // Guard against an element that exposes no children collection.
    mshtml.IHTMLElementCollection nodes = parentNode.children as mshtml.IHTMLElementCollection;
    if (nodes == null)
        return;

    foreach (object child in nodes)
    {
        // Entries that are not elements become null and are skipped by the guard above.
        traverseNodes(child as mshtml.IHTMLElement, ref cont);
    }
}