C#Xpath解析HtmlDocument的使用方法与递归取得页面所有标签xpath值(附源码)
在学习HTML Xpath之前呢我们先来下载一下Dll文件
下载地址:http://htmlagilitypack.codeplex.com/
大家下载单击如下图片下载就行了
<ignore_js_op>
接下来就是在程序中引用一下,
<ignore_js_op>
然后就可以直接调用了,大家看看代码吧
-
// HtmlDocument is the HtmlAgilityPack entry point for working with an HTML document.
HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument();
// Parse the HTML text held in strhtml.
hd.LoadHtml(strhtml);
// SelectSingleNode returns null when nothing matches the XPath, so check the
// result before reading OuterHtml instead of risking a NullReferenceException.
HtmlAgilityPack.HtmlNode node = hd.DocumentNode.SelectSingleNode("//*[@id='e_font']");
string str = node != null ? node.OuterHtml : string.Empty;
这样就可以得到一个标签的Html代码了
OuterHtml是取包含标签本身的Html;如果是InnerHtml,取的就是包含在这个标签之内的所有Html代码。
这点大家要注意了
如果大家想获取Html代码的Xpath路径就是这部分
- //*[@id='e_font']
这个其实很简单,只要大家安装一个Firebug就行了,
看下图片
<ignore_js_op>
大家只要进入选择模式,然后选择你要的内容,然后右键复制一下就行了。
然后放在SelectSingleNode()方法里就OK了
下面我说说几个方法和属性的意思吧。
方法
SelectNodes 获取的是一个集合
SelectSingleNode 获取一个标签
SetAttributeValue 设置标签的属性值例如:SetAttributeValue("name","xpath-89");这说明把name属性的值修改为xpath-89
属性
OuterHtml 是取包含本身的Html
InnerHtml 取的包含在这个标签之内的所有Html代码了
XPath 获取相对应的Xpath值
Attributes 获取一个属性的值例如:Attributes["name"].Value
也可以进行添加属性例如:
-
// Look up the node by its XPath first, then add an attribute named "xpathid" to it.
HtmlAgilityPack.HtmlNode target = hd.DocumentNode.SelectSingleNode(item.Key);
target.Attributes.Add("xpathid", "xpath_1");
下面我写了一个递归获取Html页面所有Xpath值的方法大家看一下吧
-
// Collected entries: Key is a node's XPath; Value is meant for the whole node
// (the code shown in this article always stores an empty string there).
public List<ObjXpath> XpathList = new List<ObjXpath>();
// The HTML text to analyse. For one way to download it, see the author's
// HttpHelper class: http://www.sufeinet.com/thread-3-1-1.html
public string strhtml = string.Empty;
// Running counter used to give every collected entry a sequential id.
private int Index;
// Entry point: parses strhtml and rebuilds XpathList from scratch.
private void SartNode()
{
    // A fresh HtmlDocument is created for every run and loaded with the HTML text.
    HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
    document.LoadHtml(strhtml);

    // Discard the results of any previous run.
    Index = 0;
    XpathList.Clear();

    // Setxpath records the children (and deeper descendants) of the node it is
    // given, so it is invoked once per top-level node.
    // NOTE: the top-level nodes themselves are not added to XpathList.
    HtmlNodeCollection topLevelNodes = document.DocumentNode.ChildNodes;
    for (int i = 0; i < topLevelNodes.Count; i++)
    {
        Setxpath(topLevelNodes[i]);
    }
}
/// <summary>
/// Recursively walks the DOM below <paramref name="node"/> and appends one
/// <see cref="ObjXpath"/> entry to XpathList for every descendant whose XPath
/// does not contain '#'. Entries receive consecutive ids taken from Index.
/// </summary>
/// <param name="node">The node whose children are processed.</param>
private void Setxpath(HtmlNode node)
{
    foreach (HtmlNode child in node.ChildNodes)
    {
        string path = child.XPath;

        // Nodes whose XPath contains '#' are skipped together with their subtree
        // (presumably HtmlAgilityPack's #text / #comment pseudo-nodes - confirm).
        if (!path.Contains("#"))
        {
            XpathList.Add(new ObjXpath() { id = Index.ToString(), Key = path, Value = "" });
            Index++;

            // Recursing into a node without children is a no-op, so no
            // separate ChildNodes.Count check is needed.
            Setxpath(child);
        }
    }
}
/// <summary>
/// One collected entry: an id, the XPath of a node and an associated value.
/// </summary>
public class ObjXpath
{
    /// <summary>Identifier of the entry, stored as a string.</summary>
    public string id { get; set; }

    /// <summary>XPath of the node this entry refers to.</summary>
    public string Key { get; set; }

    /// <summary>Value attached to the entry.</summary>
    public string Value { get; set; }
}
XpathList 就是获取的所有Xpath值了,大家有兴趣的话可以试试
我们先来看看效果吧
<ignore_js_op>
好了下面放出所有代码给大家
-
using System; -
using System.Collections.Generic; -
using System.ComponentModel; -
using System.Data; -
using System.Drawing; -
using System.Linq; -
using System.Text; -
using System.Windows.Forms; -
using System.Text.RegularExpressions; -
using System.Threading; -
using HtmlAgilityPack; -
using System.IO; -
using System.Runtime.Serialization.Json; -

-
namespace AutoXpathTools -
{ -
/// <summary>
/// Main window of the tool. It fetches the HTML for a URL through HttpHelper,
/// lists the XPath of the nodes found in it, and shows the outer HTML of the
/// node addressed by the XPath the user picks.
/// </summary>
public partial class Form1 : Form
{
    public Form1()
    {
        InitializeComponent();
    }

    #region Private fields and delegates

    // Delegate used to hand a string over to the UI thread (see UIContorol).
    private delegate void SetListBox(string str);

    // Collected entries: Key is a node's XPath; Value is currently always empty.
    private List<ObjXpath> XpathList = new List<ObjXpath>();
    // Running counter used to give every collected entry a sequential id.
    private int Index = 0;
    // Document loaded on the UI thread in btnGetXpath_Click and queried by button3_Click.
    private HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument();

    #endregion

    /// <summary>
    /// Fetches the HTML for the URL typed into textBox1, shows it in txtXml and
    /// starts a worker thread that lists every XPath found in it.
    /// </summary>
    private void btnGetXpath_Click(object sender, EventArgs e)
    {
        try
        {
            HttpHelper http = new HttpHelper();
            HttpItem item = new HttpItem() { URL = textBox1.Text.Trim(), IsToLower = false, Encoding = "gbk" };
            txtXml.Text = http.GetHtml(item);

            // Read the text back once, here on the UI thread. The worker thread
            // started below must not touch txtXml itself.
            string html = txtXml.Text;
            if (!string.IsNullOrWhiteSpace(html)
                && !string.Equals(html.Trim(), "error", StringComparison.OrdinalIgnoreCase))
            {
                // Load the document that button3_Click queries later.
                hd.LoadHtml(html);

                // SartNode clears XpathList; clear the list box too so that the
                // entries of a previous run do not pile up.
                listBox1.Items.Clear();

                Thread pingTask = new Thread(new ThreadStart(delegate
                {
                    // Work executed on the worker thread.
                    SartNode(html);
                }));
                // A background thread does not keep the process alive once the form is closed.
                pingTask.IsBackground = true;
                pingTask.Start();
            }
            else
            {
                txtXml.Text = "根据您的的ULR:" + textBox1.Text.Trim() + "无法得到任何内容";
            }
        }
        catch (Exception ex)
        {
            txtXml.Text = ex.Message.Trim();
        }
    }

    /// <summary>
    /// Parses <paramref name="strhtml"/> and rebuilds XpathList from scratch.
    /// Called on the worker thread started by btnGetXpath_Click.
    /// </summary>
    /// <param name="strhtml">The HTML text to analyse.</param>
    private void SartNode(string strhtml)
    {
        // A separate HtmlDocument instance is used here, so the hd field stays
        // with the UI thread.
        HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
        document.LoadHtml(strhtml);
        HtmlNodeCollection htmllist = document.DocumentNode.ChildNodes;

        // Discard the results of any previous run.
        Index = 0;
        XpathList.Clear();

        // Setxpath records the descendants of the node it is given, so it is
        // invoked once per top-level node (the top-level nodes themselves are not recorded).
        foreach (HtmlNode em in htmllist)
        {
            Setxpath(em);
        }
    }

    /// <summary>
    /// Recursively walks the DOM below <paramref name="node"/>, adding one entry
    /// to XpathList and one line to the list box for every descendant whose
    /// XPath does not contain '#'.
    /// </summary>
    /// <param name="node">The node whose children are processed.</param>
    private void Setxpath(HtmlNode node)
    {
        foreach (HtmlNode item in node.ChildNodes)
        {
            // Read the XPath once instead of re-reading the property for every use.
            string xpath = item.XPath;

            // Nodes whose XPath contains '#' are skipped together with their subtree
            // (presumably HtmlAgilityPack's #text / #comment pseudo-nodes - confirm).
            if (xpath.Contains("#"))
            {
                continue;
            }

            XpathList.Add(new ObjXpath() { id = Index.ToString(), Key = xpath, Value = "" });
            UIContorol(xpath);
            Index++;

            // Recursing into a node without children is a no-op, so the same
            // path serves both leaf and non-leaf nodes.
            Setxpath(item);
        }
    }

    /// <summary>
    /// Appends <paramref name="str"/> to the list box and shows it in the status
    /// bar. Safe to call from the worker thread: the update is marshalled to the
    /// UI thread through the SetListBox delegate.
    /// </summary>
    /// <param name="str">The text to display.</param>
    private void UIContorol(string str)
    {
        // The form may already be closed while the worker is still walking the DOM.
        if (IsDisposed || !IsHandleCreated)
        {
            return;
        }

        if (InvokeRequired)
        {
            // Re-enter this method on the thread that owns the controls.
            Invoke(new SetListBox(UIContorol), new object[] { str });
            return;
        }

        listBox1.Items.Add(str);
        toolStripStatusLabel1.Text = str;
    }

    // Copies the XPath selected in the list box into txtPath.
    private void listBox1_SelectedValueChanged(object sender, EventArgs e)
    {
        if (listBox1.SelectedItem != null)
        {
            txtPath.Text = listBox1.SelectedItem.ToString().Trim();
        }
    }

    /// <summary>
    /// Shows the outer HTML of the node addressed by the XPath in txtPath.
    /// Shows nothing when the path is empty or matches no node, and shows the
    /// error message when the XPath expression is invalid.
    /// </summary>
    private void button3_Click(object sender, EventArgs e)
    {
        string xpath = txtPath.Text.Trim();
        if (xpath.Length == 0)
        {
            txtContents.Text = string.Empty;
            return;
        }

        try
        {
            // SelectSingleNode returns null when no node matches the expression.
            HtmlNode match = hd.DocumentNode.SelectSingleNode(xpath);
            txtContents.Text = match != null ? match.OuterHtml : string.Empty;
        }
        catch (System.Xml.XPath.XPathException ex)
        {
            // The user typed something that is not a valid XPath expression.
            txtContents.Text = ex.Message;
        }
    }

    private void Form1_Load(object sender, EventArgs e)
    {
        // Intentionally empty. A commented-out login sample used to live here; it was
        // removed because it embedded an account name, a password and a session cookie.
    }
}
/// <summary>
/// One collected entry: an id, the XPath of a node and an associated value.
/// </summary>
public class ObjXpath
{
    /// <summary>Identifier of the entry, stored as a string.</summary>
    public string id { get; set; }

    /// <summary>XPath of the node this entry refers to.</summary>
    public string Key { get; set; }

    /// <summary>Value attached to the entry.</summary>
    public string Value { get; set; }
}
}
就到这里吧,大家可以下载我的源代码试试手
打包下载:
<ignore_js_op>
AutoXpathTools.zip (76.32 KB, 下载次数: 0)
如果你感觉可以的话就给我推荐一下吧。感谢大家
posted on 2013-03-11 11:15 HOT SUMMER 阅读(15845) 评论(1) 收藏 举报
浙公网安备 33010602011771号