获取页面数据(C#.net)
通过地址获取网页中的数据,读取网页的代码如下:
View Code
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <remarks>
/// Any exception raised while requesting or reading the page is shown in a
/// message box and the request is retried. The retry count is kept in the
/// <c>num</c> field (declared outside this method); no further attempt is made
/// once it reaches 10, and it is reset to 0 before returning.
/// </remarks>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="EncodingName">
/// Name of the text encoding used to decode the response. A null, empty or
/// unrecognised name falls back to "GB2312".
/// </param>
/// <returns>
/// The decoded response body, or an empty string when the response status is
/// not OK, the declared content length is too large, or every attempt failed.
/// </returns>
public string GetPageData(string url, string EncodingName)
{
    string str = "";

    // Iterative form of the retry: each failed attempt increments num and tries again.
    while (num < 10)
    {
        try
        {
            HttpWebRequest.DefaultCachePolicy = new RequestCachePolicy(RequestCacheLevel.NoCacheNoStore);

            HttpWebRequest q = (HttpWebRequest)HttpWebRequest.Create(url);
            q.Method = "POST"; // note: no request body is written
            q.ReadWriteTimeout = 300000;
            q.Timeout = 300000;

            // Dispose the response on every path, including non-OK or oversized
            // replies and exceptions thrown while reading.
            // (Author's note: obtaining the response takes about 500 ms.)
            using (HttpWebResponse p = (HttpWebResponse)q.GetResponse())
            {
                // An unknown length is reported as -1 and therefore passes this check.
                if (p.StatusCode == HttpStatusCode.OK && p.ContentLength < 10240 * 10240)
                {
                    Encoding encoding;
                    try
                    {
                        encoding = Encoding.GetEncoding(string.IsNullOrEmpty(EncodingName) ? "GB2312" : EncodingName);
                    }
                    catch (ArgumentException)
                    {
                        // Unknown encoding name: fall back to the default.
                        encoding = Encoding.GetEncoding("GB2312");
                    }

                    using (StreamReader sr = new StreamReader(p.GetResponseStream(), encoding))
                    {
                        // (Author's note: reading the stream takes about 300 ms.)
                        str = sr.ReadToEnd();
                    }
                }
            }

            // The response is already closed here; Abort releases anything the request still holds.
            q.Abort();
            break;
        }
        catch (Exception htl)
        {
            MessageBox.Show(htl.ToString());
            num++;
        }
    }

    num = 0;
    return str;
}
要获取网页中特定的数据,就要依情况而定了,下面是我获取网页中表格中的数据的例子:
View Code
/// <summary>
/// Extracts the text cells of a table from a page's HTML.
/// </summary>
/// <remarks>
/// The region of interest starts after the last occurrence of the header
/// marker that follows its first occurrence (so the marker must appear at
/// least twice) and ends just before the end-marker comment. That region is
/// passed through <c>StripHtml</c> and split on single spaces.
/// </remarks>
/// <param name="html">HTML source of the page.</param>
/// <param name="goon">Set to <c>true</c> on every return path.</param>
/// <returns>
/// The space-separated pieces of the stripped region, or a one-element array
/// holding an empty string when any of the markers is missing.
/// </returns>
public string[] GetTableString(string html, out bool goon)
{
    const string headerMarker = "科目</div></td>";
    const string endMarker = "<!-- 右边列表开始-->";

    // Result used whenever a marker cannot be located.
    string[] cells = { "" };

    int firstHeader = html.IndexOf(headerMarker);
    if (firstHeader >= 0)
    {
        string remainder = html.Substring(firstHeader + headerMarker.Length);

        int lastHeader = remainder.LastIndexOf(headerMarker);
        if (lastHeader >= 0)
        {
            remainder = remainder.Substring(lastHeader + headerMarker.Length);

            int endAt = remainder.IndexOf(endMarker);
            if (endAt >= 0)
            {
                string text = StripHtml(remainder.Substring(0, endAt));
                cells = Regex.Split(text, " ", RegexOptions.IgnoreCase);
            }
        }
    }

    goon = true;
    return cells;
}
上面的代码中调用了 StripHtml 函数,它会去除 HTML 标签,并将连续的空白字符合并为一个空格,代码如下:
View Code
/// <summary>
/// Removes HTML tags from a fragment and normalises its whitespace.
/// </summary>
/// <param name="strHtml">HTML fragment to clean; may be null or empty.</param>
/// <returns>
/// The text with tags removed, the <c>&amp;lt;</c> and <c>&amp;gt;</c> entities decoded,
/// every run of whitespace collapsed to one space, and leading/trailing
/// whitespace trimmed. An empty string is returned for null or empty input.
/// </returns>
public string StripHtml(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return "";
    }

    // Drop every tag; (.|\n) lets a tag span line breaks.
    string strResponse = Regex.Replace(strHtml, "<(.|\n)+?>", "");

    // Decode the angle-bracket entities only after the tags are gone, so the
    // decoded characters are not themselves mistaken for markup.
    strResponse = strResponse.Replace("&lt;", "<");
    strResponse = strResponse.Replace("&gt;", ">");

    // Collapse each run of whitespace into a single space.
    string strOutput = Regex.Replace(strResponse, @"\s+", " ");
    return strOutput.Trim();
}
最后得到的数据就是想要的数据了,可以将其放在字符串数组中,方便使用。


浙公网安备 33010602011771号