.net网页数据抓取

一、后台抓取代码

View Code

// Download the page at `url` and decode its body into `html` using `encoding`.
// Neither `url` nor `encoding` is declared in this snippet; the surrounding code must supply them.
System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
// Send an Internet Explorer 8 user-agent string with the request.
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
string html;
// The using blocks release the response, its stream and the reader even when reading throws.
using (System.Net.WebResponse response = request.GetResponse())
using (System.IO.Stream resStream = response.GetResponseStream())
using (System.IO.StreamReader sr = new System.IO.StreamReader(resStream, encoding))
{
    html = sr.ReadToEnd();
}
    
  // Alternative download path: fetch the raw bytes of PageUrl with WebClient.
  // PageUrl is not declared in this snippet; the surrounding code must supply it.
  Byte[] pageData;
  // WebClient is IDisposable, so dispose it as soon as the download finishes or fails.
  using (System.Net.WebClient wc = new System.Net.WebClient())
  {
      // Authenticate with the default credentials from the credential cache.
      wc.Credentials = System.Net.CredentialCache.DefaultCredentials;
      pageData = wc.DownloadData(PageUrl);
  }
  // Decode with the system default encoding.
  // NOTE(review): this may not match the page's declared charset - confirm for the target site.
  string Content = System.Text.Encoding.Default.GetString(pageData);

View Code

// Issue an HTTP GET to http://www.baidu.com and write the response body into the
// current ASP.NET response, then end that response.
try
{
    HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create("http://www.baidu.com");
    request.Method = WebRequestMethods.Http.Get;

    string data;
    // Dispose the response and reader even when reading the body fails.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (System.IO.StreamReader reader = new System.IO.StreamReader(response.GetResponseStream()))
    {
        data = reader.ReadToEnd();
    }

    HttpContext.Current.Response.Write(data);
    HttpContext.Current.Response.End();
}
catch (WebException ex)
{
    // Still best-effort (nothing is written on failure), but the failure is recorded
    // instead of being silently discarded.
    System.Diagnostics.Trace.TraceError("GET http://www.baidu.com failed: {0}", ex);
}
catch (System.IO.IOException ex)
{
    // The connection dropped while the body was being read.
    System.Diagnostics.Trace.TraceError("Reading the response from http://www.baidu.com failed: {0}", ex);
}

// Case-insensitively find each run of non-'<' characters that sits directly after a
// <span ...id="s"...> opening tag and directly before </span>, and print every hit
// on its own console line. `html` is not declared in this snippet.
Regex reg = new Regex(@"(?i)(?<=<span.*?id=""s"".*?>)[^<]+(?=</span>)");
MatchCollection mc = reg.Matches(html);
for (int i = 0; i < mc.Count; i++)
{
    // Match.Value is the whole-match text, the same string as Groups[0].
    Console.WriteLine(mc[i].Value);
}

二、正则应用

/// <summary>
/// Removes every HTML tag (any <c>&lt;...&gt;</c> run) from a string.
/// </summary>
/// <param name="str">Text that may contain HTML tags; may be null or empty.</param>
/// <returns>
/// <paramref name="str"/> with all tags deleted. A null or empty input is returned unchanged.
/// </returns>
public static string delHtml(string str)
{
    // Nothing to strip; hand back the input as-is (including null).
    if (string.IsNullOrEmpty(str))
    {
        return str;
    }

    // A tag is '<', one or more non-'>' characters, then '>'. Replace with the empty
    // string so the surrounding text is joined without inserted blanks.
    return Regex.Replace(str, "<[^>]+>", "");
}

/// <summary>
/// Removes one specific HTML tag from a string, optionally together with its content.
/// </summary>
/// <param name="str">Text that may contain the tag; null or empty is returned unchanged.</param>
/// <param name="tag">
/// Tag name to remove (e.g. "script"). It is inserted into the pattern verbatim, so it is
/// interpreted as a regular expression. A null or empty tag leaves <paramref name="str"/> unchanged.
/// </param>
/// <param name="isContent">
/// True to delete the opening tag, everything up to the matching closing tag, and the closing
/// tag; false to delete only the opening and closing tags and keep what is between them.
/// </param>
/// <returns>The text with the requested tag removed. Matching ignores case.</returns>
public static string delTag(string str, string tag, bool isContent)
{
    if (string.IsNullOrEmpty(str) || string.IsNullOrEmpty(tag))
    {
        return str;
    }

    if (isContent) // Remove the element together with its content.
    {
        // \b stops a short name such as "b" from also matching "<body>".
        // \1 requires the closing tag to repeat the name captured by the opening tag.
        string elementPattern = string.Format(@"<({0})\b[^>]*>[\s\S]*?</\1\s*>", tag);
        return Regex.Replace(str, elementPattern, "", RegexOptions.IgnoreCase);
    }

    // Remove only the tags: an opening tag (its closing '>' is optional, as in the
    // original pattern) or a closing tag.
    string tagPattern = string.Format(@"<(?:{0})\b[^>]*>?|</(?:{0})\b[^>]*>", tag);
    return Regex.Replace(str, tagPattern, "", RegexOptions.IgnoreCase);
}

　　// 链接正则
// Anchor pattern: an <a> tag with a single-quoted href (group 1), its link text (group 2),
// then a space and a bracketed suffix "[...]" (group 3).
String regexa = "<a.*href='(.*)'.*>(.+?)</a> \\[(.*)\\]";

// Find every <a href=...>text</a> in htmlstring; the named groups "url" and "content"
// capture the href value and the link text.
MatchCollection mc = Regex.Matches(htmlstring, @"<a\s+href=(?<url>.+?)>(?<content>.+?)</a>");
foreach (Match m in mc)
{
    // url and content are not declared in this snippet. Each iteration overwrites them,
    // so after the loop they hold the values from the last match.
    url = m.Groups["url"].Value;
    content = m.Groups["content"].Value;
}

其中htmlstring 为输入代码

　　图片 src[^>]*[^/].(?:jpg|bmp|gif)(?:\"|\')
中文 ^([\u4e00-\u9fa5]+|[a-zA-Z0-9]+)$
网址 "\<a.+?href=['""](?!http\:\/\/)(?!mailto\:)(?<foundAnchor>[^'"">]+?)[^>]*?\>"

匹配中文字符的正则表达式： [\u4e00-\u9fa5]

匹配双字节字符(包括汉字在内)：[^\x00-\xff]

匹配空行的正则表达式：\n[\s| ]*\r

匹配HTML标记的正则表达式：/<(.*)>.*<\/\1>|<(.*) \/>/

匹配首尾空格的正则表达式：(^\s*)|(\s*$)（像vbscript那样的trim函数）

匹配Email地址的正则表达式：\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*

匹配网址URL的正则表达式：http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?

平衡组的一个最常见的应用就是匹配HTML,下面这个例子可以匹配嵌套的<div>标签：<div[^>]*>[^<>]*(((?'Open'<div[^>]*>)[^<>]*)+((?'-Open'</div>)[^<>]*)+)*(?(Open)(?!))</div>

正则表达式	说明
/^\s*$/	匹配空行。
/\d{2}-\d{5}/	匹配由两位数字、一个连字符再加五位数字组成的 ID 号。
/<\s*(\S+)(\s[^>]*)?>[\s\S]*<\s*\/\1\s*>/	匹配 HTML 标记。

正则表达式

说明

/^\s*$/

匹配空行。

/\d{2}-\d{5}/

匹配由两位数字、一个连字符再加五位数字组成的 ID 号。

/<\s*(\S+)(\s[^>]*)?>[\s\S]*<\s*\/\1\s*>/

匹配 HTML 标记。

posted on 2011-08-12 10:47 alexmen 阅读(2439) 评论(0) 收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

阅读排行：
· 阿里巴巴为什么禁止超过3张表join？
· 博客园众包线下沙龙第1期：云栖开发者基地，共建技术新天地
· 让 AI 帮我部署网站，太方便了！
· 别做抢活的导演：代码中的抽象层次原则
· .NET周刊【7月第1期 2025-07-06】

alexmen

公告

搜索

常用链接

我的标签

积分与排名

随笔分类 (2)

随笔档案 (17)

文章分类 (67)

文章档案 (69)

阅读排行榜

评论排行榜

推荐排行榜

最新评论