alexmen

专注.net软件开发,项目管理体系PMBOK.

  博客园 :: 首页 :: 博问 :: 闪存 :: 新随笔 :: 联系 :: 订阅 订阅 :: 管理 ::

一、后台抓取代码

View Code
// Download the page at `url` into `html`, decoding the body with `encoding`.
// Both `url` and `encoding` are expected to be defined by the surrounding code.
System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
// Identify the request with an IE8-style User-Agent string.
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";

string html;
// The using blocks release the response, the stream and the reader even when
// reading throws; the original never closed the response at all.
using (System.Net.WebResponse response = request.GetResponse())
using (System.IO.Stream resStream = response.GetResponseStream())
using (System.IO.StreamReader sr = new System.IO.StreamReader(resStream, encoding))
{
    html = sr.ReadToEnd();
}

// Alternative download: fetch the raw bytes of `PageUrl` with WebClient,
// authenticating with the default credentials from the credential cache.
// `PageUrl` is expected to be defined by the surrounding code.
byte[] pageData;
// WebClient is IDisposable; the using block releases it once the download ends.
using (System.Net.WebClient wc = new System.Net.WebClient())
{
    wc.Credentials = System.Net.CredentialCache.DefaultCredentials;
    pageData = wc.DownloadData(PageUrl);
}
// NOTE(review): Encoding.Default is the platform default encoding, not the
// charset declared by the page - confirm this is what the caller wants.
string Content = System.Text.Encoding.Default.GetString(pageData);

  

View Code
// Fetch http://www.baidu.com with a GET request and echo the body into the
// current ASP.NET response. Network and read failures are logged, not rethrown.
try
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.baidu.com");
    request.Method = WebRequestMethods.Http.Get;

    string data;
    // Dispose the response and reader even if ReadToEnd throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (System.IO.StreamReader reader = new System.IO.StreamReader(response.GetResponseStream()))
    {
        data = reader.ReadToEnd();
    }

    HttpContext.Current.Response.Write(data);
    HttpContext.Current.Response.End();
}
catch (WebException ex)
{
    // The request failed (DNS, connection, HTTP error status): keep the
    // best-effort behaviour but leave a trace instead of hiding the failure.
    System.Diagnostics.Trace.TraceError("Fetching http://www.baidu.com failed: {0}", ex);
}
catch (System.IO.IOException ex)
{
    // The body could not be read to the end.
    System.Diagnostics.Trace.TraceError("Reading the response from http://www.baidu.com failed: {0}", ex);
}

  

// Print the text content of every <span ... id="s" ...> element found in `html`.
// The lookbehind and lookahead keep the surrounding tags out of the match,
// so each match is just the span's inner text.
foreach (Match hit in Regex.Matches(html, @"(?i)(?<=<span.*?id=""s"".*?>)[^<]+(?=</span>)"))
{
    Console.WriteLine(hit.Value);
}

二、正则应用

/// <summary>
/// Removes every HTML tag from <paramref name="str"/>, keeping the text between the tags.
/// </summary>
/// <param name="str">Markup to strip; may be null.</param>
/// <returns>
/// The input with every <c>&lt;...&gt;</c> tag removed. Null or whitespace-only
/// input is returned unchanged.
/// </returns>
public static string delHtml(string str)
{
    // Nothing to strip from null or blank input; hand it back untouched.
    if (str == null || str.Trim().Length == 0)
    {
        return str;
    }

    // A tag is "<", one or more characters other than ">", then ">".
    return Regex.Replace(str, "<[^>]+>", "");
}

/// <summary>
/// Removes one specific HTML tag from a string.
/// </summary>
/// <param name="str">Markup to process; null is returned as null.</param>
/// <param name="tag">Tag name without angle brackets, e.g. <c>div</c>. Matched case-insensitively.</param>
/// <param name="isContent">
/// When true, each opening tag is removed together with everything up to and
/// including its nearest closing tag. When false, only the opening and closing
/// tags themselves are removed and the text between them is kept.
/// </param>
/// <returns>
/// The processed string, or <paramref name="str"/> unchanged when it is null or
/// when <paramref name="tag"/> is null or blank.
/// </returns>
public static string delTag(string str, string tag, bool isContent)
{
    if (str == null || tag == null || tag.Trim().Length == 0)
    {
        return str;
    }

    // Escape the name so regex metacharacters in it are matched literally.
    string name = Regex.Escape(tag.Trim());

    if (isContent) // remove the element together with its content
    {
        // (?=[\s/>]) ends the tag name, so "b" does not match "<br>".
        // \1 re-matches the captured tag name in the closing tag.
        string elementPattern = string.Format(@"<({0})(?=[\s/>])[^>]*>[\s\S]*?</\1\s*>", name);
        return Regex.Replace(str, elementPattern, "", RegexOptions.IgnoreCase);
    }

    // Remove opening and closing tags only. The final ">" is optional, so a tag
    // left unterminated at the end of the input is removed as well.
    string tagPattern = string.Format(@"</?{0}(?=[\s/>]|$)[^>]*>?", name);
    return Regex.Replace(str, tagPattern, "", RegexOptions.IgnoreCase);
}

  

  // Link regular expressions.
// Pattern for an anchor with a single-quoted href, followed by a bracketed
// "[...]" suffix; it is only declared here and not used below.
String regexa = "<a.*href='(.*)'.*>(.+?)</a> \\[(.*)\\]";
// Find every <a href=...>...</a> in `htmlstring`, capturing the href value
// and the link text in the named groups "url" and "content".
MatchCollection mc = Regex.Matches(htmlstring, @"<a\s+href=(?<url>.+?)>(?<content>.+?)</a>");
foreach (Match m in mc)
{
    // NOTE(review): `url` and `content` are presumably declared by the
    // surrounding code; each iteration overwrites them, so only the last
    // match's values remain after the loop.
    url = m.Groups["url"].Value;
    content = m.Groups["content"].Value;
}

其中htmlstring 为输入代码

  图片 src[^>]*[^/]\.(?:jpg|bmp|gif)(?:\"|\')
中文或英文字母、数字(整串只能是其中一种) ^([\u4e00-\u9fa5]+|[a-zA-Z0-9]+)$
网址 "\<a.+?href=['""](?!http\:\/\/)(?!mailto\:)(?<foundAnchor>[^'"">]+?)[^>]*?\>"

匹配中文字符的正则表达式: [\u4e00-\u9fa5]

匹配双字节字符(包括汉字在内):[^\x00-\xff]

匹配空行的正则表达式:\n[\s| ]*\r

匹配HTML标记的正则表达式:/<(.*)>.*<\/\1>|<(.*) \/>/

匹配首尾空格的正则表达式:(^\s*)|(\s*$)(像vbscript那样的trim函数)

匹配Email地址的正则表达式:\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*

匹配网址URL的正则表达式:http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?

平衡组的一个最常见的应用就是匹配HTML,下面这个例子可以匹配嵌套的<div>标签<div[^>]*>[^<>]*(((?'Open'<div[^>]*>)[^<>]*)+((?'-Open'</div>)[^<>]*)+)*(?(Open)(?!))</div>

正则表达式

说明

/^\s*$/

匹配空行。

/\d{2}-\d{5}/

匹配由两位数字、一个连字符再加五位数字组成的 ID 号。

/<\s*(\S+)(\s[^>]*)?>[\s\S]*<\s*\/\1\s*>/

匹配 HTML 标记。

 

posted on 2011-08-12 10:47  alexmen  阅读(2431)  评论(0)  编辑  收藏  举报