远程获取网页信息 转

使用 WebClient 和 HttpWebRequest 这两个 .NET 类,分别封装出一个静态类和一个动态(实例)类。

静态类 GetHtml 支持一般的网页取数据和 POST 提交,但不支持验证码;既可以自动识别网页编码,也可以手动指定网页编码。不过最好手动指定,这样程序可以少执行一些检测代码。

System.Collections.Specialized.NameValueCollection PostVars =new System.Collections.Specialized.NameValueCollection();
PostVars.Add("uid","name");
PostVars.Add("pwd","123456");

string tmphtml= GetStrHtmlPost(url,PostVars);

动态类 PostWeb 支持验证码和用户身份验证;登录后会产生 Cookie 字符串,第二次运行程序时可以直接使用该 Cookie 而不用再次登录。

PostWeb web=new PostWeb();

web.GetCode(验证码地址);

string tmplogin=web.LoginPost("http://www.mystand.com.cn/", 登录参数, "http://www.mystand.com.cn/");

if(tmplogin.Contains(条件))

{

string cookie= web.cookieHeader;//保存到文件中,下次直接赋给类的 cookieHeader 就可免登录

web.GetPage("http://www.mystand.com.cn/", "http://www.mystand.com.cn/");

}

 

PostWeb web=new PostWeb();

web.cookieHeader=cookie;//把保存在文件中的 cookie 赋给类

web.GetPage("http://www.mystand.com.cn/", "http://www.mystand.com.cn/");


using System;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;
using System.IO;
using System.IO.Compression;

namespace Manager.Net.Html
{

/// <summary>
/// HTML相关
/// </summary>
public class CHtml
{


/// <summary>
/// Creates a <c>CHtml</c> instance. Every helper on this type is static, so an
/// instance carries no state. The former empty finalizer was removed because it
/// did no work and only forced each instance through the finalization queue.
/// </summary>
public CHtml()
{
}
/// <summary>
/// Best-effort scrubbing of a string that is about to be embedded in SQL text.
/// Single quotes are doubled; double quotes, pipes, semicolons and parentheses
/// are removed; the keywords EXEC/EXECUTE are removed; the prefixes xp_ / sp_
/// and the hexadecimal prefix 0x are broken up with a space.
/// Keyword handling is case-insensitive.
/// </summary>
/// <remarks>
/// Blacklist filtering cannot make string-built SQL safe; parameterized queries
/// should be preferred wherever the caller controls the query construction.
/// </remarks>
/// <param name="source">The raw input string. May be null or empty.</param>
/// <returns>The scrubbed string, or an empty string when <paramref name="source"/> is null or empty.</returns>
public static string FilterSql(string source)
{
    if (String.IsNullOrEmpty(source))
    {
        return String.Empty;
    }

    // Double single quotes so they cannot terminate a SQL string literal.
    source = source.Replace("'", "''");

    // Remove characters commonly used to chain or group statements.
    source = source.Replace("\"", "")
                   .Replace("|", "")
                   .Replace(";", "")
                   .Replace("(", "")
                   .Replace(")", "");

    RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

    // "execute" must be listed before "exec": otherwise the shorter keyword is
    // stripped first and the tail "ute" is left behind.
    source = Regex.Replace(source, "execute|exec", "", options);

    // Break up system / extended stored procedure prefixes, keeping the original letter case.
    source = Regex.Replace(source, "([xs])p_", "$1 p_", options);

    // Break up hexadecimal literals (0x...), keeping the original letter case.
    source = Regex.Replace(source, "0(x)", "0 $1", options);

    return source;
}







/// <summary>
/// Reads the remainder of a stream and returns it as text.
/// </summary>
/// <param name="Stream">The stream to read to its end. It is not closed by this method.</param>
/// <param name="Encod">The encoding used to decode the bytes; when null, <see cref="Encoding.Default"/> is used.</param>
/// <returns>The decoded text.</returns>
public static string HtmlStr(System.IO.Stream Stream, Encoding Encod)
{
    Encoding effectiveEncoding = Encod ?? Encoding.Default;
    System.IO.StreamReader reader = new System.IO.StreamReader(Stream, effectiveEncoding);
    return reader.ReadToEnd();
}


/// <summary>
/// Checks a user-supplied URL parameter for a small set of SQL-injection markers
/// ("SELECT ", " AND ", " OR ", "EXEC ", "CHAR("), ignoring letter case.
/// </summary>
/// <param name="str">The text to inspect. May be null or empty.</param>
/// <returns>
/// <c>true</c> when at least one marker is present; <c>false</c> otherwise,
/// including for null or empty input.
/// </returns>
public static bool VerifyString(string str)
{
    if (String.IsNullOrEmpty(str))
    {
        return false;
    }

    string[] markers = { "SELECT ", " AND ", " OR ", "EXEC ", "CHAR(" };
    foreach (string marker in markers)
    {
        // Ordinal, case-insensitive search: an upper-casing round trip is
        // culture-sensitive and can miss matches under some cultures.
        if (str.IndexOf(marker, StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return true;
        }
    }

    // The previous trailing Replace(...) call discarded its result (strings are
    // immutable), so it had no effect and has been dropped.
    return false;
}


/// <summary>
/// Finds every &lt;img ...&gt; tag in a page and returns the image address of each,
/// as resolved by <c>GetImg</c>, joined into one string with a trailing "|" after
/// every entry. The page text is lower-cased before matching.
/// </summary>
/// <param name="HtmlCode">The HTML text to scan.</param>
/// <param name="imgHttp">The http:// prefix passed through to <c>GetImg</c>.</param>
/// <returns>The "|"-terminated list of image addresses; empty when no tag is found.</returns>
public static string GetImgSrc(string HtmlCode, string imgHttp)
{
    const string imgTagPattern = @"<img.+?>";
    StringBuilder collected = new StringBuilder();

    MatchCollection tags = Regex.Matches(HtmlCode.ToLower(), imgTagPattern);
    foreach (Match tag in tags)
    {
        string tagText = tag.Value.ToLower().Trim();
        collected.Append(GetImg(tagText, imgHttp));
        collected.Append("|");
    }

    return collected.ToString();
}

/// <summary>
/// Extracts the image address from the <c>src</c> attribute of a single
/// &lt;img ...&gt; tag. Only addresses ending in bmp, jpg, jpeg, gif or png
/// (optionally followed by further characters of the same attribute value) are recognised.
/// </summary>
/// <remarks>
/// The tag is lower-cased before matching, so the returned address is lower-case.
/// NOTE(review): this keeps the historical output but can break case-sensitive URLs.
/// </remarks>
/// <param name="ImgString">The text of one &lt;img&gt; tag. May be null or empty.</param>
/// <param name="imgHttp">Prefix prepended when the address does not look absolute.</param>
/// <returns>
/// The address unchanged when it starts with a scheme or contains one of the
/// known top-level-domain fragments; otherwise <paramref name="imgHttp"/> followed
/// by the address. When no address is found, <paramref name="imgHttp"/> alone.
/// </returns>
public static string GetImg(string ImgString, string imgHttp)
{
    string address = "";

    if (!String.IsNullOrEmpty(ImgString))
    {
        // The value stops at a quote, whitespace or '>' so that the match cannot
        // run into later attributes, and the optional opening quote is excluded
        // from the captured group. The look-behind skips attributes such as data-src.
        Match found = Regex.Match(
            ImgString.ToLowerInvariant(),
            @"(?<![\w-])src\s*=\s*[""']?(?<url>[^""'\s>]+\.(?:bmp|jpe?g|gif|png))");
        if (found.Success)
        {
            address = found.Groups["url"].Value;
        }
    }

    // An explicit scheme (or protocol-relative prefix) is always absolute.
    if (address.StartsWith("http://", StringComparison.Ordinal)
        || address.StartsWith("https://", StringComparison.Ordinal)
        || address.StartsWith("//", StringComparison.Ordinal))
    {
        return address;
    }

    // Historical heuristic: a recognisable domain suffix anywhere in the address
    // is taken to mean that the address already contains a host name.
    string[] domainHints = { ".net", ".com", ".org", ".cn", ".cc", ".info", ".biz", ".tv" };
    foreach (string hint in domainHints)
    {
        if (address.IndexOf(hint, StringComparison.Ordinal) != -1)
        {
            return address;
        }
    }

    return imgHttp + address;
}



/// <summary>
/// Collects every <c>href</c> attribute occurrence in a page. Each match is
/// lower-cased, has the literal text "href=" removed, is trimmed, and is
/// appended to the result followed by "|".
/// </summary>
/// <param name="HtmlCode">The HTML text to scan.</param>
/// <returns>The "|"-terminated list of matches; empty when nothing matches.</returns>
public static string GetHref(string HtmlCode)
{
    const string hrefPattern = @"(h|H)(r|R)(e|E)(f|F) *= *('|"")?((\w|\\|\/|\.|:|-|_)+)[\S]*";
    StringBuilder links = new StringBuilder();

    MatchCollection hits = Regex.Matches(HtmlCode, hrefPattern);
    foreach (Match hit in hits)
    {
        string lowered = hit.Value.ToLower();
        string withoutPrefix = lowered.Replace("href=", "");
        links.Append(withoutPrefix.Trim());
        links.Append("|");
    }

    return links.ToString();
}

/// <summary>
/// Strips HTML tags from a page, leaving only the text between them, and trims
/// leading and trailing white space from the result.
/// </summary>
/// <param name="strhtml">The HTML text.</param>
/// <returns>The text with every &lt;...&gt; sequence removed.</returns>
public static string RemoveHTML(string strhtml)
{
    string withoutTags = Regex.Replace(strhtml, @"<[^>]+>|</[^>]+>", "");
    return withoutTags.Trim();
}

/// <summary>
/// Detects the text encoding declared in a page by looking for the first
/// <c>charset=</c> declaration. Unquoted, single-quoted and double-quoted
/// values are accepted, with optional white space around the equals sign.
/// </summary>
/// <param name="strHtml">The page text. May be null or empty.</param>
/// <returns>
/// The declared encoding; UTF-8 when the page declares none;
/// <see cref="Encoding.Default"/> when the declared name is not a known encoding.
/// </returns>
public static Encoding GetEncoding(string strHtml)
{
    string charset = "";

    if (!String.IsNullOrEmpty(strHtml))
    {
        // One pattern covers charset=utf-8, charset="utf-8" and charset='utf-8',
        // so no separate string-slicing fallbacks are needed.
        const string pattern = @"(?i)\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)";
        charset = Regex.Match(strHtml, pattern).Groups["charset"].Value;
    }

    if (charset.Length == 0)
    {
        charset = Encoding.UTF8.BodyName;
    }

    try
    {
        return Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        // The declared name is not a valid encoding name.
        return Encoding.Default;
    }
    catch (NotSupportedException)
    {
        // The encoding is known but unavailable on this platform.
        return Encoding.Default;
    }
}


/// <summary>
/// Reads the Internet Explorer version from the registry value
/// <c>HKEY_LOCAL_MACHINE\Software\Microsoft\Internet Explorer\Version</c>.
/// </summary>
/// <returns>
/// The version string, or an empty string when the key or the value does not exist.
/// </returns>
public static string GetIEVersion()
{
    using (Microsoft.Win32.RegistryKey versionKey = Microsoft.Win32.Registry.LocalMachine.OpenSubKey(@"Software\Microsoft\Internet Explorer"))
    {
        // OpenSubKey returns null when the key is missing.
        if (versionKey == null)
        {
            return String.Empty;
        }

        // GetValue returns null when the value is missing.
        object version = versionKey.GetValue("Version");
        return version == null ? String.Empty : version.ToString();
    }
}
}

/// <summary>
/// 模拟网页提交数据
/// </summary>
public class PostWeb
{
/// <summary>
/// Cookie header text. GetCode, LoginGet and LoginPost overwrite it with the
/// cookies held for the requested URL after each response; LoginGet, LoginPost
/// and GetPage send it with their requests. It can be saved and assigned again
/// on a later run to reuse a session.
/// </summary>
public string cookieHeader = "";
/// <summary>
/// Name of the encoding used to decode response bodies (for example "utf-8").
/// When empty, the character set reported by the response is used instead.
/// </summary>
public string Encod = "";

// NOTE(review): not referenced by any method visible in this file - confirm before relying on it.
public bool SetCookies = false;
// NOTE(review): not referenced by any method visible in this file; each request method sets its own HTTP verb.
public string Method = "POST";
/// <summary>
/// Whether proxy settings should be considered for outgoing requests.
/// </summary>
public bool IsProxy = false;

/// <summary>
/// Proxy address used when the default proxy settings contain no address.
/// </summary>
public string proxyaddress = "";
/// <summary>
/// User name for network credential authentication.
/// </summary>
public string CredentialUserName = "";
/// <summary>
/// Password for network credential authentication.
/// </summary>
public string CredentialPassWord = "";
/// <summary>
/// Domain for network credential authentication.
/// </summary>
public string CredentialDoMain = "";


// Encoding resolved by SetEncod on first use and reused for later responses.
Encoding tmpEncod;
/// <summary>
/// Creates a <c>PostWeb</c> with default settings. The former empty finalizer
/// was removed because it did no work and only forced each instance through
/// the finalization queue.
/// </summary>
public PostWeb()
{
}
/// <summary>
/// Applies proxy settings to a request when <c>IsProxy</c> is set. The default
/// (Internet Explorer) proxy is preferred; when it has no address, the address
/// in <c>proxyaddress</c> is used instead. When neither yields an address the
/// request keeps whatever proxy it already had.
/// </summary>
/// <param name="request">The request to configure.</param>
private void ProxySetting(HttpWebRequest request)
{
    if (!IsProxy)
    {
        return;
    }

    // NOTE(review): WebProxy.GetDefaultProxy is marked obsolete in current
    // frameworks; it is kept here to preserve the original lookup order.
    WebProxy proxy = WebProxy.GetDefaultProxy();

    if (proxy.Address == null && !String.IsNullOrEmpty(proxyaddress))
    {
        proxy.Address = new Uri(proxyaddress);
    }

    // Previously the proxy object was built but never attached to the request,
    // so the configured address had no effect.
    if (proxy.Address != null)
    {
        request.Proxy = proxy;
    }
}

/// <summary>
/// Attaches network credentials to a request. Nothing is changed unless the
/// user name, the password and the domain are all non-empty.
/// </summary>
/// <param name="request">The request to configure.</param>
private void NetworkCredentialSetting(HttpWebRequest request)
{
    bool incomplete = String.IsNullOrEmpty(CredentialUserName)
        || String.IsNullOrEmpty(CredentialPassWord)
        || String.IsNullOrEmpty(CredentialDoMain);
    if (incomplete)
    {
        return;
    }

    request.PreAuthenticate = true;
    request.Credentials = new NetworkCredential(CredentialUserName, CredentialPassWord, CredentialDoMain);
    request.SendChunked = false;
}

/// <summary>
/// Downloads a verification-code (captcha) image and saves it to a file.
/// </summary>
/// <param name="strURL">Address of the verification-code image.</param>
/// <param name="dir">Target directory; created when missing. When null or empty, <paramref name="filename"/> is used as given.</param>
/// <param name="filename">Name of the file to write.</param>
/// <param name="imageFormat">Image format used when saving.</param>
public void GetCode(string strURL, string dir, string filename, System.Drawing.Imaging.ImageFormat imageFormat)
{
    string targetPath = filename;
    if (!String.IsNullOrEmpty(dir))
    {
        System.IO.Directory.CreateDirectory(dir);
        // Path.Combine inserts the platform's separator only when one is needed.
        targetPath = System.IO.Path.Combine(dir, filename);
    }

    // The using block releases the image even when Save throws.
    using (System.Drawing.Image code = GetCode(strURL))
    {
        code.Save(targetPath, imageFormat);
    }
}

/// <summary>
/// Downloads a verification-code (captcha) image with a GET request and records
/// the cookies held for that URL in <c>cookieHeader</c>.
/// </summary>
/// <param name="strURL">Address of the verification-code image.</param>
/// <returns>The downloaded image. The caller is responsible for disposing it.</returns>
public System.Drawing.Image GetCode(string strURL)
{
    Uri imageUri = new Uri(strURL);
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(imageUri);
    ProxySetting(request);
    NetworkCredentialSetting(request);
    request.Method = "GET";
    request.KeepAlive = true;

    CookieContainer cookies = new CookieContainer();
    request.CookieContainer = cookies;

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream body = response.GetResponseStream())
    {
        cookieHeader = cookies.GetCookieHeader(imageUri);

        // Image.FromStream requires its stream to stay open for the lifetime of
        // the image, but the response stream is closed when this method returns.
        // The bytes are therefore copied into a MemoryStream that is handed to
        // the image and deliberately left open.
        MemoryStream buffered = new MemoryStream();
        byte[] chunk = new byte[8192];
        int read;
        while ((read = body.Read(chunk, 0, chunk.Length)) > 0)
        {
            buffered.Write(chunk, 0, read);
        }
        buffered.Position = 0;

        return System.Drawing.Image.FromStream(buffered, false, false);
    }
}

/// <summary>
/// Requests a login page with GET, sending the cookies currently stored in
/// <c>cookieHeader</c>, and afterwards stores the cookies held for that URL
/// back into <c>cookieHeader</c>.
/// </summary>
/// <param name="strURL">Address of the page to request.</param>
/// <param name="strReferer">Value sent as the Referer header.</param>
/// <returns>The response body after URL-decoding and then HTML-decoding.</returns>
public string LoginGet(string strURL, string strReferer)
{
    Uri target = new Uri(strURL);
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(target);
    ProxySetting(request);
    NetworkCredentialSetting(request);
    request.AllowAutoRedirect = true;
    request.KeepAlive = true;
    request.Accept = " image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/xaml+xml, application/vnd.ms-xpsdocument, application/x-ms-xbap, application/x-ms-application, application/QVOD, */*";
    request.Referer = strReferer;
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; MAXTHON 2.0)";
    request.ContentType = "application/x-www-form-urlencoded";
    request.Method = "GET";
    request.Timeout = 3000;

    CookieContainer cookies = new CookieContainer();
    if (!String.IsNullOrEmpty(cookieHeader))
    {
        // cookieHeader is in Cookie-header form ("a=1; b=2"), whereas SetCookies
        // expects comma-delimited cookies; each pair is therefore added on its own.
        foreach (string pair in cookieHeader.Split(';'))
        {
            string cookie = pair.Trim();
            if (cookie.Length > 0)
            {
                cookies.SetCookies(target, cookie);
            }
        }
    }
    // Always attach the container: it is read after the response, and leaving it
    // unset on the first call (empty cookieHeader) caused a NullReferenceException.
    request.CookieContainer = cookies;

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        cookieHeader = cookies.GetCookieHeader(target);
        SetEncod(response.CharacterSet);

        string body = CHtml.HtmlStr(response.GetResponseStream(), tmpEncod);
        return System.Web.HttpUtility.HtmlDecode(System.Web.HttpUtility.UrlDecode(body));
    }
}

/// <summary>
/// Resolves the encoding used to decode response bodies, once. A non-empty
/// <c>Encod</c> takes precedence; otherwise the character set reported by the
/// server is used. After an encoding has been resolved, later calls do nothing.
/// </summary>
/// <param name="cod">Character-set name reported by the response. May be null, empty or unknown.</param>
void SetEncod(string cod)
{
    if (tmpEncod != null)
    {
        return;
    }

    if (!String.IsNullOrEmpty(Encod))
    {
        // An explicitly configured name is a caller decision; an invalid name
        // still throws so that the misconfiguration is visible.
        tmpEncod = Encoding.GetEncoding(Encod);
        return;
    }

    if (String.IsNullOrEmpty(cod))
    {
        // Nothing to resolve; tmpEncod stays null and the next response is tried again.
        return;
    }

    try
    {
        tmpEncod = Encoding.GetEncoding(cod);
    }
    catch (ArgumentException)
    {
        // The server reported a name that is not a known encoding. Leave tmpEncod
        // null instead of failing the whole request.
    }
}

/// <summary>
/// Submits login data with POST, sending the cookies currently stored in
/// <c>cookieHeader</c>, and afterwards stores the cookies held for that URL
/// back into <c>cookieHeader</c>.
/// Example: LoginPost("http://www.mystand.com.cn/login/submit.jsp", "userid=name&amp;password=secret", "http://www.mystand.com.cn/");
/// </summary>
/// <param name="strURL">Address the login data is posted to.</param>
/// <param name="strArgs">Form data in URL-encoded form (key=value pairs joined with &amp;). It is written as ASCII.</param>
/// <param name="strReferer">Value sent as the Referer header.</param>
/// <returns>The response body after URL-decoding and then HTML-decoding.</returns>
public string LoginPost(string strURL, string strArgs, string strReferer)
{
    Uri target = new Uri(strURL);
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(target);
    ProxySetting(request);
    NetworkCredentialSetting(request);
    request.AllowAutoRedirect = true;
    request.KeepAlive = true;
    request.Accept = " image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/xaml+xml, application/vnd.ms-xpsdocument, application/x-ms-xbap, application/x-ms-application, application/QVOD, */*";
    request.Referer = strReferer;
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; MAXTHON 2.0)";
    request.ContentType = "application/x-www-form-urlencoded";
    request.Method = "POST";
    request.Timeout = 3000;

    CookieContainer cookies = new CookieContainer();
    if (!String.IsNullOrEmpty(cookieHeader))
    {
        // cookieHeader is in Cookie-header form ("a=1; b=2"), whereas SetCookies
        // expects comma-delimited cookies; each pair is therefore added on its own.
        foreach (string pair in cookieHeader.Split(';'))
        {
            string cookie = pair.Trim();
            if (cookie.Length > 0)
            {
                cookies.SetCookies(target, cookie);
            }
        }
    }
    // Always attach the container: it is read after the response, and leaving it
    // unset when cookieHeader was empty caused a NullReferenceException.
    request.CookieContainer = cookies;

    // Write the form data into the request body; the using blocks close the
    // writer and the stream even when writing fails.
    using (Stream requestBody = request.GetRequestStream())
    using (StreamWriter writer = new StreamWriter(requestBody, Encoding.ASCII))
    {
        writer.Write(strArgs);
    }

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        cookieHeader = cookies.GetCookieHeader(target);
        SetEncod(response.CharacterSet);

        string body = CHtml.HtmlStr(response.GetResponseStream(), tmpEncod);
        return System.Web.HttpUtility.HtmlDecode(System.Web.HttpUtility.UrlDecode(body));
    }
}

/// <summary>
/// Fetches a page with GET, sending the cookies stored in <c>cookieHeader</c>
/// (typically captured by an earlier login call) as the Cookie header.
/// </summary>
/// <param name="strURL">Address of the page to fetch.</param>
/// <param name="strReferer">Value sent as the Referer header; omitted when null or empty.</param>
/// <returns>The response body after URL-decoding and then HTML-decoding.</returns>
public string GetPage(string strURL, string strReferer)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strURL);
    ProxySetting(request);
    NetworkCredentialSetting(request);
    request.ContentType = "text/html";
    request.Method = "GET";
    if (!string.IsNullOrEmpty(strReferer))
    {
        request.Referer = strReferer;
    }
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; MAXTHON 2.0)";

    // Only send a Cookie header when there is something to send.
    if (!String.IsNullOrEmpty(cookieHeader))
    {
        request.Headers.Add("Cookie", cookieHeader);
    }

    // The leftover debug Console.WriteLine and the unused CookieContainer local
    // have been removed.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        SetEncod(response.CharacterSet);

        string body = CHtml.HtmlStr(response.GetResponseStream(), tmpEncod);
        return System.Web.HttpUtility.HtmlDecode(System.Web.HttpUtility.UrlDecode(body));
    }
}


}

/// <summary>
/// 取网页数据
/// </summary>
public class GetHtml
{
/// <summary>
/// Creates a <c>GetHtml</c> instance. Every method on this type is static, so an
/// instance carries no state. The former empty finalizer was removed because it
/// did no work and only forced each instance through the finalization queue.
/// </summary>
public GetHtml()
{
}


/// <summary>
/// Posts form values to a URL and returns the response body as text.
/// The body is first decoded as UTF-8 to locate a charset declaration; when the
/// page declares a different encoding, the bytes are decoded again with it.
/// Example:
///   System.Collections.Specialized.NameValueCollection PostVars = new System.Collections.Specialized.NameValueCollection();
///   PostVars.Add("uid", "name");
///   PostVars.Add("pwd", "123456");
///   GetStrHtmlPost(url, PostVars);
/// </summary>
/// <param name="url">Address to post to.</param>
/// <param name="PostVars">Form values to post. When null, nothing is sent.</param>
/// <returns>
/// The response text, or an empty string when <paramref name="PostVars"/> is null
/// or when the request fails for any reason.
/// </returns>
public static string GetStrHtmlPost(String url, System.Collections.Specialized.NameValueCollection PostVars)
{
    if (PostVars == null)
    {
        return "";
    }

    try
    {
        byte[] buf;
        using (System.Net.WebClient client = new System.Net.WebClient())
        {
            buf = client.UploadValues(url, "POST", PostVars);
        }

        // Decode as UTF-8 first, as GetStrHtml does. Previously the bytes were
        // decoded with the system ANSI code page and that text was returned even
        // when the page turned out to be UTF-8.
        string html = Encoding.UTF8.GetString(buf);
        Encoding encoding = CHtml.GetEncoding(html);

        // Compare code pages rather than object references.
        if (encoding.CodePage == Encoding.UTF8.CodePage)
        {
            return html;
        }
        return encoding.GetString(buf);
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers receive an empty string on
        // any failure instead of an exception.
        return "";
    }
}

/// <summary>
/// Downloads a page and returns its HTML, detecting the encoding from the
/// page's own charset declaration.
/// </summary>
/// <param name="url">Address of the page.</param>
/// <returns>The page text.</returns>
public static string GetStrHtml(string url)
{
    return GetStrHtml(url, null);
}
/// <summary>
/// Downloads a page and returns its HTML.
/// </summary>
/// <param name="url">Address of the page.</param>
/// <param name="encoding">
/// Encoding used to decode the page. When null, the bytes are decoded as UTF-8
/// to locate a charset declaration and decoded again when the page declares a
/// different encoding.
/// </param>
/// <returns>The page text.</returns>
public static string GetStrHtml(string url, Encoding encoding)
{
    byte[] buf;
    // WebClient is disposable; release it as soon as the download completes.
    using (WebClient client = new WebClient())
    {
        buf = client.DownloadData(url);
    }

    if (encoding != null)
    {
        return encoding.GetString(buf);
    }

    string html = Encoding.UTF8.GetString(buf);
    Encoding detected = CHtml.GetEncoding(html);

    // Compare code pages rather than object references.
    if (detected.CodePage == Encoding.UTF8.CodePage)
    {
        return html;
    }
    return detected.GetString(buf);
}







}
}

 

 

posted on 2008-11-11 16:17  西湖浪子  阅读(364)  评论(0)    收藏  举报