爬虫(转载的)
抓取网页内容之前,需要先检测出网页所使用的字符编码,否则读到的文本可能是乱码。下面这段代码可以正确处理大部分网页的编码:
/// <summary>
/// Click handler that fetches a fixed list of sample pages one after another.
/// For each address the page text returned by <c>GetWebPage</c> (requested with "GET")
/// is written to <c>textBox1</c>, then a message box showing the address is displayed.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event data; not used.</param>
private void button3_Click(object sender, EventArgs e)
{
    // Sample addresses to fetch, visited in this order.
    string[] addresses = new string[]
    {
        "http://www.kbs.co.kr/",
        "http://rosemary.kbs.co.kr/",
        "http://sbcx.saic.gov.cn/trade/index.jsp",
        "http://www.csdn.net",
        "http://www.google.cn/",
        "http://www.baidu.com",
        "http://www.javaeye.com/",
        "http://blog.163.com/kel_scott66/blog/static/1150539632009614115635700/",
        "http://www.sina.com.hk/",
        "http://www.rthk.org.hk/"
    };

    for (int index = 0; index < addresses.Length; index++)
    {
        string address = addresses[index];

        // Show the fetched text first, then report which address it came from.
        string pageText = GetWebPage(address, "GET");
        textBox1.Text = pageText;
        MessageBox.Show(address);
    }
}
/// <summary>
/// Downloads a page over HTTP and returns its text decoded with the character set
/// the page declares for itself.
/// </summary>
/// <remarks>
/// The body is downloaded once as raw bytes and decoded afterwards, so the server is
/// contacted a single time. The character set is looked up, in order, in:
/// a <c>charset=...</c> declaration inside the document, the <c>charset</c> parameter of
/// the Content-Type header, and <c>HttpWebResponse.CharacterSet</c>. A label the runtime
/// does not recognise is skipped; if no candidate is usable, <c>Encoding.Default</c> is used.
/// The Content-Encoding header is deliberately not consulted: it names a compression
/// scheme (for example gzip), not a character set.
/// </remarks>
/// <param name="uri">Absolute HTTP/HTTPS address of the page.</param>
/// <param name="method">HTTP method to send, for example "GET".</param>
/// <returns>The decoded page text, or the literal "获取失败!" when anything goes wrong.</returns>
public string GetWebPage(string uri, string method)
{
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uri);
        req.Method = method;
        req.Timeout = 10000;
        req.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.2; zh-CN; rv:1.9.1.4) Gecko/20091016 Firefox/3.5.4 (.NET CLR 3.5.30729)";

        // Dispose the response so the underlying connection is released.
        using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
        {
            byte[] body = ReadResponseBytes(res);
            Encoding htmlEncoding = DetectEncoding(res, body);

            // Decode through StreamReader (not Encoding.GetString) so that a leading
            // byte-order mark is detected and stripped.
            using (StreamReader reader = new StreamReader(new MemoryStream(body), htmlEncoding))
            {
                return reader.ReadToEnd();
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers receive a sentinel string, never an exception.
        return "获取失败!";
    }
}

// Reads the whole response body into memory and closes the response stream.
private static byte[] ReadResponseBytes(WebResponse response)
{
    using (Stream input = response.GetResponseStream())
    using (MemoryStream buffer = new MemoryStream())
    {
        byte[] chunk = new byte[8192];
        int read;
        while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
        {
            buffer.Write(chunk, 0, read);
        }
        return buffer.ToArray();
    }
}

// Picks the first usable character set from: in-document declaration, Content-Type header,
// HttpWebResponse.CharacterSet; falls back to Encoding.Default.
private static Encoding DetectEncoding(HttpWebResponse response, byte[] body)
{
    // Charset labels are plain ASCII, so a provisional UTF-8 decode is enough to find them
    // even when the real encoding of the page is something else.
    string provisionalText = Encoding.UTF8.GetString(body);

    Encoding detected = TryGetEncoding(ExtractCharset(provisionalText));
    if (detected == null)
    {
        detected = TryGetEncoding(ExtractCharset(response.ContentType));
    }
    if (detected == null)
    {
        detected = TryGetEncoding(response.CharacterSet);
    }
    return detected ?? Encoding.Default;
}

// Returns the value of the first "charset=..." occurrence in text (quoted or unquoted),
// or an empty string when there is none.
private static string ExtractCharset(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return "";
    }

    // Optional opening quote, then only characters that can appear in a charset label;
    // this stops at the closing quote, '>', ';' or whitespace.
    Match match = Regex.Match(
        text,
        @"charset\b\s*=\s*[""']?(?<charset>[\w.:\-]+)",
        RegexOptions.IgnoreCase);
    return match.Success ? match.Groups["charset"].Value : "";
}

// Resolves a charset label to an Encoding; returns null for a missing or unrecognised label.
private static Encoding TryGetEncoding(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(name.Trim());
    }
    catch (ArgumentException)
    {
        return null;
    }
    catch (NotSupportedException)
    {
        return null;
    }
}
浙公网安备 33010602011771号