代理IP抓取

针对"有代理"网站(http://www.youdaili.net/)上的国内代理IP进行了抓取。我花了两天时间研究,解决了中文乱码的问题,成功抓取到了IP和端口号。采用的是HtmlAgilityPack技术,下面贴出代码;如果需要下载,请到 http://download.csdn.net/detail/waiwai1015/9035015 ,我在那里上传了代码和库文件。

 

/// <summary>
/// Scrapes proxy "ip:port" entries from the domestic ("guonei") proxy lists that
/// http://www.youdaili.net/ links from its front page, using HtmlAgilityPack for
/// HTML parsing.
/// </summary>
public class getProxyIp
{
    // ScrapySharp (original author's note; the code below only uses HtmlAgilityPack).
    private static string youdaili = "http://www.youdaili.net/";

    // Prefix prepended to the relative pager links found on a list page.
    private static string hrefhead = youdaili + "Daili/guonei/";

    /// <summary>
    /// Downloads <paramref name="url"/> and parses it as an HTML document.
    /// The body is always decoded as UTF-8, regardless of the response headers.
    /// </summary>
    /// <param name="url">Absolute URL of the page to download.</param>
    /// <returns>The parsed document.</returns>
    private static HtmlDocument getip(string url)
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(new Uri(url));
        req.Method = "GET";

        // Dispose the response and reader deterministically so the connection is
        // released back to the pool instead of waiting for finalization.
        using (HttpWebResponse rs = (HttpWebResponse)req.GetResponse())
        using (StreamReader sr = new StreamReader(rs.GetResponseStream(), System.Text.Encoding.GetEncoding("utf-8")))
        {
            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(sr.ReadToEnd());
            return doc;
        }
    }

    /// <summary>
    /// Collects, from the site's front page, the links whose <c>title</c> contains
    /// today's date (formatted "MM月dd") and whose <c>href</c> contains "guonei".
    /// </summary>
    /// <returns>The matching href values; an empty list when nothing matches.</returns>
    private static List<string> GetHrefs()
    {
        List<string> ipUrlList = new List<string>();
        HtmlDocument frontPage = getip(youdaili);

        // Invariant culture keeps the digits/calendar independent of the machine's
        // regional settings, so the text matches what the site prints.
        string todaydaili = DateTime.Now.ToString("MM月dd", System.Globalization.CultureInfo.InvariantCulture);

        HtmlNodeCollection anchors = frontPage.DocumentNode.SelectNodes(".//a"); // ul/li/a
        if (anchors == null)
        {
            // No anchors at all: report "nothing found" instead of null so the
            // caller can iterate without a null check.
            return ipUrlList;
        }

        foreach (HtmlNode anchor in anchors)
        {
            if (anchor.Attributes["title"] == null || anchor.Attributes["href"] == null)
            {
                continue;
            }

            string title = anchor.Attributes["title"].Value;
            string link = anchor.Attributes["href"].Value;
            if (title.IndexOf(todaydaili) < 0 || link.Length == 0)
            {
                continue;
            }

            // To also collect the foreign lists, additionally accept links
            // containing "guowai" here.
            if (link.IndexOf("guonei") > 0)
            {
                ipUrlList.Add(link);
            }
        }
        return ipUrlList;
    }

    /// <summary>
    /// Expands every list link returned by <see cref="GetHrefs"/> into the URLs of
    /// all of its pages (the first page plus the numbered pager links).
    /// </summary>
    /// <returns>URLs of every page that should be scanned for proxies.</returns>
    private static List<string> GetIPhrefs()
    {
        List<string> ipHrefList = new List<string>();

        foreach (string listUrl in GetHrefs())
        {
            ipHrefList.Add(listUrl);

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(HttpGet(listUrl)); // this page is the one that used to come back garbled

            // Pager markup looks like:
            //   <li><a>共3页: </a></li><li><a href='#'>上一页</a></li>
            //   <li class="thisclass"><a href='#'>1</a></li>
            //   <li><a href='3537_2.html'>2</a></li><li><a href='3537_3.html'>3</a></li>
            //   <li><a href='3537_2.html'>下一页</a></li>
            HtmlNodeCollection pagerLinks = doc.DocumentNode.SelectNodes("//ul[@class='pagelist']//a");
            if (pagerLinks == null)
            {
                continue; // single-page list: no pager present
            }

            // Skip the first three anchors (page count, "previous", current page 1)
            // and the last one ("next"); what remains are pages 2..N.
            for (int m = 3; m < pagerLinks.Count - 1; m++)
            {
                string relative = pagerLinks[m].GetAttributeValue("href", string.Empty);
                if (relative.Length > 0)
                {
                    ipHrefList.Add(hrefhead + relative);
                }
            }
        }
        return ipHrefList;
    }

    /// <summary>
    /// Downloads every proxy list page for today and extracts the text that
    /// precedes the first '@' on each line of the page's first <c>&lt;p&gt;</c>
    /// element (the "ip:port" part of an entry).
    /// </summary>
    /// <returns>
    /// The extracted entries, or <c>null</c> when any download or parse step throws.
    /// </returns>
    public static List<string> GetIPs()
    {
        try
        {
            List<string> ipList = new List<string>();

            foreach (string pageUrl in GetIPhrefs())
            {
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(HttpGet(pageUrl));

                HtmlNode firstParagraph = doc.DocumentNode.SelectSingleNode("//p");
                if (firstParagraph == null)
                {
                    // A page without a <p> has no entries; skip it rather than
                    // discarding the results gathered from the other pages.
                    continue;
                }

                foreach (string line in firstParagraph.InnerHtml.Split('\n'))
                {
                    int at = line.IndexOf('@');
                    if (at < 0)
                    {
                        continue; // not a proxy entry line
                    }

                    // Trim stray '\r' / indentation left over from the HTML source.
                    string endpoint = line.Substring(0, at).Trim();
                    if (endpoint.Length > 0)
                    {
                        ipList.Add(endpoint);
                    }
                }
            }
            return ipList;
        }
        catch (Exception)
        {
            // Deliberate best-effort contract kept from the original: any network or
            // parsing failure is reported to the caller as null.
            return null;
        }
    }

    /// <summary>
    /// Maps a charset name taken from an HTTP response to an <see cref="Encoding"/>.
    /// </summary>
    /// <param name="CharacterSet">
    /// Charset name, e.g. "utf-8" or "GB2312". Matching ignores case, surrounding
    /// whitespace and quotes; <c>null</c> is treated as unknown.
    /// </param>
    /// <returns>
    /// The matching encoding, or <see cref="Encoding.Default"/> for unknown names.
    /// </returns>
    public static Encoding GetEncoding(string CharacterSet)
    {
        // Servers are free to send "UTF-8", "GB2312", or a quoted value; normalise
        // before comparing so these do not silently fall through to the default.
        string name = (CharacterSet ?? string.Empty).Trim().Trim('"', '\'').ToLowerInvariant();

        switch (name)
        {
            case "gb2312": return Encoding.GetEncoding("gb2312");
            case "gbk": return Encoding.GetEncoding("gbk");
            case "utf-8":
            case "utf8": return Encoding.UTF8;
            default: return Encoding.Default;
        }
    }

    /// <summary>
    /// Performs an HTTP GET and returns the response body as text, transparently
    /// decompressing gzip/deflate content and decoding it with the encoding chosen
    /// by <see cref="GetEncoding"/> from the response's charset.
    /// </summary>
    /// <param name="url">Absolute URL to request.</param>
    /// <returns>The decoded response body.</returns>
    public static string HttpGet(string url)
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
        req.Accept = "*/*";
        req.Method = "GET";
        req.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1";

        using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
        {
            string contentEncoding = response.ContentEncoding ?? string.Empty;
            Stream stream = response.GetResponseStream();

            if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                stream = new GZipStream(stream, CompressionMode.Decompress);
            }
            else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                stream = new DeflateStream(stream, CompressionMode.Decompress);
            }

            // Disposing the reader also disposes the (possibly wrapping) stream.
            using (StreamReader reader = new StreamReader(stream, GetEncoding(response.CharacterSet)))
            {
                return reader.ReadToEnd();
            }
        }
    }
}

 

posted @ 2015-08-22 13:16  歪歪咯  阅读(568)  评论(0)  编辑  收藏  举报