using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using NSoup.Nodes;
using NSoup.Select;

class Crawler_Method
{
    public static Dictionary<String, String> GETCity()
    {
        String html = GET("https://www.zhaopin.com/citymap.html"); // issue the HTTP request
        return Parse1(html); // parse the returned page into a city/URL dictionary
    }
    public static Dictionary<String, String> Parse1(String html)
    {
        Dictionary<String, String> map = new Dictionary<string, string>();
        Document doc = NSoup.NSoupClient.Parse(html); // load the returned HTML into an NSoup Document
        Elements elements = doc.GetElementsByClass("col1"); // elements whose class attribute is "col1"
        Elements e = elements[0].Select("a"); // all <a> tags inside the first match
        for (int x = 0; x < e.Count; x++) // walk every link and keep the target entries
        {
            Element a = e[x];
            if (a.Attr("href").Substring(2).StartsWith("www")) // keep only protocol-relative links of the form "//www..."
                map.Add(a.Text(), "https://" + a.Attr("href").Substring(2));
        }
        return map;
    }
    public static Dictionary<String, String> GetCompany(String url)
    {
        String html = GET(url);
        return Parse2(html);
    }
    public static Dictionary<String, String> Parse2(String html) // same structure as Parse1, but for the company list
    {
        Dictionary<String, String> map = new Dictionary<string, string>();
        Document doc = NSoup.NSoupClient.Parse(html);
        Elements elements = doc.GetElementsByClass("nctt"); // container of the company list
        Elements e = elements[0].Select("li"); // one <li> per company
        for (int x = 0; x < e.Count; x++)
        {
            Element a = e[x];
            map.Add(a.Select("a")[0].Text(), a.Select("a")[0].Attr("href")); // company name -> company link
        }
        return map;
    }
    public static String GET(String share_url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(share_url); // build the request object from the URL
        request.Method = "GET"; // use a plain GET request
        request.AllowAutoRedirect = true; // follow redirects automatically
        // Anti-crawler countermeasure: identify as a normal desktop browser via the User-Agent header.
        request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0";
        HttpWebResponse response = (HttpWebResponse)request.GetResponse(); // send the request and receive the response
        Stream stream = response.GetResponseStream(); // response body as a stream
        StreamReader read = new StreamReader(stream, System.Text.Encoding.GetEncoding("utf-8")); // decode the body as UTF-8
        String html = read.ReadToEnd(); // read the whole page source at once (simpler than concatenating line by line)
        read.Close();
        response.Close();
        return html; // return the page source
    }
}
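
// A minimal usage sketch, not part of the original listing: the Program/Main wrapper,
// the loop structure, and the console output below are illustrative assumptions about
// how the two entry points (GETCity and GetCompany) would be called together.
class Program
{
    static void Main(string[] args)
    {
        // Step 1: fetch the city map page and list every city with its landing URL.
        Dictionary<String, String> cities = Crawler_Method.GETCity();
        foreach (KeyValuePair<String, String> city in cities)
        {
            Console.WriteLine(city.Key + " -> " + city.Value);
        }

        // Step 2: follow one city link and dump the company entries found on that page.
        foreach (KeyValuePair<String, String> city in cities)
        {
            Dictionary<String, String> companies = Crawler_Method.GetCompany(city.Value);
            foreach (KeyValuePair<String, String> company in companies)
            {
                Console.WriteLine(company.Key + ": " + company.Value);
            }
            break; // demonstration only: stop after the first city
        }
    }
}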