// http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html
// This crawler starts at a single province and works downward. To crawl every province
// in one run, change it to start one level further out.
// Address (URL) of the page the crawl starts from
public string url;
// Name of the table the crawled rows are stored in
public string dbname;
// Province-level code
public string code;
// Province name
public string name;
// Database name
public static string database = "TEST";
// Handles unexpected disconnects such as connection timeouts:
// returnHtml increments this on a failed download and resets it to 0 after a success.
public int flag = 0;
/// <summary>
/// Entry point of the handler: reads the start URL from the posted form, makes sure the
/// target table exists, runs the five crawl stages in order, then writes a completion
/// message to the response.
/// </summary>
/// <param name="context">The current HTTP context; only its response is written to here.</param>
public void ProcessRequest(HttpContext context)
{
    // The "url" form field is HTML-decoded before it is stored on the handler.
    string postedUrl = System.Web.HttpContext.Current.Request.Form["url"];
    url = System.Web.HttpUtility.HtmlDecode(postedUrl);

    TableExist(dbname);

    // Each stage reads the rows written by the previous one, so the order matters.
    Provincial();
    City();
    County();
    Town();
    Village();

    context.Response.Write("爬取成功");
}
/// <summary>
/// Creates the storage table when no table with the given name is listed in
/// information_schema.TABLES; does nothing when one is already listed.
/// </summary>
/// <param name="dbname">Name of the table to look up and, if missing, create under [dbo].</param>
public void TableExist(string dbname) {
    string lookupSql = "SELECT table_name FROM information_schema.TABLES WHERE table_name ='" + dbname + "'";
    DataTable existing = bll.SelectbySql(lookupSql);
    if (existing.Rows.Count > 0) {
        return;
    }

    // Batch preamble: select the target database and set the session options.
    string preamble =
        "USE [" + database + "]\r\n" +
        "SET ANSI_NULLS ON\r\n" +
        "SET QUOTED_IDENTIFIER ON\r\n";

    // Column definitions, in the positional order the INSERT statements rely on.
    string columns =
        "[ID][int] IDENTITY(1, 1) NOT NULL," +
        "[Code] [nvarchar] (20) NULL," +
        "[ParentCode] [nvarchar] (20) NULL," +
        "[Name] [nvarchar] (50) NULL," +
        "[Path] [nvarchar] (100) NULL," +
        "[PathName] [nvarchar] (200) NULL," +
        "[Levels] [int] NULL," +
        "[Urls][nvarchar](max) NULL," +
        "[DeleteMark] [bit] NULL,";

    // Clustered primary key on the identity column.
    string primaryKey =
        "CONSTRAINT[PK_" + dbname + "] PRIMARY KEY CLUSTERED" +
        "([ID] ASC)" +
        "WITH(PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON[PRIMARY]";

    // No trailing "GO": the statement is sent as a single batch.
    string createSql =
        preamble +
        "CREATE TABLE[dbo].[" + dbname + "](" +
        columns +
        primaryKey +
        ") ON[PRIMARY] TEXTIMAGE_ON[PRIMARY]\r\n";

    bll.RunbySql(createSql);
}
/// <summary>
/// Inserts the level-0 (province) row built from the <c>code</c>, <c>name</c> and
/// <c>url</c> fields. ParentCode is '0', and Path / PathName start out as the bare code / name.
/// </summary>
public void Provincial()
{
    // url comes straight from the request form, so every value is escaped before it is
    // embedded in the statement. A null field is written as an empty string, exactly as
    // plain string concatenation would do.
    string safeCode = (code ?? "").Replace("'", "''");
    string safeName = (name ?? "").Replace("'", "''");
    string safeUrl = (url ?? "").Replace("'", "''");

    // N'...' keeps Chinese text intact in the nvarchar columns whatever the database collation.
    bll.RunbySql("insert into " + dbname + " values(N'" + safeCode + "','0',N'" + safeName + "',N'" + safeCode + "',N'" + safeName + "',0,N'" + safeUrl + "',0)");
}
/// <summary>
/// Crawls the page of every level-0 row and stores each "citytr" table row as a level-1 record.
/// </summary>
public void City()
{
    CrawlLevel(0, "citytr", 1, true);
}

/// <summary>
/// Crawls the page of every level-1 row and stores each "countytr" table row as a level-2 record.
/// </summary>
public void County()
{
    CrawlLevel(1, "countytr", 1, true);
}

/// <summary>
/// Crawls the page of every level-2 row and stores each "towntr" table row as a level-3 record.
/// </summary>
public void Town()
{
    CrawlLevel(2, "towntr", 1, true);
}

/// <summary>
/// Crawls the page of every level-3 row and stores each "villagetr" table row as a level-4 record.
/// Village rows take their name from the third cell and are stored with an empty Urls value.
/// </summary>
public void Village()
{
    CrawlLevel(3, "villagetr", 2, false);
}

/// <summary>
/// Shared crawl step: loads every non-deleted row of <paramref name="parentLevel"/> that has a URL,
/// downloads that page, and inserts one child row (level = parentLevel + 1) per matching table row.
/// </summary>
/// <param name="parentLevel">Levels value of the rows whose pages are downloaded.</param>
/// <param name="rowClass">CSS class of the table rows that carry the child entries.</param>
/// <param name="nameCellIndex">Index of the cell holding the child name; the code is always cell 0.</param>
/// <param name="followLinks">
/// When true, a link found in the code cell is resolved against the parent page URL and stored
/// so the next stage can crawl it; when false the child is stored with an empty URL.
/// </param>
private void CrawlLevel(int parentLevel, string rowClass, int nameCellIndex, bool followLinks)
{
    DataTable parents = bll.SelectbySql("select * from " + dbname + " where DeleteMark=0 and Levels=" + parentLevel + " and Urls is not null and Urls<>''");
    int childLevel = parentLevel + 1;

    foreach (DataRow parent in parents.Rows)
    {
        string pageUrl = parent["Urls"].ToString();
        NSoup.Nodes.Document doc = NSoup.NSoupClient.Parse(returnHtml(pageUrl));

        foreach (Element tableRow in doc.GetElementsByClass(rowClass))
        {
            // Skip malformed rows instead of failing the whole crawl on an index error.
            if (tableRow.Children.Count <= nameCellIndex)
            {
                continue;
            }

            // Cell 0 holds the code, cell nameCellIndex holds the name.
            Element codeNode = tableRow.Children[0];
            Element nameNode = tableRow.Children[nameCellIndex];
            string childUrl = "";

            if (followLinks)
            {
                // When the cells wrap their text in a child element (the link), step into it.
                if (codeNode.Children.Count > 0)
                {
                    codeNode = codeNode.Children[0];
                    if (nameNode.Children.Count > 0)
                    {
                        nameNode = nameNode.Children[0];
                    }
                }

                if (codeNode.HasAttr("href"))
                {
                    // The href is relative to the directory of the parent page.
                    childUrl = pageUrl.Substring(0, pageUrl.LastIndexOf('/') + 1) + codeNode.Attr("href");
                }
            }

            string childCode = codeNode.Text();
            string childName = nameNode.Text();
            string childPath = parent["Path"].ToString() + "/" + childCode;
            string childPathName = parent["PathName"].ToString() + "/" + childName;

            // Scraped text is untrusted: escape it, and use N'...' so Chinese names survive
            // in the nvarchar columns whatever the database collation.
            bll.RunbySql(
                "insert into " + dbname + " values(" +
                "N'" + EscapeSqlLiteral(childCode) + "'," +
                "N'" + EscapeSqlLiteral(parent["Code"].ToString()) + "'," +
                "N'" + EscapeSqlLiteral(childName) + "'," +
                "N'" + EscapeSqlLiteral(childPath) + "'," +
                "N'" + EscapeSqlLiteral(childPathName) + "'," +
                childLevel + "," +
                "N'" + EscapeSqlLiteral(childUrl) + "'," +
                "0)");
        }
    }
}

/// <summary>
/// Doubles single quotes so the value can be embedded in a quoted T-SQL string literal.
/// A null value is treated as an empty string.
/// </summary>
private static string EscapeSqlLiteral(string value)
{
    return (value ?? "").Replace("'", "''");
}
/// <summary>
/// Downloads a page and decodes it as gb2312, retrying when the download fails.
/// </summary>
/// <param name="Urls">Address of the page to download.</param>
/// <returns>
/// The decoded page text, or an empty string when the initial attempt and all 10 retries fail.
/// </returns>
/// <remarks>
/// The retry budget is counted per call. <c>flag</c> still mirrors the number of consecutive
/// failures (0 after a success), but a URL that exhausted its retries no longer removes the
/// retries of the URLs requested after it.
/// </remarks>
public string returnHtml(string Urls)
{
    const int maxRetries = 10;
    int failures = 0;

    while (true)
    {
        try
        {
            // WebClient is IDisposable; release it after every attempt.
            using (WebClient webClient = new WebClient())
            {
                string html = Encoding.GetEncoding("gb2312").GetString(webClient.DownloadData(Urls));
                flag = 0;
                return html;
            }
        }
        catch
        {
            // Best effort by design: any failure (timeout, dropped connection, ...) counts
            // as one failed attempt and is retried until the budget is used up.
            failures++;
            flag = failures;
            if (failures > maxRetries)
            {
                return "";
            }
        }
    }
}