爬虫/蜘蛛程序的制作(C#语言) 二
今天给大家列出一些代码,仅供参考
列出数据层和逻辑层的代码

StringHelper类
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
using System.Web;
5
using System.Web.SessionState;
6
using System.Web.UI;
7
using System.Web.UI.WebControls;
8
using System.Web.UI.HtmlControls;
9
namespace WebPage
{
    /// <summary>
    /// Logic-layer helper that makes URLs containing non-ASCII (for example
    /// Chinese) characters safe to request.
    /// </summary>
    public class StringHelper
    {
        public StringHelper()
        {
        }

        /// <summary>
        /// Percent-encodes every non-ASCII character of the text and copies ASCII
        /// characters (including URL delimiters such as ':', '/' and '?') through
        /// unchanged.
        /// </summary>
        /// <param name="strText">Text to convert; null or empty yields an empty string.</param>
        /// <returns>The text with its non-ASCII characters URL-encoded.</returns>
        public static string GetChineseURLCode(string strText)
        {
            if (string.IsNullOrEmpty(strText))
            {
                return "";
            }

            // StringBuilder avoids re-copying the whole result for every character.
            StringBuilder encoded = new StringBuilder(strText.Length);
            int pos = 0;
            while (pos < strText.Length)
            {
                // Keep a surrogate pair together: encoding the two halves
                // separately would hand UrlEncode two invalid characters.
                int width = 1;
                if (char.IsHighSurrogate(strText[pos])
                    && pos + 1 < strText.Length
                    && char.IsLowSurrogate(strText[pos + 1]))
                {
                    width = 2;
                }

                encoded.Append(getSpell(strText.Substring(pos, width)));
                pos += width;
            }
            return encoded.ToString();
        }

        /// <summary>
        /// URL-encodes one text element when it is not plain ASCII; ASCII is
        /// returned as is.
        /// </summary>
        /// <param name="cnChar">A single character or surrogate pair (never empty).</param>
        private static string getSpell(string cnChar)
        {
            // Decide by code point instead of by the Encoding.Default byte count:
            // the byte count depends on the machine's code page, so the same URL
            // could be encoded on one host and left raw on another.
            if (cnChar[0] > 0x7F)
            {
                return System.Web.HttpUtility.UrlEncode(cnChar);
            }
            return cnChar;
        }
    }
}
46
这个是逻辑层的一个辅助类
WebDetail类
using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.Data;
using System.Text.RegularExpressions;

/**//// <summary>
/// 功能:文章最终页类
/// 创建时间:07-3-5
/// 创建人:曹振华
/// </summary>
namespace WebPage


{
/// <summary>
/// Final article (detail) page: cuts the article body out of the downloaded
/// HTML and removes unwanted markup from it.
/// </summary>
public class WebDetailPage : WebPage
{
    // Regex whose matches are deleted from the extracted article body.
    private string _strDelRegex;

    /// <summary>
    /// Copies the URL, cut regex, timeout, filter flag and delete regex from the
    /// supplied configuration.
    /// </summary>
    /// <param name="clsDetailPage">Detail page settings.</param>
    public WebDetailPage(WebDetailPageConfig clsDetailPage)
    {
        Url = clsDetailPage.DetailUrl;
        CutRegex = clsDetailPage.strCutRegex;
        TimeOut = clsDetailPage.intTimeOut;
        filterFlag = clsDetailPage.filterFlag;
        _strDelRegex = clsDetailPage.strDelRegex;
    }

    /// <summary>
    /// Returns the trimmed content of the "tmpDetailContent" group of the cut
    /// regex, with everything matching the delete regex removed.
    /// </summary>
    /// <returns>The extracted HTML, or an empty string when nothing matched.</returns>
    public string GetDetailPageHtml()
    {
        Match match = matchRegexHtml;

        // The base class may hand back null when matching failed, so test for
        // that before touching Success.
        if (match == null || !match.Success)
        {
            return "";
        }

        string strHtml = match.Groups["tmpDetailContent"].Value.Trim();

        // An empty pattern removes nothing and a null one would throw, so skip both.
        if (!string.IsNullOrEmpty(_strDelRegex))
        {
            strHtml = Regex.Replace(strHtml, _strDelRegex, "");
        }
        return strHtml;
    }
}
}
这个是个页面最终解析类

WebPage类
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;

namespace WebPage


{

/// <summary>
/// Purpose: base class for crawled web pages. Holds the page URL, the fetch
/// timeout, the content-cutting regex and the filter flag, and exposes the
/// downloaded HTML and the regex match over it.
/// Created: 07-3-5
/// Author: 曹振华
/// </summary>
public class WebPage
{
    // Options applied to the cut regex.
    private const RegexOptions CutRegexOptions =
        RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace
        | RegexOptions.ExplicitCapture | RegexOptions.IgnoreCase;

    private string _strUrl;
    private int _intTimeOut;          // fetch timeout, passed to WebPageTools.GetConent
    private string _strCutHtmlRegex;  // regex that cuts out the content section
    private int _intflag;             // filter flag, passed to WebPageTools.GetConent

    /// <summary>URL of the page.</summary>
    public string Url
    {
        get { return _strUrl; }
        set { _strUrl = value; }
    }

    /// <summary>Fetch timeout.</summary>
    public int TimeOut
    {
        get { return _intTimeOut; }
        set { _intTimeOut = value; }
    }

    /// <summary>Regular expression that cuts out the main content.</summary>
    public string CutRegex
    {
        get { return _strCutHtmlRegex; }
        set { _strCutHtmlRegex = value; }
    }

    /// <summary>
    /// Page content. Every read calls <see cref="GetUrlstrHtml"/> again, so each
    /// access triggers a new fetch.
    /// </summary>
    public string PageHtml
    {
        get { return GetUrlstrHtml(); }
    }

    /// <summary>
    /// Match of the cut regex against the page content. Every read calls
    /// <see cref="GetRegexHtml"/> again.
    /// </summary>
    public Match matchRegexHtml
    {
        get { return GetRegexHtml(); }
    }

    /// <summary>Filter flag.</summary>
    public int filterFlag
    {
        get { return _intflag; }
        set { _intflag = value; }
    }

    /// <summary>Fetches the page content through WebPageTools.GetConent.</summary>
    /// <returns>Whatever WebPageTools.GetConent returns for the current settings.</returns>
    protected string GetUrlstrHtml()
    {
        return WebPageTools.GetConent(_strUrl, _intTimeOut, filterFlag);
    }

    /// <summary>
    /// Applies the cut regex to the freshly fetched page content.
    /// </summary>
    /// <returns>
    /// The match, or <see cref="Match.Empty"/> (Success == false) when matching
    /// throws. It is never null, so callers can test Success directly.
    /// </returns>
    protected Match GetRegexHtml()
    {
        // Built outside the try on purpose: a null or malformed pattern is a
        // configuration error and still surfaces as an exception.
        Regex reg = new Regex(_strCutHtmlRegex, CutRegexOptions);
        try
        {
            return reg.Match(GetUrlstrHtml());
        }
        catch
        {
            // Best-effort, as before, but report failure with a failed match
            // instead of null, which every caller would dereference.
            return Match.Empty;
        }
    }
}
}
这个是个页面基类

WebList类
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Data;
using System.Collections;

/**//// <summary>
/// 功能:新闻列表页类
/// 创建时间:07-3-5
/// 创建人:张杰
/// </summary>
namespace WebPage


{
/// <summary>
/// News list page: extracts the article links and the next-page address from
/// the list section of a downloaded page.
/// </summary>
public class WebListPage : WebPage
{
    // Options shared by the list and next-page regexes.
    private const RegexOptions ListRegexOptions =
        RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace
        | RegexOptions.ExplicitCapture | RegexOptions.IgnoreCase;

    private string _strDelRegex;       // regex deleting unwanted html
    private string _strListRegex;      // regex matching one list entry
    private string _strNextPageRegex;  // regex matching the next-page link
    private string _strListPageHtml;   // list section extracted in the constructor

    /// <summary>
    /// Copies the settings and immediately extracts the list section, which
    /// fetches the page through the base class.
    /// </summary>
    /// <param name="objListPageConfig">List page settings.</param>
    public WebListPage(WebListPageConfig objListPageConfig)
    {
        Url = objListPageConfig.ListUrl;
        CutRegex = objListPageConfig.strCutRegex;
        TimeOut = objListPageConfig.intTimeOut;
        filterFlag = objListPageConfig.filterFlag;
        _strDelRegex = objListPageConfig.strDelRegex;
        _strListRegex = objListPageConfig.strListRegex;
        _strNextPageRegex = objListPageConfig.strNextPageRegex;
        _strListPageHtml = GetListPageHtml();
    }

    /// <summary>
    /// Returns the trimmed "tmpListContent" group of the cut regex with
    /// everything matching the delete regex removed.
    /// </summary>
    /// <returns>The list section, or an empty string when nothing matched.</returns>
    private string GetListPageHtml()
    {
        Match match = matchRegexHtml;

        // The base class may hand back null when matching failed.
        if (match == null || !match.Success)
        {
            return "";
        }

        string strHtml = match.Groups["tmpListContent"].Value.Trim();

        // An empty pattern removes nothing and a null one would throw, so skip both.
        if (!string.IsNullOrEmpty(_strDelRegex))
        {
            strHtml = Regex.Replace(strHtml, _strDelRegex, "");
        }
        return strHtml;
    }

    /// <summary>
    /// Builds one item per list-regex match from its "Title" and "DetailUrl" groups.
    /// </summary>
    /// <param name="replaceStr">Text substituted for <paramref name="beReplaceStr"/> in each detail URL.</param>
    /// <param name="beReplaceStr">Text to replace in each detail URL; null or empty disables the replacement.</param>
    /// <returns>The items in match order; empty when nothing matched.</returns>
    public List<ListPageItems> GetListArray(string replaceStr, string beReplaceStr)
    {
        List<ListPageItems> list = new List<ListPageItems>();
        Regex reg = new Regex(_strListRegex, ListRegexOptions);
        bool rewriteUrl = !string.IsNullOrEmpty(beReplaceStr);

        // Visit every match. The previous loop stopped at Count - 1, so the last
        // entry was always dropped and a single-entry list came back empty.
        foreach (Match entry in reg.Matches(_strListPageHtml))
        {
            ListPageItems clsListItem = new ListPageItems();
            clsListItem.Title = entry.Groups["Title"].Value.Trim();

            string detailUrl = entry.Groups["DetailUrl"].Value.Trim();
            if (rewriteUrl)
            {
                detailUrl = detailUrl.Replace(beReplaceStr, replaceStr);
            }
            clsListItem.DetailUrl = detailUrl;

            list.Add(clsListItem);
        }
        return list;
    }

    /// <summary>
    /// Returns the "NextPageUrl" group of the next-page regex, passed through
    /// StringHelper.GetChineseURLCode.
    /// </summary>
    /// <returns>The next page address, or an empty string when there is no match.</returns>
    public string GetNextPage()
    {
        string strHtml = "";
        Regex reg = new Regex(_strNextPageRegex, ListRegexOptions);
        Match match = reg.Match(_strListPageHtml);
        if (match.Success)
        {
            strHtml = match.Groups["NextPageUrl"].Value.Trim();
        }
        return StringHelper.GetChineseURLCode(strHtml);
    }
}

/// <summary>
/// One list entry: a title and the URL of its detail page.
/// </summary>
public class ListPageItems
{
    private string _title;
    private string _detailUrl;

    /// <summary>Title of the entry.</summary>
    public string Title
    {
        get { return _title; }
        set { _title = value; }
    }

    /// <summary>URL of the entry's detail page.</summary>
    public string DetailUrl
    {
        get { return _detailUrl; }
        set { _detailUrl = value; }
    }
}
}
这个是列表类

WebTool类
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using System.IO;

/**//// <summary>
/// 功能:页面请求
/// 创建时间:07-3-5
/// 创建人:曹振华
/// </summary>
namespace WebPage


{
/// <summary>
/// Page request helpers: resolving next-page addresses, downloading a page and
/// reducing HTML to a small set of formatting tags.
/// </summary>
public static class WebPageTools
{
    /// <summary>
    /// Turns a next-page address taken from a list page into a complete URL.
    /// </summary>
    /// <param name="strFristPage">URL of the first (current) list page.</param>
    /// <param name="NextPage">Next-page address; may be absolute or relative.</param>
    /// <returns>
    /// <paramref name="NextPage"/> unchanged when it contains an http(s) URL;
    /// otherwise the host of the first page followed by the directory and file
    /// of <paramref name="NextPage"/>, or, when it has no directory part, the
    /// first page's directory followed by the file name.
    /// </returns>
    public static string GetCompeletUrl(string strFristPage, string NextPage)
    {
        string strNextPageUrl = NextPage;
        string strDomain = "";
        strFristPage = StringHelper.GetChineseURLCode(strFristPage);

        // Already absolute. https is accepted as well; it used to fall through
        // and be glued onto the first page's host.
        if (Regex.IsMatch(strNextPageUrl, @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?"))
        {
            return strNextPageUrl;
        }

        // Directory part of the relative address, e.g. /zh_cn/hairdressing/nxzr/zx/11052507/20060705/
        Match filesMatch = Regex.Match(NextPage, @"/(?<Files>.*/)");
        string strFiles = filesMatch.Success ? filesMatch.Groups["Files"].Value.Trim() : "";

        if (strFiles.Length > 0)
        {
            Match hostMatch = Regex.Match(strFristPage, @"(?<domain>https?://([\w-]+\.)+[\w-]+(/)?)");
            if (hostMatch.Success)
            {
                strDomain = hostMatch.Groups["domain"].Value.Trim();

                // The trailing '/' of the host pattern is optional; add it so
                // host and path do not fuse ("http://a.com" + "x/").
                if (!strDomain.EndsWith("/", StringComparison.Ordinal))
                {
                    strDomain += "/";
                }
            }
            string PageNum = Regex.Replace(strNextPageUrl, @"/(?<Files>.*/)", "");
            strNextPageUrl = strDomain + strFiles + PageNum;
        }
        else
        {
            // No directory in the next-page address: reuse the first page's
            // directory (everything up to its last '/').
            Match dirMatch = Regex.Match(strFristPage, @"(?<domain>.*/)");
            if (dirMatch.Success)
            {
                strDomain = dirMatch.Groups["domain"].Value.Trim();
            }
            strNextPageUrl = Regex.Replace(strNextPageUrl, @"(?<domain>.*/)", "");
            strNextPageUrl = strDomain + strNextPageUrl;
        }

        // NOTE(review): addresses such as "a/b.html", "../x.html" or "?page=2"
        // are not resolved the way a browser would; consider System.Uri.
        return strNextPageUrl;
    }

    /// <summary>
    /// Downloads a page with an HTTP GET and decodes it as GB2312.
    /// </summary>
    /// <param name="_strUrl">Address to request.</param>
    /// <param name="_intTimeOut">Value assigned to HttpWebRequest.Timeout.</param>
    /// <param name="flag">When 1, double quotes, single quotes, CR, LF and form feeds are removed from the result.</param>
    /// <returns>The page text, or an empty string when the request fails.</returns>
    public static string GetConent(string _strUrl, int _intTimeOut, int flag)
    {
        string strHtml = "";
        try
        {
            HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(_strUrl);
            myReq.Timeout = _intTimeOut;
            myReq.Method = "GET";

            // using blocks release the connection even when reading throws; the
            // explicit Close calls were skipped on any exception.
            using (HttpWebResponse myRes = (HttpWebResponse)myReq.GetResponse())
            using (Stream stream = myRes.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding("GB2312")))
            {
                strHtml = reader.ReadToEnd();
            }

            if (flag == 1)
            {
                strHtml = Regex.Replace(strHtml, @"[\""\r\f\n']", "");
            }
        }
        catch (Exception ex)
        {
            // Deliberately best-effort: callers treat "" as "no content". Leave
            // a trace so that failures are not completely invisible.
            System.Diagnostics.Trace.TraceWarning("GetConent failed for {0}: {1}", _strUrl, ex.Message);
        }
        return strHtml;
    }

    /// <summary>
    /// Removes every HTML tag except plain br, p and strong tags, which are
    /// normalised to lower case.
    /// </summary>
    /// <param name="strFilter">HTML to filter; null or empty is returned as is.</param>
    /// <returns>
    /// The filtered text, e.g. "a&lt;br/&gt;b&lt;div&gt;c&lt;/div&gt;" becomes
    /// "a&lt;br&gt;bc".
    /// </returns>
    public static string FilterPaticularChar(string strFilter)
    {
        if (string.IsNullOrEmpty(strFilter))
        {
            return strFilter;
        }

        // Park the tags to keep behind placeholders. Matching is case-insensitive
        // and covers <br/> and <br />, which used to be stripped with the rest.
        strFilter = Regex.Replace(strFilter, @"<br\s*/?>", "[---]", RegexOptions.IgnoreCase);
        strFilter = Regex.Replace(strFilter, "<p>", "[----]", RegexOptions.IgnoreCase);
        strFilter = Regex.Replace(strFilter, "</p>", "[/----]", RegexOptions.IgnoreCase);
        strFilter = Regex.Replace(strFilter, "<strong>", "[-----]", RegexOptions.IgnoreCase);
        strFilter = Regex.Replace(strFilter, "</strong>", "[/-----]", RegexOptions.IgnoreCase);

        // Drop every remaining tag.
        strFilter = Regex.Replace(strFilter, "<[^>]*>", "");

        // Restore the parked tags.
        strFilter = strFilter.Replace("[---]", "<br>");
        strFilter = strFilter.Replace("[----]", "<p>");
        strFilter = strFilter.Replace("[/----]", "</p>");
        strFilter = strFilter.Replace("[-----]", "<strong>");
        strFilter = strFilter.Replace("[/-----]", "</strong>");

        return strFilter;
    }
}

}
页面工具类

WebConfig类
using System;
using System.Collections.Generic;
using System.Text;

/**//// <summary>
/// 功能:ListPage类设置
/// 创建时间:07-3-5
/// 创建人:张杰
/// </summary>
namespace WebPage


{
/// <summary>
/// Empty base class of the page configuration classes.
/// </summary>
public class WebPageConfig
{
}

/// <summary>
/// Settings for a list page: its URL, the cut, list, delete and next-page
/// regexes, the timeout and the filter flag.
/// </summary>
public class WebListPageConfig : WebPageConfig
{
    private string _listUrl;
    private string _cutRegex;
    private string _listRegex;
    private string _delRegex;
    private string _nextPageRegex;
    private int _timeOut;
    private int _filterFlag;

    /// <summary>Creates the settings with every value supplied up front.</summary>
    public WebListPageConfig(string strListPage, string strCutRegex, string strListRegex, string strDelRegex, string strNextPageRegex, int intTimeOut, int filterFlag)
    {
        _listUrl = strListPage;
        _cutRegex = strCutRegex;
        _listRegex = strListRegex;
        _delRegex = strDelRegex;
        _nextPageRegex = strNextPageRegex;
        _timeOut = intTimeOut;
        _filterFlag = filterFlag;
    }

    /// <summary>URL of the list page.</summary>
    public string ListUrl { get { return _listUrl; } set { _listUrl = value; } }

    /// <summary>Regex that cuts out the content section.</summary>
    public string strCutRegex { get { return _cutRegex; } set { _cutRegex = value; } }

    /// <summary>Regex that matches the list entries.</summary>
    public string strListRegex { get { return _listRegex; } set { _listRegex = value; } }

    /// <summary>Regex whose matches are deleted.</summary>
    public string strDelRegex { get { return _delRegex; } set { _delRegex = value; } }

    /// <summary>Regex that matches the next-page link.</summary>
    public string strNextPageRegex { get { return _nextPageRegex; } set { _nextPageRegex = value; } }

    /// <summary>Fetch timeout.</summary>
    public int intTimeOut { get { return _timeOut; } set { _timeOut = value; } }

    /// <summary>Filter flag.</summary>
    public int filterFlag { get { return _filterFlag; } set { _filterFlag = value; } }
}


/// <summary>
/// Settings for a detail page: its URL, the cut and delete regexes, the
/// timeout and the filter flag.
/// </summary>
public class WebDetailPageConfig : WebPageConfig
{
    private string _detailUrl;
    private string _cutRegex;
    private string _delRegex;
    private int _timeOut;
    private int _filterFlag;

    /// <summary>Creates the settings with every value supplied up front.</summary>
    public WebDetailPageConfig(string strDetailUrl, string strCutRegex, string strDelRegex, int intTimeOut, int filterFlag)
    {
        _detailUrl = strDetailUrl;
        _cutRegex = strCutRegex;
        _delRegex = strDelRegex;
        _timeOut = intTimeOut;
        _filterFlag = filterFlag;
    }

    /// <summary>URL of the detail page.</summary>
    public string DetailUrl { get { return _detailUrl; } set { _detailUrl = value; } }

    /// <summary>Regex that cuts out the content section.</summary>
    public string strCutRegex { get { return _cutRegex; } set { _cutRegex = value; } }

    /// <summary>Regex whose matches are deleted.</summary>
    public string strDelRegex { get { return _delRegex; } set { _delRegex = value; } }

    /// <summary>Fetch timeout.</summary>
    public int intTimeOut { get { return _timeOut; } set { _timeOut = value; } }

    /// <summary>Filter flag.</summary>
    public int filterFlag { get { return _filterFlag; } set { _filterFlag = value; } }
}

/// <summary>
/// Database structure: a plain holder for one source record (address, kind,
/// page count, list and detail regexes, timeout, filter flag and URL
/// replacement strings).
/// </summary>
public class DBSoures
{
    private string _urlAddress;
    private string _kindName;
    private int _kindId;
    private int _totalPage;
    private string _listRegex1;
    private string _listRegex2;
    private string _listRegex3;
    private string _listRegex4;
    private int _timeOut;
    private string _detailRegex1;
    private string _detailRegex2;
    private int _flag;
    private string _replaceUrl;
    private string _beReplaceStr;

    /// <summary>Creates a record with every value supplied up front.</summary>
    public DBSoures(string UrlAddress,string KindName,int KindID,int TotolPage,string ListRegex1,string ListRegex2,string ListRegex3,string ListRegex4,int TimeOut,string DetailRegex1,string DetailRegex2,int intFlag,string ReplaceUrl,string BeReplaceStr)
    {
        _urlAddress = UrlAddress;
        _kindName = KindName;
        _kindId = KindID;
        _totalPage = TotolPage;
        _listRegex1 = ListRegex1;
        _listRegex2 = ListRegex2;
        _listRegex3 = ListRegex3;
        _listRegex4 = ListRegex4;
        _timeOut = TimeOut;
        _detailRegex1 = DetailRegex1;
        _detailRegex2 = DetailRegex2;
        _flag = intFlag;
        _replaceUrl = ReplaceUrl;
        _beReplaceStr = BeReplaceStr;
    }

    public string UrlAddress { get { return _urlAddress; } set { _urlAddress = value; } }

    public string KindName { get { return _kindName; } set { _kindName = value; } }

    public int KindID { get { return _kindId; } set { _kindId = value; } }

    public int TotolPage { get { return _totalPage; } set { _totalPage = value; } }

    public string ListRegex1 { get { return _listRegex1; } set { _listRegex1 = value; } }

    public string ListRegex2 { get { return _listRegex2; } set { _listRegex2 = value; } }

    public string ListRegex3 { get { return _listRegex3; } set { _listRegex3 = value; } }

    public string ListRegex4 { get { return _listRegex4; } set { _listRegex4 = value; } }

    public int TimeOut { get { return _timeOut; } set { _timeOut = value; } }

    public string DetailRegex1 { get { return _detailRegex1; } set { _detailRegex1 = value; } }

    public string DetailRegex2 { get { return _detailRegex2; } set { _detailRegex2 = value; } }

    public int filterFlag { get { return _flag; } set { _flag = value; } }

    public string ReplaceUrl { get { return _replaceUrl; } set { _replaceUrl = value; } }

    public string BeReplaceStr { get { return _beReplaceStr; } set { _beReplaceStr = value; } }
}

}
这就是所有的业务层代码,数据层可以对 SQL Server 和 MySQL 进行操作 download
列出数据层和逻辑层的代码
1
using System;2
using System.Collections.Generic;3
using System.Text;4
using System.Web;5
using System.Web.SessionState;6
using System.Web.UI;7
using System.Web.UI.WebControls;8
using System.Web.UI.HtmlControls;9
namespace WebPage
{
    /// <summary>
    /// Logic-layer helper that makes URLs containing non-ASCII (for example
    /// Chinese) characters safe to request.
    /// </summary>
    public class StringHelper
    {
        public StringHelper()
        {
        }

        /// <summary>
        /// Percent-encodes every non-ASCII character of the text and copies ASCII
        /// characters (including URL delimiters such as ':', '/' and '?') through
        /// unchanged.
        /// </summary>
        /// <param name="strText">Text to convert; null or empty yields an empty string.</param>
        /// <returns>The text with its non-ASCII characters URL-encoded.</returns>
        public static string GetChineseURLCode(string strText)
        {
            if (string.IsNullOrEmpty(strText))
            {
                return "";
            }

            // StringBuilder avoids re-copying the whole result for every character.
            StringBuilder encoded = new StringBuilder(strText.Length);
            int pos = 0;
            while (pos < strText.Length)
            {
                // Keep a surrogate pair together: encoding the two halves
                // separately would hand UrlEncode two invalid characters.
                int width = 1;
                if (char.IsHighSurrogate(strText[pos])
                    && pos + 1 < strText.Length
                    && char.IsLowSurrogate(strText[pos + 1]))
                {
                    width = 2;
                }

                encoded.Append(getSpell(strText.Substring(pos, width)));
                pos += width;
            }
            return encoded.ToString();
        }

        /// <summary>
        /// URL-encodes one text element when it is not plain ASCII; ASCII is
        /// returned as is.
        /// </summary>
        /// <param name="cnChar">A single character or surrogate pair (never empty).</param>
        private static string getSpell(string cnChar)
        {
            // Decide by code point instead of by the Encoding.Default byte count:
            // the byte count depends on the machine's code page, so the same URL
            // could be encoded on one host and left raw on another.
            if (cnChar[0] > 0x7F)
            {
                return System.Web.HttpUtility.UrlEncode(cnChar);
            }
            return cnChar;
        }
    }
}

这个是逻辑层的一个辅助类
using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.Data;
using System.Text.RegularExpressions;
/**//// <summary>
/// 功能:文章最终页类
/// 创建时间:07-3-5
/// 创建人:曹振华
/// </summary>
namespace WebPage

{
/// <summary>
/// Final article (detail) page: cuts the article body out of the downloaded
/// HTML and removes unwanted markup from it.
/// </summary>
public class WebDetailPage : WebPage
{
    // Regex whose matches are deleted from the extracted article body.
    private string _strDelRegex;

    /// <summary>
    /// Copies the URL, cut regex, timeout, filter flag and delete regex from the
    /// supplied configuration.
    /// </summary>
    /// <param name="clsDetailPage">Detail page settings.</param>
    public WebDetailPage(WebDetailPageConfig clsDetailPage)
    {
        Url = clsDetailPage.DetailUrl;
        CutRegex = clsDetailPage.strCutRegex;
        TimeOut = clsDetailPage.intTimeOut;
        filterFlag = clsDetailPage.filterFlag;
        _strDelRegex = clsDetailPage.strDelRegex;
    }

    /// <summary>
    /// Returns the trimmed content of the "tmpDetailContent" group of the cut
    /// regex, with everything matching the delete regex removed.
    /// </summary>
    /// <returns>The extracted HTML, or an empty string when nothing matched.</returns>
    public string GetDetailPageHtml()
    {
        Match match = matchRegexHtml;

        // The base class may hand back null when matching failed, so test for
        // that before touching Success.
        if (match == null || !match.Success)
        {
            return "";
        }

        string strHtml = match.Groups["tmpDetailContent"].Value.Trim();

        // An empty pattern removes nothing and a null one would throw, so skip both.
        if (!string.IsNullOrEmpty(_strDelRegex))
        {
            strHtml = Regex.Replace(strHtml, _strDelRegex, "");
        }
        return strHtml;
    }
}
}
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
namespace WebPage

{
/// <summary>
/// Purpose: base class for crawled web pages. Holds the page URL, the fetch
/// timeout, the content-cutting regex and the filter flag, and exposes the
/// downloaded HTML and the regex match over it.
/// Created: 07-3-5
/// Author: 曹振华
/// </summary>
public class WebPage
{
    // Options applied to the cut regex.
    private const RegexOptions CutRegexOptions =
        RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace
        | RegexOptions.ExplicitCapture | RegexOptions.IgnoreCase;

    private string _strUrl;
    private int _intTimeOut;          // fetch timeout, passed to WebPageTools.GetConent
    private string _strCutHtmlRegex;  // regex that cuts out the content section
    private int _intflag;             // filter flag, passed to WebPageTools.GetConent

    /// <summary>URL of the page.</summary>
    public string Url
    {
        get { return _strUrl; }
        set { _strUrl = value; }
    }

    /// <summary>Fetch timeout.</summary>
    public int TimeOut
    {
        get { return _intTimeOut; }
        set { _intTimeOut = value; }
    }

    /// <summary>Regular expression that cuts out the main content.</summary>
    public string CutRegex
    {
        get { return _strCutHtmlRegex; }
        set { _strCutHtmlRegex = value; }
    }

    /// <summary>
    /// Page content. Every read calls <see cref="GetUrlstrHtml"/> again, so each
    /// access triggers a new fetch.
    /// </summary>
    public string PageHtml
    {
        get { return GetUrlstrHtml(); }
    }

    /// <summary>
    /// Match of the cut regex against the page content. Every read calls
    /// <see cref="GetRegexHtml"/> again.
    /// </summary>
    public Match matchRegexHtml
    {
        get { return GetRegexHtml(); }
    }

    /// <summary>Filter flag.</summary>
    public int filterFlag
    {
        get { return _intflag; }
        set { _intflag = value; }
    }

    /// <summary>Fetches the page content through WebPageTools.GetConent.</summary>
    /// <returns>Whatever WebPageTools.GetConent returns for the current settings.</returns>
    protected string GetUrlstrHtml()
    {
        return WebPageTools.GetConent(_strUrl, _intTimeOut, filterFlag);
    }

    /// <summary>
    /// Applies the cut regex to the freshly fetched page content.
    /// </summary>
    /// <returns>
    /// The match, or <see cref="Match.Empty"/> (Success == false) when matching
    /// throws. It is never null, so callers can test Success directly.
    /// </returns>
    protected Match GetRegexHtml()
    {
        // Built outside the try on purpose: a null or malformed pattern is a
        // configuration error and still surfaces as an exception.
        Regex reg = new Regex(_strCutHtmlRegex, CutRegexOptions);
        try
        {
            return reg.Match(GetUrlstrHtml());
        }
        catch
        {
            // Best-effort, as before, but report failure with a failed match
            // instead of null, which every caller would dereference.
            return Match.Empty;
        }
    }
}
}
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Data;
using System.Collections;
/**//// <summary>
/// 功能:新闻列表页类
/// 创建时间:07-3-5
/// 创建人:张杰
/// </summary>
namespace WebPage

{
/// <summary>
/// News list page: extracts the article links and the next-page address from
/// the list section of a downloaded page.
/// </summary>
public class WebListPage : WebPage
{
    // Options shared by the list and next-page regexes.
    private const RegexOptions ListRegexOptions =
        RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace
        | RegexOptions.ExplicitCapture | RegexOptions.IgnoreCase;

    private string _strDelRegex;       // regex deleting unwanted html
    private string _strListRegex;      // regex matching one list entry
    private string _strNextPageRegex;  // regex matching the next-page link
    private string _strListPageHtml;   // list section extracted in the constructor

    /// <summary>
    /// Copies the settings and immediately extracts the list section, which
    /// fetches the page through the base class.
    /// </summary>
    /// <param name="objListPageConfig">List page settings.</param>
    public WebListPage(WebListPageConfig objListPageConfig)
    {
        Url = objListPageConfig.ListUrl;
        CutRegex = objListPageConfig.strCutRegex;
        TimeOut = objListPageConfig.intTimeOut;
        filterFlag = objListPageConfig.filterFlag;
        _strDelRegex = objListPageConfig.strDelRegex;
        _strListRegex = objListPageConfig.strListRegex;
        _strNextPageRegex = objListPageConfig.strNextPageRegex;
        _strListPageHtml = GetListPageHtml();
    }

    /// <summary>
    /// Returns the trimmed "tmpListContent" group of the cut regex with
    /// everything matching the delete regex removed.
    /// </summary>
    /// <returns>The list section, or an empty string when nothing matched.</returns>
    private string GetListPageHtml()
    {
        Match match = matchRegexHtml;

        // The base class may hand back null when matching failed.
        if (match == null || !match.Success)
        {
            return "";
        }

        string strHtml = match.Groups["tmpListContent"].Value.Trim();

        // An empty pattern removes nothing and a null one would throw, so skip both.
        if (!string.IsNullOrEmpty(_strDelRegex))
        {
            strHtml = Regex.Replace(strHtml, _strDelRegex, "");
        }
        return strHtml;
    }

    /// <summary>
    /// Builds one item per list-regex match from its "Title" and "DetailUrl" groups.
    /// </summary>
    /// <param name="replaceStr">Text substituted for <paramref name="beReplaceStr"/> in each detail URL.</param>
    /// <param name="beReplaceStr">Text to replace in each detail URL; null or empty disables the replacement.</param>
    /// <returns>The items in match order; empty when nothing matched.</returns>
    public List<ListPageItems> GetListArray(string replaceStr, string beReplaceStr)
    {
        List<ListPageItems> list = new List<ListPageItems>();
        Regex reg = new Regex(_strListRegex, ListRegexOptions);
        bool rewriteUrl = !string.IsNullOrEmpty(beReplaceStr);

        // Visit every match. The previous loop stopped at Count - 1, so the last
        // entry was always dropped and a single-entry list came back empty.
        foreach (Match entry in reg.Matches(_strListPageHtml))
        {
            ListPageItems clsListItem = new ListPageItems();
            clsListItem.Title = entry.Groups["Title"].Value.Trim();

            string detailUrl = entry.Groups["DetailUrl"].Value.Trim();
            if (rewriteUrl)
            {
                detailUrl = detailUrl.Replace(beReplaceStr, replaceStr);
            }
            clsListItem.DetailUrl = detailUrl;

            list.Add(clsListItem);
        }
        return list;
    }

    /// <summary>
    /// Returns the "NextPageUrl" group of the next-page regex, passed through
    /// StringHelper.GetChineseURLCode.
    /// </summary>
    /// <returns>The next page address, or an empty string when there is no match.</returns>
    public string GetNextPage()
    {
        string strHtml = "";
        Regex reg = new Regex(_strNextPageRegex, ListRegexOptions);
        Match match = reg.Match(_strListPageHtml);
        if (match.Success)
        {
            strHtml = match.Groups["NextPageUrl"].Value.Trim();
        }
        return StringHelper.GetChineseURLCode(strHtml);
    }
}
/// <summary>
/// One list entry: a title and the URL of its detail page.
/// </summary>
public class ListPageItems
{
    private string _title;
    private string _detailUrl;

    /// <summary>Title of the entry.</summary>
    public string Title
    {
        get { return _title; }
        set { _title = value; }
    }

    /// <summary>URL of the entry's detail page.</summary>
    public string DetailUrl
    {
        get { return _detailUrl; }
        set { _detailUrl = value; }
    }
}
}
这个是列表类
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using System.IO;
/**//// <summary>
/// 功能:页面请求
/// 创建时间:07-3-5
/// 创建人:曹振华
/// </summary>
namespace WebPage

{
/// <summary>
/// Page request helpers: resolving next-page addresses, downloading a page and
/// reducing HTML to a small set of formatting tags.
/// </summary>
public static class WebPageTools
{
    /// <summary>
    /// Turns a next-page address taken from a list page into a complete URL.
    /// </summary>
    /// <param name="strFristPage">URL of the first (current) list page.</param>
    /// <param name="NextPage">Next-page address; may be absolute or relative.</param>
    /// <returns>
    /// <paramref name="NextPage"/> unchanged when it contains an http(s) URL;
    /// otherwise the host of the first page followed by the directory and file
    /// of <paramref name="NextPage"/>, or, when it has no directory part, the
    /// first page's directory followed by the file name.
    /// </returns>
    public static string GetCompeletUrl(string strFristPage, string NextPage)
    {
        string strNextPageUrl = NextPage;
        string strDomain = "";
        strFristPage = StringHelper.GetChineseURLCode(strFristPage);

        // Already absolute. https is accepted as well; it used to fall through
        // and be glued onto the first page's host.
        if (Regex.IsMatch(strNextPageUrl, @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?"))
        {
            return strNextPageUrl;
        }

        // Directory part of the relative address, e.g. /zh_cn/hairdressing/nxzr/zx/11052507/20060705/
        Match filesMatch = Regex.Match(NextPage, @"/(?<Files>.*/)");
        string strFiles = filesMatch.Success ? filesMatch.Groups["Files"].Value.Trim() : "";

        if (strFiles.Length > 0)
        {
            Match hostMatch = Regex.Match(strFristPage, @"(?<domain>https?://([\w-]+\.)+[\w-]+(/)?)");
            if (hostMatch.Success)
            {
                strDomain = hostMatch.Groups["domain"].Value.Trim();

                // The trailing '/' of the host pattern is optional; add it so
                // host and path do not fuse ("http://a.com" + "x/").
                if (!strDomain.EndsWith("/", StringComparison.Ordinal))
                {
                    strDomain += "/";
                }
            }
            string PageNum = Regex.Replace(strNextPageUrl, @"/(?<Files>.*/)", "");
            strNextPageUrl = strDomain + strFiles + PageNum;
        }
        else
        {
            // No directory in the next-page address: reuse the first page's
            // directory (everything up to its last '/').
            Match dirMatch = Regex.Match(strFristPage, @"(?<domain>.*/)");
            if (dirMatch.Success)
            {
                strDomain = dirMatch.Groups["domain"].Value.Trim();
            }
            strNextPageUrl = Regex.Replace(strNextPageUrl, @"(?<domain>.*/)", "");
            strNextPageUrl = strDomain + strNextPageUrl;
        }

        // NOTE(review): addresses such as "a/b.html", "../x.html" or "?page=2"
        // are not resolved the way a browser would; consider System.Uri.
        return strNextPageUrl;
    }

    /// <summary>
    /// Downloads a page with an HTTP GET and decodes it as GB2312.
    /// </summary>
    /// <param name="_strUrl">Address to request.</param>
    /// <param name="_intTimeOut">Value assigned to HttpWebRequest.Timeout.</param>
    /// <param name="flag">When 1, double quotes, single quotes, CR, LF and form feeds are removed from the result.</param>
    /// <returns>The page text, or an empty string when the request fails.</returns>
    public static string GetConent(string _strUrl, int _intTimeOut, int flag)
    {
        string strHtml = "";
        try
        {
            HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(_strUrl);
            myReq.Timeout = _intTimeOut;
            myReq.Method = "GET";

            // using blocks release the connection even when reading throws; the
            // explicit Close calls were skipped on any exception.
            using (HttpWebResponse myRes = (HttpWebResponse)myReq.GetResponse())
            using (Stream stream = myRes.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding("GB2312")))
            {
                strHtml = reader.ReadToEnd();
            }

            if (flag == 1)
            {
                strHtml = Regex.Replace(strHtml, @"[\""\r\f\n']", "");
            }
        }
        catch (Exception ex)
        {
            // Deliberately best-effort: callers treat "" as "no content". Leave
            // a trace so that failures are not completely invisible.
            System.Diagnostics.Trace.TraceWarning("GetConent failed for {0}: {1}", _strUrl, ex.Message);
        }
        return strHtml;
    }

    /// <summary>
    /// Removes every HTML tag except plain br, p and strong tags, which are
    /// normalised to lower case.
    /// </summary>
    /// <param name="strFilter">HTML to filter; null or empty is returned as is.</param>
    /// <returns>
    /// The filtered text, e.g. "a&lt;br/&gt;b&lt;div&gt;c&lt;/div&gt;" becomes
    /// "a&lt;br&gt;bc".
    /// </returns>
    public static string FilterPaticularChar(string strFilter)
    {
        if (string.IsNullOrEmpty(strFilter))
        {
            return strFilter;
        }

        // Park the tags to keep behind placeholders. Matching is case-insensitive
        // and covers <br/> and <br />, which used to be stripped with the rest.
        strFilter = Regex.Replace(strFilter, @"<br\s*/?>", "[---]", RegexOptions.IgnoreCase);
        strFilter = Regex.Replace(strFilter, "<p>", "[----]", RegexOptions.IgnoreCase);
        strFilter = Regex.Replace(strFilter, "</p>", "[/----]", RegexOptions.IgnoreCase);
        strFilter = Regex.Replace(strFilter, "<strong>", "[-----]", RegexOptions.IgnoreCase);
        strFilter = Regex.Replace(strFilter, "</strong>", "[/-----]", RegexOptions.IgnoreCase);

        // Drop every remaining tag.
        strFilter = Regex.Replace(strFilter, "<[^>]*>", "");

        // Restore the parked tags.
        strFilter = strFilter.Replace("[---]", "<br>");
        strFilter = strFilter.Replace("[----]", "<p>");
        strFilter = strFilter.Replace("[/----]", "</p>");
        strFilter = strFilter.Replace("[-----]", "<strong>");
        strFilter = strFilter.Replace("[/-----]", "</strong>");

        return strFilter;
    }
}
}
页面工具类
using System;
using System.Collections.Generic;
using System.Text;
/**//// <summary>
/// 功能:ListPage类设置
/// 创建时间:07-3-5
/// 创建人:张杰
/// </summary>
namespace WebPage

{
/// <summary>
/// Empty base class of the page configuration classes.
/// </summary>
public class WebPageConfig
{
}
/// <summary>
/// Settings for a list page: its URL, the cut, list, delete and next-page
/// regexes, the timeout and the filter flag.
/// </summary>
public class WebListPageConfig : WebPageConfig
{
    private string _listUrl;
    private string _cutRegex;
    private string _listRegex;
    private string _delRegex;
    private string _nextPageRegex;
    private int _timeOut;
    private int _filterFlag;

    /// <summary>Creates the settings with every value supplied up front.</summary>
    public WebListPageConfig(string strListPage, string strCutRegex, string strListRegex, string strDelRegex, string strNextPageRegex, int intTimeOut, int filterFlag)
    {
        _listUrl = strListPage;
        _cutRegex = strCutRegex;
        _listRegex = strListRegex;
        _delRegex = strDelRegex;
        _nextPageRegex = strNextPageRegex;
        _timeOut = intTimeOut;
        _filterFlag = filterFlag;
    }

    /// <summary>URL of the list page.</summary>
    public string ListUrl { get { return _listUrl; } set { _listUrl = value; } }

    /// <summary>Regex that cuts out the content section.</summary>
    public string strCutRegex { get { return _cutRegex; } set { _cutRegex = value; } }

    /// <summary>Regex that matches the list entries.</summary>
    public string strListRegex { get { return _listRegex; } set { _listRegex = value; } }

    /// <summary>Regex whose matches are deleted.</summary>
    public string strDelRegex { get { return _delRegex; } set { _delRegex = value; } }

    /// <summary>Regex that matches the next-page link.</summary>
    public string strNextPageRegex { get { return _nextPageRegex; } set { _nextPageRegex = value; } }

    /// <summary>Fetch timeout.</summary>
    public int intTimeOut { get { return _timeOut; } set { _timeOut = value; } }

    /// <summary>Filter flag.</summary>
    public int filterFlag { get { return _filterFlag; } set { _filterFlag = value; } }
}

/// <summary>
/// Settings for a detail page: its URL, the cut and delete regexes, the
/// timeout and the filter flag.
/// </summary>
public class WebDetailPageConfig : WebPageConfig
{
    private string _detailUrl;
    private string _cutRegex;
    private string _delRegex;
    private int _timeOut;
    private int _filterFlag;

    /// <summary>Creates the settings with every value supplied up front.</summary>
    public WebDetailPageConfig(string strDetailUrl, string strCutRegex, string strDelRegex, int intTimeOut, int filterFlag)
    {
        _detailUrl = strDetailUrl;
        _cutRegex = strCutRegex;
        _delRegex = strDelRegex;
        _timeOut = intTimeOut;
        _filterFlag = filterFlag;
    }

    /// <summary>URL of the detail page.</summary>
    public string DetailUrl { get { return _detailUrl; } set { _detailUrl = value; } }

    /// <summary>Regex that cuts out the content section.</summary>
    public string strCutRegex { get { return _cutRegex; } set { _cutRegex = value; } }

    /// <summary>Regex whose matches are deleted.</summary>
    public string strDelRegex { get { return _delRegex; } set { _delRegex = value; } }

    /// <summary>Fetch timeout.</summary>
    public int intTimeOut { get { return _timeOut; } set { _timeOut = value; } }

    /// <summary>Filter flag.</summary>
    public int filterFlag { get { return _filterFlag; } set { _filterFlag = value; } }
}
/// <summary>
/// Database structure: a plain holder for one source record (address, kind,
/// page count, list and detail regexes, timeout, filter flag and URL
/// replacement strings).
/// </summary>
public class DBSoures
{
    private string _urlAddress;
    private string _kindName;
    private int _kindId;
    private int _totalPage;
    private string _listRegex1;
    private string _listRegex2;
    private string _listRegex3;
    private string _listRegex4;
    private int _timeOut;
    private string _detailRegex1;
    private string _detailRegex2;
    private int _flag;
    private string _replaceUrl;
    private string _beReplaceStr;

    /// <summary>Creates a record with every value supplied up front.</summary>
    public DBSoures(string UrlAddress,string KindName,int KindID,int TotolPage,string ListRegex1,string ListRegex2,string ListRegex3,string ListRegex4,int TimeOut,string DetailRegex1,string DetailRegex2,int intFlag,string ReplaceUrl,string BeReplaceStr)
    {
        _urlAddress = UrlAddress;
        _kindName = KindName;
        _kindId = KindID;
        _totalPage = TotolPage;
        _listRegex1 = ListRegex1;
        _listRegex2 = ListRegex2;
        _listRegex3 = ListRegex3;
        _listRegex4 = ListRegex4;
        _timeOut = TimeOut;
        _detailRegex1 = DetailRegex1;
        _detailRegex2 = DetailRegex2;
        _flag = intFlag;
        _replaceUrl = ReplaceUrl;
        _beReplaceStr = BeReplaceStr;
    }

    public string UrlAddress { get { return _urlAddress; } set { _urlAddress = value; } }

    public string KindName { get { return _kindName; } set { _kindName = value; } }

    public int KindID { get { return _kindId; } set { _kindId = value; } }

    public int TotolPage { get { return _totalPage; } set { _totalPage = value; } }

    public string ListRegex1 { get { return _listRegex1; } set { _listRegex1 = value; } }

    public string ListRegex2 { get { return _listRegex2; } set { _listRegex2 = value; } }

    public string ListRegex3 { get { return _listRegex3; } set { _listRegex3 = value; } }

    public string ListRegex4 { get { return _listRegex4; } set { _listRegex4 = value; } }

    public int TimeOut { get { return _timeOut; } set { _timeOut = value; } }

    public string DetailRegex1 { get { return _detailRegex1; } set { _detailRegex1 = value; } }

    public string DetailRegex2 { get { return _detailRegex2; } set { _detailRegex2 = value; } }

    public int filterFlag { get { return _flag; } set { _flag = value; } }

    public string ReplaceUrl { get { return _replaceUrl; } set { _replaceUrl = value; } }

    public string BeReplaceStr { get { return _beReplaceStr; } set { _beReplaceStr = value; } }
}
}
这就是所有的业务层代码,数据层可以对 SQL Server 和 MySQL 进行操作 download
长期招聘java,有找工作可以联系我,微信:caozhenhua1563

浙公网安备 33010602011771号