This was my interview assignment. I borrowed a lot of code from fellow developers online; please forgive me if I have stepped on anyone's toes.
Finding work is really hard right now. The company I applied to is http://www.027dns.net/, and I hope the manager will give me a chance to work there. I will work very hard, because the software industry is where I belong. A university graduate working as a security guard has already become a running joke among my colleagues.
This is my first time applying for a software job, my first interview assignment, and the first time I have written a blog post this seriously. If it is not well written, please point out my mistakes and I will improve.
Zhang Sufeng. Please credit the source when reposting: http://www.cnblogs.com/zhangsufeng/archive/2009/02/28/1400224.html
Enough chatter, on to the main content.
Suppose we want to scrape the news listed at http://info.laser.hc360.com/list/z_news_yw.shtml. The requirements are to collect the title, publication time, and body of each article, and if an article spans several pages, to collect all of its pages.
This kind of scraping means obtaining a list of news URLs from a given page and then fetching each article's details through its URL. It is a very common pattern, and we may need to scrape many different pages, so we define an interface and build a base class around it.
First, define IGatherInfo.cs:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace ClassLibrary
{
    /// <summary>
    /// Interface for news gathering classes
    /// </summary>
    interface IGatherInfo
    {
        /// <summary>
        /// Gathering time
        /// </summary>
        string gatherTime
        {
            get;
            set;
        }

        /// <summary>
        /// NewsListUrl: address of the list page to extract from
        /// RegexString: regular expression describing the extraction logic
        /// Returns the URLs of the news pages
        /// </summary>
        List<string> GatherUrlList(string NewsListUrl, string RegexString);

        // Gather the detailed content of each news item
        List<NewsDetail> GatherNewsDetail(List<string> NewsUrlList, string RegeXString);
    }
}

The interface defines three members: gatherTime, the gathering time; GatherUrlList(), which extracts the news URLs from the specified list page; and GatherNewsDetail(), which reads the detailed content of each news item.
Next, let us look at the common helper methods this program is likely to need. They are defined in GatherInfoBase.cs.
1. Date conversion function string DateToString()

/// Date conversion function
/// DateTime -> string in the form yyyyMMddHHmmss
public string DateToString()
{
    DateTime d = DateTime.Now;
    string s = null, y, m, dd, h, mm, ss;
    y = d.Year.ToString();
    m = d.Month.ToString();
    if (m.Length < 2) m = "0" + m;
    dd = d.Day.ToString();
    if (dd.Length < 2) dd = "0" + dd;
    h = d.Hour.ToString();
    if (h.Length < 2) h = "0" + h;
    mm = d.Minute.ToString();
    if (mm.Length < 2) mm = "0" + mm;
    ss = d.Second.ToString();
    if (ss.Length < 2) ss = "0" + ss;
    s += y + m + dd + h + mm + ss;
    return s;
}
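For reference, the same timestamp can be produced in one line with a custom format string; this is just an equivalent sketch, not the code used in the base class:

// The same yyyyMMddHHmmss timestamp, using a custom format string
string timestamp = DateTime.Now.ToString("yyyyMMddHHmmss");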
2. Get the source code of a remote page: string GetRemoteHtmlCode(string Url)
/// Get the source code of a remote page
/// param: remote url
/// Requires a reference to X:\windows\system32\msxml2.dll
public string GetRemoteHtmlCode(string Url)
{
    string s = null;
    try
    {
        MSXML2.XMLHTTP _xmlhttp = new MSXML2.XMLHTTPClass();
        _xmlhttp.open("GET", Url, false, null, null);
        _xmlhttp.send("");
        if (_xmlhttp.readyState == 4)
        {
            s = System.Text.Encoding.Default.GetString((byte[])_xmlhttp.responseBody);
        }
    }
    catch
    {
    }
    return s;
}
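If you would rather not take a COM dependency on msxml2.dll, the same download can be done with System.Net.WebClient, which ships with the framework. The helper name GetRemoteHtmlCode2 is my own; this is only a sketch and, like the original, it assumes the target pages use the system default encoding (GB2312 on a Chinese system):

// Alternative download helper built on WebClient instead of MSXML2
// (requires "using System.Net;")
public string GetRemoteHtmlCode2(string url)
{
    try
    {
        using (System.Net.WebClient client = new System.Net.WebClient())
        {
            byte[] data = client.DownloadData(url);
            // Assumes the page is in the system default encoding (e.g. GB2312)
            return System.Text.Encoding.Default.GetString(data);
        }
    }
    catch
    {
        return null;
    }
}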

3. Extract a substring from the HTML source: string SniffwebCode(string code, string wordsBegin, string wordsEnd). This is used to pull out the title, the time, and the body.
#region Extract the substring between two markers, e.g. the page title
public string SniffwebCode(string code, string wordsBegin, string wordsEnd)
{
    string NewsTitle = "";
    Regex regex1 = new Regex("" + wordsBegin + @"(?<title>[\s\S]+?)" + wordsEnd + "", RegexOptions.Compiled | RegexOptions.IgnoreCase);
    // Walk through all matches; the last match found is the one returned
    for (Match match1 = regex1.Match(code); match1.Success; match1 = match1.NextMatch())
    {
        NewsTitle = match1.Groups["title"].ToString();
    }
    return NewsTitle;
}
#endregion
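To make the begin/end markers concrete, here is a tiny usage example (the HTML snippet is invented for illustration):

// Hypothetical usage: extract the text between <h1> and </h1>
string html = "<html><body><h1>Laser industry news</h1><p>...</p></body></html>";
string title = SniffwebCode(html, "<h1>", "</h1>");   // "Laser industry news"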

4. Strip HTML from source code: string RemoveHTML(string HtmlCode). This is used to remove the HTML markup from the extracted body text.
public string RemoveHTML(string HtmlCode)
{
    string MatchVale = HtmlCode;
    // Remove every tag matched by <...>
    foreach (Match s in Regex.Matches(HtmlCode, "<.+?>"))
    {
        MatchVale = MatchVale.Replace(s.Value, "");
    }
    return MatchVale;
}

// An alternative method; this one keeps image links
public string RemoveHTML2(string strHtml)
{
    string[] aryReg =
    {
        @"<script[^>]*?>.*?</script>",
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
        @"([\r\n])[\s]+",
        @"&(quot|#34);",
        @"&(amp|#38);",
        @"&(lt|#60);",
        @"&(gt|#62);",
        @"&(nbsp|#160);",
        @"&(iexcl|#161);",
        @"&(cent|#162);",
        @"&(pound|#163);",
        @"&(copy|#169);",
        @"&#(\d+);",
        @"-->",
        @"<!--.*\n"
    };

    string[] aryRep =
    {
        "",
        "",
        "",
        "\"",
        "&",
        "<",
        ">",
        " ",
        "\xa1", // chr(161)
        "\xa2", // chr(162)
        "\xa3", // chr(163)
        "\xa9", // chr(169)
        "",
        "\r\n",
        ""
    };

    // Apply each pattern in turn, replacing tags and decoding common entities
    string strOutput = strHtml;
    for (int i = 0; i < aryReg.Length; i++)
    {
        Regex regex = new Regex(aryReg[i], RegexOptions.IgnoreCase);
        strOutput = regex.Replace(strOutput, aryRep[i]);
    }

    // Remove any leftover angle brackets and line breaks
    strOutput = strOutput.Replace("<", "");
    strOutput = strOutput.Replace(">", "");
    strOutput = strOutput.Replace("\r\n", "");

    return strOutput;
}

5. Filename rewriting method string changFileName(string filename, string addStr). An article's pages follow a regular naming pattern, so we append the appropriate characters to the filename.
// Append characters to the filename, e.g. a.php -> a-1.php
public string changFileName(string filename, string addStr)
{
    string ext = null;
    string tmpFileName = "";
    if (filename.IndexOf('.') > 0)
    {
        string[] fs = filename.Split('.');
        ext = fs[fs.Length - 1]; // the extension
        // Re-assemble everything before the extension
        for (int i = 0; i < fs.Length - 2; i++)
        {
            tmpFileName += fs[i] + ".";
        }
        tmpFileName += fs[fs.Length - 2];
        tmpFileName += addStr + ".";
    }
    return tmpFileName + ext;
}
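To see how this supports the article pagination, here is a worked example (the URL is invented for illustration):

// Hypothetical example: derive the URL of page 2 of an article
string first = "http://info.laser.hc360.com/2009/02/example.shtml";
string second = changFileName(first, "-2");
// second == "http://info.laser.hc360.com/2009/02/example-2.shtml"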

6. Get the links on a page
// Get the links in the specified region and return them as a List<string>
public List<string> getHrefList(string HtmlCode, string urlHead)
{
    string tempStr = "";
    List<string> MatchVale = new List<string>();
    string Reg = @"(h|H)(r|R)(e|E)(f|F) *= *('|"")?((\w|\\|\/|\.|:|-|_)+)('|""| *|>)?";
    foreach (Match m in Regex.Matches(HtmlCode, Reg))
    {
        // Strip the href=" prefix and any remaining quotes, then prepend the site root
        tempStr = (m.Value).ToLower().Replace("href=\"", "").Trim();
        tempStr = tempStr.Replace("\"", "").Trim();
        MatchVale.Add(urlHead + tempStr);
    }
    return MatchVale;
}

Next we define NewsDetail.cs:
public class NewsDetail
{
    public string strUrl;
    public string title;
    public string upTime;
    public string contents;
}
Because the program is small, I use Access to store the data. Create GatherInfo_laser_hc360.db and add two tables:
GatherUrls: strUrl (Memo), strGahterTime (Text)
GatherInfos: strUrl (Memo), upTime (Text), title (Text), content (Memo)
The data access is implemented with a typed DataSet and its TableAdapters, as you will see in the code.
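The generated TableAdapter insert methods are not shown in this post; for readers without the typed DataSet, a plain OleDb insert along the following lines would do roughly the same job. The connection string and this InsertNewsUrl sketch are my own, not the generated code:

// Sketch of what GatherUrlsTableAdapter.InsertNewsUrl roughly amounts to
// (requires "using System.Data.OleDb;")
public void InsertNewsUrl(string strUrl, string strGatherTime)
{
    string connStr = @"Provider=Microsoft.Jet.OLEDB.4.0;Data Source=GatherInfo_laser_hc360.db";
    using (OleDbConnection conn = new OleDbConnection(connStr))
    using (OleDbCommand cmd = new OleDbCommand(
        "INSERT INTO GatherUrls (strUrl, strGahterTime) VALUES (?, ?)", conn))
    {
        // OleDb uses positional parameters; the names are only labels
        cmd.Parameters.AddWithValue("@p1", strUrl);
        cmd.Parameters.AddWithValue("@p2", strGatherTime);
        conn.Open();
        cmd.ExecuteNonQuery();
    }
}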
Finally, we implement the gathering for the given site. Here is the code:
/// <summary>
/// HC360.com laser channel news
/// Http://info.laser.hc360.com/list/z_news_yw.shtml
/// </summary>
public class laser_hc360 : GatherInfoBase, IGatherInfo
{
    public void Dispose()
    {
        GC.SuppressFinalize(this);
    }

    #region IGatherInfo members

    private string _gatherTime;
    private string _newsListUrl = @"http://info.laser.hc360.com/list/z_news_yw.shtml";
    private string _regexString = "";

    public string gatherTime
    {
        get
        {
            return _gatherTime;
        }
        set
        {
            _gatherTime = value;
        }
    }

    /// <summary>
    /// Get the news URLs on the specified list page and write them to the database
    /// </summary>
    /// <returns>true on completion</returns>
    public bool aGatherUrlsList()
    {
        List<string> urlsList = this.GatherUrlList();
        gatherTime = this.DateToString();
        using (ClassLibrary.LaserHc360TableAdapters.GatherUrlsTableAdapter ta = new ClassLibrary.LaserHc360TableAdapters.GatherUrlsTableAdapter())
        {
            foreach (string str in urlsList)
            {
                try
                {
                    ta.InsertNewsUrl(str, gatherTime);
                }
                catch
                {
                }
            }
        }
        return true;
    }

    /// <summary>
    /// Get the news content and write it to the database
    /// </summary>
    /// <returns>true on completion</returns>
    public bool aGatherNewsDetails()
    {
        List<NewsDetail> newsDtl = this.GatherNewsDetail();
        using (ClassLibrary.LaserHc360TableAdapters.GatherInfosTableAdapter ta = new ClassLibrary.LaserHc360TableAdapters.GatherInfosTableAdapter())
        {
            foreach (NewsDetail nd in newsDtl)
            {
                try
                {
                    ta.InsertNewsDetail(nd.strUrl, nd.title, nd.upTime, nd.contents);
                }
                catch
                {
                }
            }
        }
        return true;
    }

    /// <summary>
    /// Extract the URLs from the relevant part of the list page
    /// </summary>
    /// <returns>list of URLs</returns>
    public List<string> GatherUrlList()
    {
        return GatherUrlList(_newsListUrl, _regexString);
    }

    public List<string> GatherUrlList(string NewsListUrl, string RegexString)
    {
        string HtmlCode = GetRemoteHtmlCode(NewsListUrl);
        // Hard-coded offsets into the page source that bracket the news list rows
        int i = HtmlCode.IndexOf("<tr>", 2858);
        int j = HtmlCode.IndexOf("</tr>", 3830);
        HtmlCode = HtmlCode.Substring(i, j - i + "</tr>".Length);
        string urlHead = @"http://info.laser.hc360.com";
        List<string> returnList = getHrefList(HtmlCode, urlHead);
        return returnList;
    }

    public List<NewsDetail> GatherNewsDetail()
    {
        return GatherNewsDetail(GatherUrlList(), _regexString);
    }

    public List<NewsDetail> GatherNewsDetail(List<string> NewsUrlList, string RegeXString)
    {
        List<NewsDetail> newsdetail = new List<NewsDetail>();
        foreach (string str in NewsUrlList)
        {
            string HtmlCode = GetRemoteHtmlCode(str);
            NewsDetail nd = new NewsDetail();
            nd.strUrl = str;
            nd.title = SniffwebCode(HtmlCode, "<h1>", "</h1>");
            nd.upTime = SniffwebCode(HtmlCode, "<span id=\"endData\">", "</span>");
            nd.contents = SniffwebCode(HtmlCode, "<div id=\"artical\">", "</div>");

            // Follow the article's additional pages (a.shtml -> a-2.shtml, a-3.shtml, ...)
            int i = 2;
            string s = changFileName(str, "-" + i.ToString());
            while (true)
            {
                string htmlcode = GetRemoteHtmlCode(s);
                // The site returns this Chinese notice ("Sorry, the page you requested
                // does not exist...") when the page is missing, i.e. there are no more pages
                if (htmlcode.Contains("对不起,您查找的页面不存在!5秒钟后将自动跳转。")) break;
                nd.contents += SniffwebCode(htmlcode, "<div id=\"artical\">", "</div>");
                i++;
                s = changFileName(str, "-" + i.ToString());
            }
            // Strip the HTML markup from the collected body text
            nd.contents = RemoveHTML(nd.contents);
            newsdetail.Add(nd);
        }
        return newsdetail;
    }

    #endregion
}
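The post does not show how the class is driven, so here is a minimal, hypothetical console entry point illustrating the intended call order (gather the URL list first, then the article details):

using System;
using ClassLibrary;  // assuming laser_hc360 lives in the ClassLibrary namespace

class Program
{
    static void Main()
    {
        laser_hc360 gatherer = new laser_hc360();

        // Step 1: collect the news URLs from the list page into GatherUrls
        if (gatherer.aGatherUrlsList())
            Console.WriteLine("URL list gathered at " + gatherer.gatherTime);

        // Step 2: fetch each article (including its extra pages) into GatherInfos
        if (gatherer.aGatherNewsDetails())
            Console.WriteLine("News details gathered.");

        gatherer.Dispose();
    }
}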
