一个简易文章抓取工具(C#) (转载)
/*************************************
* CopyRight (c) edzh.com
* Date --> 2006-3-22
* Coder --> yesun
*************************************/
using System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Web;
using System.Threading;
using System.Xml;
namespace GetArticle
{
/// <summary>
/// 夜隼信息采集器 v2.0 - 针对edzh.com开发
/// </summary>
public class Form1 : System.Windows.Forms.Form
{
// Worker thread used by the single-page capture path; created and started in btnStrat_Click.
public Thread t;
DataTable listdt = new DataTable();
// Running total of captured articles: zeroed in btnStrat_Click / btnReset_Click and
// incremented by GetArticle.strat for every article it stores.
public int j = 0;
// Multi-threading: upper bound on workers counted as running at the same time.
// Defaults to 4 and is overwritten from textBoxMaxThread in btnStrat_Click.
public static int maxThreadCount = 4;
// Number of workers currently counted as running (raised in timer_CheckThread,
// lowered at the end of GetArticle.strat).
public static int currentThreadCount = 0;
// Index into `thread` of the next worker that timer_CheckThread will try to start.
public static int intCurrentThread = 0;
public bool istrue = true;
// Workers prepared for a multi-page capture, one per page URL (filled in btnStrat_Click).
public Thread[] thread;
// Name of the text encoding used to decode downloaded pages ("gb2312" or "utf-8").
public static string encoding = "gb2312";
// Timer that drives timer_CheckThread. The first tick comes after 20 s; the handler
// then switches the interval to 3 s.
private System.Timers.Timer runable_Timer = new System.Timers.Timer(20*1000);
/// <summary>
/// Required designer variable.
/// </summary>
private System.ComponentModel.Container components = null;
/// <summary>
/// Creates the form: runs the designer initialisation, then fills the saved-site
/// list (BindLink) and the encoding list (BindEncode).
/// </summary>
public Form1()
{
//
// Required for Windows Forms designer support.
//
InitializeComponent();
BindLink();
BindEncode();
//
// TODO: add any further constructor code after the InitializeComponent call
//
}
/// <summary>
/// Application entry point: creates the main window and runs the message loop on it.
/// </summary>
[STAThread]
static void Main()
{
    Form1 mainForm = new Form1();
    Application.Run(mainForm);
}
static ManualResetEvent ev = new ManualResetEvent(false);
/// <summary>
/// Start button handler. Reads the capture settings from the form, resets both
/// list views and then either
/// (a) multi-page mode: prepares one worker thread per page number and lets
///     timer_CheckThread start them gradually, or
/// (b) single-page mode: starts one worker thread for the entered URL.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnStrat_Click(object sender, System.EventArgs e)
{
    string url = this.url.Text.Trim();
    if (this.coder.SelectedItem != null)
    {
        encoding = this.coder.SelectedItem.ToString().ToLower();
    }
    // Reset the captured-article counter.
    j = 0;
    this.comboBoxListURL.Enabled = false;
    try
    {
        int requested = Int32.Parse(this.textBoxMaxThread.Text);
        // A limit below 1 would prevent the scheduler from ever starting a worker.
        if (requested >= 1)
        {
            maxThreadCount = requested;
        }
    }
    catch (FormatException) { /* keep the previous limit */ }
    catch (OverflowException) { /* keep the previous limit */ }

    // Result list: cleared and rebuilt on every run.
    this.lvResult.Clear();
    this.lvResult.FullRowSelect = true;
    this.lvResult.View = View.LargeIcon;
    this.lvResult.View = View.Details;
    this.lvResult.Columns.Add("编号",80, HorizontalAlignment.Center);
    this.lvResult.Columns.Add("标题",300,HorizontalAlignment.Left);
    this.lvResult.Columns.Add("状态",50, HorizontalAlignment.Left);
    this.lvResult.Columns.Add("大小",65,HorizontalAlignment.Left);
    this.lvResult.Columns.Add("耗时",75,HorizontalAlignment.Left);

    // Thread list icon. The icon is decorative, so a missing or unreadable file
    // simply leaves the image list empty.
    ImageList imgList = new ImageList();
    string iconPath = Path.Combine(Application.StartupPath, "largeImg.gif");
    if (File.Exists(iconPath))
    {
        try
        {
            imgList.Images.Add(Image.FromFile(iconPath));
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message);
        }
    }
    this.listView1.Columns.Clear();
    this.listView1.Items.Clear();
    this.listView1.LargeImageList = imgList;
    this.listView1.Scrollable = true;
    this.listView1.View = View.LargeIcon;
    this.listView1.GridLines = true;
    this.listView1.FullRowSelect = true;
    this.listView1.Columns.Add("名称", 60, HorizontalAlignment.Left);

    if (this.chkBoxIsMutiPage.Checked)
    {
        // ---- multi-page capture ----
        if (txtUrl.Text.Trim() == "")
        {
            MessageBox.Show("请输入通用网址");
            // Nothing was started, so the site list must stay usable.
            this.comboBoxListURL.Enabled = true;
            return;
        }
        int startpage = 0;
        int endpage = 0;
        int leijia = 1;
        try
        {
            startpage = Convert.ToInt32(this.txtstartpage.Text);
            endpage = Convert.ToInt32(this.txtendpage.Text);
            leijia = Convert.ToInt32(this.txtleijia.Text.Trim());
        }
        catch (FormatException) { /* fall back to the defaults above */ }
        catch (OverflowException) { /* fall back to the defaults above */ }
        // A step of zero or less would never reach the end page.
        if (leijia < 1)
        {
            leijia = 1;
        }
        if (endpage < startpage)
        {
            MessageBox.Show("结束页不能小于起始页");
            this.comboBoxListURL.Enabled = true;
            return;
        }

        // Exactly one slot per page that the loop below visits.
        thread = new Thread[(endpage - startpage) / leijia + 1];
        // The queue is new, so scheduling restarts from its first slot.
        intCurrentThread = 0;
        this.btnStrat.Enabled = false;
        int tempInt = 0;
        this.statusBar1.Text = "正在初始化线程...";
        for (int i = startpage; i <= endpage; i = i + leijia)
        {
            GetArticle ga = CreateCapture(txtUrl.Text.Replace("@pageid", i.ToString()));
            thread[tempInt++] = new Thread(new ThreadStart(ga.strat));
        }
        this.statusBar1.Text = "共"+tempInt+"个线程保存队列中,正在启动线程,请稍候...";
        // Unsubscribe first: otherwise every click adds another copy of the handler
        // and each tick would run the scheduler several times.
        runable_Timer.Elapsed -= new System.Timers.ElapsedEventHandler(timer_CheckThread);
        runable_Timer.Elapsed += new System.Timers.ElapsedEventHandler(timer_CheckThread);
        runable_Timer.Start();
    }
    else
    {
        // ---- single-page capture ----
        if (url == "")
        {
            MessageBox.Show("请输入网址");
            this.comboBoxListURL.Enabled = true;
            return;
        }
        GetArticle ga = CreateCapture(url);
        this.btnStrat.Enabled = false;
        this.statusBar1.Text = "正在准备抓取数据,请稍候...";
        t = new Thread(new ThreadStart(ga.strat));
        t.Name = "线程#1";
        ListViewItem item = new ListViewItem(t.Name, 0);
        item.SubItems.Add(t.Name);
        // The worker finds its own entry again through this hash code.
        item.Tag = t.GetHashCode();
        item.ForeColor = Color.Red;
        this.listView1.Items.Add(item);
        // Scrolling only has an effect once the item belongs to the list.
        item.EnsureVisible();
        t.Priority = ThreadPriority.Lowest;
        t.Start();
    }
}

/// <summary>
/// Builds a capture job for one list-page URL, using the settings currently
/// entered on the form.
/// </summary>
/// <param name="targetUrl">URL of the list page the job downloads.</param>
/// <returns>A configured job whose strat method can be run on a worker thread.</returns>
private GetArticle CreateCapture(string targetUrl)
{
    GetArticle ga = new GetArticle();
    ga.url = targetUrl;
    ga.reg = this.reg.Text.Trim();
    ga.pageReg = this.txtPagePatt.Text;
    ga.folderpath = this.floder.Text.Trim();
    ga.startTag = this.startTag.Text.Trim();
    ga.endTag = this.endTag.Text.Trim();
    ga.adStartTag = this.adStartTag.Text.Trim();
    ga.adEndTag = this.adEndTag.Text.Trim();
    ga.parentForm = this;
    return ga;
}
/// <summary>
/// Scheduler tick: starts at most one queued worker per tick while fewer than
/// maxThreadCount workers are counted as running. The timer is paused while the
/// handler runs, and is not restarted once nothing is left to start.
/// </summary>
/// <param name="sender">The System.Timers.Timer that raised the event.</param>
/// <param name="e">Event data (unused).</param>
void timer_CheckThread(object sender, System.Timers.ElapsedEventArgs e)
{
    System.Timers.Timer initTimer = (System.Timers.Timer)sender;
    initTimer.Stop();
    if (this.IsDisposed)
    {
        return;
    }
    // Without a SynchronizingObject the Elapsed event arrives on a pool thread,
    // but this handler edits controls, so it re-enters itself on the UI thread.
    if (this.InvokeRequired)
    {
        try
        {
            this.Invoke(new System.Timers.ElapsedEventHandler(timer_CheckThread), new object[] { sender, e });
        }
        catch (ObjectDisposedException) { /* form closed while the tick was pending */ }
        catch (InvalidOperationException) { /* window handle already gone */ }
        return;
    }

    Thread[] queue = thread;
    // The queue is filled from index 0 without gaps, so an index past the end or
    // an empty slot means every prepared worker has been handed out.
    if (queue == null || intCurrentThread >= queue.Length || queue[intCurrentThread] == null)
    {
        istrue = false;
        return;
    }

    if (currentThreadCount < maxThreadCount)
    {
        Thread next = queue[intCurrentThread];
        if (next.ThreadState != ThreadState.Unstarted)
        {
            // The worker at the head of the queue can no longer be started
            // (for example it was aborted), so polling would never make progress.
            istrue = false;
            return;
        }
        string workerName = "线程#"+(intCurrentThread+1);
        next.Priority = ThreadPriority.Lowest;
        next.Name = workerName;

        // Add the list entry before the worker runs: the worker looks its entry
        // up by the hash code stored in Tag.
        ListViewItem item = new ListViewItem(workerName, 0);
        item.Tag = next.GetHashCode();
        item.SubItems.Add(workerName);
        item.ForeColor = Color.Red;
        this.listView1.Items.Add(item);
        item.EnsureVisible();

        Interlocked.Increment(ref currentThreadCount);
        this.statusBar1.Text = workerName+"已开始执行...";
        try
        {
            next.Start();
        }
        catch (ThreadStateException)
        {
            // The worker never ran, so release the slot that was just reserved.
            Interlocked.Decrement(ref currentThreadCount);
            istrue = false;
            return;
        }
        intCurrentThread++;
    }
    initTimer.Interval = 3 * 1000;
    initTimer.Start();
}
/// <summary>
/// Rebuilds the encoding drop-down with its two entries and selects the first one.
/// </summary>
void BindEncode()
{
    // Empty the list from the back, one entry at a time.
    while (this.coder.Items.Count > 0)
    {
        this.coder.Items.RemoveAt(this.coder.Items.Count - 1);
    }
    object[] encodingNames = new object[] { "gb2312", "utf-8" };
    foreach (object encodingName in encodingNames)
    {
        this.coder.Items.Add(encodingName);
    }
    this.coder.SelectedIndex = 0;
}
/// <summary>
/// Reloads the saved-site drop-down from URL.xml in the application folder.
/// Each /root/url element becomes one MyItem. The first entry is selected when
/// at least one was loaded. A missing file leaves the list empty.
/// </summary>
void BindLink()
{
    this.comboBoxListURL.Items.Clear();

    string configPath = Path.Combine(Application.StartupPath, "URL.xml");
    if (File.Exists(configPath))
    {
        try
        {
            XmlDocument xmlDoc = new XmlDocument();
            xmlDoc.Load(configPath);
            foreach (XmlNode node in xmlDoc.SelectNodes("//root/url"))
            {
                XmlElement xmlElem = node as XmlElement;
                if (xmlElem == null)
                {
                    continue;
                }
                MyItem myitem = new MyItem();
                myitem.id = Convert.ToInt32(xmlElem.GetAttribute("id"));
                myitem.name = Convert.ToString(xmlElem.GetAttribute("name"));
                myitem.regex = b2a(xmlElem.GetAttribute("regex"));
                myitem.url = b2a(xmlElem.GetAttribute("url"));
                myitem.starttag = b2a(xmlElem.GetAttribute("starttag"));
                myitem.endtag = b2a(xmlElem.GetAttribute("endtag"));
                myitem.adstarttag = b2a(xmlElem.GetAttribute("adstarttag"));
                myitem.adendtag = b2a(xmlElem.GetAttribute("adendtag"));
                // GetAttribute returns an empty string for a missing attribute,
                // so older files without pageReg need no special handling.
                myitem.pageReg = b2a(xmlElem.GetAttribute("pageReg"));
                this.comboBoxListURL.Items.Add(myitem);
            }
        }
        catch (Exception ex)
        {
            // Loading is best effort: entries read before the failure are kept
            // and the reason is logged instead of being dropped silently.
            Console.WriteLine(ex.Message);
        }
    }

    // Selecting index 0 on an empty list would throw.
    if (this.comboBoxListURL.Items.Count > 0)
    {
        this.comboBoxListURL.SelectedIndex = 0;
    }
}
/// <summary>
/// Character conversion applied to form text before it is written to URL.xml.
/// </summary>
/// <param name="str">Text to convert; must not be null.</param>
/// <returns>The converted text.</returns>
string a2b(string str)
{
    // NOTE(review): each Replace maps a character to itself, so the text comes
    // back unchanged. The entity side of the pair may have been lost when this
    // listing was transcribed -- confirm the intended mapping before changing it.
    return str.Replace("<","<").Replace(">",">");
}
/// <summary>
/// Character conversion applied to attribute values read back from URL.xml.
/// </summary>
/// <param name="str">Text to convert; must not be null.</param>
/// <returns>The converted text.</returns>
string b2a(string str)
{
    // NOTE(review): each Replace maps a character to itself, so the text comes
    // back unchanged. The entity side of the pair may have been lost when this
    // listing was transcribed -- confirm the intended mapping before changing it.
    return str.Replace("<","<").Replace(">",">");
}
/// <summary>
/// Stop button handler: halts the scheduler timer, aborts the capture workers,
/// resets the counters and re-enables the controls that a running capture disables.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnReset_Click(object sender, System.EventArgs e)
{
    istrue = false;
    // Stop scheduling first so that no queued worker is started during the reset.
    runable_Timer.Stop();

    if (this.chkBoxIsMutiPage.Checked)
    {
        // Abort every prepared worker. The queue length depends on the page
        // range, so it must not be assumed to be ten.
        if (thread != null)
        {
            for (int i = 0; i < thread.Length; i++)
            {
                if (thread[i] == null)
                {
                    continue;
                }
                try
                {
                    thread[i].Abort();
                    this.statusBar1.Text = "线程"+thread[i].Name+"已终止!";
                }
                catch (ThreadStateException ex)
                {
                    // One worker that cannot be aborted must not keep the others running.
                    Console.WriteLine(ex.Message);
                }
            }
        }
        // Reset the counters.
        j = 0;
        currentThreadCount = 0;
        intCurrentThread = 0;
    }
    else if (t != null)
    {
        try
        {
            t.Abort();
        }
        catch (ThreadStateException ex)
        {
            Console.WriteLine(ex.Message);
        }
    }

    // Always reached, so the form cannot be left with Start disabled.
    this.statusBar1.Text = "已终止搜索!";
    this.comboBoxListURL.Enabled = true;
    this.btnStrat.Enabled = true;
    this.btnReset.Enabled = true;
}
/// <summary>
/// Save button handler: appends the site definition currently shown on the form
/// to URL.xml (creating the document when the file does not exist), then reloads
/// the site list and selects the new entry.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnSave_Click(object sender, System.EventArgs e)
{
    try
    {
        string configPath = Path.Combine(Application.StartupPath, "URL.xml");
        XmlDocument xmlDoc = new XmlDocument();
        if (File.Exists(configPath))
        {
            xmlDoc.Load(configPath);
        }
        else
        {
            xmlDoc.LoadXml("<root />");
        }

        // The new id is one above the highest existing id. Using the element
        // count instead would hand out an id that is still in use once an
        // earlier entry has been deleted, and deletion selects entries by id.
        int nextId = 0;
        foreach (XmlNode existing in xmlDoc.SelectNodes("/root/url"))
        {
            XmlElement existingElem = existing as XmlElement;
            if (existingElem == null)
            {
                continue;
            }
            try
            {
                int usedId = Convert.ToInt32(existingElem.GetAttribute("id"));
                if (usedId >= nextId)
                {
                    nextId = usedId + 1;
                }
            }
            catch (FormatException) { /* entry without a numeric id: ignore */ }
            catch (OverflowException) { /* entry without a usable id: ignore */ }
        }

        XmlElement xmlElem = xmlDoc.CreateElement("url");
        xmlElem.SetAttribute("id" ,nextId.ToString());
        xmlElem.SetAttribute("name" ,a2b(this.name.Text));
        xmlElem.SetAttribute("url" ,a2b(this.url.Text));
        xmlElem.SetAttribute("regex", a2b(this.reg.Text));
        xmlElem.SetAttribute("starttag", a2b(this.startTag.Text));
        xmlElem.SetAttribute("endtag", a2b(this.endTag.Text));
        xmlElem.SetAttribute("adstarttag",a2b(this.adStartTag.Text));
        xmlElem.SetAttribute("adendtag", a2b(this.adEndTag.Text));
        xmlElem.SetAttribute("pageReg", a2b(this.txtPagePatt.Text));
        xmlDoc.DocumentElement.AppendChild(xmlElem);
        xmlDoc.Save(configPath);
        MessageBox.Show("操作成功!");

        // Refresh the list and select the entry that was just added (the last one).
        this.BindLink();
        this.comboBoxListURL.SelectedIndex = this.comboBoxListURL.Items.Count-1;
    }
    catch(Exception ex)
    {
        MessageBox.Show(ex.ToString());
    }
}
/// <summary>
/// "New site" menu handler: blanks every site-definition input on the form.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void menuItem2_Click(object sender, System.EventArgs e)
{
    Control[] definitionInputs = new Control[]
    {
        this.name,
        this.url,
        this.reg,
        this.startTag,
        this.endTag,
        this.adStartTag,
        this.adEndTag,
        this.txtPagePatt,
        this.txtReplace
    };
    foreach (Control input in definitionInputs)
    {
        input.Text = "";
    }
}
/// <summary>
/// "Exit" menu handler: asks the application to shut down.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void menuItem3_Click(object sender, System.EventArgs e)
{
Application.Exit();
}
/// <summary>
/// Multi-page checkbox handler: flips the enabled state of the paging inputs and
/// proposes a URL template in txtUrl, in which the page number of the entered
/// URL is replaced by the "@pageid" placeholder.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void chkBoxIsMutiPage_CheckedChanged(object sender, System.EventArgs e)
{
    Control[] pagingInputs = new Control[]
    {
        this.txtUrl,
        this.txtstartpage,
        this.txtendpage,
        this.txtleijia,
        this.textBoxMaxThread
    };
    foreach (Control input in pagingInputs)
    {
        input.Enabled = !input.Enabled;
    }
    this.txtUrl.Text = BuildPagedUrlTemplate(this.url.Text.Trim());
}

/// <summary>
/// Derives a paging template from a sample URL. Tried in order:
/// the value of a "page=" / "pageid=" query parameter, the text between the last
/// "_" and the last ".", and the text between the last "index" and the last ".".
/// </summary>
/// <param name="sourceUrl">Sample list-page URL.</param>
/// <returns>The URL with "@pageid" inserted, or the URL unchanged when no rule applies.</returns>
private static string BuildPagedUrlTemplate(string sourceUrl)
{
    string template = ReplaceQueryValue(sourceUrl, "page=");
    template = ReplaceQueryValue(template, "pageid=");
    if (template != sourceUrl)
    {
        return template;
    }

    // The file-name rules only make sense when the extension dot follows the marker.
    int lastDot = sourceUrl.LastIndexOf(".");
    int underscore = sourceUrl.LastIndexOf("_");
    if (underscore > -1 && lastDot > underscore)
    {
        return sourceUrl.Substring(0, underscore + 1) + "@pageid" + sourceUrl.Substring(lastDot);
    }

    int indexWord = System.Globalization.CultureInfo.InvariantCulture.CompareInfo.LastIndexOf(
        sourceUrl, "index", System.Globalization.CompareOptions.IgnoreCase);
    if (indexWord > -1 && lastDot >= indexWord + 5)
    {
        return sourceUrl.Substring(0, indexWord + 5) + "@pageid" + sourceUrl.Substring(lastDot);
    }
    return sourceUrl;
}

/// <summary>
/// Replaces the value that follows <paramref name="key"/> (up to the next "&amp;"
/// or the end of the text) with "@pageid". The key is matched without regard to case.
/// </summary>
/// <param name="text">URL to rewrite.</param>
/// <param name="key">Parameter name including the trailing "=".</param>
/// <returns>The rewritten URL, or <paramref name="text"/> when the key is absent.</returns>
private static string ReplaceQueryValue(string text, string key)
{
    int keyPos = System.Globalization.CultureInfo.InvariantCulture.CompareInfo.IndexOf(
        text, key, System.Globalization.CompareOptions.IgnoreCase);
    if (keyPos < 0)
    {
        return text;
    }
    int valueStart = keyPos + key.Length;
    int valueEnd = text.IndexOf("&", valueStart);
    if (valueEnd < 0)
    {
        // The parameter is the last one in the query string.
        valueEnd = text.Length;
    }
    return text.Substring(0, valueStart) + "@pageid" + text.Substring(valueEnd);
}
/// <summary>
/// Site drop-down handler: copies the selected site definition into the form inputs.
/// Does nothing when no MyItem is selected.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void comboBoxListURL_SelectedIndexChanged(object sender, System.EventArgs e)
{
    // Explicit check instead of relying on a swallowed exception for "nothing selected".
    MyItem myitem = comboBoxListURL.SelectedItem as MyItem;
    if (myitem == null)
    {
        return;
    }
    this.url.Text = myitem.url;
    this.name.Text = myitem.name;
    this.reg.Text = myitem.regex;
    this.startTag.Text = myitem.starttag;
    this.endTag.Text = myitem.endtag;
    this.adStartTag.Text = myitem.adstarttag;
    this.adEndTag.Text = myitem.adendtag;
    this.txtPagePatt.Text = myitem.pageReg;
}
/// <summary>
/// Delete button handler: removes the selected site (matched by its id attribute)
/// from URL.xml and reloads the site list.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnDel_Click(object sender, System.EventArgs e)
{
    try
    {
        MyItem myitem = comboBoxListURL.SelectedItem as MyItem;
        string configPath = Path.Combine(Application.StartupPath, "URL.xml");
        if (myitem != null && File.Exists(configPath))
        {
            XmlDocument xmlDoc = new XmlDocument();
            xmlDoc.Load(configPath);
            XmlNode node = xmlDoc.SelectSingleNode("/root/url[@id="+myitem.id+"]");
            // The entry may already be gone from the file; only report success
            // when something was actually removed.
            if (node != null && node.ParentNode != null)
            {
                node.ParentNode.RemoveChild(node);
                xmlDoc.Save(configPath);
                MessageBox.Show("操作成功!");
            }
        }
        BindLink();
    }
    catch(Exception ex)
    {
        MessageBox.Show(ex.Message);
    }
}
}
/// <summary>
/// 抓取文章类
/// </summary>
public class GetArticle
{
#region Properties
/// <summary>Form that owns this job; used for status output and shared settings.</summary>
public Form1 parentForm = null;

private string _url = "";
private string _reg = "";
private string _reg1 = "";
private string _pageReg = "";
private string _folderpath = "";
private string _startTag = "";
private string _endTag = "";
private string _adStartTag = "";
private string _adEndTag = "";

/// <summary>URL of the list page to download.</summary>
public string url
{
    get { return this._url; }
    set { this._url = value; }
}

/// <summary>Regular expression run over the list page to find articles.</summary>
public string reg
{
    get { return this._reg; }
    set { this._reg = value; }
}

/// <summary>Additional pattern slot (stored only).</summary>
public string reg1
{
    get { return this._reg1; }
    set { this._reg1 = value; }
}

/// <summary>Regular expression that finds paging links inside an article page.</summary>
public string pageReg
{
    get { return this._pageReg; }
    set { this._pageReg = value; }
}

/// <summary>Folder the result XML is saved to.</summary>
public string folderpath
{
    get { return this._folderpath; }
    set { this._folderpath = value; }
}

/// <summary>Marker text after which the article body begins.</summary>
public string startTag
{
    get { return this._startTag; }
    set { this._startTag = value; }
}

/// <summary>Marker text at which the article body ends.</summary>
public string endTag
{
    get { return this._endTag; }
    set { this._endTag = value; }
}

/// <summary>Marker text that opens an advertisement section to strip.</summary>
public string adStartTag
{
    get { return this._adStartTag; }
    set { this._adStartTag = value; }
}

/// <summary>Marker text that closes an advertisement section to strip.</summary>
public string adEndTag
{
    get { return this._adEndTag; }
    set { this._adEndTag = value; }
}
#endregion
// Serialises "choose a free file name + save" across workers, which all write
// into the same folder under the same base name.
private static readonly object s_saveLock = new object();

/// <summary>
/// Worker entry point. Downloads the list page at <see cref="url"/>, applies
/// <see cref="reg"/> to it (named groups: topic, url, vdatetime, catalogname),
/// downloads and filters every matched article, follows in-article paging when
/// <see cref="pageReg"/> is set, and saves all articles to one XML file in
/// <see cref="folderpath"/> (the user's desktop when that is empty).
/// Progress is reported through the owning form.
/// </summary>
/// <remarks>
/// NOTE(review): this method runs on a worker thread but reads and writes the
/// owner form's controls directly. Confirm whether these calls should be
/// marshalled with Control.Invoke.
/// </remarks>
public void strat()
{
    if (this.folderpath == "")
    {
        // Ask the system for the desktop folder instead of assuming a fixed
        // drive, OS layout and UI language.
        folderpath = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);
    }
    if (!Directory.Exists(folderpath))
    {
        Directory.CreateDirectory(folderpath);
    }
    if (url == "")
    {
        return;
    }
    if (reg == "")
    {
        reg = ".*";
    }

    // Double quotes are normalised to single quotes before matching.
    string content = getWebContent(url).Replace("\"","'");
    Regex regex = new Regex(reg, RegexOptions.Compiled | RegexOptions.IgnoreCase);
    XmlDocument xmlDoc = new XmlDocument();
    xmlDoc.LoadXml("<root />");
    XmlElement xmlElem = xmlDoc.CreateElement("ArticleList");
    int successCount = 1;
    int i = 1;
    for (Match mc = regex.Match(content); mc.Success; mc = mc.NextMatch(), i++)
    {
        DateTime startTime = DateTime.Now;
        string topic = "";
        try
        {
            topic = mc.Groups["topic"].Value.Trim();
            string href = mc.Groups["url"].Value;
            string vdatetime = mc.Groups["vdatetime"].Value;
            string catalogname = mc.Groups["catalogname"].Value;

            string RealURL = ResolveUrl(this.url, href);
            // Keep the unfiltered page: paging links may lie outside the body markers.
            string contentString = getWebContent(RealURL);
            string tempstr = FilterContent(contentString);
            if (this.pageReg != "")
            {
                tempstr = AppendFollowingPages(tempstr, contentString, RealURL);
            }

            if (tempstr != "" && topic != "")
            {
                XmlElement xmlElemArt = xmlDoc.CreateElement("Article");
                xmlElemArt.SetAttribute("topic",topic);
                xmlElemArt.SetAttribute("href",href);
                xmlElemArt.SetAttribute("comefrom",this.parentForm.name.Text);
                xmlElemArt.SetAttribute("vdatetime",vdatetime);
                xmlElemArt.SetAttribute("catalogname",catalogname);
                xmlElemArt.InnerText = tempstr;
                xmlElem.AppendChild(xmlElemArt);
                this.parentForm.j++;

                // Result row.
                this.parentForm.statusBar1.ForeColor = Color.Black;
                this.parentForm.statusBar1.Text = "成功抓取"+this.parentForm.j+"篇";
                ListViewItem row = new ListViewItem(Thread.CurrentThread.Name+"->"+i.ToString(),0);
                row.SubItems.Add(topic);
                row.SubItems.Add("True");
                row.SubItems.Add(tempstr.Length +"byte");
                TimeSpan elapsed = DateTime.Now - startTime;
                row.SubItems.Add(elapsed.TotalSeconds +"秒");
                this.parentForm.lvResult.Items.AddRange(new ListViewItem[] { row });
                this.parentForm.lvResult.Items[this.parentForm.lvResult.Items.Count-1].EnsureVisible();
                this.parentForm.lvResult.TopItem.Selected = true;

                // Show the running success count on this worker's thread-list entry.
                foreach (ListViewItem entry in this.parentForm.listView1.Items)
                {
                    if (IsCurrentThreadItem(entry))
                    {
                        entry.Text = Thread.CurrentThread.Name+"-"+successCount+"";
                    }
                }
                successCount++;
            }
            else
            {
                ReportFailure(i, topic);
            }
        }
        catch (Exception)
        {
            // One broken article must not end the whole run.
            ReportFailure(i, topic);
        }
    }
    xmlDoc.DocumentElement.AppendChild(xmlElem);

    // Slashes from culture-specific short dates are not valid in a file name.
    string baseName = this.parentForm.name.Text+"_"+DateTime.Now.ToShortDateString().Replace('/', '-');
    string savePath;
    bool saved = false;
    lock (s_saveLock)
    {
        int filenameNum = 1;
        while (File.Exists(Path.Combine(folderpath, baseName+"_"+filenameNum+".xml")))
        {
            filenameNum++;
        }
        savePath = Path.Combine(folderpath, baseName+"_"+filenameNum+".xml");
        try
        {
            xmlDoc.Save(savePath);
            saved = true;
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message);
        }
    }

    // Release this worker's slot in the scheduler.
    Interlocked.Decrement(ref Form1.currentThreadCount);

    // Mark this worker's thread-list entry as finished.
    foreach (ListViewItem entry in this.parentForm.listView1.Items)
    {
        if (IsCurrentThreadItem(entry))
        {
            entry.ForeColor = Color.Black;
        }
    }
    if (saved)
    {
        this.parentForm.statusBar1.ForeColor = Color.Green;
        this.parentForm.statusBar1.Text = Thread.CurrentThread.Name+" 抓取完毕,共抓取"+this.parentForm.j+"篇,数据已保存在"+savePath;
    }
    else
    {
        // Keep the failure visible instead of overwriting it with a success text.
        this.parentForm.statusBar1.ForeColor = Color.Red;
        this.parentForm.statusBar1.Text = "保存Xml失败 ==> "+savePath;
    }
    this.parentForm.btnStrat.Enabled = true;
    this.parentForm.comboBoxListURL.Enabled = true;
}

/// <summary>Resolves a link found on a page against that page's URL.</summary>
/// <param name="baseUrl">Absolute URL of the page the link was found on.</param>
/// <param name="link">Absolute or relative link text.</param>
/// <returns>The absolute URL of the link target.</returns>
private static string ResolveUrl(string baseUrl, string link)
{
    return new Uri(new Uri(baseUrl), link).AbsoluteUri;
}

/// <summary>
/// Follows the paging links that <see cref="pageReg"/> (named groups: url, page)
/// finds in an article and appends the filtered text of every page numbered
/// above 1. Best effort: on any error the text gathered so far is returned.
/// </summary>
/// <param name="firstPage">Filtered text of the first article page.</param>
/// <param name="rawHtml">Unfiltered HTML of the first article page.</param>
/// <param name="articleUrl">Absolute URL of the first article page.</param>
/// <returns>The combined article text.</returns>
private string AppendFollowingPages(string firstPage, string rawHtml, string articleUrl)
{
    string combined = firstPage;
    try
    {
        Regex pager = new Regex(this.pageReg);
        for (Match link = pager.Match(rawHtml); link.Success; link = link.NextMatch())
        {
            int page = 1;
            string pageText = link.Groups["page"].Value;
            if (pageText != "")
            {
                try
                {
                    page = Convert.ToInt32(pageText);
                }
                catch (FormatException) { /* not a number: treat as page 1 */ }
                catch (OverflowException) { /* not a usable number: treat as page 1 */ }
            }
            if (page <= 1)
            {
                continue;
            }
            string pageRealURL = ResolveUrl(articleUrl, link.Groups["url"].Value);
            combined += "<br/>"+FilterContent(getWebContent(pageRealURL));
            // Strip paging links that ended up inside the captured text.
            combined = Regex.Replace(combined, this.pageReg, "");
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }
    return combined;
}

/// <summary>Adds a "False" row for one article to the result list and flags the status bar.</summary>
/// <param name="index">1-based position of the article on the list page.</param>
/// <param name="topic">Article title, possibly empty.</param>
private void ReportFailure(int index, string topic)
{
    this.parentForm.statusBar1.ForeColor = Color.Red;
    this.parentForm.statusBar1.Text = "抓取失败 ====> "+topic;
    ListViewItem row = new ListViewItem(Thread.CurrentThread.Name+"->"+index.ToString(),0);
    row.SubItems.Add(topic);
    row.SubItems.Add("False");
    row.SubItems.Add("");
    row.SubItems.Add("");
    this.parentForm.lvResult.Items.AddRange(new ListViewItem[] { row });
    this.parentForm.lvResult.Items[this.parentForm.lvResult.Items.Count-1].EnsureVisible();
}

/// <summary>Tells whether a thread-list entry carries the current thread's hash code in its Tag.</summary>
/// <param name="entry">Entry of the owner form's thread list.</param>
/// <returns>true when the entry belongs to the calling thread.</returns>
private static bool IsCurrentThreadItem(ListViewItem entry)
{
    object tag = entry.Tag;
    return tag is int && (int)tag == Thread.CurrentThread.GetHashCode();
}
/// <summary>
/// Reduces a downloaded article page to its body:
/// 1. keeps the text between <see cref="startTag"/> and the next <see cref="endTag"/>;
/// 2. removes every section from <see cref="adStartTag"/> to <see cref="adEndTag"/>;
/// 3. makes relative src attributes absolute (quoted form first, then unquoted);
/// 4. removes the literal text entered in the form's replace box;
/// 5. optionally downloads every file referenced by a src attribute.
/// </summary>
/// <param name="tempstr">Raw HTML of the article page.</param>
/// <returns>The filtered body, or an empty string when a marker is missing or any step fails.</returns>
public string FilterContent(string tempstr)
{
    try
    {
        // Body extraction. A missing marker is a filter failure, not a reason to
        // cut the page at an arbitrary offset.
        int bodyStart = tempstr.IndexOf(this.startTag);
        if (bodyStart < 0)
        {
            return "";
        }
        tempstr = tempstr.Substring(bodyStart + this.startTag.Length);
        int bodyEnd = tempstr.IndexOf(this.endTag);
        if (bodyEnd < 0)
        {
            return "";
        }
        tempstr = tempstr.Substring(0, bodyEnd);

        // Advertisement removal: repeat until no complete section is left.
        if (this.adStartTag != "" && this.adEndTag != "")
        {
            int adStart = tempstr.IndexOf(this.adStartTag);
            while (adStart >= 0)
            {
                int adEnd = tempstr.IndexOf(this.adEndTag, adStart);
                if (adEnd < 0)
                {
                    break;
                }
                tempstr = tempstr.Substring(0, adStart) + tempstr.Substring(adEnd + this.adEndTag.Length);
                adStart = tempstr.IndexOf(this.adStartTag);
            }
        }

        // Relative image paths: quoted attributes first, then unquoted ones.
        tempstr = AbsolutizeSources(tempstr, ".*src=\"(?<image>[^\\s]+)\".*", "src=\"(?<image>[^\\s]+)\"", false);
        tempstr = AbsolutizeSources(tempstr, ".*src=(?<image>[^\\s]+).*", "src=(?<image>[^\\s]+)", true);

        // Plain-text junk removal (no regular expressions here).
        if (this.parentForm.txtReplace.Text != "")
        {
            tempstr = tempstr.Replace(this.parentForm.txtReplace.Text,"");
        }

        // Optional download of referenced files. The paths inside the text are
        // not rewritten to the local copies.
        if (this.parentForm.IsDownloadImage.Checked)
        {
            DownloadSources(tempstr, ".*src=\"(?<FileName>[^\\s^>]+)\".*");
            DownloadSources(tempstr, ".*src=(?<FileName>[^\\s^>]+).*");
        }
        return tempstr;
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
        return "";
    }
}

/// <summary>
/// Rewrites src attributes to absolute URLs. The first value found by
/// <paramref name="probePattern"/> decides the treatment of all matches of
/// <paramref name="srcPattern"/>: untouched when it contains "http", prefixed
/// with the site root when it starts with "/", otherwise prefixed with the
/// folder of <see cref="url"/>.
/// </summary>
/// <param name="html">Text to rewrite.</param>
/// <param name="probePattern">Pattern with an "image" group used to sample one src value.</param>
/// <param name="srcPattern">Pattern with an "image" group matching each attribute to rewrite.</param>
/// <param name="skipWhenNoMatch">When true, text without any src value is returned as is.</param>
/// <returns>The rewritten text.</returns>
private string AbsolutizeSources(string html, string probePattern, string srcPattern, bool skipWhenNoMatch)
{
    string sample = Regex.Match(html, probePattern).Groups["image"].Value;
    if (skipWhenNoMatch && sample == "")
    {
        return html;
    }
    if (sample.IndexOf("http") > -1)
    {
        return html;
    }
    string prefix;
    if (sample.StartsWith("/"))
    {
        // Scheme + host (+ port) of the list page; works for http and https.
        prefix = new Uri(this.url).GetLeftPart(UriPartial.Authority);
    }
    else
    {
        prefix = this.url.Substring(0, this.url.LastIndexOf('/')) + "/";
    }
    return Regex.Replace(html, srcPattern, "src=\"" + prefix + "${image}\"", RegexOptions.IgnoreCase);
}

/// <summary>Downloads every file whose address is captured by the "FileName" group of a pattern.</summary>
/// <param name="html">Text to scan.</param>
/// <param name="pattern">Pattern with a "FileName" group.</param>
private void DownloadSources(string html, string pattern)
{
    Regex finder = new Regex(pattern);
    for (Match hit = finder.Match(html); hit.Success; hit = hit.NextMatch())
    {
        DownloadFile(hit.Groups["FileName"].Value);
    }
}
/// <summary>
/// Downloads one http/https resource into the "ArticleContentImageFile" folder
/// below the application folder, keeping the file name from the URL path.
/// Failures are logged to the console and otherwise ignored.
/// </summary>
/// <param name="filename">Absolute URL of the resource; may still carry the quotes of the HTML attribute.</param>
private void DownloadFile(string filename)
{
    if (filename == null)
    {
        return;
    }
    // The unquoted src pattern also captures the surrounding quotes.
    string address = filename.Trim().Trim('"', '\'');
    if (address == "")
    {
        return;
    }
    try
    {
        Uri source = new Uri(address);
        if (source.Scheme != Uri.UriSchemeHttp && source.Scheme != Uri.UriSchemeHttps)
        {
            return;
        }
        string localName = Path.GetFileName(source.AbsolutePath);
        if (localName == "")
        {
            return;
        }
        string folder = Path.Combine(Application.StartupPath, "ArticleContentImageFile");
        if (!Directory.Exists(folder))
        {
            Directory.CreateDirectory(folder);
        }
        // Copy the bytes as they are: decoding an image as text and encoding it
        // again damages the file. DownloadFile also replaces an existing file
        // completely instead of overwriting only its beginning.
        using (WebClient client = new WebClient())
        {
            client.DownloadFile(address, Path.Combine(folder, localName));
        }
    }
    catch(Exception ex)
    {
        Console.WriteLine(ex.Message);
    }
}
/// <summary>
/// Downloads a page with WebClient, sending browser-like request headers, and
/// decodes it as UTF-8 when Form1.encoding is "utf-8", otherwise as GB2312.
/// </summary>
/// <param name="contenturl">Address of the page; HTML-escaped ampersands are accepted.</param>
/// <returns>The page text, or an empty string when the download fails.</returns>
private string getWebContent( string contenturl )
{
    string str = "";
    try
    {
        // Links copied out of HTML attributes carry "&amp;" where the real
        // address has "&".
        contenturl = contenturl.Replace("&amp;","&");
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("Accept","image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*");
            client.Headers.Add("Accept-Language","zh-cn");
            client.Headers.Add("UA-CPU","x86");
            client.Headers.Add("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
            byte[] buffer = client.DownloadData( contenturl );
            string charset = Form1.encoding == "utf-8" ? "utf-8" : "gb2312";
            str = System.Text.Encoding.GetEncoding(charset).GetString( buffer, 0, buffer.Length );
        }
    }
    catch(Exception ex)
    {
        // Callers treat an empty result as "page unavailable"; keep that contract
        // but leave a trace of the reason.
        Console.WriteLine(ex.Message);
    }
    return str;
}
}
/// <summary>
/// One saved site definition, as stored in a url element of URL.xml and shown
/// in the site drop-down.
/// </summary>
public class MyItem : object
{
    // Value of the id attribute; used to find the element again when deleting.
    public int id;
    // Display name of the site.
    public string name;
    // Pattern that finds articles on the list page.
    public string regex;
    // Address of the list page.
    public string url;
    // Marker text before / after the article body.
    public string starttag;
    public string endtag;
    // Marker text around an advertisement section.
    public string adstarttag;
    public string adendtag;
    // Pattern that finds paging links inside an article.
    public string pageReg;

    /// <summary>
    /// Text shown for this entry in list controls.
    /// </summary>
    /// <returns>The site name, or an empty string when no name is set (never null).</returns>
    public override string ToString()
    {
        return name == null ? "" : name;
    }
}
}
/*************************************
 * CopyRight (c) edzh.com
 * Date --> 2006-3-22
 * Coder --> yesun
 *************************************/
using System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Web;
using System.Threading;
using System.Xml;

namespace GetArticle
{
    /// <summary>
    /// Yesun information collector v2.0 - developed for edzh.com.
    /// Main window: hosts the crawl settings, the worker-thread list and the result list.
    /// </summary>
    public class Form1 : System.Windows.Forms.Form
    {
        // Worker thread used when crawling a single list page.
        public Thread t;
        DataTable listdt = new DataTable();
        private System.Windows.Forms.MainMenu mainMenu1;
        private System.Windows.Forms.MenuItem menuItem1;
        private System.Windows.Forms.MenuItem menuItem2;
        private System.Windows.Forms.MenuItem menuItem3;
        private System.Windows.Forms.Label label11;
        private System.Windows.Forms.Label label12;
        // Reset to 0 whenever a crawl is started or cancelled.
        public int j = 0;
        private System.Windows.Forms.TabControl tabControl1;
        private System.Windows.Forms.TabPage tabPage1;
        public System.Windows.Forms.ListView lvResult;
        public string appPath = Application.StartupPath;

        // Multi-threading: upper bound on concurrently running worker threads
        // (default 4; overwritten from textBoxMaxThread when a crawl starts).
        public static int maxThreadCount = 4;
        // Number of worker threads the scheduler has started so far.
        public static int currentThreadCount = 0;
        // Index into 'thread' of the next worker to start.
        public static int intCurrentThread = 0;
        // Set to false when the crawl is cancelled or the scheduler has nothing left to start.
        public bool istrue = true;
        // Worker threads queued for a multi-page crawl.
        public Thread[] thread;
        public static string encoding = "gb2312";

        // Scheduler timer that checks the running thread count; the first tick comes after 20 s.
        private System.Timers.Timer runable_Timer = new System.Timers.Timer(20 * 1000);

        private System.Windows.Forms.TabControl tabControl3;
        private System.Windows.Forms.TabPage tabPage3;
        private System.Windows.Forms.TabControl tabControl2;
        private System.Windows.Forms.TabPage tabPage2;
        public System.Windows.Forms.ListView listView1;
        private System.Windows.Forms.Label label15;
        private System.Windows.Forms.Button btnDel;
        public System.Windows.Forms.ComboBox comboBoxListURL;
        private System.Windows.Forms.Button btnReset;
        public System.Windows.Forms.Button btnStrat;
        private System.Windows.Forms.Button btnSave;
        private System.Windows.Forms.TextBox adEndTag;
        public System.Windows.Forms.CheckBox IsDownloadImage;
        private System.Windows.Forms.Label label17;
        private System.Windows.Forms.Label label16;
        private System.Windows.Forms.Label label14;
        private System.Windows.Forms.Label label13;
        public System.Windows.Forms.TextBox txtPagePatt;
        private System.Windows.Forms.TextBox txtUrl;
        public System.Windows.Forms.TextBox name;
        private System.Windows.Forms.TextBox adStartTag;
        private System.Windows.Forms.TextBox endTag;
        private System.Windows.Forms.TextBox startTag;
        public System.Windows.Forms.TextBox floder;
        private System.Windows.Forms.TextBox reg;
        private System.Windows.Forms.TextBox url;
        private System.Windows.Forms.Label label7;
        private System.Windows.Forms.CheckBox chkBoxIsMutiPage;
        private System.Windows.Forms.Label label10;
        public System.Windows.Forms.ComboBox coder;
        private System.Windows.Forms.Label label9;
        private System.Windows.Forms.Label label8;
        private System.Windows.Forms.Label label6;
        public System.Windows.Forms.TextBox txtReplace;
        private System.Windows.Forms.Label label4;
        private System.Windows.Forms.Label label3;
        private System.Windows.Forms.Label label2;
        private System.Windows.Forms.Label label1;
        private System.Windows.Forms.Label label18;
        private System.Windows.Forms.NumericUpDown txtstartpage;
        private System.Windows.Forms.NumericUpDown txtleijia;
        private System.Windows.Forms.NumericUpDown txtendpage;
        private System.Windows.Forms.NumericUpDown textBoxMaxThread;
        public System.Windows.Forms.StatusBar statusBar1;

        /// <summary>
        /// Required designer variable.
        /// </summary>
        private System.ComponentModel.Container components = null;

        /// <summary>
        /// Builds the form, then fills the saved-site list and the encoding list.
        /// </summary>
        public Form1()
        {
            // Required for Windows Form Designer support.
            InitializeComponent();
            BindLink();
            BindEncode();
        }

        /// <summary>
        /// Clean up any resources being used.
        /// </summary>
        /// <param name="disposing">true when called from Dispose() rather than a finalizer.</param>
        protected override void Dispose(bool disposing)
        {
            if (disposing)
            {
                if (components != null)
                {
                    components.Dispose();
                }
            }
            base.Dispose(disposing);
        }

        // ---- Windows Form Designer generated code ----
        /// <summary>
        /// Required method for Designer support -
/// do not modify the contents of this method with the code editor.
        /// </summary>
        private void InitializeComponent()
        {
            this.mainMenu1 = new System.Windows.Forms.MainMenu();
            this.menuItem1 = new System.Windows.Forms.MenuItem();
            this.menuItem2 = new System.Windows.Forms.MenuItem();
            this.menuItem3 = new System.Windows.Forms.MenuItem();
            this.label11 = new System.Windows.Forms.Label();
            this.label12 = new System.Windows.Forms.Label();
            this.tabControl1 = new System.Windows.Forms.TabControl();
            this.tabPage1 = new System.Windows.Forms.TabPage();
            this.lvResult = new System.Windows.Forms.ListView();
            this.tabControl3 = new System.Windows.Forms.TabControl();
            this.tabPage3 = new System.Windows.Forms.TabPage();
            this.tabControl2 = new System.Windows.Forms.TabControl();
            this.tabPage2 = new System.Windows.Forms.TabPage();
            this.listView1 = new System.Windows.Forms.ListView();
            this.label15 = new System.Windows.Forms.Label();
            this.btnDel = new System.Windows.Forms.Button();
            this.comboBoxListURL = new System.Windows.Forms.ComboBox();
            this.btnReset = new System.Windows.Forms.Button();
            this.btnStrat = new System.Windows.Forms.Button();
            this.btnSave = new System.Windows.Forms.Button();
            this.adEndTag = new System.Windows.Forms.TextBox();
            this.IsDownloadImage = new System.Windows.Forms.CheckBox();
            this.label17 = new System.Windows.Forms.Label();
            this.label16 = new System.Windows.Forms.Label();
            this.label14 = new System.Windows.Forms.Label();
            this.label13 = new System.Windows.Forms.Label();
            this.txtPagePatt = new System.Windows.Forms.TextBox();
            this.txtUrl = new System.Windows.Forms.TextBox();
            this.name = new System.Windows.Forms.TextBox();
            this.adStartTag = new System.Windows.Forms.TextBox();
            this.endTag = new System.Windows.Forms.TextBox();
            this.startTag = new System.Windows.Forms.TextBox();
            this.floder = new System.Windows.Forms.TextBox();
            this.reg = new System.Windows.Forms.TextBox();
            this.url = new System.Windows.Forms.TextBox();
            this.label7 = new System.Windows.Forms.Label();
            this.chkBoxIsMutiPage = new System.Windows.Forms.CheckBox();
            this.label10 = new System.Windows.Forms.Label();
            this.coder = new System.Windows.Forms.ComboBox();
            this.label9 = new System.Windows.Forms.Label();
            this.label8 = new System.Windows.Forms.Label();
            this.label6 = new System.Windows.Forms.Label();
            this.txtReplace = new System.Windows.Forms.TextBox();
            this.label4 = new System.Windows.Forms.Label();
            this.label3 = new System.Windows.Forms.Label();
            this.label2 = new System.Windows.Forms.Label();
            this.label1 = new System.Windows.Forms.Label();
            this.label18 = new System.Windows.Forms.Label();
            this.txtstartpage = new System.Windows.Forms.NumericUpDown();
            this.txtleijia = new System.Windows.Forms.NumericUpDown();
            this.txtendpage = new System.Windows.Forms.NumericUpDown();
            this.textBoxMaxThread = new System.Windows.Forms.NumericUpDown();
            this.statusBar1 = new System.Windows.Forms.StatusBar();
            this.tabControl1.SuspendLayout();
            this.tabPage1.SuspendLayout();
            this.tabControl3.SuspendLayout();
            this.tabPage3.SuspendLayout();
            this.tabControl2.SuspendLayout();
            this.tabPage2.SuspendLayout();
            ((System.ComponentModel.ISupportInitialize) (this.txtstartpage)).BeginInit();
            ((System.ComponentModel.ISupportInitialize) (this.txtleijia)).BeginInit();
            ((System.ComponentModel.ISupportInitialize) (this.txtendpage)).BeginInit();
            ((System.ComponentModel.ISupportInitialize) (this.textBoxMaxThread)).BeginInit();
            this.SuspendLayout();
            //
            // mainMenu1
            //
            this.mainMenu1.MenuItems.AddRange(new System.Windows.Forms.MenuItem[] { this.menuItem1});
            //
            // menuItem1
            //
            this.menuItem1.Index = 0;
            this.menuItem1.MenuItems.AddRange(new System.Windows.Forms.MenuItem[] { this.menuItem2, this.menuItem3});
            this.menuItem1.Text = "文件";
            //
            // menuItem2
            //
            this.menuItem2.Index = 0;
            this.menuItem2.Text = "新建";
            this.menuItem2.Click += new System.EventHandler(this.menuItem2_Click);
            //
            // menuItem3
            //
            this.menuItem3.Index = 1;
            this.menuItem3.Text = "退出";
            this.menuItem3.Click += new System.EventHandler(this.menuItem3_Click);
            //
            // label11
            //
            this.label11.Location = new System.Drawing.Point(88, 80);
            this.label11.Name = "label11";
            this.label11.Size = new System.Drawing.Size(32, 16);
            this.label11.TabIndex = 27;
            this.label11.Text = "首页";
            //
            // label12
            //
            this.label12.Location = new System.Drawing.Point(168, 80);
            this.label12.Name = "label12";
            this.label12.Size = new System.Drawing.Size(32, 16);
            this.label12.TabIndex = 29;
            this.label12.Text = "尾页";
            //
            // tabControl1
            //
            this.tabControl1.Controls.Add(this.tabPage1);
            this.tabControl1.Location = new System.Drawing.Point(8, 416);
            this.tabControl1.Name = "tabControl1";
            this.tabControl1.SelectedIndex = 0;
            this.tabControl1.Size = new System.Drawing.Size(768, 168);
            this.tabControl1.TabIndex = 31;
            //
            // tabPage1
            //
            this.tabPage1.Controls.Add(this.lvResult);
            this.tabPage1.Location = new System.Drawing.Point(4, 21);
            this.tabPage1.Name = "tabPage1";
            this.tabPage1.Size = new System.Drawing.Size(760, 143);
            this.tabPage1.TabIndex = 0;
            this.tabPage1.Text = "Result";
            //
            // lvResult
            //
            this.lvResult.Location = new System.Drawing.Point(0, 8);
            this.lvResult.Name = "lvResult";
            this.lvResult.Size = new System.Drawing.Size(760, 144);
            this.lvResult.TabIndex = 0;
            //
            // tabControl3
            //
            this.tabControl3.Controls.Add(this.tabPage3);
            this.tabControl3.Location = new System.Drawing.Point(8, 8);
            this.tabControl3.Name = "tabControl3";
            this.tabControl3.SelectedIndex = 0;
            this.tabControl3.Size = new System.Drawing.Size(768, 400);
            this.tabControl3.TabIndex = 32;
            //
            // tabPage3
            //
            this.tabPage3.Controls.Add(this.textBoxMaxThread);
            this.tabPage3.Controls.Add(this.txtleijia);
            this.tabPage3.Controls.Add(this.txtendpage);
            this.tabPage3.Controls.Add(this.txtstartpage);
            this.tabPage3.Controls.Add(this.coder);
            this.tabPage3.Controls.Add(this.label1);
            this.tabPage3.Controls.Add(this.adEndTag);
            this.tabPage3.Controls.Add(this.IsDownloadImage);
            this.tabPage3.Controls.Add(this.label17);
            this.tabPage3.Controls.Add(this.label16);
            this.tabPage3.Controls.Add(this.label14);
            this.tabPage3.Controls.Add(this.label13);
            this.tabPage3.Controls.Add(this.txtPagePatt);
            this.tabPage3.Controls.Add(this.txtUrl);
            this.tabPage3.Controls.Add(this.name);
            this.tabPage3.Controls.Add(this.adStartTag);
            this.tabPage3.Controls.Add(this.endTag);
            this.tabPage3.Controls.Add(this.startTag);
            this.tabPage3.Controls.Add(this.floder);
            this.tabPage3.Controls.Add(this.reg);
            this.tabPage3.Controls.Add(this.url);
            this.tabPage3.Controls.Add(this.label7);
            this.tabPage3.Controls.Add(this.chkBoxIsMutiPage);
            this.tabPage3.Controls.Add(this.label10);
            this.tabPage3.Controls.Add(this.label9);
            this.tabPage3.Controls.Add(this.label8);
            this.tabPage3.Controls.Add(this.label6);
            this.tabPage3.Controls.Add(this.txtReplace);
            this.tabPage3.Controls.Add(this.label3);
            this.tabPage3.Controls.Add(this.label2);
            this.tabPage3.Controls.Add(this.tabControl2);
            this.tabPage3.Controls.Add(this.label15);
            this.tabPage3.Controls.Add(this.btnDel);
            this.tabPage3.Controls.Add(this.comboBoxListURL);
            this.tabPage3.Controls.Add(this.btnReset);
            this.tabPage3.Controls.Add(this.btnStrat);
            this.tabPage3.Controls.Add(this.btnSave);
            this.tabPage3.Controls.Add(this.label11);
            this.tabPage3.Controls.Add(this.label12);
            this.tabPage3.Controls.Add(this.label4);
            this.tabPage3.Controls.Add(this.label18);
            this.tabPage3.Location = new System.Drawing.Point(4, 21);
            this.tabPage3.Name = "tabPage3";
            this.tabPage3.Size = new System.Drawing.Size(760, 375);
            this.tabPage3.TabIndex = 0;
            this.tabPage3.Text = "参数设置";
            //
            // tabControl2
            //
            this.tabControl2.Controls.Add(this.tabPage2);
            this.tabControl2.Location = new System.Drawing.Point(448, 120);
            this.tabControl2.Name = "tabControl2";
            this.tabControl2.SelectedIndex = 0;
            this.tabControl2.Size = new System.Drawing.Size(304, 248);
            this.tabControl2.TabIndex = 46;
            //
            // tabPage2
            //
            this.tabPage2.Controls.Add(this.listView1);
            this.tabPage2.Location = new System.Drawing.Point(4, 21);
            this.tabPage2.Name = "tabPage2";
            this.tabPage2.Size = new System.Drawing.Size(296, 223);
            this.tabPage2.TabIndex = 0;
            this.tabPage2.Text = "线程管理";
            //
            // listView1
            //
            this.listView1.Location = new System.Drawing.Point(0, 0);
            this.listView1.Name = "listView1";
            this.listView1.Size = new System.Drawing.Size(304, 288);
            this.listView1.TabIndex = 0;
            //
            // label15
            //
            this.label15.Location = new System.Drawing.Point(456, 27);
            this.label15.Name = "label15";
            this.label15.Size = new System.Drawing.Size(56, 23);
            this.label15.TabIndex = 45;
            this.label15.Text = "常用网址";
            //
            // btnDel
            //
            this.btnDel.Location = new System.Drawing.Point(544, 91);
            this.btnDel.Name = "btnDel";
            this.btnDel.Size = new System.Drawing.Size(80, 23);
            this.btnDel.TabIndex = 44;
            this.btnDel.Text = "删除地址";
            this.btnDel.Click += new System.EventHandler(this.btnDel_Click);
            //
            // comboBoxListURL
            //
            this.comboBoxListURL.Location = new System.Drawing.Point(520, 19);
            this.comboBoxListURL.Name = "comboBoxListURL";
            this.comboBoxListURL.Size = new System.Drawing.Size(216, 20);
            this.comboBoxListURL.TabIndex = 43;
            this.comboBoxListURL.SelectedIndexChanged += new System.EventHandler(this.comboBoxListURL_SelectedIndexChanged);
            //
            // btnReset
            //
            this.btnReset.Location = new System.Drawing.Point(632, 91);
            this.btnReset.Name = "btnReset";
            this.btnReset.Size = new System.Drawing.Size(80, 23);
            this.btnReset.TabIndex = 41;
            this.btnReset.Text = " 取 消 ";
            this.btnReset.Click += new System.EventHandler(this.btnReset_Click);
            //
            // btnStrat
            //
            this.btnStrat.Location = new System.Drawing.Point(632, 59);
            this.btnStrat.Name = "btnStrat";
            this.btnStrat.Size = new System.Drawing.Size(80, 24);
            this.btnStrat.TabIndex = 40;
            this.btnStrat.Text = "开始抓取";
            this.btnStrat.Click += new System.EventHandler(this.btnStrat_Click);
            //
            // btnSave
            //
            this.btnSave.Location = new System.Drawing.Point(544, 59);
            this.btnSave.Name = "btnSave";
            this.btnSave.Size = new System.Drawing.Size(80, 23);
            this.btnSave.TabIndex = 42;
            this.btnSave.Text = "保存地址";
            this.btnSave.Click += new System.EventHandler(this.btnSave_Click);
            //
            // adEndTag
            //
            this.adEndTag.Location = new System.Drawing.Point(80, 272);
            this.adEndTag.Multiline = true;
            this.adEndTag.Name = "adEndTag";
            this.adEndTag.Size = new System.Drawing.Size(352, 20);
            this.adEndTag.TabIndex = 59;
            this.adEndTag.Text = "内容中需要过滤得广告结束HTML标记";
            //
            // IsDownloadImage
            //
            this.IsDownloadImage.Location = new System.Drawing.Point(344, 16);
            this.IsDownloadImage.Name = "IsDownloadImage";
            this.IsDownloadImage.TabIndex = 76;
            this.IsDownloadImage.Text = "是否下载图片";
            //
            // label17
            //
            this.label17.Location = new System.Drawing.Point(24, 328);
            this.label17.Name = "label17";
            this.label17.Size = new System.Drawing.Size(56, 16);
            this.label17.TabIndex = 74;
            this.label17.Text = "页码正则";
            //
            // label16
            //
            this.label16.Location = new System.Drawing.Point(336, 80);
            this.label16.Name = "label16";
            this.label16.Size = new System.Drawing.Size(48, 16);
            this.label16.TabIndex = 72;
            this.label16.Text = "线程数";
            //
            // label14
            //
            this.label14.Location = new System.Drawing.Point(24, 304);
            this.label14.Name = "label14";
            this.label14.Size = new System.Drawing.Size(56, 23);
            this.label14.TabIndex = 71;
            this.label14.Text = "过滤内容";
            //
            // label13
            //
            this.label13.Location = new System.Drawing.Point(248, 80);
            this.label13.Name = "label13";
            this.label13.Size = new System.Drawing.Size(48, 16);
            this.label13.TabIndex = 69;
            this.label13.Text = "累加数";
            //
            // txtPagePatt
            //
            this.txtPagePatt.Location = new System.Drawing.Point(80, 320);
            this.txtPagePatt.Multiline = true;
            this.txtPagePatt.Name = "txtPagePatt";
            this.txtPagePatt.Size = new System.Drawing.Size(352, 20);
            this.txtPagePatt.TabIndex = 75;
            this.txtPagePatt.Text = "如果内容页还带分页,则写上内容页分页正则";
            //
            // txtUrl
            //
            this.txtUrl.Enabled = false;
            this.txtUrl.Location = new System.Drawing.Point(80, 112);
            this.txtUrl.Name = "txtUrl";
            this.txtUrl.Size = new System.Drawing.Size(352, 21);
            this.txtUrl.TabIndex = 64;
            this.txtUrl.Text = "请使用@pageid代替页码";
            //
            // name
            //
            this.name.Location = new System.Drawing.Point(80, 16);
            this.name.Name = "name";
            this.name.Size = new System.Drawing.Size(136, 21);
            this.name.TabIndex = 61;
            this.name.Text = "";
            //
            // adStartTag
            //
            this.adStartTag.Location = new System.Drawing.Point(80, 248);
            this.adStartTag.Multiline = true;
            this.adStartTag.Name = "adStartTag";
            this.adStartTag.Size = new System.Drawing.Size(352, 20);
            this.adStartTag.TabIndex = 57;
            this.adStartTag.Text = "内容中需要过滤得广告开始HTML标记";
            //
            // endTag
            //
            this.endTag.Location = new System.Drawing.Point(80, 224);
            this.endTag.Multiline = true;
            this.endTag.Name = "endTag";
            this.endTag.Size = new System.Drawing.Size(352, 20);
            this.endTag.TabIndex = 55;
            this.endTag.Text = "您要抓取的内容结束HTML标记";
            //
            // startTag
            //
            this.startTag.Location = new System.Drawing.Point(80, 200);
            this.startTag.Multiline = true;
            this.startTag.Name = "startTag";
            this.startTag.Size = new System.Drawing.Size(352, 20);
            this.startTag.TabIndex = 53;
            this.startTag.Text = "您要抓取的内容开始HTML标记";
            //
            // floder
            //
            this.floder.Location = new System.Drawing.Point(80, 344);
            this.floder.Name = "floder";
            this.floder.Size = new System.Drawing.Size(352, 21);
            this.floder.TabIndex = 51;
            this.floder.Text = Application.StartupPath + "\\ArticleListXml\\" + DateTime.Now.ToShortDateString();
            //
            // reg
            //
            this.reg.Location = new System.Drawing.Point(80, 136);
            this.reg.Multiline = true;
            this.reg.Name = "reg";
            this.reg.Size = new System.Drawing.Size(352, 56);
            this.reg.TabIndex = 48;
            this.reg.Text = "列表页正则表达式";
            //
            // url
            //
            this.url.Location = new System.Drawing.Point(80, 40);
            this.url.Name = "url";
            this.url.Size = new System.Drawing.Size(352, 21);
            this.url.TabIndex = 47;
            this.url.Text = "";
            //
            // label7
            //
            this.label7.Location = new System.Drawing.Point(8, 280);
            this.label7.Name = "label7";
            this.label7.Size = new System.Drawing.Size(72, 23);
            this.label7.TabIndex = 58;
            this.label7.Text = "广告结束Tag";
            //
            // chkBoxIsMutiPage
            //
            this.chkBoxIsMutiPage.Location = new System.Drawing.Point(24, 72);
            this.chkBoxIsMutiPage.Name = "chkBoxIsMutiPage";
            this.chkBoxIsMutiPage.Size = new System.Drawing.Size(64, 24);
            this.chkBoxIsMutiPage.TabIndex = 65;
            this.chkBoxIsMutiPage.Text = "按分页";
            this.chkBoxIsMutiPage.CheckedChanged += new System.EventHandler(this.chkBoxIsMutiPage_CheckedChanged);
            //
            // label10
            //
            this.label10.Location = new System.Drawing.Point(24, 120);
            this.label10.Name = "label10";
            this.label10.Size = new System.Drawing.Size(80, 23);
            this.label10.TabIndex = 63;
            this.label10.Text = "通用地址";
            //
            // coder
            //
            this.coder.Location = new System.Drawing.Point(256, 16);
            this.coder.Name = "coder";
            this.coder.Size = new System.Drawing.Size(80, 20);
            this.coder.TabIndex = 62;
            this.coder.Text = "comboBox1";
            //
            // label9
            //
            this.label9.Location = new System.Drawing.Point(24, 24);
            this.label9.Name = "label9";
            this.label9.Size = new System.Drawing.Size(56, 23);
            this.label9.TabIndex = 60;
            this.label9.Text = "网站名称";
            //
            // label8
            //
            this.label8.Location = new System.Drawing.Point(8, 256);
            this.label8.Name = "label8";
            this.label8.Size = new System.Drawing.Size(88, 23);
            this.label8.TabIndex = 56;
            this.label8.Text = "广告开始Tag";
            //
            // label6
            //
            this.label6.Location = new System.Drawing.Point(0, 232);
            this.label6.Name = "label6";
            this.label6.Size = new System.Drawing.Size(80, 23);
            this.label6.TabIndex = 54;
            this.label6.Text = "内容终止标记";
            //
            // txtReplace
            //
            this.txtReplace.Location = new System.Drawing.Point(80, 296);
            this.txtReplace.Multiline = true;
            this.txtReplace.Name = "txtReplace";
            this.txtReplace.Size = new System.Drawing.Size(352, 20);
            this.txtReplace.TabIndex = 70;
            this.txtReplace.Text = "内容中需要过滤的内容";
            //
            // label4
            //
            this.label4.Location = new System.Drawing.Point(0, 208);
            this.label4.Name = "label4";
            this.label4.Size = new System.Drawing.Size(80, 23);
            this.label4.TabIndex = 52;
            this.label4.Text = "内容起始标记";
            //
            // label3
            //
            this.label3.Location = new System.Drawing.Point(24, 352);
            this.label3.Name = "label3";
            this.label3.Size = new System.Drawing.Size(56, 16);
            this.label3.TabIndex = 50;
            this.label3.Text = "保存地址";
            //
            // label2
            //
            this.label2.Location = new System.Drawing.Point(24, 160);
            this.label2.Name = "label2";
            this.label2.Size = new System.Drawing.Size(56, 23);
            this.label2.TabIndex = 49;
            this.label2.Text = "列表正则";
            //
            // label1
            //
            this.label1.Location = new System.Drawing.Point(24, 48);
            this.label1.Name = "label1";
            this.label1.Size = new System.Drawing.Size(56, 23);
            this.label1.TabIndex = 77;
            this.label1.Text = "网站地址";
            //
            // label18
            //
            this.label18.Location = new System.Drawing.Point(224, 24);
            this.label18.Name = "label18";
            this.label18.Size = new System.Drawing.Size(56, 23);
            this.label18.TabIndex = 78;
            this.label18.Text = "编码";
            //
            // txtstartpage
            //
            this.txtstartpage.Enabled = false;
            this.txtstartpage.Location = new System.Drawing.Point(120, 72);
            this.txtstartpage.Maximum = new System.Decimal(new int[] { 500, 0, 0, 0});
            this.txtstartpage.Name = "txtstartpage";
            this.txtstartpage.Size = new System.Drawing.Size(40, 21);
            this.txtstartpage.TabIndex = 79;
            this.txtstartpage.Value = new System.Decimal(new int[] { 2, 0, 0, 0});
            //
            // txtleijia
            //
            this.txtleijia.Enabled = false;
            this.txtleijia.Location = new System.Drawing.Point(288, 72);
            this.txtleijia.Name = "txtleijia";
            this.txtleijia.Size = new System.Drawing.Size(40, 21);
            this.txtleijia.TabIndex = 81;
            this.txtleijia.Value = new System.Decimal(new int[] { 1, 0, 0, 0});
            //
            // txtendpage
            //
            this.txtendpage.Enabled = false;
            this.txtendpage.Location = new System.Drawing.Point(200, 72);
            this.txtendpage.Name = "txtendpage";
            this.txtendpage.Size = new System.Drawing.Size(40, 21);
            this.txtendpage.TabIndex = 80;
            this.txtendpage.Value = new System.Decimal(new int[] { 20, 0, 0, 0});
            //
            // textBoxMaxThread
            //
            this.textBoxMaxThread.Enabled = false;
            this.textBoxMaxThread.Location = new System.Drawing.Point(384, 72);
            this.textBoxMaxThread.Name = "textBoxMaxThread";
            this.textBoxMaxThread.Size = new System.Drawing.Size(40, 21);
            this.textBoxMaxThread.TabIndex = 82;
            this.textBoxMaxThread.Value = new System.Decimal(new int[] { 4, 0, 0, 0});
            //
            // statusBar1
            //
            this.statusBar1.Location = new System.Drawing.Point(0,
595);
            this.statusBar1.Name = "statusBar1";
            this.statusBar1.Size = new System.Drawing.Size(786, 16);
            this.statusBar1.TabIndex = 33;
            this.statusBar1.Text = "就绪";
            this.statusBar1.Left = 20;
            //
            // Form1
            //
            this.AutoScale = false;
            this.AutoScaleBaseSize = new System.Drawing.Size(6, 14);
            this.ClientSize = new System.Drawing.Size(786, 611);
            this.Controls.Add(this.statusBar1);
            this.Controls.Add(this.tabControl3);
            this.Controls.Add(this.tabControl1);
            this.FormBorderStyle = System.Windows.Forms.FormBorderStyle.FixedDialog;
            this.MaximizeBox = false;
            this.Menu = this.mainMenu1;
            this.Name = "Form1";
            this.Text = "YESUN文章自动抓取工具 v2.0";
            this.tabControl1.ResumeLayout(false);
            this.tabPage1.ResumeLayout(false);
            this.tabControl3.ResumeLayout(false);
            this.tabPage3.ResumeLayout(false);
            this.tabControl2.ResumeLayout(false);
            this.tabPage2.ResumeLayout(false);
            ((System.ComponentModel.ISupportInitialize) (this.txtstartpage)).EndInit();
            ((System.ComponentModel.ISupportInitialize) (this.txtleijia)).EndInit();
            ((System.ComponentModel.ISupportInitialize) (this.txtendpage)).EndInit();
            ((System.ComponentModel.ISupportInitialize) (this.textBoxMaxThread)).EndInit();
            this.ResumeLayout(false);
        }
        // ---- end of Windows Form Designer generated code ----

        /// <summary>
        /// The main entry point for the application.
        /// </summary>
        [STAThread]
        static void Main()
        {
            Application.Run(new Form1());
        }

        static ManualResetEvent ev = new ManualResetEvent(false);

        /// <summary>
        /// Starts a crawl. Reads the settings from the form, resets both list views and then either
        /// queues one worker thread per list page (multi-page mode, started later by
        /// <c>timer_CheckThread</c>) or starts a single worker thread immediately.
        /// </summary>
        /// <param name="sender">The start button.</param>
        /// <param name="e">Unused.</param>
        private void btnStrat_Click(object sender, System.EventArgs e)
        {
            string url = this.url.Text.Trim();
            string reg = this.reg.Text.Trim();
            string folderpath = this.floder.Text.Trim();
            string startTag = this.startTag.Text.Trim();
            string endTag = this.endTag.Text.Trim();
            string adstartTag = this.adStartTag.Text.Trim();
            string adendTag = this.adEndTag.Text.Trim();

            // SelectedItem is null when nothing is selected (e.g. free text typed into the combo box);
            // keep the current encoding in that case instead of throwing.
            object selectedEncoding = this.coder.SelectedItem;
            if (selectedEncoding != null)
            {
                encoding = selectedEncoding.ToString().ToLower();
            }

            // Reset the counter.
            j = 0;
            this.comboBoxListURL.Enabled = false;
            try
            {
                maxThreadCount = Int32.Parse(this.textBoxMaxThread.Text);
            }
            catch (FormatException)
            {
                // Not a number: keep the previous limit.
            }
            catch (OverflowException)
            {
                // Out of range: keep the previous limit.
            }

            // Rebuild the result list.
            this.lvResult.Clear();
            this.lvResult.FullRowSelect = true;
            this.lvResult.View = View.Details;
            this.lvResult.Columns.Add("编号", 80, HorizontalAlignment.Center);
            this.lvResult.Columns.Add("标题", 300, HorizontalAlignment.Left);
            this.lvResult.Columns.Add("状态", 50, HorizontalAlignment.Left);
            this.lvResult.Columns.Add("大小", 65, HorizontalAlignment.Left);
            this.lvResult.Columns.Add("耗时", 75, HorizontalAlignment.Left);

            ImageList imgList = new ImageList();
            try
            {
                Image largeImg = Image.FromFile(Application.StartupPath + "\\largeImg.gif");
                imgList.Images.Add(largeImg);
            }
            catch (Exception)
            {
                // The thread icon is optional; run without it when the file is missing or unreadable.
            }

            // Rebuild the thread list.
            this.listView1.Columns.Clear();
            this.listView1.Items.Clear();
            this.listView1.LargeImageList = imgList;
            this.listView1.Scrollable = true;
            this.listView1.View = View.LargeIcon;
            this.listView1.GridLines = true;
            this.listView1.FullRowSelect = true;
            this.listView1.Columns.Add("名称", 60, HorizontalAlignment.Left);

            if (this.chkBoxIsMutiPage.Checked)
            {
                // Multi-page crawl.
                if (txtUrl.Text.Trim() == "")
                {
                    // Nothing was started, so give the site list back to the user.
                    this.comboBoxListURL.Enabled = true;
                    MessageBox.Show("请输入通用网址");
                    return;
                }

                int startpage = 0;
                int endpage = 0;
                int leijia = 1;
                try
                {
                    startpage = Convert.ToInt32(this.txtstartpage.Text);
                    endpage = Convert.ToInt32(this.txtendpage.Text);
                    leijia = Convert.ToInt32(this.txtleijia.Text.Trim());
                }
                catch (FormatException)
                {
                    // Keep whatever was parsed so far plus the defaults.
                }
                catch (OverflowException)
                {
                    // Keep whatever was parsed so far plus the defaults.
                }

                // A step below 1 would never advance the page loop and would overrun the thread array.
                if (leijia < 1)
                {
                    leijia = 1;
                }
                // A negative array size would throw; report the bad range instead.
                if (endpage < startpage)
                {
                    this.comboBoxListURL.Enabled = true;
                    MessageBox.Show("尾页不能小于首页");
                    return;
                }

                thread = new Thread[endpage - startpage + 1];
                // The queue is new, so scheduling has to start again at its first slot.
                intCurrentThread = 0;
                this.btnStrat.Enabled = false;
                int tempInt = 0;
                this.statusBar1.Text = "正在初始化线程...";
                for (int i = startpage; i <= endpage; i = i + leijia)
                {
                    GetArticle ga = new GetArticle();
                    // Pass the crawl parameters to the worker.
                    ga.url = txtUrl.Text.Replace("@pageid", i.ToString());
                    ga.reg = reg;
                    ga.pageReg = this.txtPagePatt.Text;
                    ga.folderpath = folderpath;
                    ga.startTag = startTag;
                    ga.endTag = endTag;
                    ga.adStartTag = adstartTag;
                    ga.adEndTag = adendTag;
                    ga.parentForm = this;
                    Thread th = new Thread(new ThreadStart(ga.strat));
                    thread[tempInt++] = th;
                }
                this.statusBar1.Text = "共" + tempInt + "个线程保存队列中,正在启动线程,请稍候...";

                // Raise Elapsed on the UI thread: the handler updates statusBar1 and listView1.
                runable_Timer.SynchronizingObject = this;
                // Remove a subscription left from an earlier run so the handler fires once per tick.
                runable_Timer.Elapsed -= new System.Timers.ElapsedEventHandler(timer_CheckThread);
                runable_Timer.Elapsed += new System.Timers.ElapsedEventHandler(timer_CheckThread);
                runable_Timer.Start();
            }
            else
            {
                // Single-page crawl.
                if (url == "")
                {
                    // Nothing was started, so give the site list back to the user.
                    this.comboBoxListURL.Enabled = true;
                    MessageBox.Show("请输入网址");
                    return;
                }

                GetArticle ga = new GetArticle();
                // Pass the crawl parameters to the worker.
                ga.url = url;
                ga.reg = reg;
                ga.pageReg = this.txtPagePatt.Text;
                ga.folderpath = folderpath;
                ga.startTag = startTag;
                ga.endTag = endTag;
                ga.adStartTag = adstartTag;
                ga.adEndTag = adendTag;
                ga.parentForm = this;
                this.btnStrat.Enabled = false;
                this.statusBar1.Text = "正在准备抓取数据,请稍候...";
                ThreadStart ts = new ThreadStart(ga.strat);
                t = new Thread(ts);
                t.Name = "线程#1";
                ListViewItem item = new ListViewItem(t.Name, 0);
                item.SubItems.Add(t.Name);
                // Item.Tag carries the thread's hash code so the list entry can be matched to its thread.
                item.Tag = t.GetHashCode();
                item.Text = t.Name;
                item.ForeColor = Color.Red;
                item.EnsureVisible();
                this.listView1.Items.AddRange(new ListViewItem[] { item });
                t.Priority = ThreadPriority.Lowest;
                t.Start();
            }
        }

        /// <summary>
        /// Timer callback that periodically checks the thread count: pauses the timer and, while fewer
        /// than <c>maxThreadCount</c> workers have been started, starts the next unstarted thread in
        /// <c>thread</c> and lists it in <c>listView1</c>.
        /// </summary>
        /// <param name="sender">The <see cref="System.Timers.Timer"/> that raised the event.</param>
        /// <param name="e">Unused.</param>
        void timer_CheckThread(object sender, System.Timers.ElapsedEventArgs e)
        {
            System.Timers.Timer initTimer = (System.Timers.Timer) sender;
            initTimer.Stop();
            // If the thread budget is not used up, start the next queued thread.
            if (currentThreadCount < maxThreadCount)
            {
                try
                {
                    if (thread[intCurrentThread] != null && thread[intCurrentThread].ThreadState == ThreadState.Unstarted)
                    {
                        thread[intCurrentThread].Priority = ThreadPriority.Lowest;
                        currentThreadCount++;
                        thread[intCurrentThread].Name = "线程#" + (intCurrentThread + 1);
                        this.statusBar1.Text = thread[intCurrentThread].Name + "已开始执行...";
                        thread[intCurrentThread].Start();
                        ListViewItem item = new ListViewItem("线程#" + (intCurrentThread + 1), 0);
                        // Item.Tag is tied to Thread.GetHashCode.
                        item.Tag = thread[intCurrentThread].GetHashCode();
                        item.SubItems.Add("线程#" + (intCurrentThread + 1));
                        item.Text = "线程#" + (intCurrentThread + 1);
                        item.ForeColor = Color.Red;
                        item.EnsureVisible();
this.listView1.Items.AddRange(new ListViewItem[] { item }); intCurrentThread++; } else { istrue = false; } } catch { istrue = false; } } initTimer.Interval = 3 * 1000; initTimer.Start(); } /// <summary> /// 设置编码 /// </summary> void BindEncode() { //clear list for (int i = this.coder.Items.Count - 1; i >= 0; i--) { this.coder.Items.RemoveAt(i); } this.coder.Items.Add((object) "gb2312"); this.coder.Items.Add((object) "utf-8"); this.coder.SelectedIndex = 0; } /// <summary> /// 绑定一些默认的网站 /// </summary> void BindLink() { //ComBox list for (int i = this.comboBoxListURL.Items.Count - 1; i >= 0; i--) { this.comboBoxListURL.Items.RemoveAt(i); } //绑定默认数据 try { MyItem myitem; XmlDataDocument xmlDoc = new XmlDataDocument(); xmlDoc.Load(Application.StartupPath + "/URL.xml"); XmlNodeList xmlNodes = xmlDoc.SelectNodes("//root/url"); for (int i = 0; i < xmlNodes.Count; i++) { XmlElement xmlElem = (XmlElement) xmlNodes[i]; myitem = new MyItem(); myitem.id = Convert.ToInt32(xmlElem.GetAttribute("id")); myitem.name = Convert.ToString(xmlElem.GetAttribute("name")); myitem.regex = b2a(xmlElem.GetAttribute("regex")); myitem.url = b2a(xmlElem.GetAttribute("url")); myitem.starttag = b2a(xmlElem.GetAttribute("starttag")); myitem.endtag = b2a(xmlElem.GetAttribute("endtag")); myitem.adstarttag = b2a(xmlElem.GetAttribute("adstarttag")); myitem.adendtag = b2a(xmlElem.GetAttribute("adendtag")); try { myitem.pageReg = b2a(xmlElem.GetAttribute("pageReg")); } catch { } //add to list this.comboBoxListURL.Items.Add(myitem); } this.comboBoxListURL.SelectedIndex = 0; } catch { } } /// <summary> /// 转换一些特殊字符 /// </summary> /// <param name="str"></param> /// <returns></returns> string a2b(string str) { str = str.Replace("<", "<"); str = str.Replace(">", ">"); return str; } /// <summary> /// 转换一些特殊字符 /// </summary> /// <param name="str"></param> /// <returns></returns> string b2a(string str) { str = str.Replace("<", "<"); str = str.Replace(">", ">"); return str; } /// <summary> /// 终止搜索 /// </summary> 
/// <param name="sender">The control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        /// <remarks>
        /// Aborts the worker thread(s), resets the counters in multi-page mode and
        /// always re-enables the start controls, even when an abort fails.
        /// </remarks>
        private void btnReset_Click(object sender, System.EventArgs e)
        {
            istrue = false;
            try
            {
                if (this.chkBoxIsMutiPage.Checked)
                {
                    // Abort every queued or running worker, however many were created.
                    if (thread != null)
                    {
                        for (int i = 0; i < thread.Length; i++)
                        {
                            if (thread[i] == null)
                            {
                                continue;
                            }
                            try
                            {
                                thread[i].Abort();
                                this.statusBar1.Text = "线程" + thread[i].Name + "已终止!";
                            }
                            catch (Exception)
                            {
                                // One thread that cannot be aborted must not keep the rest alive.
                            }
                        }
                    }
                    // Reset the counters.
                    j = 0;
                    currentThreadCount = 0;
                    intCurrentThread = 0;
                }
                else if (t != null)
                {
                    t.Abort();
                }
                this.statusBar1.Text = "已终止搜索!";
            }
            catch (Exception)
            {
                // Best effort: the controls are re-enabled below regardless.
            }
            finally
            {
                this.comboBoxListURL.Enabled = true;
                this.btnStrat.Enabled = true;
                this.btnReset.Enabled = true;
            }
        }

        /// <summary>
        /// Appends the site currently described by the form fields to <c>URL.xml</c>
        /// and selects the new entry in the site list.
        /// </summary>
        /// <param name="sender">The control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnSave_Click(object sender, System.EventArgs e)
        {
            try
            {
                string xmlPath = Application.StartupPath + "/URL.xml";
                XmlDocument xmlDoc = new XmlDocument();
                if (File.Exists(xmlPath))
                {
                    xmlDoc.Load(xmlPath);
                }
                else
                {
                    xmlDoc.LoadXml("<root />");
                }

                // Use highest existing id + 1. The entry count is not unique once an
                // entry in the middle has been deleted.
                int nextId = 0;
                foreach (XmlNode existing in xmlDoc.SelectNodes("/root/url"))
                {
                    XmlElement existingElem = existing as XmlElement;
                    if (existingElem == null)
                    {
                        continue;
                    }
                    try
                    {
                        int usedId = Convert.ToInt32(existingElem.GetAttribute("id"));
                        if (usedId >= nextId)
                        {
                            nextId = usedId + 1;
                        }
                    }
                    catch (FormatException)
                    {
                        // Entry without a numeric id: cannot collide with a numeric one.
                    }
                    catch (OverflowException)
                    {
                        // Same as above.
                    }
                }

                XmlElement xmlElem = xmlDoc.CreateElement("url");
                xmlElem.SetAttribute("id", nextId.ToString());
                xmlElem.SetAttribute("name", a2b(this.name.Text));
                xmlElem.SetAttribute("url", a2b(this.url.Text));
                xmlElem.SetAttribute("regex", a2b(this.reg.Text));
                xmlElem.SetAttribute("starttag", a2b(this.startTag.Text));
                xmlElem.SetAttribute("endtag", a2b(this.endTag.Text));
                xmlElem.SetAttribute("adstarttag", a2b(this.adStartTag.Text));
                xmlElem.SetAttribute("adendtag", a2b(this.adEndTag.Text));
                xmlElem.SetAttribute("pageReg", a2b(this.txtPagePatt.Text));
                xmlDoc.DocumentElement.AppendChild(xmlElem);
                xmlDoc.Save(xmlPath);
                MessageBox.Show("操作成功!");

                // Refresh the list and select the entry that was just added.
                this.BindLink();
                this.comboBoxListURL.SelectedIndex = this.comboBoxListURL.Items.Count - 1;
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.ToString());
            }
        }

        /// <summary>
        /// "New site" menu command: clears every site-definition field.
        /// </summary>
        /// <param name="sender">The control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void menuItem2_Click(object sender, System.EventArgs e)
        {
            this.name.Text = "";
            this.url.Text = "";
            this.reg.Text = "";
            this.startTag.Text = "";
            this.endTag.Text = "";
            this.adStartTag.Text = "";
            this.adEndTag.Text = "";
            this.txtPagePatt.Text = "";
            this.txtReplace.Text = "";
        }

        /// <summary>
        /// "Exit" menu command: closes the application.
        /// </summary>
        /// <param name="sender">The control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void menuItem3_Click(object sender, System.EventArgs e)
        {
            Application.Exit();
        }

        /// <summary>
        /// Toggles the multi-page input controls and proposes a URL template in
        /// <c>txtUrl</c> in which the page number is replaced by <c>@pageid</c>.
        /// </summary>
        /// <param name="sender">The control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void chkBoxIsMutiPage_CheckedChanged(object sender, System.EventArgs e)
        {
            // Each control flips its own state, exactly as before.
            this.txtUrl.Enabled = !this.txtUrl.Enabled;
            this.txtstartpage.Enabled = !this.txtstartpage.Enabled;
            this.txtendpage.Enabled = !this.txtendpage.Enabled;
            this.txtleijia.Enabled = !this.txtleijia.Enabled;
            this.textBoxMaxThread.Enabled = !this.textBoxMaxThread.Enabled;

            this.txtUrl.Text = BuildPageUrlTemplate(this.url.Text.Trim());
        }

        /// <summary>
        /// Derives a paging template from a list URL: first the value of a <c>page=</c> /
        /// <c>pageid=</c> query parameter, otherwise the part after the last <c>_</c>,
        /// otherwise the part after the last <c>index</c>, is replaced by <c>@pageid</c>.
        /// Returns the URL unchanged when none of these is present.
        /// </summary>
        private static string BuildPageUrlTemplate(string sourceUrl)
        {
            string template = ReplacePageQueryValue(sourceUrl, "page=");
            template = ReplacePageQueryValue(template, "pageid=");
            if (template == sourceUrl)
            {
                template = InsertPlaceholderAfter(template, "_");
            }
            if (template == sourceUrl)
            {
                template = InsertPlaceholderAfter(template, "index");
            }
            return template;
        }

        /// <summary>
        /// Replaces the value following the first case-insensitive occurrence of
        /// <paramref name="key"/> with <c>@pageid</c>; the value ends at the next
        /// <c>&amp;</c> or at the end of the URL.
        /// </summary>
        private static string ReplacePageQueryValue(string pageUrl, string key)
        {
            // Search the lowered copy but cut the original so casing is preserved.
            int keyAt = pageUrl.ToLower(System.Globalization.CultureInfo.InvariantCulture).IndexOf(key);
            if (keyAt < 0)
            {
                return pageUrl;
            }
            int valueStart = keyAt + key.Length;
            int valueEnd = pageUrl.IndexOf('&', valueStart);
            string rest = valueEnd < 0 ? "" : pageUrl.Substring(valueEnd);
            return pageUrl.Substring(0, valueStart) + "@pageid" + rest;
        }

        /// <summary>
        /// Inserts <c>@pageid</c> after the last case-insensitive occurrence of
        /// <paramref name="marker"/>, keeping the extension (from the last <c>.</c>)
        /// only when that dot comes after the marker.
        /// </summary>
        private static string InsertPlaceholderAfter(string pageUrl, string marker)
        {
            int markerAt = pageUrl.ToLower(System.Globalization.CultureInfo.InvariantCulture).LastIndexOf(marker);
            if (markerAt < 0)
            {
                return pageUrl;
            }
            int cut = markerAt + marker.Length;
            int dotAt = pageUrl.LastIndexOf('.');
            string extension = dotAt >= cut ? pageUrl.Substring(dotAt) : "";
            return pageUrl.Substring(0, cut) + "@pageid" + extension;
        }

        /// <summary>
        /// Copies the selected site definition into the form fields.
        /// </summary>
        /// <param name="sender">The control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void comboBoxListURL_SelectedIndexChanged(object sender, System.EventArgs e)
        {
            MyItem myitem = comboBoxListURL.SelectedItem as MyItem;
            if (myitem == null)
            {
                return; // nothing selected
            }
            this.url.Text = myitem.url;
            this.name.Text = myitem.name;
            this.reg.Text = myitem.regex;
            this.startTag.Text = myitem.starttag;
            this.endTag.Text = myitem.endtag;
            this.adStartTag.Text = myitem.adstarttag;
            this.adEndTag.Text = myitem.adendtag;
            this.txtPagePatt.Text = myitem.pageReg;
        }

        /// <summary>
        /// Deletes the selected site definition from <c>URL.xml</c> and reloads the list.
        /// </summary>
        /// <param name="sender">The control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnDel_Click(object sender, System.EventArgs e)
        {
            try
            {
                MyItem myitem = comboBoxListURL.SelectedItem as MyItem;
                if (myitem != null)
                {
                    string xmlPath = Application.StartupPath + "/URL.xml";
                    XmlDocument xmlDoc = new XmlDocument();
                    if (File.Exists(xmlPath))
                    {
                        xmlDoc.Load(xmlPath);
                    }
                    else
                    {
                        xmlDoc.LoadXml("<root />");
                    }
                    XmlNode node = xmlDoc.SelectSingleNode("/root/url[@id=" + myitem.id + "]");
                    // The entry may already be gone from the file; then there is nothing to remove.
                    if (node != null)
                    {
                        node.ParentNode.RemoveChild(node);
                        xmlDoc.Save(xmlPath);
                        MessageBox.Show("操作成功!");
                    }
                }
                BindLink();
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message);
            }
        }
    }

    /// <summary>
    /// Scrapes the articles linked from one list page and stores them in an XML file.
    /// </summary>
    /// <remarks>
    /// NOTE(review): <see cref="strat"/> is used as a <c>ThreadStart</c> target by
    /// <c>Form1</c>, yet this class reads and writes <c>parentForm</c> controls directly.
    /// Those accesses are left as they were; marshalling them to the UI thread would be
    /// a larger change - confirm whether it is needed.
    /// </remarks>
    public class GetArticle
    {
        // Matches a src attribute, quoted with " or ' or unquoted; "image" is its value.
        private static readonly Regex s_srcAttribute = new Regex(
            "src\\s*=\\s*(?<q>[\"']?)(?<image>[^\\s\"'>]+)\\k<q>",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        // Matches a leading URI scheme such as "http:" or "https:".
        private static readonly Regex s_uriScheme = new Regex(
            "^[a-zA-Z][a-zA-Z0-9+.\\-]*:",
            RegexOptions.Compiled);

        #region "Properties"

        /// <summary>The form whose controls receive progress output and supply options.</summary>
        public Form1 parentForm = null;

        private string _url = "";
        /// <summary>URL of the list page.</summary>
        public string url
        {
            get { return _url; }
            set { this._url = value; }
        }

        private string _reg = "";
        /// <summary>Pattern matched against the list page; uses the groups topic, url, vdatetime and catalogname.</summary>
        public string reg
        {
            get { return _reg; }
            set { this._reg = value; }
        }

        private string _reg1 = "";
        /// <summary>Secondary pattern; not read anywhere in this class.</summary>
        public string reg1
        {
            get { return _reg1; }
            set { this._reg1 = value; }
        }

        private string _pageReg = "";
        /// <summary>Optional pattern for in-article paging links; uses the groups url and page.</summary>
        public string pageReg
        {
            get { return _pageReg; }
            set { this._pageReg = value; }
        }

        private string _folderpath = "";
        /// <summary>Output folder; the desktop folder is used when empty.</summary>
        public string folderpath
        {
            get { return _folderpath; }
            set { this._folderpath = value; }
        }

        private string _startTag = "";
        /// <summary>Text that precedes the article body.</summary>
        public string startTag
        {
            get { return _startTag; }
            set { this._startTag = value; }
        }

        private string _endTag = "";
        /// <summary>Text that follows the article body.</summary>
        public string endTag
        {
            get { return _endTag; }
            set { this._endTag = value; }
        }

        private string _adStartTag = "";
        /// <summary>Text that opens a section to strip from the body.</summary>
        public string adStartTag
        {
            get { return _adStartTag; }
            set { this._adStartTag = value; }
        }

        private string _adEndTag = "";
        /// <summary>Text that closes a section to strip from the body.</summary>
        public string adEndTag
        {
            get { return _adEndTag; }
            set { this._adEndTag = value; }
        }

        #endregion

        /// <summary>
        /// Downloads the list page at <see cref="url"/>, follows every link matched by
        /// <see cref="reg"/>, filters each article and saves all of them to one XML file
        /// in <see cref="folderpath"/>. Progress is written to the parent form.
        /// </summary>
        public void strat()
        {
            if (this.folderpath == "")
            {
                // Ask the system for the desktop instead of assuming an XP-era Chinese path.
                folderpath = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);
                if (folderpath == "")
                {
                    folderpath = Environment.CurrentDirectory;
                }
            }
            if (!Directory.Exists(folderpath))
            {
                Directory.CreateDirectory(folderpath);
            }
            if (url == "")
            {
                return;
            }
            if (reg == "")
            {
                reg = ".*";
            }

            string content = getWebContent(url);
            content = content.Replace("\"", "'");

            // Extract the article links from the list page.
            Regex regex = new Regex(reg, RegexOptions.Compiled | RegexOptions.IgnoreCase);
            XmlDocument xmlDoc = new XmlDocument();
            xmlDoc.LoadXml("<root />");
            XmlElement xmlElem = xmlDoc.CreateElement("ArticleList");
            int successCount = 1;
            int i = 1;
            for (Match mc = regex.Match(content); mc.Success; mc = mc.NextMatch(), i++)
            {
                DateTime startTime = DateTime.Now;
                string topic = "";
                bool captured = false;
                try
                {
                    topic = mc.Groups["topic"].Value.Trim();
                    string href = mc.Groups["url"].Value;
                    string vdatetime = mc.Groups["vdatetime"].Value;
                    string catalogname = mc.Groups["catalogname"].Value;

                    string RealURL = ResolveUrl(this.url, href);
                    // Keep the unfiltered page: paging links are searched in it.
                    string contentString = getWebContent(RealURL);
                    string tempstr = FilterContent(contentString);

                    if (this.pageReg != "")
                    {
                        tempstr = AppendFollowingPages(tempstr, contentString, RealURL);
                    }

                    if (tempstr != "" && topic != "")
                    {
                        // Add the article to the XML document.
                        XmlElement xmlElemArt = xmlDoc.CreateElement("Article");
                        xmlElemArt.SetAttribute("topic", topic);
                        xmlElemArt.SetAttribute("href", href);
                        xmlElemArt.SetAttribute("comefrom", this.parentForm.name.Text);
                        xmlElemArt.SetAttribute("vdatetime", vdatetime);
                        xmlElemArt.SetAttribute("catalogname", catalogname);
                        xmlElemArt.InnerText = tempstr;
                        xmlElem.AppendChild(xmlElemArt);
                        this.parentForm.j++;

                        // Report the success in the result list.
                        this.parentForm.statusBar1.ForeColor = Color.Black;
                        this.parentForm.statusBar1.Text = "成功抓取" + this.parentForm.j + "篇";
                        ListViewItem item1 = new ListViewItem(Thread.CurrentThread.Name + "->" + i.ToString(), 0);
                        item1.SubItems.Add(topic);
                        item1.SubItems.Add("True");
                        item1.SubItems.Add(tempstr.Length + "byte");
                        DateTime endTime = DateTime.Now;
                        TimeSpan ts = endTime - startTime;
                        item1.SubItems.Add(ts.TotalSeconds + "秒");
                        this.parentForm.lvResult.Items.AddRange(new ListViewItem[] { item1 });
                        this.parentForm.lvResult.Items[this.parentForm.lvResult.Items.Count - 1].EnsureVisible();
                        this.parentForm.lvResult.TopItem.Selected = true;

                        // Show the running success count on this thread's row.
                        for (int j = 0; j < this.parentForm.listView1.Items.Count; j++)
                        {
                            if ((int) this.parentForm.listView1.Items[j].Tag == Thread.CurrentThread.GetHashCode())
                            {
                                this.parentForm.listView1.Items[j].Text = Thread.CurrentThread.Name + "-" + successCount + "";
                            }
                        }
                        successCount++;
                        captured = true;
                    }
                }
                catch (Exception)
                {
                    // Any failure for one article is reported below; the loop continues.
                    captured = false;
                }

                if (!captured)
                {
                    this.parentForm.statusBar1.ForeColor = Color.Red;
                    this.parentForm.statusBar1.Text = "抓取失败 ====> " + topic;
                    ListViewItem item1 = new ListViewItem(Thread.CurrentThread.Name + "->" + i.ToString(), 0);
                    item1.SubItems.Add(topic);
                    item1.SubItems.Add("False");
                    item1.SubItems.Add("");
                    item1.SubItems.Add("");
                    this.parentForm.lvResult.Items.AddRange(new ListViewItem[] { item1 });
                    this.parentForm.lvResult.Items[this.parentForm.lvResult.Items.Count - 1].EnsureVisible();
                    this.parentForm.statusBar1.ForeColor = Color.Red;
                }
            }

            xmlDoc.DocumentElement.AppendChild(xmlElem);

            // A fixed date pattern keeps '/' out of the file name on every culture.
            string filename = this.parentForm.name.Text + "_"
                + DateTime.Now.ToString("yyyy-M-d", System.Globalization.CultureInfo.InvariantCulture);
            int filenameNum = 1;
            while (File.Exists(folderpath + "\\" + filename + "_" + filenameNum + ".xml"))
            {
                filenameNum++;
            }
            filename = filename + "_" + filenameNum + ".xml";

            bool saved = true;
            try
            {
                xmlDoc.Save(folderpath + "\\" + filename);
            }
            catch (Exception)
            {
                saved = false;
            }

            this.parentForm.statusBar1.ForeColor = Color.Black;
            // Release this worker's slot; other workers may decrement at the same time.
            Interlocked.Decrement(ref Form1.currentThreadCount);
            this.parentForm.statusBar1.Text = Thread.CurrentThread.Name + " 执行完毕!";

            // Mark this thread's row as finished.
            for (int j = 0; j < this.parentForm.listView1.Items.Count; j++)
            {
                if ((int) this.parentForm.listView1.Items[j].Tag == Thread.CurrentThread.GetHashCode())
                {
                    this.parentForm.listView1.Items[j].ForeColor = Color.Black;
                }
            }

            if (saved)
            {
                this.parentForm.statusBar1.ForeColor = Color.Green;
                this.parentForm.statusBar1.Text = Thread.CurrentThread.Name + " 抓取完毕,共抓取" + this.parentForm.j + "篇,数据已保存在" + folderpath + "\\" + filename;
            }
            else
            {
                // Keep the failure visible instead of overwriting it with a success message.
                this.parentForm.statusBar1.ForeColor = Color.Red;
                this.parentForm.statusBar1.Text = "保存Xml失败 ==> " + folderpath + "\\" + filename;
            }
            this.parentForm.btnStrat.Enabled = true;
            this.parentForm.comboBoxListURL.Enabled = true;
        }

        /// <summary>
        /// Resolves <paramref name="link"/> against <paramref name="baseUrl"/>; an
        /// absolute link is returned as an absolute URI of its own.
        /// </summary>
        /// <exception cref="UriFormatException">Either argument cannot form a valid URI.</exception>
        private static string ResolveUrl(string baseUrl, string link)
        {
            return new Uri(new Uri(baseUrl), link).AbsoluteUri;
        }

        /// <summary>
        /// Appends the filtered content of every paging link (page number above 1) found
        /// in <paramref name="rawPage"/> to <paramref name="filtered"/> and removes
        /// paging markup from the result. Best effort: on error the content gathered so
        /// far is returned.
        /// </summary>
        private string AppendFollowingPages(string filtered, string rawPage, string pageBaseUrl)
        {
            string result = filtered;
            try
            {
                Regex regexContent = new Regex(this.pageReg);
                for (Match mcContent = regexContent.Match(rawPage); mcContent.Success; mcContent = mcContent.NextMatch())
                {
                    int page = 1;
                    string pageText = mcContent.Groups["page"].Value;
                    if (pageText != "")
                    {
                        try
                        {
                            page = Convert.ToInt32(pageText);
                        }
                        catch (FormatException)
                        {
                            // Not a number: treated as page 1, which is skipped.
                        }
                        catch (OverflowException)
                        {
                            // Same as above.
                        }
                    }
                    if (page <= 1)
                    {
                        continue; // the first page is the article already fetched
                    }

                    string pageRealURL = ResolveUrl(pageBaseUrl, mcContent.Groups["url"].Value);
                    string nextPageContent = FilterContent(getWebContent(pageRealURL));
                    result += "<br/>" + nextPageContent;
                    result = Regex.Replace(result, this.pageReg, "");
                }
            }
            catch (Exception)
            {
                // Paging is optional; keep what has been collected.
            }
            return result;
        }

        /// <summary>
        /// Cuts the article body out of a downloaded page and cleans it up.
        /// </summary>
        /// <remarks>
        /// Keeps the text between <see cref="startTag"/> and <see cref="endTag"/>, strips
        /// sections delimited by <see cref="adStartTag"/>/<see cref="adEndTag"/>, makes
        /// relative <c>src</c> attributes absolute against <see cref="url"/>, removes the
        /// text from <c>parentForm.txtReplace</c> and, when <c>parentForm.IsDownloadImage</c>
        /// is checked, downloads every referenced file.
        /// </remarks>
        /// <param name="tempstr">The raw page content.</param>
        /// <returns>The cleaned body, or an empty string when a tag is missing or an error occurs.</returns>
        public string FilterContent(string tempstr)
        {
            try
            {
                // A missing tag means the body cannot be located: report failure.
                int bodyStart = tempstr.IndexOf(this.startTag);
                if (bodyStart < 0)
                {
                    return "";
                }
                tempstr = tempstr.Substring(bodyStart + this.startTag.Length);
                int bodyEnd = tempstr.IndexOf(this.endTag);
                if (bodyEnd < 0)
                {
                    return "";
                }
                tempstr = tempstr.Substring(0, bodyEnd);

                if (this.adStartTag != "" && this.adEndTag != "")
                {
                    tempstr = StripSections(tempstr, this.adStartTag, this.adEndTag);
                }

                // Rewrite each relative src on its own; absolute ones are left untouched.
                tempstr = s_srcAttribute.Replace(tempstr, new MatchEvaluator(this.MakeSrcAbsolute));

                // Plain-text removal of unwanted content (no regex support here).
                if (this.parentForm.txtReplace.Text != "")
                {
                    tempstr = tempstr.Replace(this.parentForm.txtReplace.Text, "");
                }

                // Optionally download every referenced file. The src values in the
                // content are not rewritten to the local copies.
                if (this.parentForm.IsDownloadImage.Checked)
                {
                    for (Match imgMc = s_srcAttribute.Match(tempstr); imgMc.Success; imgMc = imgMc.NextMatch())
                    {
                        DownloadFile(imgMc.Groups["image"].Value);
                    }
                }
                return tempstr;
            }
            catch (Exception)
            {
                return "";
            }
        }

        /// <summary>
        /// Removes every section that starts with <paramref name="openTag"/> and ends with
        /// the next <paramref name="closeTag"/>; stops at the first unterminated section.
        /// </summary>
        private static string StripSections(string text, string openTag, string closeTag)
        {
            while (true)
            {
                int openAt = text.IndexOf(openTag);
                if (openAt < 0)
                {
                    return text;
                }
                int closeAt = text.IndexOf(closeTag, openAt);
                if (closeAt < 0)
                {
                    return text; // unterminated: leave the remainder untouched
                }
                text = text.Substring(0, openAt) + text.Substring(closeAt + closeTag.Length);
            }
        }

        /// <summary>
        /// Match evaluator for <c>s_srcAttribute</c>: returns the attribute with a
        /// relative value resolved against <see cref="url"/>, or the original text when the
        /// value already has a scheme or cannot be resolved.
        /// </summary>
        private string MakeSrcAbsolute(Match m)
        {
            string image = m.Groups["image"].Value;
            if (s_uriScheme.IsMatch(image))
            {
                return m.Value;
            }
            try
            {
                return "src=\"" + ResolveUrl(this.url, image) + "\"";
            }
            catch (UriFormatException)
            {
                return m.Value;
            }
        }

        /// <summary>
        /// Downloads one referenced file into the <c>ArticleContentImageFile</c> folder
        /// under the application start-up path. Failures are written to the console.
        /// </summary>
        /// <param name="filename">URL of the file; a relative value is resolved against <see cref="url"/>.</param>
        private void DownloadFile(string filename)
        {
            if (filename == "")
            {
                return;
            }
            string path = Application.StartupPath + "\\ArticleContentImageFile\\";
            try
            {
                if (!Directory.Exists(path))
                {
                    Directory.CreateDirectory(path);
                }
                Uri source = new Uri(new Uri(this.url), filename);
                // Name the local copy after the last path segment (query string excluded).
                string localName = Path.GetFileName(source.LocalPath);
                if (localName == "")
                {
                    return;
                }
                // Copy the bytes unchanged; decoding binary data as text corrupts it.
                using (WebClient client = new WebClient())
                {
                    client.DownloadFile(source.AbsoluteUri, path + localName);
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }

        /// <summary>
        /// Downloads a page and decodes it with the encoding selected in
        /// <c>Form1.encoding</c> ("utf-8", otherwise gb2312).
        /// </summary>
        /// <param name="contenturl">Address of the page; <c>&amp;amp;</c> is decoded to <c>&amp;</c> first.</param>
        /// <returns>The page text, or an empty string when the download fails.</returns>
        private string getWebContent(string contenturl)
        {
            string str = "";
            // Links copied out of HTML carry the entity form of '&'.
            contenturl = contenturl.Replace("&amp;", "&");
            using (WebClient client = new WebClient())
            {
                client.Headers.Add("Accept", "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*");
                client.Headers.Add("Accept-Language", "zh-cn");
                client.Headers.Add("UA-CPU", "x86");
                client.Headers.Add("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
                try
                {
                    byte[] buffer = client.DownloadData(contenturl);
                    if (Form1.encoding == "utf-8")
                    {
                        str = System.Text.Encoding.GetEncoding("utf-8").GetString(buffer, 0, buffer.Length);
                    }
                    else
                    {
                        str = System.Text.Encoding.GetEncoding("gb2312").GetString(buffer, 0, buffer.Length);
                    }
                }
                catch (Exception)
                {
                    // Deliberate: callers treat an empty result as "download failed".
                }
            }
            return str;
        }
    }

    /// <summary>
    /// One saved site definition, as shown in the site list.
    /// </summary>
    public class MyItem : object
    {
        public int id;
        public string name;
        public string regex;
        public string url;
        public string starttag;
        public string endtag;
        public string adstarttag;
        public string adendtag;
        public string pageReg;

        /// <summary>Returns <see cref="name"/>, the text displayed for this entry.</summary>
        public override string ToString()
        {
            return name;
        }
    }
}
原文地址:http://www.cnblogs.com/yesun/archive/2006/06/26/431304.html

浙公网安备 33010602011771号