搞垂直搜索搞了这么些天,现在已经把我们学院的网页全部爬完了,速度非常快,用时不超过 30 秒。








1
using System;
2
using System.Collections.Generic;
3
using System.ComponentModel;
4
using System.Data;
5
using System.Drawing;
6
using System.Text;
7
using System.Windows.Forms;
8
using System.Web.Security;
9
using System.IO;
10
using System.Net;
11
using System.Text.RegularExpressions;
12
13
namespace ie
14
{
15
public partial class Form1 : Form
16
{
17
/// <summary>
/// Creates the form. All setup is delegated to <c>InitializeComponent</c>,
/// which is not defined in this part of the partial class.
/// </summary>
public Form1()
{
    this.InitializeComponent();
}
22
23
/// <summary>
/// Click handler that walks the ids 100 through 1750. For each id it hashes the
/// decimal id text with MD5, builds the page URL from the hash, and writes the
/// page to disk when <c>readText.equalS()</c> reports a non-zero result.
/// A message box is shown once every id has been tried.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    int pageId = 100;
    while (pageId <= 1750)
    {
        string idText = pageId.ToString();

        // The page address is derived from the MD5 hash of the id text.
        string hash = FormsAuthentication.HashPasswordForStoringInConfigFile(idText, "md5");

        readText page = new readText();
        page.setUrl("http://ie.wh.sdu.edu.cn/show." + hash + ".ie");

        // equalS() returns 0 when the page could not be read or a required
        // field is blank; such pages are skipped.
        if (page.equalS() != 0)
        {
            page.writeText(idText);
        }

        pageId++;
    }

    MessageBox.Show("完成!");
}
42
/// <summary>
/// Fetches a single page and extracts six text fields from its HTML with
/// regular expressions. The fields are kept in <c>text[0]</c>..<c>text[5]</c>:
/// title, author, date, time, view count and content, in that order.
/// </summary>
public class readText
{
    // The patterns never change, so they are built once for the whole process
    // instead of on every pReadIe call.
    private static readonly Regex[] extractHTML = new Regex[]
    {
        new Regex("<td class=\"title\" align=\"center\">(?<name>[^/]+)</td>"),
        new Regex(@"作者:[\s]+(?<name>[\S]+)"),
        new Regex("<td align=\"center\" class=\"adddatetime\">(?<data>[^ ]+)[\\s](?<time>[^ ]+)</td>"),
        new Regex(@" 浏览:[\s]+(?<num>[\d]+)[\s]+次</td>"),
        new Regex("<td align=\"left\" class=\"content\">(?<count>[\\w\\W]+)</td>[\\s]+</tr>[\\s]+<tr>[\\s]+<td align=")
    };

    private string[] text;
    private string url;

    /// <summary>Creates an instance with six empty fields and an empty URL.</summary>
    public readText()
    {
        text = new string[6];
        for (int i = 0; i < text.Length; i++)
        {
            text[i] = "";
        }
        url = "";
    }

    /// <summary>Sets the address of the page that <see cref="read"/> will download.</summary>
    /// <param name="urr">Page URL.</param>
    public void setUrl(string urr)
    {
        url = urr;
    }

    /// <summary>
    /// Downloads and parses the page, then reports whether it is usable.
    /// </summary>
    /// <returns>
    /// 0 when the download failed or any of the first five fields is blank;
    /// 1 otherwise. The content field (index 5) is not checked.
    /// </returns>
    public int equalS()
    {
        if (read() == -1)
        {
            return 0;
        }

        for (int i = 0; i < 5; i++)
        {
            if (text[i].Trim().Length == 0)
            {
                return 0;
            }
        }
        return 1;
    }

    /// <summary>
    /// Appends the URL followed by the six fields, one per line, to
    /// <c>d:\ie\&lt;str&gt;.txt</c>, creating the directory when needed.
    /// </summary>
    /// <param name="str">File name without extension.</param>
    public void writeText(String str)
    {
        StringBuilder output = new StringBuilder();
        output.Append(url).Append("\r\n");
        for (int i = 0; i <= 5; i++)
        {
            output.Append(text[i]).Append("\r\n");
        }

        // CreateDirectory does nothing when the directory already exists,
        // so no separate existence check is required.
        Directory.CreateDirectory("d:\\ie\\");
        File.AppendAllText("d:\\ie\\" + str + ".txt", output.ToString());
    }

    /// <summary>
    /// Downloads the page at the configured URL and feeds its text to
    /// <see cref="pReadIe"/>.
    /// </summary>
    /// <returns>
    /// 0 on success; -1 when the response has no stream or the request/read
    /// fails with a <see cref="WebException"/> or <see cref="IOException"/>.
    /// </returns>
    public int read()
    {
        StringBuilder sb = new StringBuilder();
        try
        {
            // url is the address of the page to fetch.
            WebRequest wreq = WebRequest.Create(url);

            // using blocks release the response and its stream on every path,
            // including the early return and exceptions.
            using (WebResponse wres = wreq.GetResponse())
            using (Stream stream = wres.GetResponseStream()) // the page content
            {
                if (stream == null)
                {
                    return -1;
                }

                // NOTE(review): Encoding.Default depends on the machine's code
                // page; confirm it matches the encoding the site serves.
                using (StreamReader sr = new StreamReader(stream, Encoding.Default))
                {
                    string rl;
                    // Lines are joined without separators, so line breaks in
                    // the page are dropped before matching.
                    while ((rl = sr.ReadLine()) != null)
                    {
                        sb.Append(rl);
                    }
                }
            }
        }
        catch (WebException)
        {
            // An unreachable or missing page is reported through the existing
            // failure code rather than aborting the caller's loop.
            return -1;
        }
        catch (IOException)
        {
            return -1;
        }

        pReadIe(sb.ToString());
        return 0;
    }

    /// <summary>
    /// Runs every extraction pattern over the page text and stores the captured
    /// groups in the field array. When a pattern matches several times, the
    /// last match wins; fields whose pattern does not match keep their value.
    /// </summary>
    /// <param name="sInput">Page text to scan.</param>
    public void pReadIe(String sInput)
    {
        for (int i = 0; i < extractHTML.Length; i++)
        {
            foreach (Match matchMade in extractHTML[i].Matches(sInput))
            {
                switch (i)
                {
                    case 0: text[0] = matchMade.Groups[1].Value; break;
                    case 1: text[1] = matchMade.Groups[1].Value; break;
                    case 2:
                        // One pattern yields both the date and the time.
                        text[2] = matchMade.Groups[1].Value;
                        text[3] = matchMade.Groups[2].Value;
                        break;
                    case 3: text[4] = matchMade.Groups[1].Value; break;
                    case 4: text[5] = matchMade.Groups[1].Value; break;
                }
            }
        }
    }
}
137
138
139
}
140
}
141
142
using System;2
using System.Collections.Generic;3
using System.ComponentModel;4
using System.Data;5
using System.Drawing;6
using System.Text;7
using System.Windows.Forms;8
using System.Web.Security;9
using System.IO;10
using System.Net;11
using System.Text.RegularExpressions;12

13
namespace ie14
{15
public partial class Form1 : Form16
{17
/// <summary>
/// Constructs the form; the only work done is the call to
/// <c>InitializeComponent</c>, whose definition is outside this listing.
/// </summary>
public Form1()
{
    this.InitializeComponent();
}

23
/// <summary>
/// Crawls the pages for ids 100..1750 when the button is clicked: the id text
/// is MD5-hashed, the hash is placed into the page URL, and each page for which
/// <c>readText.equalS()</c> does not return 0 is saved under the id's name.
/// Shows a message box after the last id.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    for (int id = 100; id <= 1750; id++)
    {
        string idText = id.ToString();
        string md5Hex = FormsAuthentication.HashPasswordForStoringInConfigFile(idText, "md5");
        string pageUrl = "http://ie.wh.sdu.edu.cn/show." + md5Hex + ".ie";

        readText reader = new readText();
        reader.setUrl(pageUrl);

        // A result of 0 means the page is unusable and is skipped.
        bool usable = reader.equalS() != 0;
        if (usable)
        {
            reader.writeText(idText);
        }
    }

    MessageBox.Show("完成!");
}
/// <summary>
/// Downloads one page and extracts six fields from it by regular expression:
/// title, author, date, time, view count and content, stored in that order
/// in <c>text[0]</c> through <c>text[5]</c>.
/// </summary>
public class readText
{
    // Built once and shared by all instances; the original rebuilt all five
    // patterns on every call to pReadIe.
    private static readonly Regex TitlePattern =
        new Regex("<td class=\"title\" align=\"center\">(?<name>[^/]+)</td>");
    private static readonly Regex AuthorPattern =
        new Regex(@"作者:[\s]+(?<name>[\S]+)");
    private static readonly Regex DateTimePattern =
        new Regex("<td align=\"center\" class=\"adddatetime\">(?<data>[^ ]+)[\\s](?<time>[^ ]+)</td>");
    private static readonly Regex ViewsPattern =
        new Regex(@" 浏览:[\s]+(?<num>[\d]+)[\s]+次</td>");
    private static readonly Regex ContentPattern =
        new Regex("<td align=\"left\" class=\"content\">(?<count>[\\w\\W]+)</td>[\\s]+</tr>[\\s]+<tr>[\\s]+<td align=");

    private string[] text;
    private string url;

    /// <summary>Initialises all six fields and the URL to empty strings.</summary>
    public readText()
    {
        text = new string[] { "", "", "", "", "", "" };
        url = "";
    }

    /// <summary>Stores the URL of the page to download.</summary>
    /// <param name="urr">Page URL.</param>
    public void setUrl(string urr)
    {
        url = urr;
    }

    /// <summary>
    /// Reads the page and checks that the first five fields were all found.
    /// </summary>
    /// <returns>
    /// 1 when the page was read and fields 0..4 are all non-blank; 0 when the
    /// read failed or one of them is blank. Field 5 (content) is not examined.
    /// </returns>
    public int equalS()
    {
        int status = read();
        if (status == -1)
        {
            return 0;
        }

        for (int i = 0; i < 5; i++)
        {
            if (text[i].Trim().Equals(""))
            {
                return 0;
            }
        }
        return 1;
    }

    /// <summary>
    /// Appends the URL and the six fields, each terminated by CR LF, to the
    /// file <c>d:\ie\&lt;str&gt;.txt</c>.
    /// </summary>
    /// <param name="str">File name without extension.</param>
    public void writeText(String str)
    {
        // string.Join places "\r\n" between the fields; the trailing one is
        // added explicitly so the output matches one-field-per-line exactly.
        string body = url + "\r\n" + string.Join("\r\n", text) + "\r\n";

        // CreateDirectory is a no-op for an existing directory.
        Directory.CreateDirectory("d:\\ie\\");
        File.AppendAllText("d:\\ie\\" + str + ".txt", body);
    }

    /// <summary>
    /// Downloads the page text and passes it to <see cref="pReadIe"/>.
    /// </summary>
    /// <returns>0 on success, -1 when the page could not be downloaded.</returns>
    public int read()
    {
        string page = download();
        if (page == null)
        {
            return -1;
        }

        pReadIe(page);
        return 0;
    }

    // Returns the page text with line breaks removed, or null when there is no
    // response stream or the request/read fails.
    private string download()
    {
        try
        {
            // url is the address of the page to fetch.
            WebRequest request = WebRequest.Create(url);

            // Dispose the response and stream on every exit path; the original
            // only closed the response on the success path.
            using (WebResponse response = request.GetResponse())
            using (Stream stream = response.GetResponseStream()) // the page content
            {
                if (stream == null)
                {
                    return null;
                }

                // NOTE(review): Encoding.Default follows the machine's code
                // page; verify it matches what the server sends.
                using (StreamReader reader = new StreamReader(stream, Encoding.Default))
                {
                    StringBuilder joined = new StringBuilder();
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        joined.Append(line);
                    }
                    return joined.ToString();
                }
            }
        }
        catch (WebException)
        {
            // Treat an unreachable or missing page as "nothing to read" so one
            // bad URL does not abort the caller.
            return null;
        }
        catch (IOException)
        {
            return null;
        }
    }

    /// <summary>
    /// Applies each extraction pattern to the page text. For a pattern with
    /// several matches the last match is kept; a pattern with no match leaves
    /// its field(s) untouched.
    /// </summary>
    /// <param name="sInput">Page text to scan.</param>
    public void pReadIe(String sInput)
    {
        foreach (Match m in TitlePattern.Matches(sInput))
        {
            text[0] = m.Groups[1].Value;
        }
        foreach (Match m in AuthorPattern.Matches(sInput))
        {
            text[1] = m.Groups[1].Value;
        }
        foreach (Match m in DateTimePattern.Matches(sInput))
        {
            // Date and time come from the same table cell.
            text[2] = m.Groups[1].Value;
            text[3] = m.Groups[2].Value;
        }
        foreach (Match m in ViewsPattern.Matches(sInput))
        {
            text[4] = m.Groups[1].Value;
        }
        foreach (Match m in ContentPattern.Matches(sInput))
        {
            text[5] = m.Groups[1].Value;
        }
    }
}

138

139
}140
}141

142

