/**
 * @author Jack.Wang
 *
 */
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// A simple searching web crawler
public class SearchCrawler implements Runnable {

    /*
     * disallowListCache caches the URLs that robots are not allowed to visit.
     * Under the robots exclusion protocol, a web site places a robots.txt file
     * in its root directory that specifies which pages on the site are off
     * limits to crawlers. A crawler should skip those areas while it searches.
     * Example robots.txt:
     *
     * # robots.txt for http://somehost.com/
     * User-agent:
     * Disallow: /cgi-bin/
     * Disallow: /registration # Disallow robots on registration page
     * Disallow: /login
     */

    private HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();
    ArrayList<String> errorList = new ArrayList<String>(); // error messages
    ArrayList<String> result = new ArrayList<String>(); // search results found so far
    String startUrl; // starting point of the search
    int maxUrl; // maximum number of URLs to process
    String searchString; // string to search for (in English)
    boolean caseSensitive = false; // whether matching is case-sensitive
    boolean limitHost = false; // whether to restrict the search to one host

    public SearchCrawler(String startUrl, int maxUrl, String searchString) {
        this.startUrl = startUrl;
        this.maxUrl = maxUrl;
        this.searchString = searchString;
    }

    public ArrayList<String> getResult() {
        return result;
    }

    public void run() { // entry point for the search thread
        crawl(startUrl, maxUrl, searchString, limitHost, caseSensitive);
    }

    // Verify the URL format.
    private URL verifyUrl(String url) {
        // Only handle HTTP URLs; other schemes (including https) are rejected.
        if (!url.toLowerCase().startsWith("http://"))
            return null;
        URL verifiedUrl = null;
        try {
            verifiedUrl = new URL(url);
        } catch (Exception e) {
            return null;
        }
        return verifiedUrl;
    }

    // Check whether robots are allowed to access the given URL.
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase(); // host of the given URL
        // System.out.println("host=" + host);

        // Look up the cached list of disallowed paths for this host.
        ArrayList<String> disallowList = disallowListCache.get(host);

        // If it is not cached yet, download robots.txt and cache the result.
        if (disallowList == null) {
            disallowList = new ArrayList<String>();
            try {
                URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(robotsFileUrl.openStream()));

                // Read the robots file and build the list of disallowed paths.
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.indexOf("Disallow:") == 0) { // line starts with "Disallow:"
                        String disallowPath = line.substring("Disallow:"
                                .length()); // extract the disallowed path

                        // Strip any trailing comment.
                        int commentIndex = disallowPath.indexOf("#");
                        if (commentIndex != -1) {
                            disallowPath = disallowPath.substring(0,
                                    commentIndex);
                        }

                        disallowPath = disallowPath.trim();
                        disallowList.add(disallowPath);
                    }
                }
                reader.close();

                // Cache the disallowed paths for this host.
                disallowListCache.put(host, disallowList);
            } catch (Exception e) {
                return true; // no robots.txt at the site root, so allow access
            }
        }

        String file = urlToCheck.getFile();
        // System.out.println("getFile()=" + file);
        for (int i = 0; i < disallowList.size(); i++) {
            String disallow = disallowList.get(i);
            if (file.startsWith(disallow)) {
                return false;
            }
        }

        return true;
    }

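    // Worked example of the check above (illustrative values): if robots.txt on
    // somehost.com contains "Disallow: /cgi-bin/" and "Disallow: /login", then
    // disallowListCache maps "somehost.com" to ["/cgi-bin/", "/login"], and a
    // URL whose getFile() is "/cgi-bin/search?q=x" is rejected because it
    // starts with "/cgi-bin/", while "/index.html" is allowed.
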
    // Download the page at the given URL and return its contents as one string.
    private String downloadPage(URL pageUrl) {
        try {
            // Open connection to URL for reading.
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    pageUrl.openStream()));

            // Read page into buffer.
            String line;
            StringBuilder pageBuffer = new StringBuilder();
            while ((line = reader.readLine()) != null) {
                pageBuffer.append(line);
            }
            reader.close();

            return pageBuffer.toString();
        } catch (Exception e) {
            // Any download error is treated as "page unavailable".
        }

        return null;
    }

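    // Note: downloadPage() concatenates the lines without their line breaks and
    // returns null on any error, which crawl() below treats as "skip this page".
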
    // Remove "www" from a URL.
    private String removeWwwFromUrl(String url) {
        int index = url.indexOf("://www.");
        if (index != -1) {
            return url.substring(0, index + 3) + url.substring(index + 7);
        }

        return (url);
    }

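    // For example (illustrative URL): "http://www.somehost.com/page.html"
    // becomes "http://somehost.com/page.html"; URLs without a leading "www."
    // are returned unchanged.
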
    // Parse the page and extract its links.
    private ArrayList<String> retrieveLinks(URL pageUrl, String pageContents,
            HashSet<String> crawledList, boolean limitHost) {
        // Compile a regular expression that matches anchor links.
        Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",
                Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(pageContents);

        ArrayList<String> linkList = new ArrayList<String>();
        while (m.find()) {
            String link = m.group(1).trim();

            if (link.length() < 1) {
                continue;
            }

            // Skip links that point back into the same page.
            if (link.charAt(0) == '#') {
                continue;
            }

            if (link.indexOf("mailto:") != -1) {
                continue;
            }

            if (link.toLowerCase().indexOf("javascript") != -1) {
                continue;
            }

            if (link.indexOf("://") == -1) {
                // Rebuild "host[:port]"; getPort() returns -1 when no explicit
                // port was given, in which case it must be left out.
                int port = pageUrl.getPort();
                String hostPort = pageUrl.getHost()
                        + (port == -1 ? "" : ":" + port);

                if (link.charAt(0) == '/') { // link relative to the site root
                    link = "http://" + hostPort + link;
                } else {
                    String file = pageUrl.getFile();
                    if (file.indexOf('/') == -1) { // relative link, no directory part
                        link = "http://" + hostPort + "/" + link;
                    } else { // relative link, resolved against the page's directory
                        String path = file.substring(0,
                                file.lastIndexOf('/') + 1);
                        link = "http://" + hostPort + path + link;
                    }
                }
            }

            int index = link.indexOf('#');
            if (index != -1) {
                link = link.substring(0, index);
            }

            link = removeWwwFromUrl(link);

            URL verifiedLink = verifyUrl(link);
            if (verifiedLink == null) {
                continue;
            }

            /* If the crawl is limited to one host, exclude URLs on other hosts. */
            if (limitHost
                    && !pageUrl.getHost().toLowerCase().equals(
                            verifiedLink.getHost().toLowerCase())) {
                continue;
            }

            // Skip links that have already been processed.
            if (crawledList.contains(link)) {
                continue;
            }

            linkList.add(link);
        }

        return (linkList);
    }

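    // Example of the link resolution above (illustrative values): on the page
    // "http://somehost.com/docs/index.html", href="a.html" resolves to
    // "http://somehost.com/docs/a.html", href="/faq.html" resolves to
    // "http://somehost.com/faq.html", while "#top", "mailto:" and "javascript:"
    // links are skipped.
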
    // Search the downloaded page contents to decide whether the page contains
    // the specified search string.
    private boolean searchStringMatches(String pageContents,
            String searchString, boolean caseSensitive) {
        String searchContents = pageContents;
        if (!caseSensitive) { // fold the page to lower case for case-insensitive matching
            searchContents = pageContents.toLowerCase();
        }

        // Split the search string on whitespace; every term must be present.
        Pattern p = Pattern.compile("[\\s]+");
        String[] terms = p.split(searchString);
        for (int i = 0; i < terms.length; i++) {
            if (caseSensitive) {
                if (searchContents.indexOf(terms[i]) == -1) {
                    return false;
                }
            } else {
                if (searchContents.indexOf(terms[i].toLowerCase()) == -1) {
                    return false;
                }
            }
        }

        return true;
    }

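    // For example, with searchString "java crawler" (illustrative) the page
    // must contain both "java" and "crawler" somewhere; the terms need not be
    // adjacent, and matching is case-insensitive unless caseSensitive is true.
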
    // Perform the actual crawl and search.
    public ArrayList<String> crawl(String startUrl, int maxUrls,
            String searchString, boolean limithost, boolean caseSensitive) {

        HashSet<String> crawledList = new HashSet<String>();
        LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();

        if (maxUrls < 1) {
            errorList.add("Invalid Max URLs value.");
            System.out.println("Invalid Max URLs value.");
        }

        if (searchString.length() < 1) {
            errorList.add("Missing Search String.");
            System.out.println("Missing Search String.");
        }

        if (errorList.size() > 0) {
            System.out.println("err!!!");
            return errorList;
        }

        // Remove "www" from the start URL.
        startUrl = removeWwwFromUrl(startUrl);

        toCrawlList.add(startUrl);
        while (toCrawlList.size() > 0) {

            if (maxUrls != -1) {
                if (crawledList.size() == maxUrls) {
                    break;
                }
            }

            // Get the URL at the head of the list.
            String url = toCrawlList.iterator().next();

            // Remove the URL from the to-crawl list.
            toCrawlList.remove(url);

            // Convert the string URL to a URL object.
            URL verifiedUrl = verifyUrl(url);

            // Skip the URL if it is malformed or not an HTTP URL.
            if (verifiedUrl == null) {
                continue;
            }

            // Skip the URL if robots are not allowed to access it.
            if (!isRobotAllowed(verifiedUrl)) {
                continue;
            }

            // Add the processed URL to crawledList.
            crawledList.add(url);
            String pageContents = downloadPage(verifiedUrl);

            if (pageContents != null && pageContents.length() > 0) {
                // Extract the valid links from the page.
                ArrayList<String> links = retrieveLinks(verifiedUrl,
                        pageContents, crawledList, limithost);

                toCrawlList.addAll(links);

                if (searchStringMatches(pageContents, searchString,
                        caseSensitive)) {
                    result.add(url);
                    System.out.println(url);
                }
            }

        }
        return result;
    }

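    // The crawl above is breadth-first in effect: toCrawlList is a
    // LinkedHashSet, so URLs are visited in the order they were first
    // discovered, duplicates are ignored, and the loop stops once maxUrls
    // pages have been processed.
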
    // Main method.
    public static void main(String[] args) {
        SearchCrawler crawler = new SearchCrawler(
                "http://www.blogjava.net/Jack2007/", 20, "jack");
        Thread search = new Thread(crawler);
        System.out.println("Start searching...");
        System.out.println("result:");
        search.start();
        try {
            search.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
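
// Usage sketch (assuming the file is saved as SearchCrawler.java):
//   javac SearchCrawler.java
//   java SearchCrawler
// main() above starts the crawl at http://www.blogjava.net/Jack2007/, visits
// at most 20 pages, and prints every URL whose page contains "jack".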