import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.net.*;
import java.io.*;
public class Crawler {

    /**
     * Matches the href attribute value of an anchor tag. Compiled once and
     * reused because Pattern compilation is relatively expensive.
     * NOTE: the original class was ["|>], which also matched a literal '|'
     * (inside [] the '|' is not alternation); it is dropped here.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\s+href\\s*=\\s*\"?(.*?)[\">]", Pattern.CASE_INSENSITIVE);

    /** Crawl frontier: URLs discovered but not yet downloaded, in FIFO order. */
    ArrayList<String> unVisitedURL = new ArrayList<String>();

    /** URLs already downloaded; guards against crawling the same page twice. */
    HashSet<String> VisitedURL = new HashSet<String>();

    /**
     * Downloads the page at {@code url} and returns its text content with
     * line breaks preserved.
     *
     * @param url the absolute URL to fetch
     * @return the page content, or {@code null} if the download failed
     */
    public String downloadURL(String url) {
        try {
            URL pageUrl = new URL(url);
            StringBuilder pageBuffer = new StringBuilder();
            // try-with-resources closes the stream even on read errors
            // (the original leaked the reader on every call).
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(pageUrl.openStream()))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // Keep the line separator: the original fused adjacent
                    // lines together, corrupting the saved page text.
                    pageBuffer.append(line).append('\n');
                }
            }
            return pageBuffer.toString();
        } catch (Exception e) {
            System.out.println("Fail to download the page named " + url);
            return null;
        }
    }

    /**
     * Extracts crawlable links from {@code html}. Absolute {@code http:}
     * links are returned as-is; root-relative links (starting with '/') are
     * resolved against {@code baseUrl}. Fragment-only ("#"), mailto and
     * javascript links are skipped.
     *
     * @param html    the page markup to scan
     * @param baseUrl prefix used to absolutize root-relative links
     * @return extracted links in document order (may contain duplicates)
     */
    static List<String> extractLinks(String html, String baseUrl) {
        List<String> links = new ArrayList<String>();
        Matcher m = LINK_PATTERN.matcher(html);
        while (m.find()) {
            String link = m.group(1).trim();
            if (link.isEmpty() || link.equals("#")
                    || link.contains("mailto") || link.contains("javascript")) {
                continue;
            }
            if (link.startsWith("/")) {
                // Root-relative link: the original required the "http:"
                // prefix first, which made this resolution unreachable.
                links.add(baseUrl + link);
            } else if (link.startsWith("http:")) {
                links.add(link);
            }
        }
        return links;
    }

    /**
     * Scans downloaded page {@code s} for links and queues every new one
     * (neither visited nor already queued) onto the frontier.
     * The original's {@code link != null && link != ""} guard used reference
     * comparison and never filtered anything; extractLinks drops empties.
     */
    private void get_reexp(String s, String url) {
        for (String link : extractLinks(s, url)) {
            // Also check the frontier itself so the same URL is not queued
            // (and later downloaded) twice before it reaches VisitedURL.
            if (!VisitedURL.contains(link) && !unVisitedURL.contains(link)) {
                unVisitedURL.add(link);
            }
        }
    }

    /**
     * Writes {@code content} to a timestamp-named .txt file in the working
     * directory and logs the page size.
     */
    private void savePage(String content, String url) {
        // try-with-resources flushes and closes the writer; the original
        // never closed it, so saved files could end up empty or truncated.
        try (FileWriter bw = new FileWriter(
                new File(System.currentTimeMillis() + ".txt"))) {
            bw.write(content);
            System.out.println(url + "的大小是:" + content.length());
        } catch (IOException e) {
            System.out.println("Fail to save the page named " + url);
        }
    }

    /**
     * Crawls breadth-first until the frontier is empty. Iterative rather
     * than recursive: the original recursed once per page and eventually
     * threw when the frontier ran dry (and risked a stack overflow first).
     */
    public void graspWeb() {
        while (!unVisitedURL.isEmpty()) {
            String url = unVisitedURL.remove(0);
            if (VisitedURL.contains(url)) {
                continue;
            }
            VisitedURL.add(url);
            String content = downloadURL(url);
            if (content == null) {
                // Download failed; the original passed null onward and
                // crashed with a NullPointerException inside the matcher.
                continue;
            }
            get_reexp(content, url);
            savePage(content, url);
        }
    }

    /**
     * Entry point: seeds the frontier with a start URL and crawls from it.
     */
    public static void main(String[] args) throws Exception {
        Crawler myCrawler = new Crawler();
        myCrawler.unVisitedURL.add("http://sina.cn");
        myCrawler.graspWeb();
    }
}