My first web spider; it has a small problem when handling links. The issue: the check link.indexOf("http:") == 0 runs before relative links are resolved, so any href starting with "/" is dropped before the line that was supposed to prepend the base URL can ever run.
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.net.*;
import java.io.*;

public class Crawler {

    // Queue of URLs still to crawl and a set of URLs already crawled.
    ArrayList<String> unVisitedURL = new ArrayList<String>();
    HashSet<String> VisitedURL = new HashSet<String>();

    public String downloadURL(String url) {
        try {
            URL pageURL = new URL(url);
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(pageURL.openStream()));
            String line;
            StringBuilder pageBuffer = new StringBuilder();
            while ((line = reader.readLine()) != null) {
                // readLine() strips the line terminator, so re-add it;
                // otherwise the whole page collapses onto one line.
                pageBuffer.append(line).append('\n');
            }
            reader.close();
            return pageBuffer.toString();
        } catch (Exception e) {
            System.out.println("Failed to download the page " + url);
        }
        return null;
    }

    private void get_reexp(String page, String url) {
        Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\">]",
                Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(page);
        while (m.find()) {
            String link = m.group(1).trim();
            if (link.isEmpty() || link.equals("#")
                    || link.indexOf("mailto") != -1
                    || link.indexOf("javascript") != -1) {
                continue;
            }
            // Resolve root-relative links BEFORE the http check. In the
            // original code, link.indexOf("http:") == 0 was tested first,
            // so links starting with "/" were always filtered out.
            if (link.startsWith("/")) {
                link = url + link; // only correct when url is the site root
            }
            if (link.startsWith("http") && !VisitedURL.contains(link)
                    && !unVisitedURL.contains(link)) {
                unVisitedURL.add(link);
            }
        }
    }

    private void savePage(String content, String url) {
        try {
            FileWriter writer = new FileWriter(
                    new File(System.currentTimeMillis() + ".txt"));
            writer.write(content);
            writer.close(); // without close()/flush() the file stays empty
            System.out.println("Size of " + url + " is: " + content.length());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public void graspWeb() {
        // A loop instead of the original unbounded recursion, which threw
        // NoSuchElementException once the queue emptied and risked a
        // StackOverflowError on long crawls.
        while (!unVisitedURL.isEmpty()) {
            String url = unVisitedURL.remove(0);
            if (VisitedURL.contains(url)) {
                continue;
            }
            VisitedURL.add(url);
            String content = downloadURL(url);
            if (content == null) {
                continue; // download failed; nothing to parse or save
            }
            get_reexp(content, url);
            savePage(content, url);
        }
    }

    public static void main(String[] args) throws Exception {
        Crawler myCrawler = new Crawler();
        myCrawler.unVisitedURL.add("http://sina.cn");
        myCrawler.graspWeb();
    }
}
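Note that string concatenation (url + link) only produces a correct URL when the base is the site root; for a page like http://sina.cn/news/index.html it would build a broken address. A more robust approach is the standard java.net.URL(URL context, String spec) constructor, which resolves absolute, root-relative, and page-relative hrefs uniformly. A minimal sketch below; the class and helper names (LinkResolver, resolveLink) are illustrative, not part of the original code:

import java.net.MalformedURLException;
import java.net.URL;

public class LinkResolver {

    // Resolve a possibly-relative href against the page it was found on.
    // Returns null for hrefs that cannot be parsed as URLs.
    static String resolveLink(String pageURL, String href) {
        try {
            URL base = new URL(pageURL);
            // URL(context, spec) handles absolute hrefs, root-relative
            // ones ("/sports"), and page-relative ones ("sub/page.html").
            return new URL(base, href).toString();
        } catch (MalformedURLException e) {
            return null;
        }
    }

    public static void main(String[] args) {
        // Prints http://sina.cn/news/index.html
        System.out.println(resolveLink("http://sina.cn/news/", "index.html"));
        // Prints http://sina.cn/sports
        System.out.println(resolveLink("http://sina.cn/news/", "/sports"));
    }
}

Swapping this resolution step into get_reexp (in place of the url + link concatenation) would let the crawler follow page-relative links as well, not just root-relative ones.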