Implementing a Web Crawler in Java
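
The crawler is split across three small classes: MyCrawler drives a breadth-first crawl, HtmlParserTool downloads, parses and saves pages, and URL pairs an address with the depth at which it was found. The code relies on Apache HttpClient and the HTMLParser library; if you build with Maven, the dependencies would look roughly like the sketch below (the version numbers are assumptions, use whatever versions you actually have on hand):

<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5</version> <!-- assumed version -->
</dependency>
<dependency>
    <groupId>org.htmlparser</groupId>
    <artifactId>htmlparser</artifactId>
    <version>2.1</version> <!-- assumed version -->
</dependency>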

MyCrawler.java

package WebCrawler;

import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;


public class MyCrawler {
    
    private static final String SAVEPATH = "C:"+File.separator+"downloadURL";

    public void crawl(ArrayList<URL> urls, int depth) {

        // Initialize the BFS queue with the seed URLs and remember which addresses have been queued
        Queue<URL> q = new LinkedList<URL>();
        HashSet<String> visited = new HashSet<String>();
        q.addAll(urls);
        for (URL u : urls) {
            visited.add(u.toString());
        }

        while (!q.isEmpty()) {

            URL head = q.poll();    // dequeue the next URL
            if (head.getDepth() > depth) {
                // BFS dequeues URLs in non-decreasing depth order, so everything left is too deep
                break;
            }
            String page = HtmlParserTool.getPage(head.toString());
            String charset = HtmlParserTool.getCharset(page);
            // Replace characters that are not allowed in file names (\ / : * ? " < > |) with '_'
            String urlFullPath = SAVEPATH + File.separator
                    + head.toString().replaceAll("[\\\\/:*?\"<>|]", "_") + ".html";
            HtmlParserTool.writeToDisk(urlFullPath, page, charset);    // save the page to disk
            ArrayList<String> toVisit = HtmlParserTool.extractLinks(page);

            for (String s : toVisit) {
                if (!visited.contains(s)) {
                    visited.add(s);
                    q.add(new URL(s, head.getDepth() + 1));
                }
            }

        }
    }
    
    public static void main(String[] args) throws Exception {

        ArrayList<URL> urls = new ArrayList<URL>();
        urls.add(new URL("http://www.baidu.com"));    // seed URL, starts at depth 1
        new MyCrawler().crawl(urls, 1);    // crawl up to depth 1, i.e. only the seed pages themselves
    }
}


HtmlParserTool.java

package WebCrawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.visitors.NodeVisitor;


public class HtmlParserTool {


    // Check whether a string looks like an absolute HTTP/HTTPS URL
    private static boolean isValidUrl(String url) {
        return url.startsWith("http://") || url.startsWith("https://");
    }
    
    
    // Extract the hyperlinks contained in a page
    public static ArrayList<String> extractLinks(String content){

        final ArrayList<String> links = new ArrayList<String>();
        Parser parser = null;
        NodeVisitor visitor = null;
        try {
            parser = new Parser(content);    
            visitor = new NodeVisitor() {
                
                @Override
                public void visitTag(Tag tag) {
                    if(tag instanceof LinkTag) {
                        LinkTag link = (LinkTag)tag;
                        String linkString = link.getLink();
                        if(isValidUrl(linkString) && !links.contains(linkString)) {
                            links.add(linkString);
                        }
                    }
                }
            };
            parser.visitAllNodesWith(visitor);
            
        } catch (Exception e) {
            e.printStackTrace();
        }
            
        return links;
    }
    
    
    // Extract the charset declared via charset="..."; returns null if no declaration is found
    public static String getCharset(String content) {
        int startIdx = content.indexOf("charset=\"");
        if (startIdx < 0) {
            return null;
        }
        startIdx += "charset=\"".length();
        int endIdx = content.indexOf("\"", startIdx);
        return endIdx < 0 ? null : content.substring(startIdx, endIdx);
    }
    
    // Download the content of a page as a string
    public static String getPage(String url) {

        CloseableHttpClient client = HttpClients.createDefault();
        HttpGet request = new HttpGet(url);
        String content = "";
        try {
            CloseableHttpResponse response = client.execute(request);
            //System.out.println("Response Code: " + response.getStatusLine().getStatusCode());

            // Read the raw response bytes so they can be decoded with the page's own charset
            byte[] raw = EntityUtils.toByteArray(response.getEntity());
            response.close();
            client.close();

            // Decode with the platform default charset first to locate a charset declaration,
            // then re-decode the raw bytes with the declared charset if one was found
            content = new String(raw);
            String charset = getCharset(content);
            if (charset != null) {
                content = new String(raw, charset);
            }

        } catch (Exception e) {
            e.printStackTrace();
        }

        return content;
    }
    
    // Write the page content to disk using the detected charset (or the platform default if unknown)
    public static void writeToDisk(String path, String content, String charset){

        try {
            File file = new File(path);
            file.getParentFile().mkdirs();    // make sure the target directory exists
            OutputStream o = new FileOutputStream(file);
            o.write(charset == null ? content.getBytes() : content.getBytes(charset));
            o.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
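
As a quick sanity check, extractLinks and getCharset can be exercised on an inline HTML string. The demo class below is not part of the original post, and the example URL in it is made up:

package WebCrawler;

import java.util.ArrayList;

public class HtmlParserToolDemo {

    public static void main(String[] args) {
        String html = "<html><head><meta charset=\"utf-8\"></head>"
                + "<body><a href=\"http://example.com/a\">A</a>"
                + "<a href=\"/relative\">ignored</a></body></html>";

        // Only absolute http/https links pass isValidUrl, so the relative link is filtered out
        ArrayList<String> links = HtmlParserTool.extractLinks(html);
        System.out.println(links);                            // expected: [http://example.com/a]
        System.out.println(HtmlParserTool.getCharset(html));  // expected: utf-8
    }
}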


URL.java

package WebCrawler;

public class URL {

    private String url;
    private int depth;
    
    public URL(String url) {
        this.url = url;
        this.depth = 1;
    }
    
    public URL(String url, int depth) {
        this.url = url;
        this.depth = depth;
    }
    
    public String toString() {
        return this.url;
    }
    
    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }
    
    public int getDepth() {
        return depth;
    }

    public void setDepth(int depth) {
        this.depth = depth;
    }
}
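
Note that this URL class is a custom class in the WebCrawler package, not java.net.URL, and it does not override equals or hashCode; that is why MyCrawler tracks visited pages by their string form rather than by URL objects. If you would rather store URL objects directly in a set, a minimal sketch of the two methods to add to the class (assuming identity should be based on the address alone, ignoring depth) could look like this:

    @Override
    public boolean equals(Object other) {
        // Two URL objects are equal when they wrap the same address string, regardless of depth
        if (this == other) {
            return true;
        }
        if (!(other instanceof URL)) {
            return false;
        }
        return this.url.equals(((URL) other).url);
    }

    @Override
    public int hashCode() {
        return url.hashCode();
    }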

