Java实现爬虫

package org.jsoup.examples;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
/**
 * HTML to plain-text. This example program demonstrates the use of jsoup to convert HTML input to lightly-formatted
 * plain-text. That is divergent from the general goal of jsoup's .text() methods, which is to get clean data from a
 * scrape.
 * <p/>
 * Note that this is a fairly simplistic formatter -- for real world use you'll want to embrace and extend.
 *
 * @author Jonathan Hedley, jonathan@hedley.net
 */
public class HtmlToPlainText {
private static int i=1;
    public static void main(String... args) throws IOException {
//        Validate.isTrue(args.length == 1, "usage: supply url to fetch");
//       String url = "http://www.16788.cn/3d/3Dlshaoma.asp?page="+i;
    String url="http://zhidao.baidu.com/question/72080439.html";
    Document doc = Jsoup.connect(url).get();
    String need="";
        HtmlToPlainText formatter = new HtmlToPlainText();
        String plainText = formatter.getPlainText(doc);
        FileWriter writer=new FileWriter(new File("d:/采集数据.txt"),true);
        writer.append(plainText);
        writer.append("\r\n");
        writer.append("########################################################"+"\r\n");
        writer.close();
        System.out.println(plainText);
        // fetch the specified URL and parse to a HTML DOM
//       getData(url);
//       for(String tx:txts){
//       System.out.println(tx);
//       }
    }
    private static void getData(String url2) throws IOException {
    i--;
    Document doc = Jsoup.connect(url2).get();
    String need="";
         HtmlToPlainText formatter = new HtmlToPlainText();
         String plainText = formatter.getPlainText(doc);
         String str=plainText.substring(plainText.indexOf("福彩3D历史开奖数据 开奖期号"),plainText.indexOf("一共有")).replaceAll("详情请看", "");
         String[] ss=str.split(" ");
         List<String> c3d=new ArrayList<String>();
         for(int i=0;i<ss.length;i++){
        if(ss[i].trim().length()==3){
        c3d.add(ss[i].trim());
        if(ss[i].trim().equals("636")){
        c3d.add("333");
        }
        }
         }
         File file=new File("D://cp.txt");
         for(int i=0;i<c3d.size();i++){
        if(i%2!=0){
         BufferedWriter writer=new BufferedWriter(new FileWriter(file,true));
                need=c3d.get(i);
                writer.write(need.charAt(0)+"   "+need.charAt(1)+"   "+need.charAt(2)+"\r");
                writer.close();
        }
         }
         
//         if(file.exists()){
//         file.delete();
//         }
         System.err.println("第"+i+"页数据读取成功");
         String next="http://www.16788.cn/3d/3Dlshaoma.asp?page="+i;
        if(i>0){
        System.out.println(next);
        System.out.println(i);
         getData(next);
         try {
new Thread().sleep(10000);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
        }
         System.out.println("读取网页数据完毕");
}
/**
     * Format an Element to plain-text
     * @param element the root element to format
     * @return formatted text
     */
    public String getPlainText(Element element) {
        FormattingVisitor formatter = new FormattingVisitor();
        NodeTraversor traversor = new NodeTraversor(formatter);
        traversor.traverse(element); // walk the DOM, and call .head() and .tail() for each node
        return formatter.toString();
    }
    // the formatting rules, implemented in a breadth-first DOM traverse
    private class FormattingVisitor implements NodeVisitor {
        private static final int maxWidth = 80;
        private int width = 0;
        private StringBuilder accum = new StringBuilder(); // holds the accumulated text
        // hit when the node is first seen
        public void head(Node node, int depth) {
            String name = node.nodeName();
            if (node instanceof TextNode)
                append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM.
            else if (name.equals("li"))
                append("\n * ");
        }
        // hit when all of the node's children (if any) have been visited
        public void tail(Node node, int depth) {
            String name = node.nodeName();
            if (name.equals("br"))
                append("\n");
            else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5"))
                append("\n\n");
            else if (name.equals("a"))
                append(String.format(" <%s>", node.absUrl("href")));
        }
        // appends text to the string builder with a simple word wrap method
        private void append(String text) {
            if (text.startsWith("\n"))
                width = 0; // reset counter if starts with a newline. only from formats above, not in natural text
            if (text.equals(" ") &&
                    (accum.length() == 0 || StringUtil.in(accum.substring(accum.length() - 1), " ", "\n")))
                return; // don't accumulate long runs of empty spaces
            if (text.length() + width > maxWidth) { // won't fit, needs to wrap
                String words[] = text.split("\\s+");
                for (int i = 0; i < words.length; i++) {
                    String word = words[i];
                    boolean last = i == words.length - 1;
                    if (!last) // insert a space if not the last word
                        word = word + " ";
                    if (word.length() + width > maxWidth) { // wrap and reset counter
                        accum.append("\n").append(word);
                        width = word.length();
                    } else {
                        accum.append(word);
                        width += word.length();
                    }
                }
            } else { // fits as is, without need to wrap text
                accum.append(text);
                width += text.length();
            }
        }
        public String toString() {
            return accum.toString();
        }
    }
}

 

对应项目

http://www.cnblogs.com/hxy520/admin/Files.aspx/WebHarvestTest.rar 

posted @ 2013-02-01 18:09  6小贝  阅读(331)  评论(0)    收藏  举报