java实现爬虫

package org.jsoup.examples;

import java.io.BufferedWriter;

import java.io.File;

import java.io.FileWriter;

import java.io.IOException;

import java.util.ArrayList;

import java.util.List;

import org.jsoup.Jsoup;

import org.jsoup.helper.StringUtil;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.nodes.Node;

import org.jsoup.nodes.TextNode;

import org.jsoup.select.NodeTraversor;

import org.jsoup.select.NodeVisitor;

/**

* HTML to plain-text. This example program demonstrates the use of jsoup to convert HTML input to lightly-formatted

* plain-text. That is divergent from the general goal of jsoup's .text() methods, which is to get clean data from a

* scrape.

* <p/>

* Note that this is a fairly simplistic formatter -- for real world use you'll want to embrace and extend.

* @author Jonathan Hedley, jonathan@hedley.net

public class HtmlToPlainText {

private static int i=1;

public static void main(String... args) throws IOException {

// Validate.isTrue(args.length == 1, "usage: supply url to fetch");

// String url = "http://www.16788.cn/3d/3Dlshaoma.asp?page="+i;

String url="http://zhidao.baidu.com/question/72080439.html";

Document doc = Jsoup.connect(url).get();

String need="";

HtmlToPlainText formatter = new HtmlToPlainText();

String plainText = formatter.getPlainText(doc);

FileWriter writer=new FileWriter(new File("d:/采集数据.txt"),true);

writer.append(plainText);

writer.append("\r\n");

writer.append("########################################################"+"\r\n");

writer.close();

System.out.println(plainText);

// fetch the specified URL and parse to a HTML DOM

// getData(url);

// for(String tx:txts){

// System.out.println(tx);

// }

}

private static void getData(String url2) throws IOException {

i--;

Document doc = Jsoup.connect(url2).get();

String need="";

HtmlToPlainText formatter = new HtmlToPlainText();

String plainText = formatter.getPlainText(doc);

String str=plainText.substring(plainText.indexOf("福彩3D历史开奖数据开奖期号"),plainText.indexOf("一共有")).replaceAll("详情请看", "");

String[] ss=str.split(" ");

List<String> c3d=new ArrayList<String>();

for(int i=0;i<ss.length;i++){

if(ss[i].trim().length()==3){

c3d.add(ss[i].trim());

if(ss[i].trim().equals("636")){

c3d.add("333");

}

File file=new File("D://cp.txt");

for(int i=0;i<c3d.size();i++){

if(i%2!=0){

BufferedWriter writer=new BufferedWriter(new FileWriter(file,true));

need=c3d.get(i);

writer.write(need.charAt(0)+" "+need.charAt(1)+" "+need.charAt(2)+"\r");

writer.close();

}

// if(file.exists()){

// file.delete();

// }

System.err.println("第"+i+"页数据读取成功");

String next="http://www.16788.cn/3d/3Dlshaoma.asp?page="+i;

if(i>0){

System.out.println(next);

System.out.println(i);

getData(next);

try {

new Thread().sleep(10000);

} catch (InterruptedException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

System.out.println("读取网页数据完毕");

}

/**

* Format an Element to plain-text

* @param element the root element to format

* @return formatted text

public String getPlainText(Element element) {

FormattingVisitor formatter = new FormattingVisitor();

NodeTraversor traversor = new NodeTraversor(formatter);

traversor.traverse(element); // walk the DOM, and call .head() and .tail() for each node

return formatter.toString();

}

// the formatting rules, implemented in a breadth-first DOM traverse

private class FormattingVisitor implements NodeVisitor {

private static final int maxWidth = 80;

private int width = 0;

private StringBuilder accum = new StringBuilder(); // holds the accumulated text

// hit when the node is first seen

public void head(Node node, int depth) {

String name = node.nodeName();

if (node instanceof TextNode)

append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM.

else if (name.equals("li"))

append("\n * ");

}

// hit when all of the node's children (if any) have been visited

public void tail(Node node, int depth) {

String name = node.nodeName();

if (name.equals("br"))

append("\n");

else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5"))

append("\n\n");

else if (name.equals("a"))

append(String.format(" <%s>", node.absUrl("href")));

}

// appends text to the string builder with a simple word wrap method

private void append(String text) {

if (text.startsWith("\n"))

width = 0; // reset counter if starts with a newline. only from formats above, not in natural text

if (text.equals(" ") &&

(accum.length() == 0 || StringUtil.in(accum.substring(accum.length() - 1), " ", "\n")))

return; // don't accumulate long runs of empty spaces

if (text.length() + width > maxWidth) { // won't fit, needs to wrap

String words[] = text.split("\\s+");

for (int i = 0; i < words.length; i++) {

String word = words[i];

boolean last = i == words.length - 1;

if (!last) // insert a space if not the last word

word = word + " ";

if (word.length() + width > maxWidth) { // wrap and reset counter

accum.append("\n").append(word);

width = word.length();

} else {

accum.append(word);

width += word.length();

}

} else { // fits as is, without need to wrap text

accum.append(text);

width += text.length();

}

public String toString() {

return accum.toString();

}

对应项目

http://www.cnblogs.com/hxy520/admin/Files.aspx/WebHarvestTest.rar

posted @ 2013-02-01 18:09 6小贝阅读(331) 评论(0) 收藏举报

刷新页面返回顶部

6小贝

java实现爬虫

公告