抓取网页数据

package zhou;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * A handful of jsoup scraping routines. Every public method downloads one or
 * more pages over HTTP and prints what it extracts to standard output; none of
 * them returns a value.
 */
public class JsoupTest {

    /** Label that separates the name from the ID-card number in a thread title. */
    private static final String ID_CARD_LABEL = "身份证号";

    /**
     * Fetches the blog index page and prints, for every anchor found inside an
     * element whose {@code class} attribute equals {@code postTitle}, the link
     * target followed by the trimmed link text (one value per line).
     * An {@link IOException} raised by the download is printed, not rethrown.
     */
    public void article() {
        try {
            Document doc = Jsoup.connect("http://www.cnblogs.com/zyw-205520/").get();
            // Select elements by the exact value of their class attribute.
            Elements titleBlocks = doc.getElementsByAttributeValue("class", "postTitle");
            for (Element titleBlock : titleBlocks) {
                for (Element link : titleBlock.getElementsByTag("a")) {
                    String linkHref = link.attr("href");
                    String linkText = link.text().trim();
                    System.out.println(linkHref);
                    System.out.println(linkText);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches one specific blog post and prints the inner HTML of every element
     * whose {@code class} attribute equals {@code postBody}.
     * An {@link IOException} raised by the download is printed, not rethrown.
     */
    //@Test
    public void Blog() {
        try {
            Document doc = Jsoup
                    .connect("http://www.cnblogs.com/zyw-205520/archive/2012/12/20/2826402.html")
                    .get();
            for (Element postBody : doc.getElementsByAttributeValue("class", "postBody")) {
                System.out.println(postBody.html());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Walks forum list pages 1..132, pulls "name / ID-card number" pairs out of
     * the thread titles and prints one {@code insert into BLACKLIST_INFO}
     * statement per pair, followed by the total number of statements.
     *
     * <p>The output file {@code D:\tempSql.sql} is opened (and therefore created
     * or truncated) exactly as before, but it is now always closed. Writing the
     * collected script into it was disabled in the original code and stays
     * disabled here.
     *
     * <p>Any exception stops the crawl; the number of statements produced so far
     * and the stack trace are printed.
     */
    //@Test
    public void souLaoLai() {
        int count = 0;
        // try-with-resources: the original never closed the stream/writer.
        try (OutputStream os = new FileOutputStream("D:\\tempSql.sql");
             BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(os, "GBK"))) {
            // StringBuilder avoids the quadratic cost of String += inside the loop.
            StringBuilder allSql = new StringBuilder();
            System.out.println("开始");
            for (int i = 1; i <= 132; i++) {
                String phpUrl = "http://123.57.67.94:8080/lxth/forum.php?mod=forumdisplay&fid=36&page=" + i;
                Document doc = Jsoup.connect(phpUrl).timeout(5000).get();

                // getElementById returns null when the id is absent; skip such a
                // page instead of letting a NullPointerException end the crawl.
                Element wrapper = doc.getElementById("wp");
                Element threadList = wrapper == null ? null : wrapper.getElementById("threadlist");
                if (threadList == null) {
                    System.out.println("page " + i + ": thread list not found, skipped");
                    continue;
                }

                String titles = threadList.getElementsByTag("th").select("a").text().replace("New", "");
                // Element 0 is whatever precedes the first "姓名" label, so start at 1.
                String[] entries = titles.split("姓名");
                for (int j = 1; j < entries.length; j++) {
                    String entry = entries[j];
                    // "> 0" (not ">= 0"): an entry that starts with the label has no name.
                    if (entry.indexOf(ID_CARD_LABEL) > 0) {
                        String[] parts = entry.split(ID_CARD_LABEL);
                        // split drops trailing empty strings, so a title that ends
                        // with the label yields a single part: nothing to record.
                        if (parts.length < 2) {
                            continue;
                        }
                        String name = stripColons(parts[0]);
                        String cardid = stripColons(parts[1]);
                        String sql = buildInsertSql(name, cardid, phpUrl);
                        allSql.append(sql).append("\r\n");
                        count++;
                        System.out.println(sql);
                    }
                }
            }

            // Writing the script to the file is intentionally left disabled, as in
            // the original; enable with: bw.write(allSql.toString());
            System.out.println("共:" + count + "条");
        } catch (Exception e) {
            // Top-level boundary of the crawl: report progress and the failure.
            System.out.println(count);
            e.printStackTrace();
        }
    }

    /**
     * Walks black-list pages 1..164 and, for every element whose {@code class}
     * attribute equals {@code black_item}, prints four lines: name, identity,
     * phone and address, each taken from between two labels of the element text.
     * A field whose labels are missing is printed as an empty line instead of
     * raising {@link StringIndexOutOfBoundsException}.
     * An {@link IOException} raised by a download stops the loop and is printed.
     */
    //@Test
    public void xiaokeai() {
        try {
            for (int i = 1; i < 165; i++) {
                Document doc = Jsoup.connect("http://www.jiedai.cn/blacklist/" + i + ".html").get();
                // Select elements by the exact value of their class attribute.
                Elements items = doc.getElementsByAttributeValue("class", "black_item");
                for (Element item : items) {
                    // Compute the text once instead of once per indexOf/substring call.
                    String text = item.text().trim();
                    System.out.println(between(text, "姓名:", "手机:"));
                    System.out.println(between(text, "身份:", "地址:"));
                    System.out.println(between(text, "手机:", "身份:"));
                    System.out.println(between(text, "地址:", "申报情况:"));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the trimmed text located after the first {@code startLabel} and
     * before the next {@code endLabel}, or an empty string when either label is
     * missing.
     */
    private static String between(String text, String startLabel, String endLabel) {
        int labelPos = text.indexOf(startLabel);
        if (labelPos < 0) {
            return "";
        }
        int from = labelPos + startLabel.length();
        int to = text.indexOf(endLabel, from);
        if (to < 0) {
            return "";
        }
        return text.substring(from, to).trim();
    }

    /**
     * Removes every ':' and trims the result.
     * NOTE(review): the original called replace(":", "") twice in a row and had
     * two identical branches for page 11; one of the calls may originally have
     * targeted a full-width colon - confirm against the live page text.
     */
    private static String stripColons(String raw) {
        return raw.replace(":", "").trim();
    }

    /** Doubles single quotes so scraped text cannot terminate a SQL string literal. */
    private static String escapeSqlLiteral(String value) {
        return value.replace("'", "''");
    }

    /**
     * Builds the insert statement for one black-list entry. Every '&' in the
     * finished statement is doubled, as in the original code.
     */
    private static String buildInsertSql(String name, String cardid, String sourceUrl) {
        String sql = "insert into BLACKLIST_INFO (ID, BLACKLIST_USER, CARD_ID, PROVINCE, ADDRESS, WORK_ADDRESS, USER_PHONE, LOAN_DATE, PAY_DATE, OVER_TIME, " +
                "UPDATE_DATE, STATISTICAL_DATE, OVER_DAYS, OVER_NUMBER, OVER_MONEY, PAY_CAPITAL, OVER_TYPE, DATA_SOURCE, LOAN_CLIENT, LOAN_TYPE," +
                " NETWORK_LINK, REMARK1, REMARK2, EMAIL, LELIEVE, CREATE_DEPT_ID, CREATE_DEPT_NAME, CREATE_NAME_ID, CREATE_NAME, SOURCE_TYPE)" +
                " values (SEQ_BLACKLIST_INFO.Nextval, '" + escapeSqlLiteral(name) + "', '" + escapeSqlLiteral(cardid) + "', null, null, null, null, null, null, null, to_date(to_char(sysdate,'dd-mm-yyyy hh24:mi:ss'), 'dd-mm-yyyy hh24:mi:ss'), null, null, null, null, null, '0', 3, '老赖网', '不详', '" + sourceUrl + "', null, null, null, null, null, null,null,  null, null);";
        return sql.replace("&", "&&");
    }

}

 

posted @ 2015-01-26 16:00  也许还年轻  阅读(178)  评论(0)    收藏  举报