package task;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Extracts the rows of an HTML table from a locally saved web page and writes
 * them to a tab-separated text file.
 *
 * <p>Both file locations are hard-coded relative paths, resolved against the
 * current working directory.
 */
public class Crawler {
    // 1. File locations. The separators are Windows-style backslashes.
    private static final String outputfile = "output\\2018年世界各国GDP数据.txt";
    private static final String sourcefile = "resource\\2018年世界各国GDP数据 .htm";

    /** Number of leading cells read from every table row. */
    private static final int COLUMN_COUNT = 5;

    /**
     * 2. Parses the local HTML page.
     *
     * @return the parsed document
     * @throws IOException if the source file cannot be found or read
     */
    public static Document getHtml() throws IOException {
        File f = new File(sourcefile);
        // The second argument is the character encoding of the page.
        return Jsoup.parse(f, "UTF-8");
    }

    /**
     * 3. Extracts the table rows from the document.
     *
     * <p>Rows with fewer than {@value #COLUMN_COUNT} child elements are skipped.
     *
     * @param doc the parsed page
     * @return one {@code EntityBean} per accepted row, in document order
     * @throws IOException never thrown by this implementation; declared for
     *         backward compatibility with existing callers
     */
    public static ArrayList<EntityBean> getData(Document doc) throws IOException {
        // CSS selector that addresses the rows of the data table.
        Elements trs = doc.select("body > div.container > div.container > div:nth-child(5) > div.col-md-9 > div > div > div > table > tbody > tr");
        ArrayList<EntityBean> list = new ArrayList<EntityBean>();
        for (Element tr : trs) {
            // Count ELEMENT children only. childNodeSize() also counts the
            // whitespace text nodes between cells, so it can report >= 5 for a
            // row with fewer than five cells, which would then fail on get(i).
            Elements cells = tr.children();
            if (cells.size() < COLUMN_COUNT) {
                continue;
            }
            // Read the first five cells of the row.
            String pm = cells.get(0).text();
            String gj = cells.get(1).text();
            String szz = cells.get(2).text();
            String GDP = cells.get(3).text();
            String zb = cells.get(4).text();
            // One bean per row.
            list.add(new EntityBean(pm, gj, szz, GDP, zb));
        }
        return list;
    }

    /**
     * 4. Writes the beans to the output file, one tab-separated line per bean.
     *
     * <p>The file is written as UTF-8, matching the encoding used to read the
     * source page, instead of the platform default charset. The writer is
     * closed even if writing fails part-way.
     *
     * @param ebs the rows to write
     * @throws FileNotFoundException if the output file cannot be created or
     *         opened, for example because its directory does not exist
     */
    public static void wirteToFile(ArrayList<EntityBean> ebs) throws FileNotFoundException {
        try (PrintWriter pw = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(outputfile), StandardCharsets.UTF_8))) {
            for (EntityBean eb : ebs) {
                pw.println(eb.getPm() + "\t" + eb.getGj() + "\t" + eb.getSzz()
                        + "\t" + eb.getGDP() + "\t" + eb.getZb());
            }
        }
    }

    /**
     * 5. Entry point: parse the page, print its title, extract the rows and
     * write them to the output file.
     *
     * @param args unused
     * @throws IOException if the source cannot be read or the output cannot be opened
     */
    public static void main(String[] args) throws IOException {
        Document doc = Crawler.getHtml();
        System.out.println("网页标题:" + doc.title());
        ArrayList<EntityBean> entitys = Crawler.getData(doc);
        Crawler.wirteToFile(entitys);
    }
}
package task;
/**
 * Mutable value holder for one table row: five string fields with plain
 * getters and setters.
 *
 * <p>NOTE(review): the field names look like pinyin abbreviations, presumably
 * pm = rank, gj = country, szz = continent, zb = share. Confirm against the
 * source table.
 */
public class EntityBean {
    // Row fields, in the same order as the constructor arguments.
    private String pm;
    private String gj;
    private String szz;
    private String GDP;
    private String zb;

    /** Creates a bean with every field left {@code null}. */
    public EntityBean() {
    }

    /**
     * Creates a bean with every field set.
     *
     * @param pm  value for {@code pm}
     * @param gj  value for {@code gj}
     * @param szz value for {@code szz}
     * @param gDP value for {@code GDP}
     * @param zb  value for {@code zb}
     */
    public EntityBean(String pm, String gj, String szz, String gDP, String zb) {
        this.pm = pm;
        this.gj = gj;
        this.szz = szz;
        this.GDP = gDP;
        this.zb = zb;
    }

    /** @return the current {@code pm} value */
    public String getPm() {
        return this.pm;
    }

    /** @param pm the new {@code pm} value */
    public void setPm(String pm) {
        this.pm = pm;
    }

    /** @return the current {@code gj} value */
    public String getGj() {
        return this.gj;
    }

    /** @param gj the new {@code gj} value */
    public void setGj(String gj) {
        this.gj = gj;
    }

    /** @return the current {@code szz} value */
    public String getSzz() {
        return this.szz;
    }

    /** @param szz the new {@code szz} value */
    public void setSzz(String szz) {
        this.szz = szz;
    }

    /** @return the current {@code GDP} value */
    public String getGDP() {
        return this.GDP;
    }

    /** @param gDP the new {@code GDP} value */
    public void setGDP(String gDP) {
        this.GDP = gDP;
    }

    /** @return the current {@code zb} value */
    public String getZb() {
        return this.zb;
    }

    /** @param zb the new {@code zb} value */
    public void setZb(String zb) {
        this.zb = zb;
    }

    /**
     * Renders all five fields in declaration order; a {@code null} field is
     * shown as the text {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("EntityBean [pm=");
        text.append(pm);
        text.append(", gj=").append(gj);
        text.append(", szz=").append(szz);
        text.append(", GDP=").append(GDP);
        text.append(", zb=").append(zb);
        text.append("]");
        return text.toString();
    }
}