使用jsoup抓取网页数据并存入数据库
该项目的基本思路是:先用jsoup抓取网页数据并存入ArrayList,再通过JDBC把ArrayList中的数据写入数据库。
创建的是maven项目,需要在pom.xml中添加两个依赖:
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.38</version>
</dependency>
创建实体类:
package dao;
/**
 * Entity describing one scraped blog post.
 *
 * <p>All four properties are plain strings and may be {@code null} until they are set.
 */
public class Blog {
    /** Post title. */
    private String title;
    /** Link to the post. */
    private String href;
    /** Post author. */
    private String author;
    /** Read count, kept as text. */
    private String reads;

    /** Creates a post with every property left {@code null}. */
    public Blog() {
        this(null, null, null, null);
    }

    /**
     * Creates a fully populated post.
     *
     * @param title  post title
     * @param href   link to the post
     * @param author post author
     * @param reads  read count, as text
     */
    public Blog(String title, String href, String author, String reads) {
        this.title = title;
        this.href = href;
        this.author = author;
        this.reads = reads;
    }

    /** @return the post title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the post title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the link to the post */
    public String getHref() {
        return this.href;
    }

    /** @param href the link to the post */
    public void setHref(String href) {
        this.href = href;
    }

    /** @return the post author */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the post author */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the read count, as text */
    public String getReads() {
        return this.reads;
    }

    /** @param reads the read count, as text */
    public void setReads(String reads) {
        this.reads = reads;
    }

    /**
     * Renders the post as {@code Blog{title='..', href='..', author='..', reads='..'}}.
     * Unset properties are rendered as the text {@code null}.
     */
    @Override
    public String toString() {
        return String.format(
                "Blog{title='%s', href='%s', author='%s', reads='%s'}",
                title, href, author, reads);
    }
}
然后进行数据爬取操作和存数据库操作:
package control;
import dao.Blog;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
/**
 * Scrapes the post list from the cnblogs.com front page with jsoup and stores every
 * post in the MySQL table {@code content} through JDBC.
 */
public class Grab {
    /** The downloaded page; stays {@code null} until {@link #main} has fetched it successfully. */
    private static Document doc;

    /**
     * Downloads the page and, if that worked, parses and stores its posts.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // Page to scrape.
            doc = Jsoup.connect("https://www.cnblogs.com/").get();
        } catch (IOException e) {
            e.printStackTrace();
            // No document was fetched, so there is nothing to parse; continuing
            // would only fail with a NullPointerException on doc.
            return;
        }
        // Parse the fetched page and persist the result.
        BlogZhua();
    }

    /**
     * Extracts title, link, author and read count of every post item in the fetched
     * page and hands each one to {@link #AddBlog(Blog)}.
     *
     * <p>Does nothing except print a message when no page has been fetched yet.
     */
    public static void BlogZhua() {
        if (doc == null) {
            System.err.println("网页尚未抓取成功,无法解析!");
            return;
        }
        // One element per post on the page.
        Elements eles = doc.select("div#post_list>div.post_item");
        // Collect the scraped posts before writing them to the database.
        ArrayList<Blog> arrayList = new ArrayList<>();
        for (Element ele : eles) {
            // The title anchor carries both the title text and the post link,
            // so it is selected only once.
            Elements titleLink = ele.select("div.post_item_body>h3>a.titlelnk");
            String txt = titleLink.text();
            String href = titleLink.attr("href");
            // Author name.
            String author = ele.select("div.post_item_foot > a.lightblue").text();
            // Text of the article-view link, stored as the read count.
            String reads = ele.select("div.post_item_foot > span.article_view > a.gray").text();

            Blog blog = new Blog();
            blog.setTitle(txt);
            blog.setHref(href);
            blog.setAuthor(author);
            blog.setReads(reads);
            arrayList.add(blog);
        }
        // Insert every collected post into the database.
        for (Blog test : arrayList) {
            AddBlog(test);
        }
    }

    /**
     * Inserts one post into the {@code content} table.
     *
     * <p>A new connection is opened for the call and is always closed again, together
     * with the statement, whether or not the insert succeeds. SQL failures are reported
     * on the console and are not propagated to the caller.
     *
     * @param blog the post to store
     */
    public static void AddBlog(Blog blog) {
        try {
            // Load the MySQL driver.
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
        String sql = "INSERT INTO `content`(`title`, `href`, `author`, `reads`) VALUES (?,?,?,?)";
        // try-with-resources closes the statement and then the connection on every exit
        // path; without it each call would leak one connection.
        // NOTE(review): URL and credentials are hard-coded; consider external configuration.
        try (Connection connection = DriverManager.getConnection(
                     "jdbc:mysql://localhost:3306/jsoup?useSSL=true", "root", "1234");
             PreparedStatement pstmt = connection.prepareStatement(sql)) {
            pstmt.setString(1, blog.getTitle());
            pstmt.setString(2, blog.getHref());
            pstmt.setString(3, blog.getAuthor());
            pstmt.setString(4, blog.getReads());
            int result = pstmt.executeUpdate();
            // A positive row count means the row was inserted.
            if (result > 0) {
                System.out.println("数据添加成功!");
            }
        } catch (SQLException throwables) {
            throwables.printStackTrace();
            System.out.println("数据库访问失败!");
        }
    }
}
浙公网安备 33010602011771号