爬虫作业
今天建民哥让我们自学爬虫,并完成对论文的爬取,如下
其实一开始在课上自己看的时候,并没有看得很明白,一直迷迷糊糊没搞懂。
后来找张可欣帮忙,才懂了怎么使用pc去爬取,然后做了这个系统。学完之后发现其实也挺简单的,嘿嘿。
eclipse代码如下:
bean包
bean.java
package bean; public class bean { private String zuozhe; private String title; private String date; public bean(String zuozhe, String title, String date) { this.zuozhe = zuozhe; this.title = title; this.date = date; } public String getZuozhe() { return zuozhe; } public void setZuozhe(String zuozhe) { this.zuozhe = zuozhe; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getDate() { return date; } public void setDate(String date) { this.date = date; } @Override public String toString() { return "bean{" + "zuozhe='" + zuozhe + '\'' + ", title='" + title + '\'' + ", date='" + date + '\'' + '}'; } }
result.java
package bean; public class result { private String name; private String title; private String date; private String author; private String booktitle; public result(String name, String title, String date, String author, String booktitle) { this.name = name; this.title = title; this.date = date; this.author = author; this.booktitle = booktitle; } public String getName() { return name; } public void setName(String name) { this.name = name; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getDate() { return date; } public void setDate(String date) { this.date = date; } public String getAuthor() { return author; } public void setAuthor(String author) { this.author = author; } public String getBooktitle() { return booktitle; } public void setBooktitle(String booktitle) { this.booktitle = booktitle;
dao包
package dao; import bean.*; import dbutil.dbutil; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; import static dbutil.dbutil.close; public class dao { public static List<result> show(String name) { Connection con = null; PreparedStatement ptst=null; List<result> re=new ArrayList<>(); try { con = dbutil.getconn(); } catch (SQLException e) { e.printStackTrace(); } // System.out.println(s); if( con!=null ){ // System.out.println("succeed"); try { String sql= "select name,author,title,booktitle,date from cvpr where name =?"; ptst = con.prepareStatement(sql); // System.out.println(ptst); ptst.setString(1,name); ResultSet rs=ptst.executeQuery(); while (rs.next()) { String name1=rs.getString(1); String anthor=rs.getString(2); String title=rs.getString(3); String booktitle=rs.getString(4); String date=rs.getString(5); result re1=new result(name1,title,date,anthor,booktitle); //System.out.println(s1); re.add(re1); } System.out.println(re); } catch (SQLException e) { e.printStackTrace(); } finally { close(ptst,con); } } return re; } public static List<result> show1(String author) { Connection con = null; PreparedStatement ptst=null; List<result> re=new ArrayList<>(); try { con = dbutil.getconn(); } catch (SQLException e) { e.printStackTrace(); } // System.out.println(s); if( con!=null ){ // System.out.println("succeed"); try { String sql= "select name,author,title,booktitle,date from cvpr where author =?"; ptst = con.prepareStatement(sql); // System.out.println(ptst); ptst.setString(1,author); ResultSet rs=ptst.executeQuery(); while (rs.next()) { String name1=rs.getString(1); String anthor=rs.getString(2); String title=rs.getString(3); String booktitle=rs.getString(4); String date=rs.getString(5); result re1=new result(name1,title,date,anthor,booktitle); //System.out.println(s1); re.add(re1); } System.out.println(re); } catch (SQLException e) { 
e.printStackTrace(); } finally { close(ptst,con); } } return re; }
dbutil包
package dbutil; import java.sql.*; public class dbutil { private static String URL="jdbc:mysql://localhost:3306/pachong?useSSL=true&useUnicode=true&characterEncoding=UTF-8"; //取得驱动程序 private static String DRIVER="com.mysql.jdbc.Driver"; //取得用户 private static String USER="root"; //登录密码 private static String PASSWORD="LCRZKA"; //静态代码块加载驱动类信息 static { try { Class.forName(DRIVER);//将"com.mysql.jdbc.Driver"类的Class类对象加载到运行时内存中 } catch (ClassNotFoundException e) { e.printStackTrace(); } } public static Connection getconn() throws SQLException { //1-注册驱动器,驱动管理器类加载SQLServerDriver类的静态方法,如果没有添加这个驱动,则创建这个驱动 // Connection con=null; // try { // Class.forName("com.microsoft.sqlserver.jdbc.SQLServerDriver"); // con=DriverManager.getConnection("jdbc:sqlserver://localhost:1433;database=students;encrypt=false", "sa", "ddjsdd"); // } catch (ClassNotFoundException e) { // e.printStackTrace(); // } // //会话连接 Connection conn = null; try { conn = DriverManager.getConnection(URL, USER, PASSWORD); } catch (SQLException e) { e.printStackTrace(); } return conn; } // public static void main(String[] args) throws SQLException { // Connection con=dbutil.getconn(); // System.out.println(con); // close(con); // } // //3-创建一个Statement对象,用于将SQL语句发送到数据库 // stmt = con.createStatement(); // // //4- SQL语句 // String SQL = "SELECT * FROM Student"; // // //5-执行SQL,返回数据 // rs = stmt.executeQuery(SQL); // // //6-遍历 // while (rs.next()) { // // System.out.println(rs.getString(1) + "," + rs.getString(2).trim()+"," + rs.getString(3)); // } public static void close(Connection con) { if(con!=null) try { con.close(); } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static void close (PreparedStatement preparedState, Connection conn) { if (preparedState != null) { try { preparedState.close(); } catch (SQLException e) { e.printStackTrace(); } } if (conn != null) { try { conn.close(); } catch (SQLException e) { e.printStackTrace(); } } } public static void close (ResultSet 
rs, PreparedStatement preparedState, Connection conn) { if (rs != null) { try { rs.close(); } catch (SQLException e) { e.printStackTrace(); } } if (preparedState != null) { try { preparedState.close(); } catch (SQLException e) { e.printStackTrace(); } } if (conn != null) { try { conn.close(); } catch (SQLException e) { e.printStackTrace(); } } } /** * 关闭连接 * @param state * @param conn */ public static void close (Statement state, Connection conn) { if (state != null) { try { state.close(); } catch (SQLException e) { e.printStackTrace(); } } if (conn != null) { try { conn.close(); } catch (SQLException e) { e.printStackTrace(); } } } public static void close (ResultSet rs, Statement state, Connection conn) { if (rs != null) { try { rs.close(); } catch (SQLException e) { e.printStackTrace(); } } if (state != null) { try { state.close(); } catch (SQLException e) { e.printStackTrace(); } } if (conn != null) { try { conn.close(); } catch (SQLException e) { e.printStackTrace(); } } } }
servlet包
package servlet; import java.io.IOException; import java.util.List; import javax.servlet.ServletException; import javax.servlet.annotation.WebServlet; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import dao.dao; import bean.*; @SuppressWarnings("serial") @WebServlet("/servelt") public class servlet extends HttpServlet { protected void service(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { req.setCharacterEncoding("utf-8"); String method = req.getParameter("method"); if("chaxun".equals(method)) { chaxun(req, resp); } } private static void chaxun(HttpServletRequest req, HttpServletResponse resp) throws IOException, ServletException { req.setCharacterEncoding("utf-8"); String name=req.getParameter("name"); String author=req.getParameter("zuozhe"); String date =req.getParameter("date"); System.out.println(date); bean be=new bean(author,name,date); System.out.println(be); List<result> re= dao.show(name); req.setAttribute("rs",re); req.getRequestDispatcher("2.jsp").forward(req, resp);
servlet1
package servlet; import java.io.IOException; import java.util.List; import javax.servlet.ServletException; import javax.servlet.annotation.WebServlet; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import dao.dao; import bean.*; @SuppressWarnings("serial") @WebServlet("/servelt1") public class servlet1 extends HttpServlet { protected void service(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { req.setCharacterEncoding("utf-8"); String method = req.getParameter("method"); if("chaxun".equals(method)) { chaxun(req, resp); } } private static void chaxun(HttpServletRequest req, HttpServletResponse resp) throws IOException, ServletException { req.setCharacterEncoding("utf-8"); String name=req.getParameter("name"); String author=req.getParameter("zuozhe"); String date =req.getParameter("date"); System.out.println(date); bean be=new bean(author,name,date); System.out.println(be); List<result> re= dao.show1(author); req.setAttribute("rs",re); req.getRequestDispatcher("result.jsp").forward(req, resp); } }
index.jsp
<%--
  Search page: lets the user pick a paper title and an author string and
  posts both to the "servelt" servlet with method=chaxun.
--%>
<%@ page contentType="text/html;charset=UTF-8" language="java" %>
<html>
<head>
    <title>查询界面</title>
    <style>
        body {
            background: url('image/88.jpg') no-repeat;
            background-size: 100% auto;
        }
        #login_box {
            width: 75%;
            height: 400px;
            background-color: #00000060;
            margin: auto;
            margin-top: 10%;
            text-align: center;
            border-radius: 10px;
            padding: 50px 50px;
        }
        #login_box input,
        #login_box button {
            outline: none;
        }
        #login_box h1 {
            color: #ffffff90;
            margin-top: 3%;
            font-size:300%;
            font-family:"宋体";
        }
        #login_box #form #input_box {
            margin-top: 5%;
        }
        #login_box #sign_up {
            margin-top: 8%;
        }
        #login_box #sign_up a {
            color: #FFFFFF;
        }
        /* NOTE(review): this selector matches no element on the page (the id
           is "login_box" and there is no descendant space before .aaa).
           Left unchanged so the layout stays as it is; confirm the intent
           before correcting it. */
        #Login_box.aaa{
            width: 20%;
            height: 50px;
            font-size: 14px;
            text-align: center;
            line-height: 50px;
        }
    </style>
</head>
<body>
<div id="login_box">
    <form action="servelt?method=chaxun" method="post">
        <h1>查询界面</h1>
        <div class="aaa"> 请选择论文题目:</div>
        <div class="aaa">
            <select name="name">
                <option value="IDA-3D: Instance-Depth-Aware 3D Object Detection From Stereo Vision for Autonomous Driving" >IDA-3D: Instance-Depth-Aware 3D Object Detection From Stereo Vision for Autonomous Driving</option>
                <option value="FroDO: From Detections to 3D Objects" >FroDO: From Detections to 3D Objects</option>
                <option value="KeypointNet: A Large-Scale 3D Keypoint Dataset Aggregated From Numerous Human Annotations" >KeypointNet: A Large-Scale 3D Keypoint Dataset Aggregated From Numerous Human Annotations</option>
                <option value="Bridging the Gap Between Anchor-Based and Anchor-Free Detection via Adaptive Training Sample Selection" >Bridging the Gap Between Anchor-Based and Anchor-Free Detection via Adaptive Training Sample Selection</option>
                <option value="Structure Aware Single-Stage 3D Object Detection From Point Cloud" >Structure Aware Single-Stage 3D Object Detection From Point Cloud</option>
            </select>
        </div>
        <br>
        <div class="bbb"> 请选择论文作者:</div>
        <div class="bbb">
            <select name="zuozhe">
                <option value="Peng, Wanli and Pan, Hao and Liu, He and Sun, Yi" >Peng, Wanli and Pan, Hao and Liu, He and Sun, Yi</option>
                <option value="Xiong, Peixi and Wu, Ying">Xiong, Peixi and Wu, Ying</option>
                <option value="Zhang, Shifeng and Chi, Cheng and Yao, Yongqiang and Lei, Zhen and Li, Stan Z." >Zhang, Shifeng and Chi, Cheng and Yao, Yongqiang and Lei, Zhen and Li, Stan Z.</option>
                <option value="He, Chenhang and Zeng, Hui and Huang, Jianqiang and Hua, Xian-Sheng and Zhang, Lei" >He, Chenhang and Zeng, Hui and Huang, Jianqiang and Hua, Xian-Sheng and Zhang, Lei</option>
            </select>
        </div>
        <br>
        <input type="submit" value="提交查询">
    </form>
</div>
</body>
</html>
result.jsp
<%@ page import="bean.result" %>
<%@ page import="java.util.List" %>
<%@ page contentType="text/html;charset=UTF-8" language="java" %>
<%--
  Result page: renders the rows stored in the request attribute "rs"
  (a List of bean.result) as one flex row per hit.
--%>
<%!
    /**
     * Escapes the five HTML-significant characters so that database values
     * are rendered as text and cannot inject markup. A null value is
     * rendered as an empty string.
     */
    private static String esc(String s) {
        if (s == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder(s.length() + 16);
        for (int k = 0; k < s.length(); k++) {
            char c = s.charAt(k);
            switch (c) {
                case '&':  sb.append("&amp;");  break;
                case '<':  sb.append("&lt;");   break;
                case '>':  sb.append("&gt;");   break;
                case '"':  sb.append("&quot;"); break;
                case '\'': sb.append("&#39;");  break;
                default:   sb.append(c);
            }
        }
        return sb.toString();
    }
%>
<html>
<head>
    <title>查询结果</title>
    <style type="text/css">
        body{
            background: url('image/88.jpg') no-repeat;
            /* vendor-prefixed fallbacks for older browsers */
            -webkit-background-size: cover;
            -o-background-size: cover;
            background-size: 100% auto;
        }
        .bottom-list .t-b-1{
            border: 1px solid #f0f0f0;
            width: 20%;
            height: 50px;
            font-size: 14px;
            text-align: center;
            line-height: 50px;
        }
        .bottom-list .t-b-2{
            border: 1px solid #f0f0f0;
            width: 20%;
            height: 300px;
            text-align: center;
            line-height: 50px;
        }
        #login_box {
            width: 75%;
            height: 400px;
            background-color: #00000060;
            margin: auto;
            margin-top: 10%;
            text-align: center;
            border-radius: 10px;
            padding: 50px 50px;
        }
        #login_box input,
        #login_box button {
            outline: none;
        }
        #login_box h1 {
            color: #ffffff90;
            margin-top: 3%;
            font-size:300%;
            font-family:"宋体";
        }
        #login_box #form #input_box {
            margin-top: 5%;
        }
        #login_box #sign_up {
            margin-top: 8%;
        }
        #login_box #sign_up a {
            color: #FFFFFF;
        }
    </style>
</head>
<body>
<div id="login_box">
    <%-- "rs" is absent when the page is opened directly; treat that as no rows. --%>
    <% List<result> list = (List<result>) request.getAttribute("rs"); %>
    <div class="bottom-list">
        <div style=" background:#f9f9f9;display: flex;justify-content: center;">
            <div class="t-b-1"> 编号</div>
            <div class="t-b-1"> 论文名称</div>
            <div class="t-b-1">作者</div>
            <div class="t-b-1">标题</div>
            <div class="t-b-1"> 发行期刊</div>
            <div class="t-b-1"> 发行日期</div>
        </div>
        <%-- One flex container per hit so that several results form separate rows. --%>
        <% if (list != null) {
               for (int i = 0; i < list.size(); i++) {
                   result row = list.get(i); %>
        <div style=" background:#fff;display: flex;justify-content: center;">
            <div class="t-b-2"><%= i + 1 %></div>
            <div class="t-b-2"> <%= esc(row.getName()) %></div>
            <div class="t-b-2"> <%= esc(row.getAuthor()) %></div>
            <div class="t-b-2"><%= esc(row.getTitle()) %></div>
            <div class="t-b-2"> <%= esc(row.getBooktitle()) %></div>
            <div class="t-b-2"> <%= esc(row.getDate()) %></div>
        </div>
        <%     }
           } %>
    </div>
    <div align="center">
        <a href="index.jsp">返回主页面</a>
    </div>
</div>
</body>
</html>
爬虫代码
"""Scrape paper metadata from the CVF open-access listing into MySQL.

Downloads one listing page, extracts name / pdf link / author / title /
booktitle for every paper with a regular expression, and inserts one row
per paper into the ``cvpr`` table of the local ``pachong`` database.
"""
import requests
from bs4 import BeautifulSoup
import re
import pymysql

url = 'https://openaccess.thecvf.com/CVPR2020?day=2021-06-18'

# One match per paper entry. re.S lets '.' span newlines because the title,
# the pdf link and the bibtex fields sit on different lines of the page.
obj1 = re.compile(r'<dt class="ptitle"><br>.*?.html">(?P<name>.*?)</a></dt>.*?'
                  r'\[<a href="(?P<pdf>.*?)">pdf</a>].*?'
                  r'author = {(?P<author>.*?)},<br>.*?'
                  r'title = {(?P<title>.*?)},<br>.*?'
                  r'booktitle = {(?P<booktitle>.*?)},<br>', re.S)

sql = 'INSERT INTO cvpr(`name`, pdf, author, title, booktitle, `date`) values(%s,%s,%s,%s,%s,%s)'

# Value written to the `date` column of every inserted row.
# NOTE(review): 20100618 does not match the CVPR2020 / 2021-06-18 in the URL -
# confirm which date is intended.
PAPER_DATE = 20100618


def fetch_listing():
    """Download the listing page and return its HTML text.

    Raises requests.HTTPError on a non-2xx response, so an error page is
    never parsed as if it were the listing.
    """
    response = requests.get(url, timeout=30)
    try:
        response.raise_for_status()
        return response.text
    finally:
        response.close()


def main():
    """Fetch the listing, insert every parsed paper, then report completion."""
    html = fetch_listing()

    # Connect to the database
    conn = pymysql.connect(host='localhost', user='root', password='LCRZKA',
                           database='pachong', charset='utf8', port=3306)
    try:
        # Create the cursor object
        cursor = conn.cursor()
        try:
            for it in obj1.finditer(html):
                # Best effort: commit row by row so that one failing insert
                # is reported and skipped without losing the other rows.
                try:
                    data = [it.group('name'), it.group('pdf'), it.group('author'),
                            it.group('title'), it.group('booktitle'), PAPER_DATE]
                    cursor.execute(sql, data)
                    conn.commit()
                except Exception as e:
                    print(e)
        finally:
            # Close the cursor
            cursor.close()
    finally:
        # Close the connection
        conn.close()
    print('over!!!')


if __name__ == '__main__':
    main()