双人合作---爬取CVPR论文
需求:
爬取官网: http://openaccess.thecvf.com/ICCV2019.py 论文,
爬取内容:论文标题,简介,摘要,热词(由于官网没有数据,之后自己统计),论文链接
存入数据库并生成一个热词云,并要求点击热词云上的链接能够访问包含该热词的所有论文
最终效果:


思路:
爬虫使用Python的requests模块抓取论文并导入数据库,然后对爬取到的文章的题目和摘要进行关键词分析,统计出词频最高的有效词汇,最后使用ECharts
显示热词云
源码:
爬虫:
import pymysql
import re
import requests
# 连接数据库函数
def insertCvpr(value):
    """Insert one paper record into the ``cvpr`` table.

    A new connection is opened for every call and is always closed before
    returning. Database errors are reported on stdout and are not re-raised.

    Args:
        value: Sequence ``(title, ab, hotword, pdf)`` bound, in that order,
            to the four placeholders of the INSERT statement.
    """
    db = None
    try:
        # Keyword arguments: PyMySQL 1.0 no longer accepts positional ones.
        db = pymysql.connect(
            host="localhost",
            user="root",
            password="root",
            database="jiaoli",
        )
        print("数据库连接成功!")
        sql = 'INSERT INTO cvpr(title,ab,hotword,pdf) VALUE (%s,%s,%s,%s)'
        with db.cursor() as cur:
            cur.execute(sql, value)
        db.commit()
        print("增加数据成功!")
    except pymysql.Error as e:
        print("增加数据失败: " + str(e))
        # ``db`` is still None when connect() itself failed; there is
        # nothing to roll back in that case.
        if db is not None:
            db.rollback()
    finally:
        if db is not None:
            db.close()
# Entry point: crawl the ICCV 2019 open-access index and store every paper.
url = "http://openaccess.thecvf.com/ICCV2019.py"
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36"}
# Same (connect, read) timeout as the per-paper requests so the script
# cannot hang forever on the index page.
res = requests.get(url, headers=headers, timeout=(3, 7))
res.encoding = "utf-8"
# First collect the relative URL of every paper's detail page.
web = re.findall(r"""<dt class="ptitle"><br><a href="(.*?)">.*?</a></dt>""", res.text, re.S)
for each in web:
    each = "http://openaccess.thecvf.com/" + each
    print(each)
    try:
        detail = requests.get(each, headers=headers, timeout=(3, 7))
        detail.raise_for_status()
    except requests.RequestException as e:
        # Skip this paper on a network/HTTP failure, but say why.
        print("闪过: " + str(e))
        continue
    detail.encoding = "utf-8"
    # Extract title, abstract and pdf link from the detail page.
    title = re.findall(r"""<div id="papertitle">(.*?)</div>""", detail.text, re.S)
    ab = re.findall(r"""<div id="abstract" >(.*?)</div>""", detail.text, re.S)
    pdf = re.findall(r"""\[<a href="\.\./\.\./(.*?)">pdf</a>\]""", detail.text, re.S)
    if not (title and ab and pdf):
        # Page layout did not match; nothing reliable to store.
        print("闪过")
        continue
    title = title[0].replace("\n", "")
    ab = ab[0].replace("\n", "")
    pdf = "http://openaccess.thecvf.com/" + pdf[0]
    print(title)
    # The hot-word column is left empty here; it is computed later.
    value = (title, ab, "", pdf)
    try:
        insertCvpr(value)
    except Exception as e:
        # Best effort: one failed insert must not abort the whole crawl.
        # ``Exception`` (not a bare except) keeps Ctrl-C working.
        print("闪过: " + str(e))
统计热词:
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.servlet.http.HttpSession;
//2.将删除改成类名
/**
 * Servlet mapped to {@code /input}.
 *
 * Reads every title and abstract from the {@code cvpr} table, counts the
 * words case-insensitively, stores the most frequent ones in the session
 * attributes {@code "c"} (words) and {@code "d"} (counts), and forwards to
 * {@code reciyun.jsp}.
 */
@WebServlet("/input")
public class input extends HttpServlet{
    private static final long serialVersionUID = 1L;

    // NOTE(review): the crawler in this write-up inserts into a database
    // named "jiaoli", while this URL targets "cvpr" -- confirm both sides
    // point at the same database.
    private static final String JDBC_URL = "jdbc:mysql://localhost:3306/cvpr?&useSSL=false&serverTimezone=UTC&useUnicode=yes&characterEncoding=utf8";

    /** Number of hot words published to the session. */
    private static final int TOP_WORD_COUNT = 30;

    /** Length of the published arrays; kept at 1000 so existing readers that index past 30 still work. */
    private static final int RESULT_SLOTS = 1000;

    /** Words shorter than this are never reported. */
    private static final int MIN_WORD_LENGTH = 4;

    /** Common words (lower case) that are never reported. */
    private static final Set<String> STOP_WORDS =
            new HashSet<String>(Arrays.asList("with", "that", "this", "from", "which"));

    /** A word, in the spelling of its first occurrence, and how often it occurred. */
    private static final class WordCount {
        final String word;
        int count = 1;

        WordCount(String word) {
            this.word = word;
        }
    }

    /**
     * @see HttpServlet#HttpServlet()
     */
    public input() {
        super();
    }

    /**
     * Computes the hot words and forwards to the word-cloud page.
     *
     * On a driver, connection or query failure a short message is written
     * to the response and the request is not forwarded.
     *
     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        request.setCharacterEncoding("UTF-8");
        response.setContentType("text/html;charset=UTF-8");
        HttpSession session = request.getSession();

        try {
            Class.forName("com.mysql.cj.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            response.getWriter().print("加载驱动失败");
            return;
        }

        Connection conn;
        try {
            conn = DriverManager.getConnection(JDBC_URL, "root", "root");
        } catch (SQLException e) {
            // Stop here: continuing would dereference a null connection.
            response.getWriter().print("连接数据库失败");
            return;
        }

        String corpus;
        try {
            corpus = readCorpus(conn);
        } catch (SQLException e) {
            response.getWriter().print("查找失败");
            return;
        } finally {
            closeQuietly(conn);
        }

        List<WordCount> ranked = rankWords(corpus);

        String[] c = new String[RESULT_SLOTS];
        int[] d = new int[RESULT_SLOTS];
        // Unused slots hold "" / 0 so readers never see null.
        Arrays.fill(c, "");
        int shown = Math.min(TOP_WORD_COUNT, ranked.size());
        for (int i = 0; i < shown; i++) {
            WordCount entry = ranked.get(i);
            System.out.println(entry.word + "出现次数为:" + entry.count);
            c[i] = entry.word;
            d[i] = entry.count;
        }

        session.setAttribute("c", c);
        session.setAttribute("d", d);
        request.getRequestDispatcher( "reciyun.jsp").forward(request,response);
    }

    /** Concatenates all titles and abstracts, separated by spaces so adjacent words do not fuse. */
    private static String readCorpus(Connection conn) throws SQLException {
        StringBuilder text = new StringBuilder();
        try (Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("select title, ab from cvpr")) {
            while (rs.next()) {
                appendField(text, rs.getString("title"));
                appendField(text, rs.getString("ab"));
            }
        }
        return text.toString();
    }

    /** Appends a column value followed by a space; SQL NULL values are skipped. */
    private static void appendField(StringBuilder text, String value) {
        if (value != null) {
            text.append(value).append(' ');
        }
    }

    /** Counts the reportable words in {@code text}, most frequent first; ties keep first-seen order. */
    private static List<WordCount> rankWords(String text) {
        // Insertion-ordered so the stable sort below breaks ties by first occurrence.
        Map<String, WordCount> counts = new LinkedHashMap<String, WordCount>();
        for (String token : text.split("[^a-zA-Z]+")) {
            if (token.length() < MIN_WORD_LENGTH) {
                continue;
            }
            String key = token.toLowerCase(Locale.ROOT);
            if (STOP_WORDS.contains(key)) {
                continue;
            }
            WordCount seen = counts.get(key);
            if (seen == null) {
                counts.put(key, new WordCount(token));
            } else {
                seen.count++;
            }
        }
        List<WordCount> ranked = new ArrayList<WordCount>(counts.values());
        Collections.sort(ranked, new Comparator<WordCount>() {
            @Override
            public int compare(WordCount left, WordCount right) {
                return Integer.compare(right.count, left.count);
            }
        });
        return ranked;
    }

    /** Closes the connection, ignoring a failure to close. */
    private static void closeQuietly(Connection conn) {
        try {
            conn.close();
        } catch (SQLException ignored) {
            // Nothing useful can be done if closing fails.
        }
    }
}
展示界面:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Document</title>
</head>
<body>
<%
    // Hot words ("c") and their counts ("d") read from the session.
    String[] num = (String[]) session.getAttribute("c");
    int[] num2 = (int[]) session.getAttribute("d");
    // Guard against this page being opened before the attributes exist.
    if (num == null) {
        num = new String[0];
    }
    if (num2 == null) {
        num2 = new int[0];
    }
    // At most 30 words are shown, and never more than both arrays hold.
    int shown = Math.min(30, Math.min(num.length, num2.length));
%>
<form action="tiaozhuan" method="post">
<div id = "main" style="width: 1200px;height: 800px;"></div>
</form>
<script type="text/javascript" src = "js/echarts.min.js"></script>
<script type="text/javascript" src = "js/echarts-wordcloud.min.js"></script>
<script type="text/javascript">
    var worldCloudcharts = echarts.init(document.getElementById('main'));
    var worldCloudoption = {
        title: {
            text: 'CVPR热词',
            x: 'center',
            textStyle: {
                fontSize: 23,
                color: '#FFFFFF'
            }
        },
        tooltip: {
            show: true
        },
        series: [{
            name: 'CVPR热词',
            type: 'wordCloud',
            sizeRange: [20, 130],
            rotationRange: [-45, 90],
            textPadding: 0,
            autoSize: {
                enable: true,
                minSize: 10
            },
            textStyle: {
                normal: {
                    // Random dark colour for every word.
                    color: function() {
                        return 'rgb(' + [
                            Math.round(Math.random() * 160),
                            Math.round(Math.random() * 160),
                            Math.round(Math.random() * 160)
                        ].join(',') + ')';
                    }
                },
                emphasis: {
                    shadowBlur: 10,
                    shadowColor: '#333'
                }
            },
            // Filled below from the server-side word list.
            data: []
        }]
    };
    // One entry per hot word; "url" is the page opened when the word is clicked.
    var JosnList = [];
<%
    for (int i = 0; i < shown; i++) {
        // Skip unused slots (empty word).
        if (num[i] == null || num[i].isEmpty()) {
            continue;
        }
%>
    JosnList.push({name: "<%=num[i]%>", value: <%=num2[i]%>, url: 'tiaozhuan?title=<%=num[i]%>'});
<%
    }
%>
    worldCloudoption.series[0].data = JosnList;
    worldCloudcharts.setOption(worldCloudoption);
    worldCloudcharts.on("click", function(e) {
        console.log(e);
        window.open(e.data.url);
    });
</script>
</body>
</html>
jsp
最后根据关键词在数据库中进行模糊查询,把相关文章信息返回到jsp中展示

浙公网安备 33010602011771号