制作CVPR 热词云(并爬取pdf地址 名称)
任务目标:
1.结对开发
2.完成论文的题目、摘要、关键词、原文链接四项内容爬取;
3.存储到本地数据库中
4.按照题目、关键词分类统计得到最热的十个领域方向
(完成的任务和目标有出入)
爬取的python 代码:

#!/usr/bin/python #这里是解释器位置和python版本 #-*- coding: utf-8 -*- #编码格式 """ @author: CuiXingYu @contact: a15931829662@163.com @software: PyCharm @file: CVPR.py @time: 2020/4/17 19:36 """ import re import requests import pymysql def get_context(url): """ params: url: link return: web_context """ web_context = requests.get(url) return web_context.text def get_conn(): """ 建立数据库连接 :return: """ conn=pymysql.connect( #本机IP地址 host='127.0.0.1', #数据库用户名 user='root', #密码 password='101032', #需要操作的数据库名称 db='db_database07', ) #cursor对象 可以进行sql语句执行 和 获得返回值 cursor=conn.cursor() return conn,cursor def close_conn(conn,cursor): """ 关闭连接 :param conn: 连接对象 :param cursor: cursor对象 :return: """ if cursor: cursor.close() if conn: conn.close() def get_name(): """ 获取论文的名字 url 地址 :return: """ conn,cursor=get_conn() url = 'http://openaccess.thecvf.com//CVPR2019.py' web_context = get_context(url) # find paper files ''' (?<=href=\"): 寻找开头,匹配此句之后的内容 .+: 匹配多个字符(除了换行符) ?pdf: 匹配零次或一次pdf (?=\">pdf): 以">pdf" 结尾 |: 或 ''' info = [] # link pattern: href="***_CVPR_2019_paper.pdf">pdf link_list = re.findall(r"(?<=href=\").+?pdf(?=\">pdf)|(?<=href=\').+?pdf(?=\">pdf)", web_context) # name pattern: <a href="***_CVPR_2019_paper.html">***</a> name_list = re.findall(r"(?<=2019_paper.html\">).+(?=</a>)", web_context) for one,two in zip(name_list,link_list): info.append([one,two]) # sql语句 对数据库进行操作 sql = "insert into paperinfo(name,url) values(%s,%s)" try: # 执行sql语句 cursor.executemany(sql,info) conn.commit() except: conn.rollback() close_conn(conn, cursor) def saveContent_list(hotword ,number): """ 插入数据库 :param hotword: 单词 :param number: 数量 :return: """ # 打开数据库连接(ip/数据库用户名/登录密码/数据库名) conn,cursor=get_conn() sql="insert into hotword (hotword,number) values (%s,%s)" val=(hotword,number) cursor.execute(sql,val) # 使用 fetchone() 方法获取数据. conn.commit() # 关闭数据库连接(别忘了) conn.close() def get_hotword(): """ 爬取热词并统计数目 :return: """ url = 'http://openaccess.thecvf.com//CVPR2019.py' web_context = get_context(url) name_list = re.findall(r"(?<=2019_paper.html\">).+(?=</a>)", web_context) text = " " for word in name_list: text = text + word word = text.split() word_dict = {} for w in word: if w not in word_dict: word_dict[w] = 1 else: word_dict[w] = word_dict[w] + 1 a = sorted(word_dict.items(), key=lambda item: item[1], reverse=True) # sql语句 对数据库进行操作 for x in a: try: word=x[0] num=x[1] saveContent_list(word,num) except: print("失败") get_hotword() get_name()
实现的Web界面:

1 <%@ page language="java" contentType="text/html; charset=UTF-8" 2 pageEncoding="UTF-8"%> 3 4 <%@taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%> 5 6 <%request.setCharacterEncoding("utf-8"); 7 response.setCharacterEncoding("utf-8");%> 8 <% 9 10 String path = request.getContextPath(); 11 String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/"; 12 %> 13 <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> 14 <html> 15 <head> 16 <meta charset="utf-8"> 17 <base href="<%=basePath%>"> 18 <!--// echarts CDN--> 19 <script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script> 20 <!--// 下载wordcloud.js文件 21 // https://github.com/ecomfe/echarts-wordcloud--> 22 <script src="js/worldcloud.js"></script> 23 <script src="js/jquery-3.4.1.min.js"></script> 24 </head> 25 <body> 26 <style> 27 html, body{ 28 width: 100%; 29 height: 100%; 30 margin: 0; 31 } 32 #main{ 33 width: 600px; 34 height: 500px; 35 background: rgba(70, 120, 200, 0.2) 36 } 37 a{ 38 text-decoration: none; 39 font-family: Segoe UI Light; 40 font-size: 20px; 41 } 42 </style> 43 <div id='main'></div> 44 <script> 45 var chart = echarts.init(document.getElementById('main')); 46 var postURL = "WordServlet"; 47 var mydata = new Array(); 48 $.ajaxSettings.async = false; 49 $.post(postURL, {}, function(rs) { 50 var dataList = JSON.parse(rs); 51 for (var i = 0; i < dataList.length; i++) { 52 var d = {}; 53 d['name'] = dataList[i].word; 54 d['value'] = dataList[i].number; 55 mydata.push(d); 56 } 57 }); 58 $.ajaxSettings.async = true; 59 var option = { 60 tooltip : {}, 61 series : [ { 62 type : 'wordCloud', 63 gridSize : 2, 64 sizeRange : [ 20, 50 ], 65 rotationRange : [ -90, 90 ], 66 shape : 'pentagon', 67 width : 800, 68 height : 600, 69 drawOutOfBound : false, 70 textStyle : { 71 normal : { 72 color : function() { 73 return 'rgb(' 74 + [ Math.round(Math.random() * 160), 75 Math.round(Math.random() * 160), 76 Math.round(Math.random() * 160) ] 77 .join(',') + ')'; 78 } 79 }, 80 emphasis : { 81 shadowBlur : 10, 82 shadowColor : '#333' 83 } 84 }, 85 data : mydata 86 } ] 87 }; 88 chart.setOption(option); 89 chart.on('click', function(params) { 90 var url = "ClickServlet?name=" + params.name; 91 window.location.href = url; 92 }); 93 94 95 </script> 96 <div> 97 <table class="table table-hover"> 98 <thead> 99 <tr> 100 <td style="font-size: 20px;">论文链接</td> 101 </tr> 102 </thead> 103 <tbody> 104 <c:forEach items="${list}" var="data" varStatus="vs"> 105 <tr> 106 <td><a href="${data.url}">${data.name}</a></td> 107 </tr> 108 </c:forEach> 109 </tbody> 110 </table> 111 </div> 112 113 </body> 114 </html>
效果展示:
热词爬取有问题,还未修改(羞愧)。多是代词 。。。。