Personal Assignment 2
Personal assignment: crawling data and operating on it
I picked up web scraping on the fly during class and wrote a script that tries to crawl the Weibo hot-search list and save it to a txt file.
import requests
from bs4 import BeautifulSoup

cookies = {
    'PC_TOKEN': '460f44babc',
    'SUB': '_2AkMVITIXf8NxqwJRmPAUyGvgb4R_yArEieKjfcPMJRMxHRl-yT8XqmAstRB6PqEc-zaoslPVckFYC5nECECC2Gh_Bt8z',
    'SUBP': '0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFVkyBaH9Q3UvYbjxH2Mg80',
    '_s_tentry': 'passport.weibo.com',
    'Apache': '8055727688366.35.1652407589169',
    'SINAGLOBAL': '8055727688366.35.1652407589169',
    'ULV': '1652407589186:1:1:1:8055727688366.35.1652407589169:',
}
headers = {
    'authority': 's.weibo.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'max-age=0',
    # Requests sorts cookies= alphabetically
    # 'cookie': 'PC_TOKEN=460f44babc; SUB=_2AkMVITIXf8NxqwJRmPAUyGvgb4R_yArEieKjfcPMJRMxHRl-yT8XqmAstRB6PqEc-zaoslPVckFYC5nECECC2Gh_Bt8z; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFVkyBaH9Q3UvYbjxH2Mg80; _s_tentry=passport.weibo.com; Apache=8055727688366.35.1652407589169; SINAGLOBAL=8055727688366.35.1652407589169; ULV=1652407589186:1:1:1:8055727688366.35.1652407589169:',
    'referer': 'https://passport.weibo.com/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="101", "Microsoft Edge";v="101"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-site',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39',
}
params = {
    'cate': 'realtimehot',
}
# Fetch the page; the cookies and headers above come from a logged-in browser
# request (without a valid cookie, Weibo redirects to the login page).
response = requests.get('https://s.weibo.com/top/summary', params=params, cookies=cookies, headers=headers)
response.encoding = 'utf-8'
# Parse the page. Each hot-search entry is an <a> tag matched by this CSS
# selector (copied from the browser's "Copy selector" feature):
selector = "#pl_top_realtimehot > table > tbody > tr > td.td-02 > a"
main_page = BeautifulSoup(response.text, 'html.parser')
# select() returns every tag matching the CSS selector; by contrast,
# find(tag, attrs={...}) returns only the first match and find_all() returns
# all matches, e.g. main_page.find("div", attrs={"class": "TypeList"}).
a = main_page.select(selector)
# Clean the data: keep only the text of each link.
for i in range(len(a)):
    a[i] = a[i].text
    print(a[i])
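The script above only prints the entries; saving them to a txt file, as mentioned at the top, takes one more step. A minimal sketch, assuming the output filename hot_search.txt (the filename is not in the original code):

# Minimal sketch: persist the scraped entries to a txt file, one per line.
# 'hot_search.txt' is an assumed filename; the original post does not show
# this step even though it mentions saving to a txt file.
with open('hot_search.txt', 'w', encoding='utf-8') as f:
    # Each element of a is already plain text after the cleaning loop above.
    for line in a:
        f.write(line + '\n')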
The second part crawls CVPR 2020 paper metadata from the open-access site and stores it in MySQL. Crawling code:
import requests
import re
import pymysql

url = 'https://openaccess.thecvf.com/CVPR2020?day=2020-06-18'
response = requests.get(url)
# One regex with named groups pulls the paper name, the pdf link, and the
# author/title/booktitle fields out of each paper entry on the page.
obj1 = re.compile(r'<dt class="ptitle"><br>.*?.html">(?P<name>.*?)</a></dt>.*?'
                  r'\[<a href="(?P<pdf>.*?)">pdf</a>].*?'
                  r'author = {(?P<author>.*?)},<br>.*?'
                  r'title = {(?P<title>.*?)},<br>.*?'
                  r'booktitle = {(?P<booktitle>.*?)},<br>', re.S)
result = obj1.finditer(response.text)
# Connect to the database
conn = pymysql.connect(host='localhost', user='root', password='123456', database='exercise', charset='utf8', port=3306)
# Create a cursor object
cursor = conn.cursor()
sql = 'INSERT INTO cvpr(`name`, pdf, author, title, booktitle, `date`) values(%s,%s,%s,%s,%s,%s)'
for it in result:
    try:
        data = [it.group('name'), it.group('pdf'), it.group('author'), it.group('title'), it.group('booktitle'), 20200618]
        cursor.execute(sql, data)
        conn.commit()
    except Exception as e:
        print(e)
response.close()
# Close the cursor
cursor.close()
# Close the connection
conn.close()
print('over!!!')
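The INSERT statement assumes a cvpr table already exists in the exercise database. A minimal sketch of a matching table definition, run through the same pymysql connection; the column names mirror the INSERT above, but the types and lengths are assumptions, not taken from the original assignment:

import pymysql

# Minimal sketch: create the `cvpr` table the INSERT above assumes exists.
# Types/lengths below are guesses; adjust them to the real data.
conn = pymysql.connect(host='localhost', user='root', password='123456', database='exercise', charset='utf8', port=3306)
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS cvpr (
        id INT AUTO_INCREMENT PRIMARY KEY,
        `name` VARCHAR(512),
        pdf VARCHAR(512),
        author TEXT,
        title VARCHAR(512),
        booktitle VARCHAR(255),
        `date` INT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cursor.close()
conn.close()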
Writing to the database (Java): the five files below are line-aligned, meaning line i of each file describes the same paper, so the readers advance in lockstep and each record is inserted through paperMapper.
BufferedReader hrefBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestHref.json"));
BufferedReader authorBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestAuthor.json"));
BufferedReader articleBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestArticle.json"));
BufferedReader abstractBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestAbstract.json"));
BufferedReader yearBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestYear.json"));
String lineHref = null;
String lineAuthor = null;
String lineArticle = null;
// Initialize to null like the others: reading a line here would consume the
// first abstract and shift every abstract off by one record.
String lineAbstract = null;
String lineYear = null;
// Advance all five readers in lockstep, one paper per iteration.
while ((lineHref = hrefBufferedReader.readLine()) != null) {
    lineAbstract = abstractBufferedReader.readLine();
    lineAuthor = authorBufferedReader.readLine();
    lineArticle = articleBufferedReader.readLine();
    lineYear = yearBufferedReader.readLine();
    Paper paper = new Paper();
    paper.setHref(lineHref);
    paper.setAuthor(lineAuthor);
    paper.setArticle(lineArticle);
    paper.setPaperAbstract(lineAbstract);
    paper.setYear(lineYear);
    paperMapper.insert(paper);
}
// Release the file handles once every record has been inserted.
hrefBufferedReader.close();
authorBufferedReader.close();
articleBufferedReader.close();
abstractBufferedReader.close();
yearBufferedReader.close();

