Python Web Crawler Learning (Part 1)
Preface:
Things have calmed down a bit for me recently, so I'm using the spare time to study web crawling. I had some exposure to crawlers in the second semester of my sophomore year, but I never got fluent with them and didn't really understand the data-storage structures in Python either, so I'm starting over from the beginning.
Hands-on practice:
I studied by following 尚硅谷's crawler video course and learned how to use requests, XPath, BeautifulSoup, and so on. For the hands-on part I redid the crawler assignments from that sophomore semester, which deepened my understanding.
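To make the two parsing styles used below easier to follow, here is a minimal sketch (the URL and selectors are placeholders of my own, not part of the assignments): fetch a page with requests, then pull out the links once with BeautifulSoup's CSS selectors and once with lxml's XPath.

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Placeholder page; any static HTML page works the same way
html = requests.get("https://example.com").text

# Style 1: BeautifulSoup + CSS selectors
soup = BeautifulSoup(html, "lxml")
links = [a.get("href") for a in soup.select("a")]

# Style 2: lxml + XPath
tree = etree.HTML(html)
hrefs = tree.xpath("//a/@href")

print(links, hrefs)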
I. Crawling top-conference papers
import urllib.request
from bs4 import BeautifulSoup
import requests
from lxml import etree
import pymysql

url = "https://openaccess.thecvf.com/WACV2021"
url1 = "https://openaccess.thecvf.com"
response = urllib.request.urlopen(url)
# Fetch the page source
content = response.read().decode("utf-8")
# Create the BeautifulSoup object
soup = BeautifulSoup(content, 'lxml')
list_info = soup.find('dl')
list_url = list_info.find_all('dt')  # returns a list (not used further below)
# Every paper title sits in a dt.ptitle element; collect the detail-page links
list_title = soup.select('.ptitle a')
title_url = []  # links to every paper's detail page
for title in list_title:
    title_url.append(url1 + title.get('href'))
# All the detail-page links are in one list now; visit them one by one
db = pymysql.connect(host="localhost", user="root", password="156132", database="cloud1", charset="utf8mb4")
cursor = db.cursor()
for two_url in title_url:
    resp = requests.get(two_url)
    body = etree.HTML(resp.text)
    down_title = body.xpath('//*[@id="papertitle"]/text()')[0].strip()            # paper title
    down_author = body.xpath('//*[@id="authors"]/b/i/text()')[0].strip()          # authors
    down_abstract = body.xpath('//*[@id="abstract"]/text()')[0].strip()           # abstract
    down_pdf = url1 + body.xpath('//*[@id="content"]/dl/dd/a/@href')[0].strip()   # PDF link
    # Escape single quotes so the concatenated SQL below does not break
    down_abstract = down_abstract.replace("'", "\\'")
    print("Start crawling " + two_url)
    print(down_author)
    sql = "insert into lw(title,author,abstract,pdf) values ('" + str(down_title) + "','" + str(
        down_author) + "','" + down_abstract + "','" + str(down_pdf) + "')"
    try:
        cursor.execute(sql)
        print("Inserted successfully")
        db.commit()
    except pymysql.Error as e:
        print("Failed to insert data: " + str(e))
        db.rollback()
db.close()
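The INSERT above is built by string concatenation, which is why the abstract's single quotes have to be escaped by hand. A gentler alternative (just a sketch; the helper insert_paper is my own naming, while the lw table and its columns are the ones used above) is to let pymysql fill in the values as parameters:

import pymysql

def insert_paper(cursor, title, author, abstract, pdf):
    # %s placeholders let pymysql quote and escape every value itself,
    # so quotes inside a title or abstract no longer break the statement
    sql = "insert into lw(title, author, abstract, pdf) values (%s, %s, %s, %s)"
    cursor.execute(sql, (title, author, abstract, pdf))

db = pymysql.connect(host="localhost", user="root", password="156132",
                     database="cloud1", charset="utf8mb4")
cursor = db.cursor()
# Inside the crawl loop this call would replace the concatenated INSERT
insert_paper(cursor, down_title, down_author, down_abstract, down_pdf)
db.commit()
db.close()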
II. Crawling epidemic data
import json
import requests

url = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
response = requests.get(url)
# Parse the response: json.loads turns the JSON string into a Python dict
data = json.loads(response.content.decode())
# The 'data' field is itself a JSON string, so it has to be parsed a second time
data_str = data['data']
json_data = json.loads(data_str)
# The provinces live under areaTree -> children
list_data = json_data['areaTree'][0]['children']
lastUpdateTime = json_data['lastUpdateTime']
# Two nested loops: provinces first, then the cities inside each province
for areaTree in list_data:
    # Province-level figures
    province_name = areaTree['name']
    confirm_total = areaTree['total']['confirm']
    dead_total = areaTree['total']['dead']
    suspect_total = areaTree['total']['suspect']
    heal_total = areaTree['total']['heal']
    # City-level figures
    for city_info in areaTree['children']:
        city_name = city_info['name']
        confirm = city_info['total']['confirm']
        dead = city_info['total']['dead']
        heal = city_info['total']['heal']
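The loops above only collect the figures into local variables. A possible next step (just a sketch; the file name and column order are my own choice, and it assumes the interface still returns the same JSON layout) is to write one CSV row per city:

import csv
import json
import requests

url = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
json_data = json.loads(json.loads(requests.get(url).content.decode())['data'])

with open("epidemic_by_city.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["province", "city", "confirm", "dead", "heal", "last_update"])
    for province in json_data['areaTree'][0]['children']:
        for city in province['children']:
            writer.writerow([province['name'], city['name'],
                             city['total']['confirm'], city['total']['dead'],
                             city['total']['heal'], json_data['lastUpdateTime']])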
III. Crawling graduate-school information
import urllib.request
from bs4 import BeautifulSoup
import pymysql

def schoolinfo(url):
    response = urllib.request.urlopen(url)
    # Fetch the page source
    content = response.read().decode('utf-8')
    # Create the BeautifulSoup object
    soup = BeautifulSoup(content, 'lxml')
    # Grab every table holding the school listing
    info_list = soup.find_all('table', {'class': 'ch-table'})
    return info_list

def handinfo(table_list):
    stu_info = []  # one entry per school: the first three <td> cells of its row
    for tr_info in table_list:
        list1 = tr_info.find_all('tr')
        # Keep only rows that actually contain <td> cells
        for td_info in list1:
            list2 = td_info.find_all('td')
            if len(list2):
                stu_info.append(list2[0:3])
            else:
                continue
        # Debug output: print the affiliation of every school collected so far
        for info in stu_info:
            school_name = info[0].get_text().strip()
            school_shengfen = info[1].get_text()
            school_belong = info[2].get_text()
            print(school_belong)
    return stu_info

def insertintoDB(stu_info):
    db = pymysql.connect(host="localhost", user="root", password="156132", database="upload", charset="utf8mb4")
    cursor = db.cursor()
    for info in stu_info:
        school_shengfen = info[1].get_text()
        school_belong = info[2].get_text()
        # The name cell contains stray whitespace and line breaks; split and rejoin to clean it
        school = info[0].get_text().split()
        school_name = ','.join(str(i) for i in school)
        sql = "insert into stu_info(stu_name,stu_province,stu_belong) values ('" + str(school_name) + "','" + str(
            school_shengfen) + "','" + str(school_belong) + "')"
        try:
            cursor.execute(sql)
            db.commit()
        except pymysql.Error as e:
            print("Failed to insert: " + str(e))
            db.rollback()
    db.close()

# Each listing page shows 20 schools; 'start' is the offset of the page
index = 0
while index < 44:
    info_list = schoolinfo("https://yz.chsi.com.cn/sch/?start=" + str(index * 20))
    stu_info = handinfo(info_list)
    print('Crawling finished')
    print('Start inserting into the database')
    insertintoDB(stu_info)
    print('Done')
    print(index)
    index += 1
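insertintoDB reconnects to MySQL once for every page, 44 times in total. A small refactor sketch (the helper insert_schools and its parameters are my own naming, reusing the schoolinfo and handinfo functions above) opens one connection up front, reuses it for every page, and switches to a parameterized INSERT:

import pymysql

def insert_schools(db, stu_info):
    # Reuse one connection and cursor for every page instead of reconnecting each time
    cursor = db.cursor()
    sql = "insert into stu_info(stu_name, stu_province, stu_belong) values (%s, %s, %s)"
    for info in stu_info:
        name = ' '.join(info[0].get_text().split())  # collapse whitespace in the name cell
        province = info[1].get_text()
        belong = info[2].get_text()
        try:
            cursor.execute(sql, (name, province, belong))
            db.commit()
        except pymysql.Error as e:
            print("Failed to insert: " + str(e))
            db.rollback()

db = pymysql.connect(host="localhost", user="root", password="156132",
                     database="upload", charset="utf8mb4")
for index in range(44):
    stu_info = handinfo(schoolinfo("https://yz.chsi.com.cn/sch/?start=" + str(index * 20)))
    insert_schools(db, stu_info)
db.close()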

