'''
1. BeautifulSoup parsing library
2. MongoDB storage library
3. requests-html
'''
'''
1. What is bs4?
BeautifulSoup is a parsing library that wraps HTML/XML parsers (such as
html.parser and lxml) and provides powerful, convenient extraction tools,
improving both data-extraction and crawler-development efficiency.
'''
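# Setup note (assumption: nothing installed yet): bs4 and the lxml parser
# used below are third-party packages; if they are missing:
#   pip install beautifulsoup4 lxml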
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Import BeautifulSoup from bs4
from bs4 import BeautifulSoup
# Instantiate a soup object
# arg 1: the text to parse; arg 2: the parser ('html.parser' or 'lxml')
soup = BeautifulSoup(html_doc, 'lxml')  # first argument is the document, second is the parser
print(soup)
print('*'*100)
print(type(soup))
# Pretty-print the document
html_doc = soup.prettify()
print(html_doc)
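# If lxml is not available, the standard-library parser is a drop-in
# alternative (slower, but dependency-free); a minimal sketch:
soup_builtin = BeautifulSoup(html_doc, 'html.parser')
print(type(soup_builtin))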
html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>lyj</b><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p>"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
'''
1. Direct attribute access
2. Get a tag's name
3. Get a tag's attributes
4. Get a tag's text content
5. Nested selection
6. Children and descendants
7. Parent and ancestors
8. Siblings
'''
# # 1. Direct attribute access
# print(soup.p)  # find the first <p> tag
# print(soup.a)  # find the first <a> tag
#
# # 2. Get a tag's name
# print(soup.head.name)  # name of the <head> tag
#
# # 3. Get a tag's attributes
# print(soup.a.attrs)          # all attributes of the first <a> tag
# print(soup.a.attrs['href'])  # the href attribute of the first <a> tag
#
# # 4. Get a tag's text content
# print(soup.p.text)  # $37
#
# # 5. Nested selection
# print(soup.html.head)
#
# # 6. Children and descendants
# print(soup.body.children)        # all children of <body>, returned as an iterator
# print(list(soup.body.children))  # cast to a list
# print(soup.body.descendants)
# print(list(soup.body.descendants))
# # 7. Parent and ancestors
# print(soup.p.parent)        # the parent node of the first <p> tag
# print(soup.p.parents)       # all ancestors of the <p> tag, returned as an iterator
# print(list(soup.p.parents))
# # 8. Siblings
# # next sibling
# print(soup.p.next_sibling)
# # all following siblings, returned as a generator
# print(soup.p.next_siblings)
# print(list(soup.p.next_siblings))
# previous sibling
print(soup.a.previous_sibling)  # text is also a node, and counts as a sibling
# all siblings above the <a> tag
print(soup.a.previous_siblings)
print(list(soup.a.previous_siblings))  # returned as a generator
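# descendants mixes Tag nodes and text nodes; a minimal sketch of telling
# them apart while traversing (Tag and NavigableString are public bs4 classes):
from bs4 import Tag, NavigableString
for node in soup.body.descendants:
    if isinstance(node, Tag):
        print('tag :', node.name)
    elif isinstance(node, NavigableString):
        print('text:', repr(node)[:40])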
'''
find()      find one match
find_all()  find all matches
'''
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')
# # String filter
# p_tag = soup.find(name='p')
# print(p_tag)  # find a tag by the literal name 'p'
# p_tags = soup.find_all(name='p')
# print(p_tags)  # find all <p> tags
# # attrs
# # find the first node with class 'sister'
# p = soup.find(attrs={'class': 'sister'})
# print(p)
# # find all nodes with class 'sister'
# tag_s = soup.find_all(attrs={'class': 'sister'})
# print(tag_s)
# # text
# text = soup.find(text="$37")
# print(text)
# # Combined filters:
# # find the <a> tag with id 'link2' and text 'Lacie'
# a = soup.find(name='a', attrs={'id': "link2"}, text='Lacie')
# print(a)
# # Regex filter
# import re
# # name
# p_tag = soup.find(name=re.compile('p'))
# print(p_tag)
# # List filter
# import re
# tags = soup.find_all(name=['p', 'a', re.compile('html')])
# print(tags)
# Boolean filter
# True matches tags that have the attribute
# find the <p> tag that has an id
p = soup.find(name='p', attrs={"id": True})
print(p)
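# False works symmetrically: it matches tags that lack the attribute.
# A minimal sketch finding the one <a> tag that has no id:
a_no_id = soup.find(name='a', attrs={'id': False})
print(a_no_id)  # Elsie's link, the only <a> without an id attribute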
# # Function filter
# # match tags named 'a' that have both id and class attributes
# def have_id_class(tag):
#     if tag.name == 'a' and tag.has_attr('id') and tag.has_attr('class'):
#         return tag
#
# tag = soup.find(name=have_id_class)
# print(tag)
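# find_all() also accepts limit= and recursive= keyword arguments (both part
# of the public bs4 API); a short sketch:
first_two_a = soup.find_all(name='a', limit=2)   # stop after two matches
top_level = soup.html.find_all(recursive=False)  # direct children only
print(first_two_a)
print(top_level)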
'''
Index page:
    icon URL, download count, size, detail-page URL
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=wu90ydNj9Q4dxxHzRq5PvALC
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=wu90ydNj9Q4dxxHzRq5PvALC
32 pages in total
'''
import re
import requests
from bs4 import BeautifulSoup
# Send a request
def get_page(url):
    response = requests.get(url)
    return response
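# A slightly more defensive variant (the User-Agent string and the timeout
# are assumptions, not part of the original notes): many sites reject
# requests' default UA, and raise_for_status() fails fast on bad responses.
def get_page_safe(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return response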
# Parsing
# Parse the app detail page
def parse_detail(text):
    soup = BeautifulSoup(text, 'lxml')
    name = soup.find(name='span', attrs={'class': 'title'}).text
    print(name)
    love = soup.find(name='span', attrs={'class': 'love'}).text
    print(love)
    commit_num = soup.find(name='a', attrs={'class': 'comment-open'}).text
    print(commit_num)
    commit_content = soup.find(name='div', attrs={'class': 'con'}).text
    print(commit_content)
    download_url = soup.find(name='a', attrs={'class': 'normal-dl-btn'}).attrs['href']
    # print(download_url)
    print(
        f'''
        =========begin============
        app name: {name}
        rating: {love}
        comments: {commit_num}
        editor's review: {commit_content}
        download link: {download_url}
        ==========end===============
        '''
    )
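# Each find() above returns None when the tag is missing, so .text raises
# AttributeError on pages that lack a field. A hedged helper (hypothetical,
# not part of the original notes) for optional fields:
def safe_text(soup, name, attrs):
    tag = soup.find(name=name, attrs=attrs)
    return tag.text if tag else ''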
def parse_index(data):
    soup = BeautifulSoup(data, 'lxml')
    # Get every app's <li> tag
    app_list = soup.find_all(name='li', attrs={'class': 'card'})
    for app in app_list:
        # Icon URL
        # get the data-original attribute of the first <img> tag
        img = app.find(name='img').attrs['data-original']
        print(img)
        # Download count
        # get the text of the <span> tag with class 'install-count'
        down_num = app.find(name='span', attrs={'class': 'install-count'}).text
        print(down_num)
        # Size
        # match the <span> whose text contains digits + MB (\d+ matches digits);
        # search within the current app card, not the whole soup, so each card
        # yields its own size
        size = app.find(name='span', text=re.compile(r"\d+MB")).text
        print(size)
        # Detail-page URL
        # get the href attribute of the card's first <a> tag
        detail_url = app.find(name='a').attrs['href']
        print(detail_url)
        # 3. request the detail page
        response = get_page(detail_url)
        # 4. parse the app detail page
        parse_detail(response.text)
def main():
    for page in range(1, 33):
        url = f'https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={page}&ctoken=lAd3GvU1DbFpJzYVdADWw9pS'
        # 1. request the app-list API
        response = get_page(url)
        print('*' * 1000)
        # deserialize the response into a dict
        data = response.json()
        # the API returns the app cards as an HTML fragment
        app_li = data['data']['content']
        # print(app_li)
        # 2. parse the app card data
        parse_index(app_li)

if __name__ == '__main__':
    main()
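# Note: requesting 32 API pages back-to-back can trigger rate limiting; a
# hedged tweak is to pause between requests inside the loop in main(), e.g.:
#   import time
#   time.sleep(1)  # the 1-second delay is an assumption; adjust as needed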
'''
1. Download and install
   https://www.cnblogs.com/kermitjam/p/10863933.html#_label1
2. Create a data/db folder on the C drive
   - this is where MongoDB stores its data files
3. Start the server with mongod
   open a terminal and run mongod to start the MongoDB service
4. Enter the MongoDB client with mongo
   open a new terminal and run mongo to enter the client

Database operations:
    Switch database
        SQL:
            use admin;   -- switches if the database exists, errors otherwise
        MongoDB:
            use tank     // switches if it exists, otherwise creates it and switches to it
    List databases
        SQL:
            show databases;
        MongoDB:
            show dbs
            // databases that contain no data are not listed
    Drop a database
        SQL:
            drop database <name>;
        MongoDB:
            db.dropDatabase()

Collection operations (a collection is what MySQL calls a table):
    SQL:
        create table t (f1, f2);
    MongoDB:
        // in the current database, collections are addressed with dot
        // notation and created implicitly on first insert
        db.student

Insert data:
    // insert several documents
    db.student.insert([{"name1": "lyj"}, {"name2": "zmm"}])
    // insert one document
    db.student.insert({"name": "lyj"})

Query data:
    // find all documents in the student collection
    db.student.find({})
    db.student.find({"name": "lyj"})
'''
from pymongo import MongoClient
# 1. Connect to the MongoDB server
# arg 1: MongoDB host address
# arg 2: MongoDB port (default: 27017)
client = MongoClient('localhost', 27017)
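# The same connection can also be written as a Mongo URI; an equivalent form:
# client = MongoClient('mongodb://localhost:27017')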
# print(client)
#
# # 2. Enter the lyj_db database (created on first write if missing)
# print(client['lyj_db'])
#
# # 3. Get the collection
# print(client['lyj_db']['people'])
# 4. Insert documents into lyj_db
# a. insert several documents at once
data1 = {
    'name': 'lyj',
    'age': '21',
    'sex': 'female'
}
data2 = {
    'name': 'zmm',
    'age': '20',
    'sex': 'female'
}
data3 = {
    'name': 'zcj',
    'age': '21',
    'sex': 'female'
}
# Collection.insert() is deprecated and was removed in PyMongo 4;
# insert_many() is the supported equivalent
client['lyj_db']['people'].insert_many([data1, data2, data3])
# 5. Query data
# find all documents
data_s = client['lyj_db']['people'].find()
print(data_s)  # a Cursor object
# iterate the cursor to print every document
for data in data_s:
    print(data)
# find a single document
data = client['lyj_db']['people'].find_one()
print(data)
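# Filtered queries take the same document-style filter shown in the mongo
# shell notes above; a minimal sketch:
for person in client['lyj_db']['people'].find({'name': 'lyj'}):
    print(person)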
# The officially recommended insert methods:
# insert one document with insert_one()
client['lyj_db']['people'].insert_one({'name': 'lyj', 'age': '21', 'sex': 'female'})
# insert several documents with insert_many()
client['lyj_db']['people'].insert_many([{'name': 'zmm'}, {'name': 'zcj'}])
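# A minimal sketch tying the two halves of these notes together: storing one
# scraped app record from the wandoujia crawler into MongoDB. The field names
# mirror parse_detail() above; the 'app' collection name and the placeholder
# values are assumptions for illustration only.
app_record = {
    'name': 'demo-app',
    'love': '90%',
    'commit_num': '100',
    'download_url': 'https://example.com/app.apk',
}
client['lyj_db']['app'].insert_one(app_record)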