常用对象
一、爬虫过程
import re
import requests

# Send a request and inspect the main attributes of the response object.
url = "http://www.buu.edu.cn"
resp = requests.get(url)

print(resp.status_code)        # HTTP status code
print(resp.url)                # final URL after redirects
print(resp.headers)            # response headers
print(resp.request)            # the PreparedRequest that was sent
print(resp.content)            # raw body bytes
print(resp.encoding)           # ISO-8859-1 (taken from the headers)
print(resp.apparent_encoding)  # utf-8 (detected from the body)

# Use the detected encoding so resp.text decodes the page correctly.
resp.encoding = resp.apparent_encoding
def getHtml(url):
    """Fetch *url* and return the decoded page text.

    Returns None when the request fails (network error or HTTP error
    status).  The original version used a bare ``except:`` and then
    unconditionally executed ``return resp.text`` after the handler,
    which raises UnboundLocalError when ``requests.get`` itself fails.
    """
    try:
        resp = requests.get(url)
        resp.raise_for_status()                 # raise on 4xx/5xx status
        resp.encoding = resp.apparent_encoding  # decode with the detected charset
    except requests.RequestException:
        print("error")
        return None
    return resp.text  # page text, only reached on success
26
# Fetch the page through the helper and show the result.
html = getHtml(url)
print(html)
二、封装请求
import requests

# Without a browser-like User-Agent, the default "python-requests" client
# is often blocked or served a stripped page (douban does this).
url = "http://www.douban.com"
resp = requests.get(url)

# Inspect what was actually sent/received with the default headers.
print(resp.text)
print(resp.request.headers)

# Build a request with a browser User-Agent.
# NOTE: fixed the original typo "likeGecko" -> "like Gecko"; the token
# "(KHTML, like Gecko)" is part of the standard UA grammar.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/92.0.4515.131 Safari/537.36",
}

# Resend the request with the custom header.
resp = requests.get(url, headers=header)
print(resp.text)  # full page source
三、BS解析
from bs4 import BeautifulSoup  # HTML parsing
import re

# Parse the local file.  Fixes from the original:
#  - use a context manager so the file handle is closed (it was leaked),
#  - name the parser explicitly so bs4 does not emit GuessedAtParserWarning
#    and parsing is not platform-dependent.
with open("index.html", encoding="utf-8") as fh:
    soup = BeautifulSoup(fh, "html.parser")

# --- find_all() with different matcher types ---

# Plain string: match by tag name.
for tag in soup.find_all("p"):
    print(tag.name)

# Compiled regular expression: tag names starting with "b".
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

# Tag names containing "e".
for tag in soup.find_all(re.compile("e")):
    print(tag.name)

# List of names: matches any of them.
ls = ["a", "p"]
for tag in soup.find_all(ls):
    print(tag)  # prints the whole tag (markup + content), not just the name

# True: matches every tag in the document.
for tag in soup.find_all(True):
    print(tag.name)


# --- find_all() keyword filters ---

# By tag name.
res = soup.find_all("p")
print(res)

# By attribute value (keyword form).
res = soup.find_all(border="1")
print(res)

# attrs dict form — preferred when filtering on several attributes.
res = soup.find_all(attrs={"border": "1"})
print(res)

# CSS class (class is a Python keyword, hence the trailing underscore).
res = soup.find_all(class_="me")
print(res)

# By exact tag text.
res = soup.find_all(text="大数据导论")
print(res)

# Combined: tag name plus attribute.
res = soup.find_all("td", border="1")
print(res)

# Text matched by regex.
res = soup.find_all(text=re.compile("数据"))
print(res)

# Attribute matched by regex: hrefs beginning with "http:".
res = soup.find_all(href=re.compile("^http:"))
print(res)
四、选中元素
import re
from bs4 import BeautifulSoup

# Close the file handle (the original leaked it) and pin the parser so
# bs4 does not warn or pick a platform-dependent one.
with open("index.html", encoding="utf-8") as fh:
    soup = BeautifulSoup(fh, "html.parser")

# --- select(): query elements with CSS selectors ---

# Single tag name.
res = soup.select("a")
print(res)

# Descendant combinator: <a> anywhere inside a <p>.
res = soup.select("p a")
print(res)

res = soup.select("body table th")
print(res)

# Attribute presence: <a> tags that have an href at all.
res = soup.select("a[href]")
print(res)

# Exact attribute value.
res = soup.select("a[href='http://www.buu.edu.cn']")
print(res)

# Substring match: href contains "www".
res = soup.select('a[href*="www"]')
print(res)

# Exact match again — only hits hrefs that are literally "http://".
res = soup.select("a[href='http://']")
print(res)

# Suffix match: href ends with ".cn".
res = soup.select("a[href$='.cn']")
print(res)

# --- standard CSS selectors ---

# id selector.
res = soup.select("#t1")
print(res)

# class selector.
res = soup.select(".me")
print(res)

# Several selectors at once (comma = union of results).
res = soup.select("#t1,.me")
print(res)
五、获取内容
from bs4 import BeautifulSoup

# Close the file handle (the original leaked it) and name the parser
# explicitly to avoid GuessedAtParserWarning.
with open("index.html", encoding="utf-8") as fh:
    soup = BeautifulSoup(fh, "html.parser")

# prettify is a method — the original printed the bound method object
# instead of calling it.  Call it to get the formatted page text.
print(soup.prettify())

# Attribute-style access returns the first matching tag.
print(soup.a)
print(soup.table)
print(soup.table.name)

# Tag attributes behave like a dict.
tattrs = soup.table.attrs
print(tattrs)
print(tattrs["border"])  # pull out a single attribute value

print(type(soup.a))   # bs4.element.Tag

print(soup.a.string)  # text content of the first hyperlink

print(soup.contents)  # top-level child nodes of the document

print(soup.table.tr)  # first row of the first table
六、遍历树
from bs4 import BeautifulSoup

# Close the file handle (the original leaked it) and name the parser
# explicitly to avoid GuessedAtParserWarning.
with open("index.html", encoding="utf-8") as fh:
    soup = BeautifulSoup(fh, "html.parser")

# --- walking the parse tree ---

# Direct parent of the first <table>.
print(soup.table.parent)

# All ancestors, from the immediate parent up to the document root.
tags = soup.table.parents
for tag in tags:
    print(tag.name)

# .children and .descendants are lazy iterators: printing them shows the
# iterator object itself, not the nodes.  Iterate to see the contents.
print(soup.table.children)     # direct children

print(soup.table.descendants)  # all nested descendants