Python爬虫基础

常用对象

一、爬虫过程

 1 import re
 2 import requests
 3 
 4 # 发送请求,查看响应的各个属性
 5 url = "http://www.buu.edu.cn"
 6 resp = requests.get(url)     
 7 
 8 print(resp.status_code)    # 响应码 
 9 print(resp.url)
10 print(resp.headers)
11 print(resp.request)
12 print(resp.content)
13 print(resp.encoding)   # ISO-8859-1
14 print(resp.apparent_encoding)   # utf-8
15 
16 resp.encoding = resp.apparent_encoding
17 
def getHtml(url):
    """Fetch *url* and return the decoded page text, or '' on failure.

    Network and HTTP errors are reported and swallowed so the demo
    script can continue running.
    """
    try:
        resp = requests.get(url, timeout=10)      # bounded wait instead of hanging forever
        resp.raise_for_status()                   # raise on 4xx / 5xx status codes
        resp.encoding = resp.apparent_encoding    # decode with the detected charset
        return resp.text
    except requests.RequestException as exc:      # narrow catch: network/HTTP errors only
        # The original used a bare `except:` and then fell through to
        # `return resp.text`, which crashed with UnboundLocalError when
        # the request itself failed. Return an empty page instead.
        print(f"error: {exc}")
        return ""

html = getHtml(url)
print(html)

二、封装请求

 1 import requests
 2 
 3 # 模拟浏览器和定制请求
 4 url = "http://www.douban.com"
 5 resp = requests.get(url)
 6 
 7 # 查看属性
 8 print(resp.text)
 9 print(resp.request.headers)
10 
11 # 封装请求
12 header = {
13     "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, likeGecko) Chrome/92.0.4515.131 Safari/537.36"
14 }
15 
16 # 发送请求
17 resp = requests.get(url,headers=header)
18 print(resp.text)   # 网页源代码

 

三、BS解析

 1 from bs4 import BeautifulSoup # 解析网页
 2 import re
 3 
 4 # 创建一个BeautifulSoup对象,参数: 打开的要解析的文件
 5 soup = BeautifulSoup(open("index.html",encoding='utf-8'))
 6 
 7 # 字符串
 8 for tag in soup.find_all("p"):
 9     print(tag.name)
10 
11 # 正则表达式 ,compile编写
12 for tag in soup.find_all(re.compile("^b")):   # 以b开头
13     print(tag.name)
14 
15 for tag in soup.find_all(re.compile("e")):     # 含有e
16     print(tag.name)
17 
18 # 列表,["",""]
19 ls=["a","p"]
20 for tag in soup.find_all(ls):
21     print(tag)                    # 显示的是内容+标签
22 
23 # True 全部的标签
24 for tag in soup.find_all(True):
25     print(tag.name) 
26 
27 
28 # find_all()方法
29 # 标签名查找
30 res = soup.find_all("p") 
31 print(res)
32 
33 # 标签属性
34 res = soup.find_all(border="1")
35 print(res)
36 
37 res = soup.find_all(attrs={"border":"1"})    # 多个参数时,常用{"":""}
38 print(res)
39 
40 # CSS选择器
41 res = soup.find_all(class_="me")
42 print(res)
43 
44 # 标签内容
45 res = soup.find_all(text="大数据导论")
46 print(res)
47 
48 # 组合查找
49 res = soup.find_all("td",border="1")   # 指明
50 print(res)
51 
52 res = soup.find_all(text=re.compile("数据"))   # 赋值内容 
53 print(res)
54 
55 res = soup.find_all(href=re.compile("^http:"))  # 查找链接
56 print(res)

 

四、选中元素

 1 import re
 2 from bs4 import BeautifulSoup
 3 
 4 soup = BeautifulSoup(open("index.html",encoding="utf-8"))
 5 
 6 '''
 7 select() 方法
 8 
 9 '''
10 # 单个标签名查找
11 res = soup.select("a")
12 print(res)
13 
14 # 多个标签层层查找
15 res = soup.select("p a")
16 print(res)
17 
18 res = soup.select("body table th")
19 print(res)
20 
21 # 按属性查找
22 res = soup.select("a[href]")
23 print(res)
24 
25 
26 # 增加上属性值
27 res = soup.select("a[href='http://www.buu.edu.cn']")
28 print(res)
29 
30 res = soup.select('a[href*="www"]')  # 含有www
31 print(res)
32 
33 res = soup.select("a[href='http://']")
34 print(res)
35 
36 res = soup.select("a[href$='.cn']")
37 print(res)
38 
39 
40 # CSS选择器
41 # id选择器
42 res = soup.select("#t1")
43 print(res)
44 
45 # 类名选择器
46 res = soup.select(".me")
47 print(res)
48 
49 # 同时用多种CSS选择器查询元素
50 res = soup.select("#t1,.me")
51 print(res)

 

五、获取内容

 1 from bs4 import BeautifulSoup
 2 
 3 soup = BeautifulSoup(open("index.html",encoding="utf-8"))
 4 
 5 print(soup.prettify)   # 页面内容
 6 
 7 print(soup.a)
 8 print(soup.table)
 9 print(soup.table.name)
10 
11 tattrs = soup.table.attrs    # 属性
12 
13 print(tattrs)
14 print(tattrs["border"]) # 取出其中的一个属性
15 
16 print(type(soup.a))    # 查看类型
17 
18 print(soup.a.string)    # 查看超链接的内容
19 
20 print(soup.contents)    # 源代码
21 
22 print(soup.table.tr)

 

六、遍历树

 1 from bs4 import BeautifulSoup
 2 
 3 soup = BeautifulSoup(open("index.html",encoding="utf-8"))
 4 
 5 # 文档树的遍历
 6 print(soup.table.parent) # 源代码
 7 
 8 tags = soup.table.parents  # 所有的父元素
 9 for tag in tags:
10     print(tag.name)
11 
12 # 
13 print(soup.table.children) # 子元素
14 
15 
16 # 
17 print(soup.table.descendants) # 后代

 

posted @ 2022-07-15 19:20  暖阳的雪  阅读(24)  评论(0)    收藏  举报