Python 3 网络爬虫开发实战 (2nd Edition), Notes 1: Using the Basic Libraries
The most basic HTTP libraries are urllib, requests, and httpx.
urllib is part of the Python standard library, so it needs no extra installation. It contains four modules: request, error, parse, and robotparser.
# First crawler test: request_test.py
# urllib is built into Python; no pip install is needed (urllib3 is a different, third-party package)
import urllib.request as rs
# urlopen() opens a web page and returns a response object
response = rs.urlopen('https://www.ys-gd.com')
print(type(response))
print(response.status)
print(response.getheaders())
print(response.getheader('Server'))
print(response.read().decode('utf-8'))
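The error and parse modules appear later in these notes; robotparser, which reads a site's robots.txt rules, does not, so here is a minimal sketch (whether www.ys-gd.com actually serves a robots.txt is an assumption):

from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
# Point the parser at the site's robots.txt and download it
rp.set_url('https://www.ys-gd.com/robots.txt')
rp.read()
# can_fetch(useragent, url) reports whether the rules allow the fetch
print(rp.can_fetch('*', 'https://www.ys-gd.com/'))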
urllib.parse is a utility module that provides many URL-handling methods, such as splitting, parsing, and joining.
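A minimal sketch of the splitting and joining helpers, before the urlencode example below:

from urllib.parse import urlparse, urljoin

# urlparse splits a URL into its components
result = urlparse('https://www.httpbin.org/post?name=samtang')
print(result.scheme, result.netloc, result.path, result.query)
# urljoin merges a base URL with a relative link
print(urljoin('https://www.httpbin.org/post', '/get'))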
import urllib.request as rs
import urllib.parse
# Pass the parameter name with value samtang; it must be converted to the bytes type. encoding specifies the character encoding
data = bytes(urllib.parse.urlencode({'name': 'samtang'}), encoding='utf-8')
# POST the data to the page, with a timeout of 1 second
response = rs.urlopen('https://www.httpbin.org/post', data=data, timeout=1)
# response = rs.urlopen('https://www.ys-gd.com', data=data)
print(response.read().decode('utf-8'))
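If the timeout expires before the server responds, urlopen raises URLError with a socket timeout as its reason; a minimal sketch (the deliberately tiny 0.01-second timeout is only there to force the error):

import socket
import urllib.request as rs
from urllib.error import URLError

try:
    # 0.01 s is far too short on purpose, so the request times out
    rs.urlopen('https://www.httpbin.org/get', timeout=0.01)
except URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')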
Setting request headers: modifying the User-Agent disguises the crawler as a browser, which helps when pages reject the default urllib client.
import urllib.request as rs
import urllib.parse
url = "https://www.httpbin.org/post"
# 设置请求头
headers = {
'User-Agent':"Mozilla/4.0(compatible; MSIE 5.5 ; Windows NT",
'Host': "www.httpbin.org"
}
# Set the form parameters
params = {'name': 'samtang'}
data = bytes(urllib.parse.urlencode(params), encoding='utf-8')
# Use Request to bundle the URL, form data, headers, and method
req = rs.Request(url=url, data=data, headers=headers, method='POST')
# Send the request and open the response
response = rs.urlopen(req)
# Read out the response body
print(response.read().decode('utf-8'))
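Headers can also be attached after the Request object is built, using its add_header() method; a minimal self-contained sketch:

import urllib.request as rs
import urllib.parse

url = "https://www.httpbin.org/post"
data = bytes(urllib.parse.urlencode({'name': 'samtang'}), encoding='utf-8')
req = rs.Request(url=url, data=data, method='POST')
# add_header(key, value) sets one header at a time
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
response = rs.urlopen(req)
print(response.status)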
How to crawl a page that requires user authentication (HTTP Basic Auth)
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.error import URLError

url = "https://ssr3.scrape.center/"
username = 'admin'
password = 'admin'
# Register the username and password for the target URL
passget = HTTPPasswordMgrWithDefaultRealm()
passget.add_password(None, url, username, password)
auth_handle = HTTPBasicAuthHandler(passget)
opener = build_opener(auth_handle)
try:
    result = opener.open(url)
    html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)
Using requests: first install it with pip install requests.
The requests counterpart of urllib's urlopen() is the get() method.
import requests
r = requests.get("https://www.baidu.com")
print(type(r.status_code), r.status_code)
print(type(r.headers), r.headers)
print(type(r.cookies), r.cookies)
print(type(r.url), r.url)
print("\nPage source:\n")
print(r.text)
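With requests there is no need to urlencode query parameters by hand; the params argument builds the query string. A minimal sketch against httpbin:

import requests

# requests turns the dict into ?name=samtang automatically
r = requests.get('https://www.httpbin.org/get', params={'name': 'samtang'})
print(r.url)
# httpbin echoes the request back, and r.json() parses the JSON body
print(r.json())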
res = requests.get("https://ssr3.scrape.center/", auth=('admin', 'admin'))
print(res.status_code)
print(res.text)
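For parity with the urllib POST example earlier, the same request via requests.post(), a minimal sketch:

import requests

# data takes a plain dict; requests form-encodes it for us
r = requests.post('https://www.httpbin.org/post', data={'name': 'samtang'})
print(r.status_code)
print(r.text)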
pyquery is an HTML parsing library built mainly around CSS selectors.
import requests
from pyquery import PyQuery as pq

# Fetching the page directly returned a 403 error:
# rs = requests.get("https://www.ys-gd.com")
# Adding a request header fixes it:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
reqs = requests.get("https://www.ys-gd.com", headers=headers)
ysgd_html = reqs.text
# Import the PyQuery object as pq, then pass the HTML (ysgd_html) to pq to complete initialization
doc = pq(ysgd_html)
print(doc('ul'))
# Basic CSS selectors
# Select all li nodes inside the node with class navigation, itself inside
# the node with id nav_layer84DBE7033E187994D4E81BFEDA90E944
print(doc("#nav_layer84DBE7033E187994D4E81BFEDA90E944 .navigation li"))
# Finding nodes
# Select the node whose class is navigation
item = doc(".navigation")
print(type(item))
print(item)
# find() searches all descendants for li nodes
lis = item.find("li")
print(type(lis))
print(lis)
# children() returns only direct children
lis = item.children()
print(type(lis))
print(lis)
# Among the direct children, select the node with class wp_subtop
lis = item.children(".wp_subtop")
print(type(lis))
print(lis)
# Parent node (parent())
item = doc(".html5zoo-slides")
# print(item)
cont = item.parent()
print(cont)
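To loop over every matched node and pull out text or attributes, pyquery provides items(), text(), and attr(); a minimal sketch on the doc object above (that the li nodes contain a links is an assumption about this page):

# items() yields each matched node as its own PyQuery object
for li in doc('li').items():
    a = li.find('a')
    # attr() reads an attribute of the first match, text() its inner text
    print(a.attr('href'), a.text())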
BeautifulSoup is an HTML parsing library that extracts elements from a page without writing complicated regular expressions.
BeautifulSoup relies on a third-party parser; "lxml" is the recommended one.
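Different parsers repair broken markup differently, which is why the choice matters; a minimal sketch comparing lxml with the built-in html.parser (exact output can vary with parser versions):

from bs4 import BeautifulSoup

broken = '<p>Hello'
# lxml wraps the fragment in html/body tags and closes the p tag
print(BeautifulSoup(broken, 'lxml'))
# html.parser only closes the p tag
print(BeautifulSoup(broken, 'html.parser'))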
import requests
from bs4 import BeautifulSoup

# Fetch the page HTML
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
ysgd_text = requests.get("https://www.ys-gd.com", headers=headers).content.decode()
# Alternatively, read the HTML from a local file (test.html):
# from lxml import etree
# ysgd_html = etree.parse('./test.html', etree.HTMLParser())
# ysgd_text = etree.tostring(ysgd_html)
soup = BeautifulSoup(ysgd_text, 'lxml')
# prettify() outputs the parsed document with standard indentation
print(soup.prettify())
print("soup.head")
print(soup.head)
print("soup.head.title")
print(soup.head.title)
print(soup.head.meta)
print("soup.title.name")
print(soup.title.name)
# Get attributes
print(soup.p.attrs)
print(soup.p.attrs['style'])
# Get the text content
print(soup.p.string)
# Method selectors: find_all / find
for meta in soup.find_all("meta"):
print(meta)
# CSS selectors
for li in soup.select('li'):
    # select() can be called again on each matched element
    print(li.select("a"))
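Each element returned by select() supports attrs and get_text() for pulling out attributes and text; a minimal sketch on the soup object above:

for a in soup.select('li a'):
    # attrs is a dict of the tag's attributes; get_text() returns the inner text
    print(a.attrs.get('href'), a.get_text())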