Python3 网络爬虫开发实战 2 — Notes 1 (Using the Basic Libraries)

The most basic HTTP libraries are urllib, requests, httpx, and the like.

urllib is a Python built-in library: no extra installation is required and it can be used directly. It contains four modules: request, error, parse, and robotparser.

# First crawler test: request_test.py
# urllib is part of the standard library, so no pip install is needed
# (urllib3 is a different, third-party package)
import urllib.request as rs

# urlopen opens a web page and returns a response object
response = rs.urlopen('https://www.ys-gd.com')
print(type(response))
print(response.status)
print(response.getheaders())
print(response.getheader('Server'))
print(response.read().decode('utf-8'))
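
Of the four modules, these notes mainly exercise request and parse. As a minimal sketch of robotparser (assuming the site actually serves a robots.txt at the standard path), it can check whether a crawler is allowed to fetch a URL:

from urllib.robotparser import RobotFileParser

# Parse the site's robots.txt (assumed to exist; a missing file allows everything)
rp = RobotFileParser()
rp.set_url('https://www.ys-gd.com/robots.txt')
rp.read()
# Ask whether any user agent ('*') may fetch the homepage
print(rp.can_fetch('*', 'https://www.ys-gd.com/'))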

The parse module of urllib is a utility module that provides many URL-handling methods, such as splitting, parsing, and joining.

import urllib.request as rs
import urllib.parse

# Pass the parameter name=samtang; it must be converted to bytes first.
# encoding specifies the character encoding to use.
data = bytes(urllib.parse.urlencode({'name': 'samtang'}), encoding='utf-8')
# POST the data to the page, with a 1-second timeout
response = rs.urlopen('https://www.httpbin.org/post', data=data, timeout=1)
print(response.read().decode('utf-8'))
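
Besides urlencode, the parse module also handles the splitting, parsing, and joining mentioned above; a quick sketch with urlparse, urlunparse, and urljoin:

from urllib.parse import urlparse, urlunparse, urljoin

# Split a URL into its six components
result = urlparse('https://www.httpbin.org/post?name=samtang')
print(result.scheme, result.netloc, result.path, result.query)
# Reassemble a URL from a 6-tuple: (scheme, netloc, path, params, query, fragment)
print(urlunparse(('https', 'www.httpbin.org', '/post', '', 'name=samtang', '')))
# Resolve a relative link against a base URL
print(urljoin('https://www.httpbin.org/post', '/get'))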

Setting request headers: modifying the User-Agent disguises the crawler as a browser, which is often necessary when scraping pages.

import urllib.request as rs
import urllib.parse

url = "https://www.httpbin.org/post"
# Set the request headers
headers = {
    'User-Agent': "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
    'Host': "www.httpbin.org"
}
# Form parameters
params = {'name': 'samtang'}
data = bytes(urllib.parse.urlencode(params), encoding='utf-8')
# Use Request to bundle the URL, data, headers, and method together
req = rs.Request(url=url, data=data, headers=headers, method='POST')
# Send the request
response = rs.urlopen(req)
# Read and decode the response body
print(response.read().decode('utf-8'))

What if the page requires user authentication before it can be crawled? urllib handles HTTP basic auth through handlers:

from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.error import URLError

url = "https://ssr3.scrape.center/"
username = 'admin'
password = 'admin'

# Supply the username and password for HTTP basic authentication
passget = HTTPPasswordMgrWithDefaultRealm()
passget.add_password(None, url, username, password)
auth_handle = HTTPBasicAuthHandler(passget)
opener = build_opener(auth_handle)
try:
    result = opener.open(url)
    html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)

Using requests: first install it with pip install requests.

The counterpart of urllib's urlopen() in requests is the get() method.

import requests

rs = requests.get("https://www.baidu.com")
print(type(rs.status_code), rs.status_code)
print(type(rs.headers), rs.headers)
print(type(rs.cookies), rs.cookies)
print(type(rs.url), rs.url)
print("\nPage source:\n")
print(rs.text)

# In requests, basic auth is just the auth parameter
res = requests.get("https://ssr3.scrape.center/", auth=('admin', 'admin'))
print(res.status_code)
print(res.text)
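
The POST example from the urllib section can be written the same way with requests; a minimal sketch against the same httpbin endpoint (the header value here is just a placeholder):

import requests

# requests encodes the dict as form data itself, so no manual bytes conversion is needed
r = requests.post('https://www.httpbin.org/post',
                  data={'name': 'samtang'},
                  headers={'User-Agent': 'Mozilla/5.0'},
                  timeout=1)
print(r.status_code)
print(r.json())  # httpbin echoes the form data back as JSON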

  

pyquery is an HTML parsing library built mainly around CSS selectors (install with pip install pyquery).

import requests
from pyquery import PyQuery as pq

# Fetching the page directly returns a 403 error:
# rs = requests.get("https://www.ys-gd.com")

# Adding request headers fixes it:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
reqs = requests.get("https://www.ys-gd.com", headers=headers)
ysgd_html = reqs.text
# Pass the HTML string (ysgd_html) to pq to initialize the PyQuery document
doc = pq(ysgd_html)
print(doc('ul'))


# Basic CSS selectors
# Select all li nodes inside a .navigation node inside the node with
# id="nav_layer84DBE7033E187994D4E81BFEDA90E944"
print(doc("#nav_layer84DBE7033E187994D4E81BFEDA90E944 .navigation li"))

# Finding nodes
# Select the node with class navigation
item = doc(".navigation")
print(type(item))
print(item)

# find("li") searches all descendants for li nodes
lis = item.find("li")
print(type(lis))
print(lis)
# children() returns only the direct children
lis = item.children()
print(type(lis))
print(lis)

# Among the direct children, select nodes with class wp_subtop
lis = item.children(".wp_subtop")
print(type(lis))
print(lis)

# Parent node: parent()
item = doc(".html5zoo-slides")
# print(item)
cont = item.parent()
print(cont)
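
Beyond traversal, pyquery can also extract text and attributes from the matched nodes; a short sketch (the selector assumes the same navigation structure as above):

# items() yields each matched node as its own PyQuery object
for a in doc('.navigation li a').items():
    print(a.attr('href'), a.text())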

 

BeautifulSoup is an HTML parsing library: it lets you extract a given element from a page without writing complex regular expressions.
BeautifulSoup depends on a third-party parser; "lxml" is recommended.

import requests
from bs4 import BeautifulSoup
# from lxml import etree  # only needed for the local-file alternative below

# Fetch the page HTML
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
ysgd_text = requests.get("https://www.ys-gd.com", headers=headers).content.decode()

# Alternative: load the HTML from a local file (test.html) via lxml.
# Note that this overwrites ysgd_text above, so enable only one of the two:
# ysgd_html = etree.parse('./test.html', etree.HTMLParser())
# ysgd_text = etree.tostring(ysgd_html)

soup = BeautifulSoup(ysgd_text, 'lxml')
# prettify() outputs the parsed document with standard indentation
print(soup.prettify())
print("soup.head")
print(soup.head)
print("soup.head.title")
print(soup.head.title)
print(soup.head.meta)
print("soup.title.name")
print(soup.title.name)

# 获得属性
print(soup.p.attrs)
print(soup.p.attrs['style'])

# 获得内容
print(soup.p.string)

# 方法选择器 find_all / find
for meta in soup.find_all("meta"):
print(meta)

# CSS选择器
for ul in soup.select('li'):
# print("li.select.a")
print(ul.select("a"))
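
find_all can also filter by attributes, and get_text() returns the text with the tags stripped; a minimal sketch on the same soup:

# Only match <a> tags that actually carry an href attribute
for a in soup.find_all('a', href=True):
    print(a['href'], a.get_text(strip=True))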

 



