Web Crawlers, Part 1
1. Sending GET and POST requests with the requests library.
"""使用requests库,发送请求""" import requests from fake_useragent import UserAgent # GET 请求 def requests_get(headers): url = "https://www.baidu.com/s" params = { "wd": "快代理" } r = requests.get(url, headers=headers, params=params) print('---响应值为: ', r.status_code, '---编码为: ', r.encoding) r.encoding = r.apparent_encoding print(r.request.headers) print(r.text) # POST 请求 def requests_post(headers): url = "https://www.kuaidaili.com/login/" login_info = { "username": "...", "passwd": "..." } r = requests.post(url, headers=headers, data=login_info) print('---响应值为: ', r.status_code, '---编码为: ', r.encoding) print(r.text) def main(): headers = { 'User-Agent': UserAgent().chrome } # 发送get请求 requests_get(headers) # 发送post请求 requests_post(headers) if __name__ == '__main__': main()
A simple example: scraping the image data from one page of pic.netbian.com.
"""爬取 唯美女生 网站图片""" import requests import re import os import time # 1. 请求网页 myheaders = {'User-Agent': 'Mozilla/5.0'} # url = "http://pic.netbian.com" url = 'http://pic.netbian.com/4kmeinv' response = requests.get(url, headers=myheaders) # 2. 处理响应数据, 正则匹配 html = response.text img_urls = re.findall('<img src="(.*?)" alt=".*?">', html) print(img_urls) # 3. 下载图片 if not os.path.exists('彼岸图片'): os.mkdir('彼岸图片') for img_url in img_urls: time.sleep(1) img_name = img_url.split('/')[-1] response = requests.get((url + img_url), headers=myheaders) with open('彼岸图片/' + img_name, 'wb') as f: f.write(response.content)
2. Sending GET and POST requests with urllib. When query parameters have to be built by hand, quote and urlencode from urllib.parse percent-encode Chinese text, as the short snippet below shows.
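A quick look at what the two encoders produce (expected output in the comments):

from urllib.parse import quote, urlencode

# quote percent-encodes a single value (UTF-8 by default)
print(quote("科技"))                             # %E7%A7%91%E6%8A%80

# urlencode builds a whole query string from a dict
print(urlencode({'wd': '科技', 'ie': 'utf-8'}))  # wd=%E7%A7%91%E6%8A%80&ie=utf-8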
"""使用 urllib 创建爬虫""" from urllib.request import urlopen from urllib.request import Request # 包装爬虫 url = 'https://www.baidu.com' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36' } # 创建Resquest对象,来包装请求 request = Request(url, headers=headers, method='GET', data=None) # 发送请求 response = urlopen(request) # 打印信息 print('状态码:', response.getcode(), '真实请求地址:', response.geturl(), '状态码:', response.status, '请求头:', response.getheaders()) info = response.read().decode() print(info)
a. GET requests
"""GET 请求,当需要手动传参,且参数为中文时,需要将参数转码 单个参数时可以使用:quote,转码 多个参数时:urlencode, 转码 """ from urllib.request import urlopen, Request from urllib.parse import quote # 单个参数时可以使用:quote,转码 # url = "https://www.baidu.com/s?wd={}".format(quote("科技")) headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36" } # request = Request(url, headers=headers) # response = urlopen(request, timeout=3) # info = response.read().decode() # print(info) # 多个参数时:urlencode, 转码 from urllib.parse import urlencode args = { 'wd': "科技", 'ie': 'utf-8' # ... } url = 'https://www.baidu.com/s?{}'.format(urlencode(args)) print(url) request2 = Request(url, headers=headers) response2 = urlopen(request2, timeout=3) info2 = response2.read().decode() print(info2)
b. POST requests: the form parameters are packed into the Request object when it is created.
"""爬虫, 简单的post请求""" from urllib.request import Request, urlopen from urllib.parse import urlencode from fake_useragent import UserAgent url = "http://www.sxt.cn/index/login/login.html" # 参数(此处为登录的账号和密码) form_data = { 'user': '...', 'password': '...' } f_data = urlencode(form_data) # 转码,防止其中有中文而报错 # 修改请求头 headers = { "User-Agent": UserAgent().chrome } # 创建request对象,发送请求 try: request = Request(url, data=f_data.encode(), headers=headers, ) response = urlopen(request) info = response.read() # print(info.decode()) except Exception as err: print('err', err)
