day02
内容概要
- requests高级用法
- 代理池搭建
- 爬取某视频网站
- 爬取新闻
requests高级用法
-
ssl认证
https://zhuanlan.zhihu.com/p/561907474 详细
http协议:明文传输
https协议:http + ssl/tls
HTTP + SSL/TLS,也就是在http上加了一层处理加密信息的模块,比http安全,可以防止数据在传输中被窃取、篡改,确保数据的完整性。
以后遇到证书提示问题 ssl xxx
-
不验证证书
# import requests
# res = requests.get("https://www.12306.cn", verify=False)
# print(res.status_code)  # 会出现 warnings.warn(警告)
-
去除警告
# 去除警告
# import requests
# from requests.packages import urllib3
# urllib3.disable_warnings()  # 去除警告
# res = requests.get("https://www.12306.cn", verify=False)
# print(res.status_code)  # 200
-
手动携带证书
# 手动携带证书
import requests
res = requests.get("https://www.12306.cn",
                   cert=('/path/server.crt',  # 证书
                         '/path/key'))  # 密钥
-
-
使用代理
如果爬虫使用自身ip地址访问,很有可能被封ip地址,以后就访问不了
我们可以使用代理ip
代理:收费和免费(不稳定)
# res = requests.get("http://bilibili.com", proxies={"协议": "ip:端口"})
res = requests.get('https://www.cnblogs.com', proxies={"http": '36.6.145.246:8089'})
print(res.status_code)
高匿代理和透明代理
- 高匿,服务端拿不到真实客户端的ip地址
- 透明,服务端拿到真实客户端的ip地址
后端如何拿到真实客户端ip地址
-
http请求头中有个:X-Forwarded-For: client, proxy1, proxy2, proxy3
x-forwarded-for
获取HTTP请求端真实的IP
-
超时设置
import requests
response = requests.get('https://www.baidu.com', timeout=0.0001)
# 如果超过设定的时间仍未响应,就断开连接并抛出超时异常
-
异常处理
import requests
from requests.exceptions import RequestException
try:
    res = requests.get("http://www.baidu.com")
except RequestException as e:
    print(e)
代理池搭建
request 发送请求使用代理
代理从哪里来
- 公司花钱买
- 搭建免费的代理池
- https://github.com/jhao104/proxy_pool
- python:爬虫+flask写的
- 架构:

代码
# Fetch one proxy from the local proxy-pool service and route a request through it.
import requests

proxy_info = requests.get("http://127.0.0.1:5010/get/").json()
# The pool reports whether the proxy supports https; pick the matching scheme key.
scheme = "https" if proxy_info.get("https") else "http"
proxies = {scheme: proxy_info.get("proxy")}
response = requests.get("http://www.baidu.com", proxies=proxies)
print(response.status_code)
django后端获取客户端的ip
建立django后端---》index地址---》访问就返回访问者的ip
django代码---》
# import requests
#
# res_dict = requests.get("http://127.0.0.1:5010/get/").json()
# res = {"https" if res_dict.get("https") else "http": res_dict.get("proxy")}
#
# res1 = requests.get("http://www.baidu.com", proxies=res)
#
# print(res1.status_code)
# --- proxy-checker setup: probe pool proxies and record results in Redis ---
from redis import Redis
import requests
from requests.packages import urllib3
from requests.exceptions import RequestException
import re
cc = Redis()  # default local Redis; stores working proxies ("ip2") and failures ("error")
urllib3.disable_warnings()  # suppress InsecureRequestWarning caused by verify=False below
from threading import Thread, Lock
my_local = Lock()  # intended to guard the Redis writes; acquire/release calls are commented out in task()
def task():
    """Grab one proxy from the pool and probe it against the echo server.

    A proxy whose reply looks like a JSON body (and is neither our real IP
    nor an HTML error page) is pushed to the Redis list "ip2"; request
    failures are recorded in the Redis list "error".
    """
    info = requests.get("http://127.0.0.1:5010/get/").json()
    scheme = "https" if info.get("https") else "http"
    proxy_conf = {scheme: info.get("proxy")}
    print(proxy_conf)
    try:
        reply = requests.get("http://124.222.206.158:8080/", proxies=proxy_conf,
                             verify=False, timeout=3)
        body = str(reply.text)
        looks_like_json = re.findall("{.*?}", reply.text)
        if looks_like_json and "218.82.25.213" not in body and "html" not in body:
            # my_local.acquire()
            cc.lpush("ip2", str((str(proxy_conf), str(reply.text))))
            # my_local.release()
            print(reply.text, "我的")
    except RequestException:
        # my_local.acquire()
        cc.lpush("error", str(proxy_conf))
        # my_local.release()
#
#
# for i in range(100):
# t = Thread(target=task)
# t.start()
# res1 = requests.get("http://124.222.206.158:8080/", proxies={'http': '218.7.171.91:3128'})
# print(res1.text, "我的")
# res1 = requests.get("http://124.222.206.158:8080/", proxies={'http':'123.159.126.27:8085'})
# print(res1.text, "我的")
# res1 = "{'http': '218.7.171.91:3128'}"
# #
# print(re.findall("{.*?}", res1) and "218.82.25.213" not in str(res1) and "html" not in str(
# res1))
django后端
from django.shortcuts import render, HttpResponse, redirect
from django.http import JsonResponse
# Create your views here.
def test(request):
    """Return the caller's IP address as JSON.

    Behind a proxy, REMOTE_ADDR holds the proxy's address, not the client's.
    Per the X-Forwarded-For discussion in these notes, prefer the left-most
    entry of that header ("client, proxy1, proxy2") when it is present, and
    fall back to REMOTE_ADDR for direct connections.
    """
    forwarded = request.META.get("HTTP_X_FORWARDED_FOR")
    if forwarded:
        # Left-most hop is the original client; later entries are proxies.
        ip = forwarded.split(",")[0].strip()
    else:
        ip = request.META.get("REMOTE_ADDR")
    print(ip)
    return JsonResponse({"ip": ip})
爬取某视频网站
import random
import requests
from requests.packages import urllib3
import re
urllib3.disable_warnings()
# Crawl the video-list page through a randomly chosen free proxy, then pull
# the first video's direct <video src> URL from its detail page.
ls = [{'http': '123.159.126.27:8085'}, {'http': '118.31.2.38:8999'}, {'http': '218.7.171.91:3128'}]
list_page = requests.get(
    'https://www.pearvideo.com/category_loading.jsp?reqType=4&categoryId=1&start=0',
    proxies=random.choice(ls), verify=False)
# Each match's "id" group is a relative detail-page path (e.g. video_123456).
anchor_matches = re.finditer('<a href="(?P<id>.*?)" class="vervideo-lilink actplay">', list_page.text)
url_list = []
for match in anchor_matches:
    url_list.append("".join(["https://www.pearvideo.com/", match.group("id")]))
print(url_list)
detail_page = requests.get(url_list[0], proxies=random.choice(ls), verify=False)
src_list = re.findall(
    '<video webkit-playsinline="" playsinline="" x-webkit-airplay="" autoplay="autoplay" src="(.*?)" style="width: 100%; height: 100%;"></video>',
    detail_page.text)
print(src_list)
# # print(video2_list)
# # for i in video2_list:
# # print(i.group("vido"))
爬取新闻
import requests
from bs4 import BeautifulSoup
import random
# Parse the locally cached news page and print every article's details.
ls = [{'http': '123.159.126.27:8085'}, {'http': '118.31.2.38:8999'}, {'http': '218.7.171.91:3128'}]
# data = requests.get('https://www.autohome.com.cn/all/1/#liststart', proxies=random.choice(ls))
# print(data.text)
# with open("a.txt", 'wt', encoding="utf8") as f:
#     f.write(data.text)
with open('a.txt', 'rt', encoding="utf8") as f:
    data = f.read()
soup = BeautifulSoup(data, 'html.parser')
ul_list = soup.find_all(name='ul', class_="article")
for ul in ul_list:
    # Each <ul class="article"> holds many <li> news entries; walk them all.
    # (Calling find() on the <ul> directly would only see the first article.)
    for li in ul.find_all(name='li'):
        h3_tag = li.find("h3")
        # Ad/placeholder <li> items have no headline — skip them instead of
        # crashing on .text of None.
        if not h3_tag or not h3_tag.text:
            continue
        h3 = h3_tag.text
        img_tag = li.find(name="img")
        a_tag = li.find(name="a")
        p_tag = li.find(name="p")
        img = img_tag.attrs.get("src") if img_tag else None
        # hrefs on this page are protocol-relative ("//..."); prepend "http:".
        link = "".join(["http:", a_tag.attrs.get("href")]) if a_tag else None
        desc = p_tag.text if p_tag else None
        print("""
新闻:%s
图片:%s
链接:%s
详情:%s
""" % (h3, img, link, desc))

浙公网安备 33010602011771号