day02
内容概要
- requests高级用法
- 代理池搭建
- 爬取某视频网站
- 爬取新闻
requests高级用法
-
ssl认证
https://zhuanlan.zhihu.com/p/561907474 详细
http协议:明文传输
https协议:http + ssl/tls
HTTP + SSL/TLS,也就是在http上加了一层处理加密信息的模块,比http安全,可以防止数据在传输中被窃取、篡改,确保数据的完整性。
以后遇到证书提示问题 ssl xxx
-
不验证证书
# import requests
# res = requests.get("https://www.12306.cn", verify=False)
# print(res.status_code)  # 会出现 warnings.warn(警告)
-
去除警告
# 去除警告
# import requests
# from requests.packages import urllib3
# urllib3.disable_warnings()  # 去除警告
# res = requests.get("https://www.12306.cn", verify=False)
# print(res.status_code)  # 200
-
手动携带证书
# 手动携带证书
import requests
res = requests.get("https://www.12306.cn",
                   cert=('/path/server.crt',  # 证书
                         '/path/key'))  # 密钥
-
-
使用代理
如果爬虫使用自身ip地址访问,很有可能被封ip地址,以后就访问不了
我们可以使用代理ip
代理:收费和免费(不稳定)
# res = requests.get("http://bilibili.com", proxies={"协议": "ip:端口"})
res = requests.get('https://www.cnblogs.com', proxies={"http": '36.6.145.246:8089'})
print(res.status_code)
高匿代理和透明代理
- 高匿,服务端拿不到真实客户端的ip地址
- 透明,服务端拿到真实客户端的ip地址
后端如何拿到真实客户端ip地址
-
http请求头中有个:X-Forwarded-For: client, proxy1, proxy2, proxy3
x-forwarded-for
获取HTTP请求端真实的IP
-
超时设置
import requests
response = requests.get('https://www.baidu.com', timeout=0.0001)
# 如果超过设定的时间仍未响应,就断开连接并抛出超时异常
-
异常处理
import requests
from requests.exceptions import RequestException
try:
    res = requests.get("http://www.baidu.com")
except RequestException as e:
    print(e)
代理池搭建
request 发送请求使用代理
代理从哪里来
- 公司花钱买
- 搭建免费的代理池
- https://github.com/jhao104/proxy_pool
- python:爬虫+flask写的
- 架构:

代码
# Fetch one proxy from the local proxy-pool service and route a request through it.
import requests

proxy_info = requests.get("http://127.0.0.1:5010/get/").json()
# The pool reports whether the proxy supports https; pick the matching scheme key.
scheme = "https" if proxy_info.get("https") else "http"
proxies = {scheme: proxy_info.get("proxy")}
response = requests.get("http://www.baidu.com", proxies=proxies)
print(response.status_code)
django后端获取客户端的ip
建立django后端---》index地址---》访问就返回访问者的ip
django代码---》
# import requests
#
# res_dict = requests.get("http://127.0.0.1:5010/get/").json()
# res = {"https" if res_dict.get("https") else "http": res_dict.get("proxy")}
#
# res1 = requests.get("http://www.baidu.com", proxies=res)
#
# print(res1.status_code)
# --- proxy-checker setup: probe pool proxies and record results in Redis ---
from redis import Redis
import requests
from requests.packages import urllib3
from requests.exceptions import RequestException
import re
cc = Redis()  # default local Redis; stores working proxies ("ip2") and failures ("error")
urllib3.disable_warnings()  # suppress InsecureRequestWarning caused by verify=False below
from threading import Thread, Lock
my_local = Lock()  # intended to guard the Redis writes; acquire/release calls are commented out in task()
def task():
    """Grab one proxy from the pool and probe it against the echo server.

    A proxy whose reply looks like a JSON body (and is neither our real IP
    nor an HTML error page) is pushed to the Redis list "ip2"; request
    failures are recorded in the Redis list "error".
    """
    info = requests.get("http://127.0.0.1:5010/get/").json()
    scheme = "https" if info.get("https") else "http"
    proxy_conf = {scheme: info.get("proxy")}
    print(proxy_conf)
    try:
        reply = requests.get("http://124.222.206.158:8080/", proxies=proxy_conf,
                             verify=False, timeout=3)
        body = str(reply.text)
        looks_like_json = re.findall("{.*?}", reply.text)
        if looks_like_json and "218.82.25.213" not in body and "html" not in body:
            # my_local.acquire()
            cc.lpush("ip2", str((str(proxy_conf), str(reply.text))))
            # my_local.release()
            print(reply.text, "我的")
    except RequestException:
        # my_local.acquire()
        cc.lpush("error", str(proxy_conf))
        # my_local.release()
#
#
# for i in range(100):
# t = Thread(target=task)
# t.start()
# res1 = requests.get("http://124.222.206.158:8080/", proxies={'http': '218.7.171.91:3128'})
# print(res1.text, "我的")
# res1 = requests.get("http://124.222.206.158:8080/", proxies={'http':'123.159.126.27:8085'})
# print(res1.text, "我的")
# res1 = "{'http': '218.7.171.91:3128'}"
# #
# print(re.findall("{.*?}", res1) and "218.82.25.213" not in str(res1) and "html" not in str(
# res1))
django后端
from django.shortcuts import render, HttpResponse, redirect
from django.http import JsonResponse
# Create your views here.
def test(request):
    """Return the caller's IP address as JSON.

    Behind a proxy, REMOTE_ADDR holds the proxy's address, not the client's.
    Per the X-Forwarded-For discussion in these notes, prefer the left-most
    entry of that header ("client, proxy1, proxy2") when it is present, and
    fall back to REMOTE_ADDR for direct connections.
    """
    forwarded = request.META.get("HTTP_X_FORWARDED_FOR")
    if forwarded:
        # Left-most hop is the original client; later entries are proxies.
        ip = forwarded.split(",")[0].strip()
    else:
        ip = request.META.get("REMOTE_ADDR")
    print(ip)
    return JsonResponse({"ip": ip})
爬取某视频网站
import random
import requests
from requests.packages import urllib3
import re
urllib3.disable_warnings()
# Crawl the video-list page through a randomly chosen free proxy, then pull
# the first video's direct <video src> URL from its detail page.
ls = [{'http': '123.159.126.27:8085'}, {'http': '118.31.2.38:8999'}, {'http': '218.7.171.91:3128'}]
list_page = requests.get(
    'https://www.pearvideo.com/category_loading.jsp?reqType=4&categoryId=1&start=0',
    proxies=random.choice(ls), verify=False)
# Each match's "id" group is a relative detail-page path (e.g. video_123456).
anchor_matches = re.finditer('<a href="(?P<id>.*?)" class="vervideo-lilink actplay">', list_page.text)
url_list = []
for match in anchor_matches:
    url_list.append("".join(["https://www.pearvideo.com/", match.group("id")]))
print(url_list)
detail_page = requests.get(url_list[0], proxies=random.choice(ls), verify=False)
src_list = re.findall(
    '<video webkit-playsinline="" playsinline="" x-webkit-airplay="" autoplay="autoplay" src="(.*?)" style="width: 100%; height: 100%;"></video>',
    detail_page.text)
print(src_list)
# # print(video2_list)
# # for i in video2_list:
# # print(i.group("vido"))
爬取新闻
import requests
from bs4 import BeautifulSoup
import random
# Parse the locally cached news page and print every article's details.
ls = [{'http': '123.159.126.27:8085'}, {'http': '118.31.2.38:8999'}, {'http': '218.7.171.91:3128'}]
# data = requests.get('https://www.autohome.com.cn/all/1/#liststart', proxies=random.choice(ls))
# print(data.text)
# with open("a.txt", 'wt', encoding="utf8") as f:
#     f.write(data.text)
with open('a.txt', 'rt', encoding="utf8") as f:
    data = f.read()
soup = BeautifulSoup(data, 'html.parser')
ul_list = soup.find_all(name='ul', class_="article")
for ul in ul_list:
    # Each <ul class="article"> holds many <li> news entries; walk them all.
    # (Calling find() on the <ul> directly would only see the first article.)
    for li in ul.find_all(name='li'):
        h3_tag = li.find("h3")
        # Ad/placeholder <li> items have no headline — skip them instead of
        # crashing on .text of None.
        if not h3_tag or not h3_tag.text:
            continue
        h3 = h3_tag.text
        img_tag = li.find(name="img")
        a_tag = li.find(name="a")
        p_tag = li.find(name="p")
        img = img_tag.attrs.get("src") if img_tag else None
        # hrefs on this page are protocol-relative ("//..."); prepend "http:".
        link = "".join(["http:", a_tag.attrs.get("href")]) if a_tag else None
        desc = p_tag.text if p_tag else None
        print("""
新闻:%s
图片:%s
链接:%s
详情:%s
""" % (h3, img, link, desc))

浙公网安备 33010602011771号