day02

内容概要

  • requests高级用法
  • 代理池搭建
  • 爬取某视频网站
  • 爬取新闻

requests高级用法

  1. ssl认证

    https://zhuanlan.zhihu.com/p/561907474 详细

    http协议:明文传输

    https协议:http + ssl/tls

    HTTP + SSL/TLS,也就是在http上加了一层处理加密信息的模块,比http安全,可以防止数据在传输中被窃取、改变,确保数据的完整性。

    以后遇到证书提示问题 ssl xxx

    1. 不验证证书

      # import requests
      # res = requests.get("https://www.12306.cn", verify=False)
      # print(res.status_code)  # warnings.warn( 警告)
      
      
    2. 去除警告

      # 去除警告
      # import requests
      # from requests.packages import urllib3
      #
      # urllib3.disable_warnings()  # 去除警告
      # res = requests.get("https://www.12306.cn", verify=False)
      # print(res.status_code)  # 200
      
      
    3. 手动携带证书

      # Manually present a client certificate and private key (mutual TLS).
      import requests

      # Fix: the original URL contained a stray leading quote ("'https://...),
      # which would make requests fail with an invalid-URL error.
      res = requests.get("https://www.12306.cn", cert=('/path/server.crt',  # certificate file
                                                       '/path/key'))  # private key file
      
  2. 使用代理

    如果爬虫使用自身ip地址访问,很有可能被封ip地址,以后就访问不了

    我们可以使用代理ip

    代理:收费和免费(不稳定)

    # res = requests.get("http://bilibili.com", proxies={"协议": "ip:端口"})
    
    
    # NOTE(review): the target URL is https:// but only an "http" proxy is
    # supplied, so requests will NOT route this request through the proxy —
    # confirm whether {"https": '36.6.145.246:8089'} was intended.
    res = requests.get('https://www.cnblogs.com',proxies={"http": '36.6.145.246:8089'})
    print(res.status_code)
    

    高匿代理和透明代理

    • 高匿,服务端拿不到真实客户端的ip地址
    • 透明,服务端拿到真实客户端的ip地址

    后端如何拿到真实客户端ip地址

    • http请求头中有个:X-Forwarded-For:client,proxy1, proxy2, proxy3

      x-forwarded-for

      获取HTTP请求端真实的IP

  3. 超时设置

    # Raise requests.exceptions.Timeout if no response arrives within
    # `timeout` seconds; 0.0001 is deliberately tiny so the call is
    # expected to time out (demo of the parameter).
    import requests
    respone=requests.get('https://www.baidu.com',timeout=0.0001)
    

    如果超过设定的时间还没有响应,就断开连接

  4. 异常处理

    # RequestException is the base class of every exception raised by the
    # requests library (timeouts, connection errors, invalid URLs, ...),
    # so catching it covers all request failures in one handler.
    import requests
    from requests.exceptions import RequestException
    
    try:
        res = requests.get("http://www.baidu.com")
    except RequestException as e:
        # Report the failure instead of crashing.
        print(e)
    

代理池搭建

request 发送请求使用代理

代理从哪里来

image-20230316163352699

代码

import requests

# Ask the local proxy-pool service for one proxy record, then use it.
record = requests.get("http://127.0.0.1:5010/get/").json()
scheme = "https" if record.get("https") else "http"
proxies = {scheme: record.get("proxy")}

response = requests.get("http://www.baidu.com", proxies=proxies)

print(response.status_code)

django后端获取客户端的ip

建立django后端---》index地址---》访问就返回访问者的ip

django代码---》

# import requests
#
# res_dict = requests.get("http://127.0.0.1:5010/get/").json()
# res = {"https" if res_dict.get("https") else "http": res_dict.get("proxy")}
#
# res1 = requests.get("http://www.baidu.com", proxies=res)
#
# print(res1.status_code)

from redis import Redis
import requests
from requests.packages import urllib3
from requests.exceptions import RequestException
import re

# Redis client used by task() to record good proxies ("ip2" list) and
# failed ones ("error" list). Connects to localhost with defaults.
cc = Redis()
urllib3.disable_warnings()  # silence InsecureRequestWarning from verify=False
from threading import Thread, Lock

# Lock intended to serialize the redis pushes; currently unused because the
# acquire()/release() calls inside task() are commented out (lpush is atomic).
my_local = Lock()


def task():
    """Fetch one proxy from the local pool and probe it against a test server.

    Side effects: working proxies are pushed onto the Redis list "ip2" as a
    stringified (proxy_dict, response_body) tuple; proxies that raise a
    request error are pushed onto the "error" list. Uses the module-level
    Redis client ``cc``.
    """
    # One proxy record from the local proxy-pool HTTP service.
    res_dict = requests.get("http://127.0.0.1:5010/get/").json()
    # Build the mapping requests' proxies= expects: {scheme: "ip:port"}.
    res = {"https" if res_dict.get("https") else "http": res_dict.get("proxy")}
    print(res)
    try:
        # Probe server that echoes the caller's IP; 3 s cap keeps dead proxies cheap.
        res1 = requests.get("http://124.222.206.158:8080/", proxies=res, verify=False, timeout=3)

        # Accept the proxy only when the body looks like a JSON-ish payload,
        # does not contain 218.82.25.213 (presumably the author's own IP —
        # i.e. the proxy is anonymous), and is not an HTML error page.
        if re.findall("{.*?}", res1.text) and "218.82.25.213" not in str(res1.text) and "html" not in str(
                res1.text):
            # my_local.acquire()
            cc.lpush("ip2", str((str(res), str(res1.text))))
            # my_local.release()
            print(res1.text, "我的")
        # res1 = requests.get("https://test.ipw.cn/", proxies={"http": "36.6.145.246:8089"}, verify=False)
    except RequestException as e:
        # Any network failure marks this proxy as bad.
        # my_local.acquire()
        cc.lpush("error", str(res))
        # my_local.release()


#
#
# for i in range(100):
#     t = Thread(target=task)
#     t.start()
# res1 = requests.get("http://124.222.206.158:8080/", proxies={'http': '218.7.171.91:3128'})
# print(res1.text, "我的")


# res1 = requests.get("http://124.222.206.158:8080/", proxies={'http':'123.159.126.27:8085'})
# print(res1.text, "我的")
# res1 = "{'http': '218.7.171.91:3128'}"
# #
# print(re.findall("{.*?}", res1) and "218.82.25.213" not in str(res1) and "html" not in str(
#                 res1))

django后端

from django.shortcuts import render, HttpResponse, redirect

from django.http import JsonResponse


# Create your views here.

def test(request):
    """Echo the connecting client's IP address back as JSON: {"ip": "..."}.

    NOTE(review): REMOTE_ADDR is the direct TCP peer — behind a proxy this
    is the proxy's address, not the end user's.
    """
    client_ip = request.META.get("REMOTE_ADDR")
    print(client_ip)  # debug trace
    return JsonResponse({"ip": client_ip})

爬取某视频网站

import random
import requests
from requests.packages import urllib3
import re

urllib3.disable_warnings()  # silence InsecureRequestWarning from verify=False

# Small hand-picked pool of HTTP proxies; a random one is used per request.
ls = [{'http': '123.159.126.27:8085'}, {'http': '118.31.2.38:8999'}, {'http': '218.7.171.91:3128'}]

# Fetch one page of the video-category listing.
res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=4&categoryId=1&start=0',
                   proxies=random.choice(ls), verify=False)
# Pull every video-detail path out of the listing HTML, then build full URLs.
video_list = re.finditer('<a href="(?P<id>.*?)" class="vervideo-lilink actplay">', res.text)
url_list = ["".join(["https://www.pearvideo.com/", i.group("id")]) for i in video_list]
print(url_list)
# Open the first detail page and scrape the <video> tag's src attribute.
video_data = requests.get(url_list[0], proxies=random.choice(ls), verify=False)
video2_list = re.findall(
    '<video webkit-playsinline="" playsinline="" x-webkit-airplay="" autoplay="autoplay" src="(.*?)" style="width: 100%; height: 100%;"></video>',
    video_data.text)
print(video2_list)
# # print(video2_list)
# # for i in video2_list:
# #     print(i.group("vido"))

爬取新闻

import requests
from bs4 import BeautifulSoup
import random

# Proxy pool used for the (now commented-out) live fetch below.
ls = [{'http': '123.159.126.27:8085'}, {'http': '118.31.2.38:8999'}, {'http': '218.7.171.91:3128'}]

# The page was fetched once and cached to a.txt; parsing works offline.
# data = requests.get('https://www.autohome.com.cn/all/1/#liststart', proxies=random.choice(ls))
# print(data.text)
# with open("a.txt", 'wt', encoding="utf8") as f:
#     f.write(data.text)

# Parse the cached HTML and print title/image/link/summary per article.
with open('a.txt', 'rt', encoding="utf8") as f:
    page_html = f.read()

soup = BeautifulSoup(page_html, 'html.parser')

for article_ul in soup.find_all(name='ul', class_="article"):
    title = article_ul.find("h3").text
    if not title:
        continue  # skip entries without a headline
    img = article_ul.find(name="img").attrs.get("src")
    link = "".join(["http:", article_ul.find(name="a").attrs.get("href")])
    desc = article_ul.find(name="p").text
    print("""
            新闻:%s
            图片:%s
            链接:%s
            详情:%s
        """ % (title, img, link, desc))
posted @ 2023-03-17 22:51  可否  阅读(14)  评论(0)    收藏  举报