Web Scraping, Part 2

1: Image lazy loading example

      import requests

      from lxml import etree

      headers = {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
      }
      """
          图片懒加载:当用户刷新页面的时候,页面中的图片只会加载出现局部的而不是所有,只有当满足了指定的条件
              才可以将剩余的图片加载出来.
          
          如何决定图片是否要加载出来?使用伪属性 
      """
      url = 'https://sc.chinaz.com/tupian/hunsha.html'
      response = requests.get(url,headers=headers)

      response.encoding='utf-8'
      page_text = response.text

      tree = etree.HTML(page_text)
      div_list = tree.xpath('//*[@id="container"]/div')
      for div in div_list:
          # the real URL lives in the lazy-load pseudo attribute src2, not src
          img_src = 'https:' + div.xpath('./div/a/img/@src2')[0]
          img_name = div.xpath('./div/a/img/@alt')[0] + '.jpg'
          print(img_name, img_src)
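
The loop above only prints each result. A short sketch for actually saving the images (assumption: the src2 URLs return raw image bytes):

      # Sketch: persist the images instead of just printing them
      # (assumption: img_src responds with raw image bytes).
      import os

      os.makedirs('./imgs', exist_ok=True)
      for div in div_list:
          img_src = 'https:' + div.xpath('./div/a/img/@src2')[0]
          img_name = div.xpath('./div/a/img/@alt')[0] + '.jpg'
          img_data = requests.get(img_src, headers=headers).content
          with open('./imgs/' + img_name, 'wb') as fp:
              fp.write(img_data)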

2: Cookie anti-scraping example

        import requests

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
        }

        # create a Session object (it persists cookies across requests)
        sess = requests.Session()

        # visit the home page first so the session captures the anti-scraping cookie
        sess.get('https://xueqiu.com/',headers=headers)

        url = 'https://xueqiu.com/statuses/hot/listV2.json?since_id=-1&max_id=195129&size=15'

        result=sess.get(url,headers=headers).json()

        print(result)
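
Session handles the cookie automatically. A manual equivalent, as a sketch: capture the cookie jar from the first response and pass it explicitly.

        # Sketch: the same flow without Session, passing cookies by hand.
        first = requests.get('https://xueqiu.com/', headers=headers)
        result = requests.get(url, headers=headers, cookies=first.cookies).json()
        print(result)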

3: Proxy usage

      import requests

      from lxml import etree

      import random

      headers = {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
      }

      # proxy pool
      # list usable proxy IPs below; requests expects each entry as a proxies
      # dict, e.g. {'https': 'https://ip:port'} (shape assumed here)
      ips = [
          # ......
      ]

      url = 'https://www.sogou.com/web?query=ip'

      # use a proxy picked at random from the pool

      page_text = requests.get(url,headers=headers,proxies=random.choice(ips)).text

      tree = etree.HTML(page_text)

      print(tree.xpath('//*[@id="ipsearchresult"]/strong/text()')[0])
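
Free proxies fail often. A hedged sketch that retries with a different pool entry when one times out (get_with_retry is not part of the original):

      # Sketch: rotate to another proxy from the pool on failure.
      def get_with_retry(url, max_tries=3):
          for _ in range(max_tries):
              try:
                  return requests.get(url, headers=headers,
                                      proxies=random.choice(ips), timeout=5).text
              except requests.exceptions.RequestException:
                  continue  # this proxy failed; try a different one
          raise RuntimeError('all proxy attempts failed')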

4: Automatic captcha recognition

      import base64
      import json
      import requests

      # Supported captcha types (typeid values):
      # I. Image text (default 3, digits + letters):
      # 1    : digits only
      # 1001 : digits only, v2
      # 2    : letters only
      # 1002 : letters only, v2
      # 3    : digits and letters
      # 1003 : digits and letters, v2
      # 4    : animated GIF
      # 7    : senseless learning (exclusive)
      # 11   : arithmetic problem
      # 1005 : fast arithmetic problem
      # 16   : Chinese characters
      # 32   : general OCR (certificates, receipts)
      # 66   : question answering
      # 49   : recaptcha image recognition, see https://shimo.im/docs/RPGcTpxdVgkkdQdY
      # II. Image rotation:
      # 29   : rotation type
      #
      # III. Coordinate click selection:
      # 19   : 1 coordinate
      # 20   : 3 coordinates
      # 21   : 3-5 coordinates
      # 22   : 5-8 coordinates
      # 27   : 1-4 coordinates
      # 48   : trajectory type
      #
      # IV. Gap recognition:
      # 18   : gap recognition
      # V. Puzzle recognition:
      # 53   : puzzle recognition


      def base64_api(uname, pwd, img, typeid):
          # read the captcha image and base64-encode it for the API
          with open(img, 'rb') as f:
              b64 = base64.b64encode(f.read()).decode()
          data = {"username": uname, "password": pwd, "typeid": typeid, "image": b64}
          result = json.loads(requests.post("http://api.ttshitu.com/predict", json=data).text)
          if result['success']:
              return result["data"]["result"]
          else:
              # on failure the API returns an error message string instead
              return result["message"]


      if __name__ == "__main__":
          img_path = "./code.jpg"   # 这里放要识别的文件   下面是账号密码
          result = base64_api(uname='bb328410948', pwd='bb328410948', img=img_path,typeid=3)
          print(result)
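
Because base64_api returns the API's error message string on failure, a caller cannot tell a recognized captcha from an error. A variant that raises instead (base64_api_strict is a sketch, not part of the original):

      # Sketch: raise on failure so errors can't be mistaken for results.
      def base64_api_strict(uname, pwd, img, typeid):
          with open(img, 'rb') as f:
              b64 = base64.b64encode(f.read()).decode()
          data = {"username": uname, "password": pwd, "typeid": typeid, "image": b64}
          result = requests.post("http://api.ttshitu.com/predict", json=data).json()
          if not result['success']:
              raise RuntimeError(result['message'])
          return result["data"]["result"]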

5: Simulated login

    # captcha recognition (same helper as in section 4)
    import base64
    import json
    import requests
    from lxml import etree

    # get a session so the captcha request and the login share cookies
    sess = requests.Session()
    # UA spoofing
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }


    def base64_api(uname, pwd, img, typeid):
        # read the captcha image and base64-encode it for the API
        with open(img, 'rb') as f:
            b64 = base64.b64encode(f.read()).decode()
        data = {"username": uname, "password": pwd, "typeid": typeid, "image": b64}
        result = json.loads(requests.post("http://api.ttshitu.com/predict", json=data).text)
        if result['success']:
            return result["data"]["result"]
        else:
            # on failure the API returns an error message string instead
            return result["message"]


    # dynamically fetch the captcha image (it is tied to the current session)
    main_url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'

    page_text = sess.get(main_url, headers=headers).text

    tree = etree.HTML(page_text)

    img_src = 'https://so.gushiwen.cn'+tree.xpath('//*[@id="imgCode"]/@src')[0]

    img_data = sess.get(img_src,headers=headers).content
    with open('./code.jpg', 'wb') as fp:
        fp.write(img_data)


    # recognize the captcha
    img_path = "./code.jpg"  # the image saved above; uname/pwd are the ttshitu account credentials
    result = base64_api(uname='bb328410948', pwd='bb328410948', img=img_path, typeid=3)
    print(result)


    # capture the dynamically changing request parameters from the page (e.g. __VIEWSTATE)

    __VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]

    login_url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'
    data = {
        # dynamically changing request parameters
        '__VIEWSTATE': __VIEWSTATE,
        '__VIEWSTATEGENERATOR': 'C93BE1AE',
        'from': 'http://so.gushiwen.cn/user/collect.aspx',
        'email': '15027900535',
        'pwd': 'bobo@15027900535',
        'code': result,
        'denglu': '登录'
    }
    page_text = sess.post(login_url,headers=headers,data=data).text
    with open('./gushiwen.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
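
A quick sanity check (assumption: on a failed login the returned page still contains the login form's email input):

    # Sketch: rough success check on the returned page.
    if 'name="email"' not in page_text:
        print('login appears to have succeeded')
    else:
        print('login may have failed; check the captcha result')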

6: Coroutine basics

      import asyncio
      import time

      async def get_request(url):
          print('Requesting:', url)
          time.sleep(2)  # blocking sleep; acceptable for this single-task demo (section 7 switches to asyncio.sleep)
          print('Request finished!')


      def func(task):  # must take one parameter: the task object that add_done_callback was called on
          data = task.result()  # result() returns the return value of the coroutine the task wraps (None here)
          print(data)

      # coroutine object
      c = get_request('www.1.com')

      # wrap the coroutine in a task object
      task = asyncio.ensure_future(c)

      # bind a callback to the task object
      task.add_done_callback(func)
      # create an event loop object
      loop = asyncio.get_event_loop()
      # register the task and start the event loop
      loop.run_until_complete(task)
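
On Python 3.7+ the same flow is usually written with asyncio.run(); a minimal sketch (the return value is an assumption added so the callback has something to print):

      import asyncio

      async def get_request(url):
          print('Requesting:', url)
          await asyncio.sleep(2)  # non-blocking sleep
          print('Request finished!')
          return url  # value delivered to the callback via task.result()

      async def main():
          task = asyncio.ensure_future(get_request('www.1.com'))
          task.add_done_callback(lambda t: print(t.result()))
          await task

      asyncio.run(main())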

7: Multi-task async coroutines

    import asyncio
    import time

    start = time.time()

    async def get_request(url):
        print('Requesting:', url)
        await asyncio.sleep(2)  # non-blocking sleep, so the three tasks overlap
        print('Request finished!')

    urls = [
        'www.1.com',
        'www.2.com',
        'www.3.com'
    ]
    tasks = []
    for url in urls:
        c = get_request(url)
        task = asyncio.ensure_future(c)
        tasks.append(task)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))

    print('Total time:', time.time() - start)
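
Newer Python versions deprecate this get_event_loop/asyncio.wait pattern; the same program as an asyncio.gather sketch:

    import asyncio
    import time

    async def get_request(url):
        print('Requesting:', url)
        await asyncio.sleep(2)
        print('Request finished!')

    async def main(urls):
        await asyncio.gather(*(get_request(u) for u in urls))

    start = time.time()
    asyncio.run(main(['www.1.com', 'www.2.com', 'www.3.com']))
    print('Total time:', time.time() - start)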

8: Multi-task async crawler
  import asyncio
  import time
  import requests
  import aiohttp
  from lxml import etree

  start = time.time()

  # requests does not support async; this version would block the event loop:
  # async def get_request(url):
  #     page_text = requests.get(url).text
  #     return page_text

  async def get_request(url):
      # create a client session
      async with aiohttp.ClientSession() as req:
          # issue the request
          # Note: unlike get/post in requests, which take a proxies dict,
          # aiohttp sets a proxy with proxy='http://ip:port'
          async with req.get(url) as response:
              # text() returns the response body as a string, read() as bytes
              page_text = await response.text()
              return page_text

  def parse(task):
      page_text = task.result()
      tree = etree.HTML(page_text)
      data = tree.xpath('//a[@id="feng"]/text()')[0]
      print(data)
  urls = [
      'http://127.0.0.1:5000/bobo',
      'http://127.0.0.1:5000/tom',
      'http://127.0.0.1:5000/jay'
  ]
  tasks = []
  for url in urls:
      c = get_request(url)
      task = asyncio.ensure_future(c)
      task.add_done_callback(parse)

      tasks.append(task)
  loop = asyncio.get_event_loop()
  loop.run_until_complete(asyncio.wait(tasks))

  print('Total time:', time.time() - start)
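
The three URLs assume a local test server, which the post does not show. A minimal Flask sketch that satisfies the crawler's xpath (//a[@id="feng"]) and simulates latency:

  # Assumed test server (not part of the original post).
  import time
  from flask import Flask

  app = Flask(__name__)

  def make_page(name):
      time.sleep(2)  # simulated server-side delay
      return '<html><body><a id="feng">hello, %s</a></body></html>' % name

  @app.route('/bobo')
  def bobo():
      return make_page('bobo')

  @app.route('/tom')
  def tom():
      return make_page('tom')

  @app.route('/jay')
  def jay():
      return make_page('jay')

  if __name__ == '__main__':
      app.run(threaded=True)  # threaded so the three requests can overlap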

9: Multithreading (thread pool)

    from multiprocessing.dummy import Pool  # thread pool
    import requests
    import time

    start = time.time()

    def get_request(url):
        page_text = requests.get(url).text
        print(len(page_text))

    urls = [
        'http://127.0.0.1:5000/bobo',
        'http://127.0.0.1:5000/tom',
        'http://127.0.0.1:5000/jay'
    ]
    pool = Pool(3)
    # map applies get_request to every element of urls, one task per worker thread
    pool.map(get_request, urls)

    print('Total time:', time.time() - start)
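
The standard library's concurrent.futures offers the same map-over-a-pool pattern; an equivalent sketch:

    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=3) as executor:
        executor.map(get_request, urls)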