化身天使的博客

python学习笔记第11章:web爬虫

2.1 requests库

2.1.1 直接上手

 

#1 下载文件

import requests

 

# 0. Data definitions
src = r'C:\Users\clockx.cab'          # local path the download is written to
url = 'http://a.clock.cab'            # address of the file or page to fetch

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/86.0.4240.198 Safari/537.36'}
cookie = {"cookie": " "}              # blank placeholder value; fill in as needed

# Fetch the file or page. requests.get() receives the cookie dict through the
# `cookies` keyword (there is no `cookie` keyword); drop the argument when no
# cookie has to be sent.
result = requests.get(url, headers=headers, cookies=cookie)

# Write the body in binary mode; suitable for images and other binary files.
with open(src, 'wb') as f:
    f.write(result.content)

 

 

# 2下载必应壁纸

import requests

import re

import time

# 0. Data definitions

# Text file used to store the HTML of the fetched listing pages.
src = r'bing.txt'

# Directory prefix the pictures are saved under. A raw string cannot end in a
# single backslash, so the literal ends with two of them.
src_save = r'C:\Users\zx\Pictures\壁纸\\'

# Request headers carrying a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/86.0.4240.198 Safari/537.36'
    ),
}

#一.函数

#1获取页面

def web_get():
    """Fetch the listing pages and append their HTML to the file ``src``.

    Builds the page URLs, requests each one with the module-level
    ``headers`` and writes the response text to ``src`` in append mode,
    pausing 1.2 seconds after every request.
    """
    # Page links to crawl; range(1, 2) currently yields page 1 only.
    url_page = [f'https://bing.ioliu.cn/?p={i}' for i in range(1, 2)]

    # Append mode keeps whatever is already stored in the file. The `with`
    # block closes the file even when a request raises.
    with open(src, 'a', encoding='utf-8') as f:
        for url in url_page:
            web = requests.get(url, headers=headers)
            f.write(web.text)
            time.sleep(1.2)

 

#2.用正则表达式提取出下载链接

def url_get():
    """Extract the picture links from the file ``src`` and download each one.

    Reads the stored HTML, collects every 1920x1080 picture URL with a
    regular expression, derives the picture name from the part between
    ``bing/`` and the first underscore, and passes both to ``file_get``.
    Sleeps two seconds after every download.
    """
    pattern_picurl = re.compile(r'pic=(http://h2\.ioliu\.cn/bing/.+?1920x1080\.jpg)', re.S)
    pattern_picname = re.compile(r'bing/(.+?)_')

    # Read everything first so the file is closed before the downloads start.
    with open(src, 'r', encoding='utf-8') as f:
        text = f.read()

    for url in pattern_picurl.findall(text):
        pic_name = pattern_picname.search(url)
        if pic_name is None:
            # The URL has no "bing/<name>_" part to name the file after;
            # skip it instead of failing on pic_name[1].
            continue
        file_get(url, pic_name[1])
        time.sleep(2)        # pause between downloads; may be removed if not needed

 

#3.下载传入的url

def file_get(url,pic_name):
    """Download ``url`` and save the response body as a ``.jpg`` file.

    Args:
        url: Link of the picture to download.
        pic_name: File name without extension; it is appended to the
            module-level ``src_save`` prefix and followed by ``.jpg``.
    """
    response = requests.get(url, headers=headers)
    # Target path = storage directory + file name + extension.
    target_path = src_save + pic_name + '.jpg'
    with open(target_path, 'wb') as out_file:
        out_file.write(response.content)

# II. Run: web_get() must come first, because url_get() reads the file that web_get() writes.

web_get()

url_get()

 

2.1.2 请求

cookie = {"cookie":'z; ci_session=g4v'}  # 设置cookie

 

session = requests.session()    #创建会话

session.get(url)              #使用会话

 

POST请求

data = {'username': 'userna',    # data to upload; keys must match the form fields
        'passwd': 'passwd'}
# `data` may also be passed positionally to post().
session.post(url, data=data, headers=headers, cookies=cookie)
# Session.get() accepts only `url` positionally, so `params=` has to be spelled out.
session.get(url, params=data)

 

示例

# Example: POST a form payload together with custom headers and cookies.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/86.0.4240.198 Safari/537.36'
    ),
}
cookie = {"cookie": " "}
payload = {"category": "上报", "source": ""}
url = " "
datas = requests.post(url, headers=header, cookies=cookie, data=payload)
# Print the raw response body (bytes).
print(datas.content)

 

url编码/解码

import urllib.parse   # `import urllib` alone does not load the `parse` submodule

urllib.parse.quote(s)            # URL-encode

urllib.parse.unquote(s)          # URL-decode

 

2.1.3 响应Response

result.status_code   #获取请求状态码,200

result.cookies      #获取cookies, jar格式

result.raw            #原始内容

result.text           #内容

result.json()         #json格式

result.content        #二进制内容

result.encoding       #查看编码方式

result.encoding = 'gbk'     #修改编码方式

requests.utils.dict_from_cookiejar(result.cookies)['SESSION']  #提取cookie

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

2.2 selenium

2.2.1 直接上手

from selenium import webdriver
from selenium.webdriver.common.by import By      # locator strategies
from selenium.webdriver.common.keys import Keys  # keyboard keys
import time

# Hide the "Chrome is being controlled by automated test software" notice.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])

driver = webdriver.Chrome(options=chrome_options)
driver.get("https://a.com")    # open the page
driver.fullscreen_window()     # switch to full screen
# Optional: zoom the page to 90%.
#driver.execute_script("document.body.style.transform='scale(0.90)'")

# The find_element_by_* helpers were removed in Selenium 4.3;
# find_element(By.<strategy>, value) is the supported form.

# 1. Click the select box (located by class name).
e = driver.find_element(By.CLASS_NAME, "el-input__inner")
e.click()
time.sleep(2)

# 2. Choose the option (located by its visible text).
e = driver.find_element(By.XPATH, "//*[text()='杭州']")
e.click()
time.sleep(2)

# 3. Confirm.
e = driver.find_element(By.CLASS_NAME, "btn")
e.click()
# Optional: send a key press to the element instead of clicking.
#e.send_keys(Keys.ENTER)

 

 

 

2.2.2 功能

#1 浏览器打开指定链接

import  webbrowser

webbrowser.open(url)

posted @ 2021-08-07 21:06  化身天使  阅读(56)  评论(0)    收藏  举报