python入门day5
• requests的post请求
import re

import requests

# ---------------------------------------------------------------------------
# Part 1: log in to GitHub by POSTing the login form
# ---------------------------------------------------------------------------
headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}

# Fetch the login page first: the cookies and the hidden form token it
# returns are sent back with the login POST below.
response = requests.get(url='https://github.com/login', headers=headers)
print(response.text)

# Convert the cookies returned by the login page into a plain dict
login_cookies = response.cookies.get_dict()

# Pull the hidden authenticity_token field out of the login form.
token_matches = re.findall(' name="authenticity_token" value="(.*?)"', response.text, re.S)
if not token_matches:
    # Fail with a clear message instead of a bare IndexError on [0]
    raise RuntimeError('authenticity_token not found in the GitHub login page')
authenticity_token = token_matches[0]
print(authenticity_token)

# Build the request headers
headers2 = {
    'Referer': 'https://github.com/login',
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}

# Build the request body
# NOTE(review): the "utf-8" key is kept from the original notes; confirm the
# exact field name against the login form's hidden inputs.
form_data = {
    'commit': 'Sign in',
    "utf-8": "✓",
    "authenticity_token": authenticity_token,
    "login": "852653835",
    "password": "******",
    "webauthn-support": "supported"
}

# POST to /session with the headers, the form body and the login-page cookies
response2 = requests.post(
    url='https://github.com/session',
    data=form_data,
    headers=headers2,
    cookies=login_cookies,
)
print(response2.status_code)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(response2.text)

# ---------------------------------------------------------------------------
# Part 2: the Response object
# ---------------------------------------------------------------------------
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
}

response = requests.get('https://www.github.com', headers=headers)
print(response.status_code)  # HTTP status code
print(response.url)  # URL of the response
print(response.text)  # body decoded as text
print(response.content)  # body as raw bytes
print(response.headers)  # response headers
print(response.history)  # redirect responses that led to this one
print(response.cookies)  # cookies as a cookie jar
print(response.cookies.get_dict())  # cookies converted to a dict
print(response.cookies.items())  # cookies as a list of (name, value) pairs
print(response.encoding)  # character encoding used for .text
print(response.elapsed)  # time elapsed between sending the request and the response
• requests的高级应用
# https = http + ssl
import requests
import urllib3

# ---------------------------------------------------------------------------
# Streaming download
# ---------------------------------------------------------------------------
# Send a GET request to the video URL
url = 'http://hc.yinyuetai.com/uploads/videos/common/3B7201685F78BF2954FEEB32CB6EBD82.mp4'

# stream=True defers downloading the body until it is iterated.
# Do not touch response.content here: that would load the whole file into
# memory and defeat the purpose of streaming.
with requests.get(url, stream=True) as response:
    with open('music.mp4', 'wb') as f:
        # An explicit chunk size avoids the default of one byte per write
        for content in response.iter_content(chunk_size=8192):
            f.write(content)

# ---------------------------------------------------------------------------
# Certificate verification (most sites are served over https)
# ---------------------------------------------------------------------------
# Baseline: default verification. According to the original notes this request
# fails with a certificate error, so catch it to let the later examples run.
try:
    response = requests.get('https://www.xiaohuar.com')
    print(response.status_code)
except requests.exceptions.SSLError as err:
    print(err)

# Improvement 1: no error any more, but a warning is emitted
response = requests.get('https://www.xiaohuar.com', verify=False)
# Certificate is not verified; a warning is printed and 200 is returned
print(response.status_code)

# Improvement 2: no error and no warning
urllib3.disable_warnings()  # silence the warning
response = requests.get('https://www.xiaohuar.com', verify=False)
print(response.status_code)

# Improvement 3: send a client certificate (pseudo-code: the paths below are
# placeholders and must be replaced with real files).
# Many https sites can be accessed with or without a client certificate
# (Zhihu, Baidu, ...). Where it is mandatory, e.g. a site restricted to
# specific users, the certificate must be supplied to gain access.
response = requests.get(
    'https://www.xiaohuar.com',
    cert=('/path/server.crt', '/path/key'))
print(response.status_code)

# ---------------------------------------------------------------------------
# Uploading files
# ---------------------------------------------------------------------------
# Upload a text file (the with-block closes the file handle afterwards)
with open('user.txt', 'rb') as fh:
    files1 = {'file': fh}
    response = requests.post('http://httpbin.org/post', files=files1)
print(response.status_code)  # 200
print(response.text)

# Upload an image file
with open('小狗.jpg', 'rb') as fh:
    files2 = {'jpg': fh}
    response = requests.post('http://httpbin.org/post', files=files2)
print(response.status_code)  # 200
print(response.text)

# Upload a video file
with open('love_for_GD.mp4', 'rb') as fh:
    files3 = {'movie': fh}
    response = requests.post('http://httpbin.org/post', files=files3)
print(response.status_code)  # 200
print(response.text)
• selenium模块
import time

from selenium import webdriver  # drives the browser
from selenium.webdriver import ActionChains  # drag actions, used when cracking slider captchas
from selenium.webdriver.common.by import By  # lookup strategy: By.ID, By.CSS_SELECTOR, By.CLASS_NAME
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC  # used together with WebDriverWait
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load

# ---------------------------------------------------------------------------
# Example 1: search Baidu
# ---------------------------------------------------------------------------
# Open Chrome through the chromedriver executable.
# Option 1: pass the driver path explicitly.
chrome = webdriver.Chrome(r'C:\Users\85265\Downloads\chromedriver.exe')
# Option 2 (alternative): webdriver.Chrome() with no arguments.

# The finally clause runs whether or not the try body raises
try:
    chrome.get('https://www.cnblogs.com/kermitjam')

    # Explicit wait: argument 1 is the driver, argument 2 the timeout in seconds
    wait = WebDriverWait(chrome, 10)

    # Visit Baidu
    chrome.get('https://www.baidu.com')

    # Locate the search input; the condition takes a single (by, value) tuple.
    # If the element has no id, look it up by class instead.
    input_tag = wait.until(EC.presence_of_element_located((By.ID, 'kw')))

    # Type the search term
    input_tag.send_keys('一拳超人')

    # Press Enter
    input_tag.send_keys(Keys.ENTER)

    time.sleep(3)
finally:
    # quit() closes the window and also ends the driver session
    chrome.quit()

# ---------------------------------------------------------------------------
# Example 2: search JD
# ---------------------------------------------------------------------------
# Option 1: pass the driver path explicitly.
chrome = webdriver.Chrome(r'C:\Users\85265\Downloads\chromedriver.exe')
# Option 2 (alternative): webdriver.Chrome() with no arguments.

try:
    chrome.get('https://www.cnblogs.com/kermitjam')

    # Explicit wait (waits for a specific element to load):
    # argument 1 is the driver, argument 2 the timeout in seconds
    wait = WebDriverWait(chrome, 10)

    # Visit JD
    chrome.get('https://www.jd.com')

    # Locate the search input by id
    input_tag = wait.until(EC.presence_of_element_located((By.ID, 'key')))

    # Type the search term
    input_tag.send_keys('唐诗三百首')

    # Locate the search button by its class name and click it
    search_button = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'button')))
    search_button.click()

    time.sleep(8)
finally:
    # quit() closes the window and also ends the driver session
    chrome.quit()
• 今日作业
爬取快代理(参考爬取西刺代理代码)
https://www.kuaidaili.com/free/
'''
Cracking the Geetest slider captcha

cnblogs login url:
    https://account.cnblogs.com/signin?returnUrl=https%3A%2F%2Fwww.cnblogs.com%2F

Planned logic:
    1. Enter the username and password and click "log in"
    2. The slider captcha pops up; capture the image with the gap and the complete image
    3. Compare the two images pixel by pixel to get the slide distance
    4. Simulate a human-like movement track
    5. Perform the slide

Only steps 1 and 2 are implemented below.
'''
import random
import time

from PIL import Image  # pip3 install pillow
from selenium import webdriver  # drives the browser
from selenium.webdriver import ActionChains  # drag actions, used when cracking slider captchas


def cut_image(driver):
    """Screenshot the page and crop out the captcha canvas.

    Saves the full-page screenshot as 'snap.png', crops it to the bounding
    box of the element with class 'geetest_canvas_img', opens the crop in
    the default image viewer and returns it.

    :param driver: selenium WebDriver showing the captcha.
    :return: the cropped PIL image.
    """
    # Save a screenshot of the whole page as 'snap.png'
    driver.save_screenshot('snap.png')

    # Locate the captcha canvas element
    image = driver.find_element_by_class_name('geetest_canvas_img')
    print(image.location)
    print(image.size)

    # Bounding box of the canvas: left, top, right, bottom
    left = image.location['x']
    top = image.location['y']
    right = left + image.size['width']
    bottom = top + image.size['height']
    print(left, top, right, bottom)

    # Open the full-page screenshot
    image_obj = Image.open('snap.png')

    # Crop the canvas out of it.
    # box: The crop rectangle, as a (left, upper, right, lower)-tuple.
    img = image_obj.crop((left, top, right, bottom))

    # Show the cropped image
    img.show()
    return img


def _capture_with_fullbg_display(driver, display):
    """Set the CSS display of the complete-image canvas, then crop the captcha."""
    time.sleep(2)
    # Edit the DOM: toggle visibility of the 'geetest_canvas_fullbg' canvas
    js_code = (
        'document.getElementsByClassName("geetest_canvas_fullbg")[0]'
        '.style.display = "%s";' % display
    )
    driver.execute_script(js_code)
    return cut_image(driver)


# Get the complete image
def get_image1(driver):
    """Return the captcha crop taken with the complete image shown (display "block")."""
    return _capture_with_fullbg_display(driver, 'block')


# Get the image with the gap
def get_image2(driver):
    """Return the captcha crop taken with the complete image hidden (display "none")."""
    return _capture_with_fullbg_display(driver, 'none')


def main():
    """Log in to cnblogs and capture both captcha images."""
    driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')
    try:
        driver.implicitly_wait(10)
        driver.get('https://account.cnblogs.com/signin?returnUrl=https%3A%2F%2Fwww.cnblogs.com%2F')

        # 1. Enter the username and password and click "log in"
        # NOTE(review): credentials are hard-coded in source; consider moving
        # them out of the file.
        user_input = driver.find_element_by_id('LoginName')
        user_input.send_keys('_tank_')
        time.sleep(0.2)

        pwd_input = driver.find_element_by_id('Password')
        pwd_input.send_keys('k46709394.')
        time.sleep(2)

        login_submit = driver.find_element_by_id('submitBtn')
        login_submit.click()

        # 2. Capture the complete image
        image1 = get_image1(driver)

        # 3. Capture the image with the gap
        image2 = get_image2(driver)

        # Keep the browser open for a while for inspection
        time.sleep(100)
    finally:
        # End the driver session even if a step above raised
        driver.quit()


if __name__ == '__main__':
    main()
• 今日总结
系统地认识了requests与selenium模块,对它们是什么、为什么使用以及何时使用有了大致的了解,
为深入学习打下了一定的基础,收获很多。
浙公网安备 33010602011771号