day01
"""
Crawl videos from xiaohuar.com.

1. Request URL
    www.xiaohuar.com/v
2. Request method
    GET
3. Request header
    User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64)
    AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36

The crawler follows three steps: send the request, parse the response,
save the data.
"""
import re
import uuid

# The User-Agent listed in the notes above; sent with every request so the
# site sees a browser-like client instead of the default python-requests agent.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    ),
}

# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would block the crawler forever.
DEFAULT_TIMEOUT = 10

# Whitespace between the <div> and the <a> is optional, so the pattern matches
# both compact and pretty-printed markup.
_DETAIL_LINK_RE = re.compile(
    r'<div class="items">\s*<a class="imglink" href="(.*?)"', re.S
)
_VIDEO_SRC_RE = re.compile(r'<source src="(.*?)">', re.S)


# 1. Send the request
def get_page(url, headers=None, timeout=DEFAULT_TIMEOUT):
    """Fetch *url* with a GET request and return the response object.

    Args:
        url: Address to request.
        headers: Optional request headers; defaults to ``DEFAULT_HEADERS``.
        timeout: Seconds to wait for the server before raising.

    Returns:
        The ``requests.Response`` for the request.
    """
    # Imported here so the parsing helpers below stay importable (and
    # testable) on machines where the third-party package is not installed.
    import requests

    if headers is None:
        headers = DEFAULT_HEADERS
    return requests.get(url, headers=headers, timeout=timeout)


# 2. Parse the data
def parse_index(html):
    """Return the list of detail-page URLs found in an index page.

    Args:
        html: Text of a list page.

    Returns:
        A list of the ``href`` values of every ``imglink`` anchor that directly
        follows an ``items`` div; empty when nothing matches.
    """
    return _DETAIL_LINK_RE.findall(html)


def parse_detail(html):
    """Return the first video URL found in a detail page.

    Args:
        html: Text of a detail page.

    Returns:
        The ``src`` of the first ``<source>`` tag, or ``None`` when the page
        contains no such tag.
    """
    match = _VIDEO_SRC_RE.search(html)
    if match is None:
        return None
    return match.group(1)


# 3. Save the data
def save_movies(content):
    """Write *content* (raw bytes) to a new ``<uuid4>.mp4`` file in the cwd."""
    with open(f"{uuid.uuid4()}.mp4", "wb") as f:
        f.write(content)
        print("下载完成.....")


def main(pages=range(6)):
    """Crawl the given list pages and download every video they link to.

    Args:
        pages: Iterable of page numbers substituted into the list-page URL
            (e.g. http://www.xiaohuar.com/list-3-1.html).
    """
    count = 0  # number of videos downloaded so far
    for i in pages:
        # The page number is the only part of the URL that changes between
        # list pages, so the next page is reached by substituting it.
        url = f"http://www.xiaohuar.com/list-3-{i}.html"
        response = get_page(url)

        detail_urls = parse_index(response.text)
        for detail_url in detail_urls:
            print(detail_url)
            detail_1 = get_page(detail_url)
            movie_1 = parse_detail(detail_1.text)
            if not movie_1:
                # Detail page without a <source> tag: nothing to download.
                continue
            movie_res = get_page(movie_1)
            count += 1
            print(f"正在爬取第{count}个视频..")
            save_movies(movie_res.content)
    # NOTE(review): the original indentation of this statement was lost; it is
    # emitted once after all pages have been processed — confirm intent.
    print("第一页爬取完成!")


if __name__ == '__main__':
    main()
Post自动登录
"""
Log in to a GitHub account automatically with a POST request.

1. Request URL
    https://github.com/session
2. Request method
    POST
3. Request headers
    cookie
4. Request body (form data)
    commit: Sign in
    utf8: ✓
    authenticity_token: I/UYAYv4iJveTzB6FEJV2sfrGrfK79De9OX0tjAIbTfSvwO4H31du/a2GUqO6FoAQ2LAcci12a1XUUsW71qt1w==
    login: 123
    password:
    webauthn-support: supported

The authenticity_token is scraped from the login page with the pattern:
    <input type="hidden" name="authenticity_token" value="(.*?)"
"""
import requests
import re

# Seconds to wait for GitHub before giving up; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 10

# Step 1: GET the login page to obtain the authenticity_token and the cookies
# that must accompany the login POST.
login_url = 'https://github.com/login'
login_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
login_res = requests.get(
    url=login_url, headers=login_header, timeout=REQUEST_TIMEOUT
)

token_matches = re.findall(
    '<input type="hidden" name="authenticity_token" value="(.*?)"',
    login_res.text,
    re.S,
)
if not token_matches:
    # Fail with a descriptive message instead of a bare IndexError when the
    # login page does not contain the expected hidden input.
    raise RuntimeError(
        "authenticity_token not found in the response from " + login_url
    )
authenticity_token = token_matches[0]

login_cookies = login_res.cookies.get_dict()

# Step 2: POST the login form to the session endpoint, sending back the
# cookies and the token obtained in step 1.
session_url = "https://github.com/session"
# Same browser-like headers as the first request.
session_header = login_header
form_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    # Placeholders: fill in real credentials locally and never commit them.
    "login": "******",
    "password": "******",
    "webauthn-support": "supported",
}
session_res = requests.post(
    url=session_url,
    headers=session_header,
    cookies=login_cookies,
    data=form_data,
    timeout=REQUEST_TIMEOUT,
)

# Save the returned page so the login result can be inspected in a browser.
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(session_res.text)