day01

import requests
"""
爬取校花网
    1、请求url
        www.xiaohuar.com/v
    2、请求方式
        get
    3、User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36
        
"""
#爬虫三部
#1.发送请求
# Browser-like User-Agent (the one listed in the notes at the top of this
# script); used whenever the caller does not supply its own headers.
_DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}


def get_page(url, headers=None, timeout=10):
    """Send a GET request to *url* and return the response object.

    Args:
        url: Address to fetch.
        headers: Optional dict of request headers. When omitted, a
            browser-like User-Agent header is sent.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        The ``requests`` response for the GET request.
    """
    if headers is None:
        headers = _DEFAULT_HEADERS
    return requests.get(url, headers=headers, timeout=timeout)
#2.解析数据
import re
def parse_index(html):
    """Collect the detail-page links found in a listing page.

    Args:
        html: Text of the listing page.

    Returns:
        A list with the captured ``href`` value of every ``imglink`` anchor
        that directly follows a ``<div class="items">`` tag, in document
        order; an empty list when nothing matches.
    """
    # The pattern starts with a literal space, so a match needs a space
    # immediately before the <div ...> tag.
    link_pattern = ' <div class="items"><a class="imglink" href="(.*?)"'
    return [found.group(1) for found in re.finditer(link_pattern, html, re.S)]

def parse_detail(html):
    """Extract the video address from a detail page.

    Args:
        html: Text of the detail page.

    Returns:
        The ``src`` value of the first ``<source src="...">`` tag, or
        ``None`` when the page contains no such tag.
    """
    first_source = re.search('<source src="(.*?)">', html, re.S)
    if first_source is None:
        return None
    return first_source.group(1)
#3.保存数据
import uuid
def save_movies(content):
    """Write *content* to a new ``.mp4`` file in the current directory.

    The file name is a freshly generated UUID4 followed by ``.mp4``. A
    completion message is printed once the bytes have been written.

    Args:
        content: Bytes to store in the file (it is opened in binary mode).
    """
    target_name = f"{uuid.uuid4()}.mp4"
    with open(target_name, mode="wb") as video_file:
        video_file.write(content)
        print("下载完成.....")

#main
if __name__ == '__main__':
    # Listing pages follow the pattern http://www.xiaohuar.com/list-3-<n>.html
    count = 0  # running total of videos downloaded across all pages
    for i in range(6):
        # Only the page index in the URL changes from one listing page to
        # the next, so the next page is reached by substituting the index.
        url = f"http://www.xiaohuar.com/list-3-{i}.html"
        response = get_page(url)
        # Pull every detail-page link out of the listing HTML.
        detail_urls = parse_index(response.text)
        for detail_url in detail_urls:
            print(detail_url)
            detail_1 = get_page(detail_url)
            movie_1 = parse_detail(detail_1.text)
            if not movie_1:
                # parse_detail found no usable video address on this page.
                continue
            movie_res = get_page(movie_1)
            count += 1
            print(f"正在爬取第{count}个视频..")
            save_movies(movie_res.content)
        # Report the ordinal of the page just finished; the message used to
        # say "first page" after every page.
        print(f"第{i + 1}页爬取完成!")

 Post自动登录

"""
Post请求自动登录账户
    1、请求URL
        https://github.com/session
    2、请求方式
        POST
    3、请求头
        cookie
    4、请求体
        commit: Sign in
        utf8: ✓
        authenticity_token: I/UYAYv4iJveTzB6FEJV2sfrGrfK79De9OX0tjAIbTfSvwO4H31du/a2GUqO6FoAQ2LAcci12a1XUUsW71qt1w==
        login: 123
        password:
        webauthn-support: supported
"""
"""
<input type="hidden" name="authenticity_token" value="(.*?)"
"""
import requests
import re
# Log in to GitHub in two steps: GET the login page to read the hidden
# authenticity_token form field and the cookies, then POST the credentials
# together with that token and those cookies to the session URL.
login_url = 'https://github.com/login'
login_header = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
# Seconds to wait for each request so an unresponsive server cannot hang
# the script.
request_timeout = 10
login_res = requests.get(url=login_url, headers=login_header, timeout=request_timeout)
token_matches = re.findall(
    '<input type="hidden" name="authenticity_token" value="(.*?)"',
    login_res.text,
    re.S
)
if not token_matches:
    # Fail with an explicit message instead of a bare IndexError when the
    # login page does not contain the expected hidden field.
    raise RuntimeError('authenticity_token not found in the login page')
authenticity_token = token_matches[0]
# Cookies set by the login page, replayed on the POST below.
login_cookies = login_res.cookies.get_dict()
session_url = "https://github.com/session"
session_header = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
# Form fields sent as the body of the login POST; "login" and "password"
# hold placeholder values here.
form_data = {
    "commit":"Sign in",
    "utf8":"✓",
    "authenticity_token":authenticity_token,
    "login":"******",
    "password":"******",
    "webauthn-support":"supported"
}
session_res = requests.post(
    url=session_url,
    headers=session_header,
    cookies=login_cookies,
    data=form_data,
    timeout=request_timeout
)
# Save the returned page so the result of the login can be inspected.
with open('github.html','w',encoding='utf-8') as f:
    f.write(session_res.text)

 

posted @ 2019-07-01 14:41  Coder_HK  阅读(136)  评论(0)    收藏  举报