Day 4: A Few Days of Giving Up and Struggling Back

一、Scraping Pear Video

'''
Scraping Pear Video:

Request URL:
    https://www.pearvideo.com/

Request method:
    GET

Request headers:
    user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36
'''

# import requests
# import re  # regex module
#
# # 1. Send a request to the Pear Video homepage and get the response
# response = requests.get(url='https://www.pearvideo.com/')
# print(response.status_code)
# print(response.text)
#
# # re.findall('regex pattern', 'text to parse', "flags")
# # re.S: dot-all mode (lets the match run across the whole text, newlines included)
# # . matches any single character
# # * means zero or more of the preceding token
# '''
# <a href="video_1543373"
# <a href="video_(.*?)"  # extracts 1543373
# '''
#
# # 2. Get the detail-page IDs from the homepage
# res = re.findall('<a href="video_(.*?)"', response.text, re.S)
# print(res)
#
#
# for m_id in res:
#     # Build the detail-page url
#     detail_url = 'https://www.pearvideo.com/video_' + m_id
#     print(detail_url)



import requests
import re  # regex module
# uuid.uuid4() generates a random, practically unique string (random-based, not timestamp-based)
import uuid

# The crawler's three steps

# 1. Send the request
def get_page(url):
    response = requests.get(url)
    return response

# 2. Parse the data
# Parse the homepage and extract the video detail-page IDs
def parse_index(text):
    res = re.findall('<a href="video_(.*?)"', text, re.S)
    # print(res)

    detail_url_list = []
    for m_id in res:
        # Build the detail-page url
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        # print(detail_url)
        detail_url_list.append(detail_url)

    # print(detail_url_list)

    return detail_url_list

# Parse a detail page and extract the video url
def parse_detail(text):
    '''
        (.*?): captures the content inside the parentheses
        .*?: matches without capturing
        <video webkit-playsinline="" playsinline="" x-webkit-airplay="" autoplay="autoplay" src="https://video.pearvideo.com/mp4/adshort/20190613/cont-1566073-14015522_adpkg-ad_hd.mp4" style="width: 100%; height: 100%;"></video>

    First try: <video.*?src="(.*?)"

    # The above was just the analysis process, not needed in the code:
    # the <video> src only exists after the page's JS has run, so match
    # against the raw page source instead.

    Final regex: srcUrl="(.*?)"
    '''
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    return movie_url


# 3. Save the data
def save_movie(movie_url):
    response = requests.get(movie_url)
    # Write the video to a local file
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(response.content)
        f.flush()

if __name__ == '__main__':  # in PyCharm, typing "main" + Enter expands to this

    # 1. Send a request to the homepage
    index_res = get_page(url='https://www.pearvideo.com/')

    # 2. Parse the homepage and extract the detail-page urls
    detail_url_list = parse_index(index_res.text)
    # print(detail_url_list)

    # 3. Send a request to every detail-page url
    for detail_url in detail_url_list:
        detail_res = get_page(url=detail_url)
        print(detail_res.text)

        # 4. Parse the detail page and extract the video url
        movie_url = parse_detail(detail_res.text)
        print(movie_url)

        # 5. Save the video
        save_movie(movie_url)
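
One gap worth noting in the script above: the request-header section at the top lists a user-agent, but get_page never actually sends it, and save_movie loads each whole video into memory before writing it out. A minimal hardened sketch (the header value is the one quoted above; timeout, stream, and iter_content are standard requests features):

import requests

HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

def get_page(url):
    # Send a browser user-agent and fail fast instead of hanging forever
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    return response

def save_movie(movie_url, filename):
    # stream=True downloads the video in chunks instead of reading
    # the whole file into memory at once
    with requests.get(movie_url, headers=HEADERS, stream=True, timeout=10) as r:
        r.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 64):
                f.write(chunk)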

  

二、A Higher-Performance Crawler

import requests
import re  # regex module
# uuid.uuid4() generates a random, practically unique string (random-based, not timestamp-based)
import uuid
# Import the thread-pool module
from concurrent.futures import ThreadPoolExecutor
# Cap the pool at 50 threads
pool = ThreadPoolExecutor(50)

# The crawler's three steps

# 1. Send the request
def get_page(url):
    print(f'Starting async task: {url}')
    response = requests.get(url)
    return response


# 2. Parse the data
# Parse the homepage and extract the video detail-page IDs
def parse_index(res):

    response = res.result()
    # Pull every detail-page ID out of the homepage
    id_list = re.findall('<a href="video_(.*?)"', response.text, re.S)
    # print(id_list)

    # Loop over the ID list
    for m_id in id_list:
        # Build the detail-page url
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        # print(detail_url)
        # Submit the detail-page url to get_page; its result goes to parse_detail
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)


# Parse a detail page and extract the video url
def parse_detail(res):
    response = res.result()
    movie_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0]
    # Submit the video url to get_page asynchronously; the result goes to save_movie
    pool.submit(get_page, movie_url).add_done_callback(save_movie)


# 3. Save the data
def save_movie(res):

    movie_res = res.result()

    # Write the video to a local file
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(movie_res.content)
        print(f'Download finished: {movie_res.url}')
        f.flush()


if __name__ == '__main__':  # in PyCharm, typing "main" + Enter expands to this

    # 1. Submit an async request via get_page and hand the result to parse_index
    url = 'https://www.pearvideo.com/'
    pool.submit(get_page, url).add_done_callback(parse_index)
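
One gotcha with this callback chain: res.result() re-raises any exception from the worker thread, and an exception that escapes a done-callback is only logged by the executor, so a single bad page (for example a detail page with no srcUrl= in it) can silently kill its branch of the pipeline. A hedged variant of parse_detail with error handling, as a sketch:

def parse_detail(res):
    try:
        response = res.result()  # re-raises anything get_page raised
        movie_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0]
    except Exception as e:
        # An IndexError here typically means the page had no srcUrl= match
        print(f'parse_detail failed: {e}')
        return
    pool.submit(get_page, movie_url).add_done_callback(save_movie)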

  

三、requests in Detail

# import requests
#
# response=requests.get(url='https://www.zhihu.com/explore')
# print(response.status_code)
# print(response.text)
#

import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Mobile Safari/537.36'
}

# response = requests.get(url='https://www.zhihu.com/explore', headers=headers)
# print(response.status_code)
# # print(response.text)
#
# with open('zhihu.html', 'w', encoding='utf-8') as f:
#     f.write(response.text)
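
The commented-out request at the top of this section (no headers) is there to show the failure mode: many sites, Zhihu included, reject requests that arrive with the default python-requests user-agent. A quick comparison sketch (the exact status codes are what you'd typically see, not guaranteed):

# Compare the two requests: the bare one is usually rejected (e.g. 400/403),
# while the one sending a browser user-agent comes back 200
bare = requests.get('https://www.zhihu.com/explore')
with_ua = requests.get('https://www.zhihu.com/explore', headers=headers)
print(bare.status_code, with_ua.status_code)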


# import requests
# from urllib.parse import urlencode
# url = 'https://www.baidu.com/s?wd=%E8%94%A1%E5%BE%90%E5%9D%A4'
# url = 'https://www.baidu.com/s?' + urlencode({"wd": "蔡徐坤"})
# url = 'https://www.baidu.com/s?'
# headers = {
#     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
# }
# print(url)
# Pass the query string through the params argument of get()
# response = requests.get(url, headers=headers, params={"wd": "蔡徐坤"})
# response = requests.get(url, headers=headers, params={"wd": "安徽工程大学"})
# response = requests.get(url, headers=headers, params={"wd": "安徽工程大学", "pn": "20"})
# print(response.text)
# with open('1.html', 'w', encoding='utf-8') as f:
#     f.write(response.text)
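
The commented-out url lines above all build the same request: params simply URL-encodes the dict and appends it as the query string. A quick check that the dict matches the hand-encoded url:

from urllib.parse import urlencode

# Produces exactly the encoded query string from the hard-coded url above
print(urlencode({"wd": "蔡徐坤"}))  # -> wd=%E8%94%A1%E5%BE%90%E5%9D%A4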


'''
Request URL:
    https://github.com/settings/emails

Request method:
    GET

Request headers:
    User-Agent: Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Mobile Safari/537.36
    Cookie: _ga=GA1.2.1697930951.1554622929; _octo=GH1.1.1498701842.1560392375; _device_id=22a0ddb58979d9c97ffafeb3113e2567; user_session=oS2RUtplUtxp6CDkuOog1uxdGF0QbrSfYOv6R5Zz4CKLvUJ3; __Host-user_session_same_site=oS2RUtplUtxp6CDkuOog1uxdGF0QbrSfYOv6R5Zz4CKLvUJ3; logged_in=no; has_recent_activity=1; _gat=1; tz=Asia%2FShanghai; _gh_sess=RjNKQzJhRUpoM3lSU2RtUmx2am5GM04rTTFYZmx4dDdTeXdzOWxua0lOeEQ2Ky9TVHlXU2FCd1c0UDVPUVIrNXpQUzBVcndkcW52bEh5aU9JazhmWk1WVXRyWW1NT2crUHc1QmkycHFZWEtrVitOVHFMcDg5cDhNUXlNcjNoQ2djUzhXYklaL00xSHN6RWV5cFY4THdnOVhySzhwOVVlTTFON0VaNENuNGM0K3ZBL3JnWUw4eWc1U1JzWFJGYlJzU3ROdCtaa3JWbmU5T2Foc0hRZ1lRRDdmc09SejlUVmw5emRFQUh5YWVYOElBN0NRaTY5NlUwVXJyMS9VZFF3US9lUHlpTUVWMlN2RXRSSUtDQk8zZEcwWXY1ODZ1THBTSVp0ZFUxZFh4K3JkKzlsdkVDN1hDenZkbU94V0FjSzNwUlcvRkJiSTNQaU15bW1QbnFPUzRESHd3RHp5YmhZNmFrVzUxZjFnbDErVHBmVDAvUnZwTC84STZyY2NsM2x4N3M4VkwzSGlyZ05YVnc2LzNGMHIrVmgxbVBxQk4yYmNQcWJNdlh3dkFBTXNBWGdpVjdaKzBsNVlqaEtRNUlBWlpnNU9iLzk0bWtxSkFsUDRhOUhvV3hCNjNSUnRqMkVleGx2MDU2SFlzb0E9LS1sR0RWRGhhb1IrTDdTYy94ME4ra3N3PT0%3D--1a8f3aec4d9124a6f918f964155d1ff1b75669f7
'''
'''
# Bypassing the GitHub login with a plain GET request:
1. Log in to GitHub in a browser and copy the page's cookies.
2. Send a GET request to the settings/emails page carrying those cookies,
   then check whether the account's phone number shows up in the response.
'''
import requests

# Request url
url = 'https://github.com/settings/emails'

# Request headers
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
# github_res = requests.get(url, headers=headers)
# print(github_res.text)

# Cookies captured from the browser after logging in
COOKIES = {
    'Cookie': '_ga=GA1.2.1697930951.1554622929; _octo=GH1.1.1498701842.1560392375; _device_id=22a0ddb58979d9c97ffafeb3113e2567; has_recent_activity=1; tz=Asia%2FShanghai; user_session=5BmoPtph6x1VDGmIOxwQZeRBgrb36Na9uVBegOtMaL7XkpQ-; __Host-user_session_same_site=5BmoPtph6x1VDGmIOxwQZeRBgrb36Na9uVBegOtMaL7XkpQ-; logged_in=yes; dotcom_user=FengYeZhuiMeng; _gh_sess=NFk0ZE04djhIcUV3Ris3bnNSa0JFUkFoeG1HNmswVzJ1SE5kTEp4dURPcXlidGF4eTRCMjVJZU1zODBURGplZUx4L1VHZnhqRGZDZUR1YmR5ek9YVmtlU1dhcE9yVWRlaEVtbHlramxsWkRmRGRMaGxmN1lNcDJ1R3kwdWk2czk3SFo5WWJrMlFIcGpPNHRpdGUzVGRtelNCSm5RNDl0UlREOFlPUlJraU8yVFJ6L09FdWE4dmcxOVpMMTNJVzAvR2dWOTNmclkzbVkyVkhBVENiR2h6N3hxV1NXcGcrTWZiMzBGU1l1cnhMcGw2ZGpUeFpDNmNGVHRWV05CU0N1SEtTVXVxZTZMSTFJVUhXdklqbzlPazI2R0QxQjZvOXhRdm9hcWcrbTJUSDVWamxHUGVISlNSRVBzcHZzR2tOQkhsSm13VzA2S1hsSGVubWRPNmFQVWFWSGs3NDhEWG5NSDc3eWpGYzFuQVdJPS0tdW9zaldudkh4UjdMeGU4Wkd4R3R5QT09--e4ab892c0c63bd7a811e0a932ad02a576986f4c8'
}
github_res = requests.get(url, headers=headers, cookies=COOKIES)


print('1321053678' in github_res.text)
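
A note on cookies=: requests expects one dict entry per cookie (name mapped to value), so stuffing the entire raw header string under a single 'Cookie' key is a hack. A hedged sketch of the cleaner split, using a shortened stand-in for the long cookie string above:

# Split the raw cookie string copied from the browser into real name/value pairs
raw_cookie = '_ga=GA1.2.1697930951.1554622929; logged_in=yes; dotcom_user=FengYeZhuiMeng'  # shortened stand-in
cookie_dict = dict(pair.split('=', 1) for pair in raw_cookie.split('; '))
github_res = requests.get(url, headers=headers, cookies=cookie_dict)
print('1321053678' in github_res.text)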

  

四、Scraping Douban Top 250 Movie Info

'''
Homepage:
    https://movie.douban.com/top250
    GET
    User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

Regex:
    # detail-page url, poster url, title, rating, number of raters
    <div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价
'''
import requests
import re
url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
# 1. Request the Douban Top 250 page and get the response
response = requests.get(url, headers=headers)

# print(response.text)

# 2. Extract the data with the regex
# detail-page url, poster url, title, rating, number of raters
movie_content_list = re.findall(
    # regex pattern
    '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',

    # text to parse
    response.text,

    # flags
    re.S)

for movie_content in movie_content_list:
    # Unpack one movie per tuple
    detail_url, movie_jpg, name, point, num = movie_content
    data = f'Title: {name},   detail url: {detail_url}, poster url: {movie_jpg}, rating: {point}, raters: {num}\n'
    print(data)

    # 3. Save the data: append each movie's info to a file
    with open('douban.txt', 'a', encoding='utf-8') as f:
        f.write(data)
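
The script above only covers the first page, i.e. 25 movies. The Top 250 list is paginated via a start query parameter that advances in steps of 25; a sketch of the full crawl, assuming the pagination still works that way:

# Fetch all 10 pages; each page lists 25 movies, offset via ?start=N
for start in range(0, 250, 25):
    page = requests.get(url, headers=headers, params={'start': start})
    for detail_url, movie_jpg, name, point, num in re.findall(
            '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',
            page.text, re.S):
        print(name, point, num)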

  

Summary:

'''
    Honestly, I like crawlers; I find them interesting. But the course is moving too fast for me to keep up. Our undergraduate Python course didn't cover much: we only got as far as functions, plus a brief mention of what comes after. Our Python teacher admitted he had never studied this material either, hehe, so he learned it along with us. As the lectures go on, I understand less and less of why a given approach is right; I only know that it is. When I practice on my own in the library at night, I keep hitting errors, and sometimes I can't get the expected result at all, which has drained my motivation. But after my nap today I figured it out: I can't give up. I plan to follow along with Tank for the rest of the course and absorb what I can. I've also found some Python resources to go through after finals. 105 GB of them. Yikes.
    A little pep talk to myself every day: keep going, keep going, Fighting!!!

'''

  

 
