Web scraping basics: requests module usage examples

1. Basics

import requests

if __name__ == "__main__":
    # 1. Specify the target URL
    url = 'https://www.sogou.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # 2. Send the request; get() returns a response object
    response = requests.get(url=url, headers=headers)
    # 3. Three ways to read the body: text (str), content (bytes), json() (object)
    # Matching persistence patterns:
    # text:    fp.write(page_text)
    # json:    json.dump(list_data, fp=fp, ensure_ascii=False)
    # content: with open(img_path, 'wb') as fp:
    #              fp.write(img_data)
    page_text = response.text
    print(page_text)
    # 4. Persist to disk
    with open('./sogou.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print('Scraping finished')
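
The three accessors pair with the three save patterns listed above. A minimal side-by-side sketch; the httpbin.org endpoints are used purely as placeholders:

import json
import requests

resp = requests.get('https://httpbin.org/get')

# text: decoded string, for HTML pages
with open('page.html', 'w', encoding='utf-8') as fp:
    fp.write(resp.text)

# json(): parsed object, for API responses
with open('data.json', 'w', encoding='utf-8') as fp:
    json.dump(resp.json(), fp=fp, ensure_ascii=False)

# content: raw bytes, for binary data such as images
img_resp = requests.get('https://httpbin.org/image/png')
with open('demo.png', 'wb') as fp:
    fp.write(img_resp.content)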

2. Renren simulated login: carrying cookies with a session

# -*- coding:utf-8 -*-
import requests
# Login endpoint:
# http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2020731013655
#
# Personal profile page
# http://www.renren.com/266059096/profile

login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2020731013655'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
data = {
    'email': 'wangshui898@sina.com',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '43deb19ec2e1ea71941ecc309e593afec110876b6181334542f6b125ffdfd675',
    'rkey': '07eeb6b6485891ad0895d30f1a23db38',
    'f': 'http%3A%2F%2Fwww.renren.com%2F266059096'
}
# Create a Session object; it keeps the cookies set by the login response
session = requests.Session()
# Log in with a POST request through the session
response = session.post(url=login_url, data=data, headers=headers)
print('Login status code: ', response.status_code)
# detail_url = 'http://www.renren.com/266059096/profile'
detail_url = 'http://www.renren.com/266059096'
detail_page= session.get(url=detail_url, headers=headers)
detail_page_text = detail_page.text
print('Profile page status code: ', detail_page.status_code)
with open('./renren_detail.html', 'w', encoding='utf-8') as fp:
    fp.write(detail_page_text)
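
Because the session object carries the login cookies automatically, the profile request needs no manual cookie handling. Without a session, cookies can also be passed explicitly. A sketch; the cookie name and value are placeholders you would copy from the browser's devtools, not real Renren cookie names:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
# Placeholder cookie copied from a logged-in browser session
cookies = {'session_cookie_name': 'session_cookie_value'}
resp = requests.get('http://www.renren.com/266059096',
                    headers=headers, cookies=cookies)
print(resp.status_code)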

3. KFC restaurant location query

# -*- coding:utf-8 -*-
import requests
import json

if __name__ == '__main__':
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'cookie': 'route-cell=ksa; ASP.NET_SessionId=2y4hig2mluq5mty4dgai3i4k; Hm_lvt_1039f1218e57655b6677f30913227148=1597132083,1597208991; Hm_lpvt_1039f1218e57655b6677f30913227148=1597208991; SERVERID=891dda8157e24744f56aa53dc4ec1dc1|1597209068|1597208989'
    }
    kw = input('Enter a location keyword: ')
    param = {
        'op': 'keyword',
        'cname': '',
        'pid': '',
        'keyword': kw,
        'pageIndex': '1',
        'pageSize': '10'
    }
    response = requests.get(url=url, params=param, headers=headers)
    dic_data = response.json()
    fileName = kw + '.json'
    with open(fileName, 'w', encoding='utf-8') as fp:
        json.dump(dic_data, fp=fp, ensure_ascii=False)
    print('Scraping finished')
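
pageSize is fixed at 10, so a keyword with more matches needs paging. A sketch that continues inside the __main__ block above, incrementing pageIndex until the result list comes back empty; the 'Table1' key is an assumption about the response shape and should be verified in the browser's devtools:

    all_stores = []
    page = 1
    while True:
        param['pageIndex'] = str(page)
        resp = requests.get(url=url, params=param, headers=headers)
        stores = resp.json().get('Table1', [])  # assumed key holding the store list
        if not stores:
            break
        all_stores.extend(stores)
        page += 1
    print(len(all_stores), 'stores collected')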

4. Scraping Qiushibaike images with regex

# -*- coding:utf-8 -*-
import requests
import re
import os

# Goal: download every image from the Qiushibaike image board (imgrank)

if __name__ == "__main__":
    # Create a directory to hold the images
    if not os.path.exists('./qiutuLibs'):
        os.mkdir('./qiutuLibs')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }

    # Generic URL template; %d is filled with the page number
    url = 'https://www.qiushibaike.com/imgrank/page/%d/'
    for pageNum in range(1, 13):
        # URL for the current page
        new_url = url % pageNum

        # General crawl: fetch the whole page for this URL
        page_text = requests.get(url=new_url, headers=headers).text
        # Focused parse: extract every image link; (.*?) captures the src attribute value
        ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
        # re.S makes '.' match newlines (DOTALL); re.M is multi-line; re.I ignores case
        img_src_list = re.findall(ex, page_text, re.S)
        # print(img_src_list)
        for img_src in img_src_list:
            img_link = 'https:' + img_src
            img_data = requests.get(url=img_link, headers=headers).content
            # Build the image name from the last path segment of the link
            img_name = img_link.split('/')[-1]
            # Final storage path
            img_path = './qiutuLibs/' + img_name
            with open(img_path, 'wb') as fp:
                fp.write(img_data)
            print(img_name, 'downloaded!')
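
None of the requests.get calls above set a timeout, so a single stalled image can hang the whole crawl. A defensive variant of the download step; a sketch, with arbitrary retry and timeout values:

import time
import requests

def fetch_bytes(link, headers, retries=3, timeout=10):
    # Try a few times before giving up on a single image
    for _ in range(retries):
        try:
            resp = requests.get(url=link, headers=headers, timeout=timeout)
            resp.raise_for_status()  # treat 4xx/5xx responses as failures
            return resp.content
        except requests.RequestException:
            time.sleep(1)
    return None

Inside the loop above, img_data = fetch_bytes(img_link, headers) plus a skip when it returns None keeps one bad link from aborting the run.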

5. Downloading Pearvideo mp4 files with regex

import requests
from lxml import etree
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

# Request the category page, then parse out each video's detail-page URL and title
url = 'https://www.pearvideo.com/category_5'
page_text = requests.get(url=url,headers=headers).text

tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
for li in li_list:
    detail_url = 'https://www.pearvideo.com/' + li.xpath('.//a[@class="vervideo-lilink actplay"]/@href')[0]
    detail_name = li.xpath('.//a/div[@class="vervideo-title"]/text()')[0] + '.mp4'
    # Request the detail page and pull the mp4 address out of its inline JavaScript
    print(detail_name, 'downloading...')
    detail_page = requests.get(url=detail_url, headers=headers).text
    ex = r'srcUrl="(.*?\.mp4)",vdoUrl=srcUrl'
    video_url = re.findall(ex, detail_page)[0]
    video_content = requests.get(url=video_url, headers=headers).content
    with open(detail_name, 'wb') as fp:
        fp.write(video_content)
    print(detail_name, 'done')
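
The videos download one at a time, and the work is I/O-bound, so a thread pool is a natural speedup. A sketch using multiprocessing.dummy (a thread-backed Pool): collect (name, url) pairs in the loop above instead of downloading inline, then map the download over the pool. It reuses headers from above; the tasks list is an assumption about how you restructure the loop:

from multiprocessing.dummy import Pool  # thread pool, despite the module name

def download(task):
    name, video_url = task
    data = requests.get(url=video_url, headers=headers).content
    with open(name, 'wb') as fp:
        fp.write(data)
    print(name, 'done')

# Assumes the loop above was changed to do: tasks.append((detail_name, video_url))
tasks = []
pool = Pool(4)  # four worker threads
pool.map(download, tasks)
pool.close()
pool.join()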
