# Python爬取酷狗音乐Top500首歌曲并下载到本地

# @Author:林云
# @Time:2022/11/20 18:05
# @File:KuGouYinyue.py
# @Project:PycharmProjects
import json
import os
from time import sleep

import requests
from lxml import etree
# Request headers that make the scraper look like a desktop Edge browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.52'
    ),
}
# Download every song of the KuGou Top 500 chart into ./musicTop500.
# The chart lists 500 songs at 22 per page, so ceil(500 / 22) = 23 pages are fetched.
page_size = 22
save_dir = './musicTop500'
# Create the output directory once, instead of re-checking it for every song.
os.makedirs(save_dir, exist_ok=True)
for n in range(23):
    # Build the URL of the chart page (pages are numbered from 1).
    home_url = f'https://www.kugou.com/yy/rank/home/{n + 1}-8888.html?from=homepage'
    # Fetch the page markup.
    home_res = requests.get(url=home_url, headers=headers, timeout=30).text
    page = etree.HTML(home_res)
    # Collect the text of the inline scripts; one of them is expected to embed
    # the song array carrying each track's Hash and album_id.
    scripts = page.xpath('//script[@type="text/javascript"]/text()')
    for script in scripts:
        # Drop surrounding whitespace / newlines.
        script = script.strip()
        # Locate the embedded array: from the first '[{' up to the last ';'.
        # Scripts without such a span carry no song data and are skipped
        # rather than aborting the whole run with a ValueError.
        start = script.find('[{')
        end = script.rfind(';')
        if start == -1 or end <= start:
            continue
        # Same decoding as the original script: '-' is rewritten to the escape
        # '\u002d', then all \uXXXX escapes are expanded before JSON parsing.
        # NOTE(review): json.loads decodes \uXXXX escapes on its own, so this
        # round trip may be unnecessary - confirm against a live page.
        raw = script[start:end].replace('-', '\\u002d').encode('utf-8')
        songs = json.loads(raw.decode('unicode_escape'))
        for idx, song in enumerate(songs, start=1):
            # Chart position. It is derived from the fixed page size, because
            # the last page holds fewer than 22 songs and len(songs) * n would
            # therefore produce numbers that collide with earlier pages.
            rank = page_size * n + idx
            m_h = song.get('Hash')
            m_id = song.get('album_id')
            name = song.get('FileName')
            # Query the play-data endpoint for this track.
            music_url = f'https://wwwapi.kugou.com/yy/index.php?r=play/getdata&hash={m_h}' \
                        f'&mid=e496ef938c1254f6efccb2e7cbccd1fb&album_id={m_id}'
            # headers must be passed by keyword: the second positional
            # parameter of requests.get is `params`, not `headers`.
            music_res = json.loads(requests.get(music_url, headers=headers, timeout=30).text)
            data = music_res.get('data')
            # The response may lack a usable 'data' object or carry an empty
            # play_url; treat both as "no downloadable source".
            play_url = data.get('play_url') if isinstance(data, dict) else None
            if not play_url:
                print(f'{name}没有可用的播放地址,已跳过')
            else:
                ms_res = requests.get(play_url, headers=headers, timeout=30)
                if not ms_res.ok:
                    print(f'{name}下载失败(HTTP {ms_res.status_code}),已跳过')
                else:
                    # Replace characters that are not allowed in file names so
                    # a title containing e.g. '/' cannot break the open() call.
                    safe_name = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in str(name))
                    # Zero-pad the rank to three digits so files sort in chart order.
                    with open(f'{save_dir}/{rank:03d}-{safe_name}.mp3', 'wb') as f:
                        f.write(ms_res.content)
                    print(f'{name}下载完成!!!')
            # Pause between songs (after the file is closed); increase the
            # delay if the site starts blocking requests.
            sleep(2)
# posted @ 2022-11-21 12:01  YangSaid  阅读(754)  评论(4)    收藏  举报