python 爬取百度网盘分享动态

这是我之前写的一份爬虫。在百度网盘改版之前,有很多资源达人会在自己的百度网盘动态里分享资源,于是我关注了一批分享影视资源的账号,让程序定时去爬取他们的动态,将他们分享出来的百度网盘链接收入自己的数据库。写入数据库之前,会先查询资源是否重复,并进行不良关键词过滤。然后在另一端的网页或APP上,将数据库里的资源展示出来。早期市面上的网盘资源搜索就是运用了这个原理。后来百度网盘改版,取消了动态分享,程序目前已经无法正常运行,本文只做个思路记录。

 

程序主入口。爬取百度网盘动态分享的功能都写在这个文件里,它还负责调用其他文件中的函数,运行这个脚本即可不间断地执行。

#   主程序
import requests,re, json, time
import random
from mysql_db import *
import threading
from aidy_pc import *
from yszx import *
from defs import *

# Request headers sent with the pan.baidu.com API calls made by this script.
# NOTE(review): "Cookie" is empty here; presumably a logged-in session cookie
# has to be filled in before the endpoints answer -- confirm.
header = {
    "Cookie": "",
    "Host": "pan.baidu.com",
    "Referer": "https://pan.baidu.com/pcloud/friendpage?type=follow&uk=2489863899&self=1",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}

# uk ids of the accounts to crawl, seeded with a single uk; further ids are
# appended at runtime.
list_uk = [
    '2489863899',
]
def getShareUser():
    """Collect the uk ids of the accounts followed by uk 2489863899.

    Pages through the ``getfollowlist`` endpoint 24 entries at a time (at most
    100 requests) and appends every ``follow_uk`` to the module-level
    ``list_uk``. Ids that are already in ``list_uk`` are skipped, so calling
    this once per crawl cycle does not pile up duplicates.

    Returns:
        The module-level ``list_uk`` list.
    """
    known = set(list_uk)
    max_consecutive_failures = 5
    failures = 0
    start = 0
    for _ in range(100):
        url = 'https://pan.baidu.com/pcloud/friend/getfollowlist?query_uk=2489863899&limit=24&start=%d' % start
        try:
            follows_json = requests.get(url, headers=header, timeout=30).json()
            # Extract inside the try so a malformed entry cannot leave the
            # page half-processed.
            page_uks = [entry['follow_uk'] for entry in follows_json['follow_list']]
        except (requests.RequestException, ValueError, KeyError, TypeError) as err:
            failures += 1
            print("getShareUser: request failed at start=%d: %r" % (start, err))
            if failures >= max_consecutive_failures:
                break
            # Back off before retrying the same page instead of hammering it.
            time.sleep(random.randint(10, 25))
            continue

        failures = 0
        if not page_uks:  # an empty page means the follow list is exhausted
            break
        for uk in page_uks:
            if uk not in known:
                known.add(uk)
                list_uk.append(uk)
        start = start + 24
        time.sleep(random.randint(10, 25))

    return list_uk

# Crawl entry point
def gethtml():
    """Crawl the share feed of every account in ``list_uk``.

    Calls ``getShareUser()`` to fill ``list_uk``, then requests up to two
    pages (25 records each) of the ``getdynamiclist`` feed per account.
    Records with ``category == 3`` (images, per the original author's notes:
    folder 6, video 1, image 3) and records whose title is rejected by
    ``gjc_gl`` are skipped; every remaining record is printed.

    The database write (``mysql_into``) is disabled in this version, exactly
    as in the original script.
    """
    tu = getShareUser()  # fills list_uk with the followed account ids
    if tu is False:  # the follow list could not be obtained
        return

    for uk in list_uk:
        start = 0  # every account is paged from its own first record
        for _ in range(2):  # two pages per account
            url = "https://pan.baidu.com/pcloud/feed/getdynamiclist?auth_type=1&filter_types=11000&query_uk=%s&category=0&limit=25&start=%s&bdstoken=29b0093f2c23b7afd5f41c39f57be34e&channel=chunlei&clienttype=0&web=1" % (
                uk, start)
            try:
                filelist_json = requests.get(url, headers=header, timeout=30).json()
            except (requests.RequestException, ValueError) as err:
                # One unreachable account must not abort the whole crawl.
                print("gethtml: request failed for uk=%s: %r" % (uk, err))
                time.sleep(random.randint(10, 25))
                break
            if filelist_json.get('errno') != 0:
                break

            # Fields used downstream: category, shorturl, title, feed_time.
            for data_vaule in filelist_json.get('records') or []:
                if data_vaule['category'] == 3:  # skip images
                    continue
                if not gjc_gl(data_vaule['title']):  # keyword filter
                    continue
                print(data_vaule)
                # mysql_into(data_vaule)  # database write, currently disabled

            start = start + 25
            time.sleep(random.randint(10, 25))


if __name__ == '__main__':
    checker = None  # thread running the dead-link check (bdsl)
    while True:
        try:
            gethtml()  # crawl the share feeds
            # Start the dead-link check unless the previous run is still busy.
            if checker is None or not checker.is_alive():
                checker = threading.Thread(target=bdsl)
                checker.start()
            # Optional extra crawlers, disabled in the original script:
            # threading.Thread(target=aidy).start()
            # threading.Thread(target=main_ys).start()
        except (Exception, SystemExit) as err:
            # pysql() reports a failed DB connection by exiting; treat that
            # like any other failed cycle so the crawler keeps running.
            # KeyboardInterrupt is deliberately not caught, so Ctrl-C works.
            print("crawl cycle failed: %r" % (err,))

        # Sleep after every cycle, failed or not, so an error cannot turn
        # this into a tight retry loop. One cycle every 3 hours (8 per day).
        time.sleep(10800)
View Code

 

数据写入数据库的函数和百度网盘失效链接检测删除函数。将爬取到的数据传入写入函数即可写入数据库。链接失效很正常,所以还有一个链接失效检测函数,这个函数会对整个数据库的链接逐一进行检测,如果链接已失效就将其删除。

# 对数据库进行连接与数据入库

import pymysql,time
import requests,re
import random
def pysql():
    """Open and return a new PyMySQL connection to the local ``bdwp`` database.

    The connection parameters are passed by keyword because PyMySQL 1.0 and
    later reject positional arguments to ``connect``.

    Returns:
        An open ``pymysql`` connection; the caller is responsible for
        closing it.

    Raises:
        SystemExit: if the connection cannot be established (after printing
            a failure message).
    """
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration or environment variables.
    # Local development alternative used by the author: user 'root',
    # password 'root'.
    try:
        return pymysql.connect(
            host='127.0.0.1',
            user='bdwp',
            password='xDnwLnjSEXLbGJYa',
            database='bdwp',
            charset="utf8",
        )
    except pymysql.MySQLError as err:
        print("数据库连接失败!")
        raise SystemExit(1) from err


def mysql_into(data_vaule):
    """Insert one crawled share record into ``data_zy`` unless its title exists.

    Args:
        data_vaule: dict with the keys ``title``, ``category``, ``shorturl``
            and ``feed_time``. ``feed_time`` is divided by 1000 before being
            formatted, i.e. it is treated as a millisecond timestamp.

    Returns:
        ``False`` when a row with the same title already exists, otherwise
        ``None`` (whether the insert succeeded or failed).
    """
    mysql = pysql()
    try:
        db = mysql.cursor()

        # Titles come from a remote site, so every value is bound as a query
        # parameter rather than formatted into the SQL text.
        db.execute("select title from data_zy WHERE title=%s", (data_vaule['title'],))
        if db.fetchall():
            return False  # already stored

        db.execute("select max(id) from data_zy")
        last_id = db.fetchall()[0][0]
        ids = int(last_id or 0) + 1  # max(id) is NULL while the table is empty

        time_time = time.strftime("%Y-%m-%d %H:%M", time.localtime())  # time of insertion
        share_seconds = float(data_vaule['feed_time'] / 1000)
        otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(share_seconds))

        try:
            db.execute(
                "insert into data_zy (id,category,shorturl,title,feed_time,rk_time) "
                "VALUES (%s,%s,%s,%s,%s,%s)",
                (ids, data_vaule['category'], data_vaule['shorturl'],
                 data_vaule['title'], otherStyleTime, time_time),
            )
            mysql.commit()
        except pymysql.MySQLError as err:
            # Best effort: a failed insert must not stop the crawl, but it
            # should be visible.
            mysql.rollback()
            print("mysql_into: insert failed for %r: %r" % (data_vaule['title'], err))
        return None
    finally:
        # Close on every path, including the early duplicate return.
        mysql.close()


# Dead-link check for stored Baidu share links
def bdsl():
    """Delete rows of ``data_zy`` whose share page shows the "blocked" notice.

    Every ``(id, shorturl)`` row is fetched, the page
    ``https://pan.baidu.com/s/<shorturl>`` is requested, and the row is
    deleted (and the deletion committed) when the page contains the notice
    text held in ``srt``. A pause of 10-25 seconds follows every request.
    """
    header = {
        "Host": "pan.baidu.com",
        "Referer": "https://pan.baidu.com/pcloud/friendpage?type=follow&uk=2489863899&self=1",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    srt = "此链接分享内容可能因为涉及侵权、色情、反动、低俗等信息,无法访问!"

    mysql = pysql()
    try:
        db = mysql.cursor()
        db.execute("select id,shorturl from data_zy")
        rows = db.fetchall()

        for row_id, shorturl in rows:
            if not shorturl:
                continue
            url = "https://pan.baidu.com/s/" + shorturl
            try:
                resp = requests.get(url, headers=header, timeout=30)
            except requests.RequestException as err:
                # One unreachable link must not end the whole scan.
                print("bdsl: request failed for id=%s: %r" % (row_id, err))
                time.sleep(random.randint(10, 25))
                continue

            # Decode the raw bytes as UTF-8 directly; re-encoding ``.text``
            # through ISO-8859-1 fails when it was already decoded as UTF-8.
            html = resp.content.decode('utf-8', errors='replace')

            if srt in html:
                db.execute("DELETE FROM data_zy WHERE id = %s", (row_id,))
                mysql.commit()  # without a commit the DELETE is discarded

            # Throttle every request, not only the ones that hit a dead link.
            time.sleep(random.randint(10, 25))
    finally:
        mysql.close()
View Code

 

这里还有个小函数,如果我们爬取的资源标题包含敏感词则不写入数据库,主要过滤广告

from mysql_db import pysql

def gjc_gl(title):
    """Return ``False`` when *title* contains a blocked keyword, else ``True``.

    The keywords are read from the second column of the row with ``id=1`` in
    table ``gjc_gl``, stored as one comma-separated string. When that row is
    missing or holds no keywords, nothing is filtered and ``True`` is
    returned.
    """
    mysql = pysql()
    try:
        db = mysql.cursor()
        db.execute("select * from gjc_gl WHERE id=1")
        rows = db.fetchall()
    finally:
        # This runs once per crawled title, so the connection must not leak.
        mysql.close()

    if not rows or not rows[0][1]:
        return True

    # Drop empty entries (e.g. from a trailing comma): an empty string is
    # "in" every title and would reject everything.
    keywords = [kw for kw in rows[0][1].split(',') if kw.strip()]
    return not any(kw in title for kw in keywords)




























# import os
# import binascii
# cats = {
#     u'video': u'视频',
#     u'image': u'图片',
#     u'document': u'书籍',
#     u'music': u'音乐',
#     u'package': u'压缩',
#     u'software': u'软件',
# }
#
# def get_label(name):
#     if name in cats:
#         return cats[name]
#     return u'其它'
#
# #   函数用途,根据传入的文件名后缀而判断文件类型
# def get_category(ext):
#     ext = ext + '.'
#     cats = {
#         u'video': '.avi.mp4.rmvb.m2ts.wmv.mkv.flv.qmv.rm.mov.vob.asf.3gp.mpg.mpeg.m4v.f4v.',
#         u'image': '.jpg.bmp.jpeg.png.gif.tiff.',
#         u'document': '.pdf.isz.chm.txt.epub.bc!.doc.docx.xlsx.xls.pptx.ppt.',
#         u'music': '.mp3.wma.ape.wav.dts.mdf.flac.',
#         u'package': '.zip.rar.7z.tar.gz.iso.dmg.pkg.',
#         u'software': '.exe.app.msi.apk.',
#         u'torrent': '.torrent.'
#     }
#     for k, v in cats.items():
#         if ext in v:
#             return get_label(k)     # 调用
#     return '其他'
View Code

 

这里写了一个拓展函数,用来爬取其他网站。仅靠动态分享获取到的资源或许不够,所以这里可以多渠道爬取其他网站,从而建立一个更加全面的百度网盘资源搜索。

import requests,re,time
import random
import pymysql
from mysql_db import pysql


def _aidy_insert(title, shorturl, published, password):
    """Insert one scraped share into ``data_zy`` unless the title already exists."""
    mysql = pysql()
    try:
        db = mysql.cursor()
        # Scraped text is bound as parameters, never formatted into the SQL.
        db.execute("select title from data_zy WHERE title=%s", (title,))
        if db.fetchall():
            return  # duplicate title

        db.execute("select max(id) from data_zy")
        last_id = db.fetchall()[0][0]
        ids = int(last_id or 0) + 1  # max(id) is NULL while the table is empty
        time_time = time.strftime("%Y-%m-%d %H:%M", time.localtime())  # time of insertion

        try:
            db.execute(
                "insert into data_zy (id,category,shorturl,title,feed_time,rk_time,wpmm) "
                "VALUES (%s,'6',%s,%s,%s,%s,%s)",
                (ids, shorturl, title, published, time_time, password),
            )
            mysql.commit()
        except pymysql.MySQLError as err:
            mysql.rollback()
            print("aidy: insert failed for %r: %r" % (title, err))
    finally:
        mysql.close()


def aidy():
    """Scrape Baidu share links from 520.58801hn.com into ``data_zy``.

    For sections 11-23 and pages 1-5 of each, the listing page is fetched,
    every linked article is opened, and when the article contains a
    ``pan.baidu.com/s/...`` link, its short url, title, publish time and
    extraction code are stored through ``_aidy_insert`` (category fixed to
    ``'6'``). A failure while processing a page is reported, followed by a
    60-second pause, and the crawl moves on to the next page.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Mobile Safari/537.36"}

    # NOTE(review): the original carried a bare "# 1000" remark on this range;
    # its meaning is unclear.
    for i in range(11, 24):
        for r in range(1, 6):
            try:
                url = "http://520.58801hn.com/%d/page/%d" % (i, r)
                html = requests.get(url, headers=header, timeout=30).text
                re_url = re.findall('<div class="entry-meta">.*?<a href="(.*?)" rel="bookmark">', html, re.S)
                times = re.findall('<div class="entry-meta">.*?itemprop="datePublished">(.*?)</time></a>', html, re.S)

                # Pair each article with the publish time at the same position
                # on the listing page, even when earlier articles are skipped.
                for idx, for_url in enumerate(re_url):
                    html_wp = requests.get(for_url, headers=header, timeout=30).text
                    re_wp = re.findall('<p>.*?href="https://pan.baidu.com/s/(.*?)">百度云盘</a>.*?:(.*?)</p>', html_wp,
                                       re.S)
                    h1 = re.findall('<h1 class="entry-title" itemprop="name headline">(.*?)</h1>', html_wp, re.S)

                    if re_wp and h1 and idx < len(times):
                        _aidy_insert(h1[0], re_wp[0][0], times[idx], re_wp[0][1])
                    time.sleep(random.randint(2, 10))

                time.sleep(random.randint(2, 10))

            except (Exception, SystemExit) as err:
                # pysql() signals a failed DB connection with SystemExit; treat
                # it like any other per-page failure and back off.
                # KeyboardInterrupt is not caught, so Ctrl-C still works.
                print("aidy: page %d/%d failed: %r" % (i, r, err))
                time.sleep(60)
                continue




if __name__ == '__main__':
    while True:
        try:
            aidy()
        except (Exception, SystemExit) as err:
            # Keep the crawler alive after a failed cycle; KeyboardInterrupt
            # is deliberately not caught, so Ctrl-C still stops the script.
            print("aidy cycle failed: %r" % (err,))

        # Sleep after every cycle, failed or not, so an error cannot turn
        # this into a tight retry loop. One cycle every 3 hours (8 per day).
        time.sleep(10800)
View Code

 

数据库的设计比较简单,只做了两个表,可以看看写入数据库函数那部分。

posted @ 2020-05-16 17:41  大铭分享  阅读(1309)  评论(0)  编辑  收藏  举报