[Course Chapter Update] New Crawler Source Code for the 猫影 Project
Background
I have an introductory Flask course on 慕课 (imooc): click here to view the course. The video site the course originally used in its lessons is no longer in service, so I have found a new video source to make it easier for everyone to keep following along. A serious note: this code is for learning and demonstration purposes only. Please use it responsibly and do not put any pressure on the source website.
Sample Code
Create a new Python file named movie2.py with the code below. The script relies on the third-party requests and beautifulsoup4 packages, so install them first if you have not already (pip install requests beautifulsoup4).
# -*- coding: utf-8 -*-
from application import app, db
import requests, os, time, hashlib, json, re
from bs4 import BeautifulSoup
from common.libs.DataHelper import getCurrentTime
from urllib.parse import urlparse
from common.models.movie import Movie
import logging
from flask.logging import default_handler

'''
Example commands (run either the "list" action or the "parse" action):
python manager.py runjob -m movie2 -a list
python manager.py runjob -m movie2 -a parse
'''
class JobTask():
    def __init__(self):
        ## run the job in debug mode
        app.config['DEBUG'] = True
        logging_format = logging.Formatter(
            '%(levelname)s %(asctime)s %(filename)s:%(funcName)s L%(lineno)s %(message)s')
        default_handler.setFormatter(logging_format)
        self.source = "2345movie"
        self.url = {
            "num": 3,  # number of list pages to crawl
            "url": "https://dianying.2345.com/list/-------#d#.html",  # #d# is the page-number placeholder
            "path": "/tmp/%s/" % (self.source)  # local cache directory
        }
    '''
    Step 1: fetch the list pages, parse out each item's detail URL, then fetch the detail page for every URL.
    Step 2: parse the detail pages and write the results to the database.
    '''
    def run(self, params):
        act = params['act']
        self.date = getCurrentTime(frm="%Y%m%d")
        if act == "list":
            self.getList()
            self.parseInfo()
        elif act == "parse":
            self.parseInfo()
    '''
    Fetch the list pages and cache all raw data on disk.
    '''
    def getList(self):
        config = self.url
        path_root = config['path'] + self.date
        path_list = path_root + "/list"
        path_info = path_root + "/info"
        path_json = path_root + "/json"
        path_vid = path_root + "/vid"
        self.makeSuredirs(path_root)
        self.makeSuredirs(path_list)
        self.makeSuredirs(path_info)
        self.makeSuredirs(path_json)
        self.makeSuredirs(path_vid)
        pages = range(1, config['num'] + 1)
        for idx in pages:
            tmp_path = path_list + "/" + str(idx)
            tmp_url = config['url'].replace("#d#", str(idx))
            app.logger.info("get list : " + tmp_url)
            # skip pages that are already cached on disk
            if os.path.exists(tmp_path):
                continue
            tmp_content = self.getHttpContent(tmp_url)
            self.saveContent(tmp_path, tmp_content)
            time.sleep(0.3)  # throttle requests to be polite to the source site
        for idx in os.listdir(path_list):
            tmp_content = self.getContent(path_list + "/" + str(idx))
            items_data = self.parseList(tmp_content)
            if not items_data:
                continue
            for item in items_data:
                app.logger.info("----------------")
                app.logger.info(item)
                tmp_json_path = path_json + "/" + item['hash']
                tmp_info_path = path_info + "/" + item['hash']
                tmp_vid_path = path_vid + "/" + item['hash']
                if not os.path.exists(tmp_json_path):
                    self.saveContent(tmp_json_path, json.dumps(item, ensure_ascii=False))
                if not os.path.exists(tmp_info_path):
                    tmp_content = self.getHttpContent(item['url'])
                    self.saveContent(tmp_info_path, tmp_content)
                # vid_url is empty for this source (see parseList), so only fetch it when present
                if item['vid_url'] and not os.path.exists(tmp_vid_path):
                    tmp_content = self.getHttpContent(item['vid_url'])
                    self.saveContent(tmp_vid_path, tmp_content)
                time.sleep(0.3)
    def parseList(self, content):
        data = []
        config = self.url
        url_info = urlparse(config['url'])
        url_domain = url_info[0] + "://" + url_info[1]  # scheme + host
        tmp_soup = BeautifulSoup(str(content), "html.parser")
        tmp_list = tmp_soup.select("div#contentList ul li")
        for tmp_item in tmp_list:
            tmp_target = tmp_item.select("div.li-pic a.aPlayBtn")
            if not tmp_target:
                continue
            tmp_name = tmp_target[0]['title']
            tmp_href = tmp_target[0]['href']
            # complete protocol-relative links such as //dianying.2345.com/...
            if "https:" not in tmp_href and "//" in tmp_href:
                tmp_href = "https:%s" % (tmp_href)
            tmp_vid_url = ""  ## the download URL is no longer available here, so grab it from the detail page instead
            tmp_data = {
                "name": tmp_name,
                "url": tmp_href,
                "vid_url": tmp_vid_url,
                "hash": hashlib.md5(tmp_href.encode("utf-8")).hexdigest()
            }
            data.append(tmp_data)
        return data
    '''
    Parse the detail pages.
    '''
    def parseInfo(self):
        config = self.url
        path_root = config['path'] + self.date
        path_info = path_root + "/info"
        path_json = path_root + "/json"
        path_vid = path_root + "/vid"
        for filename in os.listdir(path_info):
            tmp_json_path = path_json + "/" + filename
            tmp_info_path = path_info + "/" + filename
            tmp_vid_path = path_vid + "/" + filename
            tmp_data = json.loads(self.getContent(tmp_json_path))
            app.logger.info(tmp_info_path)
            tmp_content = self.getContent(tmp_info_path)
            tmp_soup = BeautifulSoup(tmp_content, "html.parser")
            try:
                ## the page carries no publish date, so fall back to the crawl date
                tmp_pub_date = self.date
                tmp_desc = tmp_soup.select("div.txtIntroCon div.wholeTxt ul.newIntro li.extend .pHide")[0].getText()
                tmp_classify = tmp_soup.select("div.txtIntroCon div.wholeTxt ul.txtList li.li_3 div.emTit-l")[2].getText()
                tmp_actor = tmp_soup.select("div.txtIntroCon div.wholeTxt ul.txtList li.liActor div.emTit-l")[1].getText()
                tmp_pic_list = tmp_soup.select("div.posterPlaceholder div.pic img")
                tmp_pics = []
                for tmp_pic in tmp_pic_list:
                    tmp_pics.append("https:" + tmp_pic['src'])
                # the download link now sits on the detail page itself
                # tmp_download_content = self.getContent( tmp_vid_path )
                # tmp_vid_soup = BeautifulSoup( tmp_download_content, "html.parser" )
                tmp_download_list = tmp_soup.select("div.txtIntroCon div.series div.series-con div.series-con-i a")
                tmp_magnet_url = ""
                if tmp_download_list:
                    tmp_magnet_url = tmp_download_list[0]['href']
                tmp_data['pub_date'] = tmp_pub_date
                tmp_data['desc'] = tmp_desc.strip()
                tmp_data['classify'] = tmp_classify.strip()
                tmp_data['actor'] = tmp_actor.strip()
                tmp_data['magnet_url'] = tmp_magnet_url
                tmp_data['source'] = self.source
                tmp_data['created_time'] = tmp_data['updated_time'] = getCurrentTime()
                if tmp_pics:
                    tmp_data['cover_pic'] = tmp_pics[0]
                    tmp_data['pics'] = json.dumps(tmp_pics)
                # skip records that already exist in the database
                tmp_movie_info = Movie.query.filter_by(hash=tmp_data['hash']).first()
                if tmp_movie_info:
                    continue
                tmp_model_movie = Movie(**tmp_data)
                db.session.add(tmp_model_movie)
                db.session.commit()
            except Exception as e:
                app.logger.info(e)
                continue
        return True
    def getHttpContent(self, url):
        try:
            headers = {
                'Content-Type': 'text/html;charset=utf-8',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
                'Referer': "https://dianying.2345.com/list/",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
            }
            # a timeout keeps a stalled request from hanging the whole job
            r = requests.get(url, headers=headers, timeout=10)
            if r.status_code != 200:
                return None
            return r.text
        except Exception:
            return None
    def saveContent(self, path, content):
        if content:
            if not isinstance(content, str):
                content = content.decode("utf-8")
            # the with block flushes and closes the file automatically
            with open(path, mode="w", encoding="utf-8") as f:
                f.write(content)
    def getContent(self, path):
        if os.path.exists(path):
            # read with the same encoding saveContent writes with
            with open(path, "r", encoding="utf-8") as f:
                return f.read()
        return ''
    def makeSuredirs(self, path):
        if not os.path.exists(path):
            os.makedirs(path)
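Likewise, common.libs.DataHelper.getCurrentTime is a small helper from the course scaffolding. Judging from its two call sites above, getCurrentTime(frm="%Y%m%d") and getCurrentTime(), a compatible sketch is:

# A compatible sketch of the getCurrentTime helper in common/libs/DataHelper.py;
# the course's real helper may differ, but both call sites only need this behavior.
import datetime

def getCurrentTime(frm="%Y-%m-%d %H:%M:%S"):
    # format the current local time with the given pattern
    return datetime.datetime.now().strftime(frm)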
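The example command python manager.py runjob -m movie2 -a list assumes the manager.py built earlier in the course, which maps -m to a job module and calls that module's JobTask.run() with the parsed arguments. If you are recreating that wiring yourself, a Flask-Script style sketch could look like this (the jobs.tasks module path is an assumption for illustration, not necessarily the course's actual layout):

# A hypothetical runjob command for manager.py, written with Flask-Script.
import importlib

from flask_script import Manager
from application import app

manager = Manager(app)

@manager.option('-m', '--name', dest='name', help='job module name, e.g. movie2')
@manager.option('-a', '--act', dest='act', help='action to run: list or parse')
def runjob(name, act):
    # load jobs/tasks/<name>.py and hand the action to its JobTask
    module = importlib.import_module('jobs.tasks.' + name)
    module.JobTask().run({'act': act})

if __name__ == '__main__':
    manager.run()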
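Finally, because scraped markup changes over time, it is worth verifying the list selectors before running the whole job. The standalone snippet below exercises the same div#contentList ul li and div.li-pic a.aPlayBtn selectors that parseList uses; if it prints nothing, the site layout has probably changed and the selectors need updating.

# Standalone sanity check for the list-page selectors used in parseList.
import requests
from bs4 import BeautifulSoup

url = "https://dianying.2345.com/list/-------1.html"  # page 1 of the list
headers = {"User-Agent": "Mozilla/5.0"}
html = requests.get(url, headers=headers, timeout=10).text

soup = BeautifulSoup(html, "html.parser")
for li in soup.select("div#contentList ul li"):
    target = li.select("div.li-pic a.aPlayBtn")
    if target:
        # the title attribute carries the movie name; href is the detail page
        print(target[0].get("title"), target[0].get("href"))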