爬取某电影网站 新
主类
import re
import time

import requests
from scrapy import Selector

from Film_website.models import Film

# Base URL of the site; result links found on a page are appended to it.
url = 'https://www.88ys.cc'

# The pager text contains "<current>/<total>". Digits are required on both
# sides so a stray "/" can never yield an empty group (and an int('') error).
_PAGE_INFO = re.compile(r'(\d+)/(\d+)')

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the crawl forever.
_REQUEST_TIMEOUT = 10


def search(name, num=1):
    """Crawl the search results for *name*, starting at page *num*, into ``Film``.

    Pages are fetched one after another until a page contains no result
    entries or reports a current page number greater than the total page
    count. Every entry whose link is not stored yet is inserted as a new
    ``Film`` row; entries that are already stored are only reported.

    The pages are walked in a loop instead of recursing once per page: the
    notes in the ``__main__`` block below mention about 2,940 pages, which
    is more than CPython's default recursion limit allows.

    Args:
        name: Search keyword placed into the search URL.
        num: 1-based number of the first page to fetch.

    Returns:
        None.

    Raises:
        ValueError: If a page has result entries but no parsable
            "current/total" pager text.
    """
    while True:
        r = requests.get(
            (url + '/vod-search-pg-{}-wd-{}.html').format(num, name),
            timeout=_REQUEST_TIMEOUT,
        )
        sel = Selector(text=r.text)
        results = sel.xpath(
            '//div[@class="index-area clearfix"]/ul//li[@class="p1 m1"]'
        ).extract()
        if not results:
            break

        pages = sel.xpath('//div[@class="page mb clearfix"]/text()').extract_first()
        match = _PAGE_INFO.search(pages or '')
        if match is None:
            raise ValueError(
                'no "current/total" pager text found on result page {}'.format(num)
            )
        page = int(match.group(1))
        pagecount = int(match.group(2))
        if page > pagecount:
            break

        for result in results:
            _store_result(Selector(text=result))

        print('-----------当前页数:{}/{}-----------'.format(page, pagecount))
        # NOTE: ``time.sleep(3)`` can be inserted here to throttle the crawl.
        num += 1

    print('循环结束')


def _store_result(sel_result):
    """Insert one result entry into ``Film`` unless its link is already stored."""
    link = sel_result.xpath('//a/@href').extract_first()
    if link is None:
        # Without a link there is nothing to key the row on.
        return
    href = url + link
    title = sel_result.xpath('//p[@class="name"]/text()').extract_first()
    type_ = sel_result.xpath('//span[@class="lzbz"]/p[3]/text()').extract_first()
    time_ = sel_result.xpath('//span[@class="lzbz"]/p[4]/text()').extract_first()

    # Keep the try body down to the one expression that signals "not found".
    try:
        existing = Film.filter(address=href)[0]
    except IndexError:
        existing = None

    if existing is not None:
        print('此信息已录入: {} {} '.format(existing.id, existing.film_name))
        print('--------------------')
        return

    # Leave out values the page did not provide so the model's own field
    # defaults apply instead of an explicit None.
    # NOTE(review): assumes the Film columns are NOT NULL (peewee's default
    # for fields declared without null=True) -- confirm against the model.
    scraped = {'film_name': title, 'type_': type_, 'time_': time_}
    present = {key: value for key, value in scraped.items() if value is not None}
    Film.create(address=href, **present)
    print(href)
    print(title)
    print(type_)
    print(time_)
    print('--------------------')


def query(name):
    """Print the name and link of every stored film whose name equals *name*."""
    films = Film.select().where(Film.film_name == name)
    for film in films:
        print(film.film_name, ' ', film.address)


def details():
    """Placeholder for the per-film detail crawl; not implemented yet.

    Planned steps, taken from the author's notes:
      1. Read the film detail links from the database.
      2. Get the playback sources (the first two).
      3. Get the number of episodes.
      4. Open the first episode to collect every episode's playback
         address and m3u8 address.
      5. Store the data in the database, or download it (undecided).
    """
    return


if __name__ == '__main__':
    # 2019/8/7: 2939 pages in total
    # 2019/8/8: 2940 pages in total
    search('0')
    # Example lookup: query('家有女友')
数据库类
from peewee import CharField, Model, MySQLDatabase

# Set up the database connection.
# NOTE(review): the host and credentials are hard-coded in source; consider
# loading them from configuration or the environment before sharing this file.
db = MySQLDatabase('spider', host='127.0.0.1', port=3306, user='root', password='123456lmr')


class BaseModel(Model):
    """Common base class that binds every model to ``db``."""

    class Meta:
        database = db


class Film(BaseModel):
    """One film entry: its link, name, type text and time text.

    No primary-key field is declared explicitly. Every column falls back to
    the placeholder string '空' when no value is supplied.
    """

    # Link of the film; no max_length is given, so the field's default applies.
    address = CharField(default='空')
    film_name = CharField(max_length=50, default='空')
    type_ = CharField(max_length=50, default='空')
    time_ = CharField(max_length=50, default='空')


if __name__ == '__main__':
    # Running this module directly creates the table for Film.
    db.create_tables([Film])
重点
网站抓取完后,我通过各种途径了解了一下这类网站的视频来源,结果发现有专门的采集网站负责采集这些视频。这类电影网站只是空壳,只存储采集网站提供的视频链接。如果一开始就知道采集网站的话,就不用这么麻烦了[捂脸哭]

浙公网安备 33010602011771号