Crawling a movie site (new)

Main script

import re
import requests
import time
from scrapy import Selector
from Film_website.models import Film
# Base URL of the target site
url = 'https://www.88ys.cc'

def search(name, num=1):
    # Fetch one page of search results for the given keyword
    r = requests.get('https://www.88ys.cc/vod-search-pg-{}-wd-{}.html'.format(num, name))
    sel = Selector(text=r.text)
    results = sel.xpath('//div[@class="index-area clearfix"]/ul//li[@class="p1 m1"]').extract()
    if results:
        # The pager text has the form "current/total"
        pages = sel.xpath('//div[@class="page mb clearfix"]/text()').extract_first()
        page = int(re.search(r'(\d*)/(\d*)', pages).group(1))
        pagecount = int(re.search(r'(\d*)/(\d*)', pages).group(2))
        if page <= pagecount:
            for result in results:
                sel_result = Selector(text=result)
                href = url + (sel_result.xpath('//a/@href').extract_first())
                title = sel_result.xpath('//p[@class="name"]/text()').extract_first()
                type_ = sel_result.xpath('//span[@class="lzbz"]/p[3]/text()').extract_first()
                time_ = sel_result.xpath('//span[@class="lzbz"]/p[4]/text()').extract_first()
                try:
                    address = Film.filter(address=href)
                    print('Already recorded: {} {}'.format(address[0].id, address[0].film_name))
                    print('--------------------')
                except IndexError:
                    # Not in the database yet: save it and print the details
                    Film.create(address=href, film_name=title, type_=type_, time_=time_)
                    print(href)
                    print(title)
                    print(type_)
                    print(time_)
                    print('--------------------')
            print('----------- Current page: {}/{} -----------'.format(page, pagecount))
            # time.sleep(3)
            # Recursing once per page can exceed Python's recursion limit on large
            # result sets; an iterative version is sketched after this listing
            return search(name, num + 1)
    return print('Crawl finished')


def query(name):
    films = Film.select().where(Film.film_name == name)
    for film in films:
        print(film.film_name, ' ', film.address)
        # return film.film_name, film.address

def details():
    # TODO, sketched after this listing:
    # - read each film's detail-page URL from the database
    # - grab the playback sources (only the first two)
    # - grab the episode list
    # - open episode 1 and collect every episode's play URL and m3u8 address
    # - store in the database, or download, or... (undecided)
    return


if __name__ == '__main__':
    # 2019/8/7: 2939 pages in total
    # 2019/8/8: 2940 pages in total
    search('0')
    # query('家有女友')
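A side note on the recursion: with roughly 2,940 result pages (see the counts in __main__), calling search() once per page will blow past Python's default recursion limit of about 1,000 frames. The sketch below is an iterative version of the same crawl; it reuses the URL pattern and XPath expressions from the listing above, while the name search_iterative and the get_or_none dedup check are my own substitutions, not code from the original script.

import re
import requests
from scrapy import Selector
from Film_website.models import Film

url = 'https://www.88ys.cc'

def search_iterative(name):
    num = 1
    while True:
        r = requests.get('https://www.88ys.cc/vod-search-pg-{}-wd-{}.html'.format(num, name))
        sel = Selector(text=r.text)
        results = sel.xpath('//div[@class="index-area clearfix"]/ul//li[@class="p1 m1"]').extract()
        if not results:
            break
        pages = sel.xpath('//div[@class="page mb clearfix"]/text()').extract_first()
        page = int(re.search(r'(\d*)/(\d*)', pages).group(1))
        pagecount = int(re.search(r'(\d*)/(\d*)', pages).group(2))
        for result in results:
            sel_result = Selector(text=result)
            href = url + sel_result.xpath('//a/@href').extract_first()
            title = sel_result.xpath('//p[@class="name"]/text()').extract_first()
            type_ = sel_result.xpath('//span[@class="lzbz"]/p[3]/text()').extract_first()
            time_ = sel_result.xpath('//span[@class="lzbz"]/p[4]/text()').extract_first()
            # get_or_none returns None instead of raising when no row matches
            if Film.get_or_none(Film.address == href) is None:
                Film.create(address=href, film_name=title, type_=type_, time_=time_)
        print('----------- Current page: {}/{} -----------'.format(page, pagecount))
        if page >= pagecount:
            break
        num += 1
    print('Crawl finished')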
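And a rough first pass at what details() might look like. The XPath for the playback-source blocks and the m3u8 regex below are hypothetical placeholders, since the post never shows the detail-page HTML; they would need to be replaced with the real class names found in the browser's developer tools.

import re
import requests
from scrapy import Selector
from Film_website.models import Film

url = 'https://www.88ys.cc'

def details_sketch():
    # Walk every film saved by search() and fetch its detail page
    for film in Film.select():
        r = requests.get(film.address)
        sel = Selector(text=r.text)
        # Hypothetical selector: keep only the first two playback sources
        sources = sel.xpath('//div[@class="play-source"]')[:2]
        for source in sources:
            # Hypothetical selector for the per-episode play pages
            episode_links = source.xpath('.//a/@href').extract()
            for link in episode_links:
                page = requests.get(url + link)
                # m3u8 addresses are often embedded in a script block on the play page
                m3u8 = re.search(r'https?://[^"\']+\.m3u8', page.text)
                if m3u8:
                    print(film.film_name, link, m3u8.group(0))
                # Storing or downloading is still undecided, as in the original TODO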

Database module

from peewee import *


# Set up the database connection
db = MySQLDatabase('spider', host='127.0.0.1', port=3306, user='root', password='123456lmr')

class BaseModel(Model):
    class Meta:
        database = db

class Film(BaseModel):
    # peewee adds an auto-incrementing "id" primary key automatically
    address = CharField(default='')
    film_name = CharField(max_length=50, default='')
    type_ = CharField(max_length=50, default='')
    time_ = CharField(max_length=50, default='')


if __name__ == '__main__':
    # Create the Film table on first run
    db.create_tables([Film])
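Once the table is populated, a couple of quick checks from a Python shell can be handy. contains() below does a LIKE '%...%' match, which is looser than the exact-title comparison used by query() in the main script; the search term is just an example.

from Film_website.models import Film

# How many films have been collected so far
print(Film.select().count())

# Fuzzy title search, unlike the exact match in query()
for film in Film.select().where(Film.film_name.contains('家有女友')):
    print(film.id, film.film_name, film.address)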

 

Key takeaway

After the crawl was done, I looked into where sites like this actually get their videos. It turns out there are dedicated resource-collection ("caiji") sites that aggregate them; a site like this one is just an empty shell that stores links to the collection site's videos. If I had known about the collection sites from the start, I could have saved myself all this trouble. [facepalm]

 
