# 二级静态页面的爬取-----电影天堂

'''二级静态页面的爬取'''
from urllib import request
import re
import time
import random
import pymysql


class DianyingtiantangSpider:
    """Two-level static-page spider for dytt8.net (电影天堂).

    Level 1: paginated list pages yield (detail-page link, film name) pairs.
    Level 2: each detail page yields one download link.
    Results are written into the MySQL table ``film`` (name, download_link).
    """

    # Rotating pool of desktop User-Agent strings; one is picked per spider
    # instance so requests look less bot-like.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
    ]

    # Level-1 list page: capture (detail href, film name) from each result table.
    # Compiled once at class-definition time instead of on every parse call.
    LIST_PATTERN = re.compile(
        '<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">(.*?)</a>.*?</table>',
        re.S)
    # Level-2 detail page: capture the download link text inside the styled <td>.
    DETAIL_PATTERN = re.compile('<td style="WORD-WRAP.*?>.*?>(.*?)</a>', re.S)

    def __init__(self):
        # List-page URL template; {} is the 1-based page number.
        self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
        self.headers = {'User-Agent': random.choice(self.USER_AGENTS)}
        self.db = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456', db='dianyingdb',
                                  charset='utf8')
        self.cursor = self.db.cursor()

    def get_page(self, url):
        """Fetch *url* and return its decoded HTML (used for both page levels).

        The site declares gb2312 but pages routinely contain GBK-only
        characters; GBK is a superset of gb2312, so decoding as 'gbk' with
        errors ignored loses fewer characters.
        """
        req = request.Request(url=url, headers=self.headers)
        # Context manager ensures the HTTP response is closed (was leaked before).
        with request.urlopen(req) as res:
            html = res.read().decode('gbk', 'ignore')
        return html

    def parse_page(self, html):
        """Parse one list page: extract (name, detail link) pairs, follow each
        detail link for its download link, then persist the batch."""
        # film_list: [('detail href', 'film name'), ...]
        film_list = self.LIST_PATTERN.findall(html)
        result_list = []
        for href, name in film_list:
            film_name = name.strip()
            # hrefs on the list page are site-relative.
            film_link = 'https://www.dytt8.net{}'.format(href.strip())
            download_link = self.parse_two_page(film_link)
            result_list.append([film_name, download_link])
        self.save_page(result_list)

    def parse_two_page(self, film_link):
        """Fetch a detail page and return its download link.

        Returns an empty string when the page has no matching link
        (previously this raised IndexError and aborted the whole crawl).
        """
        two_html = self.get_page(film_link)
        download_link = self.DETAIL_PATTERN.findall(two_html)
        return download_link[0].strip() if download_link else ''

    def save_page(self, result_list):
        """Bulk-insert (name, download_link) rows into the ``film`` table."""
        if not result_list:
            return
        ins = 'insert into film values(%s,%s)'
        self.cursor.executemany(ins, result_list)
        self.db.commit()

    def main(self):
        """Entry point: clear the table, crawl list pages 1-4, then clean up."""
        self.cursor.execute('delete from film')
        self.db.commit()
        try:
            for i in range(1, 5):
                url = self.url.format(i)
                html = self.get_page(url)
                self.parse_page(html)
                print('第{}页爬取成功'.format(i))
                # Random pause between pages to be polite / avoid blocking.
                time.sleep(random.randint(1, 3))
        finally:
            # Close DB resources even if a page fails mid-crawl.
            self.cursor.close()
            self.db.close()


if __name__ == '__main__':
    # Run the full crawl and report wall-clock duration.
    t0 = time.time()
    DianyingtiantangSpider().main()
    elapsed = time.time() - t0
    print('程序执行时间为:%.2f' % elapsed)

 

# posted @ 2019-07-19 17:02  一如年少模样  阅读(599)  评论(0) 编辑  收藏  举报