Python 爬虫系列04-电影天堂链接爬虫

学习的第四个爬虫

  

import re

import requests
from lxml import etree
# Scheme and host that get prepended to the site-relative links scraped
# from the listing pages.
BASE_D = "http://www.dytt8.net"

# Headers passed to every requests.get call in this script; the only entry
# is a desktop Firefox User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) "
        "Gecko/20100101 Firefox/62.0"
    ),
}
def get_detail_urls(url, timeout=10):
    """Return the absolute detail-page URLs linked from one listing page.

    Args:
        url: Address of a listing page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            whole crawl indefinitely.

    Returns:
        A list of strings, each ``BASE_D`` joined with an ``href`` found
        inside a ``<table class="tbspan">`` element, in document order.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    html = etree.HTML(response.text)
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    # A concrete list (rather than a lazy map object) can be iterated more
    # than once, and the loop variable no longer shadows the `url` argument.
    return [BASE_D + href for href in hrefs]
def parse_detail_page(url, timeout=10):
    """Download one movie detail page and extract its metadata.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A dict that always contains ``title``, ``cover`` and
        ``download_url``. Depending on which "◎" labels appear in the
        page's ``<div id="Zoom">`` text it may also contain ``year``,
        ``country``, ``category``, ``douban_rating``, ``duration``,
        ``director`` (strings), ``actors`` (list of strings) and
        ``profile`` (synopsis paragraphs joined with newlines).

    Raises:
        IndexError: If the page has no title element, no Zoom div, no image
            inside the Zoom div, or no download link.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # The page is decoded as GBK; errors='ignore' drops bytes that are not
    # valid GBK instead of aborting the whole page with UnicodeDecodeError.
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)

    movie = {}
    movie['title'] = html.xpath(
        "//div[@class='title_all']//font[@color='#07519a']/text()"
    )[0]

    zoom = html.xpath("//div[@id='Zoom']")[0]
    # The first image inside the Zoom div is used as the cover.
    movie['cover'] = zoom.xpath(".//img/@src")[0]

    movie.update(_parse_zoom_infos(zoom.xpath(".//text()")))

    movie['download_url'] = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]
    return movie


def _label_pattern(label):
    """Compile a regex matching *label* at the start of a string.

    Any amount of whitespace (ASCII or full-width) is allowed between the
    label's characters, so "◎年代" also matches a label padded with spaces
    between "年" and "代".
    """
    return re.compile(r"\s*".join(re.escape(char) for char in label))


# (label pattern, output key) pairs for the labels this scraper understands.
_FIELD_LABELS = (
    (_label_pattern("◎年代"), "year"),
    (_label_pattern("◎产地"), "country"),
    (_label_pattern("◎类别"), "category"),
    # Stored under its own key; it used to overwrite 'category'.
    (_label_pattern("◎豆瓣评分"), "douban_rating"),
    (_label_pattern("◎片长"), "duration"),
    (_label_pattern("◎导演"), "director"),
    (_label_pattern("◎主演"), "actors"),
    (_label_pattern("◎简介"), "profile"),
)

# Fields whose value continues on the text nodes that follow the label.
_MULTILINE_KEYS = ("actors", "profile")

# A following text node starting with one of these ends a multi-line field.
_STOP_PREFIXES = ("◎", "【下载地址】")


def _match_label(text):
    """Return ``(key, value)`` if *text* starts with a known label, else None."""
    for pattern, key in _FIELD_LABELS:
        match = pattern.match(text)
        if match:
            return key, text[match.end():].strip()
    return None


def _collect_until_marker(infos, start):
    """Return the non-empty stripped entries of ``infos[start:]`` up to the
    first entry that begins with one of ``_STOP_PREFIXES``."""
    collected = []
    for raw in infos[start:]:
        line = raw.strip()
        if line.startswith(_STOP_PREFIXES):
            break
        if line:
            collected.append(line)
    return collected


def _parse_zoom_infos(infos):
    """Turn the Zoom div's text nodes into a dict of labelled movie fields."""
    fields = {}
    for index, raw in enumerate(infos):
        matched = _match_label(raw.strip())
        if matched is None:
            continue
        key, value = matched
        if key not in _MULTILINE_KEYS:
            fields[key] = value
            continue
        # Multi-line field: the text on the label's own line (if any) plus
        # every following line up to the next section marker.
        lines = ([value] if value else []) + _collect_until_marker(infos, index + 1)
        fields[key] = lines if key == "actors" else "\n".join(lines)
    return fields
def spider(start_page=1, end_page=7):
    """Crawl a range of listing pages and scrape every movie they link to.

    Each scraped movie dict is printed as soon as it is parsed.

    Args:
        start_page: Number of the first listing page to fetch.
        end_page: Number of the last listing page to fetch (inclusive).
            The defaults cover pages 1 through 7.

    Returns:
        The list of movie dicts, in crawl order. The list used to be built
        and then discarded, leaving the printed output as the only result.
    """
    base_url = "http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    for page in range(start_page, end_page + 1):
        list_url = base_url.format(page)
        for detail_url in get_detail_urls(list_url):
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    return movies
# Start the crawl only when this file is run as a script, not on import.
if __name__ == "__main__":
    spider()

 

posted on 2018-11-06 16:38  kingle-l  阅读(845)  评论(0)  编辑  收藏  举报

levels of contents