
[Web Scraping] Project: Crawling the Douban Weekly Movie Top 10 with re and XPath, Saving to CSV

Approach 1: scrape with re, build the request header with fake_useragent, save to CSV

import requests
import re
import csv
from fake_useragent import UserAgent
# re docs:
# https://docs.python.org/zh-cn/3.8/library/re.html#re.S


header = {
    # fake_useragent 0.1.x: load the UA data from a local json cache;
    # .ie picks a random IE user-agent string
    'user-agent': UserAgent(path=r'./fake_useragent.json').ie,
    'cookie':'bid=gZhOMjq7Ag0; ll="118200"; __gads=ID=ee81490f4e78ee41-226c825738cf0077:T=1637495539:RT=1637495539:S=ALNI_MYAsbTf9f4zarcndONOU8V3iX3aKg; _vwo_uuid_v2=D5CD017E3233C8F72BD20AB7E8A3DE8C6|e0376aed09832ec0574b534bffe098fc; dbcl2="144119796:t9KAADz+2i4"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.14411; ck=oiAV; _pk_ref.100001.4cf6=["","",1637997339,"https://www.baidu.com/link?url=-BeYMom6zanu8afK9L3VZBlLbFUbdO_SynvSZ9V8_KxMbRniAGx-WAUEh-IFvJ4g&wd=&eqid=e083a3d3000506490000000661a1db18"]; _pk_id.100001.4cf6=98c1f43971dcb9d9.1637495527.4.1637997339.1637511633.; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.2069033705.1637495528.1637679354.1637997340.5; __utmb=30149280.0.10.1637997340; __utmc=30149280; __utmz=30149280.1637997340.5.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.1576995638.1637495528.1637510908.1637997340.4; __utmb=223695111.0.10.1637997340; __utmc=223695111; __utmz=223695111.1637997340.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic'
}

def main(base_url):
    source = request_url(base_url)
    # Isolate the weekly billboard block from the home-page source
    source = re.findall(
        '<div id="billboard" class="s" data-dstat-areaid="75" data-dstat-mode="click,expose">(.+)</div>', source, re.S)
    all_conditions(source)

# Request a URL and return the decoded page source
def request_url(url):
    req = requests.get(url, headers=header)
    source = req.content.decode('utf-8')
    return source

# Home page: pull rank, title, and detail link for each movie, then scrape each detail page
def all_conditions(source):
    datalist = []
    for item in source:
        tables = re.findall("<table>(.+)</table>", item, re.S)
        for table in tables:
            # Rank, movie title (a run of CJK characters), and detail-page link
            rank_list = re.findall('<td class="order">(.+)</td>', table)
            href_list = re.findall('href="(.*?)">', table, re.S)
            name_list = re.findall('[\u4e00-\u9fa5]+', table, re.S)

            # Fetch every field from the detail page and build one row per movie
            for href, name, rank in zip(href_list, name_list, rank_list):
                data = [rank, name, href]
                for field in get_sub_page(href):
                    data.append(field)
                datalist.append(data)
                print(data)
    # Save once, after every row has been collected; save() truncates the file,
    # so calling it inside the loop would rewrite it on each iteration
    save(datalist)

# Extract every field from a movie's detail page
def get_sub_page(href):
    source = request_url(href)
    score = re.search('<strong class="ll rating_num" property="v:average">(.+)</strong>', source).group(1)
    con_list = re.findall('<div id="info">(.+)<br>', source, re.S)
    for item in con_list:
        # Strip celebrity links and closing spans so the field regexes stay simple
        item = re.sub(r'<a href="/celebrity/\d+/"', '', item)
        item = re.sub('</span>', '', item)

        # Director
        director = re.search('rel="v:directedBy">(.+)</a>', item).group(1)

        # Screenwriters
        writer = re.search("编剧: <span class='attrs'>(.+)</a><br/>", item).group(1)
        writer = writer.replace("</a> / >", ",")
        writer = re.sub('<a href="/(.*?)">', ',', writer).replace('</a> / ', "").replace(">", "")

        # Leading actors
        star_list = re.search('rel="v:starring">(.+)</a><br/>', item).group(1)
        star_list = re.sub('</a> /  rel="v:starring">', ",", star_list)
        if "href" in star_list:
            star_list = re.sub('</a> / <a href="/(.*?)" rel="v:starring">', ',', star_list)

        # Genres (extracted as runs of CJK characters)
        movie_type = re.search('<span property="v:genre">(.+)<br/>', item).group(1)
        movie_type = ','.join(re.findall('[\u4e00-\u9fa5]+', movie_type))

        # Country/region of production
        region = re.search("制片国家/地区: (.+)<br/>", item).group(1)

        # Language
        language = re.search("语言: (.+)<br/>", item).group(1)

        # Release dates
        date = ','.join(re.findall('"v:initialReleaseDate" content="(.*?)">', item))

        # Runtime
        runtime = re.search("片长: (.+)<br/>", item).group(1)
        runtime = ''.join(re.findall(r'\d+[\u4e00-\u9fa5]+', runtime))

        # Alternative titles (missing on some pages)
        try:
            other_name = re.search("又名: (.+)<br/>", item).group(1)
        except AttributeError:
            other_name = ""

        # IMDb id
        IMDb = re.search("IMDb: (.+)", item).group(1)

    return score, director, writer, star_list, movie_type, region, language, date, runtime, other_name, IMDb

# Write the scraped rows to a CSV file
def save(data):
    # utf-8-sig writes a BOM so Excel opens the file with the right encoding
    with open("DoubanMovieWeekTop10.csv", "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["排名", "电影名", "详情链接", "评分", "导演", "编剧", "主演", "类型", "制片国家/地区", "语言",
                         "上映时间", "片长", "又名", "IMDb"])
        writer.writerows(data)


if __name__ == '__main__':
    base_url = "https://movie.douban.com/"
    main(base_url)
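
These patterns lean heavily on re.S (re.DOTALL) combined with non-greedy quantifiers: without re.S, `.` stops at newlines, so a pattern spanning several lines of HTML never matches. A minimal self-contained sketch of the difference (the HTML fragment below is made up for illustration):

import re

# Hypothetical two-line fragment mimicking one row of the billboard table
html_fragment = """<td class="order">1</td>
<td class="title"><a href="https://movie.douban.com/subject/1/">示例电影</a></td>"""

# Without re.S, '.*?' cannot cross the newline between the two <td> tags
print(re.search(r'order">(.+?)</td>.*?href="(.*?)"', html_fragment))  # None

# With re.S, '.' also matches '\n'; the non-greedy groups keep each capture
# from swallowing the rest of the document
m = re.search(r'order">(.+?)</td>.*?href="(.*?)"', html_fragment, re.S)
print(m.group(1), m.group(2))  # 1 https://movie.douban.com/subject/1/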

Approach 2: scrape with re, print the results instead of saving

import requests
import cchardet
import re

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
          'Cookie': 'bid=Wt9rGb6VTcE; douban-fav-remind=1; __gads=ID=b0b9fc62ad8fd36e-2277b1a4d0ca0007:T=1629037033:RT=1629037033:S=ALNI_MZcQI-zVIz4SDF1JEWl3bohLM8JKA; viewed="35571443"; gr_user_id=b4003e18-ed65-42a8-b2aa-c2eee8128f95; ll="118200"; __utmz=30149280.1633773615.6.6.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmz=223695111.1633773615.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=DAAC4D9D6B82F69AC1F055078D065C751|92efe72a313f1fd9c1647ee1c083fa7d; __utmc=30149280; __utmc=223695111; ap_v=0,6.0; __utma=30149280.1433569655.1629037036.1634220097.1634222012.15; __utmb=30149280.0.10.1634222012; __utma=223695111.1215803576.1633773615.1634220097.1634222012.10; __utmb=223695111.0.10.1634222012; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1634222012%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dwyyw1hAEPCDkeCOiS0lMWDx6tRJnw2gELr3aZop7fzDRrduYHXftRKiI4PbeclDL%26wd%3D%26eqid%3Db60e5a81000182b7000000066161682b%22%5D; _pk_id.100001.4cf6=f75b65b3de20f07e.1633773614.10.1634222012.1634220097.; _pk_ses.100001.4cf6=*; dbcl2="146463518:ozVFabF9880"'
          }

def get_movie_list():
    resp = requests.get('https://movie.douban.com', headers=header)
    resp.encoding = cchardet.detect(resp.content)['encoding']
    movie_list_section = re.search(r'<div class="billboard-bd">(.*?)<div id="dale_movie_home_bottom_right"', resp.text, re.S).group(1)
    movie_list = re.findall(r'<tr>.*?href="(.*?)">(.*?)</a>', movie_list_section, re.S)
    return movie_list

def get_movie_info(movie_url_name):
    resp = requests.get(movie_url_name[0], headers=header)
    resp.encoding = cchardet.detect(resp.content)['encoding']
    movie_info_section = re.search(r'<div id="info">(.*?)</div>', resp.text, re.S).group(1)
    director = '/'.join(re.findall(r'href=.*?v:directedBy">(.*?)</a>', movie_info_section, re.S))
    screenwriter_section = re.search(r"编剧.*?'attrs'>(.*?)</span>", movie_info_section, re.S).group(1)
    screenwriter = '/'.join(re.findall(r'href=.*?>(.*?)</a>', screenwriter_section, re.S))
    actor = '/'.join(re.findall(r'href=.*?v:starring">(.*?)</a>', movie_info_section, re.S))
    movie_type = re.search(r'property="v:genre">(.*?)</span>', movie_info_section, re.S).group(1)
    district = re.search(r'制片国家/地区:</span>(.*?)<br/>', movie_info_section, re.S).group(1)
    language = re.search(r'语言:</span>(.*?)<br/>', movie_info_section, re.S).group(1)
    initial_release_date = '/'.join(re.findall(r'v:initialReleaseDate.*?>(.*?)</span>', movie_info_section, re.S))
    runtime = re.search(r'v:runtime.*?>(.*?)</span>', movie_info_section, re.S).group(1)
    movie_detail = {'片名': movie_url_name[1], '导演': director, '编剧': screenwriter, '演员': actor, '类型': movie_type, '制片国家/地区': district,
                    '语言': language, '上映日期': initial_release_date, '片长': runtime}
    return movie_detail

if __name__ == '__main__':
    mv_lst = get_movie_list()
    movie_detail_list = []
    for movie in mv_lst:
        movie_detail = get_movie_info(movie)
        movie_detail_list.append(movie_detail)
    for movie in movie_detail_list:
        for key, value in movie.items():
            print(f'{key}:{value}')
        print()
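
Both print-only versions set the response encoding from cchardet before reading resp.text, because requests alone only guesses from the Content-Type header. A minimal sketch of that step in isolation; the fallback to apparent_encoding is an addition for robustness, not part of the scripts above:

import cchardet
import requests

resp = requests.get('https://movie.douban.com',
                    headers={'User-Agent': 'Mozilla/5.0'})
detected = cchardet.detect(resp.content)  # e.g. {'encoding': 'utf-8', 'confidence': 0.99}
# detect() can return {'encoding': None} on ambiguous bytes; falling back to
# requests' own charset guess (an assumption, not in the original) avoids that
resp.encoding = detected['encoding'] or resp.apparent_encoding
print(detected, resp.encoding)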

Approach 3: XPath with lxml.html + lxml.etree, print the results instead of saving

import requests
import cchardet
from lxml import etree
from lxml import html  # html.fromstring parses the same way as etree.HTML
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
        'Cookie': 'bid=Wt9rGb6VTcE; douban-fav-remind=1; __gads=ID=b0b9fc62ad8fd36e-2277b1a4d0ca0007:T=1629037033:RT=1629037033:S=ALNI_MZcQI-zVIz4SDF1JEWl3bohLM8JKA; viewed="35571443"; gr_user_id=b4003e18-ed65-42a8-b2aa-c2eee8128f95; ll="118200"; __utmz=30149280.1633773615.6.6.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmz=223695111.1633773615.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=DAAC4D9D6B82F69AC1F055078D065C751|92efe72a313f1fd9c1647ee1c083fa7d; __utmc=30149280; __utmc=223695111; ap_v=0,6.0; __utma=30149280.1433569655.1629037036.1634220097.1634222012.15; __utmb=30149280.0.10.1634222012; __utma=223695111.1215803576.1633773615.1634220097.1634222012.10; __utmb=223695111.0.10.1634222012; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1634222012%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dwyyw1hAEPCDkeCOiS0lMWDx6tRJnw2gELr3aZop7fzDRrduYHXftRKiI4PbeclDL%26wd%3D%26eqid%3Db60e5a81000182b7000000066161682b%22%5D; _pk_id.100001.4cf6=f75b65b3de20f07e.1633773614.10.1634222012.1634220097.; _pk_ses.100001.4cf6=*; dbcl2="146463518:ozVFabF9880"'
        }
def get_top_movies_one_week(url):
    resp = requests.get(url, headers=head)
    resp.encoding = cchardet.detect(resp.content)['encoding']
    root_elem = etree.HTML(resp.text)
    print("root_elem",root_elem)
    mv_lst = []
    # Alternative: walk the table rows directly. Note: the raw page source has no
    # tbody tag even though Chrome DevTools' Elements panel shows one (the browser
    # inserts it), so build XPath expressions against the actual source.
    # tr_elem_lst = root_elem.xpath('//div[@id="billboard"]/div[2]/table/tr')
    # for tr_elem in tr_elem_lst:
    #     a_elem_lst = tr_elem.xpath('td/a')
    #     mv_url_lst = a_elem_lst[0].xpath('@href')
    #     mv_name_lst = a_elem_lst[0].xpath('text()')
    #     mv_lst.append((mv_name_lst[0], mv_url_lst[0]))
    mv_name_url = root_elem.xpath('//td[@class="title"]/a')
    for mv in mv_name_url:
        mv_lst.append((mv.xpath('text()')[0], mv.xpath('@href')[0]))
    return mv_lst

def get_movie_detail(mv_name_url):
    resp = requests.get(mv_name_url[1], headers=head)
    resp.encoding = cchardet.detect(resp.content)['encoding']
    root_elem = html.fromstring(resp.text)  # etree.HTML(resp.text)
    info_div_elem_lst = root_elem.xpath('//div[@id="info"]')
    director = info_div_elem_lst[0].xpath('span[1]/span[@class="attrs"]')[0].xpath('string(.)')
    screenwriter = info_div_elem_lst[0].xpath('span[2]/span[2]')[0].xpath('string(.)')
    actor = info_div_elem_lst[0].xpath('span[@class="actor"]/span[@class="attrs"]')[0].xpath('string(.)')
    mv_type = '/'.join(info_div_elem_lst[0].xpath('span[@property="v:genre"]/text()'))
    district = info_div_elem_lst[0].xpath('span[contains(text(),"地区")]/following-sibling::text()')[0]  # following-sibling::text() selects the text of every same-level node after the current one
    language = info_div_elem_lst[0].xpath('span[contains(text(),"语言")]/following-sibling::text()')[0]
    initial_release_date = '/'.join(info_div_elem_lst[0].xpath('span[@property="v:initialReleaseDate"]/text()'))
    runtime = info_div_elem_lst[0].xpath('span[@property="v:runtime"]/text()')[0]
    mv_detail_dict = {'片名': mv_name_url[0], '导演': director, '编剧': screenwriter, '演员': actor, '类型': mv_type, '制片国家/地区': district,
                      '语言': language, '上映日期': initial_release_date, '片长': runtime}
    return mv_detail_dict

if __name__ == '__main__':
    movie_list = get_top_movies_one_week('https://movie.douban.com')
#    print(movie_list)
    mv_detail_lst = []
    for movie in movie_list:
        mv_detail_lst.append(get_movie_detail(movie))
#    print(mv_detail_lst)
    for movie in mv_detail_lst:
        for key, value in movie.items():
            print(f'{key}:{value}')
        print()
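
The least obvious XPath above is following-sibling::text(): fields such as 制片国家/地区 are bare text nodes sitting after a <span> label rather than inside any element, so an element path alone cannot reach them. A minimal sketch on a hand-written fragment that mirrors the structure of Douban's info block (the HTML is invented for illustration):

from lxml import html

fragment = html.fromstring(
    '<div id="info">'
    '<span class="pl">制片国家/地区:</span> 中国大陆<br/>'
    '<span class="pl">语言:</span> 汉语普通话<br/>'
    '</div>'
)
# following-sibling::text() selects every text node after the matched <span>
# at the same level; index [0] takes the one immediately following the label
district = fragment.xpath('//span[contains(text(), "地区")]/following-sibling::text()')[0]
print(district.strip())  # 中国大陆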