python爬虫--爬取豆瓣top250电影名

关于模拟浏览器访问所用的请求头(headers),可以在相应网站按F12打开开发者工具,点击Network,选中任意一个请求即可查看,如下:

带上这些请求头,可以避免请求被网站的反爬虫机制拒绝。

import requests
from bs4 import BeautifulSoup
def get_movies():
    """Collect the movie titles shown on the Douban Top 250 list pages.

    Requests ten list pages (``start`` offsets 0, 25, ..., 225), prints the
    HTTP status code of each response, and gathers the stripped text of the
    first ``<span>`` inside the first ``<a>`` of every ``<div class="hd">``.

    Returns:
        list: Title strings in the order they appear across the pages.
    """
    # Browser-like headers sent with every page request.
    request_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Host': 'movie.douban.com'
    }
    base_url = 'https://movie.douban.com/top250?start='

    titles = []
    # 25 entries per page, so the offsets step by 25; page numbers are 1-based
    # purely for the progress message.
    for page_no, offset in enumerate(range(0, 250, 25), start=1):
        resp = requests.get(base_url + str(offset), headers=request_headers, timeout=10)
        print(str(page_no), "页响应状态码:", resp.status_code)
        page = BeautifulSoup(resp.text, "lxml")
        titles.extend(
            [block.a.span.text.strip() for block in page.find_all('div', class_='hd')]
        )
    return titles


# Run the scraper and show the collected titles.
movies = get_movies()
print(movies)

 

 

 

 

或者:

import requests
from bs4 import BeautifulSoup

# Request headers carrying a browser User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

# Fetch the first Top 250 page. The timeout keeps a stalled connection from
# hanging the script indefinitely.
url = 'https://movie.douban.com/top250'
response = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')

# Each <li> inside <ol class="grid_view"> is treated as one movie entry.
movie_list = soup.find('ol', class_='grid_view').find_all('li')

# Walk the entries and pull out the individual fields.
for movie in movie_list:
    # Title: text of the first <span class="title"> in the entry.
    title = movie.find('span', class_='title').text

    # Director and cast: text of the first <p> under <div class="bd">,
    # flattened to one line and split on a run of three non-breaking spaces.
    info = movie.find('div', class_='bd').p.text
    info = info.replace('\n', '').strip()
    parts = info.split('\xa0\xa0\xa0')
    director = parts[0]
    # An entry without the separator has no second segment; use an empty
    # cast list instead of failing with IndexError.
    actors = parts[1].split('\xa0/\xa0') if len(parts) > 1 else []

    # Rating: text of <span class="rating_num">.
    rating = movie.find('span', class_='rating_num').text

    # Print the extracted fields.
    print('电影名称:', title)
    print('导演:', director)
    print('主演:', actors)
    print('评分:', rating)
    print('---')

每一页数据提取:

import requests
from bs4 import BeautifulSoup

# Request headers carrying a browser User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

# Request each of the ten list pages (start = 0, 25, ..., 225).
for i in range(0, 10):
    url = 'https://movie.douban.com/top250?start=' + str(i * 25)
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Each <li> inside <ol class="grid_view"> is treated as one movie entry.
    movie_list = soup.find('ol', class_='grid_view').find_all('li')
    for movie in movie_list:
        # Title: text of the first <span class="title"> in the entry.
        title = movie.find('span', class_='title').text

        # Director and cast: flatten the first <p> under <div class="bd"> to
        # one line and split it at the cast label.
        info = movie.find('div', class_='bd').p.text.replace('\n', '').split('主演: ')
        if len(info) > 1:
            # Text before the cast label, minus the director label itself.
            director = info[0].strip().split('导演: ')[-1]
            actor = info[-1].strip()
        else:
            # No cast label in this entry: report both fields as unknown.
            director = '未知'
            actor = '未知'

        # Rating: text of <span class="rating_num">.
        rating = movie.find('span', class_='rating_num').text

        # Print the extracted fields. The cast line uses `actor`, the name
        # assigned above.
        print('电影名称:', title)
        print('导演:', director)
        print('主演:', actor)
        print('评分:', rating)
        print('---')

 

posted @ 2017-09-30 11:35  方木--数据分析与挖掘  阅读(941)  评论(0)  编辑  收藏  举报