Python 获取豆瓣排名

import requests
from bs4 import BeautifulSoup
import openpyxl
def gethtmltext(url):
    """Fetch *url* over HTTP and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text on success, or the literal string "异常"
        when the request fails (connection problem, timeout, invalid URL,
        or an HTTP error status).
    """
    # Send a browser-like User-Agent; per the original author, requests
    # that do not look like a browser get blocked by the site.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3947.100 Safari/537.36'
    }
    try:
        # A timeout keeps an unresponsive server from hanging the script.
        r = requests.get(url, headers=header, timeout=10)
        # Raises requests.HTTPError for 4xx/5xx responses.
        r.raise_for_status()
        # Decode using the encoding detected from the body itself.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-level failures are turned into the sentinel string;
        # unrelated errors (e.g. KeyboardInterrupt) now propagate.
        return "异常"
def select(url):
    """Scrape one listing page and return one row per movie entry.

    The page is downloaded with gethtmltext() and parsed with the
    "html.parser" backend. Every <div class="info"> element yields a row.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        A list of ``[mov, href, act, fraction, pinglun]`` lists, where
        ``mov`` is the text of the entry's first <span>, ``href`` the link
        of its first <a>, ``act`` the text of its first <p>, ``fraction``
        the text of its ``rating_num`` span and ``pinglun`` the text of its
        ``inq`` span ('' when that span is absent). The list is empty when
        the page contains no matching <div> (which includes the case where
        gethtmltext() returned its failure string).

    Side effects:
        Prints the collected rows to stdout.
    """
    arr = []
    demo = gethtmltext(url)
    soup = BeautifulSoup(demo, "html.parser")
    for each in soup.find_all("div", class_="info"):
        mov = each.span.text
        href = each.a['href']
        act = each.p.text
        fraction = each.find("span", class_="rating_num").text
        # The quote span is optional: find() returns None when it is
        # missing, so test for that instead of catching every exception.
        inq = each.find("span", class_="inq")
        pinglun = inq.text if inq is not None else ''
        arr.append([mov, href, act, fraction, pinglun])
    print(arr)
    return arr

def save(arr):
    """Write the scraped rows to an Excel workbook.

    Creates a new workbook, fills cells A1..E1 of the active sheet with the
    column titles, appends each item of *arr* as a row after the header,
    and saves the workbook as "豆瓣排名11.xlsx" (path relative to the
    current working directory).

    Args:
        arr: Iterable of rows; each row is a sequence of cell values.
    """
    titles = ("电影名称", "链接", "演员", "评分", "评论")
    book = openpyxl.Workbook()
    sheet = book.active
    # Header row: one title per column, A through E.
    for column, title in zip("ABCDE", titles):
        sheet[column + "1"] = title
    for row in arr:
        sheet.append(row)
    book.save("豆瓣排名11.xlsx")
if __name__ == "__main__":
    # One URL per listing page: the `start` query parameter takes the
    # offsets 0, 25, ..., 225.
    page_urls = (
        "https://movie.douban.com/top250?start={}&filter=".format(offset)
        for offset in range(0, 250, 25)
    )
    # Collect the rows of every page, then write them out in one go.
    result = []
    for page_url in page_urls:
        result += select(page_url)
    save(result)



posted @ 2021-04-15 18:09  小魏同学呀  阅读(50)  评论(0)    收藏  举报