# Scrape Douban movie Top 250 data (python之爬取豆瓣Top250数据)

import re
import ssl
import urllib  # 制定url,获取网页数据
import urllib.request

# xwlt是一个帮助我们写入一个excel表的库
import xlwt
from bs4 import BeautifulSoup

# Disable HTTPS certificate verification so urlopen does not fail on cert errors.
ssl._create_default_https_context = ssl._create_unverified_context

# Movie title: <span class="title">...</span>; re.S lets '.' match newlines.
findTitle = re.compile(r'<span class="title">(.*?)</span>', re.S)
# Detail-page link
findLink = re.compile(r'<a href="(.*?)">')
# Poster image URL; re.S lets '.' match newlines.
findImg = re.compile(r'<img.*src="(.*?)"', re.S)
# Miscellaneous info paragraph (director/year/genre); re.S lets '.' match newlines.
findOther = re.compile(r'<p class="">(.*)</p>', re.S)
# Average rating
findRate = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# Number of raters ("...人评价" means "people rated")
findCom = re.compile(r'<span>(\d*?)人评价</span>')
# One-line tagline/description
findDes = re.compile(r'<span class="inq">(.*?)</span>')

# Intended output workbook file name
savePath = "豆瓣电影Top250.xls"
# Accumulates one row (list of cell values) per movie across all pages
dataList = []


def getData(baseUrl, pages=10):
    """Fetch and parse `pages` result pages of 25 movies each.

    Args:
        baseUrl: URL prefix; the page offset (0, 25, 50, ...) is appended.
        pages: number of result pages to fetch (default 10 -> Top 250).

    Returns:
        The module-level dataList, now containing one row per movie.
    """
    for page in range(pages):
        # Douban paginates via ?start=0, 25, 50, ...
        html = askUrl(baseUrl + str(page * 25))
        resolving(html)
    return dataList


# 请求数据
def askUrl(url):
    """Request `url` and return the response body decoded as UTF-8.

    Sends a browser-like User-Agent so Douban does not reject the request.
    On failure, prints the HTTP status code and/or reason and returns ""
    instead of raising, so the caller can keep going.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=headers)
    try:
        # Context manager guarantees the connection is closed.
        with urllib.request.urlopen(req) as response:
            return response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # URLError covers HTTPError too, so DNS/connection failures
        # are handled as well, not just HTTP status errors.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return ""


# 解析数据
def resolving(html):
    bs = BeautifulSoup(html, "html.parser")
    for item in bs.find_all("div", class_="item"):
        data = []
        # print("每一个条目数据", item)
        # print(type(item))
        item = str(item)
        title = re.findall(findTitle, item)
        if len(title) >= 2:
            data.append(title[0])
            data.append(title[1])
        else:
            data.append(title[0])
            data.append(' ')

        data.append(findRule(findLink, item))

        data.append(findRule(findImg, item))
        # other = findRule(findOther, item)
        # data.append(other)
        data.append(findRule(findRate, item))
        data.append(findRule(findCom, item))
        data.append(findRule(findDes, item))

        dataList.append(data)
    return dataList


# 保存数据到excel中
def saveExcel(dataList):
    # 创建一个excel表,编码用utf-8
    workbook = xlwt.Workbook(encoding="utf-8")
    # 创建一个excel文档中的sheet1文件
    worksheet = workbook.add_sheet('豆瓣电影')
    # 在sheet1中的第0行,第一列写入 你好,excel

    col = ("标题", "其他标题", "链接", "图片", "评分", "评价人数", "描述")
    for i in range(0, len(col)):
        worksheet.write(0, i, col[i])
    for i in range(0, len(dataList)):
        item = dataList[i]
        for j in range(0, len(item)):
            worksheet.write(i + 1, j, item[j])
    # 保存excle文档在本地,起名叫做movie.xls
    workbook.save('movie.xls')


def findRule(rule, msg):
    """Return the first match of compiled pattern `rule` in `msg`.

    Returns a single blank ' ' when nothing matches, so every Excel cell
    receives a scalar string. (Previously the whole findall list was
    returned, putting raw Python lists into the spreadsheet cells.)
    """
    matches = re.findall(rule, msg)
    return matches[0] if matches else ' '


def start():
    """Entry point: scrape all Top 250 pages, then save them to Excel."""
    getData("https://movie.douban.com/top250?start=")
    saveExcel(dataList)


# Guard so importing this module does not trigger the network scrape.
if __name__ == "__main__":
    start()

# posted @ 2021-07-05 23:46  我的网名  阅读(57)  评论(0)  收藏  举报