#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/2/7 16:23
# @Author :
from bs4 import BeautifulSoup
import re
import xlwt
import urllib.request, urllib.response, urllib.error
def default():
    """Entry point: scrape the Douban Top250 list and save it to an .xls file."""
    list_url = 'https://movie.douban.com/top250?start='
    rows = getData(list_url)
    out_path = '豆瓣Top250.xls'
    saveDataExcel(out_path, rows)
def getData(baseUrl):
    """Scrape all 10 pages of the Douban Top250 list.

    Args:
        baseUrl: list URL ending in 'start='; the page offset (0, 25, ...)
            is appended for each of the 10 pages.

    Returns:
        A list of rows, one per movie:
        [link, image URL, Chinese title, foreign title, rating,
         vote count, one-line summary, related info].
    """
    # Compile every pattern once, outside the page loop.
    findUrl = re.compile(r'<a href="(.*?)">')
    findImg = re.compile(r'<img.*src="(.*?)"')
    findTitle = re.compile(r'<span class="title">(.*)</span>')
    findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
    findJudge = re.compile(r'<span>(\d*)人评价</span>')
    findInq = re.compile(r'<span class="inq">(.*)</span>')
    findBd = re.compile(r'<p class="">(.*?)</p>', re.S)  # re.S: '.' also matches newlines

    dataList = []
    for i in range(10):
        url = baseUrl + str(i * 25)
        html = getOneUrl(url)
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            data.append(findUrl.findall(item)[0])    # detail-page link
            data.append(findImg.findall(item)[0])    # poster image URL
            titles = findTitle.findall(item)
            cTitle = titles[0]
            if len(titles) == 2:
                # Second <span class="title"> is the foreign title, prefixed with '/'.
                oTitle = titles[1].replace('/', '')
            else:
                oTitle = ' '
            data.append(cTitle.strip())
            data.append(oTitle.strip())
            data.append(findRating.findall(item)[0])  # average rating
            data.append(findJudge.findall(item)[0])   # number of ratings
            inq = findInq.findall(item)
            if inq:
                # Drop the trailing full-width period from the summary.
                data.append(inq[0].replace('。', ''))
            else:
                data.append(' ')  # some entries have no one-line summary
            bd = findBd.findall(item)[0]
            # BUG FIX: '\s' inside a non-raw string is an invalid escape
            # sequence (SyntaxWarning since Python 3.12, future SyntaxError);
            # regex patterns must be raw strings.
            bd = re.sub(r'<br(\s+)?/>(\s+)?', ' ', bd)  # strip <br/> tags
            bd = re.sub('/', ' ', bd)                   # strip '/' separators
            data.append(bd.strip())
            dataList.append(data)
    return dataList
def getOneUrl(url):
    """Fetch one URL and return its body decoded as UTF-8.

    Best-effort: on a request failure the error is printed and an
    empty string is returned instead of raising.

    Args:
        url: the absolute URL to fetch.

    Returns:
        The decoded response body, or '' if the request failed.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
    }
    request = urllib.request.Request(url=url, headers=header)
    # BUG FIX: 'html' was previously only bound inside the try block, so a
    # failed request raised UnboundLocalError at the final return.
    html = ''
    try:
        resp = urllib.request.urlopen(request)
        html = resp.read().decode('utf-8')
    # BUG FIX: catch URLError (HTTPError's superclass) so DNS/connection
    # failures take the same best-effort path instead of crashing.
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print('code', e.code)
        if hasattr(e, 'reason'):
            print('reason', e.reason)
        print('error', e)
    return html
def saveDataExcel(path, dataList):
    """Write the scraped rows to an .xls workbook at *path*.

    Args:
        path: output file path (e.g. '豆瓣Top250.xls').
        dataList: list of 8-column rows as produced by getData().
    """
    print('save....')
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)  # workbook object
    worksheet = workbook.add_sheet('shheet1', cell_overwrite_ok=True)  # worksheet
    col = ("链接", "图片", "中文名", "外国名", "评分", "评价数", "概况", "相关信息")
    for c, name in enumerate(col):
        worksheet.write(0, c, name)  # header row
    # BUG FIX: iterate the actual data instead of a hard-coded range(0, 250),
    # which raised IndexError whenever fewer than 250 rows were scraped.
    for rowIdx, data in enumerate(dataList, start=1):
        print("第%d条" % rowIdx)
        for c in range(8):
            worksheet.write(rowIdx, c, data[c])
    workbook.save(path)
    print('over')
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    default()