爬虫爬取豆瓣电影Top250及在网页展示
1、爬取信息的代码
# coding=utf-8
"""Scrape the Douban movie Top-250 list and store it in an .xls file or MySQL."""
import re
import urllib.error
import urllib.request

import pymysql
import xlwt
from bs4 import BeautifulSoup

# Spreadsheet header row; also defines how many fields each movie record has.
HEAD_COL = ("排名", "电影中文名", "电影外文名", "电影链接", "图片链接", "详细信息", "评分", "评论数", "概要")

# Compiled once at import time instead of once per movie item.
COMMENT_NUM_RE = re.compile(r"<span>(\d+)人评价</span>")

INSERT_SQL = (
    "insert into douban(rate,ctitle,ftitle,link,img_link,bd,score,comment_num,descc) "
    "values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)


def askURL(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    On ``urllib.error.URLError`` the status code and/or reason are printed
    and an empty string is returned, so callers simply see "no content".
    """
    # Send a browser-like User-Agent header with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    }
    req = urllib.request.Request(url=url, headers=headers)
    html = ""
    try:
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(req) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def _parse_item(item):
    """Turn one ``<div class="item">`` element into a 9-field movie record.

    Field order matches ``HEAD_COL`` and the column list of ``INSERT_SQL``:
    rank, Chinese title, foreign title, movie link, image link, details,
    score, number of ratings, one-line summary.
    """
    titles = item.select(".title")
    cname = titles[0].get_text()
    # Always emit a foreign-title field so later columns never shift, even
    # when the page carries more than two ".title" spans.
    if len(titles) >= 2:
        fname = titles[1].get_text().replace("/", "").strip()
    else:
        fname = "无"

    bd_content = item.select("p")[0].get_text().strip().replace("\n", "").replace("...", "")
    bd = re.sub(r"\s{2,}|\xa0", " ", bd_content)

    # Fall back to "0" instead of raising IndexError when the count is absent.
    match = COMMENT_NUM_RE.search(str(item))
    comment_num = match.group(1) if match else "0"

    inq = item.select(".inq")
    summary = inq[0].get_text().strip() if inq else "无"

    return [
        item.em.get_text(),
        cname,
        fname,
        item.select("a")[0]["href"],
        item.select("img")[0]["src"],
        bd,
        item.select(".rating_num")[0].get_text(),
        comment_num,
        summary,
    ]


def getData(baseurl):
    """Download the ten list pages under *baseurl* and parse every movie.

    :param baseurl: URL prefix; the start offset (0, 25, ..., 225) is appended.
    :return: list of 9-field movie records (see ``_parse_item``). Pages that
        fail to download contribute no records.
    """
    all_movie = []
    for page in range(10):
        html = askURL(baseurl + str(page * 25))  # source of one list page
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):
            all_movie.append(_parse_item(item))
    return all_movie


def saveData2xls(data, saveURL):
    """Write the movie records in *data* to an .xls workbook at *saveURL*.

    Writes however many records were actually scraped rather than assuming
    exactly 250, so a partial scrape no longer raises IndexError.
    """
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet("豆瓣评分top250", cell_overwrite_ok=True)
    for col, title in enumerate(HEAD_COL):
        sheet.write(0, col, title)
    for row, movie in enumerate(data, start=1):
        for col, value in enumerate(movie[:len(HEAD_COL)]):
            sheet.write(row, col, value)
    book.save(saveURL)
    # Report success only after the file has really been written.
    print("保存完成!")


def saveData2sql(data):
    """Insert the movie records in *data* into the MySQL table ``douban``.

    Values are passed as query parameters, so quotes inside titles or
    summaries cannot break or inject into the statement. The caller's
    records are not modified.
    """
    rows = []
    for movie in data:
        row = list(movie)
        row[0] = int(row[0])    # rank -> integer column `rate`
        row[6] = float(row[6])  # score -> float column `score`
        rows.append(tuple(row))

    conn = pymysql.connect(host="localhost", port=3306, user='root', password="1212", database="spider")
    try:
        with conn.cursor() as cursor:
            cursor.executemany(INSERT_SQL, rows)
        conn.commit()
    finally:
        # Release the connection even when an insert fails.
        conn.close()


if __name__ == '__main__':
    baseurl = "https://movie.douban.com/top250?start="
    saveURL = "豆瓣评分top250.xls"

    all_movie = getData(baseurl)
    # To save to an .xls file instead, call: saveData2xls(all_movie, saveURL)
    # Save to the database.
    saveData2sql(all_movie)
数据库结构文件:
-- Table `douban`: one row per scraped movie.
-- `id` is the auto-increment primary key; `rate` is the column the crawler's
-- INSERT fills with the chart rank, and `score` holds the rating value.
-- Column order is significant: the Django view runs `select *` and the
-- template reads each row by position (1 = rate ... 9 = descc).
-- NOTE(review): AUTO_INCREMENT=251 makes a freshly created table start ids at
-- 251; presumably left over from a dump taken after 250 rows existed — confirm.
CREATE TABLE `douban` ( `id` int NOT NULL AUTO_INCREMENT, `rate` int NOT NULL, `img_link` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL, `link` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL, `ctitle` varchar(255) COLLATE utf8_bin NOT NULL, `ftitle` varchar(255) COLLATE utf8_bin DEFAULT NULL, `bd` varchar(255) COLLATE utf8_bin DEFAULT NULL, `score` float(10,2) NOT NULL, `comment_num` varchar(255) COLLATE utf8_bin DEFAULT NULL, `descc` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL, PRIMARY KEY (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=251 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
使用Django和MySQL把数据在网页上进行展示(未使用ORM):
html代码(index.html):
<!DOCTYPE html>
{# Renders one page of the movie table followed by the pagination bar. #}
{# Context variables: `list` (row tuples for the current page) and `page_list` (HTML fragments for the pager). #}
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    {# NOTE(review): presumably suppresses the Referer header so the externally hosted cover images load — confirm. #}
    <meta name="referrer" content="no-referrer">
    <link rel="stylesheet" href="/static/bootstrap/css/bootstrap.min.css">
    <link rel="stylesheet" href="/static/fontAwesome/css/font-awesome.min.css">
    <style>
        /* Center table text both horizontally and vertically */
        .table th, .table td {
            text-align: center;
            vertical-align: middle!important;
        }
    </style>
</head>
<body>
<div class="" style="margin: 50px 150px 0 150px">
    <div class="row">
        <table class="table table-bordered table-striped">
            <tr>
                <th>排名</th>
                <th>封面</th>
                <th>中文名</th>
                <th>外国名</th>
                <th>导演</th>
                <th>评分</th>
                <th>评论数</th>
                <th>概要</th>
            </tr>
            {# Rows are read by position. With `select *` on the `douban` DDL this is 1=rate, 2=img_link, 3=link, 4=ctitle, 5=ftitle, 6=bd, 7=score, 8=comment_num, 9=descc — TODO confirm the live table keeps that column order. #}
            {% for moive in list %}
                <tr>
                    <td>{{ moive.1 }}</td>
                    <td><img src="{{ moive.2 }}" alt="图片" style="width: 60px;height: 80px"></td>
                    <td><a href="{{ moive.3 }}">{{ moive.4 }}</a></td>
                    <td>{{ moive.5 }}</td>
                    <td style="width: 350px">{{ moive.6 }}</td>
                    <td style="width: 50px">{{ moive.7 }}</td>
                    <td style="padding: 0 20px;">{{ moive.8 }}</td>
                    <td>{{ moive.9 }}</td>
                </tr>
            {% endfor %}
        </table>
    </div>
</div>
<div class="text-center page" style="margin-bottom: 30px">
    {# Each entry is a ready-made HTML fragment, so `safe` turns off auto-escaping for it. #}
    {% for page in page_list %}
        {{ page|safe }}
    {% endfor %}
</div>
<script src="/static/jquery-3.5.1.min.js"></script>
<script src="/static/bootstrap/js/bootstrap.min.js"></script>
</body>
</html>
视图函数(views.py):
from django.shortcuts import render, HttpResponse
import pymysql
import myPage


# Create your views here.
def index(request):
    """Render one page of the ``douban`` table together with its pager.

    The requested page comes from the ``page`` query parameter; validation
    and clamping are delegated to ``myPage.Page``. Rows are passed to the
    template as plain tuples under the key ``list`` and the pager's HTML
    fragments under ``page_list``.
    """
    now_page = request.GET.get("page")
    conn = pymysql.connect(host="localhost", port=3306, user='root', password="1212", database="spider")
    try:
        with conn.cursor() as cursor:
            # Ask the database for the real row count instead of assuming 250.
            cursor.execute("select count(*) from douban")
            total = cursor.fetchone()[0]
            page_obj = myPage.Page(now_page, total, "index", 25, 7)
            # Fetch only the rows of the requested page. The explicit ORDER BY
            # keeps LIMIT/OFFSET paging stable between requests.
            cursor.execute(
                "select * from douban order by id limit %s offset %s",
                (page_obj.per_num, page_obj.start),
            )
            movies = list(cursor.fetchall())
    finally:
        # The original never closed the connection, leaking one per request.
        conn.close()
    return render(request, "index.html", {'list': movies, "page_list": page_obj.get_page()})
把得到的数据分页展示(myPage.py):
class Page(object):
    """Compute slice bounds and Bootstrap pagination markup for a paged list."""

    def __init__(self, now_page, tol_count, base_url, per_num=10, max_page=11):
        """
        :param now_page: requested page number; anything that cannot be
            converted to an int (e.g. ``None`` or ``"abc"``) falls back to 1
        :param tol_count: total number of records
        :param base_url: path segment used in links ("/<base_url>/?page=N")
        :param per_num: number of records per page
        :param max_page: maximum number of page boxes to display
        """
        # Total page count: a partially filled last page still counts.
        tol_page, remainder = divmod(tol_count, per_num)
        if remainder:
            tol_page += 1
        self.tol_page = tol_page

        # Current page: only conversion failures are handled, so unrelated
        # errors are no longer silently swallowed by a bare ``except``.
        try:
            now_page = int(now_page)
        except (TypeError, ValueError):
            now_page = 1
        # Clamp into [1, tol_page]; the lower bound wins when tol_page is 0.
        self.now_page = max(1, min(now_page, tol_page))

        # Base path used by the generated links.
        self.base_url = base_url
        # Records per page.
        self.per_num = per_num
        # Maximum number of page boxes shown at once.
        self.max_page = max_page
        # Number of neighbours shown on each side of the current page.
        self.half_page = min(max_page // 2, tol_page // 2)

    @property
    def start(self):
        """Index of the first record on the current page (0-based)."""
        return (self.now_page - 1) * self.per_num

    @property
    def end(self):
        """Index one past the last record on the current page."""
        return self.now_page * self.per_num

    def get_page(self):
        """Return the pager as a list of HTML fragments, in display order.

        The list holds the opening ``<nav>``, the "previous" arrow, the
        first-page link, the numbered pages of the current window, the
        last-page link (only when there is more than one page), the "next"
        arrow and the closing tags.
        """
        url = self.base_url
        now, total, half = self.now_page, self.tol_page, self.half_page

        # Window of numbered pages: pinned to the end, centred on the
        # current page, or pinned to the start.
        if now > total - half:
            first, last = total - self.max_page + 1, total
        elif now > half:
            first, last = now - half, now + half
        else:
            first, last = 1, self.max_page

        pages = ['<nav aria-label="Page navigation"><ul class="pagination">']

        # Previous page
        if now > 1:
            pages.append(
                '<li class="dote"><a href="/{}/?page={}"><span aria-hidden="true">«</span></a></li>'.format(url, now - 1))
        else:
            pages.append(
                '<li class="dote disabled"><a style="background-color: #dddddd"><span aria-hidden="true">«</span></a></li>')

        # First page, followed by an ellipsis when the window has moved away
        # from the start.
        if now == 1:
            ret = '<li class="first_page active"><a href="/{}/?page=1">首页</a></li>'.format(url)
        else:
            ret = '<li class="first_page"><a href="/{}/?page=1">首页</a></li>'.format(url)
            if now > half + 1 and total > self.max_page:
                ret += '<li class="dote"><a>···</a></li>'
        pages.append(ret)

        # Numbered pages. Page 1 and the last page have their own links, so
        # the range is clamped to 2..total-1 instead of appending empty
        # strings for the pages that are filtered out.
        for i in range(max(first, 2), min(last, total - 1) + 1):
            if i == now:
                pages.append('<li class="active"><a href="/{0}/?page={1}">{1}</a></li>'.format(url, i))
            else:
                pages.append('<li><a href="/{0}/?page={1}">{1}</a></li>'.format(url, i))

        # Last page, preceded by an ellipsis when pages are hidden before it.
        if total > 1:
            if now == total:
                ret = '<li class="tail_page active"><a href="/{}/?page={}">尾页</a></li>'.format(url, total)
            else:
                ret = '<li class="tail_page"><a href="/{}/?page={}">尾页</a></li>'.format(url, total)
                if now <= total - half - 1 and total > self.max_page:
                    # The original attribute nested double quotes and used
                    # `style.background-color`, which is not valid JavaScript.
                    ret = '<li class="dote"><a onmouseover="this.style.backgroundColor=\'white\'">···</a></li>' + ret
            pages.append(ret)

        # Next page
        if now < total:
            pages.append(
                '<li class="dote"><a href="/{}/?page={}"><span aria-hidden="true">»</span></a></li>'.format(url, now + 1))
        else:
            pages.append(
                '<li class="dote disabled"><a style="background-color: #dddddd"><span aria-hidden="true">»</span></a></li>')

        pages.append('</ul></nav>')
        return pages
路由(urls.py):
from django.contrib import admin
from django.urls import path, re_path
from spider import views

# URL routes: the Django admin plus the paginated movie table.
# `re_path` replaces `django.conf.urls.url`, which was deprecated in
# Django 3.1 and removed in 4.0; the regular expression is unchanged.
urlpatterns = [
    path('admin/', admin.site.urls),
    re_path(r'^index/', views.index),
]
浙公网安备 33010602011771号