大家好,这是我在园子的第一篇文章。以前经常潜水,在这里看到了许多大牛写的文章,计算机/编程行业,开放包容的心态,给我很大的感触及鼓舞。因此,开这个博客:
一是希望能找到些许聊得一起的朋友;
二是把自己学习过程中遇到的各种坑予以记录,其他小伙伴遇到同样问题时可以绕过;
三是在整理/写文章的过程中对自己的思维有一个梳理。
这是一篇关于Python爬虫的小文章,具体是抓取douban top 250,并将其生成excel文件,以便于一定程度的统计分析,较为粗糙,作为小练习,请大家轻拍!
开发环境:ubuntu16.04 python3.5。下面直接上代码:
# -*- coding: utf-8 -*-
# author: wh98898@163.com
# Configuration: Ubuntu 16.04 LTS, Python 3.5
"""Scrape the Douban Top 250 movie chart and export it to an Excel file.

Each list page is fetched with ``requests`` and parsed with BeautifulSoup;
the parsed records are kept in memory and written straight into an xlwt
workbook.  (The original text-file round trip was fragile: it relied on
every record occupying exactly 7 non-empty lines and on hard-coded row
indices for the few movies without a cast line.)
"""

import getpass
import time

import requests
from bs4 import BeautifulSoup
from xlwt import Workbook

BASE_URL = "https://movie.douban.com/top250"  # first page of the chart


def scrape_top250(start_url=BASE_URL):
    """Fetch every page of the chart and return a list of record dicts.

    Follows the "next page" link iteratively (the original recursed and
    mutated a global) until the last page, whose "next" span holds no <a>.
    """
    movies = []
    url = start_url
    while url:
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        for grid in soup.find_all("ol", "grid_view"):
            for entry in grid.find_all("li"):
                # A handful of movies have no one-liner quote; test for
                # the tag instead of catching a bare exception.
                quote_tag = entry.find("p", "quote")
                movies.append({
                    "id": str(entry.find("em").text),
                    "title": str(entry.find("span", "title").text),
                    "image": str(entry.img.attrs["src"]),
                    # Multi-line blurb: credits on one line, then
                    # "year / country / genre" on the next.
                    "description": str(entry.find("div", "bd").p.text),
                    "quote": str(quote_tag.text) if quote_tag else "暂无引用",
                })
        next_link = soup.find("span", attrs={"class": "next"}).find("a")
        url = start_url + next_link["href"] if next_link else None
    return movies


def _parse_credits(description):
    """Split a movie blurb into (director, actor, [year, country, genre]).

    Replaces the original hard-coded special cases (rows 21, 38, 68, 69,
    242) with an explicit check for the "主演" (cast) marker, so the code
    keeps working if Douban reorders the chart.
    """
    lines = [ln.strip() for ln in description.splitlines() if ln.strip()]
    credits = lines[0] if lines else ""
    release = lines[1] if len(lines) > 1 else ""
    if "主演" in credits:
        director, _, actor = credits.partition("主演")
        # [4:] drops the "导演: " prefix; strip trailing ellipses/slashes.
        director = director[4:].strip(".").strip("/").strip()
        actor = actor[1:].strip(".").strip("/").strip()
    else:
        # No cast listed (e.g. documentaries): keep the director only.
        director = credits[4:].strip()
        actor = "暂无"
    parts = [p.strip() for p in release.split("/")]
    while len(parts) < 3:  # tolerate blurbs missing a field
        parts.append("")
    return director, actor, parts[:3]


def to_excel(movies, filename="豆瓣.xls"):
    """Write the scraped records, plus a collector footer, to *filename*."""
    workbook = Workbook()
    sheet = workbook.add_sheet("豆瓣电影250")
    headers = ("sequence", "title", "link", "director", "actor",
               "time", "country", "style", "comment")
    for col, header in enumerate(headers):
        sheet.write(0, col, header)
    for row, movie in enumerate(movies, start=1):
        director, actor, (year, country, style) = _parse_credits(
            movie["description"])
        sheet.write(row, 0, movie["id"])
        sheet.write(row, 1, movie["title"])
        sheet.write(row, 2, movie["image"])
        sheet.write(row, 3, director)
        sheet.write(row, 4, actor)
        sheet.write(row, 5, year)
        sheet.write(row, 6, country)
        sheet.write(row, 7, style)
        sheet.write(row, 8, movie["quote"])
    # Footer: who collected the data and when, one row below the table.
    footer = len(movies) + 1
    sheet.write(footer, 5, "收集人:" + getpass.getuser())
    sheet.write(footer + 1, 5,
                "收集时间:" + time.strftime("%Y-%m-%d %H:%M:%S %A"))
    workbook.save(filename)


def main():
    print("开始收集信息")
    movies = scrape_top250()
    print("信息收集完毕")
    print("正在保存至excel---->")
    to_excel(movies)
    print("所需资料已保存至当前文件夹excel文件")


if __name__ == "__main__":
    main()
这个小程序的思路是,先利用 requests 和 bs4 获取所需数据,再将其保存为 Excel 文件。但在实际操作中发现,直接将所得数据保存为 Excel 有一定的难度(可能是能力有限)。因此,将获得的数据先保存为文本文档格式,然后导入 Excel,最后删除文本文档。由于数据体量较小,该方法对程序运行速度影响较小,但在大量数据时不建议这样做。
最后以园子的这句话结尾:Code changes the world。
------友天下士,读古今书!
浙公网安备 33010602011771号