大家好,这是我在园子的第一篇文章。以前经常潜水,在这里看到了许多大牛写的文章,计算机/编程行业,开放包容的心态,给我很大的感触及鼓舞。因此,开这个博客:
一是希望能找到些许聊得一起的朋友;
二是把自己学习过程中遇到的各种坑予以记录,其他小伙伴遇到同样问题时可以绕过;
三是在整理/写文章的过程中对自己的思维有一个梳理。
这是一篇关于Python爬虫的小文章,具体是抓取douban top 250,并将其生成excel文件,以便于一定程度的统计分析,较为粗糙,作为小练习,请大家轻拍!
开发环境:ubuntu16.04 python3.5。下面直接上代码:
# -*- coding: utf-8 -*-
# author: wh98898@163.com
# Configuration: Ubuntu 16.04 LTS, Python 3.5
"""Scrape the Douban Top 250 movie chart and export it to an Excel file.

Each list page is fetched with ``requests`` and parsed with BeautifulSoup;
the parsed records are kept in memory and written straight into an xlwt
workbook.  (The original text-file round trip was fragile: it relied on
every record occupying exactly 7 non-empty lines and on hard-coded row
indices for the few movies without a cast line.)
"""

import getpass
import time

import requests
from bs4 import BeautifulSoup
from xlwt import Workbook

BASE_URL = "https://movie.douban.com/top250"  # first page of the chart


def scrape_top250(start_url=BASE_URL):
    """Fetch every page of the chart and return a list of record dicts.

    Follows the "next page" link iteratively (the original recursed and
    mutated a global) until the last page, whose "next" span holds no <a>.
    """
    movies = []
    url = start_url
    while url:
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        for grid in soup.find_all("ol", "grid_view"):
            for entry in grid.find_all("li"):
                # A handful of movies have no one-liner quote; test for
                # the tag instead of catching a bare exception.
                quote_tag = entry.find("p", "quote")
                movies.append({
                    "id": str(entry.find("em").text),
                    "title": str(entry.find("span", "title").text),
                    "image": str(entry.img.attrs["src"]),
                    # Multi-line blurb: credits on one line, then
                    # "year / country / genre" on the next.
                    "description": str(entry.find("div", "bd").p.text),
                    "quote": str(quote_tag.text) if quote_tag else "暂无引用",
                })
        next_link = soup.find("span", attrs={"class": "next"}).find("a")
        url = start_url + next_link["href"] if next_link else None
    return movies


def _parse_credits(description):
    """Split a movie blurb into (director, actor, [year, country, genre]).

    Replaces the original hard-coded special cases (rows 21, 38, 68, 69,
    242) with an explicit check for the "主演" (cast) marker, so the code
    keeps working if Douban reorders the chart.
    """
    lines = [ln.strip() for ln in description.splitlines() if ln.strip()]
    credits = lines[0] if lines else ""
    release = lines[1] if len(lines) > 1 else ""
    if "主演" in credits:
        director, _, actor = credits.partition("主演")
        # [4:] drops the "导演: " prefix; strip trailing ellipses/slashes.
        director = director[4:].strip(".").strip("/").strip()
        actor = actor[1:].strip(".").strip("/").strip()
    else:
        # No cast listed (e.g. documentaries): keep the director only.
        director = credits[4:].strip()
        actor = "暂无"
    parts = [p.strip() for p in release.split("/")]
    while len(parts) < 3:  # tolerate blurbs missing a field
        parts.append("")
    return director, actor, parts[:3]


def to_excel(movies, filename="豆瓣.xls"):
    """Write the scraped records, plus a collector footer, to *filename*."""
    workbook = Workbook()
    sheet = workbook.add_sheet("豆瓣电影250")
    headers = ("sequence", "title", "link", "director", "actor",
               "time", "country", "style", "comment")
    for col, header in enumerate(headers):
        sheet.write(0, col, header)
    for row, movie in enumerate(movies, start=1):
        director, actor, (year, country, style) = _parse_credits(
            movie["description"])
        sheet.write(row, 0, movie["id"])
        sheet.write(row, 1, movie["title"])
        sheet.write(row, 2, movie["image"])
        sheet.write(row, 3, director)
        sheet.write(row, 4, actor)
        sheet.write(row, 5, year)
        sheet.write(row, 6, country)
        sheet.write(row, 7, style)
        sheet.write(row, 8, movie["quote"])
    # Footer: who collected the data and when, one row below the table.
    footer = len(movies) + 1
    sheet.write(footer, 5, "收集人:" + getpass.getuser())
    sheet.write(footer + 1, 5,
                "收集时间:" + time.strftime("%Y-%m-%d %H:%M:%S %A"))
    workbook.save(filename)


def main():
    print("开始收集信息")
    movies = scrape_top250()
    print("信息收集完毕")
    print("正在保存至excel---->")
    to_excel(movies)
    print("所需资料已保存至当前文件夹excel文件")


if __name__ == "__main__":
    main()
这个小程序的思路是,先利用 requests 和 bs4 获取所需数据,再将其保存为 Excel 文件。但在实际操作中发现,直接将所得数据保存为 Excel 有一定的难度(可能是能力有限)。因此,将获得的数据先保存为文本文档格式,然后导入 Excel,最后删除文本文档。由于数据体量较小,该方法对程序运行速度影响较小,但在大量数据时不建议这样做。
最后以园子的这句话结尾:Code changes the world。
------友天下士,读古今书!
浙公网安备 33010602011771号