1 from bs4 import BeautifulSoup
2 import requests
3 import html.parser
4 from openpyxl import Workbook,load_workbook
5 import os
class DouBan(object):
    """Scrape the movies linked from the Douban movie front page into a workbook.

    For every ``li.poster > a`` link found on the front page, the movie page is
    fetched and its title, info block and description are written to one row
    of an Excel workbook (columns 1, 2 and 3 respectively).
    """

    def __init__(self, workbook_path='move_msg.xlsx', timeout=10):
        """Set up the start URL, request headers and output settings.

        Args:
            workbook_path: Path of the ``.xlsx`` file rows are written to.
                Defaults to the file name the scraper has always used.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.url = 'https://movie.douban.com/'
        self.header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
        self.workbook_path = workbook_path
        self.timeout = timeout

    def openUrl(self, url):
        """Send a GET request for *url* with the configured headers.

        Returns:
            The ``requests`` response object.

        Raises:
            requests.RequestException: On connection failure or timeout.
        """
        # Without a timeout a stalled connection would block the run forever.
        return requests.get(url, headers=self.header, timeout=self.timeout)

    def getUrl(self):
        """Return the ``<a>`` tags matching ``li.poster > a`` on the start page."""
        response = self.openUrl(self.url)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.select("li.poster > a")

    def getMsg(self):
        """Fetch every linked movie page and store its details in the workbook.

        Each successfully parsed movie occupies the next row (starting at 1).
        Pages that cannot be fetched or that lack a title are reported on
        stdout and skipped instead of aborting the whole run.
        """
        row = 0
        for href in self.getUrl():
            link = href.get('href')
            if not link:
                continue
            print(link)
            try:
                response = self.openUrl(link)
            except requests.RequestException as err:
                print('skipped {0}: {1}'.format(link, err))
                continue
            msg_list = self._parseMovie(response.text)
            if msg_list is None:
                print('skipped {0}: unexpected page layout'.format(link))
                continue
            row += 1
            # One load/save cycle per movie instead of one per cell.
            self._writeCells(row, 1, msg_list)

    def saveMsg(self, row_, column_, msg):
        """Write *msg* into the cell at (*row_*, *column_*) and save the workbook.

        The workbook is created if it does not exist yet.
        """
        self._writeCells(row_, column_, [msg])

    def _parseMovie(self, page_html):
        """Extract ``[title, info, describe]`` from a movie page's HTML.

        Returns ``None`` when the ``div#content`` container or its ``h1``
        title is missing. A missing info block or description yields an
        empty string for that field.
        """
        soup = BeautifulSoup(page_html, 'html.parser')
        content = soup.select_one('div#content')
        if content is None:
            return None
        title = content.select_one('h1')
        if title is None:
            return None
        info = content.select_one('#info')
        describe = content.select_one('div#link-report span')
        return [
            title.text.replace('\n', ''),
            info.text if info is not None else '',
            describe.text.replace(' ', '') if describe is not None else '',
        ]

    def _writeCells(self, row_, start_column, values):
        """Write *values* into consecutive columns of *row_* and save the workbook."""
        # load_workbook fails on a missing file, so start a fresh workbook
        # on the first run.
        if os.path.exists(self.workbook_path):
            wb = load_workbook(self.workbook_path)
        else:
            wb = Workbook()
        sheet = wb.active
        for offset, value in enumerate(values):
            sheet.cell(row=row_, column=start_column + offset).value = value
        wb.save(self.workbook_path)
58
59
60
61
if __name__ == "__main__":
    # Script entry point: build the scraper and run one full scrape.
    DouBan().getMsg()