爬虫小试之一(抓取豆瓣电影)
工具
Python 3.5
requests
BeautifulSoup (bs4)
步骤:
1、根据url抓取豆瓣电影html,并解析
2、BeautifulSoup截取节点,写入字典
3、保存字典信息
# -*- coding: utf-8 -*-
import json
import os

import requests
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Send a GET request to *url* and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response text, or an empty string when the request
        fails (connection problem, timeout, or an HTTP error status
        reported by ``raise_for_status``).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode with the charset detected from the content itself rather
        # than relying only on the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best effort: only request-related failures are turned into "".
        # Anything else (KeyboardInterrupt, programming errors) propagates.
        return ""
def getMovieInfo(mlist, html):
    """Collect the now-playing movies found in *html* into *mlist*.

    Every ``<li class="list-item">`` element whose ``data-category``
    attribute equals ``nowplaying`` contributes one dict with the keys
    ``电影名``, ``评分``, ``时长`` and ``主演`` (title, score, duration and
    leading actors), read from the element's ``data-*`` attributes.

    Args:
        mlist: List that is extended in place with one dict per movie.
        html: Markup of the page to parse.

    Returns:
        None. Results are appended to *mlist*.
    """
    soup = BeautifulSoup(html, 'html.parser')  # parse the markup as HTML

    # Output key -> attribute on the <li> element that carries the value.
    fields = (
        ('电影名', 'data-title'),
        ('评分', 'data-score'),
        ('时长', 'data-duration'),
        ('主演', 'data-actors'),
    )

    for item in soup.find_all('li', attrs={'class': 'list-item'}):
        # .get() so that an item without data-category is skipped instead
        # of aborting the whole scrape with a KeyError.
        if item.attrs.get('data-category') != 'nowplaying':
            continue
        # A missing detail attribute is recorded as an empty string so one
        # incomplete entry does not discard the rest of the list.
        mlist.append({key: item.attrs.get(attr, '') for key, attr in fields})
# Write the collected movie list to a text file.
def saveMovieInfo(mlist, path):
    """Save *mlist* to the UTF-8 text file at *path*.

    The file is overwritten and contains ``str(mlist)``, i.e. the Python
    literal representation of the list. The parent directory of *path* is
    created first when it does not exist yet.

    Args:
        mlist: List of movie dicts to store.
        path: Destination file path.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    parent = os.path.dirname(path)
    if parent:
        # open(..., 'w') does not create missing directories by itself.
        os.makedirs(parent, exist_ok=True)
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(str(mlist))
def main():
    """Download the Douban now-playing page for Shenzhen and save its movies.

    Prints the length of the downloaded page, then an empty line, and
    writes the extracted movie list to the output file.
    """
    source_url = 'https://movie.douban.com/cinema/nowplaying/shenzhen/'
    target_file = 'D://pachong//movie.txt'

    page = getHTMLText(source_url)
    print(len(page))

    # getMovieInfo fills the list in place.
    movies = []
    getMovieInfo(movies, page)
    print()

    saveMovieInfo(movies, target_file)
# Run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

浙公网安备 33010602011771号