爬虫小试之一(抓取豆瓣电影)

工具

  Python 3.5

  BeautifulSoup

步骤:

  1、根据url抓取豆瓣电影html,并解析

  2、BeautifulSoup截取节点,写入字典

  3、保存字典信息

 

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import json

# Send a GET request and return the response body as text
def getHTMLText(url):
	"""Download ``url`` and return its decoded text.

	Args:
		url: Address of the page to fetch.

	Returns:
		The response body as a string, decoded with the encoding that
		requests detects from the content. An empty string is returned
		when the request fails or the server answers with an HTTP error
		status.
	"""
	try:
		r = requests.get(url, timeout=30)
		r.raise_for_status()
		# Decode with the encoding guessed from the body itself
		r.encoding = r.apparent_encoding
		return r.text
	except requests.RequestException:
		# Only request/HTTP failures map to ""; KeyboardInterrupt and
		# programming errors are no longer silently swallowed.
		return ""


def getMovieInfo(mlist, html):
	"""Append one dict per now-playing movie found in ``html`` to ``mlist``.

	Every ``<li class="list-item">`` whose ``data-category`` attribute is
	``'nowplaying'`` contributes a dict with the keys '电影名', '评分',
	'时长' and '主演', taken from the element's ``data-title``,
	``data-score``, ``data-duration`` and ``data-actors`` attributes.

	Args:
		mlist: List that is extended in place with the movie dicts.
		html: HTML source of the page to parse.

	Returns:
		None; results are delivered through ``mlist``.
	"""
	soup = BeautifulSoup(html, 'html.parser')  # parse the HTML text
	for item in soup.find_all('li', attrs={'class': 'list-item'}):
		# .get() so an <li> without data-category is skipped instead of
		# aborting the whole run with a KeyError
		if item.get('data-category') != 'nowplaying':
			continue
		# A missing data-* attribute is stored as an empty string
		mlist.append({
			'电影名': item.get('data-title', ''),
			'评分': item.get('data-score', ''),
			'时长': item.get('data-duration', ''),
			'主演': item.get('data-actors', ''),
		})

# Write the collected movie list to a text file
def saveMovieInfo(mlist, path):
	"""Write ``str(mlist)`` to ``path`` as UTF-8, replacing any existing file.

	Args:
		mlist: List of movie dicts to store.
		path: Destination file path.
	"""
	# The with-statement closes the file on exit; no explicit close() needed
	with open(path, 'w', encoding='utf-8') as f:
		f.write(str(mlist))


def main():
	"""Fetch the Shenzhen now-playing page, extract the movies and save them.

	Prints the length of the downloaded HTML. When the download fails
	(getHTMLText returns an empty string) nothing is parsed or written, so
	a previously saved file is not overwritten with an empty list.
	"""
	mlist = []
	url = 'https://movie.douban.com/cinema/nowplaying/shenzhen/'
	path = 'D://pachong//movie.txt'
	html = getHTMLText(url)
	print(len(html))
	if not html:
		# "" is the failure value of getHTMLText; stop before touching the file
		print('failed to fetch page, nothing saved')
		return
	getMovieInfo(mlist, html)
	print()
	saveMovieInfo(mlist, path)

# Run the scraper only when this file is executed as a script, not on import
if __name__ == '__main__':
	main()

  

posted @ 2017-05-02 15:21  君何在  阅读(1752)  评论(0)  编辑  收藏  举报