使用 BeautifulSoup 解析 HTML
#coding=utf-8
# NOTE: this is a Python 2 script (it relies on urllib2 and httplib).
import httplib
import socket
import urllib2

from bs4 import BeautifulSoup

# User-Agent header value sent with every request made by downloadPage.
UserAgent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'

# Errors that downloadPage prints and converts into an empty result.
_DOWNLOAD_ERRORS = (
    urllib2.HTTPError,
    urllib2.URLError,
    socket.error,
    httplib.BadStatusLine,
)


def downloadPage(url):
    """Fetch *url* and return the raw response body.

    The request carries the module-level ``UserAgent`` header and uses a
    30-second timeout.

    Returns the body as read from the response, or ``''`` when an HTTP,
    URL, socket or bad-status-line error occurs (the error is printed).
    """
    headers = {'User-Agent': UserAgent}
    try:
        opener = urllib2.build_opener()
        req = urllib2.Request(url=url, headers=headers)
        resp = opener.open(req, timeout=30)
        try:
            return resp.read()
        finally:
            # Close the response even if reading fails part-way.
            resp.close()
    except _DOWNLOAD_ERRORS as ex:
        print(ex)
        return ''


def _firstMatch(node, selector):
    """Return the first element under *node* matching *selector*, or None."""
    matches = node.select(selector)
    return matches[0] if matches else None


def _extractMovie(item):
    """Return ``(detail_url, image_url, title, rating)`` for one list entry.

    Returns None when any expected element or attribute is missing, so the
    caller can skip the entry instead of failing with IndexError/KeyError.
    """
    link = _firstMatch(item, 'div.item > div.pic a')
    image = _firstMatch(item, 'div.item > div.pic a img')
    title = _firstMatch(item, 'div.item > div.info > div.hd > a > span.title')
    rating = _firstMatch(item, 'div.item > div.info > div.bd > div.star > span.rating_num')
    # Compare with `is None` explicitly rather than relying on tag truthiness.
    if any(node is None for node in (link, image, title, rating)):
        return None

    detail_url = link.attrs.get('href')
    image_url = image.attrs.get('src')
    if detail_url is None or image_url is None:
        return None
    return detail_url, image_url, title.get_text(), rating.get_text()


def main():
    """Download the list page and print each entry's link, image, title and rating."""
    # Placeholder literal: replace it with the address of the Douban page.
    content = downloadPage("这填douban的地址")
    if not content:
        # Nothing was downloaded, so there is nothing to parse.
        return

    soup = BeautifulSoup(content, 'lxml')
    for item in soup.select('ol.grid_view li'):
        movie = _extractMovie(item)
        if movie is None:
            # Entry does not have the expected markup; skip it.
            continue
        # Printed in order: detail-page link, image link, title, rating.
        for field in movie:
            print(field)
        print('-------------------------------------------------------------------------')


if __name__ == '__main__':
    main()
浙公网安备 33010602011771号