Python抓取电影来了(dianying.fm)
/*
title:Python抓取电影来了(dianying.fm)
blog:http://yxmhero1989.blog.163.com/blog/static/11215795620132441055127/
author:insun
*/
#!/usr/bin/env python
#-*- coding=utf-8 -*-
"""Crawl movie listings from movie.readself.com (formerly dianying.fm).

For each listing page the script fetches a JSON payload through an HTTP
proxy, extracts every movie entry with a regular expression, downloads the
movie's detail page, stores the parsed fields in the MongoDB collection
``test.all`` and saves the small and large posters under ``Allsmall/`` and
``Allbig/``.

Any network, parsing or database error propagates and aborts the crawl.
"""
import urllib2,urllib,re
import pymongo
import sys,os
import json
import chardet

# Python 2 idiom: make UTF-8 the implicit str<->unicode codec for the run.
reload(sys)
sys.setdefaultencoding('utf-8')

# NOTE(review): pymongo.Connection is the legacy client class; newer PyMongo
# releases provide MongoClient instead - confirm the installed version.
db = pymongo.Connection().test

for directory in ('Allsmall', 'Allbig'):
    if not os.path.exists(directory):
        os.mkdir(directory)

# Category pages used to look like http://dianying.fm/genre/%E7%88%B1%E6%83%85 ;
# after the site was reorganised they look like
# http://movie.readself.com/category/filter_play-genre_%E5%96%9C%E5%89%A7
# "Load more" (scrolling or clicking) is done by JS that appends a JSON
# fragment; the endpoint may answer {"result": -1, "error": "unknown error"}.
# One JSON page carries 28 movies: 29*28 = 812 > 500.
# 2013/2/27: the site appears to block IPs, so a proxy was added; the page
# layout changed as well, so two of the regexes were updated.
SITE_ROOT = 'http://movie.readself.com'
LISTING_URL = 'http://movie.readself.com/reflect/category/eyJzb3J0IjogIiIsICJmaWx0ZXIiOiAiIiwgImNhdGFsbCI6ICIiLCAia2V5IjogIiIsICJ5ZWFyIjogIiIsICJnZW5yZSI6ICIiLCAicmVnaW9uIjogIiIsICJzaG93IjogIiIsICJjbGFzcyI6ICIifQ==/'

# Alternative proxy setting: proxies = {'':'211.167.112.14:80'}
proxies = {'http':'http://219.239.26.23:80'}
opener = urllib.FancyURLopener(proxies)

# Listing entry: (detail path, title, small poster URL).
reg = re.compile(r'<div class="x-movie-detail">.+?<a class="z-movie-playmask" .+? href="(.*?)">.+?<img alt="(.*?)".+?src="(.*?)">.+?</a>',re.S)

# Download resources listed on a detail page.
# NOTE(review): the first group is a lazy "(.*?)" immediately followed by
# ".+?", so it appears to always capture an empty string - confirm intent.
magnetReg = re.compile(
    '.+?<tr class="resources".+?>.+?<td style="word-break: break-all;">(.*?).+?<span class="muted">(.*?)</span>.+?</td>'
    '.+?<a .+? rel="nofollow" href="(.*?)" .+? title="(.*?)">.+?</a>'
    '.+?<a rel="nofollow".+?href="(.*?)" target="_blank" .+?>.+?云点播.+?</a>'
    '.+?<a class="btn" style="padding:0px;" tclass=" ".+?>(.*?)</a>',
    re.S)

# Fixed fragments of the detail-page regex.  The optional fragments (IMDB
# link, alias, IMDB rating) are spliced in between them per page.
DETAIL_HEAD = (
    '<div class="x-m-poster">.+?<img .+? src="(.*?)">.+?</div>'
    '.+?<a rel="nofollow" href="(.*?)" target="_blank">豆瓣链接</a>')
DETAIL_FIELDS = (
    '.+?<div class="x-m-title">(.*?)<span class="muted">(.*?)</span>.+?</div>'
    '.+?<td class="span2"><span class="x-m-label">导演</span></td>.+?<td>(.*?)</td>'
    '.+?<td class="span2"><span class="x-m-label">主演</span></td>.+?<td>(.*?)</td>'
    '.+?<td class="span2"><span class="x-m-label">类型</span></td>.+?<td>(.*?)</td>'
    '.+?<td class="span2"><span class="x-m-label">地区</span></td>.+?<td>(.*?)</td>'
    '.+?<td class="span2"><span class="x-m-label">上映时间</span></td>.+?<td>(.*?)</td>'
    '.+?<td class="span2"><span class="x-m-label">片长</span></td>.+?<td>(.*?)</td>')
DETAIL_RATING = (
    '.+?<td class="span2"><span class="x-m-label">评分</span></td><td>'
    '.+?<span class="badge" style="color: green; font-weight: bold;">(.*?)</span>')
DETAIL_TAIL = '.+?<div class="x-m-summary">(.*?)</div>'

# Used when an optional section is absent: it still contributes one (empty)
# capture group, so the group indexes below stay fixed.
EMPTY_GROUP = '(.*?)'


def _fetch(page_url):
    """Return the body of *page_url* read through the proxied opener.

    The response object is always closed, even when reading fails.
    """
    response = opener.open(page_url)
    try:
        return response.read()
    finally:
        response.close()


for i in range(1,49):
    print("-----------第"+str(i)+"个链接开始抓取--------------")
    url = LISTING_URL+str(i)

    html = _fetch(url)
    # chardet.detect(html) reported {'confidence': 1.0, 'encoding': 'ascii'}.
    # Do not strip '\\n' from the payload here: doing so cut the matches per
    # page from 28 to 5, i.e. the listing regex depends on them being there.
    html = json.loads(html,encoding='utf-8')
    # The regex runs over the str() form of the decoded JSON, which is why
    # titles come back as literal \uXXXX escapes further down.
    html = str(html)

    groups = re.findall(reg,html)
    print("*****----本链接抓取了"+str(len(groups))+"个影片信息****----------")

    for detail in groups:
        # The title arrives as escape text such as
        # \u4e09\u50bb\u5927\u95f9\u5b9d\u83b1\u575e (the four hex digits
        # after \u are the character's Unicode code point); turn it into
        # UTF-8 bytes, e.g. \u8ba9\u5b50\u5f39\u98de becomes
        # \xe8\xae\xa9\xe5\xad\x90\xe5\xbc\xb9\xe9\xa3\x9e.
        shortTitle = detail[1]
        shortTitle = shortTitle.decode('unicode_escape').encode('utf-8')
        print(shortTitle + ' 下载中')

        smallImage = detail[2]
        detailUrl = SITE_ROOT+detail[0]
        text = _fetch(detailUrl)

        IMDB_stars_need = '.+?<span class="badge" style="color: orange; font-weight: bold;">(.*?)</span>'
        alias_need = '.+?<td class="span2"><span class="x-m-label">别名</span></td>.+?<td>(.*?)</td>'
        IMDB_link_need = '.+?<a rel="nofollow" href="(.*?)" target="_blank">IMDB链接</a>'
        # Pages lacking an optional section get a placeholder group instead.
        if '别名' not in text:
            alias_need = EMPTY_GROUP
        if 'IMDB:' not in text:
            IMDB_stars_need = EMPTY_GROUP
        if 'IMDB链接' not in text:
            IMDB_link_need = EMPTY_GROUP

        detailReg = re.compile(
            DETAIL_HEAD +
            IMDB_link_need +
            DETAIL_FIELDS +
            alias_need +
            DETAIL_RATING +
            IMDB_stars_need +
            DETAIL_TAIL,
            re.S)
        contents = re.findall(detailReg,text)
        magnetAll = re.findall(magnetReg,text)

        for content in contents:
            bigImage = content[0]
            values = dict(
                shortTitle = shortTitle,
                smallImage = smallImage,
                bigImage = bigImage,
                douban_link = content[1],
                imdb_link = content[2],
                title = content[3],
                year = content[4],
                director = content[5],
                actors = content[6],
                genre = content[7],
                area = content[8],
                film_date = content[9],
                duration = content[10],
                alias = content[11],
                douban_rate = content[12],
                imdb_rate = content[13],
                summary = content[14],
                magnetAll = magnetAll,
            )
            db.all.save(values)

            # Poster URLs end in an id with no extension, e.g.
            # small: http://img2.static.dianying.fm/poster/m/5071cc6d90d7a90a416a596d
            # big:   http://img1.static.dianying.fm/poster/l/5071cc6d90d7a90a416a596d
            # so the local file name is that id plus ".jpg".
            smallSave = smallImage[smallImage.rindex('/')+1:]+".jpg"
            bigSave = bigImage[bigImage.rindex('/')+1:]+".jpg"
            # NOTE(review): urlretrieve goes through urllib's module-level
            # opener, not the proxied `opener` above - confirm this is wanted.
            urllib.urlretrieve(smallImage,'Allsmall/'+smallSave)
            urllib.urlretrieve(bigImage,'Allbig/'+bigSave)
            print(shortTitle+'Save Success!')
刚学python不久,转了学习一下
浙公网安备 33010602011771号