【实例】爬虫:下载图片

#coding:utf8 

import urllib2
import re 
import os
import urllib

# Download the images of every gallery linked from the first list page of
# pic.yxdown.com. Images of the n-th successfully processed gallery are saved
# into a directory named str(n); every title, gallery path and image URL is
# also written to down_url.txt.

# The patterns never change, so compile them once instead of per iteration.
# One list entry: <div class="cbmiddle"> ... </div>
div_re = re.compile(r'<div class="cbmiddle">(.*?)</div>', re.S | re.M)
# Entry title: <b class="imgname"> ... </b>
title_re = re.compile(r'<b class="imgname">(.*?)</b>')
# Gallery link: <a target="_blank" href="/html/7018.html" class="proimg">
url_a_re = re.compile(r'<a target="_blank" href="(.*?)" class="proimg">')
# Inline scripts of a gallery page.
script_re = re.compile(r'<script>(.*?)</script>', re.S | re.M)
# Inside those scripts the full-size image URL appears as "original":"<url>".
ori_re = re.compile(r'"original":"(.*?)"')

# record all url; the with-block guarantees the log is flushed and closed
# even if a request fails half-way through.
with open("down_url.txt", "w") as fileurl:
    fileurl.write("************start**************")

    # origin page youxunnet
    temp = "http://pic.yxdown.com/list/0_0_1.html"
    content = urllib2.urlopen(temp).read()

    # Keep a copy of the list page. Binary mode, because the response body
    # is raw bytes and must not go through newline translation.
    with open("down_1.html", "wb") as page_file:
        page_file.write(content)

    count = 1
    m_div = div_re.findall(content)
    print(len(m_div))
    for line in m_div:
        # Entries without a title are skipped.
        titles = title_re.findall(line)
        if not titles:
            continue
        title = titles[0]
        fileurl.write(title + '\n')

        # Entries without a gallery link are skipped. (The original code
        # tested the title here by mistake and then indexed an empty list.)
        links = url_a_re.findall(line)
        if not links:
            continue
        url_a = links[0]
        # Only site-relative links are followed; startswith() is also safe
        # for an empty href, unlike indexing url_a[0].
        if not url_a.startswith('/'):
            continue
        fileurl.write(url_a + '\n')

        # Fetch the gallery page the entry links to.
        print(url_a)
        html_url = 'http://pic.yxdown.com' + str(url_a)
        print(html_url)
        html_content = urllib2.urlopen(html_url).read()

        # Create the target directory only once the entry is known to be
        # usable. count is not advanced for skipped entries, so directory
        # numbers stay consecutive.
        if not os.path.exists(str(count)):
            os.mkdir(str(count))

        for script in script_re.findall(html_content):
            for ori in ori_re.findall(script):
                fileurl.write(ori + '\n')
                filename = os.path.basename(ori)
                print(ori)
                # official recommendation method to download picture
                urllib.urlretrieve(ori, str(count) + "/" + filename)
        count = count + 1

print("over")

posted @ 2017-07-25 09:58  brucemengbm  阅读(144)  评论(0)    收藏  举报