【实例】爬虫：下载图片

#coding:utf8 

import urllib2
import re 
import os
import urllib

# Crawl the first listing page of pic.yxdown.com. For every gallery entry on
# that page: log its title, its detail-page path and every original-size image
# URL to down_url.txt, and download the images into a numbered directory
# ("1", "2", ...) under the current working directory.

# The patterns are loop-invariant, so they are compiled once up front.
# Listing page: each gallery entry lives in <div class="cbmiddle">...</div>.
DIV_PAT = re.compile(r'<div class="cbmiddle">(.*?)</div>', re.S | re.M)
# Entry title, e.g. <b class="imgname">gallery title</b>
TITLE_PAT = re.compile(r'<b class="imgname">(.*?)</b>')
# Entry link, e.g. <a target="_blank" href="/html/7018.html" class="proimg">
# (the pattern must stay on one line: a line break inside the literal is a
# syntax error and would also prevent the match).
URL_A_PAT = re.compile(r'<a target="_blank" href="(.*?)" class="proimg">')
# Detail page: image URLs are embedded in inline <script> blocks as
# "original":"<url>" pairs.
SCRIPT_PAT = re.compile(r'<script>(.*?)</script>', re.S | re.M)
ORI_PAT = re.compile(r'"original":"(.*?)"')

# `with` guarantees the URL log is flushed and closed even if a download fails.
with open("down_url.txt", "w") as fileurl:
    # Trailing newline keeps the marker on its own line; every later record
    # is written one per line.
    fileurl.write("************start**************\n")

    # Origin listing page; a copy is saved locally for inspection.
    temp = "http://pic.yxdown.com/list/0_0_1.html"
    content = urllib2.urlopen(temp).read()
    with open("down_1.html", "w") as page_file:
        page_file.write(content)

    count = 1
    m_div = DIV_PAT.findall(content)
    # Single-argument print(...) prints the same text as the print statement.
    print(len(m_div))
    for line in m_div:
        # Entries without a title are skipped.
        title_match = TITLE_PAT.search(line)
        if title_match is None:
            continue
        title = title_match.group(1)
        fileurl.write(title + '\n')

        # Entries without a detail-page link are skipped. The check is on the
        # link match itself (not on the title), so a missing link can no
        # longer cause an IndexError.
        url_match = URL_A_PAT.search(line)
        if url_match is None:
            continue
        url_a = url_match.group(1)
        # Only site-relative paths are followed; startswith() is also safe
        # for an empty href, unlike indexing url_a[0].
        if not url_a.startswith('/'):
            continue
        fileurl.write(url_a + '\n')

        # Create the target directory only once the entry is known to be
        # usable, so skipped entries do not leave empty directories behind.
        target_dir = str(count)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)

        # Fetch the gallery's detail page.
        print(url_a)
        html_url = 'http://pic.yxdown.com' + str(url_a)
        print(html_url)
        html_content = urllib2.urlopen(html_url).read()

        for script in SCRIPT_PAT.findall(html_content):
            for ori in ORI_PAT.findall(script):
                fileurl.write(ori + '\n')
                filename = os.path.basename(ori)

                print(ori)
                # urlretrieve saves the resource straight to the given path.
                urllib.urlretrieve(ori, os.path.join(target_dir, filename))
        # The directory number advances only for entries actually processed.
        count = count + 1
print("over")
posted @ 2017-07-25 09:58 brucemengbm 阅读(144) 评论(0) 收藏举报
刷新页面返回顶部