Python crawler: downloading images

import os                                  # OS module, used here to create directories
from urllib.request import urlretrieve     # download the file behind a URL
from urllib.request import urlopen         # open a URL and fetch the page source
from bs4 import BeautifulSoup              # BeautifulSoup, for parsing the page source

downloadDirectory = "downloaded"           # save files under a folder named "downloaded"
baseUrl = "http://pythonscraping.com"

# Convert any link into an absolute URL: clean it up and normalize it
def getAbsoluteURL(baseUrl, source):
    if source.startswith("http://www."):
        url = "http://"+source[11:]
    elif source.startswith("http://"):
        url = source
    elif source.startswith("www."):
        url = "http://"+source[4:]
    else:
        url = baseUrl+"/"+source
    if baseUrl not in url:
        return None
    return url
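
# A few illustrative calls (sample values assumed for illustration, not from the post),
# given baseUrl = "http://pythonscraping.com":
#   getAbsoluteURL(baseUrl, "http://www.pythonscraping.com/img/logo.png")
#       -> "http://pythonscraping.com/img/logo.png"
#   getAbsoluteURL(baseUrl, "img/logo.png")
#       -> "http://pythonscraping.com/img/logo.png"
#   getAbsoluteURL(baseUrl, "http://example.com/pic.png")
#       -> None (external link, filtered out)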

# Build the local path for a downloaded file, creating the target folder if needed
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl, "")
    path = downloadDirectory+path
    directory = os.path.dirname(path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    return path
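
# An illustrative mapping (sample values assumed for illustration, not from the post):
#   getDownloadPath("http://pythonscraping.com",
#                   "http://pythonscraping.com/img/logo.png",
#                   "downloaded")
#   strips the base URL, prepends the download folder, creates "downloaded/img"
#   if it does not exist, and returns "downloaded/img/logo.png".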

html = urlopen("http://www.pythonscraping.com")
bsObj = BeautifulSoup(html, "html.parser")   # name a parser explicitly to avoid the BeautifulSoup warning
downloadList = bsObj.findAll(src=True)       # collect every tag that carries a src attribute
#print(downloadList)

# Convert each link in the list to an absolute URL and download it
for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download["src"])
    if fileUrl is not None:
        print(fileUrl)
        urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
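
Running the script prints each absolute URL it finds and saves the file under the downloaded/ folder, mirroring the path structure of the site. As a sketch (assuming some src links may point to resources that no longer exist), a more defensive version of the download loop could wrap urlretrieve in a try/except so that one broken link does not abort the whole run:

from urllib.error import HTTPError, URLError

for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download["src"])
    if fileUrl is not None:
        print(fileUrl)
        try:
            # skip files that fail to download instead of stopping the crawl
            urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
        except (HTTPError, URLError) as e:
            print("Skipping", fileUrl, "-", e)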