使用Python抓取网页图片
今天写了一个Python小程序，用来抓取网页图片。

import logging
import os
import time

import win32com.client
import win32file
import win32inet
 class ImgDownloader:
class ImgDownloader:
    """Copy the images of a web page out of the Internet Explorer cache.

    The page is opened in an Internet Explorer instance driven through COM
    (``win32com``).  ``start`` looks up the URL of every ``<img>`` element
    in the URL cache and copies the cached file into the target directory.
    Call ``close`` when finished to quit the browser.
    """

    # Numeric value of READYSTATE_COMPLETE in IE's READYSTATE enumeration.
    _READYSTATE_COMPLETE = 4

    def __init__(self, url, dir):
        """Start Internet Explorer, open *url* and wait for it to load.

        Args:
            url: Address of the page whose images should be collected.
            dir: Directory that receives the copied files; it is not
                created here.  (The name shadows the builtin but is kept
                because it is part of the public signature.)
        """
        self.__dir = dir
        self.__ie = win32com.client.Dispatch('InternetExplorer.Application')
        try:
            self.__ie.Navigate(url)
            self.__wait__()
        except BaseException:
            # Do not leave an orphaned browser process behind when the
            # initial navigation fails or is interrupted.
            self.__ie.Quit()
            raise

    def __wait__(self):
        """Block until the browser reports that the page has loaded."""
        # Busy on its own may already be False before the document is
        # complete, so the ready state is polled as well.
        while (self.__ie.Busy
               or self.__ie.ReadyState != self._READYSTATE_COMPLETE):
            time.sleep(0.1)

    @staticmethod
    def _cached_copy_name(index, cache_path):
        """Return the destination file name for one cached image.

        The result is the last backslash-separated component of
        *cache_path*, prefixed with ``[index]``.
        """
        return ('[%d]' % index) + cache_path.rsplit('\\', 1)[-1]

    def start(self):
        """Copy each cached image of the loaded page to the target directory.

        An image without a cache entry is skipped silently.  An image
        whose lookup or copy raises is skipped with a logged warning, so a
        single failure does not abort the run.
        """
        self.__wait__()
        imgs = self.__ie.Document.getElementsByTagName('img')
        log = logging.getLogger(__name__)

        for i in range(imgs.length):
            try:
                cache_info = win32inet.GetUrlCacheEntryInfo(imgs[i].src)
                if not cache_info:
                    continue
                path = cache_info['LocalFileName']
                target = os.path.join(self.__dir,
                                      self._cached_copy_name(i, path))
                # The final True is bFailIfExists: never overwrite a file.
                win32file.CopyFile(path, target, True)
            except Exception as exc:
                # Best effort per image, but unlike a bare ``except`` this
                # lets KeyboardInterrupt and SystemExit propagate.
                log.warning('Skipping image %d: %s', i, exc)

    def close(self):
        """Quit the Internet Explorer instance started by the constructor."""
        self.__ie.Quit()

32
 if __name__=='__main__':
if __name__ == '__main__':
    # Demo run: copy the images of a Baidu image-search result page
    # into c:\temp\.
    d = ImgDownloader('http://image.baidu.com/i?ct=201326592&cl=2&lm=-1&tn=baiduimage&pv=&word=boy&z=0', 'c:\\temp\\')
    try:
        d.start()
    finally:
        # Always quit the browser, even when start() raises, so no hidden
        # Internet Explorer process is left running.
        d.close()
原理：在Python中通过COM接口运行IE浏览器并打开网页，获取网页中所有图片的URL，最后利用Win32 API函数GetUrlCacheEntryInfo（通过win32inet模块调用）找出每张图片对应的本地缓存文件，并将其复制到指定目录。
 
                    
                
 
                
            
         浙公网安备 33010602011771号
浙公网安备 33010602011771号