python 图片爬虫

2020 年跟着千锋的网络安全课程写的,距今已经是 4 年前的代码了。

核心思路:先请求页面链接拿到响应内容,用正则表达式从中解析出图片链接,再对每个图片链接调用同一个请求函数,把返回的二进制数据写入文件即可。

1 美女图片(WEB 链接目前不可用)

import urllib.request
import re

class GetHtml:
    """Fetch a web page and save the images it links to.

    The page at ``URL`` is downloaded, scanned for image links matching
    ``IMAGE_PATTERN``, and every matching image is written to the current
    working directory as ``1.jpg``, ``2.jpg``, ... in order of appearance.

    Args:
        URL: Address of the page to scan for image links.
        HEADER: Value sent as the ``User-Agent`` request header.
    """

    # Image links to look for in the raw page bytes. The dots in the host
    # name and before the file extension are escaped so they only match a
    # literal "." instead of any character.
    IMAGE_PATTERN = re.compile(
        rb"https://pit1\.maozhew\.com/forum/.{32}\.(?:jpg|jpeg|png)"
    )

    def __init__(self, URL, HEADER):
        self.url = URL
        self.header = HEADER

    def get_index(self, url=None):
        """Return the raw response body of ``url`` as bytes.

        Args:
            url: Address to request. Defaults to the page URL given to the
                constructor, so existing zero-argument calls keep working.

        Returns:
            The complete response body.

        Raises:
            urllib.error.URLError: Propagated from ``urlopen`` on failure.
        """
        target = self.url if url is None else url
        self.request = urllib.request.Request(target)
        self.request.add_header("User-Agent", self.header)
        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(self.request) as response:
            self.response = response
            return response.read()

    def get_list(self):
        """Return the image links found in the page, as ``str`` values."""
        self.imglist = self.IMAGE_PATTERN.findall(self.get_index())
        self.stringList = [link.decode("utf8") for link in self.imglist]
        return self.stringList

    def get_image(self):
        """Download every linked image into the current directory.

        Files are named by a 1-based counter and always get a ``.jpg``
        suffix, regardless of the extension in the link (kept as-is for
        backward compatibility with existing output names).
        """
        for num, link in enumerate(self.get_list(), start=1):
            # Pass the link explicitly instead of overwriting self.url, so
            # the instance still points at the page afterwards. Download
            # before opening the file so a failed request does not leave an
            # empty file behind.
            data = self.get_index(link)
            with open(str(num) + ".jpg", "wb") as f:
                f.write(data)

# Page to scan, plus the desktop Chrome User-Agent string passed as HEADER.
html = GetHtml(
    "https://www.topit.pro/",
    "Mozilla/5.0 (Windows NT 10.0; Win64; "
    "x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
)

# Save every image found on the page as 1.jpg, 2.jpg, ... in the current directory.
html.get_image()

2 国家地理

import urllib.request
import re

class GetHtml:
    """Fetch a web page and save the images it links to.

    The page at ``URL`` is downloaded, scanned for image links matching
    ``IMAGE_PATTERN``, and every matching image is written to the current
    working directory as ``1.jpg``, ``2.jpg``, ... in order of appearance.

    Args:
        URL: Address of the page to scan for image links.
        HEADER: Value sent as the ``User-Agent`` request header.
    """

    # Image links to look for in the raw page bytes. The dots in the host
    # name and before the file extension are escaped so they only match a
    # literal "." instead of any character.
    IMAGE_PATTERN = re.compile(
        rb"http://5b0988e595225\.cdn\.sohucs\.com/images/.{41}\.(?:jpg|jpeg|png)"
    )

    def __init__(self, URL, HEADER):
        self.url = URL
        self.header = HEADER

    def get_index(self, url=None):
        """Return the raw response body of ``url`` as bytes.

        Args:
            url: Address to request. Defaults to the page URL given to the
                constructor, so existing zero-argument calls keep working.

        Returns:
            The complete response body.

        Raises:
            urllib.error.URLError: Propagated from ``urlopen`` on failure.
        """
        target = self.url if url is None else url
        self.request = urllib.request.Request(target)
        self.request.add_header("User-Agent", self.header)
        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(self.request) as response:
            self.response = response
            return response.read()

    def get_list(self):
        """Return the image links found in the page, as ``str`` values."""
        self.imglist = self.IMAGE_PATTERN.findall(self.get_index())
        self.stringList = [link.decode("utf8") for link in self.imglist]
        return self.stringList

    def get_image(self):
        """Download every linked image into the current directory.

        Files are named by a 1-based counter and always get a ``.jpg``
        suffix, regardless of the extension in the link (kept as-is for
        backward compatibility with existing output names).
        """
        for num, link in enumerate(self.get_list(), start=1):
            # Pass the link explicitly instead of overwriting self.url, so
            # the instance still points at the page afterwards. Download
            # before opening the file so a failed request does not leave an
            # empty file behind.
            data = self.get_index(link)
            with open(str(num) + ".jpg", "wb") as f:
                f.write(data)

# Article to scan, plus the desktop Chrome User-Agent string passed as HEADER.
html = GetHtml(
    "https://www.sohu.com/a/255373346_721493",
    "Mozilla/5.0 (Windows NT 10.0; Win64; "
    "x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
)

# Save every image found in the article as 1.jpg, 2.jpg, ... in the current directory.
html.get_image()

posted @ 2024-03-08 22:00  Dba_sys  阅读(21)  评论(0)    收藏  举报