Scrapy spider for harvesting free proxy IPs from several proxy-list sites. Four sites are implemented so far; add more yourself if you're up for it.
# -*- coding: utf-8 -*-
'''
A free-proxy harvester: it scrapes proxy IPs from several sites automatically.

Start it with:

    scrapy crawl daili -s JOBDIR=crawls/somespider-1

You can then stop the crawl safely at any time (Ctrl-C or a signal) and
resume it later with the exact same command.
'''
Update 2020/12/8: one of the sites went dead, so a replacement site was added.
import re

import scrapy
# from IpProxy.items import IpproxyItem


class MyspiderSpider(scrapy.Spider):
    name = "daili"
    start_urls = [
        'https://ip.jiangxianli.com/?page=1',
        'http://www.iphai.com/free/ng',
        'http://www.ip3366.net/free/?stype=1',
        'http://www.ip3366.net/free/?stype=2&page=2',
        'https://proxy.seofangfa.com/',
    ]

    def parse(self, response):
        # jiangxianli: crawl pages 1-8
        urls = ['https://ip.jiangxianli.com/?page=%s' % i for i in range(1, 9)]
        for url in urls:
            yield scrapy.Request(url, callback=self.jiangxianli)

        # iphai: single listing page (a plain string URL is all Request needs)
        yield scrapy.Request('http://www.iphai.com/free/ng', callback=self.iphai)

        # ip3366, high-anonymity list: crawl pages 1-7
        # (the original looped over stype=1..7 here, which looks like a typo;
        # paging within stype=1 is almost certainly what was intended)
        urls3 = ['http://www.ip3366.net/free/?stype=1&page=%s' % i for i in range(1, 8)]
        for url1 in urls3:
            yield scrapy.Request(url1, callback=self.ip3366)

        # ip3366, ordinary list: crawl pages 1-7
        urls4 = ['http://www.ip3366.net/free/?stype=2&page=%s' % i for i in range(1, 8)]
        for url2 in urls4:
            yield scrapy.Request(url2, callback=self.ip3366pro)

        # seofangfa: single listing page
        yield scrapy.Request('https://proxy.seofangfa.com/', callback=self.seo)

    def seo(self, response):
        IP = response.xpath('//*[@class="table"]/tbody[1]/tr/td[1]//text()').extract()
        port = response.xpath('//*[@class="table"]/tbody[1]/tr/td[2]//text()').extract()
        for i in range(len(IP)):
            # PROXY shows how a proxy URL is assembled; hook an availability
            # check in here if you want one.
            PROXY = "http://" + IP[i] + ":" + port[i]
            print('seo')
            yield {
                'IP': IP[i],
                'port': port[i],
            }

    def jiangxianli(self, response):
        IP = response.xpath('//*[@class="layui-table"]/tbody[1]/tr/td[1]//text()').extract()
        # IP = response.css(".layui-table td:nth-of-type(1)::text").extract()
        port = response.xpath('//*[@class="layui-table"]/tbody[1]/tr/td[2]//text()').extract()
        # port = response.css(".layui-table td:nth-of-type(2)::text").extract()
        for i in range(len(IP)):
            PROXY = "http://" + IP[i] + ":" + port[i]
            print('jiangxianli')
            yield {
                'IP': IP[i],
                'port': port[i],
            }

    def iphai(self, response):
        ip = response.xpath('//*[@class="table table-bordered table-striped table-hover"]//td[1]//text()').extract()
        PORT = response.xpath('//*[@class="table table-bordered table-striped table-hover"]//td[2]//text()').extract()
        IP = []
        for g in ip:
            g = re.findall(r'[^ \n]', g)  # strip spaces and newlines
            g = ''.join(g)
            IP.append(g)
        port = []
        for d in PORT:
            d = re.findall(r'[^ \n]', d)  # strip spaces and newlines
            d = ''.join(d)
            port.append(d)
        for i in range(len(IP)):
            PROXY = "http://" + IP[i] + ":" + port[i]
            print('iphai')
            yield {
                'IP': IP[i],
                'port': port[i],
            }

    def ip3366(self, response):
        IP = response.xpath('//*[@id="list"]/table/tbody/tr/td[1]//text()').extract()
        port = response.xpath('//*[@id="list"]/table/tbody/tr/td[2]//text()').extract()
        for i in range(len(IP)):
            PROXY = "http://" + IP[i] + ":" + port[i]
            print('ip3366')
            yield {
                'IP': IP[i],
                'port': port[i],
            }

    def ip3366pro(self, response):
        IP = response.xpath('//*[@id="list"]/table/tbody/tr/td[1]//text()').extract()
        port = response.xpath('//*[@id="list"]/table/tbody/tr/td[2]//text()').extract()
        for i in range(len(IP)):
            PROXY = "http://" + IP[i] + ":" + port[i]
            print('ip3366pro')
            yield {
                'IP': IP[i],
                'port': port[i],
            }


'''
Commented-out availability check, kept as a sketch; `pagetest`, `header`
and `import requests` would need to be supplied:

    proxies = {"http": PROXY}
    try:
        test = requests.get(pagetest, headers=header, proxies=proxies, timeout=5)
        if test.status_code == 200:  # the proxy reached the target page
            yield {
                'IP': IP[i],
                'port': port[i],
            }
    except Exception:
        print("connect failed!")
'''

'''
Leftover selectors for a site that has since gone offline, kept for
reference, together with an item-based version of the yield:

    status = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[3]/text()').extract()
    types = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[4]/text()').extract()
    support = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[5]/text()').extract()
    address = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[6]/text()').extract()
    speed = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[7]/text()').extract()
    testtime = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[8]/text()').extract()

    for i in range(len(IP)):
        item['IP'] = IP[i]
        item['port'] = port[i]
        item['status'] = status[i]
        item['types'] = types[i]
        item['support'] = support[i]
        item['address'] = address[i]
        item['speed'] = speed[i]
        item['testtime'] = testtime[i]
        item['grab_time'] = time.strftime('%Y-%m-%d')
        yield item
'''
------------------------------------------------------------------------------------------------------
Write the following in the items file:
import scrapy


class IpproxyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    IP = scrapy.Field()
    port = scrapy.Field()
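If you would rather yield items than plain dicts, the callbacks can fill an IpproxyItem instead. A sketch, assuming the project is named IpProxy as the commented-out import in the spider suggests:

from IpProxy.items import IpproxyItem

def jiangxianli(self, response):
    IP = response.xpath('//*[@class="layui-table"]/tbody[1]/tr/td[1]//text()').extract()
    port = response.xpath('//*[@class="layui-table"]/tbody[1]/tr/td[2]//text()').extract()
    for i in range(len(IP)):
        item = IpproxyItem()
        item['IP'] = IP[i]
        item['port'] = port[i]
        yield item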
————————————————————————————————————————————————
# -*- coding: utf-8 -*-
# Replace the pipeline in pipelines.py with the following.


class IpproxyPipeline(object):
    def process_item(self, item, spider):
        print('---------write------------------')
        # Append every scraped proxy to IP.txt as "ip:port".
        with open('IP.txt', 'a+') as f:
            f.write(item['IP'] + ':' + item['port'] + '\n')
        return item
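Once a crawl has run, IP.txt holds one ip:port pair per line. Reading it back for use with requests could look like this; load_proxies is a hypothetical helper, not part of the original post.

def load_proxies(path='IP.txt'):
    """Parse IP.txt into proxy dicts usable with requests."""
    with open(path) as f:
        return [{'http': 'http://' + line.strip()}
                for line in f if line.strip()]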
————————————————————————————————————————————————————————————————
# Add the following to settings.py.
# Note: USER_AGENT must be a single string, not a list.
USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36")

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'IpProxy.pipelines.IpproxyPipeline': 1,
}
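A side note: the original settings defined USER_AGENT as a list, which Scrapy would send verbatim. If the list was meant as a pool to rotate through, the usual route is a small downloader middleware. A sketch follows, where the class name and module path are assumptions, not existing project code.

# IpProxy/middlewares.py
import random

UA_POOL = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
    # add more user-agent strings here
]

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # Pick a fresh user agent for every outgoing request.
        request.headers['User-Agent'] = random.choice(UA_POOL)

Then register it in settings.py:

DOWNLOADER_MIDDLEWARES = {
    'IpProxy.middlewares.RandomUserAgentMiddleware': 543,
}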
——————————————————————————————————————————————————————
That's it: the spider can now pull free proxy IPs from several sites.
If life could be lived over again, it wouldn't be called life.
