Scrapy spider that grabs free proxy IPs from several different proxy-list sites. Four sites are implemented so far; add more yourself if you are up to it.

# -*- coding: utf-8 -*-
'''
A free-proxy harvesting spider that scrapes several proxy-list sites automatically.
Start it with:
    scrapy crawl daili -s JOBDIR=crawls/somespider-1
so the crawl state is persisted; you can then stop the spider safely at any time
(Ctrl-C or a signal) and resume it later with the exact same command.
'''
# 2020/12/8 update: one site went dead, so a replacement site was added.
import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
# from IpProxy.items import IpproxyItem
import time
import re


class MyspiderSpider(scrapy.Spider):
    name = "daili"
    # url = 'https://ip.jiangxianli.com'
    # allowed_domains = ["https://ip.jiangxianli.com/?page=1"]
    start_urls = [
        'https://ip.jiangxianli.com/?page=1',
        'http://www.iphai.com/free/ng',
        'http://www.ip3366.net/free/?stype=1',
        'http://www.ip3366.net/free/?stype=2&page=2',
        'https://proxy.seofangfa.com/',
    ]

    def parse(self, response):
        # jiangxianli: pages 1-8
        urls = ['https://ip.jiangxianli.com/?page=%s' % i for i in range(1, 9)]
        for url in urls:
            yield scrapy.Request(url, callback=self.jiangxianli)

        # iphai: a single listing page
        urls1 = 'http://www.iphai.com/free/ng'
        yield scrapy.Request(urls1, callback=self.iphai)

        # ip3366: pages 1-7 of the high-anonymity list (stype=1)
        urls3 = ['http://www.ip3366.net/free/?stype=1&page=%s' % i for i in range(1, 8)]
        for url1 in urls3:
            yield scrapy.Request(url1, callback=self.ip3366)

        # ip3366: pages 1-7 of the regular list (stype=2)
        urls4 = ['http://www.ip3366.net/free/?stype=2&page=%s' % i for i in range(1, 8)]
        for url2 in urls4:
            yield scrapy.Request(url2, callback=self.ip3366pro)

        # seofangfa: a single listing page
        urls5 = 'https://proxy.seofangfa.com/'
        yield scrapy.Request(urls5, callback=self.seo)

    def seo(self, response):
        IP = response.xpath('//*[@class="table"]/tbody[1]/tr/td[1]//text()').extract()
        port = response.xpath('//*[@class="table"]/tbody[1]/tr/td[2]//text()').extract()
        for i in range(len(IP)):
            PROXY = "http://" + IP[i] + ":" + port[i]  # assembled proxy URL; check its availability here if you like
            print('seo')
            yield {
                'IP': IP[i],
                'port': port[i],
            }

    def jiangxianli(self, response):
        IP = response.xpath('//*[@class="layui-table"]/tbody[1]/tr/td[1]//text()').extract()
        # IP = response.css(".layui-table td:nth-of-type(1)::text").extract()
        port = response.xpath('//*[@class="layui-table"]/tbody[1]/tr/td[2]//text()').extract()
        # port = response.css(".layui-table td:nth-of-type(2)::text").extract()
        for i in range(len(IP)):
            PROXY = "http://" + IP[i] + ":" + port[i]  # assembled proxy URL; check its availability here if you like
            print('jiangxianli')
            yield {
                'IP': IP[i],
                'port': port[i],
            }

    def iphai(self, response):
        ip = response.xpath('//*[@class="table table-bordered table-striped table-hover"]//td[1]//text()').extract()
        PORT = response.xpath('//*[@class="table table-bordered table-striped table-hover"]//td[2]//text()').extract()
        IP = []
        for g in ip:
            g = re.findall(r'[^ \n]', g)  # drop spaces and newlines
            g = ''.join(g)                # join the remaining characters back into a string
            IP.append(g)
        port = []
        for d in PORT:
            d = re.findall(r'[^ \n]', d)  # drop spaces and newlines
            d = ''.join(d)                # join the remaining characters back into a string
            port.append(d)
        for i in range(len(IP)):
            PROXY = "http://" + IP[i] + ":" + port[i]
            print('iphai')
            yield {
                'IP': IP[i],
                'port': port[i],
            }

    def ip3366(self, response):
        IP = response.xpath('//*[@id="list"]/table/tbody/tr/td[1]//text()').extract()
        port = response.xpath('//*[@id="list"]/table/tbody/tr/td[2]//text()').extract()
        for i in range(len(IP)):
            PROXY = "http://" + IP[i] + ":" + port[i]
            print('ip3366')
            yield {
                'IP': IP[i],
                'port': port[i],
            }

    def ip3366pro(self, response):
        IP = response.xpath('//*[@id="list"]/table/tbody/tr/td[1]//text()').extract()
        port = response.xpath('//*[@id="list"]/table/tbody/tr/td[2]//text()').extract()
        for i in range(len(IP)):
            PROXY = "http://" + IP[i] + ":" + port[i]
            print('ip3366pro')
            yield {
                'IP': IP[i],
                'port': port[i],
            }
        '''
        # Optional availability check (needs `import requests`, plus a test URL and headers):
        proxies = {"http": PROXY}
        try:
            test = requests.get(pagetest, headers=header, proxies=proxies, timeout=5)
            if test.status_code == 200:  # the target page is reachable through this proxy
                yield {
                    'IP': IP[i],
                    'port': port[i],
                }
            # response = requests.get(pagetest, timeout=1, proxies=proxies)
            # print(response.status_code)
            # if response.status_code == 200:  # judge usability from the returned status code
        except:
            print("connect failed!")
        '''
        # yield {'IP': IP, 'port': port}
        # Leftover selectors for extra columns (status, type, protocols, address, speed, check time):
        # status = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[3]/text()').extract()
        # types = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[4]/text()').extract()
        # support = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[5]/text()').extract()
        # address = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[6]/text()').extract()
        # speed = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[7]/text()').extract()
        # testtime = Selector(response).xpath('//*[@id="index_free_list"]/table/tbody/tr/td[8]/text()').extract()
        '''
        # Item-based variant (needs IpproxyItem from IpProxy.items):
        for i in range(len(IP)):
            item['IP'] = IP[i]
            item['port'] = port[i]
            # item['status'] = status[i]
            # item['types'] = types[i]
            # item['support'] = support[i]
            # item['address'] = address[i]
            # item['speed'] = speed[i]
            # item['testtime'] = testtime[i]
            # item['grab_time'] = time.strftime('%Y-%m-%d')
            yield item
        '''

 

 

 

------------------------------------------------------------------------------------------------------

Add the following to items.py:

import scrapy

class IpproxyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    IP = scrapy.Field()
    port = scrapy.Field()

 

————————————————————————————————————————————————

# -*- coding: utf-8 -*-
#Replace the default pipeline in pipelines.py with the following

class IpproxyPipeline(object):

    def process_item(self, item, spider):
        print('---------write------------------')
        fileName = 'IP.txt'
        # append each scraped proxy to the file as "ip:port", one per line
        content = item['IP'] + ':' + item['port'] + '\n'
        with open(fileName, "a+") as f:
            f.write(content)
        return item
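This version reopens IP.txt for every single item, which is simple but costs one open/close per proxy. If that bothers you, Scrapy pipelines also provide open_spider and close_spider hooks, so a variant could hold the file open for the whole crawl; a rough sketch with the same class and file names:

class IpproxyPipeline(object):

    def open_spider(self, spider):
        # open the output file once when the crawl starts
        self.f = open('IP.txt', 'a+')

    def process_item(self, item, spider):
        # append each proxy as "ip:port", one per line
        self.f.write(item['IP'] + ':' + item['port'] + '\n')
        return item

    def close_spider(self, spider):
        # close the file when the crawl ends
        self.f.close()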

————————————————————————————————————————————————————————————————

#Add the following to settings.py
# Scrapy expects USER_AGENT to be a single string
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'IpProxy.pipelines.IpproxyPipeline': 1, 
}

——————————————————————————————————————————————————————

That's it: the spider can now grab free proxy IPs from several sites in one crawl.
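Once IP.txt is populated, the harvested addresses can be fed back into other crawls. A small illustrative sketch (the DemoSpider name and the httpbin.org URL are placeholders, not part of this project): Scrapy's built-in HttpProxyMiddleware routes a request through whatever proxy is set in request.meta['proxy'].

import scrapy

class DemoSpider(scrapy.Spider):
    # Hypothetical spider that routes its requests through one harvested proxy.
    name = 'demo'

    def start_requests(self):
        with open('IP.txt') as f:
            proxy = f.readline().strip()  # take the first harvested "ip:port"
        yield scrapy.Request('http://httpbin.org/ip',
                             callback=self.parse,
                             # the built-in HttpProxyMiddleware reads meta['proxy']
                             meta={'proxy': 'http://' + proxy})

    def parse(self, response):
        self.logger.info(response.text)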
