
Python: fetch usable proxies and save them to an Excel file

  import requests
  from bs4 import BeautifulSoup
  import pandas as pd
  import threading
  import time
  from time import sleep
  import urllib3

  # suppress the warning triggered by verify=False below
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

  # endpoint used to check whether a proxy works (httpbin.org assumed here)
  test_http = 'http://httpbin.org/get'
  test_https = 'https://httpbin.org/get'

  header = {
      'Accept': '*/*',
      'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
      'Accept-Language': 'zh-CN',
      'Accept-Encoding': 'gzip, deflate',
      'Connection': 'Keep-Alive',
      'Cache-Control': 'no-cache',
      'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.01)'
  }

  def pandas_to_xlsx(filename, info):
      # save the list of dicts to an .xlsx workbook
      pd_look = pd.DataFrame(info)
      pd_look.to_excel(filename, sheet_name='快代理')

  def TestOneProxy(ip, port, n):
      proxy = ip + ':' + port
      proxies = {
          'http': 'http://' + proxy,
          'https': 'https://' + proxy,
      }
      try:
          # request the test endpoint through the proxy, with a 3-second timeout
          response = requests.get(test_http, proxies=proxies, timeout=3)
          if response.status_code == 200:
              print(n, '-- proxy OK     ip:', ip, ' port:', port)
              return True
          else:
              print(n, '-- proxy failed ip:', ip, ' port:', port)
              return False
      except BaseException as e:
          print(n, '--Error', e.args)
          return False

  def getHttpsProxy(url):
      # walk through list pages 1..19 of the free-proxy site
      for i in range(1, 20):
          sleep(1)  # be polite: at most one page per second
          curUrl = url + str(i) + '/'
          try:
              print('Fetching proxy list page', curUrl)
              webcontent = requests.get(curUrl, verify=False)
              if webcontent.status_code != 200:
                  print('Bad response, status code:', webcontent.status_code)
                  continue
              soup = BeautifulSoup(webcontent.text, 'lxml')
              proxy_list = soup.select('#list')
              if len(proxy_list) == 0:
                  print('Unexpected page content:', webcontent.text)
                  continue
              tbody = proxy_list[0].select('tbody')[0]
              rows = tbody.select('tr')
              for item in rows:
                  td = item.select('td')
                  info = {}
                  info['ip'] = td[0].text
                  info['port'] = td[1].text
                  info['匿名度'] = td[2].text        # anonymity level
                  info['类型'] = td[3].text          # type (HTTP/HTTPS)
                  info['位置'] = td[4].text          # location
                  info['响应速度'] = td[5].text      # response speed
                  info['最后验证时间'] = td[6].text  # last verified time
                  allProxies.append(info)
          except requests.exceptions.ConnectionError as e:
              print('--Error', e.args)
      pandas_to_xlsx('所有代理.xlsx', allProxies)  # dump every scraped proxy
      return allProxies

  # worker threads share a counter into allProxies, protected by a lock
  num = 0

  def threadFun(n):
      global num
      while True:
          # claim the next untested proxy
          lock.acquire()
          if num >= len(allProxies):
              lock.release()  # I forgot this release at first and spent a whole day debugging, nearly in tears
              break
          curTestProxy = allProxies[num]
          num = num + 1
          lock.release()
          # do the actual test outside the lock
          if TestOneProxy(curTestProxy['ip'], curTestProxy['port'], n):
              canUseProxies.append(curTestProxy)
      print(n, '-- worker finished')
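  # A minimal alternative sketch (a hypothetical helper, not called anywhere in
  # this script): letting concurrent.futures.ThreadPoolExecutor hand out the
  # proxies avoids the manual counter/lock above. It assumes the same
  # TestOneProxy function and allProxies list defined in this script.
  def testAllProxiesWithPool(max_workers=50):
      from concurrent.futures import ThreadPoolExecutor
      usable = []
      with ThreadPoolExecutor(max_workers=max_workers) as pool:
          checks = pool.map(
              lambda item: (item, TestOneProxy(item['ip'], item['port'], 'pool')),
              allProxies)
          for item, ok in checks:
              if ok:
                  usable.append(item)
      return usable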

  def GetCanUseProxies():
      # step 1: scrape all the free proxies in a single thread
      # (the list pages are assumed to be kuaidaili's /free/inha/ pages)
      url = 'https://www.kuaidaili.com/free/inha/'
      getHttpsProxy(url)
      # step 2: test the proxies with multiple threads
      res = []
      for i in range(50):  # create 50 worker threads
          t = threading.Thread(target=threadFun, args=("thread-%s" % i,))
          t.start()
          res.append(t)
      for r in res:
          r.join()  # wait for every worker to finish, like wait() in C
      if len(canUseProxies) > 0:
          pandas_to_xlsx('所有可用代理.xlsx', canUseProxies)  # dump the usable proxies
      return canUseProxies

  allProxies = []      # every proxy scraped from the list pages
  canUseProxies = []   # proxies that passed the test
  lock = threading.Lock()

  if __name__ == '__main__':
      GetCanUseProxies()
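Once 所有可用代理.xlsx has been written, the saved proxies can be reused in a later session. Below is a minimal sketch of that step; the file name and the ip/port column names are assumed to match what pandas_to_xlsx writes above, and httpbin.org is used only as an example target.

  import pandas as pd
  import requests

  # read back the workbook produced by the script above
  usable = pd.read_excel('所有可用代理.xlsx')

  for _, row in usable.iterrows():
      proxy = '%s:%s' % (row['ip'], row['port'])
      proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
      try:
          # httpbin echoes the caller's IP, so the response shows whether the proxy was used
          r = requests.get('http://httpbin.org/get', proxies=proxies, timeout=3)
          print(proxy, '->', r.json().get('origin'))
          break  # stop at the first proxy that responds
      except requests.RequestException:
          continue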

posted @ 2021-12-23 12:03  linjingyg