
Requests + Bs4 + a multiprocessing pool to crawl every image on the site

The script below walks the category list pages, hands each gallery link to a process pool, and saves every image of every gallery under D:/meizitu/. It needs the requests, bs4 and lxml packages and was written for Python 2.

    #!/usr/bin/env python
    # coding=utf-8
    # author: Charles
    # datetime: 2021/03/23/0004 11:26
    # software: meizitu
    import os
    import shutil

    import requests
    from bs4 import BeautifulSoup
    from multiprocessing import Pool

    # Wrapped GET: returns {'success': bool, 'content': bytes}
    def geta(url, params=None, header=None):
        session = requests.session()
        ret = {'success': False}
        try:
            if params:
                session.params = params
            if header:
                session.headers = header
            msg = session.get(url)
            if msg:  # a Response is truthy only for status codes < 400
                ret['success'] = True
                ret['content'] = msg.content
        except Exception as e:
            print e
        finally:
            if session:
                session.close()
        return ret

    # List pages: collect every gallery link and hand it to the pool
    def meizitu(kind, page):
        # number of worker processes in the pool
        pool = Pool(10)
        for p in xrange(1, int(page) + 1):
            pg = '/page/%s/' % p
            url = 'mzitu%s%s' % (kind, pg)
            header = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
            }
            ret = geta(url=url, header=header)
            if not ret['success']:
                return False
            soup = BeautifulSoup(ret['content'], 'lxml')
            listsoup = soup.find_all('ul', {"id": "pins"})
            for i in listsoup:
                if i is not None:
                    soup1 = BeautifulSoup(str(i), 'lxml')
                    listsoup1 = soup1.find_all('span')
                    soup2 = BeautifulSoup(str(listsoup1), 'lxml')
                    listsoup2 = soup2.find_all('a')
                    for g in listsoup2:
                        href = g['href']  # gallery link
                        title = g.text    # gallery title (bs4 already returns unicode)
                        # synchronous version:
                        # detail(href, title)
                        # asynchronous, non-blocking version via the process pool:
                        pool.apply_async(detail, args=(href, title))
            print '********************* page %s dispatched *********************' % p
        print 'All list pages have been dispatched!'
        # stop accepting new tasks
        pool.close()
        # wait for the worker processes to finish, so the main process
        # does not exit before they do
        pool.join()

    # Detail page: download every image of one gallery
    def detail(url, titles):
        num = int(max_page(url))
        # strip characters that are not allowed in Windows directory names
        title = titles.strip().replace('?', '').replace(':', '').replace(',', '').replace('@', '')
        path = 'D:/meizitu/'
        print u'Saving to: ' + path + title
        if os.path.exists(path + title):
            # no raw_input here: this function runs inside a pool worker
            # process, which has no usable stdin
            print u'Directory already exists, removing it...'
            shutil.rmtree(path + title)
        os.makedirs(path + title)
        os.chdir(path + title)
        for i in xrange(1, num + 1):  # + 1, otherwise the last image is skipped
            urls = url + '/' + str(i)
            ret = geta(url=urls)
            if not ret['success']:
                return False
            soup = BeautifulSoup(ret['content'], 'lxml')
            listsoup = soup.find('div', {'class': 'main-image'})
            soup1 = BeautifulSoup(str(listsoup), 'lxml')
            listsoup1 = soup1.find('img')
            detail_href = listsoup1['src']  # direct image URL
            header = {
                'Referer': 'mzitu/',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
            }
            ret = geta(url=detail_href, header=header)
            if ret['success']:
                tupian = ret['content']
                # url[21:] slices the numeric gallery id out of the link
                with open('%s-%s.jpg' % (url[21:], i), 'wb') as f:
                    f.write(tupian)
                print 'Gallery %s -- image %s saved' % (url[21:], i)
        print 'Gallery %s ===================> finished!' % url[21:]

    # Largest image index of a detail page
    def max_page(url):
        ret = geta(url=url)
        if not ret['success']:
            return False
        soup = BeautifulSoup(ret['content'], 'lxml')
        listsoup = soup.find('div', {'class': 'pagenavi'})
        soup1 = BeautifulSoup(str(listsoup), 'lxml')
        listsoup1 = soup1.find_all('span')
        pages = []  # avoid shadowing the built-in name "list"
        for i in listsoup1:
            pages.append(i.text)
        # the second-to-last <span> in the pagination bar holds the max page number
        maxpage = pages[-2]
        return maxpage

    if __name__ == '__main__':
        if os.name == 'nt':
            print(u'You are running on Windows')
        else:
            print(u'You are running on Linux')
        category = {'1': '', '2': '/xinggan/', '3': '/japan/', '4': '/taiwan/', '5': '/mm/'}
        num = raw_input('Choose a category to crawl: 1.Index 2.Sex 3.Japan 4.TaiWan 5.Pure\n')
        if num in ('1', '2', '3', '4', '5'):
            page = raw_input('How many list pages to crawl: ')
            if page.isdigit():
                meizitu(category[num], page)
            else:
                raw_input('Invalid input! Press any key to exit.')
        else:
            raw_input('Invalid input! Press any key to exit.')
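The listing above is Python 2 (print statements, xrange, raw_input). For readers on Python 3, here is a minimal sketch of the same geta helper; it mirrors the contract of the original, but the 10-second timeout is an assumption I added, not something in the source:

    import requests

    def geta(url, params=None, header=None):
        # Same contract as the Python 2 helper above: a dict with a
        # success flag and the raw response bytes under 'content'.
        ret = {'success': False}
        try:
            # the timeout is an assumed value (not in the original); it keeps
            # one stuck download from hanging a pool worker forever
            msg = requests.get(url, params=params, headers=header, timeout=10)
            if msg.ok:
                ret['success'] = True
                ret['content'] = msg.content
        except requests.RequestException as e:
            print(e)
        return ret

The rest of the script ports mechanically: xrange becomes range, raw_input becomes input, and the print statements become function calls.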
