Syncing a PyPI mirror

bandersnatch didn't work well for me and didn't fit what I wanted, so I put together a simple sync script of my own.

# -*- coding: utf-8 -*-
import re
import os
import shutil
import json
import copy
import wget
import subprocess as sp
from bs4 import BeautifulSoup as bs
import requests
import concurrent.futures

simpleurl = 'https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/'
mxnet_pattern = re.compile(r'mxnet.+[\d.]+(b[\d]+|.post)')
tensorflow_pattern = re.compile(r'tensorflow.+[\d.]+rc\d')
torch_pattern = re.compile(r'torch.+[\d.]post\d')
cupy_pattern = re.compile(r'cupy_cuda.+[\d.]+b\d')


#First fetch the names of all packages and, for each package, the list of all its versions
def get_package_name():
  simple_index = requests.get(simpleurl)
  simple_index = bs(simple_index.text,'lxml')
  packages_list = [i['href'] for i in simple_index.findAll('a')]
  print('{} projects in total'.format(len(packages_list)))
  return packages_list



#Try each package link up to 6 times; on failure return an empty string
def get_requests(url):
  for i in range(6):
    try:
      download_urls = requests.get(simpleurl+url, timeout=60)
    except Exception as e:
      print(url, str(e))
    else:
      return download_urls
  return ''


#Filter out package links that were already visited, to see how many remain unvisited
def filter_packages(packages):
  packages_dict = {}
  if os.path.exists('packages_json.txt'):
    packages_dict = json.load(open('packages_json.txt','r'))
  _packages = list(filter(lambda k: k not in packages_dict, packages))
  return _packages

#Fetch each package's file list with a process pool
def get_download_url1(packages):
  packages_dict = {}
  with concurrent.futures.ProcessPoolExecutor() as executor:
    for package,download_urls in zip(packages,executor.map(get_requests,packages)):
      packages_dict[package] = []
      if not download_urls: continue
      download_urls = bs(download_urls.text,'lxml')
      for j in download_urls.findAll('a'):
        packages_dict[package].append(j['href'].replace('../../','../'))
      print('project {} has {} versions'.format(package,
                                                len(packages_dict[package])))
    json.dump(packages_dict,open('packages_json.txt','w'))  
  return packages_dict
      

 #----------------------------

def clean_urls(packages_dict):
  for k,v in packages_dict.items():
    for ind,v1 in enumerate(v):
      if 'packages' not in v1:
         print("非法链接: ",k,':',v1)
         packages_dict[k][ind] = ''
  return packages_dict


#Filter against the current local PyPI mirror: skip files already downloaded, and skip win/macos builds based on the filename
def filter_local(packages_dict):
  icount = 0
  done_packages = {k.replace('packages/','').strip():1 for k in open('done_packages.txt')}
  done_packages_cp = copy.deepcopy(done_packages)
  for k,v in packages_dict.items():
    for ind,v1 in enumerate(v):
      if not v1: continue
      prePackage = v1.split('../packages/')[1].split('#')[0] 
      package_name = prePackage.split('/')[-1].lower()
      #optionally also skip nightly/dev builds: 'nightly-' in package_name or '.dev' in package_name
      if prePackage in done_packages or \
         'macosx' in package_name or \
         'win_' in package_name or \
         package_name.endswith('.msi') or \
         package_name.endswith('.exe') or \
         'win-amd64' in package_name or \
         'win32.' in package_name or \
         'win32-' in package_name or \
         'win32_' in package_name or \
         mxnet_pattern.search(package_name) or \
         tensorflow_pattern.search(package_name) or \
         torch_pattern.search(package_name) or \
         cupy_pattern.search(package_name) or \
         'windows' in package_name:

        packages_dict[k][ind] = ''
        #If the package is in done_packages it is still referenced upstream, so remove it from the copy; what remains are local files no longer referenced
        if prePackage in done_packages: done_packages_cp.pop(prePackage)
        icount += 1

  print('Filtered out {} packages against the local mirror'.format(icount))
  print('filters_packages.txt lists local packages that were not referenced this pass; the author has probably removed those versions from the index')
  with open('filters_packages.txt','w',encoding='utf-8') as fw:
    for i in done_packages_cp.keys():
      fw.write(i+'\n')
  json.dump(packages_dict,open('packages_after_done.txt','w'))  
      
  return packages_dict


#Filter by blacklist
def filter_blacklist(packages_dict):
  blacklist = [k.strip() for k in open('blacklist.txt')] 
  iproject = 0
  for k,v in copy.deepcopy(packages_dict).items():

    if not any(v):
      packages_dict.pop(k)
      continue

    for black in blacklist:
      if black.lower() in k.lower():
        if k in packages_dict:
          packages_dict.pop(k)
          iproject += 1
        break

  print('Blacklist filtered out {} projects'.format(iproject))
  json.dump(packages_dict,open('packages_remain.txt','w'))  
  return packages_dict


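#Skip package files already downloaded in the current run, as recorded in current_download_url.txt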
def filter_current_download(packages_dict):

  packages_list = []
  for v in packages_dict.values():
    packages_list.extend([v1 for v1 in v if v1])

  if os.path.exists('current_download_url.txt'):
     done_di = {k.strip():1 for k in open('current_download_url.txt')}  
     packages_list = list(filter(lambda k: k.split('../packages/')[1].split('#')[0] not in done_di, packages_list))
     print('{} packages already downloaded in the current run; {} packages still to download'.format(len(done_di),len(packages_list)))

  with open('un_download_packages_url.txt','w') as fw:
    for package in packages_list:
      fw.write(simpleurl+package+'\n')

  return packages_list
    

#Download packages with a process pool, creating the matching target directories
def wget1(url):
  #out = sp.check_call('wget -r -np {}'.format(simpleurl+url), shell=True)
  try:
    filename = wget.download(simpleurl+url)
  except Exception:
    return '' 
  pathname = url.split('../')[1].split('#')[0]
  dirname = os.path.dirname(pathname)
  if not os.path.exists(dirname):
    try:
        os.makedirs(dirname)
    except OSError:
        pass
  shutil.move(filename,pathname)
  return url.split('../packages/')[1].split('#')[0]
  

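#Alternative downloader that shells out to the wget command (-c resumes partial files); 404s are recorded so they are not retried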
def wget2(url):
  totalurl = simpleurl+url
  out = sp.Popen('wget -c -r -nv -np -T 60 -k -L -p {}'.format(totalurl),shell=True,stdout=sp.PIPE,stderr=sp.PIPE)
  stdout,stderr = out.communicate()
  returncode = out.returncode
  filepath = totalurl.split('//')[1].split('#')[0]
  filepath = filepath.replace('/simple/..','')

  # If the package does not exist, record it in packages_not_found.txt and still return the name so it is not downloaded a second time
  if '404: Not Found' in stderr.decode('utf-8'):
      with open('packages_not_found.txt','a') as fa:
        fa.write(filepath.split('packages/')[1]+'\n')
      return filepath.split('packages/')[1]

  if returncode != 0:
    if os.path.exists(filepath): os.remove(filepath)
    return ''

  return filepath.split('packages/')[1]


def download_packages(packages_list):
  with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
    for package,package_name in zip(packages_list,executor.map(wget1,packages_list)):
      if not package_name:
         print(package,":","download failed")
         continue
      print('package {} downloaded'.format(package_name))
      with open('current_download_url.txt','a') as fa:
        fa.write(package_name+'\n')
      

if __name__ == '__main__':
  if os.path.exists('packages_json.txt'):
    packages_dict = json.load(open('packages_json.txt'))
  else:
    packages = get_package_name()
    packages_dict = get_download_url1(packages)

  nsubpackages = 0
  for k,v in packages_dict.items():
    nsubpackages += len(v)
  print('Fetched the links for all {} projects and their versions; {} package files in total, starting to filter'.format(len(packages_dict),
                                                               nsubpackages))
  #========================================================
  clean_urls(packages_dict)

  if os.path.exists('done_packages.txt'):
    packages_dict = filter_local(packages_dict)
  if os.path.exists('blacklist.txt'):
    packages_dict = filter_blacklist(packages_dict)

  packages_list = filter_current_download(packages_dict)
 
  print('{} packages still need to be downloaded; filtered out {} packages that will not be downloaded'.format(len(packages_list),
                                            nsubpackages-len(packages_list)))
  print('-'*50)
  download_packages(packages_list)
  #========================================================
  #done_packages.txt is generated with the find command run inside the packages directory (a rough Python equivalent is sketched after the blacklist example below); its lines look like:
  '''  
0d/0d/fac29d2f0a57e3321804a84239389255e9ac7d649502c359de888b71ffb1/mastercard_mc_on-1.0.1.tar.gz
0d/0d/8c53e8b9249fecbfb370f3e3c22ef514f9bfbedf0e8917a1e21aed16bafa/tulgey-0.1.6.tar.gz
0d/0d/8d553e72a079ca545859cccda9b9df05f6ea7b49376e1beefcc870647b27/keepkey-4.0.0.tar.gz
0d/0d/2b8dec79ef1a19cdc6ecfa2878bb8eb7c78d8ee82f37abbe2e3df0b8249d/bio2bel_chebi-0.2.0.tar.gz
0d/0d/5801c7bebf6dfb2e1d81bda054de27d7e26776cbe74ed9db7f7b9b6fcd88/coinbase-1.0.0-py2.py3-none-any.whl
0d/91/8d860c75c3e70e6bbec7b898b5f753bf5da404be9296e245034360759645/tree-format-0.1.2.tar.gz
0d/91/c62a6b11ac6839fd39da55c1b04ce89ed460644d34b8cff14a5550890a68/crawlib-0.0.4.zip
0d/91/ca443d226b42bbfab0e2d1e1bd1e5f69cff948fee6dac2e42d7350e28c47/FlexGet-2.12.11-py2.py3-none-any.whl
0d/91/7e7b0debbfc755876f496ce99d330f6056e1f679964cbba13c029569f64c/agora-graphql-0.1.5.tar.gz
0d/91/cea4732387324199505eaca476c4837bd6463bc4efa207862c96352b3390/kervi-0.6.3.zip
  '''

  #The blacklist simply lists project names, one per line, e.g.
  '''
mxnet
tensorflow
  '''
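
For reference, a rough Python equivalent of that find command for regenerating done_packages.txt (a minimal sketch; it assumes it is run from the mirror root, i.e. the directory that contains packages/):

import os

#Walk packages/ and record every file's path relative to packages/,
#matching the '0d/0d/<hash>/<filename>' lines shown above
with open('done_packages.txt', 'w') as fw:
  for root, dirs, files in os.walk('packages'):
    for name in files:
      relpath = os.path.relpath(os.path.join(root, name), 'packages')
      fw.write(relpath.replace(os.sep, '/') + '\n')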

The next step is to run createsimple.py, shown below. The idea is that the whole simple/ folder could be generated even without downloading anything, so for each link we just check whether the corresponding file exists under packages/; if it does not, that folder and its index file are simply not created. Very straightforward.

import os
import json

def _createsimple(k,v,filepath, packages_path):

    prestring = ['''<!DOCTYPE html>
<html>
  <head>
    <title>Links for {}</title>
  </head>
  <body>
    <h1>Links for {}</h1>\n'''.format(k.strip('./'),k.strip('./')) ]
    flag = False
    for v_ in v:
        packages_basename = v_.split('#')[0].replace('../packages/','')
        packages_filename = os.path.join(packages_path,packages_basename)
        if not os.path.exists(packages_filename): continue
        flag = True
        prestring.append( '    <a href="../{}">{}</a><br/>\n'.format(v_,v_.split('#')[0].split('/')[-1]))
    prestring.append('''  </body>
</html>\n''')
    if flag:

      os.makedirs(os.path.dirname(filepath), exist_ok=True)
      with open(filepath, 'w') as fw:
        fw.write(''.join(prestring))
    return

def createsimple(packages_json, packages_path, simple):
  ans = json.load(open(packages_json))
  for k,v in ans.items():
    dirname = os.path.join(simple,k)
    if not all(v): continue

    filepath = os.path.join(dirname,'index.html')
    try:
      _createsimple(k,v,filepath, packages_path)
    except Exception as e:
      print(str(e))


if __name__ == '__main__':
  packages_json = './packages_json.txt'
  packages_path =  './packages'
  simple = './simple'
  createsimple(packages_json, packages_path, simple)
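
To sanity-check the result (optional; the port and package name here are only illustrative), serve the directory that contains both simple/ and packages/, e.g. with python3 -m http.server 8000, and then install from the local mirror with pip install -i http://localhost:8000/simple/ some_package (adding --trusted-host localhost if pip complains about the plain-HTTP index).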

posted @ 2020-06-08 20:40  仙守