自建yum镜像

#!/usr/bin/python
#-*- coding: utf-8 -*-
import requests
import sys,logging,traceback
from bs4 import BeautifulSoup as bsp


# Module-level crawl state shared between dg() and start().
num=0        # recursion depth handed to the first dg() call (0 = top level)
num_2=0      # NOTE(review): appears unused by the crawl logic — candidate for removal
data_url=[]  # accumulator: every downloadable file URL discovered by dg()
tmp=[]       # NOTE(review): appears unused by the crawl logic — candidate for removal
def dg(url,url_head,url_tail,num,centos_ver):
    """Recursively crawl a mirror directory index and collect file URLs.

    Fetches ``url + url_head + '/' + url_tail``, parses the HTML listing with
    BeautifulSoup, and walks every ``<a href>`` entry:

    * at depth 0, entries whose first path component parses as a number
      smaller than ``centos_ver`` are skipped (older releases);
    * subdirectory entries (ending in '/', except the parent link '../')
      are recursed into with the depth counter incremented;
    * plain file entries are appended to the module-level ``data_url`` list.

    Parameters:
        url        -- mirror base, e.g. 'https://mirrors.aliyun.com'
        url_head   -- directory path under the base, e.g. '/centos'
        url_tail   -- entry name appended to the path ('' for the root)
        num        -- current recursion depth (pass 0 at the top level)
        centos_ver -- minimum release number to keep (float-comparable)

    Returns None; results accumulate in ``data_url``.  On an unexpected page
    layout the traceback is printed and the process exits.
    """
    r = requests.get("%s%s/%s"%(url,url_head,url_tail))
    soup=bsp(r.content,'html.parser')
    try:
        # soup's own title formatting mis-handles these pages, so strip the
        # tags by hand; the title's last whitespace-separated token is the
        # directory path of the page we are currently on.
        dg_url_head=str(soup.title).replace('<title>','').replace('</title>','').split()[-1]
        for i in soup.find_all('a') :
            href = i.get('href')
            # Version filter — only meaningful at the top of the tree.
            try:
                # Non-numeric entry names raise and simply fall through.
                if not href.endswith('../') and num == 0 and float(href.split('/')[0].split()[0]) < centos_ver:
                        # Release older than requested: skip the whole subtree.
                        continue
            except (ValueError, IndexError, AttributeError):
                # Entry is not a version number (ordinary files, notes, ...).
                pass
            try:
                if not href.endswith('../') and num == 0:
                    if  float(href.split('/')[0].split()[0]) >= centos_ver:
                        # Log which release is about to be crawled.
                        error_log="%s%s/%s"%(url,url_head,href.split('/')[0])
                        logging.info(error_log)
            except (ValueError, IndexError, AttributeError):
                error_log="%s%s/%s"%(url,url_head,href.split('/')[0])
                logging.info(error_log)
            if not href.endswith('../') and href.endswith('/'):
                # Subdirectory: descend one level (depth passed by value, so
                # no mutable counter juggling is needed around the call).
                dg(url,dg_url_head,href,num + 1,centos_ver)
            elif not href.endswith('../'):
                # Plain file: record its full download URL.
                data_url.append("%s%s%s"%(url,dg_url_head,href))
    except Exception:
        # Unexpected page layout: report and abort the whole crawl.
        # (Narrowed from a bare ``except:`` so Ctrl-C still interrupts.)
        traceback.print_exc()
        sys.exit(0)
        
#print(url_head)
def start(file,url,url_head,url_tail,centos_ver):
    """Crawl the mirror and write every collected file URL to *file*.

    Runs dg() from the top of the tree (results accumulate in the
    module-level ``data_url`` list) and writes them newline-separated.
    Returns the literal string 'ok' so callers can check for success.
    """
    dg(url,url_head,url_tail,num,centos_ver)
    # Context manager guarantees the listing file is closed even on error.
    with open(file, 'w') as output:
        output.write('\n'.join(data_url))
    return 'ok'

 

上面保存为dg.py

 

  1 #!/usr/bin/python
  2 #-*- coding: utf-8 -*-
  3 import urllib,sys,json,shutil
  4 import os,requests,re,time
  5 import dg,logging,traceback
  6 from multiprocessing import Process,Pool
  7 
# Date stamp (YYYY-MM-DD) used to name the day's log files.
date_ymd=time.strftime("%Y-%m-%d", time.localtime())
def date_time():
    """Return the current local time as 'YYYY-MM-DDTHH-MM-SS' (filename-safe)."""
    return time.strftime("%Y-%m-%dT%H-%M-%S", time.localtime())
# Root directory downloaded files are stored under.
file_path='/data/wwwroot/yum/centos'
# State/bookkeeping directory.
file_dir='.'
file_dir_log="./log"
if not os.path.exists(file_dir):
        os.makedirs(file_dir)
if not os.path.exists(file_dir_log):
        os.makedirs(file_dir_log)
if not os.path.exists(file_path):
        os.makedirs(file_path)
download_log_name="%s/download_log_%s.log"%(file_dir_log,date_ymd)
# Lock/progress file: its presence means a download run is in flight.
download_Record_name="%s/download_Record.lock"%file_dir
# Listing of every URL to mirror (written by dg.start()).
network_list="%s/all_list.txt"%file_dir
# Size of the multiprocessing download pool.
process_num=6
# Mirror base URL that dg crawls.
dg_url='https://mirrors.aliyun.com'
# Directory under the base URL.
dg_url_head='/centos'
# Initial entry appended to the path ('' = start at the root).
dg_url_tail=''
# Lowest CentOS release number to mirror.
dg_centos_ver=7

# Logging: the full DEBUG stream goes to a separate "*_debug" file.

logging.basicConfig(level=logging.DEBUG,
                format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)-8s %(message)s',
                datefmt='[%Y-%m-%d %H:%M:%S]',
                filename="%s_debug"%(download_log_name),
                filemode='a')
 46 
#################################################################################################
# StreamHandler: echo INFO-and-above records to the console (stderr) in      #
# addition to the DEBUG file configured above.                               #
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('[%(asctime)s] %(filename)s[line:%(lineno)d] %(levelname)-8s %(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)
# And a second copy of INFO-and-above into the main log file.
file_handler = logging.FileHandler(download_log_name)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)
logging.getLogger('').addHandler(file_handler)
#################################################################################################
 60 
 61 def date_def():
 62     date=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
 63     return date 
 64 
 65 def Schedule(a,b,c):
 66     '''''
 67     a:已经下载的数据块
 68     b:数据块的大小
 69     c:远程文件的大小
 70    '''
 71     per = 100.0 * a * b / c
 72     if per > 100 :
 73         per = 100
 74     #print ('%.2f%%' %(per))
 75     logging.debug('%.2f%%' %(per))
 76     
 77 def file_add_del(filename,data):
 78     output = open(filename, 'wb')
 79     output.write(data)
 80     output.close( )
 81     
 82 def file_log(filename,data):
 83     output = open(filename, 'a')
 84     output.write(data)
 85     output.close( )
 86 
 87 #获取需要下载的列表
 88 #print('\n'.join(add_list_dir_Size()))
 89 
def url_down(url_n,num):
    # Error counter: re-attempt a failed download up to 3 times.
        """Download one remote file into the local mirror tree, with retries.

        url_n -- file URL to fetch ('' is silently ignored)
        num   -- caller-assigned sequence number (string) keying the JSON log line

        Verifies the on-disk size against the server's Content-Length header
        and retries up to 3 times on mismatch.  Returns None.
        """
        num=int(num)
        error_num=0
        if url_n != '':
            url=url_n.replace('\n','').replace('\r','')
            r=requests.get(url)
            # Remote file size as reported by the response headers.
            size=r.headers['Content-Length']

            # Rebuild the relative directory from the URL: drop the scheme,
            # empty host part, hostname and top path component, then the name.
            dir=url.split('/')
            file=dir[-1]
            del dir[0:4]
            del dir[-1]
            dir='/'.join(dir)
            logging.debug(url)
            while True:


                # If the file already exists, remove it and download afresh.
                if os.path.exists('%s/%s/%s'%(file_path,dir,file)):
                    os.remove('%s/%s/%s'%(file_path,dir,file))
                # Create the target directory when missing.
                if not os.path.exists('%s/%s'%(file_path,dir)):
                    os.makedirs('%s/%s'%(file_path,dir))
                url_date=date_time()
                # urllib progress-reporting variants kept for reference:
                #urllib.urlretrieve(url,'%s/%s/%s_%s'%(file_path,dir,file,url_date),Schedule)
                #urllib.urlretrieve(url,'%s/%s/%s_%s'%(file_path,dir,file,url_date))
                # NOTE(review): the destination path is passed to wget without
                # -O, so wget will treat it as a second URL rather than an
                # output file; also os.popen does not wait for wget to finish
                # before the shutil.move below runs — confirm both.
                os.popen("wget --limit-rate=200k %s/%s/%s_%s %s"%(file_path,dir,file,url_date,url))
                shutil.move('%s/%s/%s_%s'%(file_path,dir,file,url_date),'%s/%s/%s'%(file_path,dir,file))
                # If the file exists after the attempt, compare its size.
                if os.path.exists('%s/%s/%s'%(file_path,dir,file)):
                    path_size=os.path.getsize('%s/%s/%s'%(file_path,dir,file))
                    if float(path_size) == float(size):
                        error_log=json.dumps({num:{"status":"ok","url":url,"down_size":path_size,"list_szie":size.replace('\n','').replace('\r',''),"num":error_num,"time":date_def()}})
                        # Success: record at INFO level.
                        logging.info(error_log)
                        break
                    else:
                        if error_num >2:
                            error_log=json.dumps({num:{"status":"error","url":url,"down_size":path_size,"list_szie":size.replace('\n','').replace('\r',''),"num":error_num,"time":date_def()}})
                            # Give up after 3 size mismatches.
                            logging.info(error_log)
                            break
                            return error_log  # NOTE(review): unreachable — follows break
                        error_num+=1
                # File missing after the attempt: retry.
                else:
                    if error_num >2:
                            # NOTE(review): path_size may be unbound here if the
                            # file never appeared — would raise NameError; confirm.
                            error_log=json.dumps({num:{"status":"error","url":url,"down_size":path_size,"list_szie":size.replace('\n','').replace('\r',''),"num":error_num,"time":date_def()}})
                            # Persistent failure: record at ERROR level.
                            logging.error(error_log)
                            break
                    error_num+=1
# Enumerate the files already present in the local mirror.
def dg_Local_files_and_network_files(path):
    """Map every file under *path* onto its originating mirror URL.

    Walks the local tree and rewrites each file path so the local root is
    replaced by 'https://mirrors.aliyun.com/centos' (Windows backslashes
    normalised to '/'), yielding the URL the file was downloaded from.
    """
    mirror_root = "https://mirrors.aliyun.com/centos"
    return [
        os.path.join(top, fname).replace(path, mirror_root).replace("\\", "/")
        for top, _dirs, fnames in os.walk(path, topdown=False)
        for fname in fnames
    ]
# Verification: reconcile the local tree against the network listing.
def delete():
    """Loop until the local mirror exactly matches the URL listing file.

    Files listed but missing locally are re-downloaded via url_down();
    local files no longer listed are removed.
    """
    while True:
        data=dg_Local_files_and_network_files(file_path)
        network=open(network_list).read().split('\n')
        new_network=[]
        new_data=[]
        # Drop blank lines from both sides before comparing as sets.
        for i in network:
            if i != '':
                new_network.append(i)
        for i in data:
            if i != '':
                new_data.append(i)
        delete_data=list(set(new_data)-set(new_network))
        add=list(set(new_network) - set(new_data))
        if not os.listdir(file_path):
            # NOTE(review): '/'.join(file_path) joins the *characters* of the
            # path string, and nothing is actually removed here — confirm intent.
            logging.info("删除空目录%s"%'/'.join(file_path))
        if len(add) == 0 and len(delete_data) ==0:
            # Local and remote agree: verification complete.
            logging.info("校验成功本地与网络无差别")
            break
        elif len(add) != 0:
            # Listed but not present locally: download the difference.
            for i in add:
                if i!='':
                    logging.info("下载差异文件%s"%i)
                    url_down(i,"0")
        elif len(delete_data) != 0:
            # Present locally but no longer listed: delete the difference.
            for i in delete_data:
                if i!='':
                    i=i.replace("https://mirrors.aliyun.com/centos/","%s/"%file_path)
                    logging.info("删除差异文件%s"%i)
                    os.remove(i)
193 
194                     
195 #测试          
196 #print("开始下载"%date) 
197 #url_data()
# Entry point: either resume an interrupted run (lock file present) or do a
# fresh crawl + download, then verify.  The outer while/except retries the
# whole cycle after any failure.
if __name__ == '__main__':
    while True:
        try:

            num=1
            exit=0

            if  os.path.exists(download_Record_name):
                # Lock file present: a previous run did not finish — resume it.
                logging.info("检测到上次未下载完,重新上次的下载")
                dg_Local_files_and_network_files(file_path)
                logging.info("开始下载")
    #################### download procedure below ############################ ##########################################################################
                mainStart = time.time()
                num=0
                p = Pool(process_num)
                nework_list=open(network_list).read().split('\n')
                load_list=dg_Local_files_and_network_files(file_path)
                # Only fetch URLs not already present locally.
                for url_n in list(set(nework_list)-set(load_list)):
                    num+=1
                    # Queue the download on the worker pool.
                    p.apply_async(url_down,args=(url_n,str(num),))
                logging.info('等待所有子进程完成…')
                p.close()
                p.join()
                mainEnd = time.time()
                logging.info('所有进程运行   %s 秒.'%(mainEnd-mainStart))
                # Log separator once the batch finishes.
                file_log(download_log_name,"#"*100)
                logging.info("下载完成")
                logging.info("开始校验")
                delete()
                # Finished: clear the resume lock.
                # NOTE(review): the unconditional os.remove near the end of the
                # try removes it again and raises, tripping the except — confirm.
                os.remove(download_Record_name)
    ########################################################################## ##########################################################################
            else:
                # First-run path: no lock file yet.
                if not os.path.exists(download_Record_name):
                        logging.info("dg.py运行")
                        dg_po=dg.start(network_list,dg_url,dg_url_head,dg_url_tail,dg_centos_ver)
                        if 'ok' not in dg_po:
                            logging.error("dg运行故障")
                        else:
                            # Crawl succeeded: create the lock/progress file.
                            file_add_del(download_Record_name,'')
                else:
                    logging.info("dg.py检测已经执行过了")
                    # Listing already exists, so this is not a first download:
                    # diff it against local files to decide whether to update.
                    nework_list=open(network_list).read().split('\n')
                    load_list=dg_Local_files_and_network_files(file_path)
                    if len(list(set(nework_list) - set(load_list))) == 0:
                            logging.info("不用更新")
                            os.remove(download_Record_name)
                            exit=1
                            # NOTE(review): SystemExit is swallowed by the bare
                            # except below; the 'exit' flag marks that case.
                            sys.exit(0)


                # Start downloading.
                if num == 1:
                        logging.info("开始下载")
                        file_add_del(download_Record_name,"0")
    ####################### download procedure below ######################### ##########################################################################
                        mainStart = time.time()
                        num=0
                        p = Pool(process_num)
                        nework_list=open(network_list).read().split('\n')
                        load_list=dg_Local_files_and_network_files(file_path)
                        for url_n in list(set(nework_list)-set(load_list)):
                            num+=1
                            # Queue the download on the worker pool.
                            p.apply_async(url_down,args=(url_n,str(num),))
                        logging.info('等待所有子进程完成…')
                        p.close()
                        p.join()
                        mainEnd = time.time()
                        logging.info('所有进程运行   %0.2f 秒.'%(mainEnd-mainStart))
                        logging.info("下载完成")
                        logging.info("开始校验")
                        delete()
                        # Finished: clear the resume lock.
                        os.remove(download_Record_name)
                        # Log separator once the batch finishes.
                        file_log(download_log_name,"#"*100)
    ########################################################################## ##########################################################################
            # Run complete: remove the lock.
            os.remove(download_Record_name)
            logging.info("结束")
            break
        except:
            if exit==0:
                # A genuine failure (not the deliberate sys.exit above):
                # discard the freshly generated listing so the next pass
                # re-crawls, then log the full traceback.
                if not os.path.exists(download_Record_name) and os.path.exists(network_list):
                            logging.info("由于dg.py执行故障要将刚生成的以下文件去除")
                            os.remove(network_list)
                            logging.info(network_list)
                logging.error('\n%s'%traceback.format_exc())

 

这个保存为dg_download.py

 

执行dg_download.py就可以开始爬取了,可以修改里面的爬取版本

 

dg.py爬取镜像,dg_download.py判断是否需要更新

 

剩下的就是等爬取好后,搭建web服务发布出去

posted @ 2018-03-21 15:35  IT菜鸟园  阅读(239)  评论(0编辑  收藏