1 # # -*- coding:utf-8 -*-
2 # @Time : 2021/7/22 22:04
3 # @Author : 周博
4 # @File : test_1.py
5 # @博客园: https://www.cnblogs.com/smartisn/
6 import requests
7 from lxml import etree
8 import sys
9 from urllib import request
10 import zipfile
11 import os
12 import time
13 import Download_mysql_zip.mysql.SQL as MYSQL
14 from selenium import webdriver
15 from selenium.webdriver.common.by import By
16 from selenium.webdriver.support import expected_conditions as EC
17 from selenium.webdriver.support.wait import WebDriverWait
18 from selenium.webdriver.chrome.options import Options
def Get_whole_file(file):
    """Recursively collect the paths of every file under *file*.

    :param file: root directory to walk
    :return: list of paths (root joined with filename), one per file found
    """
    # os.walk visits every directory in the tree, yielding
    # (current_dir, subdir_names, file_names); only the files matter here.
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(file)
        for name in names
    ]
def un_zip(zip_filename, des_dir):
    """Extract a zip archive into the *des_dir* directory.

    :param zip_filename: path of the archive, e.g. ``a.zip``
    :param des_dir: destination directory, e.g. ``./文件存储/``
    :return: None; prints a success message, or an error message for a
        corrupt/invalid archive instead of raising.
    """
    # BUGFIX: zipfile.BadZipFile is raised by ZipFile() itself when the file
    # is not a valid archive, so the open must be inside the try as well.
    # The original code only guarded extractall() and let open errors escape.
    try:
        with zipfile.ZipFile(zip_filename, 'r') as archive:
            archive.extractall(des_dir)
        print(zip_filename, "解压成功")
    except zipfile.BadZipFile:
        print("Error: 压缩文件不完整:", zip_filename)
46
def DownLoad_mysql_(start, end):
    """Download GitHub repository zipballs for URL rows pulled from MySQL.

    :param start: first row index handed to ``MYSQL.select_url_html``
    :param end: last row index handed to ``MYSQL.select_url_html``
    :return: None; archives are saved under ``./data/`` and progress is
        printed. Any per-URL failure is printed and skipped.
    """
    for row in MYSQL.select_url_html(start, end):
        print("*******************")
        url = row[0]
        print(url)
        file_name = url.split("/")[-1]
        try:
            # Fetch the repository page and parse it.
            response = requests.get(url, timeout=7)
            dom = etree.HTML(response.text)
            # Debug dump of the li[3] anchor of the download dropdown.
            print(dom.xpath('//*[@id="repo-content-pjax-container"]/div/div[2]/div[1]/div[1]/span/get-repo/details/div/div/div[1]/ul/li[3]/a//@href'))
            # li[2] holds the relative href actually used for the download.
            zip_href = dom.xpath('//*[@id="repo-content-pjax-container"]/div/div[2]/div[1]/div[1]/span/get-repo/details/div/div/div[1]/ul/li[2]/a//@href')[0]
            print("55555555555555555555555555555555555555555")
            print(zip_href)
            zip_href = "https://github.com" + zip_href
            print(zip_href)
            target = "./data/" + file_name + '.zip'
            print(target)
            request.urlretrieve(zip_href, target)
            print("下载成功")
        except Exception as e:
            # Best-effort: report the failure and move on to the next URL.
            print(e)
            continue
if __name__ == "__main__":
    # E:\pycharm\WorkPlace\.net_analyzer\DownLoad_GitHub\data\
    options = Options()
    # options.headless = True  # uncomment to run without opening a browser window
    # FIX: raw string for the Windows path -- '\P', '\d', '\C' are invalid
    # escape sequences in a normal string literal (DeprecationWarning now,
    # SyntaxError in future Python versions).
    driver = webdriver.Chrome(r'D:\Program Apps\Google\Chrome\driver\chromedriver.exe', options=options)
    try:
        # Walk the GitHub search result pages (C# repos sorted by stars).
        for page in range(0, 1):
            url = 'https://github.com/search?l=C%23&o=desc&p=' + str(page) + '&q=C%23&s=stars&type=Repositories'
            print("*******************")
            print(url)
            strhtml = requests.get(url, timeout=7)
            tree = etree.HTML(strhtml.text)
            # Relative hrefs of every repository on this result page.
            hreff = tree.xpath('//*[@id="js-pjax-container"]/div/div[3]/div/ul//div[@class="f4 text-normal"]//a//@href')
            for hh in hreff:
                try:
                    hh = "https://github.com" + hh
                    driver.get(hh)
                    time.sleep(2)  # give the repo page a moment to render
                    wait = WebDriverWait(driver, 20)
                    # Expand the download dropdown (the <details>/<summary> widget).
                    button1 = wait.until(EC.element_to_be_clickable((By.XPATH,
                        '//*[@id="repo-content-pjax-container"]/div/div[2]/div[1]/div[1]/span/get-repo/details/summary')))
                    button1.click()
                    # Click the third entry of the dropdown -- presumably
                    # "Download ZIP"; verify against the live page layout.
                    button2 = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="repo-content-pjax-container"]/div/div[2]/div[1]/div[1]/span/get-repo/details/div/div/div[1]/ul/li[3]/a')))
                    button2.click()
                    print(hh, "——————下载成功")
                except Exception as e:
                    # Best-effort: report and skip this repository.
                    print(e)
                    continue
    finally:
        # FIX: the original never closed the driver, leaking a Chrome process
        # on every run; always shut the browser down, even on errors.
        driver.quit()