# QS World University Rankings
"""
Project University Ranking
In order to research for exchange information,studentsalways need to search for the university ranking from multiplesources.
I propose to make a web crawler to crawl top 200 universities from each sources.
Website:
QS ranking: https://www.topuniversities.com/
Data manipulation
1.Integrate all the data into one file
2.Data cleansing:
Organize all information from different website intounified format:
University Name
Ranking
Location(city&country)
Source
The crawled data for the website will be cleaned and submitted in csv format.
程序调试环境:
Google Chrome: 版本 96.0.4664.45(正式版本) (64 位)
python解释器: 使用 anaconda3
chromedriver_win32: 96.0.4664.45/2021-11-16T09:35:54.118Z 测试时位于 anaconda3 根目录
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from lxml import etree
import time
import csv
import fake_useragent
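# The docstring's merge/cleanse step is not implemented in this script. Below
# is a minimal sketch, assuming each source has already been crawled into its
# own CSV with 'Rank' and 'University' columns (the function name, output
# path, and column layout are assumptions, not part of the original crawl):
def merge_sources(files_with_source, out_file='./ranking_merged.csv'):
    """Combine per-source CSVs into one file in the unified format."""
    fieldnames = ['University Name', 'Ranking', 'Location', 'Source']
    with open(out_file, 'w', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=fieldnames)
        writer.writeheader()
        for path, source in files_with_source:
            with open(path, newline='', encoding='utf-8') as f:
                for row in csv.DictReader(f):
                    writer.writerow({
                        'University Name': row['University'].strip(),
                        'Ranking': row['Rank'].strip(),
                        # Location is filled in only if the source provided it
                        'Location': row.get('Location', '').strip(),
                        'Source': source,
                    })
# e.g. merge_sources([('./QS_ranking.csv', 'QS')]) after a crawl finishes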
# Request the rankings page and scrape it into a CSV file
def get(driver, url):
    FileName = './QS_ranking.csv'
    # Append mode: note the header row is re-written on every run
    with open(FileName, "a", newline='', encoding='utf-8') as f:
        print(FileName, " opened for writing...")
        fieldnames = ["Rank", "University", "Overall Score"]
        f_csv = csv.DictWriter(f, fieldnames=fieldnames)
        f_csv.writeheader()
        # Start the request
        driver.get(url)
        driver.implicitly_wait(10)  # implicit wait (seconds) for the page to load
        time.sleep(2)  # short pause for slow networks; remove if your connection is fast
        # Scroll the browser window down by 1000 px
        driver.execute_script("window.scrollTo(0,1000)")  # run JavaScript
        time.sleep(1)  # short pause for slow networks; remove if your connection is fast
        try:  # click "I agree" on the cookie banner
            driver.find_element(By.XPATH, '//*[@id="popup-buttons"]/button').click()
            print("Clicked the cookie consent button")
        except Exception:
            print("Cookie consent button not found; continuing...")
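        # A sturdier alternative (an assumption, not what the original run
        # used): an explicit wait polls for the button and raises a
        # TimeoutException instead of relying on fixed sleeps. It would need:
        #   from selenium.webdriver.support.ui import WebDriverWait
        #   from selenium.webdriver.support import expected_conditions as EC
        # WebDriverWait(driver, 10).until(
        #     EC.element_to_be_clickable((By.XPATH, '//*[@id="popup-buttons"]/button'))
        # ).click()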
        # Row counter
        a = 0
        # Crawl 3 pages of data
        for i in range(3):  # if you switch to the recursion at the bottom, remove this loop
            # Scroll down the page
            driver.implicitly_wait(30)  # implicit wait (seconds) for the page to load
            driver.execute_script("window.scrollTo(1000,2000)")
            time.sleep(2)  # short pause for slow networks; remove if your connection is fast
            # Open the "results per page" dropdown
            driver.find_element(By.XPATH, '//*[@id="block-tu-d8-content"]/div/article/div/div[3]'
                                          '/div/div/div/div[3]/div[4]/div[1]/div[2]').click()
            time.sleep(1)  # short pause for slow networks; remove if your connection is fast
            # Select 100 rows per page
            driver.find_element(
                By.XPATH,
                '//*[@id="block-tu-d8-content"]/div/article/div/div[3]/div/div/div/div[3]/div[4]/div[1]/div[2]/div[2]/div[4]').click()
            driver.implicitly_wait(20)  # implicit wait (seconds) for the page to load
            time.sleep(1)  # short pause for slow networks; remove if your connection is fast
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight-2000)")
            time.sleep(2)  # short pause for slow networks; remove if your connection is fast
            # Parse the page source
            text = driver.page_source
            html = etree.HTML(text)
            content = html.xpath('//*[@id="ranking-data-load"]')[0]
            # 100 rows per page: extract each row in turn
            for j in range(1, 101):
                Rank = content.xpath('.//div[@class="row ind"][' + str(j) + ']/div/div/div/div[1]/div[1]/text()')[0]
                University = content.xpath('.//div[@class="row ind"][' + str(j) + ']/div/div/div/div[2]/div/div[1]/a/text()')[0]
                Overall_Score = content.xpath('.//div[@class="row ind"][' + str(j) + ']/div/div/div/div[3]/span[1]/text()')[0]
                print("Rank:", Rank)
                print("University:", University)
                print("Overall Score:", Overall_Score)
                print('**********' * 10)
                a += 1
                f_csv.writerow(
                    {
                        'Rank': Rank,
                        'University': University,
                        'Overall Score': Overall_Score
                    }
                )
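            # Hedged variant (assumption; not the original behavior): the loop
            # above indexes [0] unconditionally and raises IndexError if a page
            # carries fewer than 100 rows. Iterating over the rows that exist
            # avoids that:
            # for row in content.xpath('.//div[@class="row ind"]'):
            #     f_csv.writerow({
            #         'Rank': row.xpath('./div/div/div/div[1]/div[1]/text()')[0],
            #         'University': row.xpath('./div/div/div/div[2]/div/div[1]/a/text()')[0],
            #         'Overall Score': row.xpath('./div/div/div/div[3]/span[1]/text()')[0],
            #     })
            #     a += 1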
            # js = 'document.getElementById("password").click()'  # click an element via JS
            # driver.execute_script(js)  # run a JS statement
            # Click "next page"
            driver.find_element(
                By.CSS_SELECTOR,
                '#alt-style-pagination > li:nth-child(8) > a').click()
            time.sleep(1)  # short pause for slow networks; remove if your connection is fast
        print('Scraped {} rows of data!'.format(a))
    print("Program finished.")
    driver.close()
    # Recursion: enable the call below (and drop the page loop above) to
    # page recursively instead of looping
    # get(driver, url)
# Program entry point
if __name__ == '__main__':
    # Instantiate a UserAgent object to get a random user-agent
    ua = fake_useragent.UserAgent()
    url = "https://www.topuniversities.com/university-rankings/world-university-rankings/2022"
    # Without a VPN, the Chinese mirror below also works; its layout differs slightly
    # url = "https://www.qschina.cn/university-rankings/world-university-rankings/2022"
    # headers = {"user-agent": ua.random}  # unused: Selenium does not accept a headers dict
    # Hide the "Chrome is being controlled by automated test software" banner
    # (a further masking sketch follows the driver creation below)
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    # Browser extensions
    # Cookie-helper extension
    # extension_path1 = './2.1.0.0_0.crx'
    # XPath-helper extension
    extension_path2 = './2.0.2_0.crx'
    # Add the extension(s) to the browser
    # options.add_extension(extension_path1)
    options.add_extension(extension_path2)
    # Add the random user-agent to the browser
    options.add_argument('user-agent=' + ua.random)
    driver = webdriver.Chrome(options=options)
    # Or point at an explicit driver binary (path below is a placeholder):
    # driver = webdriver.Chrome(executable_path=r'D:\PATH\chromedriver.exe')
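    # Optional extra masking (assumption; not needed in the tested setup):
    # hide navigator.webdriver via CDP before any page script runs.
    # driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
    #     'source': "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    # })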
    # Maximize the browser window
    driver.maximize_window()
    # Run the crawler
    get(driver, url)