【网友委托的爬虫代码】KhanAcademyTranscriptsSpider.py(网站有反爬虫,做不了)

# python3.8.x
# 保存的txt有少量html标签包裹,请自行清除
import requests
from bs4 import BeautifulSoup
# Base URLs of the English and Russian Khan Academy sites; both end with "/".
enDomain = "https://www.khanacademy.org/"
ruDomain = "https://ru.khanacademy.org/"
# Module-level lists for transcripts; not read or appended to by any code
# visible in this part of the file.
enTranscripts = []
ruTranscripts = []

def getAllUnits(url):
    """Collect the unit links listed on a course overview page.

    Downloads ``url``, looks up the container ``<div class="_1r0pi8am">`` and
    gathers the ``<a>`` tags that follow it in the document, skipping anchors
    that have no ``href`` and the link that points back to the course page
    itself.

    Args:
        url: Absolute URL of the course overview page.

    Returns:
        A list of BeautifulSoup ``<a>`` tags, each guaranteed to carry an
        ``href`` attribute. The list is empty when the container div is not
        present in the downloaded HTML.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    from urllib.parse import urlparse

    # A timeout keeps the scraper from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    root = soup.find("div", class_="_1r0pi8am")
    if root is None:
        # The expected markup is missing, so there is nothing to collect.
        return []

    # Path of the course page itself (e.g. "/math/early-math"); links back to
    # it are not units and must be left out.
    course_path = urlparse(url).path

    unitTags = []
    for a in root.find_all_next("a"):
        href = a.get("href")
        if href is None or href == course_path:
            continue
        unitTags.append(a)
    return unitTags

def requestEn():
    """Scrape the English "Early math" transcripts into a text file.

    For every unit link returned by ``getAllUnits`` the unit page is
    downloaded, each ``<div class="_stw1dyg">`` title block on it is followed
    to its lesson page, and the title text plus the text of the lesson's
    ``videoTabTranscript-panel`` element are appended to
    ``Early math review-en.txt`` in the current working directory.

    Title blocks without a lesson link and lesson pages without a transcript
    panel are skipped.

    Raises:
        requests.RequestException: If any request fails or times out.
    """
    from urllib.parse import urljoin

    print("爬取中,请稍后")
    # The context manager closes the file even if a request or parse step
    # raises; UTF-8 avoids encode errors under non-UTF-8 platform defaults.
    with open("Early math review-en.txt", "w", encoding="utf-8") as enFile:
        units = getAllUnits(enDomain + "math/early-math")
        for unit in units:
            # urljoin avoids the "//" produced by concatenating a domain that
            # ends with "/" and an href that starts with "/".
            unit_url = urljoin(enDomain, unit["href"])
            unit_response = requests.get(unit_url, timeout=30)
            unit_soup = BeautifulSoup(unit_response.content, "html.parser")

            for link_title in unit_soup.find_all("div", class_="_stw1dyg"):
                lesson_link = link_title.find_next("a")
                if lesson_link is None or not lesson_link.get("href"):
                    continue

                lesson_url = urljoin(enDomain, lesson_link["href"])
                lesson_response = requests.get(lesson_url, timeout=30)
                # Parse the lesson page itself; the transcript panel is looked
                # up there, not on the unit page.
                lesson_soup = BeautifulSoup(lesson_response.content, "html.parser")
                transcript = lesson_soup.find("div", id="videoTabTranscript-panel")
                if transcript is None:
                    continue

                enFile.write(link_title.get_text())
                enFile.write(transcript.get_text())

def requestRu():
    """Create (or truncate) ``Early math review-ru.txt`` and leave it empty.

    Nothing is downloaded or written; the file is only opened for writing in
    the current working directory and closed again.
    """
    with open("Early math review-ru.txt", "w"):
        pass

if __name__ == "__main__":
    # Start scraping only when the file is executed as a script, so that
    # importing the module does not trigger network requests or file writes.
    requestEn()
    # requestRu()
# def mergeEN_Transcripts():
#     pass

# def mergeRU_Transcripts():
#     pass

 

posted on 2026-02-01 19:22  小沙盒工作室  阅读(0)  评论(0)    收藏  举报