# Requires Python 3.8.x.
# The saved .txt files still contain a few wrapping HTML tags; strip them manually.
import requests
from bs4 import BeautifulSoup
# Base URLs of the English and Russian Khan Academy sites (each ends with "/").
enDomain = "https://www.khanacademy.org/"
ruDomain = "https://ru.khanacademy.org/"

# Module-level transcript lists, one per language; both start out empty.
enTranscripts, ruTranscripts = [], []
def getAllUnits(url, excluded_href="/math/early-math"):
    """Fetch a course page and return the anchor tags that link to its units.

    The page at *url* is downloaded and parsed, the container
    ``div._1r0pi8am`` is located, and every ``<a>`` element that follows it
    in document order is collected.

    Args:
        url: Address of the course overview page.
        excluded_href: ``href`` value to leave out of the result. Defaults
            to ``"/math/early-math"``, the value the function always
            excluded before this parameter existed.

    Returns:
        A list of ``<a>`` tags that carry an ``href`` attribute different
        from *excluded_href*. The list is empty when the container is not
        found in the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    root = soup.find("div", class_="_1r0pi8am")
    if root is None:
        # Container not present in the returned markup: nothing to collect.
        return []

    # Tag.get() returns None for anchors without an href, so those are
    # skipped instead of raising KeyError; callers index ["href"] on every
    # returned tag.
    return [
        anchor
        for anchor in root.find_all_next("a")
        if anchor.get("href") not in (None, excluded_href)
    ]
def requestEn():
    """Scrape the English early-math transcripts into a text file.

    For every unit returned by ``getAllUnits`` the unit page is fetched and
    each lesson title block (``div._stw1dyg``) is resolved to the next
    ``<a>`` element. That lesson page is downloaded and parsed, and the
    title text followed by the text of the ``div#videoTabTranscript-panel``
    element is written to ``Early math review-en.txt`` in the current
    working directory.

    Title blocks without a following link, and lesson pages without a
    transcript panel, are skipped.
    """
    from urllib.parse import urljoin

    print("爬取中,请稍后")
    units = getAllUnits(enDomain + "math/early-math")

    # The context manager closes the file even when a request or parse step
    # raises; UTF-8 is set explicitly so the output does not depend on the
    # platform's default encoding.
    with open("Early math review-en.txt", "w", encoding="utf-8") as enFile:
        for unit in units:
            # urljoin avoids the "//" that plain concatenation produces when
            # the domain ends with "/" and the href starts with "/".
            unit_response = requests.get(urljoin(enDomain, unit["href"]))
            unit_soup = BeautifulSoup(unit_response.content, "html.parser")

            for link_title in unit_soup.find_all("div", class_="_stw1dyg"):
                lesson_link = link_title.find_next("a")
                if lesson_link is None or not lesson_link.get("href"):
                    continue

                lesson_url = urljoin(enDomain, lesson_link["href"])
                lesson_response = requests.get(lesson_url)
                # Parse the lesson response itself. Previously the lesson
                # page was fetched but the transcript was looked up in the
                # already-parsed unit page, leaving the response unused.
                lesson_soup = BeautifulSoup(
                    lesson_response.content, "html.parser"
                )
                transcript = lesson_soup.find(
                    "div", id="videoTabTranscript-panel"
                )
                if transcript is None:
                    continue

                enFile.write(link_title.get_text())
                enFile.write(transcript.get_text())
def requestRu():
    """Create (or truncate) ``Early math review-ru.txt`` and leave it empty.

    The file is opened for writing in the current working directory and
    closed again without any content being written.
    """
    # Opening in "w" mode is the entire effect; the handle is closed on exit.
    with open("Early math review-ru.txt", "w"):
        pass
# Module-level call: the English scrape runs as soon as this file is executed
# or imported. The Russian counterpart below is currently disabled.
requestEn()
#requestRu()
# def mergeEN_Transcripts():
# pass
# def mergeRU_Transcripts():
# pass