Scraping program information from 研招网 (yz.chsi.com.cn)

For my own use. It runs as-is; the results end up in the df variable.


import requests as r
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import pandas as pd
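
# Search query captured from the site's form: yjxkdm=0854 is the discipline
# code and pageno=1 requests only the first results page; the next line
# parses it into a form-data dict (keys with no value become '')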
origin_data = "ssdm=&dwmc=&mldm=zyxw&mlmc=&yjxkdm=0854&zymc=&xxfs=&pageno=1&xxlb=ZHX"
data = {x[0]:x[1] if len(x) == 2 else '' for x in [t.split('=') for t in origin_data.split('&')]}
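
# POST the form to the query endpoint with a randomized User-Agent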
base_url = 'https://yz.chsi.com.cn/zsml/queryAction.do'
ua = UserAgent()
res = r.post(base_url,data=data,headers={'User-Agent':ua.random})
html = res.content.decode('utf8')
count = 0
df = pd.DataFrame()
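# exam-subject columns, in the order they appear on a detail page:
# politics, foreign language, specialized course 1, specialized course 2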
ks_item = ['政治','外语','业务课一','业务课二']
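# each <a> in the first results table is one university; its href points to
# that university's list of matching programs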
soup = BeautifulSoup(html,'lxml')
table = soup.find('table',class_='ch-table')
links = [f"https://yz.chsi.com.cn{a['href']}" for a in table.find_all('a')]
universities = [a.text for a in table.find_all('a')]
for university,url in zip(universities,links):
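    # fetch the university's program list and collect its "查看" (view) links,
    # one per program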
    html2 = r.get(url).content.decode()
    ts = BeautifulSoup(html2,'lxml')
    list_table = ts.find('table',class_='ch-table more-content')
    detail_links = [f"https://yz.chsi.com.cn{a['href']}" for a in list_table.find_all('a') if a.text == '查看']
    for dl in detail_links:
        print(university)  # progress indicator
        df.loc[count,'大学名称'] = university  # record the name on every detail row
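        # fetch the detail page: its zsml-condition table holds key/value
        # pairs describing the program, which map straight onto df columns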
        detail_page = r.get(dl).content.decode()
        dp_soup = BeautifulSoup(detail_page,'lxml')
        dp_table = dp_soup.find('table',class_='zsml-condition')
        dp_data = [x.text for x in dp_table.find_all('td')]
        for key,value in zip(dp_data[::2],dp_data[1::2]):
            df.loc[count,key] = value
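        # the zsml-res-items tbody lists the exam subjects for this program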
        ks_table = dp_soup.find('tbody',class_='zsml-res-items')
        ks_data = [x.contents[0].strip() for x in ks_table.find_all('td')]
        for key,value in zip(ks_item,ks_data):
            df.loc[count,key] = value
        count += 1
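
The query string hard-codes pageno=1, so only the first results page is scraped. A rough sketch of paging through more results, assuming the endpoint accepts higher pageno values in the same form (the upper bound of 6 here is an arbitrary choice of mine):

for page in range(2, 6):
    data['pageno'] = str(page)
    res = r.post(base_url, data=data, headers={'User-Agent': ua.random})
    soup = BeautifulSoup(res.content.decode('utf8'), 'lxml')
    table = soup.find('table', class_='ch-table')
    if table is None or not table.find_all('a'):
        break  # no more results
    # ...then repeat the per-university loop above on this table...

To keep the results instead of leaving them in df, something like this works (the file name is my own choice):

df.to_csv('yz_chsi_0854.csv', index=False, encoding='utf-8-sig')  # utf-8-sig so Excel reads the Chinese headers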