'''
爬取中国每个省份的大学名称和官网地址
'''

from urllib.parse import urljoin

import requests
from lxml import etree


class School(object):
    """Scraper for university names and official websites on cnxiaoyuan.com.

    Typical use: call :meth:`province_school_url` to collect the per-province
    listing pages linked from the home page, then pass that list to
    :meth:`school_url` to read every school entry on those pages.
    """

    def __init__(self, timeout=10):
        """Set up the request headers, the site root URL and the timeout.

        Args:
            timeout: Seconds to wait on each HTTP request. ``requests`` has
                no default timeout, so without one a stalled server would
                block the crawl forever.
        """
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
        }
        self.url = 'http://www.cnxiaoyuan.com/'
        self.timeout = timeout

    def _fetch_html(self, url):
        """Download *url* and return it parsed as an lxml HTML tree."""
        response = requests.get(url=url, headers=self.headers, timeout=self.timeout)
        return etree.HTML(response.content.decode('utf-8'))

    def province_school_url(self):
        """Return the absolute URL of every province listing page.

        Returns:
            list[str]: one URL per ``href`` found in the home page's
            ``homecate`` list.
        """
        html = self._fetch_html(self.url)

        # The last three <li> entries are deliberately dropped -- presumably
        # they are not province links; TODO confirm against the live page.
        li_list = html.xpath("//div[@id='homecate']/ul/li")[0:-3]

        province = []
        for li in li_list:
            # urljoin resolves relative, root-relative and absolute hrefs
            # against the site root; plain concatenation would produce
            # malformed URLs for the latter two.
            province.extend(
                urljoin(self.url, href) for href in li.xpath("./a/@href")
            )
        return province

    def school_url(self, province):
        """Print and collect the schools listed on each province page.

        Args:
            province: Iterable of province listing-page URLs, as returned by
                :meth:`province_school_url`.

        Returns:
            list[tuple[list[str], list[str]]]: one ``(titles, urls)`` pair
            per ``<li>`` of a page's ``sitelist``. Each element is the raw
            list of text nodes matched by the XPath query, so it is empty
            when the corresponding node is missing.
        """
        school_list = []
        for page_url in province:
            html = self._fetch_html(page_url)

            for li in html.xpath("//ul[@class='sitelist']/li"):
                school_title = li.xpath("./div/h3/a/text()")
                school_site = li.xpath("./div/address/a/text()")
                # Keep title and site together so callers can consume pairs.
                school_list.append((school_title, school_site))
                print(school_title, school_site)
        return school_list


if __name__ == '__main__':
    # Entry point: collect the province listing pages, then print every
    # school found on them.
    s = School()
    province = s.province_school_url()
    # school_url() walks every province URL it is given, so one call covers
    # the whole site; calling it repeatedly would only re-download the same
    # pages and print duplicate results.
    s.school_url(province)
# Posted on 2019-08-28 19:07 by Yihan_07 -- views (343), comments (0), edit, favorite, report