慕课网爬虫

'''

本 demo 爬取慕课网实战课(coding.imooc.com)下前端、后端、移动开发、云计算大数据、数据库等分类页面中所有课程的页面信息。

  代码如有需要改进之处,请指出,谢谢。

'''

# author:Administrator
# date:2021/04/30
"""Crawl course metadata from the imooc "coding" course site.

For every category listing page the script collects the course links it
contains, downloads each course page, extracts title / difficulty /
duration / number of learners / overall rating with a regular expression,
and appends one JSON object per course to a text file.
"""

import json  # serialise each record as one JSON line
import re  # regular expressions used to scrape the HTML
from multiprocessing import Pool  # worker processes used for downloading
from pathlib import Path  # portable handling of the output path

import requests  # third-party HTTP client
from requests.exceptions import RequestException  # base class of requests errors

# Site root; course links found on listing pages are appended to it.
homeurl = 'https://coding.imooc.com'

# Accumulator filled by parse_one_classUrl: {category name: [course URL, ...]}.
stuname_dict_url = {}

# Compiled once at import time instead of on every call.
_COURSE_LINK_RE = re.compile('.*?<a target="_blank" href="(.*?)">', re.S)
_COURSE_INFO_RE = re.compile(
    '.*?<div class="title-box">.*?<h1>(.*?)</h1>'
    '.*?<span>难度</span>.*?<span class="nodistance">(.*?)</span>'
    '.*?<span>时长</span>.*?<span class="nodistance">(.*?)</span>'
    '.*?<span>学习人数</span>.*?<span class="nodistance">(.*?)</span>'
    '.*?<span>综合评分</span>.*?<span class="nodistance">(.*?)</span>',
    re.S)


def geturl(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The page text, or ``None`` when the request fails, the status code
        is not 200, or the body is not valid UTF-8.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None
    except (RequestException, UnicodeDecodeError):
        return None


def parse_one_classUrl(html, stuname):
    """Extract the course links of one listing page.

    Every ``<a target="_blank" href="...">`` target found in *html* is
    prefixed with ``homeurl`` and the resulting list is stored in the
    module-level ``stuname_dict_url`` under *stuname*.

    Args:
        html: Listing page text, or ``None`` if the download failed.
        stuname: Category name used as the dictionary key.

    Returns:
        The shared ``stuname_dict_url`` dictionary.
    """
    # A failed download yields an empty URL list instead of a TypeError.
    links = _COURSE_LINK_RE.findall(html) if html else []
    stuname_dict_url[stuname] = [homeurl + link for link in links]
    return stuname_dict_url


def parse_one_page(html, url, stuname):
    """Yield the metadata record scraped from one course page.

    Args:
        html: Course page text, or ``None`` if the download failed.
        url: Address the page was fetched from; copied into the record.
        stuname: Category name; copied into the record.

    Yields:
        A single dict with the keys ``title``, ``difficulty``, ``duration``,
        ``stu_number``, ``comprehensive_evaluation``, ``url`` and
        ``stuname``. Nothing is yielded when *html* is empty or does not
        match the expected page layout.
    """
    if not html:
        return
    match = _COURSE_INFO_RE.search(html)
    if match is None:
        # Layout not recognised: skip the page rather than raise IndexError.
        return
    title, difficulty, duration, stu_number, evaluation = match.groups()
    yield {
        'title': title,
        'difficulty': difficulty,
        'duration': duration,
        'stu_number': stu_number,
        'comprehensive_evaluation': evaluation,
        'url': url,
        'stuname': stuname,
    }


def getClassurl(dict):
    """Collect the course URLs of every category listed in *dict*.

    Args:
        dict: Mapping ``{group name: {category name: listing page URL}}``.
            The parameter keeps its original name (which shadows the
            builtin) so existing keyword callers keep working.

    Returns:
        The shared ``stuname_dict_url`` mapping
        ``{category name: [course URL, ...]}``. It is returned even when
        *dict* is empty, where the original raised ``UnboundLocalError``.
    """
    for class_type in dict:
        for stuname, listing_url in dict[class_type].items():
            parse_one_classUrl(geturl(listing_url), stuname)
    return stuname_dict_url


def write_to_file(name, content):
    """Append *content* as one JSON line to ``../text/<name>.txt``.

    Args:
        name: File name without the ``.txt`` extension.
        content: JSON-serialisable object to write.
    """
    # Built with pathlib so the separator is right on every platform.
    path = Path('..') / 'text' / ('%s.txt' % name)
    # Create the output directory on first use.
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open('a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


# Category tables: {group name: {category name: listing page URL}}.
# Front end
dict_qd = {
    '前端': {
        'vue.js': 'https://coding.imooc.com/?c=vuejs',
        'HTML/CSS': 'https://coding.imooc.com/?c=html',
        'JavaScript': 'https://coding.imooc.com/?c=javascript',
        'Node.js': 'https://coding.imooc.com/?c=nodejs',
    },
}
# Back end
dict_hd = {
    '后端': {
        'java': 'https://coding.imooc.com/?c=java',
        'SpringBoot': 'https://coding.imooc.com/?c=springboot',
        'SpringCloud': 'https://coding.imooc.com/?c=springcloud',
    },
}
# Mobile development
dict_ydkf = {
    '移动开发': {
        'android': 'https://coding.imooc.com/?c=android',
        'ios': 'https://coding.imooc.com/?c=ios',
        'Reactnative': 'https://coding.imooc.com/?c=reactnative',
    },
}
# Cloud computing and big data
dict_yun = {
    '云计算大数据': {
        'hadoop': 'https://coding.imooc.com/?c=hadoop',
        '大数据': 'https://coding.imooc.com/?c=bigdata',
        'Spark': 'https://coding.imooc.com/?c=spark',
        'Docker': 'https://coding.imooc.com/?c=docker',
    },
}
# Databases
dict_db = {
    '数据库': {
        'mysql': 'https://coding.imooc.com/?c=mysql',
        'redis': 'https://coding.imooc.com/?c=redis',
        'mongodb': 'https://coding.imooc.com/?c=mongodb',
    },
}


def main(category_dict=None, output_name="dict_db"):
    """Crawl one category table and append the records to a text file.

    Args:
        category_dict: Table in the ``{group: {category: URL}}`` form used
            by the module-level ``dict_*`` constants. Defaults to
            ``dict_db``.
        output_name: Name passed to ``write_to_file`` for every record.
    """
    if category_dict is None:
        category_dict = dict_db
    with Pool(processes=5) as pool:
        # Blocking call in a worker, equivalent to apply_async(...).get().
        url_dict = pool.apply(getClassurl, (category_dict,))
        for stuname, urls in url_dict.items():
            # Download all course pages of a category concurrently; the
            # original waited on each request before issuing the next.
            pages = pool.map(geturl, urls)
            for url, classhtml in zip(urls, pages):
                print(stuname, url)
                for item in parse_one_page(classhtml, url, stuname):
                    write_to_file(output_name, item)


if __name__ == '__main__':
    main()

 

最终爬到的数据格式:



{"title": "Spring Boot + Vue3 前后端分离 ", "difficulty": "初级", "duration": "18小时", "stu_number": "546", "comprehensive_evaluation": "9.95", "url": "https://coding.imooc.com/class/474.html", "stuname": "mysql"}
{"title": "阿里新零售数据库设计与实战 (升级版)", "difficulty": "初级", "duration": "22小时", "stu_number": "1688", "comprehensive_evaluation": "9.99", "url": "https://coding.imooc.com/class/353.html", "stuname": "mysql"}
{"title": "程序猿必知必会-MySQL 8.0详解与实战", "difficulty": "入门", "duration": "11小时30分钟", "stu_number": "1213", "comprehensive_evaluation": "9.96", "url": "https://coding.imooc.com/class/332.html", "stuname": "mysql"}
{"title": "MySQL面试指南", "difficulty": "中级", "duration": "12小时", "stu_number": "534", "comprehensive_evaluation": "10.00", "url": "https://coding.imooc.com/class/296.html", "stuname": "mysql"}
{"title": "MySQL数据库集群-PXC方案", "difficulty": "中级", "duration": "13小时", "stu_number": "455", "comprehensive_evaluation": "10.00", "url": "https://coding.imooc.com/class/274.html", "stuname": "mysql"}
{"title": "MyCAT+MySQL", "difficulty": "中级", "duration": " 9小时", "stu_number": "753", "comprehensive_evaluation": "10.00", "url": "https://coding.imooc.com/class/208.html", "stuname": "mysql"}
{"title": "Python操作三大主流数据库", "difficulty": "初级", "duration": "10小时", "stu_number": "2018", "comprehensive_evaluation": "9.91", "url": "https://coding.imooc.com/class/114.html", "stuname": "mysql"}
{"title": "高性能可扩展", "difficulty": "中级", "duration": " 8小时10分钟", "stu_number": "1075", "comprehensive_evaluation": "9.88", "url": "https://coding.imooc.com/class/79.html", "stuname": "mysql"}
{"title": "扛得住的MySQL数据库架构", "difficulty": "中级", "duration": "14小时40分钟", "stu_number": "3689", "comprehensive_evaluation": "9.96", "url": "https://coding.imooc.com/class/49.html", "stuname": "mysql"}
{"title": "Spring Boot + Vue3 前后端分离 ", "difficulty": "初级", "duration": "18小时", "stu_number": "546", "comprehensive_evaluation": "9.95", "url": "https://coding.imooc.com/class/474.html", "stuname": "redis"}
{"title": "高级Redis应用进阶课 一站式Redis解决方案", "difficulty": "高级", "duration": "21小时", "stu_number": "295", "comprehensive_evaluation": "10.00", "url": "https://coding.imooc.com/class/467.html", "stuname": "redis"}
{"title": "Spring Cloud分布式微服务实战", "difficulty": "中级", "duration": "35小时", "stu_number": "450", "comprehensive_evaluation": "10.00", "url": "https://coding.imooc.com/class/456.html", "stuname": "redis"}
{"title": "性能优化+架构迭代升级", "difficulty": "中级", "duration": "14小时", "stu_number": "374", "comprehensive_evaluation": "9.95", "url": "https://coding.imooc.com/class/403.html", "stuname": "redis"}
{"title": "Spring Cloud微服务框架 ", "difficulty": "中级", "duration": "29小时52分钟", "stu_number": "841", "comprehensive_evaluation": "9.99", "url": "https://coding.imooc.com/class/380.html", "stuname": "redis"}
{"title": "阿里新零售数据库设计与实战 (升级版)", "difficulty": "初级", "duration": "22小时", "stu_number": "1688", "comprehensive_evaluation": "9.99", "url": "https://coding.imooc.com/class/353.html", "stuname": "redis"}
{"title": "聚焦Java性能优化 打造亿级流量秒杀系统", "difficulty": "高级", "duration": "18小时", "stu_number": "1626", "comprehensive_evaluation": "10.00", "url": "https://coding.imooc.com/class/338.html", "stuname": "redis"}
{"title": "一站式学习Redis ", "difficulty": "中级", "duration": "16小时", "stu_number": "2014", "comprehensive_evaluation": "9.96", "url": "https://coding.imooc.com/class/151.html", "stuname": "redis"}
{"title": "Spring Cloud分布式微服务实战", "difficulty": "中级", "duration": "35小时", "stu_number": "450", "comprehensive_evaluation": "10.00", "url": "https://coding.imooc.com/class/456.html", "stuname": "mongodb"}
{"title": " 全面掌握MongoDB4.0 完成从小白到达人的蜕变", "difficulty": "入门", "duration": "13小时", "stu_number": "875", "comprehensive_evaluation": "9.97", "url": "https://coding.imooc.com/class/324.html", "stuname": "mongodb"}
{"title": "Go语言开发分布式任务调度 ", "difficulty": "中级", "duration": "13小时", "stu_number": "978", "comprehensive_evaluation": "9.98", "url": "https://coding.imooc.com/class/281.html", "stuname": "mongodb"}
{"title": "Python操作三大主流数据库", "difficulty": "初级", "duration": "10小时", "stu_number": "2018", "comprehensive_evaluation": "9.91", "url": "https://coding.imooc.com/class/114.html", "stuname": "mongodb"}

 

posted @ 2021-05-12 00:05  张岂逢  阅读(261)  评论(0)  编辑  收藏  举报