爬取极客学院所有课程信息
# encoding=utf-8
"""Scrape the course catalogue of jikexueyuan.com into a text file.

The crawler walks the paginated course listing, extracts each course block
with regular expressions, and appends title, description, duration and
learner-count fields to ``info.txt``.
"""
import re
import sys

# NOTE: ``sys`` is only needed by the Python 2 default-encoding workaround
# (``reload(sys); sys.setdefaultencoding('utf-8')``), which is left disabled.

# Patterns are compiled once at import time instead of on every call.
_PAGE_NUM_RE = re.compile(r'pageNum=(\d+)')
_COURSE_BLOCK_RE = re.compile(r'(<li id=.*?</li>)', re.S)

# (output key, pattern) pairs, in the order the fields are written to disk.
_FIELD_PATTERNS = (
    ('title',
     re.compile(r'class="lessonimg" title="(.*?)" alt=', re.S)),
    ('content',
     re.compile(r'<p style="height: 0px; opacity: 0; display: none;">'
                r'(.*?)</p>', re.S)),
    ('classtime',
     re.compile(r'class="time-icon"></i><em>(.*?)</em>', re.S)),
    ('classlever',
     re.compile(r'<em class="learn-number">(.*?)</em>', re.S)),
)


class spider(object):
    """Regex-based scraper for the paginated course listing."""

    def __init__(self):
        print('开始爬取内容...')

    def getsourse(self, url, timeout=10):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the crawl indefinitely.

        Returns:
            The decoded response body.
        """
        # Imported lazily so the pure parsing helpers stay importable (and
        # testable) on machines where ``requests`` is not installed.
        import requests

        response = requests.get(url, timeout=timeout)
        return response.text

    def changepage(self, url, total_page):
        """Build the list of page URLs from the page in ``url`` up to ``total_page``.

        Args:
            url: A listing URL containing a ``pageNum=<n>`` parameter; ``n``
                is the first page to generate.
            total_page: Last page number to generate (inclusive).

        Returns:
            A list of URLs, one per page; empty when ``total_page`` is
            smaller than the page number found in ``url``.

        Raises:
            ValueError: If ``url`` contains no ``pageNum=<n>`` parameter.
        """
        match = _PAGE_NUM_RE.search(url)
        if match is None:
            raise ValueError(
                "url has no 'pageNum=<n>' parameter: %r" % (url,))
        now_page = int(match.group(1))
        # The compiled pattern's ``sub`` avoids the original mistake of
        # passing ``re.S`` in the positional ``count`` slot of ``re.sub``.
        return [
            _PAGE_NUM_RE.sub('pageNum=%d' % page, url)
            for page in range(now_page, total_page + 1)
        ]

    def geteveryclass(self, source):
        """Return every ``<li id=...>...</li>`` course block found in ``source``."""
        return _COURSE_BLOCK_RE.findall(source)

    def getinfo(self, eachclass):
        """Extract the course fields from a single course block.

        Args:
            eachclass: HTML text of one ``<li>`` course block.

        Returns:
            A dict with the keys ``title``, ``content``, ``classtime`` and
            ``classlever``. A field whose markup is not present in the block
            is returned as an empty string instead of raising.
        """
        info = {}
        for key, pattern in _FIELD_PATTERNS:
            match = pattern.search(eachclass)
            info[key] = match.group(1) if match else ''
        return info

    def saveinfo(self, classinfo, path='info.txt'):
        """Append the collected course records to a UTF-8 text file.

        Args:
            classinfo: Iterable of dicts as produced by ``getinfo``.
            path: Destination file; it is opened in append mode, so
                existing content is preserved.
        """
        # The ``with`` block closes the file; no explicit close() is needed.
        with open(path, 'a', encoding='utf-8') as f:
            for each in classinfo:
                for key, _ in _FIELD_PATTERNS:
                    f.write(key + ':' + each[key] + '\n')


def main(url='http://www.jikexueyuan.com/course/?pageNum=1', total_page=40):
    """Crawl every listing page and save the extracted courses.

    Args:
        url: First listing page; must contain a ``pageNum=<n>`` parameter.
        total_page: Number of the last listing page to crawl.
    """
    classinfo = []
    jikespider = spider()
    for link in jikespider.changepage(url, total_page):
        print('正在处理页面' + link)
        html = jikespider.getsourse(link)
        for each in jikespider.geteveryclass(html):
            info = jikespider.getinfo(each)
            # ``<li id=...>`` can also match list items that are not
            # courses; skip blocks that yielded no field at all.
            if any(info.values()):
                classinfo.append(info)
    jikespider.saveinfo(classinfo)


if __name__ == '__main__':
    main()

浙公网安备 33010602011771号