喵吉欧尼酱

  博客园 :: 首页 :: 博问 :: 闪存 :: 新随笔 :: 联系 :: 订阅 订阅 :: 管理 ::

爬取极客学院（jikexueyuan.com）所有课程信息

# encoding=utf-8
import re,sys
import requests
#兼容python2编码
##################
# reload(sys)
# sys.setdefaultencoding('utf-8')
##################
class spider(object):
    """Scraper that walks the jikexueyuan.com course listing pages,
    extracts per-course info and appends it to info.txt."""

    def __init__(self):
        # Announce start; no state is kept on the instance.
        print('开始爬取内容...')

    def getsourse(self, url):
        """Fetch *url* and return its HTML source as text.

        NOTE(review): relies on requests' auto-detected encoding;
        presumably the site serves UTF-8 — confirm if text is garbled.
        """
        html = requests.get(url)
        return html.text

    def changepage(self, url, total_page):
        """Return the list of listing-page URLs from the current page
        number embedded in *url* up to *total_page* (inclusive)."""
        now_page = int(re.search(r'pageNum=(\d+)', url, re.S).group(1))
        page_group = []
        for i in range(now_page, total_page + 1):
            # BUG FIX: the original passed re.S as re.sub's positional
            # *count* argument (re.S == 16), not as a flag. Replace every
            # occurrence of the page number instead.
            link = re.sub(r'pageNum=\d+', 'pageNum=%s' % i, url)
            page_group.append(link)
        return page_group

    def geteveryclass(self, source):
        """Extract every course <li id=...>...</li> block from the page
        source; returns a list of raw HTML fragments."""
        everyclass = re.findall(r'(<li id=.*?</li>)', source, re.S)
        return everyclass

    def getinfo(self, eachclass):
        """Extract the fields we need from one course block.

        Returns a dict with keys title/content/classtime/classlever.
        Raises AttributeError if any expected pattern is absent.
        """
        # BUG FIX: the original built a throwaway set() that was
        # immediately replaced by the dict literal; removed.
        info = {}
        info['title'] = re.search(r'class="lessonimg" title="(.*?)" alt=', eachclass, re.S).group(1)
        info['content'] = re.search(r'<p style="height: 0px; opacity: 0; display: none;">(.*?)</p>', eachclass, re.S).group(1)
        info['classtime'] = re.search(r'class="time-icon"></i><em>(.*?)</em>', eachclass, re.S).group(1)
        info['classlever'] = re.search(r'<em class="learn-number">(.*?)</em>', eachclass, re.S).group(1)
        return info

    def saveinfo(self, classinfo):
        """Append every course dict in *classinfo* to info.txt."""
        with open('info.txt', 'a', encoding='utf-8') as f:
            for each in classinfo:
                # write(), not writelines(): each argument is one string.
                f.write('title:' + each['title'] + '\n')
                f.write('content:' + each['content'] + '\n')
                f.write('classtime:' + each['classtime'] + '\n')
                f.write('classlever:' + each['classlever'] + '\n')
            # No explicit close: the with-statement handles it
            # (the original's redundant f.close() was removed).

if __name__ == '__main__':
    # Entry point: enumerate all 40 listing pages, scrape every course
    # block on each, then persist the accumulated results in one pass.
    collected = []
    start_url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    crawler = spider()
    for page_link in crawler.changepage(start_url, 40):
        print('正在处理页面' + page_link)
        page_source = crawler.getsourse(page_link)
        for course_block in crawler.geteveryclass(page_source):
            collected.append(crawler.getinfo(course_block))
    crawler.saveinfo(collected)

 

posted on 2017-11-06 15:08  喵吉欧尼酱  阅读(148)  评论(0)    收藏  举报