# -*-encoding:utf8-*-
import re
import requests
import sys
reload(sys)
sys.setdefaultencoding('utf8')
class crawler:
    """Scrape course listings from paginated jikexueyuan.com course pages.

    The page HTML is parsed with the regular expressions declared below
    (always applied with ``re.S`` so ``.`` also spans newlines).  Each course
    becomes a dict with the keys ``title``, ``timespan``, ``difficulty``,
    ``introduce`` and ``learningnum``.
    """

    # One course entry in the listing; the other patterns run on its capture.
    ITEM_PATTERN = '<div class="lessonimg-box">(.*?)</li>'
    TITLE_PATTERN = 'class="lesson-info-h2">.*?>(.*?)</a>'
    COURSE_TIME_SPAN_PATTERN = 'class="time-icon"></i><em>(.*?)</em>'
    # The '.' after "xinhao-icon" is a regex wildcard for one trailing char.
    COURSE_DIFFICULTY_PATTERN = 'class="xinhao-icon."></i><em>(.*?)</em>'
    COURSE_INTRODUCE_PATTERN = '<h2 class="lesson-info-h2"><a.*?<p.*?>(.*?)</p>'
    LEARNING_NUM = '<em class="learn-number">(.*?)</em>'
    # The two patterns below are declared but not used by any method here.
    IMG_URL_PATTERN = '<img src="(.*?)" class="lessonimg".*?>'
    TOTAL_PAGE_NUM_PATTERN = '<li class="thpoint pagetotal" style="margin-top:3px;">(.*?)</li>'

    def __init__(self, url='http://www.jikexueyuan.com/course/?pageNum=1'):
        """Remember the listing URL and prepare the HTTP request headers.

        :param url: a listing page URL; ``geteachpagelink`` requires it to
            contain a ``pageNum=<number>`` parameter.
        """
        self.url = url
        # A header mapping (not a "Name:value" string) so it can be handed to
        # requests via the ``headers=`` keyword.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/49.0.2623.87 Safari/537.36',
        }

    @staticmethod
    def _firstmatch(pattern, text, strip=False):
        """Return the first capture of ``pattern`` in ``text``, or ''.

        :param strip: when true, surrounding whitespace is removed.
        """
        found = re.findall(pattern, text, re.S)
        if not found:
            return ''
        return found[0].strip() if strip else found[0]

    def _parsecourse(self, item):
        """Build the course dict for one ITEM_PATTERN capture."""
        return {
            'title': self._firstmatch(self.TITLE_PATTERN, item, strip=True),
            'timespan': self._firstmatch(self.COURSE_TIME_SPAN_PATTERN, item),
            'difficulty': self._firstmatch(self.COURSE_DIFFICULTY_PATTERN, item, strip=True),
            'introduce': self._firstmatch(self.COURSE_INTRODUCE_PATTERN, item, strip=True),
            'learningnum': self._firstmatch(self.LEARNING_NUM, item),
        }

    def getcontent(self, currentpagenum, url):
        """Download one listing page and return its courses.

        :param currentpagenum: page label, used only in the progress message.
        :param url: address of the page to fetch.
        :returns: list of course dicts, one per ITEM_PATTERN match; a field
            whose pattern does not match is the empty string.
        """
        # Single parenthesised argument: prints identically as a Python 2
        # print statement and as a print() call.
        print('正在抓取第%s页数据...%s' % (currentpagenum, url))
        # The second positional argument of requests.get is ``params``, so
        # the headers must be passed by keyword to be sent as HTTP headers.
        html = requests.get(url, headers=self.headers).text
        return [self._parsecourse(item)
                for item in re.findall(self.ITEM_PATTERN, html, re.S)]

    def setcurpagelink(self, url):
        """Replace the URL that ``geteachpagelink`` derives its links from."""
        self.url = url

    def geteachpagelink(self, totalpages=90):
        """Return the URLs of pages 1..``totalpages``.

        Each link is ``self.url`` with its ``pageNum=<n>`` value replaced.
        As a side effect ``self.currentpage`` is set to the page number
        currently present in ``self.url``.

        :param totalpages: number of links to generate.  The total is not
            read from the site, so it defaults to a fixed 90.
        :raises ValueError: if ``self.url`` contains no ``pageNum=<digits>``.
        """
        current = re.search(r'pageNum=(\d+)', self.url)
        if current is None:
            raise ValueError('url has no pageNum=<number> parameter: %r' % (self.url,))
        self.currentpage = int(current.group(1))
        # No flags are needed here; note that the 4th positional argument of
        # re.sub is ``count``, not ``flags``.
        return [re.sub(r'pageNum=\d+', 'pageNum=%s' % pagenum, self.url)
                for pagenum in range(1, totalpages + 1)]

    def getcourseinfo(self, course):
        """Format one course dict as a human-readable text record.

        :param course: dict with the keys ``title``, ``difficulty``,
            ``timespan``, ``learningnum`` and ``introduce``.
        :returns: the formatted record, terminated by a blank line.
        """
        return '课程名称:%s \n难度:%s 课时:%s 学习人数:%s \n简介:%s\n\n' % (
            course['title'], course['difficulty'], course['timespan'],
            course['learningnum'], course['introduce'])

    def save(self, content):
        """Append ``content`` to ``course_data.txt`` in the working directory."""
        # ``with`` closes the file even when write() raises.
        with open('course_data.txt', 'a') as output:
            output.write(content)
# Driver: walk every listing page in order and append one formatted section
# per page to the output file.
p = crawler()
links = p.geteachpagelink()
for pageindex, eachpagelink in enumerate(links):
    pagelabel = str(pageindex + 1)
    p.setcurpagelink(eachpagelink)
    courses = p.getcontent(pagelabel, eachpagelink)
    # Dumping every key/value pair of each course was judged hard to read,
    # so each course is rendered through getcourseinfo() instead.
    sections = ['\n-----第%s页---------------------------------------------------------------'
                '-------------------------------\n' % pagelabel]
    for course in courses:
        sections.append(p.getcourseinfo(course))
    p.save(''.join(sections))