1 # -*- coding: utf-8 -*-
# Scrapy spider that crawls every course listed on Jikexueyuan (极客学院).
3 import scrapy
4 from pyquery import PyQuery as pq
5 from jike.items import JikeItem
6
class JikespiderSpider(scrapy.Spider):
    """Crawl the Jikexueyuan course index and emit one item per course.

    The spider requests every paginated index page, follows the course
    link of each list entry, and scrapes three text fields from the
    course detail page into a ``JikeItem``.
    """

    name = "jikespider"
    allowed_domains = ["www.jikexueyuan.com"]
    base_url = 'http://www.jikexueyuan.com/course/?pageNum='
    # Number of the last index page to request (pages are 1-based, inclusive).
    # Can be overridden per run, e.g. ``scrapy crawl jikespider -a last_page=3``.
    last_page = 95

    def start_requests(self):
        """Yield one request per index page, from 1 through ``last_page``."""
        # Spider arguments given with ``-a`` arrive as strings, hence int().
        for page_num in range(1, int(self.last_page) + 1):
            url = self.base_url + str(page_num)
            yield scrapy.Request(url, callback=self.parse_index)

    def parse_index(self, response):
        """Yield a detail-page request for every course on an index page.

        Entries that have no link are skipped instead of aborting the
        whole page.
        """
        doc = pq(response.text)
        for entry in doc('.lesson-list .cf li').items():
            href = entry('.lessonimg-box a').attr('href')
            if not href:
                # attr() returns None when the anchor or its href is missing.
                continue
            # The hrefs on the index page are not complete URLs, so they must
            # be resolved before they can be requested. urljoin() handles
            # protocol-relative, path-relative and absolute hrefs alike,
            # whereas prefixing 'http:' only worked for the first form.
            detail_url = response.urljoin(href)
            yield scrapy.Request(url=detail_url, callback=self.parse_detail)

    def parse_detail(self, response):
        """Extract the text fields of one course detail page.

        Yields a single ``JikeItem`` with ``title``, ``time`` and
        ``content`` set to the text of the matching page elements.
        """
        doc = pq(response.text)
        item = JikeItem()
        item['title'] = doc('.lesson-teacher .bc-box h2').text()
        item['time'] = doc('.lesson-teacher .bc-box .timebox').text()
        item['content'] = doc('.lesson-teacher .infor-content').text()

        yield item