import scrapy
from scrapy.selector import Selector

# XiaoshuoItem is defined in the project's items.py; the package name below
# is an assumption based on the item class name.
from xiaoshuo.items import XiaoshuoItem


class xiaoshuoSpider(scrapy.Spider):
    name = "freenovel"
    headers = {
        'Upgrade-Insecure-Requests': '1',
    }
    def start_requests(self):
        # Seed the crawl with the listing page for completed, free novels.
        start_urls = ["<start URL>"]  # placeholder kept from the original source
        for url in start_urls:
            yield scrapy.Request(url=url, headers=self.headers, callback=self.first_parse)

    def first_parse(self, response):
        # Collect every category name and link from the site's filter list.
        sel = Selector(response)
        category = sel.css('div[class="select-list"] div ul[type="category"] li a::text').extract()
        category_url = sel.css('div[class="select-list"] div ul[type="category"] li a::attr(href)').extract()
        items = []
        for i in range(1, len(category_url)):  # skip index 0, presumably the "all categories" entry
            item = XiaoshuoItem()
            item['category'] = category[i]
            item['category_url'] = "https:" + category_url[i]
            items.append(item)
        for item in items:
            yield scrapy.Request(url=item['category_url'], meta={"category": item['category']},
                                 callback=self.second_parse, headers=self.headers)

    def second_parse(self, response):
        # Follow every novel on the category page to its catalog (#Catalog) tab.
        sel = Selector(response)
        novel_url = sel.css('div[class="book-mid-info"] h4 a::attr(href)').extract()
        category = response.meta['category']
        for url in novel_url:
            yield scrapy.Request(url="https:" + url + "#Catalog", meta={"category": category},
                                 callback=self.article_parse, headers=self.headers)

    def article_parse(self, response):
        # Grab the novel title and the link to its first chapter; detail_parse
        # then walks the rest of the book via the "next chapter" link.
        sel = Selector(response)
        article_name = sel.xpath('//h1/em/text()').extract_first()
        article_url = sel.css(
            'div[id="j-catalogWrap"] div[class="volume-wrap"] div[class="volume"] ul li a::attr(href)').extract_first()
        article_url = "https:" + article_url
        yield scrapy.Request(url=article_url,
                             meta={'article_name': article_name, "category": response.meta['category']},
                             callback=self.detail_parse, headers=self.headers)

    def detail_parse(self, response):
        # Extract one chapter, yield it as an item, then follow the next-chapter link.
        sel = Selector(response)
        item = XiaoshuoItem()
        content_list = sel.css(
            'div[id="j_chapterBox"] div[class="text-wrap"] div[class="main-text-wrap"] '
            'div[class="read-content j_readContent"] p::text').extract()
        content_name = sel.css('h3[class="j_chapterName"]::text').extract_first()
        next_page = sel.css('a[id="j_chapterNext"]::attr(href)').extract_first()
        item['content'] = "".join(content_list)
        item['content_name'] = content_name
        item['article_name'] = response.meta['article_name']
        item['category'] = response.meta['category']
        yield item
        if next_page is not None:
            yield scrapy.Request(url="https:" + next_page,
                                 meta={'article_name': item['article_name'], "category": item['category']},
                                 callback=self.detail_parse, headers=self.headers)
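

# For reference, a minimal items.py sketch covering the fields this spider
# populates. The field names are taken from the spider itself; placing the
# class in the project's items.py is the standard Scrapy layout assumed here.
import scrapy


class XiaoshuoItem(scrapy.Item):
    category = scrapy.Field()      # category name from the filter list
    category_url = scrapy.Field()  # link to the category listing page
    article_name = scrapy.Field()  # novel title
    content_name = scrapy.Field()  # chapter title
    content = scrapy.Field()       # chapter body text

# Typical invocation from the project root:
#   scrapy crawl freenovel -o novels.json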