Scrapy example: a categorized news spider for Sina (新浪网)

items.py

import scrapy


class SinaspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # title and url of the parent (top-level) category
    parentUrls = scrapy.Field()
    parentTitle = scrapy.Field()

    # title and url of the sub-category
    subUrls = scrapy.Field()
    subTitle = scrapy.Field()

    # directory path where the sub-category is stored
    subFilename = scrapy.Field()

    # article links found under the sub-category
    sonUrls = scrapy.Field()

    # article title and body
    title = scrapy.Field()
    content = scrapy.Field()
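Once the item class is defined, each Field behaves like a dictionary key on an item instance. A quick sanity check (a minimal sketch; the sample values below are made up purely for illustration):

item = SinaspiderItem()
item['parentTitle'] = 'news'                      # hypothetical parent-category title
item['parentUrls'] = 'http://news.sina.com.cn/'   # hypothetical parent-category url
print(dict(item))   # Scrapy items convert cleanly to plain dicts for inspection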

 

spiders/sina.py

# -*- coding: utf-8 -*-
import os

import scrapy

from ..items import SinaspiderItem


class SinaSpider(scrapy.Spider):
    name = 'sina'
    # allowed_domains = ['news.sina.com.cn']  # don't restrict it this narrowly -- unless you only
    # want the "news" parent category, in which case this keeps the spider off the other categories
    allowed_domains = ['sina.com.cn']
    start_urls = ['http://news.sina.com.cn/guide/']

    def parse(self, response):
        items = []

        # urls and titles of all parent categories
        parentUrls = response.xpath("//h3[@class='tit02']/a/@href").extract()
        parentTitle = response.xpath("//h3[@class='tit02']/a/text()").extract()

        # urls and titles of all sub-categories
        subUrls = response.xpath("//ul[@class='list01']/li/a/@href").extract()
        subTitle = response.xpath("//ul[@class='list01']/li/a/text()").extract()

        # walk every parent category
        for i in range(len(parentTitle)):
            # directory path and name for this parent category
            parentFilename = './data/' + parentTitle[i]

            # create the directory if it does not exist yet
            if not os.path.exists(parentFilename):
                os.makedirs(parentFilename)

            # walk every sub-category
            for j in range(len(subUrls)):
                item = SinaspiderItem()

                # save the parent category's title and url
                item['parentUrls'] = parentUrls[i]
                item['parentTitle'] = parentTitle[i]

                # True if the sub-category url starts with this parent category's url
                if_belong = subUrls[j].startswith(parentUrls[i])

                # if it belongs to this parent category, store it under the parent directory
                if if_belong:
                    subFilename = parentFilename + '/' + subTitle[j]

                    # create the directory if it does not exist yet
                    if not os.path.exists(subFilename):
                        os.makedirs(subFilename)

                    # store the sub-category url, title and filename fields
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] = subTitle[j]
                    item['subFilename'] = subFilename

                    items.append(item)

        # request each sub-category url; the Response, together with the meta data,
        # is handed to the second_parse callback
        for item in items:
            yield scrapy.Request(url=item['subUrls'], meta={'meta_1': item}, callback=self.second_parse)

    def second_parse(self, response):
        # recover the meta data carried by this Response
        meta_1 = response.meta['meta_1']

        # collect every link on the sub-category page
        sonUrls = response.xpath("//a/@href").extract()

        items = []
        for i in range(len(sonUrls)):
            # True if the link starts with the parent category url and ends with .shtml
            if_belong = sonUrls[i].startswith(meta_1['parentUrls']) and sonUrls[i].endswith('.shtml')

            # if it belongs to this parent category, copy the fields into one item for transport
            if if_belong:
                item = SinaspiderItem()
                item['parentUrls'] = meta_1['parentUrls']
                item['parentTitle'] = meta_1['parentTitle']
                item['subUrls'] = meta_1['subUrls']
                item['subTitle'] = meta_1['subTitle']
                item['subFilename'] = meta_1['subFilename']
                item['sonUrls'] = sonUrls[i]

                items.append(item)

        # request each article url under the sub-category; the Response, together with
        # the meta data, is handed to the detail_parse callback
        for item in items:
            yield scrapy.Request(url=item['sonUrls'], meta={'meta_2': item}, callback=self.detail_parse)

    # parse the article page: extract its title and body
    def detail_parse(self, response):
        item = response.meta['meta_2']
        # the headline appears under either of two page layouts; the XPath union takes whichever matches
        title = response.xpath("//h1[@class='main-title']/text() | //div[@class='blkContainerSblk']/h1/text()").extract_first("")
        content_list = response.xpath("//div[@class='article']/p/text()").extract()

        # join the text of the <p> tags into one string
        contents = "".join(content_list)

        item['title'] = title.strip()
        item['content'] = contents.strip()

        yield item
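The spider is normally launched from the project root with the scrapy crawl sina command, and the ./data/ directories are created relative to the working directory. If you prefer starting it from a script instead of the CLI, here is a minimal sketch using Scrapy's own API (assuming it runs from the project directory so the project settings are found; the file name run_sina.py is hypothetical):

# run_sina.py -- hypothetical helper script, not part of the original project
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('sina')   # 'sina' is the spider's name attribute
process.start()         # blocks until the crawl finishes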

 

settings.py

# file name for the crawl log
LOG_FILE = 'sina.log'
# minimum log level; messages at this severity and above are recorded
LOG_LEVEL = 'DEBUG'

DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# enable the item pipeline; the value (0-1000) is its priority, lower numbers run first
ITEM_PIPELINES = {
    'sinaSpider.pipelines.SinaspiderPipeline': 300,
}
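To confirm these settings are actually picked up at runtime, Scrapy exposes them on the spider object; a quick check you could drop into any callback (a sketch, assuming the default project layout):

# e.g. at the top of parse() in SinaSpider:
self.logger.info(self.settings.get('LOG_LEVEL'))            # expected: 'DEBUG'
self.logger.info(self.settings.getdict('ITEM_PIPELINES'))   # expected: the pipeline mapping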

 

pipelines.py

class SinaspiderPipeline(object):
    def process_item(self, item, spider):
        sonUrls = item['sonUrls']
        # drop the leading 'http://' and the trailing '.shtml', then flatten the path into a file name
        filename = sonUrls[7:-6].replace('/', '_') + '.txt'
        # write the article body into the sub-category directory created by the spider
        with open(item['subFilename'] + '/' + filename, 'w', encoding='utf-8') as f:
            f.write(item['content'])
        return item
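To see what the slicing in process_item produces, here is a quick check with a made-up article URL (hypothetical, for illustration only):

son_url = 'http://news.sina.com.cn/c/2020-03-21/doc-abcdefg.shtml'   # hypothetical url
filename = son_url[7:-6].replace('/', '_') + '.txt'
# [7:-6] drops the leading 'http://' (7 chars) and the trailing '.shtml' (6 chars);
# replacing '/' flattens the path, giving:
# 'news.sina.com.cn_c_2020-03-21_doc-abcdefg.txt'
print(filename)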

 

posted on 2020-03-21 09:46 by cherry_ning