Scrapy example: a Sina news category spider
items.py
import scrapy


class SinaspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # Title and URL of each top-level category
    parentUrls = scrapy.Field()
    parentTitle = scrapy.Field()

    # Title and URL of each sub-category
    subUrls = scrapy.Field()
    subTitle = scrapy.Field()

    # Directory path where the sub-category is stored
    subFilename = scrapy.Field()

    # Article links found under the sub-category
    sonUrls = scrapy.Field()

    # Article title and content
    title = scrapy.Field()
    content = scrapy.Field()
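The item class declares every field the spider will fill. Scrapy items behave like dictionaries restricted to the declared keys, so assigning to an undeclared field fails immediately. A minimal usage sketch (the assigned values are made up for illustration):

item = SinaspiderItem()
item['parentTitle'] = '新闻'                      # declared field, works like a dict key
item['parentUrls'] = 'http://news.sina.com.cn/'
# item['author'] = 'x'                            # would raise KeyError: 'author' is not a declared field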
spiders/sina.py
# -*- coding: utf-8 -*-
import os

import scrapy

from ..items import SinaspiderItem


class SinaSpider(scrapy.Spider):
    name = 'sina'
    # allowed_domains = ['news.sina.com.cn']  # restricting to news.sina.com.cn would limit the crawl to the news category only
    allowed_domains = ['sina.com.cn']
    start_urls = ['http://news.sina.com.cn/guide/']

    def parse(self, response):
        items = []

        # URLs and titles of all top-level categories
        parentUrls = response.xpath("//h3[@class='tit02']/a/@href").extract()
        parentTitle = response.xpath("//h3[@class='tit02']/a/text()").extract()

        # URLs and titles of all sub-categories
        subUrls = response.xpath("//ul[@class='list01']/li/a/@href").extract()
        subTitle = response.xpath("//ul[@class='list01']/li/a/text()").extract()

        # Walk every top-level category
        for i in range(0, len(parentTitle)):
            # Directory path and name for this top-level category
            parentFilename = './data/' + parentTitle[i]

            # Create the directory if it does not exist yet
            if not os.path.exists(parentFilename):
                os.makedirs(parentFilename)

            # Walk every sub-category
            for j in range(0, len(subUrls)):
                item = SinaspiderItem()

                # Save the title and URL of the top-level category
                item['parentUrls'] = parentUrls[i]
                item['parentTitle'] = parentTitle[i]

                # Check whether the sub-category URL starts with this top-level category's URL
                if_belong = subUrls[j].startswith(parentUrls[i])

                # If it belongs to this category, place its storage directory under the category directory
                if if_belong:
                    subFilename = parentFilename + '/' + subTitle[j]

                    # Create the directory if it does not exist yet
                    if not os.path.exists(subFilename):
                        os.makedirs(subFilename)

                    # Store the sub-category URL, title and directory fields
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] = subTitle[j]
                    item['subFilename'] = subFilename

                    items.append(item)

        # Send a Request for each sub-category URL; the Response, together with the meta data,
        # is handed to the second_parse callback
        for item in items:
            yield scrapy.Request(url=item['subUrls'], meta={'meta_1': item}, callback=self.second_parse)

    def second_parse(self, response):
        # Retrieve the meta data attached to this Response
        meta_1 = response.meta['meta_1']

        # Extract every link on the sub-category page
        sonUrls = response.xpath("//a/@href").extract()

        items = []
        for i in range(0, len(sonUrls)):
            # Check whether the link starts with the top-level category URL and ends with .shtml
            if_belong = sonUrls[i].startswith(meta_1['parentUrls']) and sonUrls[i].endswith('.shtml')

            # If it belongs to this category, collect all fields into one item so they travel together
            if if_belong:
                item = SinaspiderItem()
                item['parentUrls'] = meta_1['parentUrls']
                item['parentTitle'] = meta_1['parentTitle']
                item['subUrls'] = meta_1['subUrls']
                item['subTitle'] = meta_1['subTitle']
                item['subFilename'] = meta_1['subFilename']
                item['sonUrls'] = sonUrls[i]

                items.append(item)

        # Send a Request for each article link; the Response, together with the meta data,
        # is handed to the detail_parse callback
        for item in items:
            yield scrapy.Request(url=item['sonUrls'], meta={'meta_2': item}, callback=self.detail_parse)

    # Parse the article page and extract its title and content
    def detail_parse(self, response):
        item = response.meta['meta_2']
        # Either of the two page layouts may hold the title
        title = response.xpath("//h1[@class='main-title']/text() | //div[@class='blkContainerSblk']/h1/text()").extract_first("")
        content_list = response.xpath("//div[@class='article']/p/text()").extract()

        # Join the text of all <p> tags into one string
        contents = ""
        for content in content_list:
            contents += content

        item['title'] = title
        item['content'] = contents.strip()

        yield item
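The spider threads the half-filled item between callbacks through response.meta, first as meta_1 and then as meta_2. On Scrapy 1.7 and later the same hand-off can also be written with cb_kwargs, which delivers the item to the callback as a plain keyword argument; a minimal sketch of that variant, reusing the names from the spider above:

# inside parse(), instead of packing the item into meta:
yield scrapy.Request(url=item['subUrls'],
                     cb_kwargs={'meta_1': item},
                     callback=self.second_parse)

# second_parse then receives the item directly:
def second_parse(self, response, meta_1):
    sonUrls = response.xpath("//a/@href").extract()
    # ... same filtering logic as above ...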
settings.py
# File to which log messages are written
LOG_FILE = 'sina.log'
# Minimum log level; messages at this severity and above are recorded (DEBUG records everything)
LOG_LEVEL = 'DEBUG'

DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

ITEM_PIPELINES = {
    'sinaSpider.pipelines.SinaspiderPipeline': 300,
}
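The value 300 in ITEM_PIPELINES is the pipeline's order: when several pipelines are enabled, items pass through them from the lowest number to the highest (values are customarily kept in the 0-1000 range). With these settings in place, and assuming the project package is named sinaSpider as the pipeline path implies, the crawl is started from the project root:

scrapy crawl sina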
pipelines.py
class SinaspiderPipeline(object):
    def process_item(self, item, spider):
        sonUrls = item['sonUrls']
        # Strip the leading 'http://' and trailing '.shtml', then replace '/' with '_' to build a file name
        filename = sonUrls[7:-6].replace('/', '_') + '.txt'
        # Write the article text into its sub-category directory (utf-8 so Chinese text is saved correctly)
        with open(item['subFilename'] + '/' + filename, 'w', encoding='utf-8') as f:
            f.write(item['content'])
        return item
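The slice sonUrls[7:-6] drops the leading 'http://' (7 characters) and the trailing '.shtml' (6 characters), and the remaining slashes become underscores, so each article ends up as a flat .txt file inside its sub-category directory. A quick check with a made-up article URL (purely illustrative; note the 7-character offset assumes http:// links, an https URL would need an 8-character offset):

url = 'http://news.sina.com.cn/c/doc-example.shtml'   # hypothetical URL for illustration
filename = url[7:-6].replace('/', '_') + '.txt'
print(filename)   # news.sina.com.cn_c_doc-example.txt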