Is crawling allowed?

# Before writing a crawler, first check whether the target site allows the pages you want to crawl
from urllib.robotparser import RobotFileParser

UrlRobots = 'https://book.douban.com/robots.txt'

def GetRobotsTxt(url):
    # Download and parse robots.txt, then test whether a generic crawler ('*')
    # may fetch a few pages on book.douban.com
    rp = RobotFileParser()
    rp.set_url(url)
    rp.read()
    print(rp.can_fetch('*', 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'))
    print(rp.can_fetch('*', 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4'))
    print(rp.can_fetch('*', 'https://book.douban.com/'))

GetRobotsTxt(UrlRobots)
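
In a real crawler the same check can gate the actual download, so a page is only requested when robots.txt permits it. Below is a minimal sketch of that pattern; the helper name fetch_if_allowed and the example User-Agent string are assumptions of mine, not part of the original snippet.

from urllib.request import Request, urlopen
from urllib.robotparser import RobotFileParser

def fetch_if_allowed(robots_url, page_url, user_agent='*'):
    # Parse robots.txt and only download page_url if the rules allow it
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    if not rp.can_fetch(user_agent, page_url):
        print('Blocked by robots.txt:', page_url)
        return None
    # Hypothetical User-Agent header; replace it with your crawler's real identity
    req = Request(page_url, headers={'User-Agent': 'example-crawler/0.1'})
    with urlopen(req) as resp:
        return resp.read()

html = fetch_if_allowed('https://book.douban.com/robots.txt', 'https://book.douban.com/')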

 

posted @ 2020-11-29 15:56  不迁徙候鸟