Wooyun最新确认漏洞爬虫V0.02
Wooyun最新确认漏洞爬虫v0.02
1 #coding:utf-8 2 import requests 3 import re 4 from bs4 import BeautifulSoup 5 wooyun_confirm='http://www.wooyun.org/bugs/new_confirm' 6 wooyun_url='http://www.wooyun.org' 7 wooyun_url_result=re.compile('<a href="(/bugs/wooyun-\d*-\d*)">(.*?)</a>') 8 wooyun_data_result=re.compile(u'<h3>缺陷编号:.*?">(.*)</a>[\w\W]*漏洞标题:(.*?)</h3>[\s\S]*?<h3 class=\'wybug_corp\'>相关厂商: <a href="http://www.wooyun.org/corps/(.*?)">[\w\W]*?<h3 class=\'wybug_author\'>漏洞作者: <a href="http://www.wooyun.org/whitehats/(.*?)">[\w\W]*?<h3 class=\'wybug_date\'>提交时间:(.*?)</h3>[\w\W]*?<h3 class=\'wybug_type\'>漏洞类型: (.*?)</h3>[\w\W]*?<p class="detail">漏洞Rank:(.*?) </p>') 9 url_page='http://www.wooyun.org/bugs/new_confirm/page/' 10 page_id=74#yeshu 11 req_header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 12 'Accept':'text/html;q=0.9,*/*;q=0.8', 13 'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 14 'Accept-Encoding':'gzip', 15 'Connection':'close', 16 'Referer':None 17 } 18 19 20 def get_html(url): 21 return requests.get(url,headers=req_header).text 22 23 24 25 def spider_data(url): 26 data_url=get_html(url) 27 wooyun_data_list=wooyun_data_result.findall(data_url) 28 for wooyun_bug_data in wooyun_data_list: 29 print u"缺陷编号:%s\n漏洞标题:%s\n相关厂商:%s\n漏洞作者:%s\n提交时间:%s\n漏洞类型:%s\n漏洞Rank:%s\n\n\n"%(wooyun_bug_data[0],wooyun_bug_data[1].strip(),wooyun_bug_data[2].strip(),wooyun_bug_data[3].strip(),wooyun_bug_data[4].strip(),wooyun_bug_data[5].strip(),wooyun_bug_data[6].strip()) 30 31 def get_data(html): 32 wooyun_vul_list=wooyun_url_result.findall(html) 33 for wooyun_vul_url in wooyun_vul_list: 34 wooyun_url_page=wooyun_url+wooyun_vul_url[0]#每页每个漏洞的链接 35 print "下一个次元之门" 36 spider_data(wooyun_url_page) 37 38 39 def spider_all(url): 40 for i in range(1,page_id): 41 html=get_html(url+str(i)) 42 get_data(html) 43 print i 44 45 46 if __name__ == '__main__': 47 print '\033[0m' 48 spider_all(url_page)
耗时:挺久
原因:正则中的 .*? 只能匹配除 \n 以外的任意字符,因此当网页内容跨行(含换行符)时会出现匹配不到的错误,需改用 [\s\S]*? 或加 re.S 标志
网页上某些描述里写的是双引号 " ,然后输出之后变为单引号 ' ,并且 ' 在单引号字符串中需要用 \ 转义
Wooyun最新确认漏洞爬虫v0.01
1 #coding:utf-8 2 import requests 3 import re 4 from bs4 import BeautifulSoup 5 wooyun_confirm='http://www.wooyun.org/bugs/new_confirm' 6 wooyun_url='http://www.wooyun.org' 7 wooyun_url_result=re.compile('<a href="(/bugs/wooyun-\d*-\d*)">(.*?)</a>') 8 #wooyun_data_result=re.compile('<h3>.*?>(WooYun.*?)</a>.*?<h3 class="wybug_title">(.*?)</h3>.*?<h3 class="wybug_corp">.*?>(.*?)</a>.*?<h3 class="wybug_author">.*?>(.*?)</a>.*?<h3 class="wybug_type">(.*?)</h3>.*?<h3 class="wybug_level">(.*?)</h3>') 9 wooyun_data_result=re.compile('.*?class="wybug_title">(.*?)</h3>.*?class="wybug_corp"') 10 url_page='http://www.wooyun.org/bugs/new_confirm/page/' 11 page_id=74#yeshu 12 out_file=open('wooyun_data.txt','w') 13 #wooyun_name_result=re.compile('<a href=.*>(.*)</a>') 14 req_header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 15 'Accept':'text/html;q=0.9,*/*;q=0.8', 16 'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 17 'Accept-Encoding':'gzip', 18 'Connection':'close', 19 'Referer':None 20 } 21 22 23 def get_html(url): 24 return requests.get(url,headers=req_header).text 25 26 27 28 def spider_data(url): 29 data_url=get_html(url) 30 #wooyun_data_list=wooyun_data_result.findall(data_url) 31 soup=BeautifulSoup(data_url) 32 #wooyun_vul_id=soup.findall() 33 wooyun_vul_title=soup.find('h3','wybug_title') 34 #wooyun_vul_corp=soup.find_all() 35 #wooyun_vul_author= 36 wooyun_vul_type=soup.find('h3','wybug_type') 37 #wooyun_vul_level=soup.find('h3','wybug_level') 38 wooyun_vul_detail=soup.find_all('p','detail') 39 #for wooyun_bug_data in wooyun_data_list: 40 #print "%s\n"%wooyun_bug_data 41 #print "1" 42 print "%s\n"%wooyun_vul_title.string 43 print "%s\n" %wooyun_vul_type.string 44 for i in wooyun_vul_detail: 45 print "%s\n"%i.string 46 47 def get_data(html): 48 wooyun_vul_list=wooyun_url_result.findall(html) 49 #wooyun_name_list=wooyun_name_result.findall(html) 50 for wooyun_vul_url in wooyun_vul_list: 51 
wooyun_url_page=wooyun_url+wooyun_vul_url[0]#每页每个漏洞的链接 52 print "下一个次元之门" 53 spider_data(wooyun_url_page) 54 print "%s\n%s\n"%(wooyun_url_page,wooyun_vul_url[1]) 55 #out_file.write(wooyun_url_page,wooyun_vul_url[1]) 56 #print '\033[1;34;40m' 57 #print vul_str 58 59 #print '\033[0m' 60 #out_file.write("%s\t%s"%(wooyun_url_page,wooyun_vul_url[1])) 61 #print wooyun_vul_url 62 63 64 def spider_all(url): 65 for i in range(1,page_id): 66 html=get_html(url+str(i)) 67 get_data(html) 68 #spider_data(html) 69 print i 70 71 72 if __name__ == '__main__': 73 print '\033[0m' 74 #print '\033[1;34;40m' 75 spider_all(url_page) 76 #out_file.close()
耗时:挺久
代码是个半成品,很多问题,很多空格,很多很多。起初用正则,匹配的时候有点问题,怎么都匹配不到目标字符;后来采用 BeautifulSoup,还是在细节匹配上有些问题。至于 rank 排序,完全没做到那一步
----vincebye---