Wooyun latest-confirmed vulnerability crawler v0.02

#coding:utf-8
# WooYun crawler v0.02: walk the "new confirmed" listing pages and pull
# the fields of every bug page out with a single multi-line regex.
import re
import requests

wooyun_url = 'http://www.wooyun.org'
url_page = 'http://www.wooyun.org/bugs/new_confirm/page/'
page_id = 74  # number of listing pages to crawl

# bug links on a listing page: /bugs/wooyun-YYYY-NNNNNN plus the title
wooyun_url_result = re.compile('<a href="(/bugs/wooyun-\d*-\d*)">(.*?)</a>')
# seven capture groups: id, title, vendor, author, date, type, rank;
# [\w\W] / [\s\S] instead of . so matching survives line breaks (see the note below)
wooyun_data_result = re.compile(u'<h3>缺陷编号:.*?">(.*)</a>[\w\W]*漏洞标题:(.*?)</h3>[\s\S]*?<h3 class=\'wybug_corp\'>相关厂商:      <a href="http://www.wooyun.org/corps/(.*?)">[\w\W]*?<h3 class=\'wybug_author\'>漏洞作者:        <a href="http://www.wooyun.org/whitehats/(.*?)">[\w\W]*?<h3 class=\'wybug_date\'>提交时间:(.*?)</h3>[\w\W]*?<h3 class=\'wybug_type\'>漏洞类型:  (.*?)</h3>[\w\W]*?<p class="detail">漏洞Rank:(.*?) </p>')

req_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip',
    'Connection': 'close',
    'Referer': None,
}


def get_html(url):
    return requests.get(url, headers=req_header).text


def spider_data(url):
    # fetch one bug page and print the seven captured fields
    for bug in wooyun_data_result.findall(get_html(url)):
        print u"缺陷编号:%s\n漏洞标题:%s\n相关厂商:%s\n漏洞作者:%s\n提交时间:%s\n漏洞类型:%s\n漏洞Rank:%s\n\n\n" % tuple(field.strip() for field in bug)


def get_data(html):
    # follow every bug link found on one listing page
    for vul_url, _ in wooyun_url_result.findall(html):
        print "next dimensional gate"
        spider_data(wooyun_url + vul_url)


def spider_all(url):
    for i in range(1, page_id):
        get_data(get_html(url + str(i)))
        print i  # progress: listing pages finished so far


if __name__ == '__main__':
    print '\033[0m'  # reset terminal colour state
    spider_all(url_page)
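One caveat about running this: spider_all fires one request per listing page plus one per bug, and a single failed requests.get aborts the whole run. A minimal hardening sketch; get_html_safe, the three retries and the one-second pause are my own additions, not part of the original script:

#coding:utf-8
# Sketch: a drop-in replacement for get_html with timeout, retries and a pause.
import time
import requests

def get_html_safe(url, headers, retries=3, delay=1.0):
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()  # turn 4xx/5xx responses into exceptions
            return resp.text
        except requests.RequestException as e:
            print "request failed (%s), attempt %d of %d" % (e, attempt + 1, retries)
            time.sleep(delay)  # be polite to the server before retrying
    return ''  # give up: the regexes simply find nothing in an empty page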

 

Time spent: quite a while.
Reason: in a regex, .*? matches any character except \n, so the match silently fails wherever the page source contains a line break; that is why the pattern above falls back on [\w\W]*? and [\s\S]*?.
Also, some attribute quotes that show as " in the browser come back as ' in the fetched source, and since the pattern itself is a single-quoted string, each literal ' has to be escaped as \'.
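A standalone snippet (not part of the crawler) that shows both pitfalls in isolation:

#coding:utf-8
import re

# a fragment as it actually comes back: single-quoted attribute, line break
html = '<h3 class=\'wybug_corp\'>相关厂商:\n<a href="/corps/x">Vendor</a></h3>'

# . never matches \n, so .*? cannot cross the line break -> prints None
print re.search('相关厂商:(.*?)<a', html)
# [\w\W] (or [\s\S], or re.DOTALL) matches any character, \n included
print re.search('相关厂商:[\w\W]*?<a href="[^"]*">(.*?)</a>', html).group(1)
# the pattern is a single-quoted string, so a literal ' is written as \'
print re.search('class=\'wybug_corp\'', html).group(0)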

Wooyun latest-confirmed vulnerability crawler v0.01

#coding:utf-8
# WooYun crawler v0.01 (half-finished draft): a regex walks the listing
# pages, BeautifulSoup parses the individual bug pages.
import re
import requests
from bs4 import BeautifulSoup

wooyun_url = 'http://www.wooyun.org'
url_page = 'http://www.wooyun.org/bugs/new_confirm/page/'
page_id = 74  # number of listing pages to crawl

# bug links on a listing page: /bugs/wooyun-YYYY-NNNNNN plus the title
wooyun_url_result = re.compile('<a href="(/bugs/wooyun-\d*-\d*)">(.*?)</a>')

req_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip',
    'Connection': 'close',
    'Referer': None,
}


def get_html(url):
    return requests.get(url, headers=req_header).text


def spider_data(url):
    # parse one bug page; find('h3', 'wybug_title') matches
    # <h3 class="wybug_title">, the second argument being the CSS class
    soup = BeautifulSoup(get_html(url), 'html.parser')
    vul_title = soup.find('h3', 'wybug_title')
    vul_type = soup.find('h3', 'wybug_type')
    vul_detail = soup.find_all('p', 'detail')
    # get_text() rather than .string: .string is None once a tag has
    # more than one child, which would silently print "None"
    print "%s\n" % vul_title.get_text(strip=True)
    print "%s\n" % vul_type.get_text(strip=True)
    for detail in vul_detail:
        print "%s\n" % detail.get_text(strip=True)


def get_data(html):
    # follow every bug link found on one listing page
    for vul_url, vul_title in wooyun_url_result.findall(html):
        bug_page = wooyun_url + vul_url
        print "next dimensional gate"
        spider_data(bug_page)
        print "%s\n%s\n" % (bug_page, vul_title)


def spider_all(url):
    for i in range(1, page_id):
        get_data(get_html(url + str(i)))
        print i  # progress: listing pages finished so far


if __name__ == '__main__':
    print '\033[0m'  # reset terminal colour state
    spider_all(url_page)

 

Time spent: quite a while.
This code is a half-finished draft: plenty of problems, plenty of stray whitespace, plenty of everything. I started with regexes, but the matching was off and they simply would not find the target strings; I then switched to BeautifulSoup, which still had trouble on the finer matches. Rank sorting was never reached at all; one possible direction is sketched below.
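For the rank sorting just mentioned, this is only a sketch: collect_ranked is a hypothetical helper, and it assumes the rank appears exactly as in the v0.02 pattern (<p class="detail">漏洞Rank:...) and is a bare integer.

#coding:utf-8
# Sketch: collect (rank, title) pairs and sort, instead of printing in page order.
import re

rank_result = re.compile(u'<p class="detail">漏洞Rank:(.*?) </p>')

def collect_ranked(bug_pages):
    # bug_pages: iterable of (title, html) pairs the crawler already fetched
    ranked = []
    for title, html in bug_pages:
        m = rank_result.search(html)
        if m:
            # assumes the rank field is a plain integer
            ranked.append((int(m.group(1).strip()), title))
    ranked.sort(reverse=True)  # highest rank first
    return ranked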

 

posted @ 2016-05-22 15:48  v1ce0ye