# Crawl cnblogs.com (博客园) URLs with BeautifulSoup (work in progress).
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import datetime
# Every link discovered so far; shared across getLinks() calls so that no
# URL is reported or crawled twice.
pages = set()

# random.seed() only accepts None, int, float, str, bytes or bytearray;
# passing a datetime object raises TypeError on Python 3.11+. Seed with the
# float POSIX timestamp instead, which keeps the "seed from current time"
# intent.
random.seed(datetime.datetime.now().timestamp())
# Matches hrefs containing an absolute http(s)://www. URL. Compiled once so
# it is not rebuilt for every page that is fetched.
_WWW_LINK = re.compile(r"(http|https)://www\.")


def _fetch_links(url):
    """Download *url* and return the href of every matching ``<a>`` tag."""
    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(url) as response:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed (and to avoid bs4's
        # "no parser was explicitly specified" warning).
        soup = BeautifulSoup(response, "html.parser")
    # The href=... filter only matches tags that actually have an href
    # attribute, so no separate presence check is needed.
    return [anchor["href"] for anchor in soup.find_all("a", href=_WWW_LINK)]


def getLinks(pageUrl):
    """Crawl depth-first from *pageUrl*, printing each newly found link.

    Every ``<a href>`` matching ``http(s)://www.`` that is not yet in the
    module-level ``pages`` set is printed, added to ``pages`` and then
    crawled in turn, in the same order the recursive formulation would
    visit them.

    Args:
        pageUrl: URL of the page to start crawling from.

    Returns:
        None. Results are printed and accumulated in ``pages``.

    Raises:
        OSError, ValueError, http.client.HTTPException: if the starting
            page itself cannot be fetched. Failures on pages discovered
            later are reported on stderr and skipped.
    """
    import sys
    from http.client import HTTPException

    # An explicit stack of per-page link iterators replaces recursion: a
    # real crawl goes far deeper than Python's recursion limit allows.
    stack = [iter(_fetch_links(pageUrl))]
    while stack:
        for href in stack[-1]:
            if href in pages:
                continue
            # We have found a new page.
            print(href)
            pages.add(href)
            try:
                stack.append(iter(_fetch_links(href)))
            except (OSError, ValueError, HTTPException) as err:
                # One dead or malformed link must not abort the whole
                # crawl; report it and carry on with the next link.
                print("skipping {}: {}".format(href, err), file=sys.stderr)
                continue
            # Descend into the page just pushed before resuming this one.
            break
        else:
            # Current page has no unvisited links left; back up one level.
            stack.pop()
# Start the crawl from a single cnblogs article used as the seed page.
getLinks("http://www.cnblogs.com/thcnww/p/8650511.html")
# Posted on 2018-03-28 15:14 by Overtimer-加班哥; views (125), comments (0).
# Site registration: 浙公网安备 33010602011771号