利用BeautifulSoup爬博客园的网址(待完善)

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import datetime

# URLs that have already been discovered (and printed) by getLinks; shared
# module-level state so repeated calls never revisit the same link.
pages = set()
# Seed the RNG from OS entropy / current time. Passing a datetime object, as
# the original did, is rejected with TypeError on Python 3.11+ (only None,
# int, float, str, bytes and bytearray are accepted as seeds).
# NOTE(review): `random` is not used anywhere in the visible code — confirm
# whether this seeding is still needed.
random.seed()
def _fetch_links(pageUrl):
    """Return the href values of the absolute "www." links found on pageUrl.

    Returns an empty list (after reporting the problem on stderr) when the
    page cannot be fetched, so that one dead link does not abort the crawl.
    """
    import sys

    try:
        # The context manager closes the HTTP response once parsing is done.
        with urlopen(pageUrl) as response:
            # Name the parser explicitly: without it BeautifulSoup warns and
            # picks whichever parser happens to be installed.
            bsObj = BeautifulSoup(response, "html.parser")
    except (OSError, ValueError) as err:
        # URLError/HTTPError are OSError subclasses; ValueError covers
        # malformed URLs such as an unknown scheme.
        print("skipping %s: %s" % (pageUrl, err), file=sys.stderr)
        return []
    # The href filter guarantees every matched tag carries an href attribute.
    anchors = bsObj.find_all("a", href=re.compile(r"(http|https)://www\."))
    return [anchor.attrs["href"] for anchor in anchors]


def getLinks(pageUrl, maxPages=None):
    """Crawl depth-first from pageUrl, printing every newly discovered link.

    Each link not yet present in the module-level ``pages`` set is printed,
    added to ``pages`` and then crawled itself before the remaining links of
    the current page are examined (the same visiting order as a recursive
    crawl). An explicit stack is used instead of recursion so that deep link
    chains cannot exhaust Python's recursion limit.

    Args:
        pageUrl: URL of the page to start crawling from.
        maxPages: optional upper bound on the number of new pages this call
            may discover; ``None`` (the default) means no limit.
    """
    discovered = 0
    # Each stack entry is an iterator over the links of one page that is
    # still being processed; the top of the stack is the deepest page.
    pending = [iter(_fetch_links(pageUrl))]
    while pending:
        newPage = next(pending[-1], None)
        if newPage is None:
            # Current page exhausted: resume its parent.
            pending.pop()
            continue
        if newPage in pages:
            continue
        if maxPages is not None and discovered >= maxPages:
            return
        # We have encountered a new page.
        print(newPage)
        pages.add(newPage)
        discovered += 1
        pending.append(iter(_fetch_links(newPage)))

# Start the crawl from a seed article; this runs as soon as the module is
# executed and performs network I/O.
getLinks("http://www.cnblogs.com/thcnww/p/8650511.html")

posted on 2018-03-28 15:14  Overtimer-加班哥  阅读(125)  评论(0)    收藏  举报

导航