1. Random walk through Wikipedia articles
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

# Seed with the current time (seed() needs a hashable numeric/str value in Python 3)
random.seed(datetime.datetime.now().timestamp())

def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bsObj = BeautifulSoup(html, "lxml")
    # Return the tags whose URLs start with /wiki/ (the regex excludes
    # colon-containing links, i.e. special namespace pages)
    return bsObj.find("div", {"id": "bodyContent"}).findAll(
        "a", href=re.compile("^(/wiki/)((?!:).)*$"))

links = getLinks("/wiki/Kevin_Bacon")
while len(links) > 0:
    # Randomly pick the URL of the next Wikipedia article
    newArticle = links[random.randint(0, len(links) - 1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)
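As written, the walk never terminates and may revisit pages. A minimal variant of the same idea, as a sketch: the function name randomWalk and the max_hops cap are assumptions for illustration, and random.choice replaces the manual random.randint indexing.

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random

def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bsObj = BeautifulSoup(html, "lxml")
    return bsObj.find("div", {"id": "bodyContent"}).findAll(
        "a", href=re.compile("^(/wiki/)((?!:).)*$"))

def randomWalk(startUrl, max_hops=20):
    # max_hops is a hypothetical parameter: stop after that many hops
    url = startUrl
    for _ in range(max_hops):
        links = getLinks(url)
        if not links:
            break
        # random.choice does the same job as links[random.randint(0, len(links)-1)]
        url = random.choice(links).attrs['href']
        print(url)

randomWalk("/wiki/Kevin_Bacon")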
2. Crawling an entire site: collecting unique URLs

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# pages stores the unique URLs seen so far
pages = set()

def getLinks(pageUrl):
    global pages
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    bsObj = BeautifulSoup(html, "lxml")
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        if "href" in link.attrs:
            newPage = link.attrs['href']
            # Only fetch a URL we have not visited yet; without this
            # check the recursion would never terminate
            if newPage not in pages:
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks("")
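One caveat: Wikipedia has far more pages than CPython's default recursion limit (roughly 1000 frames), so the recursive call above will eventually raise RecursionError. A sketch of the same crawl with an explicit stack instead of recursion; the variable name toVisit is illustrative:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
toVisit = [""]  # start from the site root, as in the recursive version

# Iterative depth-first crawl: an explicit stack sidesteps
# Python's recursion limit entirely
while toVisit:
    pageUrl = toVisit.pop()
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    bsObj = BeautifulSoup(html, "lxml")
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        newPage = link.attrs.get('href')
        if newPage and newPage not in pages:
            print(newPage)
            pages.add(newPage)
            toVisit.append(newPage)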
3. Collecting data across an entire site

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getLinks(pageUrl):
    global pages
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    bsObj = BeautifulSoup(html, "lxml")
    # Exception handling covers pages where some of these attributes are missing
    try:
        print(bsObj.h1.get_text())
        print(bsObj.find(id="mw-content-text").findAll("p")[0])
        print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
    except AttributeError:
        print("This page is missing some attributes! No worries though!")
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                print("----------------\n" + newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks("")
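Besides missing attributes, urlopen itself raises urllib.error.HTTPError or URLError on deleted pages and network failures, which none of the crawlers above handle. A sketch of a defensive fetch helper they could call in place of urlopen; the name safeGet and the one-second courtesy delay are assumptions, not part of the original code:

import time
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup

def safeGet(pageUrl):
    # Hypothetical helper: fetch and parse a page, returning None on failure
    try:
        html = urlopen("http://en.wikipedia.org" + pageUrl)
    except (HTTPError, URLError):
        return None
    time.sleep(1)  # assumed courtesy delay so the crawl doesn't hammer the server
    return BeautifulSoup(html, "lxml")

A caller would then skip any page for which safeGet returns None instead of dereferencing the result directly.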