Chapter 3: Starting to Crawl
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bsObj = BeautifulSoup(html, features='lxml')
# Keep only article links inside the body text: they start with /wiki/
# and contain no colon (a colon marks special pages such as Talk: or File:)
for link in bsObj.find("div", {"id": "bodyContent"}).findAll(
        "a", href=re.compile("^(/wiki/)((?!:).)*$")):
    if 'href' in link.attrs:
        print(link.attrs['href'])
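Note that urlopen raises an HTTPError (or URLError) when the page is missing or unreachable, and find() returns None if the bodyContent div is absent, which would make the chained findAll throw an AttributeError. A minimal sketch of a guarded version; the helper name getBodyLinks is my own, not from the original:

from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup
import re

def getBodyLinks(url):
    try:
        html = urlopen(url)
    except (HTTPError, URLError):
        return None                      # missing page or unreachable server
    bsObj = BeautifulSoup(html, features='lxml')
    content = bsObj.find("div", {"id": "bodyContent"})
    if content is None:
        return None                      # page layout changed
    return content.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))

links = getBodyLinks("http://en.wikipedia.org/wiki/Kevin_Bacon")
if links is not None:
    for link in links:
        if 'href' in link.attrs:
            print(link.attrs['href'])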
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

# seed with the current time so every run takes a different walk
random.seed(datetime.datetime.now().timestamp())

def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bsObj = BeautifulSoup(html, features='lxml')
    return bsObj.find("div", {"id": "bodyContent"}).findAll(
        "a", href=re.compile("^(/wiki/)((?!:).)*$"))

links = getLinks("/wiki/Kevin_Bacon")
while len(links) > 0:
    # hop to a randomly chosen article link on the current page
    newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)
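As written, the walk only stops when it reaches a page with no article links, which on Wikipedia effectively never happens. A small sketch that caps the walk, reusing getLinks from the block above; maxHops is my own illustrative parameter:

def randomWalk(startUrl, maxHops=10):
    links = getLinks(startUrl)
    for _ in range(maxHops):
        if len(links) == 0:
            break                        # dead end: no article links left
        newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
        print(newArticle)
        links = getLinks(newArticle)

randomWalk("/wiki/Kevin_Bacon")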
Deduplicating pages
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getLinks(pageUrl):
    global pages
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    bsObj = BeautifulSoup(html, features='lxml')
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # we have a new page
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks("")
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now().timestamp())

# Get a list of all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    # Find all links that begin with "/" or contain the current URL
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
    return internalLinks

# Get a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # Find all links that start with "http" or "www" and do not contain the current URL
    for link in bsObj.findAll("a", href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def splitAddress(address):
    addressParts = address.replace("http://", "").split("/")
    return addressParts

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, features='lxml')
    externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        # No external links on this page: recurse into a random internal link
        internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link is: " + externalLink)
    followExternalOnly(externalLink)

followExternalOnly("http://oreilly.com")
Output:
Random external link is: https://itunes.apple.com/us/app/safari-to-go/id881697395
Random external link is: https://www.youtube.com/user/OreillyMedia
Random external link is: https://itunes.apple.com/us/app/safari-to-go/id881697395
Random external link is: https://www.youtube.com/user/OreillyMedia
Random external link is: https://www.youtube.com/user/OreillyMedia
Random external link is: https://www.linkedin.com/company/oreilly-media
Random external link is: https://www.youtube.com/user/OreillyMedia
Random external link is: https://www.facebook.com/OReilly/
Random external link is: https://www.youtube.com/user/OreillyMedia
Random external link is: https://itunes.apple.com/us/app/safari-to-go/id881697395
Random external link is: https://www.linkedin.com/company/oreilly-media
Random external link is: https://twitter.com/oreillymedia
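Notice how the walk keeps bouncing among the same handful of social-media links. A minimal sketch that remembers which external links it has already visited and stops on a repeat, reusing getRandomExternalLink from above; the allExtLinks set is my own addition, not part of the original code:

allExtLinks = set()

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    if externalLink in allExtLinks:
        return                           # already visited once: stop this branch
    allExtLinks.add(externalLink)
    print("Random external link is: " + externalLink)
    followExternalOnly(externalLink)

followExternalOnly("http://oreilly.com")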