
Chapter 3: Starting to Crawl

 

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bsObj = BeautifulSoup(html, features='lxml')
# Only follow article links: they sit inside the bodyContent div, start with
# /wiki/, and contain no colon (a colon marks special pages such as Category:)
for link in bsObj.find("div", {"id": "bodyContent"}).findAll("a",
                                    href=re.compile("^(/wiki/)((?!:).)*$")):
    if 'href' in link.attrs:
        print(link.attrs['href'])
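
The regular expression ^(/wiki/)((?!:).)*$ keeps only article links: hrefs that start with /wiki/ and contain no colon. A quick sanity check of that pattern against a few made-up sample hrefs (the sample list is just an illustration, not from the original):

import re

pattern = re.compile("^(/wiki/)((?!:).)*$")
samples = ["/wiki/Kevin_Bacon", "/wiki/Category:Actors", "#cite_note-1", "/wiki/Footloose"]
print([s for s in samples if pattern.match(s)])
# -> ['/wiki/Kevin_Bacon', '/wiki/Footloose']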

 

from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

# Seed the random generator with the current time
random.seed(datetime.datetime.now().timestamp())

def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bsObj = BeautifulSoup(html, features='lxml')
    # Return every article link found in the body of the page
    return bsObj.find("div", {"id": "bodyContent"}).findAll("a",
                    href=re.compile("^(/wiki/)((?!:).)*$"))

links = getLinks("/wiki/Kevin_Bacon")
while len(links) > 0:
    # Pick a random article link and follow it
    newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)
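
The walk above only stops when it reaches a page with no qualifying article links. A minimal bounded variant (the randomWalk name and maxHops parameter are illustrative assumptions, not from the original) stops after a fixed number of hops:

def randomWalk(startUrl, maxHops=10):
    # Follow random article links for at most maxHops pages
    links = getLinks(startUrl)
    for _ in range(maxHops):
        if len(links) == 0:
            break
        newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
        print(newArticle)
        links = getLinks(newArticle)

randomWalk("/wiki/Kevin_Bacon")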

Deduplicating pages

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    global pages
    html = urlopen("http://en.wikipedia.org"+pageUrl)
    bsObj = BeautifulSoup(html, features='lxml')
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # we have a new pageUrl
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
                
getLinks("")

 

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now())

# Retrieve a list of all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    # Find all links that begin with "/" or contain the current URL
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
    return internalLinks
    
# Retrieve a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # Find all links that start with "http" or "www" and do not contain the current URL
    for link in bsObj.findAll("a",
                        href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks
    
def splitAddress(address):
    addressParts = address.replace("http://", "").split("/")
    return addressParts
    
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, features='lxml')
    externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        # No external links on this page, so follow a random internal link instead
        internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])
        nextPage = internalLinks[random.randint(0, len(internalLinks)-1)]
        if nextPage.startswith("/"):
            # Turn a relative internal link into an absolute URL
            nextPage = "http://" + splitAddress(startingPage)[0] + nextPage
        return getRandomExternalLink(nextPage)
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link is: " + externalLink)
    followExternalOnly(externalLink)

followExternalOnly("http://oreilly.com")

Output:

Random external link is: https://itunes.apple.com/us/app/safari-to-go/id881697395
Random external link is: https://www.youtube.com/user/OreillyMedia
Random external link is: https://itunes.apple.com/us/app/safari-to-go/id881697395
Random external link is: https://www.youtube.com/user/OreillyMedia
Random external link is: https://www.youtube.com/user/OreillyMedia
Random external link is: https://www.linkedin.com/company/oreilly-media
Random external link is: https://www.youtube.com/user/OreillyMedia
Random external link is: https://www.facebook.com/OReilly/
Random external link is: https://www.youtube.com/user/OreillyMedia
Random external link is: https://itunes.apple.com/us/app/safari-to-go/id881697395
Random external link is: https://www.linkedin.com/company/oreilly-media
Random external link is: https://twitter.com/oreillymedia
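
Because followExternalOnly calls itself unconditionally, it only stops when a request fails or the recursion limit is reached. A loop-based sketch that bounds the number of hops (the followExternalOnlyLoop name and hops parameter are illustrative assumptions):

def followExternalOnlyLoop(startingSite, hops=20):
    # Jump to a random external link at most `hops` times
    site = startingSite
    for _ in range(hops):
        site = getRandomExternalLink(site)
        print("Random external link is: " + site)

followExternalOnlyLoop("http://oreilly.com")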

 
