# BeautifulSoup: fetch a page and print its first <h1> element.
from urllib.request import urlopen

from bs4 import BeautifulSoup

# The `with` block closes the HTTP response as soon as the parser has
# consumed it, instead of leaving the connection open.
with urlopen("http://www.pythonscraping.com/pages/page1.html") as html:
    bsObj = BeautifulSoup(html, 'html.parser')
print(bsObj.h1)

# Handling exceptions
html = urlopen("http://www.pythonscraping.com/pages/page1.html")

# Two kinds of failure can occur on the call above:
# 1. The page does not exist on the server.
# 2. The server does not exist.

# These failures can be handled as shown below.

# Imported here so both exception classes are bound before the handlers
# below can run (the file's later import block has not executed yet).
from urllib.error import HTTPError, URLError

try:
    # Keep the try body down to the single call that can raise.
    html = urlopen("http://www.pythonscraping.com/pages/page1.html")
except HTTPError as e:
    # The server answered, but with an HTTP error status.
    print(e)
    # Return None, abort the program, or fall back to another plan here.
except URLError:
    # The request never got an HTTP answer (e.g. the server does not exist).
    # HTTPError is a subclass of URLError, so this handler must come second.
    print("URL is not found")
else:
    # Reached only when urlopen() raised nothing. Note: if the handlers
    # above return or abort, this `else` is unnecessary and the code can
    # simply follow the try statement.
    if html is None:
        # Defensive check kept from the original snippet.
        print("URL is not found")
    # Otherwise the program continues here.


from urllib.error import HTTPError, URLError
from urllib.request import urlopen

from bs4 import BeautifulSoup
def getTitle(url):
    """Return the ``<h1>`` element found under ``<body>`` of the page at *url*.

    Args:
        url: Address of the page to download and parse.

    Returns:
        The value of ``bsObj.body.h1`` for the parsed page, or ``None`` when
        the page cannot be retrieved or the parsed document has no
        ``<body>`` to look inside.
    """
    try:
        response = urlopen(url)
    except (HTTPError, URLError):
        # HTTPError: the server replied with an error status.
        # URLError: the request never got an HTTP answer (unreachable or
        # non-existent server, unsupported scheme, ...). HTTPError is a
        # subclass of URLError; both are listed to make the intent explicit.
        return None
    try:
        # Close the response as soon as the parser has consumed it.
        with response:
            bsObj = BeautifulSoup(response, 'html.parser')
        # bsObj.body is None when the document has no <body>, in which case
        # the attribute access below raises AttributeError.
        title = bsObj.body.h1
    except AttributeError:
        return None
    return title
# Look up the heading of the sample page and report the outcome.
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
# getTitle() signals every failure by returning None; test identity, not
# equality, so the result object's own __eq__ is never consulted.
if title is None:
    print("Title could not be found")
else:
    print(title)