# General-purpose crawler skeleton (爬虫通用框架)
2 import requests
3
def get_html_text(url):
    """Fetch *url* with HTTP GET and return the response body as text.

    The request is made with a 20-second timeout. Error status codes are
    turned into exceptions by ``raise_for_status()``, and the response
    encoding is set from ``apparent_encoding`` before the body is decoded.

    Args:
        url: Address to request.

    Returns:
        The decoded response body, or the literal string "产生异常" when
        the request fails.
    """
    try:
        r = requests.get(url, timeout=20)
        r.raise_for_status()
        # Use the encoding guessed from the content rather than the one
        # derived from the HTTP headers.
        r.encoding = r.apparent_encoding
        return r.text
    # Catch only request-level failures; a bare ``except`` would also hide
    # KeyboardInterrupt, SystemExit and programming errors.
    except requests.RequestException:
        return "产生异常"
16
if __name__ == "__main__":
    # Script entry point: download one fixed page and echo the result
    # (the page text on success, the function's failure string otherwise).
    url = "http://www.baidu.com"
    page_text = get_html_text(url)
    print(page_text)
20
21
# Example (实例)
23 import requests
24 from bs4 import BeautifulSoup
25
def getHTMLText(url):
    """Fetch *url* with HTTP GET and return the response body as text.

    Args:
        url: Address to request.

    Returns:
        The decoded response body, or the literal string "产生异常" when
        the request fails.
    """
    try:
        r = requests.get(url, timeout=20)  # give up after 20 seconds
        r.raise_for_status()  # raise on an error status code
        r.encoding = r.apparent_encoding  # decode with the guessed encoding
        return r.text  # decoded page content
    # Catch only request-level failures; a bare ``except`` would also hide
    # KeyboardInterrupt, SystemExit and programming errors.
    except requests.RequestException:
        return "产生异常"
34
if __name__ == '__main__':
    url = "https://book.douban.com/subject/1084336/comments/"  # page to request
    # Do not name this variable `requests`: that would rebind the imported
    # module to a string and break any later call to getHTMLText().
    html = getHTMLText(url)  # page text, or the failure string
    soup = BeautifulSoup(html, "html.parser")  # parse the markup
    comments = soup.find_all("div", class_="comment")  # one <div> per comment
    for comment in comments:
        # `.p` is None when the <div> has no <p> child; skip those rather
        # than crash with AttributeError on `.text`.
        if comment.p is not None:
            print(comment.p.text)  # print the comment text