爬取百度百科

 1 import urllib.request
 2 from bs4 import BeautifulSoup
 3 import re
 4 
 5 def main():
 6     response= urllib.request.urlopen('http://baike.baidu.com/view/284853.htm').read()
 7     soup = BeautifulSoup(response,'html.parser')#使用python默认的解析器
 8     for each in soup.find_all(href = re.compile('view')):
 9         print(each.text,'->',''.join(['http://baike.baidu.com/',each['href']]))#join函数明显比+提高
10 if __name__=='__main__':
11     main()

 

posted @ 2017-04-12 22:26  道高一尺  阅读(757)  评论(0)  编辑  收藏  举报