Python 3 调用 BeautifulSoup 进行简单的网页处理
from bs4 import BeautifulSoup

# Parse index.html into a BeautifulSoup tree and dump it in three forms.
# NOTE(review): the original author flagged a pitfall on the open() call --
# presumably the encoding must match how index.html was actually saved; confirm.
# The context manager closes the handle once parsing is done (the original
# left it open for the lifetime of the script).
with open('index.html', 'r', encoding='utf-16-le') as file:
    soup = BeautifulSoup(file, 'lxml')

print(soup)  # the document as parsed
print('\n ------------- \n')
print(soup.get_text())  # text content of every tag
print('\n ------------- \n')
print(soup.prettify())  # indented, formatted markup
# Reach into the tree through attributes named after tags.
# Each value is printed followed by the same separator line used elsewhere.
print(soup.title, '\n ------------- \n', sep='\n')
print(soup.body, '\n ------------- \n', sep='\n')
print(soup.body.div)
import re

# find_all() with a plain string searches for tags with that name.
line_breaks = soup.find_all('br')
print(line_breaks)
print('\n ------------- \n')

# A compiled regular expression is matched against tag names;
# '^b' selects tags whose name starts with "b".
starts_with_b = re.compile('^b')
print(soup.find_all(starts_with_b))
print('\n ------------- \n')

# Keyword arguments filter on attributes, here the id attribute.
print(soup.find_all(id='wiz_custom_css'))
print('\n ------------- \n')

# Print every string in the document, one per print call.
# (.stripped_strings could be used instead to drop surrounding whitespace.)
for text_node in soup.strings:
    print(text_node)
print('\n ------------- \n')
# Strip the markup from <body> and save the remaining text to a Markdown file
# named after the page title (marked "to be improved" by the original author).

# Remove <script> and <style> elements so their contents stay out of the text.
for script in soup(["script", "style"]):
    script.extract()

# soup.title is None when the page has no <title>; treat that like an empty
# title instead of crashing. Surrounding whitespace is trimmed so it does not
# end up in the file name.
title_text = soup.title.get_text().strip() if soup.title is not None else ''

# One string per line, each followed by a newline.
# (.stripped_strings could be used instead to drop surrounding whitespace.)
str_text = ''.join(strr + '\n' for strr in soup.body.strings)
print(str_text)

# Fall back to index.md when there is no usable title.
# NOTE(review): titles containing path separators will still fail in open().
md_name = (title_text or 'index') + '.md'

# Explicit UTF-8 so non-ASCII text is written the same way on every platform;
# the context manager guarantees the data is flushed and the file closed.
with open(md_name, 'w', encoding='utf-8') as md_file:
    md_file.write(str_text)
# Alternative text extraction (an approach found online). Limitation noted by
# the original author: <br/> tags are not converted into line breaks.

# Remove <script> and <style> elements so their contents stay out of the text.
for script in soup(["script", "style"]):
    script.extract()

text = soup.get_text()

# splitlines() breaks on \r, \r\n and \n; strip() with no argument trims
# whitespace from both ends of each line.
lines = (line.strip() for line in text.splitlines())

# Split on a DOUBLE space so that several headlines sharing one line each get
# their own line. Splitting on a single space would put every word on its own
# line, which is not what "break multi-headlines" means.
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

# Drop the empty chunks and rejoin the rest, one chunk per line.
text = '\n'.join(chunk for chunk in chunks if chunk)

print(text)