使用python3 解析html对称标签
2017-11-21 14:17 太烦人 阅读(1324) 评论(0) 收藏 举报
写了一个类,主要用于解析HTML文本中对称(成对)的标签结构。
通过输入tag名称,解析对应HTML文本,查找对应tag的层级数,并可以通过层级数得出对应的tag内容。写得比较粗糙,后续如果用到再慢慢改进。
代码如下:
#!/usr/bin/python3
#encoding = UTF-8
import re
####################################
#通过解析HTML文本,获取指定tag的层数
###################################
class htmltaganalysis(object):
    """Find paired occurrences of one tag in an HTML string and their nesting depth.

    The HTML is scanned with regular expressions only; it is not parsed.
    Opening and closing tags are expected to come in matching pairs.
    Unmatched opening or closing tags are skipped, and self-closing tags
    or tags inside comments are not treated specially.

    ``tag`` is inserted into a regular expression, so it may itself be a
    regex fragment. Matching is case-insensitive.
    """

    def __init__(self, html, tag):
        """Store the HTML text to scan and the tag name to look for."""
        self.html = html
        self.tag = tag

    def tagdec(self, html, tag):
        """Return an iterator of match objects for regex ``tag`` in ``html``.

        The pattern is compiled with ``re.I | re.S | re.M``.
        """
        pattern = re.compile(tag, re.I | re.S | re.M)
        return pattern.finditer(html)

    def GetTagContent(self):
        """Return every matched tag pair with its nesting layer.

        Returns:
            A list of dicts ``{'content': str, 'layer': int}`` ordered by
            the position of the opening tag. ``content`` is the text from
            the opening ``<tag`` through the closing ``</tag>``. ``layer``
            is 1 for an outermost pair and grows by 1 per enclosing pair.
        """
        # The lookahead keeps a search for "div" from matching "<divider".
        open_pattern = '<(?:' + self.tag + r')(?=[\s/>])'
        close_pattern = '</(?:' + self.tag + r')\s*>'

        # One event per tag: (position, None) for an opening tag and
        # (position, end_offset) for a closing tag.
        events = [(m.start(), None)
                  for m in self.tagdec(self.html, open_pattern)]
        events.extend((m.start(), m.end())
                      for m in self.tagdec(self.html, close_pattern))
        events.sort(key=lambda event: event[0])

        # Pair each closing tag with the nearest still-open opening tag.
        open_starts = []
        spans = []
        for position, close_end in events:
            if close_end is None:
                open_starts.append(position)
            elif open_starts:
                spans.append([open_starts.pop(), close_end])
            # A closing tag with nothing open is ignored.

        spans.sort(key=lambda span: span[0])
        return [
            {
                'content': self.html[span[0]:span[1]],
                'layer': self.Getlayer(spans, index, 1),
            }
            for index, span in enumerate(spans)
        ]

    def Getlayer(self, arr, i, layer):
        """Return the nesting layer of ``arr[i]`` among the spans in ``arr``.

        Args:
            arr: ``[start, end]`` pairs sorted by ``start``.
            i: Index of the span whose layer is wanted.
            layer: Layer value to start counting from (1 for a fresh query).

        Returns:
            ``layer`` plus the number of enclosing spans found by stepping
            repeatedly to the nearest earlier span that strictly contains
            the current one. If ``i`` is out of range or 0, ``layer`` is
            returned unchanged.
        """
        while 0 < i < len(arr):
            for j in range(i - 1, -1, -1):
                if arr[j][0] < arr[i][0] and arr[i][1] < arr[j][1]:
                    # arr[j] encloses arr[i]; continue upward from it.
                    layer += 1
                    i = j
                    break
            else:
                # No enclosing span precedes this one.
                break
        return layer

    def GetContentForLayer(self, layer=1):
        """Return the ``content`` strings of all tag pairs at ``layer``."""
        return [item['content']
                for item in self.GetTagContent()
                if item['layer'] == layer]

    def GetTopLayer(self):
        """Return the deepest layer found, or 0 when no tag pair matched."""
        return max((item['layer'] for item in self.GetTagContent()),
                   default=0)
使用示例:
# Usage example: analyse the nesting of <div> tags in a small document.
html = '<div id="cnblogs_post_body"><div class="x-wiki-content x-content"></div></div>'
# Use a separate name for the instance so the class is not shadowed.
analysis = htmltaganalysis(html, 'div')
print(analysis.GetTopLayer())
print(analysis.GetContentForLayer(1))
print(analysis.GetContentForLayer(2))
结果:
2
['<div id="cnblogs_post_body"><div class="x-wiki-content x-content"></div></div>']
['<div class="x-wiki-content x-content"></div>']
备注:欢迎任何形式的转载,但请务必注明出处。
限于本人水平,如果文章和代码有表述不当之处,还请不吝赐教。
浙公网安备 33010602011771号