代码改变世界

使用python3 解析html对称标签

2017-11-21 14:17  太烦人  阅读(1288)  评论(0)  编辑  收藏  举报

写了一个类,主要用于解析html文本的对称的标签结构。

通过输入tag名称,解析对应HTML文本,查找对应tag的层级数,并可以通过层级数得出对应的tag内容。写得比较粗糙,后续如果用到再慢慢改进。

代码如下:

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import re

####################################
#通过解析HTML文本,获取指定tag的层数
###################################
class htmltaganalysis(object):
    """Find paired ("symmetric") tags in an HTML string and their nesting depth.

    ``html`` is the text to scan and ``tag`` is the tag name to look for
    (for example ``'div'``).  ``tag`` is inserted into a regular expression,
    so a regex alternative such as ``'div|span'`` is accepted as well.
    Matching is case-insensitive.

    Depth ("layer") is 1 for an element that is not enclosed by another
    reported element of the same tag, 2 for its direct descendants, and so on.
    """

    def __init__(self, html, tag):
        self.html = html
        self.tag = tag

    def tagdec(self, html, tag):
        """Return an iterator of match objects for the regex ``tag`` in ``html``.

        The pattern is compiled with IGNORECASE, DOTALL and MULTILINE.
        """
        pattern = re.compile(tag, re.I | re.S | re.M)
        return pattern.finditer(html)

    def _matched_spans(self):
        """Return ``[start, end]`` offsets of every closed element, sorted by start.

        Each closing tag is paired with the most recent opening tag that is
        still unclosed.  Closing tags with nothing open, and opening tags that
        are never closed, are ignored instead of being force-paired.
        """
        name = '(?:' + self.tag + ')'
        # The lookahead keeps '<div' from also matching '<divider' or '<div-x'.
        opening = self.tagdec(self.html, '<' + name + r'(?=[\s/>])')
        closing = self.tagdec(self.html, '</' + name + r'\s*>')

        # Merge both kinds of tag into one stream ordered by position.
        events = [(m.start(), False, m.end()) for m in opening]
        events.extend((m.start(), True, m.end()) for m in closing)
        events.sort()

        pending = []  # start offsets of opening tags not yet closed
        spans = []
        for position, is_close, end in events:
            if not is_close:
                pending.append(position)
            elif pending:
                spans.append([pending.pop(), end])
        spans.sort(key=lambda span: span[0])
        return spans

    def GetTagContent(self):
        """Return every closed element as ``{'content': str, 'layer': int}``.

        ``content`` spans from the ``<`` of the opening tag through the ``>``
        of its closing tag.  The list is ordered by the position of the
        opening tag in the source text.
        """
        spans = self._matched_spans()
        return [
            {
                'content': self.html[start:end],
                'layer': self.Getlayer(spans, index, 1),
            }
            for index, (start, end) in enumerate(spans)
        ]

    def Getlayer(self, arr, i, layer):
        """Return ``layer`` plus the number of spans that enclose ``arr[i]``.

        ``arr`` is a list of ``[start, end]`` pairs sorted by ``start``.  The
        walk repeatedly jumps to the nearest earlier span that strictly
        contains the current one, adding one per jump.  An index outside
        ``1 .. len(arr) - 1`` returns ``layer`` unchanged.
        """
        current = i
        while 0 < current < len(arr):
            inner_start, inner_end = arr[current][0], arr[current][1]
            parent = None
            for j in range(current - 1, -1, -1):
                if arr[j][0] < inner_start and inner_end < arr[j][1]:
                    parent = j
                    break
            if parent is None:
                break
            layer += 1
            current = parent
        return layer

    def GetContentForLayer(self, layer=1):
        """Return the text of every element whose depth equals ``layer``."""
        return [
            item['content']
            for item in self.GetTagContent()
            if item['layer'] == layer
        ]

    def GetTopLayer(self):
        """Return the deepest nesting level found, or 0 when nothing matches."""
        return max((item['layer'] for item in self.GetTagContent()), default=0)

 

使用示例:

html = '<div id="cnblogs_post_body"><div class="x-wiki-content x-content"></div></div>'
# Give the instance its own name: rebinding `htmltaganalysis` would shadow the
# class and make it impossible to create a second analyzer afterwards.
analyzer = htmltaganalysis(html, 'div')
print(analyzer.GetTopLayer())
print(analyzer.GetContentForLayer(1))
print(analyzer.GetContentForLayer(2))

结果:

2
['<div id="cnblogs_post_body"><div class="x-wiki-content x-content"></div></div>']
['<div class="x-wiki-content x-content"></div>']

 备注:欢迎任何形式的转载,但请务必注明出处。
限于本人水平,如果文章和代码有表述不当之处,还请不吝赐教。