python 操作XML和自己实现的xml解析器

1.python的ElementTree的用法

   这部分的内容主要来自对python 3.3.2的documentation的翻译

   sample.xml(下面的示例代码沿用官方文档的写法,以 country_data.xml 作为文件名载入同样的内容):

   

<?xml version="1.0"?>
<data>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <country name="Singapore">
        <rank>4</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor name="Malaysia" direction="N"/>
    </country>
    <country name="Panama">
        <rank>68</rank>
        <year>2011</year>
        <gdppc>13600</gdppc>
        <neighbor name="Costa Rica" direction="W"/>
        <neighbor name="Colombia" direction="E"/>
    </country>
</data>

  从文件导入数据或者从字符串导入:

# Option 1: parse the XML document from a file and take the tree's root element.
import xml.etree.ElementTree as ET
tree = ET.parse('country_data.xml')
root = tree.getroot()
#-----------------------
# Option 2: parse XML held in a string; the result is bound to `root` directly.
# NOTE: country_data_as_string is not defined in this snippet — presumably a
# str holding the sample XML shown above.
root=ET.fromstring(country_data_as_string)

  访问tag或者attrib

root.tag     # 'data'
root.attrib  # {}

  每一个node节点都可以看成是其子节点的迭代器,可以当成n重list来访问

for child in root:
  ...
#-----------------
root[0][1].text  # '2008'

 

    查找感兴趣的节点:iter,findall,find

      iter:递归遍历当前节点下的所有子孙节点,可以按 tag 过滤

      findall:只在当前节点的直接子节点中查找,返回所有 tag 匹配的节点

      find:同样只查找直接子节点,返回第一个匹配的节点,找不到时返回 None

 

2.自己实现的xml解析器

   为了避免自己过于依赖库而失去了自我思考和实现的能力,自己写了一个小型的xml解析器,将xml以树状结构读到内存中,并简单实现了最常用的get和iter功能,就当练习正则和加深对xml的理解了

#coding:utf-8
'''
Created on 2013年8月22日
@author: zsc347
'''

import re
import copy

class TinyXmlPraser:
    """Tiny regex-based XML reader that loads a document into an XmlNode tree.

    Only a common subset of XML is understood: the ``<?xml ...?>``
    declaration, comments, start / end / self-closing tags with quoted
    attributes, and character data.  Other ``<?...?>`` and ``<!...>`` markup
    (processing instructions, DOCTYPE, CDATA sections) is skipped, and entity
    references such as ``&amp;`` are left undecoded.

    Attributes:
        version: Raw XML declaration of the last parsed document, or "".
        nlist: Token list produced by the last from_file/from_str call.
        root: Root XmlNode of the last parsed document, or None.
        node: Element that is currently open while parsing.
    """

    # A token is a whole comment, any other "<...>" markup, or a run of text.
    # The comment alternative comes first so a ">" inside a comment does not
    # end the token early; DOTALL lets comments and tags span several lines.
    _TOKEN = re.compile(r"<!--.*?-->|<[^>]*>|[^<]+", re.DOTALL)
    # "<?xml" followed by whitespace or "?" -- excludes e.g. "<?xml-stylesheet".
    _XML_DECL = re.compile(r"<\?xml[\s?]")
    # Element name: everything up to whitespace, "/" or ">", so that "<br/>"
    # yields "br" rather than "br/".
    _TAG_NAME = re.compile(r"<\s*([^\s/>]+)")
    # name = "value" or name = 'value'; the backreference makes the closing
    # quote match the opening one, so the other quote may appear in the value.
    _ATTRIBUTE = re.compile(r"""([^\s=<>/"']+)\s*=\s*(["'])(.*?)\2""", re.DOTALL)

    def __init__(self):
        self.version = ""
        self.nlist = []
        self.root = None
        self.node = None

    def from_file(self, xmlpath):
        """Parse the UTF-8 encoded file at *xmlpath* and return the root node.

        Returns None when the file contains no element.
        """
        return self.from_str(self.filetostr(xmlpath))

    def from_str(self, filestr):
        """Parse the XML text *filestr* and return the root node (or None)."""
        self.nlist = self.strtolist(filestr)
        self._anaxml(self.nlist)
        return self.root

    def filetostr(self, xmlpath):
        """Return the whole content of the UTF-8 text file at *xmlpath*."""
        # The context manager closes the handle even if read() raises.
        with open(xmlpath, 'r', encoding='utf-8') as fin:
            return fin.read()

    def strtolist(self, filestr):
        """Split XML text into a flat list of markup and text tokens.

        Markup tokens keep their angle brackets.  Text tokens are kept
        verbatim (surrounding whitespace included); whitespace-only text
        between tags is dropped.
        """
        return [token for token in self._TOKEN.findall(filestr)
                if token.startswith("<") or token.strip()]

    @staticmethod
    def _get_tag_attr(li):
        """Return ``(tag, attr_dict)`` for one start or self-closing tag token."""
        name = TinyXmlPraser._TAG_NAME.match(li)
        tag = name.group(1) if name is not None else ''
        attr = {key: value
                for key, _quote, value in TinyXmlPraser._ATTRIBUTE.findall(li)}
        return tag, attr

    def _anaxml(self, nlist):
        """Build the node tree from the token list *nlist*.

        Sets ``self.root`` and ``self.version``.  If an element holds several
        separate text runs, only the last one is kept in its ``text``.
        """
        # Start from a clean slate so a parser instance can be reused safely,
        # even after a previous malformed document left an element open.
        self.version = ""
        self.root = None
        self.node = None

        for li in nlist:
            if not li.startswith("<"):
                # Character data; ignore anything outside the root element
                # instead of failing on a missing current node.
                if self.node is not None:
                    self.node.text = li
                continue

            if li.startswith("<!--"):
                continue

            if li.startswith("<?"):
                if self._XML_DECL.match(li) is not None:
                    self.version = li
                continue

            if li.startswith("<!"):
                # DOCTYPE, CDATA and similar declarations are not supported.
                continue

            if li.startswith("</"):
                # Close the current element; stray end tags are ignored.
                if self.node is not None:
                    self.node = self.node.father
                continue

            tag, attr = self._get_tag_attr(li)
            tnode = XmlNode(tag, attr, self.node)
            if self.node is None:
                self.root = tnode
            else:
                self.node.add_child(tnode)
            # A self-closing tag has no body, so it never becomes current.
            if not li.endswith("/>"):
                self.node = tnode

    def show_tree(self):
        """Print the parsed tree; prints only the header when nothing is parsed."""
        print("The tree is here:")
        if self.root is not None:
            self.root.shownode()

class XmlNode:
    """One element of the parsed XML tree.

    Attributes:
        tag: Element name.
        attr: Mapping of attribute name to attribute value.
        father: Parent node, or None for the root.
        text: Character data of the element ("" when it has none).
        children: Child nodes in the order they were added.
    """

    def __init__(self, tag, attr, father):
        self.tag = tag
        self.attr = attr
        self.father = father
        self.text = ""
        self.children = []

    def add_child(self, node):
        """Append *node* to this element's children."""
        self.children.append(node)

    def iter(self, tag):
        """Return every descendant whose tag equals *tag*.

        Descendants are visited breadth-first (all children, then all
        grandchildren, ...); the node itself is not included.
        """
        result = []
        level = list(self.children)
        # Walk one generation at a time rather than growing the list that is
        # being iterated over.
        while level:
            result.extend(node for node in level if node.tag == tag)
            level = [child for node in level for child in node.children]
        return result

    def get(self, key, value=None):
        """Return attribute *key*, or *value* when the attribute is absent."""
        try:
            return self.attr[key]
        except (KeyError, TypeError):
            # KeyError: no such attribute.  TypeError: the node was built
            # without an attribute mapping (e.g. attr=None).
            return value

    def shownode(self, prefix='', no=0):
        """Print this node and its subtree, one node per line.

        Args:
            prefix: Indentation printed before the tag; children get one
                more tab.
            no: 1-based position among siblings; 0 suppresses the number.
        """
        line = prefix + self.tag + '  '
        if no != 0:
            # Separator after the index so it cannot run into the text
            # (index 2 + text "1" used to print as "21").
            line += str(no) + '  '
        if self.text:
            line += self.text + '  '
        if self.attr:
            line += str(self.attr)
        print(line)

        # An only child is printed without a sibling number.
        numbered = len(self.children) != 1
        for i, child in enumerate(self.children, 1):
            if numbered:
                child.shownode(prefix + '\t', i)
            else:
                child.shownode(prefix + '\t')

        
        
    

if __name__ == '__main__':
    # Demo: parse the sample document, dump the tree, then list country names.
    sample = "sample.xml"

    def test():
        """Load `sample`, print its tree and the name of every country node."""
        tree_root = TinyXmlPraser().from_file(sample)
        tree_root.shownode()
        for country in tree_root.iter('country'):
            print(country.get('name'))

    test()

  

posted @ 2013-08-22 18:13  zsc347  阅读(600)  评论(0)  编辑  收藏  举报