python 操作XML和自己实现的xml解析器
1.python的ElementTree的用法
这部分的内容主要来自对python 3.3.2的documentation的翻译
sample.xml:
<?xml version="1.0"?>
<data>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <country name="Singapore">
        <rank>4</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor name="Malaysia" direction="N"/>
    </country>
    <country name="Panama">
        <rank>68</rank>
        <year>2011</year>
        <gdppc>13600</gdppc>
        <neighbor name="Costa Rica" direction="W"/>
        <neighbor name="Colombia" direction="E"/>
    </country>
</data>
从文件导入数据或者从字符串导入:
import xml.etree.ElementTree as ET
tree = ET.parse('country_data.xml')
root = tree.getroot()
#-----------------------
root=ET.fromstring(country_data_as_string)
访问tag或者attrib
root.tag      # 结果为 'data'
root.attrib   # 结果为 {}
每一个节点都可以直接迭代,得到它的直接子节点;也可以像嵌套列表一样用下标逐层访问
for child in root:
...
#-----------------
root[0][1].text   # 结果为 '2008'
查找感兴趣的节点:iter,findall,find
iter(tag)递归遍历当前节点及其所有子孙节点,返回其中tag匹配的节点
findall(tag)只在当前节点的直接子节点中查找,返回所有tag匹配的节点
find(tag)返回第一个tag匹配的直接子节点(即findall结果中的第一个),找不到时返回None
2.自己实现的xml解析器
为了避免自己过于依赖库而失去了自我思考和实现的能力,自己写了一个小型的xml解析器,将xml以树状结构读到内存中,并简单实现了最常用的get和iter功能,就当练习正则和加深对xml的理解了
# coding: utf-8
"""A tiny regular-expression based XML reader.

The document is split into tokens (tags and text runs) and assembled into a
tree of :class:`XmlNode` objects.  It is a learning exercise, not a
conforming XML parser.  Known limitations:

* entities such as ``&amp;`` are not decoded;
* an attribute value containing ``>`` breaks tokenisation;
* CDATA sections and DOCTYPE declarations are skipped, not interpreted;
* for mixed content only the last text run of an element is kept.

Created on 2013-08-22

@author: zsc347
"""
import copy
import re


class TinyXmlPraser:
    """Parse XML text into a tree of :class:`XmlNode` objects.

    Attributes:
        version: the raw ``<?xml ...?>`` declaration, or ``""`` if absent.
        nlist: the token list produced by the most recent parse.
        root: the root :class:`XmlNode` of the most recent parse, or ``None``.
        node: the element currently open while parsing (internal cursor).
    """

    # One token is a whole comment, any other ``<...>`` construct, or a run of
    # text that contains at least one non-whitespace character.  ``[^<]`` keeps
    # a text run from swallowing the tags that follow it on the same line.
    _TOKEN_RE = re.compile(
        r"<!--.*?-->"
        r"|<[^>]*>"
        r"|(?<=>)[^<]*[^<\s][^<]*(?=<)",
        re.DOTALL,
    )
    # The tag name stops at whitespace, '/' or '>' so ``<br/>`` yields ``br``.
    _TAG_NAME_RE = re.compile(r"<\s*([^\s/>]+)")
    # A value must be closed by the same quote character that opened it.
    _ATTR_RE = re.compile(
        r"""([^\s=<>/'"]+)\s*=\s*(?:"([^"]*)"|'([^']*)')"""
    )
    _XML_DECL_RE = re.compile(r"<\?xml[\s?]")
    _SELF_CLOSING_RE = re.compile(r"/\s*>\Z")

    def __init__(self):
        self.version = ""
        self.nlist = []
        self.root = None
        self.node = None

    def from_file(self, xmlpath):
        """Parse the UTF-8 encoded file at *xmlpath* and return its root node."""
        return self.from_str(self.filetostr(xmlpath))

    def from_str(self, filestr):
        """Parse the XML text *filestr* and return its root node.

        Returns ``None`` when the text contains no element.
        """
        self.nlist = self.strtolist(filestr)
        self._anaxml(self.nlist)
        return self.root

    def filetostr(self, xmlpath):
        """Return the whole content of the UTF-8 file *xmlpath* as a string."""
        # The context manager closes the handle even if read() fails.
        with open(xmlpath, 'r', encoding='utf-8') as fin:
            return fin.read()

    def strtolist(self, filestr):
        """Split *filestr* into a flat list of tag tokens and text tokens."""
        return self._TOKEN_RE.findall(filestr)

    @classmethod
    def _tag_and_attrs(cls, token):
        """Return ``(tag, attr_dict)`` for an opening or self-closing tag."""
        name = cls._TAG_NAME_RE.match(token)
        tag = name.group(1) if name is not None else ''
        attr = {}
        for key, double_quoted, single_quoted in cls._ATTR_RE.findall(token):
            # Exactly one of the two groups took part in the match.
            attr[key] = double_quoted or single_quoted
        return tag, attr

    def _anaxml(self, nlist):
        """Build the node tree from the token list *nlist*.

        Any state left over from an earlier parse is discarded first, so a
        parser object can be reused safely, even after malformed input.
        """
        self.version = ""
        self.root = None
        self.node = None

        for li in nlist:
            if li.startswith('<!'):
                # Comment, DOCTYPE or CDATA: not part of the element tree.
                continue
            if li.startswith('<?'):
                # Only the XML declaration is recorded; other processing
                # instructions are ignored.
                if self._XML_DECL_RE.match(li) is not None:
                    self.version = li
                continue
            if li.startswith('</'):
                # A stray closing tag with nothing open is ignored.
                if self.node is not None:
                    self.node = self.node.father
                continue
            if li.startswith('<'):
                tag, attr = self._tag_and_attrs(li)
                tnode = XmlNode(tag, attr, self.node)
                if self.node is None:
                    self.root = tnode
                else:
                    self.node.add_child(tnode)
                # A self-closing element never becomes the open element.
                if self._SELF_CLOSING_RE.search(li) is None:
                    self.node = tnode
                continue
            # Text run: attach it to the open element; text outside of any
            # element is dropped instead of raising AttributeError.
            if self.node is not None:
                self.node.text = li

    def show_tree(self):
        """Print the parsed tree, one node per line, indented by depth."""
        print("The tree is here:")
        if self.root is not None:
            self.root.shownode()


class XmlNode:
    """One element of the parsed tree.

    Attributes:
        tag: element name.
        attr: dict mapping attribute names to their string values.
        father: parent :class:`XmlNode`, or ``None`` for a top-level element.
        text: text content of the element (``""`` when it has none).
        children: child elements in document order.
    """

    def __init__(self, tag, attr, father):
        self.tag = tag
        self.attr = attr
        self.father = father
        self.text = ""
        self.children = []

    def add_child(self, node):
        """Append *node* to this element's children."""
        self.children.append(node)

    def iter(self, tag):
        """Return every descendant whose tag equals *tag*.

        The result is a list in breadth-first order; the node itself is not
        included.
        """
        result = []
        # Work on a shallow copy so this node's own child list is untouched.
        queue = copy.copy(self.children)
        index = 0
        while index < len(queue):
            node = queue[index]
            index += 1
            if node.tag == tag:
                result.append(node)
            queue.extend(node.children)
        return result

    def get(self, key, value=None):
        """Return attribute *key*, or *value* when the attribute is absent."""
        return self.attr.get(key, value)

    def shownode(self, prefix='', no=0):
        """Print this node and, recursively, its children.

        Args:
            prefix: indentation printed before the tag name.
            no: 1-based position among the siblings; ``0`` prints no number.
        """
        print(prefix + self.tag + ' '
              + (str(no) if no != 0 else '')
              + ((self.text + ' ') if self.text else '')
              + (str(self.attr) if self.attr else ''))
        # An only child is printed without a position number.
        numbered = len(self.children) != 1
        for position, child in enumerate(self.children, start=1):
            if numbered:
                child.shownode(prefix + '\t', position)
            else:
                child.shownode(prefix + '\t')


def main(sample="sample.xml"):
    """Parse *sample*, print its tree and the name of every country."""
    root = TinyXmlPraser().from_file(sample)
    root.shownode()
    for co in root.iter('country'):
        print(co.get('name'))


if __name__ == '__main__':
    main()