python 操作XML和自己实现的xml解析器
1.python的ElementTree的用法
这部分的内容主要来自对python 3.3.2的documentation的翻译
sample.xml(下文示例代码中的 country_data.xml 指的就是这份数据):
<?xml version="1.0"?>
<data>
<country name="Liechtenstein">
<rank>1</rank>
<year>2008</year>
<gdppc>141100</gdppc>
<neighbor name="Austria" direction="E"/>
<neighbor name="Switzerland" direction="W"/>
</country>
<country name="Singapore">
<rank>4</rank>
<year>2011</year>
<gdppc>59900</gdppc>
<neighbor name="Malaysia" direction="N"/>
</country>
<country name="Panama">
<rank>68</rank>
<year>2011</year>
<gdppc>13600</gdppc>
<neighbor name="Costa Rica" direction="W"/>
<neighbor name="Colombia" direction="E"/>
</country>
</data>
从文件导入数据或者从字符串导入:
# Option 1: parse an XML file into a tree, then take the tree's root element.
import xml.etree.ElementTree as ET
tree = ET.parse('country_data.xml')
root = tree.getroot()
#-----------------------
# Option 2: parse XML text that is already held in a string; the result is
# bound to root directly, with no intermediate tree object.
root=ET.fromstring(country_data_as_string)
访问tag或者attrib
# In sample.xml the root element is <data>, and it carries no attributes.
# NOTE(review): written as "expression=value" -- presumably meant to show what
# root.tag / root.attrib evaluate to, not to assign them; confirm.
root.tag='data'
root.attrib={}
每个节点都可以直接迭代,得到它的各个子节点;也可以像嵌套的 list 一样用下标逐层访问
# Looping over an element yields its child elements one by one.
for child in root:
...
#-----------------
# Children can also be reached by index: root[0][1] is the second child of the
# first child -- in sample.xml that is the <year> of the first <country>.
# NOTE(review): presumably shows that the expression yields '2008', not an
# assignment; confirm.
root[0][1].text='2008'
查找感兴趣的节点:iter、findall、find
iter(tag) 递归遍历以当前节点为根的整棵子树(包括所有子孙节点),返回 tag 匹配的节点
findall(tag) 只在当前节点的直接子节点中查找,返回所有 tag 匹配的节点
find(tag) 返回 findall 会找到的第一个节点,找不到时返回 None
2.自己实现的xml解析器
为了避免自己过于依赖库而失去了自我思考和实现的能力,自己写了一个小型的xml解析器,将xml以树状结构读到内存中,并简单实现了最常用的get和iter功能,就当练习正则和加深对xml的理解了
#coding:utf-8
'''
Created on 2013年8月22日
@author: zsc347
'''
import re
import copy
class TinyXmlPraser:
    """Small regex-based XML reader that builds a tree of ``XmlNode`` objects.

    The input is first split into tokens (markup and text runs) and the
    tokens are then folded into a tree by tracking the currently open
    element.  This is a learning tool, not a conforming XML parser: entities
    are not decoded, CDATA sections are not supported, and when an element
    holds several text runs only the last one is kept in ``text``.

    Attributes:
        version: The ``<?xml ...?>`` declaration token, or ``""`` if none.
        nlist: Token list produced by the most recent parse.
        root: Root ``XmlNode`` of the most recent parse, or ``None``.
        node: Element that is currently open while parsing (``None`` when
            no element is open).
    """

    # A token is a comment, any other ``<...>`` markup, or a run of text
    # between two pieces of markup holding at least one non-blank character.
    # ``[^<]`` keeps a text run from swallowing the markup that follows it
    # on the same line, and also lets text span several lines.
    _TOKEN_RE = re.compile(
        r"<!--.*?-->"
        r"|<[^>]*>"
        r"|(?<=>)\s*[^<\s][^<]*(?=<)",
        re.DOTALL,
    )
    # Only the XML declaration itself, not e.g. ``<?xml-stylesheet ...?>``.
    _XML_DECL_RE = re.compile(r"<\?xml[\s?]")
    # Element name: stops at whitespace, ``/`` or ``>`` so ``<br/>`` -> ``br``.
    _TAG_RE = re.compile(r"<\s*([^\s/>]+)")
    # name="value" or name='value'; each quote style may contain the other.
    _ATTR_RE = re.compile(r"""([^\s=<>/"']+)\s*=\s*(?:"([^"]*)"|'([^']*)')""")

    def __init__(self):
        self.version = ""
        self.nlist = []
        self.root = None
        self.node = None

    def from_file(self, xmlpath):
        """Parse the UTF-8 XML file at *xmlpath* and return its root node."""
        return self.from_str(self.filetostr(xmlpath))

    def from_str(self, filestr):
        """Parse the XML text *filestr* and return its root node.

        State left over from an earlier parse is discarded first, so one
        parser instance can be reused for several documents.  Returns
        ``None`` when the text contains no element.
        """
        self.version = ""
        self.root = None
        self.node = None
        self.nlist = self.strtolist(filestr)
        self._anaxml(self.nlist)
        return self.root

    def filetostr(self, xmlpath):
        """Return the whole content of the UTF-8 file *xmlpath* as a string."""
        # The context manager closes the handle even if read() raises.
        with open(xmlpath, 'r', encoding='utf-8') as fin:
            return fin.read()

    def strtolist(self, filestr):
        """Split *filestr* into a flat list of markup and text tokens.

        Whitespace-only runs between tags are dropped; text is returned
        unstripped.
        """
        return self._TOKEN_RE.findall(filestr)

    @classmethod
    def _split_tag(cls, token):
        """Return ``(tag, attributes)`` for one start or empty-element token."""
        name = cls._TAG_RE.match(token)
        if name is None:
            return '', {}
        attrs = {}
        # Scan only what follows the element name; a repeated attribute name
        # keeps its last value.
        for key, double_quoted, single_quoted in cls._ATTR_RE.findall(token[name.end():]):
            attrs[key] = double_quoted or single_quoted
        return name.group(1), attrs

    def _anaxml(self, nlist):
        """Fold the token list *nlist* into a tree rooted at ``self.root``.

        ``self.node`` always points at the innermost element that is still
        open; start tags descend into the new node, end tags climb back to
        the parent.
        """
        for token in nlist:
            if token.startswith('<!'):
                # Comments and declarations such as DOCTYPE carry no nodes.
                continue
            if token.startswith('<?'):
                if self._XML_DECL_RE.match(token):
                    self.version = token
                continue
            if token.startswith('</'):
                # A stray end tag with nothing open is ignored.
                if self.node is not None:
                    self.node = self.node.father
                continue
            if token.startswith('<'):
                tag, attr = self._split_tag(token)
                child = XmlNode(tag, attr, self.node)
                if self.node is None:
                    self.root = child
                else:
                    self.node.add_child(child)
                # An empty-element tag (``<x/>``) is closed immediately, so
                # only a real start tag becomes the open element.
                if not token.endswith('/>'):
                    self.node = child
                continue
            # Text token: attach it to the open element; text that appears
            # outside any element is dropped instead of raising.
            if self.node is not None:
                self.node.text = token

    def show_tree(self):
        """Print the parsed tree, one node per line, indented by depth."""
        print("The tree is here:")
        if self.root is not None:
            self.root.shownode()
class XmlNode:
    """One element of a parsed XML tree.

    Attributes:
        tag: Element name.
        attr: Mapping of attribute name to attribute value.
        father: Parent node, or ``None`` for the root.
        text: Text content of the element (``""`` when there is none).
        children: Child nodes in document order.
    """

    def __init__(self, tag, attr, father):
        self.tag = tag
        self.attr = attr
        self.father = father
        self.text = ""
        self.children = []

    def add_child(self, node):
        """Append *node* as the last child of this node."""
        self.children.append(node)

    def iter(self, tag):
        """Return every descendant whose tag equals *tag*.

        The node itself is not considered.  Descendants are visited level
        by level (children first, then grandchildren, ...), and the result
        list follows that order.
        """
        matches = []
        level = self.children
        while level:
            # Collect the next depth separately rather than growing the
            # list that is being looped over.
            next_level = []
            for node in level:
                if node.tag == tag:
                    matches.append(node)
                next_level.extend(node.children)
            level = next_level
        return matches

    def get(self, key, value=None):
        """Return attribute *key*, or *value* when it cannot be looked up."""
        try:
            return self.attr[key]
        except (KeyError, TypeError):
            # KeyError: attribute absent.  TypeError: ``attr`` is not a
            # mapping (e.g. None) or *key* is unhashable.
            return value

    def shownode(self, prefix='', no=0):
        """Print this node and its subtree, one node per line.

        Args:
            prefix: Indentation written before the tag; each level of depth
                adds one tab.
            no: 1-based position among siblings, printed after the tag;
                ``0`` suppresses it.
        """
        index_part = str(no) if no != 0 else ''
        text_part = self.text + ' ' if self.text else ''
        attr_part = str(self.attr) if self.attr else ''
        print(prefix + self.tag + ' ' + index_part + text_part + attr_part)

        child_prefix = prefix + '\t'
        if len(self.children) == 1:
            # An only child is printed without a sibling number.
            self.children[0].shownode(child_prefix)
        else:
            for position, child in enumerate(self.children, start=1):
                child.shownode(child_prefix, position)
if __name__ == '__main__':
    # Path of the demo document, resolved against the working directory.
    sample = "sample.xml"

    def test():
        """Parse the sample file, dump its tree, then list each country name."""
        parser = TinyXmlPraser()
        tree_root = parser.from_file(sample)
        tree_root.shownode()
        for country in tree_root.iter('country'):
            print(country.get('name'))

    test()

浙公网安备 33010602011771号