代码的持续改进
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Both parsing backends are optional: a missing library only matters when the
# corresponding backend class is instantiated, not when this module is
# imported.  Otherwise callers would need both libraries installed even
# though the factory below lets them pick just one.
try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    BeautifulSoup = None

try:
    from lxml.html import tostring
    from lxml.html.soupparser import fromstring
except ImportError:
    tostring = fromstring = None


# Factory pattern.
def createDomTree(htmlStream, type='soup'):
    """Build a DOM-tree wrapper around ``htmlStream`` with the chosen backend.

    ``type`` selects the backend: ``"soup"`` (the default) returns a
    ``tnDomTreeWithSoup`` and ``"lxml"`` returns a ``tnDomTreeWithlXml``.
    Any other value returns ``None``.
    """
    if type == "soup":
        return tnDomTreeWithSoup(htmlStream)
    if type == "lxml":
        return tnDomTreeWithlXml(htmlStream)
    return None


class tnDomTree(object):
    """Interface that external code depends on.

    It hides which third-party library actually does the parsing.
    Subclasses implement ``_getElementByTagName``, ``elementToString`` and
    ``getAttrValueOfElement``.
    """

    # public
    def __init__(self, htmlStream):
        self.htmlStream = htmlStream

    def getLinkList(self):
        """Return the ``a`` elements found by the backend."""
        # Behaviour shared by all subclasses lives in the base class.
        return self._getElementByTagName('a')

    def getImageList(self):
        """Return the ``img`` elements found by the backend."""
        return self._getElementByTagName('img')

    def elementToString(self, element):
        """Serialize ``element``; implemented by subclasses."""
        raise NotImplementedError

    def getAttrValueOfElement(self, element, attName):
        """Return attribute ``attName`` of ``element``; implemented by subclasses."""
        raise NotImplementedError

    # private
    def _getElementByTagName(self, tagName):
        """Return the elements named ``tagName``; implemented by subclasses."""
        raise NotImplementedError


class tnDomTreeWithSoup(tnDomTree):
    """Backend that uses BeautifulSoup."""

    def __init__(self, htmlStream):
        tnDomTree.__init__(self, htmlStream)
        if BeautifulSoup is None:
            raise ImportError("BeautifulSoup is required for the 'soup' backend")
        self._tree = BeautifulSoup(self.htmlStream)

    def _getElementByTagName(self, tagName):
        return self._tree.findAll(tagName)

    def elementToString(self, element):
        return str(element)

    def getAttrValueOfElement(self, element, attName):
        """Return attribute ``attName`` of ``element`` as a string.

        Returns ``""`` when the element does not carry that attribute.
        """
        # hasattr() is the wrong test here: attribute access on a
        # BeautifulSoup tag searches for a child tag of that name, so it does
        # not say whether the HTML attribute exists, and a missing attribute
        # would then make element[attName] raise KeyError.  get() looks the
        # name up among the HTML attributes and returns None when absent.
        value = element.get(attName)
        if value is None:
            return ""
        return str(value)


class tnDomTreeWithlXml(tnDomTree):
    """Backend that uses lxml."""

    def __init__(self, htmlStream):
        tnDomTree.__init__(self, htmlStream)
        if fromstring is None:
            raise ImportError(
                "lxml (with its soupparser module) is required for the "
                "'lxml' backend")
        self._tree = fromstring(self.htmlStream)

    def _getElementByTagName(self, tagName):
        # Walk everything the tree's iter() yields and keep exact tag matches.
        return [node for node in self._tree.iter() if node.tag == tagName]

    def elementToString(self, element):
        return tostring(element)

    def getAttrValueOfElement(self, element, attName):
        """Return attribute ``attName`` of ``element`` as a string.

        Returns ``""`` when the element does not carry that attribute.
        """
        value = element.get(attName)
        if value is None:
            return ""
        return str(value)


def _demo():
    """Parse a small snippet and print each image's ``src`` and markup."""
    s1 = '''
    <p>BEIAI</p>
    <p><img src="/attachment/28" alt=""></p>
    '''
    print(s1)
    domtree = createDomTree(s1, 'soup')
    for image in domtree.getImageList():
        print(domtree.getAttrValueOfElement(image, 'src'))
        print(domtree.elementToString(image))


if __name__ == "__main__":
    _demo()
为什么能抽出一个公用的接口?本质上是因为这些第三方库所做的事情具有相似性。具体到代码设计上,功能相近的类就应该提供相似的 API,否则类的设计就是有问题的。
浙公网安备 33010602011771号