python模块之HTMLParser(原理很大程度上就是对类构造的熟练运用)

# -*- coding: utf-8 -*-
#python 2.7
#xiaodeng
#python模块之HTMLParser(原理很大程度上就是对类构造的熟练运用)


import HTMLParser
#tag是html标签名,attrs是由(属性名,属性值)元组(tuple)组成的列表(list)。
#HTMLParser自动将tag和attrs中的属性名都转为小写(属性值的大小写保持不变)


'''
>>> help(HTMLParser)
Help on module HTMLParser:
CLASSES
    exceptions.Exception(exceptions.BaseException)
        HTMLParseError
    markupbase.ParserBase
        HTMLParser
    
    class HTMLParser(markupbase.ParserBase)
     |  Find tags and other markup and call handler functions.
     |  
     |  Usage:
     |      p = HTMLParser()#初始化
     |      p.feed(data)  #feed()方法可以多次调用:不必一次把整个HTML字符串都传进去,可以分多次、一部分一部分地传入。
     |                    #解析器只处理已经完整的元素;不完整的数据会被缓冲,直到提供了更多数据或者调用close()为止。
     |      ...
     |      p.close()
     |  
     |  Methods defined here:
     |  
     |  __init__(self)
     |      Initialize and reset this instance.
     |  
     |  check_for_whole_start_tag(self, i)
     |      # Internal -- check to see if we have a complete starttag; return end
     |      # or -1 if incomplete.
     |  
     |  clear_cdata_mode(self)
     |  
     |  close(self)
     |      Handle any buffered data.
     |  
     |  error(self, message)
     |  
     |  feed(self, data)            #向分析器提供数据。
     |      Feed data to the parser.
     |      
     |      Call this as often as you want, with as little or as much text
     |      as you want (may include '\n').
     |  
     |  get_starttag_text(self)
     |      Return full source of start tag: '<...>'.
     |  
     |  goahead(self, end)
     |      # Internal -- handle data as far as reasonable.  May leave state
     |      # and data to be processed by a subsequent call.  If 'end' is
     |      # true, force handling all data as if followed by EOF marker.
     |  
     |  handle_charref(self, name)              #处理字符引用,即以&#开头的数字形式引用,比如&#62;或&#x3E;
     |      # Overridable -- handle character reference
     |  
     |  handle_comment(self, data)              #处理注释,处理<!--comment-->内的内容
     |      # Overridable -- handle comment
     |  
     |  handle_data(self, data)                 #处理数据,就是<xx>data</xx>中间的那些数据
     |      # Overridable -- handle data
     |  
     |  handle_decl(self, decl)                 #处理以<!开头的声明,比如<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
     |                                          #即文档类型声明
     |      # Overridable -- handle declaration
     |  
     |  handle_endtag(self, tag)                #处理结束标签,</xx>
     |      # Overridable -- handle end tag
     |  
     |  handle_entityref(self, name)            #处理实体引用,即形如&name;的命名引用,比如&gt;
     |      # Overridable -- handle entity reference
     |  
     |  handle_pi(self, data)                   #处理处理指令(processing instruction),形如<?instruction>
     |      # Overridable -- handle processing instruction
     |  
     |  handle_startendtag(self, tag, attrs)    #处理自闭合标签,比如<xx/>;默认实现会依次调用handle_starttag和handle_endtag
     |      # Overridable -- finish processing of start+end tag: <tag.../>
     |  
     |  handle_starttag(self, tag, attrs)       # 处理开始标签,比如<xx>
     |      # Overridable -- handle start tag
     |  
     |  parse_bogus_comment(self, i, report=1)
     |      # Internal -- parse bogus comment, return length or -1 if not terminated
     |      # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
     |  
     |  parse_endtag(self, i)                   
     |      # Internal -- parse endtag, return end or -1 if incomplete
     |  
     |  parse_html_declaration(self, i)
     |      # Internal -- parse html declarations, return length or -1 if not terminated
     |      # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
     |      # See also parse_declaration in _markupbase
     |  
     |  parse_pi(self, i)
     |      # Internal -- parse processing instr, return end or -1 if not terminated
     |  
     |  parse_starttag(self, i)
     |      # Internal -- handle starttag, return end or -1 if not terminated
     |  
     |  reset(self)
     |      Reset this instance.  Loses all unprocessed data.
     |  
     |  set_cdata_mode(self, elem)
     |  
     |  unescape(self, s)
     |  
     |  unknown_decl(self, data)
     |  
     |  ----------------------------------------------------------------------
     |  Data and other attributes defined here:
     |  
     |  CDATA_CONTENT_ELEMENTS = ('script', 'style')
     |  
     |  entitydefs = None
     |  
     |  ----------------------------------------------------------------------
     |  Methods inherited from markupbase.ParserBase:
     |  
     |  getpos(self)
     |      Return current line number and offset.
     |  
     |  parse_comment(self, i, report=1)
     |      # Internal -- parse comment, return length or -1 if not terminated
     |  
     |  parse_declaration(self, i)
     |      # Internal -- parse declaration (for use by subclasses).
     |  
     |  parse_marked_section(self, i, report=1)
     |      # Internal -- parse a marked section
     |      # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
     |  
     |  updatepos(self, i, j)
     |      # Internal -- update line number and offset.  This should be
     |      # called for each piece of data exactly once, in order -- in other
     |      # words the concatenation of all the input strings to this
     |      # function should be exactly the entire input.

>>> 
'''

 

posted @ 2015-11-21 15:47  Xiao|Deng  阅读(384)  评论(0)  编辑  收藏  举报