HTMLParser 测试
从 https://www.python.org/events/python-events/ 获取近期会议的信息(时间、地点、标题等)。
所有信息打印到控制台(console),需要的信息单独写入日志文件(file log)。
顺便练习了一下 logging 模块:所有信息输出到控制台,仅将需要的信息写入文件。
# -*- coding:utf-8 -*-
"""HTMLParser / logging exercise.

Fetch https://www.python.org/events/python-events/ and replay every parser
event (tags, text, comments, entities) through a logger.  Everything goes to
the console; only the markup inside the ``<ul class="list-recent-events menu">``
element (the upcoming events: date, location, title, ...) is additionally
written to a log file.
"""
from configparser import ConfigParser
from html.entities import name2codepoint  # noqa: F401  (kept: imported by the original script)
from html.parser import HTMLParser
from urllib import request
import logging

# Page that is downloaded and parsed when the file is run as a script.
URL = "https://www.python.org/events/python-events/"

# Value of the ``class`` attribute of the <ul> that holds the event entries.
EVENT_LIST_CLASS = "list-recent-events menu"


class myLogSetting(object):
    """Configure one logger with a console handler and an optional file handler.

    Args:
        file_config_path: Path of an INI file to read the handler levels from.
            When non-empty it takes precedence over ``file_log_level`` and
            ``console_log_level``.  Expected layout::

                [default]
                fileloglevel=INFO
                consoleloglevel=WARN

        file_log_path: Path of the log file.  An empty string means that no
            file handler is created.
        file_log_level: Level name for the file handler (used when no config
            file is given).
        console_log_level: Level name for the console handler (used when no
            config file is given).
        simple_format: When true only the message is emitted; otherwise the
            timestamp, logger name and level name are included as well.

    Raises:
        KeyError: If a level name is not one of the keys of ``level_dict``.
        configparser.Error: If the config file lacks the ``default`` section
            or one of the two options.

    Note:
        The logger is ``logging.getLogger(__name__)``, so every instance
        created in the same process attaches its handlers to the same logger.
        The logger's own level starts at ERROR; raise or lower it with
        ``set_logger_level`` (or ``logger.setLevel``).
    """

    def __init__(self, file_config_path="", file_log_path="", file_log_level="INFO",
                 console_log_level="INFO", simple_format=True):
        # Decide whether records carry time / logger name / level or only the message.
        if simple_format:
            self.formatter = logging.Formatter('%(message)s')
        else:
            self.formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

        # Accepted level names and their numeric values:
        # CRITICAL/FATAL = 50, ERROR = 40, WARNING/WARN = 30, INFO = 20,
        # DEBUG = 10, NOTSET = 0.
        self.level_dict = {
            'CRITICAL': logging.CRITICAL,
            'FATAL': logging.FATAL,
            'ERROR': logging.ERROR,
            'WARNING': logging.WARNING,
            'WARN': logging.WARN,
            'INFO': logging.INFO,
            'DEBUG': logging.DEBUG,
            'NOTSET': logging.NOTSET,
        }

        if file_config_path != "":
            # A config file was supplied: read both handler levels from it.
            self._myConf = ConfigParser()
            self._myConf.read(file_config_path, encoding='utf-8')
            self.fileloglevel = self._myConf.get("default", "fileloglevel")
            self.consoleloglevel = self._myConf.get("default", "consoleloglevel")
        else:
            # No config file: take the levels from the constructor arguments.
            self.fileloglevel = file_log_level
            self.consoleloglevel = console_log_level

        # Default logger configuration.
        self.logger1 = logging.getLogger(__name__)
        self.logger1.setLevel(level=logging.ERROR)

        # File handler, only when a log file path was supplied.
        self.filehandler = None
        if file_log_path != "":
            self.filehandler = logging.FileHandler(file_log_path, mode="w", encoding='UTF-8')
            self.filehandler.setLevel(self._resolve_level(self.fileloglevel))
            self.filehandler.setFormatter(self.formatter)
            self.logger1.addHandler(self.filehandler)

        # Console handler, always present.
        self.console_handler = logging.StreamHandler()
        self.console_handler.setLevel(self._resolve_level(self.consoleloglevel))
        self.console_handler.setFormatter(self.formatter)
        self.logger1.addHandler(self.console_handler)

    def _resolve_level(self, level):
        """Return the numeric level for a level name (case-insensitive)."""
        return self.level_dict[str(level).strip().upper()]

    @property
    def logger(self):
        """The configured ``logging.Logger``."""
        return self.logger1

    def set_filelog_level(self, level="INFO"):
        """Set the file handler's level.

        The handler created by this instance is addressed directly instead of
        by its position in ``logger.handlers``, so the console handler is
        never modified by mistake.  Without a file handler the level is only
        remembered.
        """
        numeric_level = self._resolve_level(level)
        self.fileloglevel = level
        if self.filehandler is not None:
            self.filehandler.setLevel(numeric_level)

    def get_filelog_level(self):
        """Return the numeric level of the file handler.

        Without a file handler the most recently configured file level is
        returned.
        """
        if self.filehandler is not None:
            return self.filehandler.level
        return self._resolve_level(self.fileloglevel)

    def set_consolelog_level(self, level="INFO"):
        """Set the console handler's level."""
        numeric_level = self._resolve_level(level)
        self.consoleloglevel = level
        self.console_handler.setLevel(numeric_level)

    def set_logger_level(self, level="INFO"):
        """Set the level of the logger itself (applies before any handler)."""
        self.logger1.setLevel(self._resolve_level(level))


class MyHTMLParser(HTMLParser):
    """Log every parser event at WARNING level.

    While the parser is inside ``<ul class="list-recent-events menu">`` the
    file handler of ``log_settings`` is lowered to INFO so that the events are
    also written to the log file; at the next ``</ul>`` it is raised back to
    FATAL.

    Args:
        log_settings: ``myLogSetting`` instance that supplies the logger and
            the file-handler level switch.  When omitted, a console-only
            ``myLogSetting()`` is created.
        **kwargs: Forwarded to ``HTMLParser`` (e.g. ``convert_charrefs``).
    """

    def __init__(self, log_settings=None, **kwargs):
        super().__init__(**kwargs)
        if log_settings is None:
            log_settings = myLogSetting()
        self._log_settings = log_settings
        self._logger = log_settings.logger

    def handle_starttag(self, tag, attrs):
        # Switch the file log on *before* logging, so the opening <ul> itself
        # is part of the file output.
        if tag == "ul" and any(name == "class" and value == EVENT_LIST_CLASS
                               for name, value in attrs):
            self._log_settings.set_filelog_level("INFO")
        # attrs is a list of (name, value) pairs; omit it entirely when empty.
        attr_text = " " + str(attrs) if attrs else ""
        self._logger.warning('<' + tag + attr_text + '>')

    def handle_endtag(self, tag):
        # Switch the file log off *before* logging, so the closing </ul> is
        # not part of the file output.
        if tag == "ul" and self._log_settings.get_filelog_level() <= logging.INFO:
            self._log_settings.set_filelog_level(level="FATAL")
        self._logger.warning('</' + tag + '>')

    def handle_startendtag(self, tag, attrs):
        # Self-closing tags are logged without their attributes.
        self._logger.warning('<' + tag + '/>')

    def handle_data(self, data):
        self._logger.warning(data)

    def handle_comment(self, data):
        self._logger.warning('<!--' + str(data) + '-->')

    def handle_entityref(self, name):
        self._logger.warning('&' + name + ';')

    def handle_charref(self, name):
        self._logger.warning('&#' + name + ';')


def main():
    """Download the events page and run it through ``MyHTMLParser``."""
    log_settings = myLogSetting(file_log_path="test.log", file_log_level="FATAL",
                                console_log_level="INFO")
    log_settings.logger.setLevel(logging.INFO)

    with request.urlopen(URL, timeout=4) as response:
        raw = response.read()
        # Honour the charset announced by the server; fall back to UTF-8.
        charset = response.headers.get_content_charset() or "utf-8"

    parser = MyHTMLParser(log_settings)
    parser.feed(raw.decode(charset))
    # Flush any text still buffered inside the parser.
    parser.close()


if __name__ == '__main__':
    main()
浙公网安备 33010602011771号