TiDB v3.0 Log Query System

Overall pipeline

filebeat ----> kafka ----> logstash (ruby) ----> elk

filebeat

Filebeat collects the service logs and slow-query logs on each host and pushes them to Kafka.

filebeat.inputs:
- type: log
  paths:
    - "/databases/deploy/log/tidb_slow_query.log"
    - "/databases/deploy/log/tidb_slow_query.log"
  fields:
    service: tidb_slow
    ip: yourip
    cluster: clustername   # used to distinguish logs from multiple clusters
  exclude_lines: ['COMMIT', 'commitTxn', 'mysql.', 'commit;']
  multiline.pattern: "# Time:"
  multiline.negate: true
  multiline.match: after
  tail_files: true
  fields_under_root: true
  scan_frequency: 1s

- type: log
  paths:
    - "/databases/deploy/log/pd.log"
    - "/databases/deploy/log/tidb.log"
    - "/databases/deploy/log/tikv.log"
  fields:
    service: tidb_serverlog
    ip: yourip
    cluster: com
  exclude_lines: ['pushgateway','connection closed','new connection']
  tail_files: true
  fields_under_root: true
  scan_frequency: 1s

- type: log
  paths:
    - "/data/log/mongodb/*.log"
    - "/data/mongodb*/log/*.log"
  fields:
    service: mongodb_slow
    ip: yourip
  include_lines: ['ms']
  exclude_lines: ['Successfully', 'killcursors', 'oplog.rs', 'admin.', 'config.']
  multiline.pattern: "^20"
  multiline.negate: true
  multiline.match: after
  tail_files: true
  fields_under_root: true
  scan_frequency: 1s

processors:
  - drop_fields:
      fields: ["beat", "offset", "input", "prospector"]

output.kafka:
  hosts: ["kafka1.zzq.com:9092", "kafka2.zzq.com:9092", "kafka3.zzq.com:9092"]
  topic: "beat-%{[service]}"
  partition.round_robin:
    reachable_only: false
  required_acks: 1
  compression: none
  max_message_bytes: 10000000
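
For context, a TiDB 3.0 slow-log entry looks roughly like the following (the values here are illustrative, not taken from a real cluster). The multiline.pattern "# Time:" above is what stitches these lines back into a single event, and the Ruby filter below extracts the individual fields:

# Time: 2023-01-10T20:00:00.000000+08:00
# Txn_start_ts: 438586284806635526
# User: app_user@10.0.0.8
# Query_time: 1.527627037
# Process_time: 0.07 Request_count: 1 Total_keys: 131073 Process_keys: 131072
# DB: test
# Is_internal: false
# Digest: 42a1c8aae6f133e934d4bf0147491709a8812ea05ff8819ec522780fe657b772
# Stats: t:pseudo
# Num_cop_tasks: 1
# Cop_proc_avg: 0.07 Cop_proc_p90: 0.07 Cop_proc_max: 0.07 Cop_proc_addr: 10.0.0.9:20160
# Cop_wait_avg: 0 Cop_wait_p90: 0 Cop_wait_max: 0 Cop_wait_addr: 10.0.0.9:20160
# Mem_max: 525211
# Succ: true
select * from t;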

Logstash structures the raw log events. Because the slow-log field order is not fixed, plain grok patterns are unreliable here (see FAQ 3), so a Ruby filter with explicit regexes does the extraction:

filter{
    if [service] == "tidb_slow_new" {
    ruby {
      code => "
        begin
	        a=event.get('message')

	        generate_time = a.scan(/# Time:\s(.*?)\n/)
	        if generate_time.length > 0
                    event.set('generate_time',generate_time[0][0])
	        end

	        user = a.scan(/# User:\s(.*?)@(.*?)\n/)
	        if user.length > 0
	            event.set('user',user[0][0])
	            event.set('client_ip',user[0][1])
	        end

	        query_time = a.scan(/Query_time:\s([1-9]\d*\.\d*|0\.\d*[1-9]\d*)\n/)
	        if query_time.length > 0
	            event.set('query_time',query_time[0][0].to_f)
	        end
	        process_time = a.scan(/Process_time:\s([1-9]\d*\.\d*|0\.\d*[1-9]\d*)/)
	        if process_time.length > 0
	            event.set('process_time',process_time[0][0].to_f)
	        end
  
	        compile_time = a.scan(/Compile_time:\s([1-9]\d*\.\d*|0\.\d*[1-9]\d*)/)
	        if compile_time.length > 0
	            event.set('compile_time',compile_time[0][0].to_f)
	        end

                request_count = a.scan(/Request_count:\s(\d+)/)
	        if request_count.length > 0
	            event.set('request_count',request_count[0][0].to_i)
	        end

	        total_keys = a.scan(/Total_keys:\s(\d+)/)
	        if total_keys.length > 0
	            event.set('total_keys',total_keys[0][0].to_i)
	        end

	        process_keys = a.scan(/Process_keys:\s(\d+)/)
	        if process_keys.length > 0
	            event.set('process_keys',process_keys[0][0].to_i)
	        end

	        db = a.scan(/# DB: (.*?)\n/)
	        if db.length > 0
	            event.set('DB',db[0][0])
	        end

	        is_internal = a.scan(/# Is_internal: (.*?)\n/)
	        if is_internal.length > 0
	            event.set('is_internal',is_internal[0][0])
	        end

	        index_name = a.scan(/# Index_names: \[([a-z_0-9]*?):([a-z_0-9]*?)\]\n/)
	        if index_name.length > 0
	            event.set('index_name',index_name[0][1])
	            event.set('table_name',index_name[0][0])
	        end

                table = a.scan(/# Stats:\s([a-z_0-9]*?):(.*?)\n/)
                if table.length > 0
                    # "# Stats: tbl1:stat1,tbl2:stat2" -> table/sql_stat (plus table_1/sql_stat_1)
                    event.set('table',table[0][0])
                    stats_rest = table[0][1]
                    if stats_rest.include?(',')
                        table_list = stats_rest.split(/[,:]/)
                        event.set('sql_stat',table_list[0])
                        event.set('table_1',table_list[1])
                        event.set('sql_stat_1',table_list[2])
                    else
                        event.set('sql_stat',stats_rest)
                    end
                else
                    # no Stats line: fall back to pulling the table name out of the SQL text
                    table_spc = a.scan(/(INSERT INTO|from|UPDATE|FROM|update|insert into|analyze table|ALTER table|alter table|replace into)\s{1,3}(\S*?)[\s;]{1,3}/)
                    if table_spc.length > 0
                        str1 = table_spc[0][1]
                        table_spc_list = str1.gsub(/\`/,'').split(/\./)
                        if table_spc_list.length > 1
                            # qualified name "db.tbl": keep both parts
                            event.set('table',table_spc_list[1])
                            if !event.get('DB')
                                event.set('DB',table_spc_list[0])
                            end
                        else
                            event.set('table',table_spc_list[0])
                        end
                    end
                end

	        digest = a.scan(/# Digest: (.*?)\n/)
	        if digest.length > 0
	            event.set('digest',digest[0][0])
	        end

	        num_cop_tasks = a.scan(/# Num_cop_tasks: (\d+?)\n/)
	        if num_cop_tasks.length > 0
	            event.set('num_cop_tasks',num_cop_tasks[0][0].to_i)
	        end
	        cop_proc_avg = a.scan(/# Cop_proc_avg: ([1-9]\d*\.\d*|0\.\d*[1-9]\d*) /)
	        if cop_proc_avg.length > 0
	            event.set('cop_proc_avg',cop_proc_avg[0][0].to_f)
	        end

	        cop_proc_p90 = a.scan(/Cop_proc_p90: ([1-9]\d*\.\d*|0\.\d*[1-9]\d*) /)
	        if cop_proc_p90.length > 0
	            event.set('cop_proc_p90',cop_proc_p90[0][0].to_f)
	        end

	        cop_proc_max = a.scan(/Cop_proc_max: ([1-9]\d*\.\d*|0\.\d*[1-9]\d*) /)
	        if cop_proc_max.length > 0
	            event.set('cop_proc_max',cop_proc_max[0][0].to_f)
	        end

	        cop_proc_addr = a.scan(/Cop_proc_addr: (.*?):(.*?)\n/)
	        if cop_proc_addr.length > 0
	            event.set('cop_proc_addr',cop_proc_addr[0][0])
	            event.set('cop_proc_port',cop_proc_addr[0][1])
	        end

	        cop_wait_avg = a.scan(/# Cop_wait_avg: ([1-9]\d*\.\d*|0\.\d*[1-9]\d*|\d*) /)
	        if cop_wait_avg.length > 0
	            event.set('cop_wait_avg',cop_wait_avg[0][0].to_f)
	        end

	        cop_wait_p90 = a.scan(/Cop_wait_p90: ([1-9]\d*\.\d*|0\.\d*[1-9]\d*|\d*) /)
	        if cop_wait_p90.length > 0
	            event.set('cop_wait_p90',cop_wait_p90[0][0].to_f)
	        end

	        cop_wait_max = a.scan(/Cop_wait_max: ([1-9]\d*\.\d*|0\.\d*[1-9]\d*|\d*) /)
	        if cop_wait_max.length > 0
	            event.set('cop_wait_max',cop_wait_max[0][0].to_f)
	        end
	        cop_wait_addr = a.scan(/Cop_wait_addr: (.*?):(.*?)\n/)
	        if cop_wait_addr.length > 0
	            event.set('cop_wait_addr',cop_wait_addr[0][0])
	            event.set('cop_wait_port',cop_wait_addr[0][1])
	        end

	        mem_max = a.scan(/# Mem_max: (\d+?)\n/)
	        if mem_max.length > 0
	            event.set('mem_max',mem_max[0][0].to_i)
	        end

	        succ = a.scan(/# Succ: (\w+?)\n/)
	        if succ.length > 0
	            event.set('succ',succ[0][0])
	        end

	        txn_retry = a.scan(/ Txn_retry: (\d+?)\n/)
	        if txn_retry.length > 0
	            event.set('txn_retry',txn_retry[0][0].to_i)
	        end

	        sql = a.scan(/\n([^#]{1}.*)/)
	        if sql.length > 0
	            event.set('sql',sql[0][0])
	        end
        rescue
	        puts $!
	        a1 = event.get('message')
	        event.set('message_origin',a1)
        ensure
	        event.remove('message')
        end

    "
	    } 
    }
}
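
To sanity-check the extraction regexes outside of Logstash, the same scan-based logic can be exercised as a plain Ruby script. A minimal sketch (the sample entry and the subset of fields are illustrative):

# Standalone check of the scan-based extraction used in the filter above.
sample = <<~LOG
  # Time: 2023-01-10T20:00:00.000000+08:00
  # User: app_user@10.0.0.8
  # Query_time: 1.527627037
  # DB: test
  select * from t where id > 100;
LOG

fields = {}

time = sample.scan(/# Time:\s(.*?)\n/)
fields['generate_time'] = time[0][0] if time.length > 0

user = sample.scan(/# User:\s(.*?)@(.*?)\n/)
if user.length > 0
  fields['user']      = user[0][0]
  fields['client_ip'] = user[0][1]
end

query_time = sample.scan(/Query_time:\s([1-9]\d*\.\d*|0\.\d*[1-9]\d*)\n/)
fields['query_time'] = query_time[0][0].to_f if query_time.length > 0

# first line not starting with '#' is the SQL statement itself
sql = sample.scan(/\n([^#]{1}.*)/)
fields['sql'] = sql[0][0] if sql.length > 0

p fields
# {"generate_time"=>"2023-01-10T20:00:00.000000+08:00", "user"=>"app_user",
#  "client_ip"=>"10.0.0.8", "query_time"=>1.527627037,
#  "sql"=>"select * from t where id > 100;"}

Running this prints the extracted hash, which makes it easy to iterate on a regex before deploying the filter.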

ELK view

(screenshot: the Kibana slow-query view)
Select the target cluster to view its logs. During an incident, sort hard on process_keys and query_time to quickly pin down slow SQL, then aggregate statements of the same shape by their slow-SQL digest.

FAQs

  1. TiDB 3.0 does not ship with an integrated slow-log system, so one has to be built in-house; later versions come with a mature slow-log framework.
  2. In TiDB 3.0 the server component is deployed across multiple nodes, and the slow_query log of every node must be collected, otherwise entries are missed. Later versions introduce the cluster slow query concept, which is more convenient.
  3. The field order in the slow log is not fixed, so conventional grok patterns in Logstash frequently fail to parse entries; a Ruby script doing regex extraction on the raw string is needed instead.