# 定时统计CDN日志访问量并存入MySQL
# 下载CDN日志脚本参考地址
https://www.cnblogs.com/sswind/p/18206195
# 建表语句
-- Database and statistics table: one row per domain per day of PV/UV.
CREATE DATABASE IF NOT EXISTS log_stats;
USE log_stats;

CREATE TABLE IF NOT EXISTS domain_pv_uv (
    id          INT AUTO_INCREMENT PRIMARY KEY,
    domain      VARCHAR(100) NOT NULL COMMENT '域名',
    stat_date   DATE         NOT NULL COMMENT '统计日期',
    pv          INT          NOT NULL DEFAULT 0 COMMENT 'PV访问量',
    uv          INT          NOT NULL DEFAULT 0 COMMENT 'UV独立IP',
    create_time DATETIME     DEFAULT CURRENT_TIMESTAMP,
    -- one row per (domain, day); the Python upsert relies on this key
    UNIQUE KEY uk_domain_date (domain, stat_date)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '域名PVUV统计表';
# MySQL授权
-- Account used by the Python stats script (DB_CONFIG).
-- IF NOT EXISTS keeps this idempotent on re-runs, matching the DDL above.
-- NOTE(review): '123' is a placeholder password — replace before production use.
CREATE USER IF NOT EXISTS 'root'@'127.0.0.1' IDENTIFIED BY '123';
-- Least privilege: the script only reads/writes log_stats, so do not grant
-- ALL ON *.* (the original grant was server-wide, which is excessive here).
GRANT ALL PRIVILEGES ON log_stats.* TO 'root'@'127.0.0.1';
FLUSH PRIVILEGES;
# Python脚本
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Aggregate per-domain CDN access-log PV/UV and upsert into MySQL.

Expected layout under LOG_BASE_PATH (first-level sub-directory == domain):
    <LOG_BASE_PATH>/<domain>/<domain>_<YYYY-MM-DD>.log

The script counts, for the day BEFORE yesterday, the number of non-static
request lines (PV) and distinct client IPs (UV) per domain, then writes the
pair into log_stats.domain_pv_uv (insert-or-update on (domain, stat_date)).
"""

import os
import re
import datetime

import pymysql

# ====================== 【只需修改这里】配置区 ======================
LOG_BASE_PATH = "/home/cdn_logs/"  # log root; first-level sub-directory == domain

# MySQL connection settings (see the CREATE USER / GRANT section of the notes)
DB_CONFIG = {
    "host": "127.0.0.1",
    "port": 3306,
    "user": "root",
    "password": "123",
    "db": "log_stats",
    "charset": "utf8mb4",
}
# =================================================================

# NOTE(review): despite the name, this is the day BEFORE yesterday (days=2),
# which matches the "前天" wording in the log messages. The name is kept
# unchanged so any external references keep working.
YESTERDAY = datetime.date.today() - datetime.timedelta(days=2)
print(YESTERDAY)
STAT_DATE = YESTERDAY.strftime("%Y-%m-%d")

# ====================== 静态资源过滤规则 ======================
# Static-asset extensions excluded from PV/UV counting (edit freely).
STATIC_EXTENSIONS = (
    '.css', '.js', '.png', '.jpg', '.jpeg', '.gif', '.ico',
    '.svg', '.woff', '.woff2', '.ttf', '.eot', '.map', '.txt'
)
# =================================================================

# Log line shape: "[dd/Mon/yyyy:HH:MM:SS +zzzz] <client-ip> ...".
# Precompiled once instead of re-parsing the pattern for every log line.
_LINE_RE = re.compile(
    r'^\[\d{2}/[A-Za-z]{3}/\d{4}:\d{2}:\d{2}:\d{2} \+\d{4}\]\s*'
    r'(\d+\.\d+\.\d+\.\d+)'
)


def parse_log_file(log_path):
    """Parse one log file and return ``(pv, uv)`` with static assets filtered.

    pv: count of matching, non-static request lines.
    uv: count of distinct client IPs among those lines.
    Returns (0, 0) when the file does not exist; on a mid-file read error the
    counts accumulated so far are returned (same best-effort as the original).
    """
    ip_set = set()
    pv_count = 0
    if not os.path.exists(log_path):
        return 0, 0
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                # Step 1: extract the client IP; adjust _LINE_RE to the
                # actual CDN log format if it differs.
                match = _LINE_RE.match(line)
                if not match:
                    continue
                # Step 2: skip static resources.
                # NOTE(review): substring matching over the whole line can
                # false-positive (e.g. ".js" inside a query string or UA);
                # matching against the URL path only would be stricter.
                lower_line = line.lower()
                if any(ext in lower_line for ext in STATIC_EXTENSIONS):
                    continue
                # Step 3: valid request — count PV and collect the IP for UV.
                ip_set.add(match.group(1))
                pv_count += 1
    except Exception as e:
        print(f"读取失败 {log_path}: {str(e)}")
    return pv_count, len(ip_set)


def save_to_mysql(domain, stat_date, pv, uv):
    """Upsert one (domain, stat_date) row: insert, or update pv/uv if present.

    Errors are reported but not raised, so one failing domain does not abort
    the whole run.
    """
    sql = """
        INSERT INTO domain_pv_uv (domain, stat_date, pv, uv)
        VALUES (%s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE pv=%s, uv=%s, create_time=NOW()
    """
    try:
        # Context managers guarantee the connection and cursor are closed even
        # when execute()/commit() raises (the original leaked both on error).
        with pymysql.connect(**DB_CONFIG) as conn:
            with conn.cursor() as cursor:
                cursor.execute(sql, (domain, stat_date, pv, uv, pv, uv))
            conn.commit()
        print(f"✅ 写入成功:{domain} | {stat_date} | PV={pv} | UV={uv}")
    except Exception as e:
        print(f"❌ 写入失败 {domain}: {str(e)}")


def main():
    """Walk every domain directory, compute PV/UV for STAT_DATE, store them."""
    print(f"===== 开始统计:{STAT_DATE} 的 PV/UV(已过滤静态资源) =====")
    for domain in os.listdir(LOG_BASE_PATH):
        domain_dir = os.path.join(LOG_BASE_PATH, domain)
        # Only first-level directories are treated as domains.
        if not os.path.isdir(domain_dir):
            continue
        # Log file for the stat day: <domain>_<YYYY-MM-DD>.log
        log_file_name = f"{domain}_{STAT_DATE}.log"
        print(f"📄 目标日志文件:{domain_dir}/{log_file_name}\n")
        log_file_path = os.path.join(domain_dir, log_file_name)
        pv, uv = parse_log_file(log_file_path)
        if pv == 0 and uv == 0:
            print(f"ℹ️ {domain} 无前天有效访问日志")
            continue
        save_to_mysql(domain, STAT_DATE, pv, uv)
    print("\n===== ✅ 统计完成 =====")


if __name__ == "__main__":
    main()

浙公网安备 33010602011771号