定时统计CDN日志访问量并存入MySQL

# 下载CDN日志脚本参考地址

https://www.cnblogs.com/sswind/p/18206195

# 建表语句

-- Create the statistics database and the per-domain daily PV/UV table.
CREATE DATABASE IF NOT EXISTS log_stats;
USE log_stats;

CREATE TABLE IF NOT EXISTS domain_pv_uv (
    id INT AUTO_INCREMENT PRIMARY KEY,
    domain VARCHAR(100) NOT NULL COMMENT '域名',
    stat_date DATE NOT NULL COMMENT '统计日期',
    pv INT NOT NULL DEFAULT 0 COMMENT 'PV访问量',
    uv INT NOT NULL DEFAULT 0 COMMENT 'UV独立IP',
    create_time DATETIME DEFAULT CURRENT_TIMESTAMP,
    -- One row per (domain, day): this unique key is what makes the script's
    -- INSERT ... ON DUPLICATE KEY UPDATE upsert work.
    UNIQUE KEY uk_domain_date (domain, stat_date)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '域名PVUV统计表';

# MySQL授权

-- Least privilege: the stats script only uses the log_stats schema, so grant
-- on log_stats.* rather than the original global ALL ON *.*.
-- IF NOT EXISTS makes the setup re-runnable without erroring on an existing account.
-- NOTE(review): the account name 'root'@'127.0.0.1' is kept because the Python
-- DB_CONFIG connects with it; a dedicated user (e.g. 'log_stats') would be cleaner.
CREATE USER IF NOT EXISTS 'root'@'127.0.0.1' IDENTIFIED BY '123';
GRANT ALL PRIVILEGES ON log_stats.* TO 'root'@'127.0.0.1';
FLUSH PRIVILEGES;

# Python脚本

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import datetime
import pymysql

# ====================== Configuration (only edit this section) ======================
LOG_BASE_PATH = "/home/cdn_logs/"  # log root directory (each first-level subdirectory = one domain)

# MySQL connection settings (must match the account created in the GRANT step)
DB_CONFIG = {
    "host": "127.0.0.1",
    "port": 3306,
    "user": "root",
    "password": "123",
    "db": "log_stats",
    "charset": "utf8mb4"
}
# =================================================================

# Target date is the day BEFORE yesterday (days=2) — CDN logs arrive with a delay.
# NOTE(review): the name YESTERDAY is misleading given timedelta(days=2).
YESTERDAY = datetime.date.today() - datetime.timedelta(days=2)
print(YESTERDAY)
STAT_DATE = YESTERDAY.strftime("%Y-%m-%d")
#LOG_FILE_NAME = f"domain_{STAT_DATE}.log"

# ====================== Static-resource filter ======================
# Request lines containing any of these extensions (anywhere in the line,
# case-insensitive) are excluded from PV/UV. Add or remove as needed.
STATIC_EXTENSIONS = (
    '.css', '.js', '.png', '.jpg', '.jpeg', '.gif', '.ico',
    '.svg', '.woff', '.woff2', '.ttf', '.eot', '.map', '.txt'
)
# =================================================================

# Matches lines like "[08/Apr/2026:12:00:00 +0800] 1.2.3.4 ...".
# Compiled once at import time instead of once per log line.
_LOG_LINE_RE = re.compile(
    r'^\[\d{2}/[A-Za-z]{3}/\d{4}:\d{2}:\d{2}:\d{2} \+\d{4}\]\s*(\d+\.\d+\.\d+\.\d+)'
)


def parse_log_file(log_path):
    """Parse one access log and return (pv, uv) with static resources filtered.

    pv: number of lines that match the log format and are not static resources.
    uv: number of distinct client IPs among those lines.
    Returns (0, 0) if the file does not exist or cannot be read.
    """
    ip_set = set()
    pv_count = 0

    if not os.path.exists(log_path):
        return 0, 0

    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                # Step 1: extract the client IP (adjust _LOG_LINE_RE to your log format).
                ip_match = _LOG_LINE_RE.match(line)
                if not ip_match:
                    continue
                ip = ip_match.group(1)

                # Step 2: skip static-resource requests (substring match over
                # the whole lowercased line, as in the original).
                lower_line = line.lower()
                if any(ext in lower_line for ext in STATIC_EXTENSIONS):
                    continue

                # Step 3: valid request — count PV, collect IP for UV.
                ip_set.add(ip)
                pv_count += 1
    except Exception as e:
        print(f"读取失败 {log_path}: {str(e)}")

    return pv_count, len(ip_set)


def save_to_mysql(domain, stat_date, pv, uv):
    """Upsert one (domain, stat_date) row: insert, or update via the unique key."""
    try:
        conn = pymysql.connect(**DB_CONFIG)
        try:
            # BUGFIX: the original left conn/cursor open when execute/commit
            # raised; close them deterministically with `with` + `finally`.
            with conn.cursor() as cursor:
                sql = """
                    INSERT INTO domain_pv_uv (domain, stat_date, pv, uv)
                    VALUES (%s, %s, %s, %s)
                    ON DUPLICATE KEY UPDATE pv=%s, uv=%s, create_time=NOW()
                """
                cursor.execute(sql, (domain, stat_date, pv, uv, pv, uv))
            conn.commit()
            print(f"✅ 写入成功:{domain} | {stat_date} | PV={pv} | UV={uv}")
        finally:
            conn.close()
    except Exception as e:
        print(f"❌ 写入失败 {domain}: {str(e)}")


def main():
    """Walk LOG_BASE_PATH; for each domain directory, stat the STAT_DATE log and store it."""
    print(f"===== 开始统计:{STAT_DATE} 的 PV/UV(已过滤静态资源) =====")

    for domain in os.listdir(LOG_BASE_PATH):
        domain_dir = os.path.join(LOG_BASE_PATH, domain)
        # Only first-level directories are treated as domains.
        if not os.path.isdir(domain_dir):
            continue

        # Expected layout: <LOG_BASE_PATH>/<domain>/<domain>_<YYYY-MM-DD>.log
        log_file_name = f"{domain}_{STAT_DATE}.log"
        print(f"📄 目标日志文件:{domain_dir}/{log_file_name}\n")
        log_file_path = os.path.join(domain_dir, log_file_name)

        pv, uv = parse_log_file(log_file_path)
        if pv == 0 and uv == 0:
            print(f"ℹ️ {domain} 无前天有效访问日志")
            continue

        save_to_mysql(domain, STAT_DATE, pv, uv)

    print("\n===== ✅ 统计完成 =====")


if __name__ == "__main__":
    main()

 

posted @ 2026-04-08 17:17  一窗明月半帘风  阅读(3)  评论(0)    收藏  举报