Collecting CloudFront logs into Alibaba Cloud SLS

AWS CloudFront can deliver its access logs to S3, and AWS also offers a way to view them, but it is awkward to use, and checking the logs means logging in to the AWS account every time. Since the business logs already live in Alibaba Cloud SLS, the CloudFront logs are shipped to SLS as well, so everything can be viewed and queried in one place.

  • Overall architecture
cloudfront -> s3 -> ecs (in the same region as SLS) -> sls
  • Dependencies
/root/anaconda3/bin/python3 --version
Python 3.12.11
/root/anaconda3/bin/pip3 install boto3
/root/anaconda3/bin/pip3 install  -U aliyun-log-python-sdk
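A quick way to confirm both SDKs import under the Anaconda interpreter (the printed boto3 version is simply whatever pip installed):

# Run with /root/anaconda3/bin/python3
import boto3
from aliyun.log import LogClient  # provided by aliyun-log-python-sdk

print("boto3:", boto3.__version__)
print("aliyun.log LogClient importable:", LogClient.__name__)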
  • Project layout
ll /data/apps/cloudfront_s3_to_sls/ | awk '{print $NF}'
config.py # configuration: S3 and SLS connection details
data # directory holding the files synced down from S3
parse_and_push.py  # pushes log contents to SLS
run.py  # entry point
sync_s3.py # syncs log files from S3
utils.py   # logging helper
  • utils.py
from datetime import datetime

def log(msg):
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {msg}")
  • config.py
import os

# Root directory: all synced logs go here, split into per-date subdirectories
LOCAL_SYNC_DIR = "/data/apps/cloudfront_s3_to_sls/data"

# Alibaba Cloud Log Service (SLS) settings
SLS_ENDPOINT = "us-hongkong-1.log.aliyuncs.com"
SLS_PROJECT = "project-log"
SLS_LOGSTORE = "cloudfront-log"
ACCESS_KEY_ID = "xxxxx"
ACCESS_KEY_SECRET = "xxxx"

# AWS S3 settings
AWS_ACCESS_KEY_ID = "xxx"
AWS_SECRET_ACCESS_KEY = "xx+xx+xx"
AWS_REGION = "us-hongkong-1"
AWS_S3_BUCKET = "cloudfront-s3"
AWS_S3_PREFIX = ""  # empty string means the bucket root

# Only sync S3 objects modified within the last N hours
SYNC_LOOKBACK_HOURS = 1

# Only push local files modified within the last N hours to SLS
PROCESS_LOOKBACK_HOURS = 1
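config.py already imports os but never uses it; one common tweak is to read the keys from environment variables instead of hard-coding them, so they never end up in version control. A minimal sketch (the variable names below are arbitrary examples, not required by either SDK):

import os

# Hypothetical environment variable names; fall back to empty strings if unset.
ACCESS_KEY_ID = os.environ.get("SLS_ACCESS_KEY_ID", "")
ACCESS_KEY_SECRET = os.environ.get("SLS_ACCESS_KEY_SECRET", "")
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")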
  • sync_s3.py
import os
import boto3
import botocore.exceptions
from datetime import datetime, timedelta, timezone
from config import (
    LOCAL_SYNC_DIR,
    AWS_S3_BUCKET,
    AWS_S3_PREFIX,
    AWS_ACCESS_KEY_ID,
    AWS_SECRET_ACCESS_KEY,
    AWS_REGION,
    SYNC_LOOKBACK_HOURS,
)
from utils import log

# Initialize the S3 client
s3 = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION,
)

def sync_from_s3():
    # Compute the cutoff at call time (not import time) so repeated runs stay correct
    cutoff_time = datetime.now(timezone.utc) - timedelta(hours=SYNC_LOOKBACK_HOURS)
    log(f"Syncing files modified after: {cutoff_time.isoformat()}")

    try:
        paginator = s3.get_paginator("list_objects_v2")
        page_iterator = paginator.paginate(Bucket=AWS_S3_BUCKET, Prefix=AWS_S3_PREFIX)

        for page in page_iterator:
            for obj in page.get("Contents", []):
                key = obj["Key"]
                last_modified = obj["LastModified"]

                if last_modified < cutoff_time:
                    continue
                if not key.endswith(".gz"):
                    continue

                # Use the object's last-modified date as the local subdirectory name
                date_str = last_modified.strftime("%Y-%m-%d")
                local_dir = os.path.join(LOCAL_SYNC_DIR, date_str)
                os.makedirs(local_dir, exist_ok=True)

                filename = os.path.basename(key)
                local_path = os.path.join(local_dir, filename)

                if os.path.exists(local_path):
                    continue  # already downloaded, skip

                log(f"Downloading {key} → {local_path}")
                try:
                    s3.download_file(AWS_S3_BUCKET, key, local_path)
                    log(f"Downloaded: {filename}")
                except Exception as e:
                    log(f"Failed to download {key}: {e}")

    except botocore.exceptions.NoCredentialsError:
        log("ERROR: No AWS credentials found. Please check config.py.")
    except Exception as e:
        log(f"Unexpected error while syncing S3: {e}")
  • parse_and_push.py
import os
import gzip
import time
from datetime import datetime, timedelta
from aliyun.log import LogClient, LogItem, PutLogsRequest
from config import (
    LOCAL_SYNC_DIR,
    SLS_ENDPOINT,
    SLS_PROJECT,
    SLS_LOGSTORE,
    ACCESS_KEY_ID,
    ACCESS_KEY_SECRET,
    PROCESS_LOOKBACK_HOURS,
)
from utils import log

client = LogClient(SLS_ENDPOINT, ACCESS_KEY_ID, ACCESS_KEY_SECRET)

def parse_line_to_log_item(line, fields, source_file):
    if not line.strip() or line.startswith("#"):
        return None
    parts = line.strip().split("\t")
    if len(parts) != len(fields):
        return None

    # Map each field name to its value, stripping parentheses from names like cs(Host)
    field_map = {
        k.replace("(", "").replace(")", ""): v
        for k, v in zip(fields, parts)
    }

    contents = []
    # Put date and time first
    for key in ("date", "time"):
        if key in field_map:
            contents.append((key, field_map[key]))

    # Then the remaining fields, excluding date and time
    for k in fields:
        k_clean = k.replace("(", "").replace(")", "")
        if k_clean not in ("date", "time"):
            contents.append((k_clean, field_map[k_clean]))

    # Record the source file name as an extra field
    contents.append(("filename", source_file))

    timestamp = int(time.time())
    # Pass keyword arguments so the call does not depend on LogItem's positional parameter order
    return LogItem(timestamp=timestamp, contents=contents)

def send_logs_to_sls(log_items, topic="cloudfront", source="cloudfront_s3"):
    if not log_items:
        return
    try:
        req = PutLogsRequest(
            SLS_PROJECT,
            SLS_LOGSTORE,
            topic,
            source,
            log_items
        )
        res = client.put_logs(req)
        res.log_print()
    except Exception as e:
        log(f"SLS 上传失败: {e}")

def parse_and_push():
    marker_dir = os.path.join(LOCAL_SYNC_DIR, ".processed_files")
    os.makedirs(marker_dir, exist_ok=True)
    marker_file = os.path.join(marker_dir, f"{time.strftime('%Y-%m-%d')}.txt")

    processed_files = set()
    if os.path.exists(marker_file):
        with open(marker_file, "r") as f:
            processed_files = set(line.strip() for line in f)

    now = datetime.now()
    cutoff = now - timedelta(hours=PROCESS_LOOKBACK_HOURS)

    for root, _, files in os.walk(LOCAL_SYNC_DIR):
        for filename in files:
            filepath = os.path.join(root, filename)

            if '.processed_files' in filepath or filepath in processed_files:
                continue

            # Skip files modified before the cutoff
            mtime = datetime.fromtimestamp(os.path.getmtime(filepath))
            if mtime < cutoff:
                continue

            log(f"Processing {filepath}")
            try:
                opener = gzip.open if filepath.endswith(".gz") else open
                with opener(filepath, "rt", encoding="utf-8", errors="ignore") as f:
                    fields = []
                    for line in f:
                        if line.startswith("#Fields:"):
                            fields = line.replace("#Fields:", "").strip().split()
                            break
                    if not fields:
                        log(f"No #Fields found in {filepath}, skipping.")
                        continue
                    f.seek(0)

                    batch = []
                    # Use the path relative to the sync root as the filename field value
                    relative_path = os.path.relpath(filepath, LOCAL_SYNC_DIR)

                    for line in f:
                        item = parse_line_to_log_item(line, fields, relative_path)
                        if item:
                            batch.append(item)
                        if len(batch) >= 100:
                            send_logs_to_sls(batch)
                            batch = []
                    if batch:
                        send_logs_to_sls(batch)

                with open(marker_file, "a") as f:
                    f.write(filepath + "\n")
                processed_files.add(filepath)

            except Exception as e:
                log(f"Failed to process {filepath}: {e}")
  • run.py
import time
from sync_s3 import sync_from_s3
from parse_and_push import parse_and_push
from utils import log

def main():
    log("=== Start syncing from S3 ===")
    sync_from_s3()

    log("=== Start parsing and pushing to SLS ===")
    parse_and_push()

    time.sleep(10)

if __name__ == "__main__":
    main()
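The time.sleep(10) at the end only matters if main() is wrapped in a loop; under cron the script simply exits ten seconds later. If a long-running process is preferred over cron, a minimal loop variant could look like this (the interval is an arbitrary choice):

def main_loop(interval_seconds=300):
    # Run the sync-and-push cycle forever instead of relying on cron.
    while True:
        main()
        time.sleep(interval_seconds)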
  • How to run
crontab -l
*/5 * * * * cd /data/apps/cloudfront_s3_to_sls;/root/anaconda3/bin/python3 run.py >> run.log 2>&1