Collecting CloudFront logs into Alibaba Cloud SLS
AWS CloudFront can deliver its access logs to S3, and AWS also has log-query features, but they are clunky to use, and checking the logs means signing in to the AWS account every time. Since the business logs already live in Alibaba Cloud SLS, the CloudFront logs are shipped to SLS as well, so everything can be viewed and searched in one place.
- Overall architecture
cloudfront -> s3 -> ecs (in the same region as SLS) -> sls
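For reference, CloudFront standard logs land in the bucket as gzip-compressed, tab-separated text files. Each file starts with #Version and #Fields header lines (the #Fields line is what parse_and_push.py later uses to name the columns), roughly like this, with the field list truncated here:
#Version: 1.0
#Fields: date time x-edge-location sc-bytes c-ip cs-method cs(Host) cs-uri-stem sc-status ...
Data rows follow, one request per line, with the values separated by tabs.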
- Dependencies
/root/anaconda3/bin/python3 --version
Python 3.12.11
/root/anaconda3/bin/pip3 install boto3
/root/anaconda3/bin/pip3 install -U aliyun-log-python-sdk
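Optional sanity check that both SDKs are importable with this interpreter:
/root/anaconda3/bin/python3 -c "import boto3, aliyun.log; print(boto3.__version__)"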
- Project layout
ll /data/apps/cloudfront_s3_to_sls/ | awk '{print $NF}'
config.py          # configuration: S3 and SLS connection details
data               # local copies of the log files synced from S3
parse_and_push.py  # parses the downloaded logs and pushes them to SLS
run.py             # entry point
sync_s3.py         # downloads log files from S3
utils.py           # logging helper
- utils.py
from datetime import datetime
def log(msg):
    # Simple timestamped print; good enough when cron redirects stdout to run.log
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {msg}")
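utils.log is deliberately minimal; if log levels or rotation are ever needed, the stdlib logging module could stand in with the same call style. A sketch (not used by the scripts above):
import logging

logging.basicConfig(
    format="[%(asctime)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
log = logging.info  # same usage: log("message")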
- config.py
import os
# Root directory: all synced logs are stored here, split into per-date subdirectories
LOCAL_SYNC_DIR = "/data/apps/cloudfront_s3_to_sls/data"
# Alibaba Cloud Log Service (SLS) settings
SLS_ENDPOINT = "us-hongkong-1.log.aliyuncs.com"
SLS_PROJECT = "project-log"
SLS_LOGSTORE = "cloudfront-log"
ACCESS_KEY_ID = "xxxxx"
ACCESS_KEY_SECRET = "xxxx"
# S3 settings
AWS_ACCESS_KEY_ID = "xxx"
AWS_SECRET_ACCESS_KEY = "xx+xx+xx"
AWS_REGION = "us-hongkong-1"
AWS_S3_BUCKET = "cloudfront-s3"
AWS_S3_PREFIX = ""  # empty string means the bucket root
# Only sync S3 objects whose LastModified falls within this many hours
SYNC_LOOKBACK_HOURS = 1
# Only parse and push local files modified within this many hours
PROCESS_LOOKBACK_HOURS = 1
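config.py already imports os, so the keys do not have to be hardcoded. One option is to read them from environment variables and keep the file itself free of secrets; a sketch (the SLS_* variable names are made up for illustration, the AWS_* names are the standard ones boto3 also recognizes):
ACCESS_KEY_ID = os.environ.get("SLS_ACCESS_KEY_ID", "")
ACCESS_KEY_SECRET = os.environ.get("SLS_ACCESS_KEY_SECRET", "")
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")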
- sync_s3.py
import os
import boto3
import botocore.exceptions
from datetime import datetime, timedelta, timezone
from config import (
    LOCAL_SYNC_DIR,
    AWS_S3_BUCKET,
    AWS_S3_PREFIX,
    AWS_ACCESS_KEY_ID,
    AWS_SECRET_ACCESS_KEY,
    AWS_REGION,
    SYNC_LOOKBACK_HOURS,
)
from utils import log

# Initialize the S3 client
s3 = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION,
)

# Computed once at import time; fine here because run.py is a short-lived cron job
now = datetime.now(timezone.utc)
cutoff_time = now - timedelta(hours=SYNC_LOOKBACK_HOURS)

def sync_from_s3():
    log(f"Syncing files modified after: {cutoff_time.isoformat()}")
    try:
        paginator = s3.get_paginator("list_objects_v2")
        page_iterator = paginator.paginate(Bucket=AWS_S3_BUCKET, Prefix=AWS_S3_PREFIX)
        for page in page_iterator:
            for obj in page.get("Contents", []):
                key = obj["Key"]
                last_modified = obj["LastModified"]
                if last_modified < cutoff_time:
                    continue
                if not key.endswith(".gz"):
                    continue
                # Group downloads into per-date directories based on LastModified
                date_str = last_modified.strftime("%Y-%m-%d")
                local_dir = os.path.join(LOCAL_SYNC_DIR, date_str)
                os.makedirs(local_dir, exist_ok=True)
                filename = os.path.basename(key)
                local_path = os.path.join(local_dir, filename)
                if os.path.exists(local_path):
                    continue  # already downloaded, skip
                log(f"Downloading {key} → {local_path}")
                try:
                    s3.download_file(AWS_S3_BUCKET, key, local_path)
                    log(f"Downloaded: {filename}")
                except Exception as e:
                    log(f"Failed to download {key}: {e}")
    except botocore.exceptions.NoCredentialsError:
        log("ERROR: No AWS credentials found. Please check config.py.")
    except Exception as e:
        log(f"Unexpected error while syncing S3: {e}")
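To test just the sync step by hand (run from the project directory so config.py and utils.py are importable):
cd /data/apps/cloudfront_s3_to_sls && /root/anaconda3/bin/python3 -c "from sync_s3 import sync_from_s3; sync_from_s3()"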
- parse_and_push.py
import os
import gzip
import time
from datetime import datetime, timedelta
from aliyun.log import LogClient, LogItem, PutLogsRequest
from config import (
    LOCAL_SYNC_DIR,
    SLS_ENDPOINT,
    SLS_PROJECT,
    SLS_LOGSTORE,
    ACCESS_KEY_ID,
    ACCESS_KEY_SECRET,
    PROCESS_LOOKBACK_HOURS,
)
from utils import log

client = LogClient(SLS_ENDPOINT, ACCESS_KEY_ID, ACCESS_KEY_SECRET)

def parse_line_to_log_item(line, fields, source_file):
    # Skip blank lines and the "#Version:" / "#Fields:" header lines
    if not line.strip() or line.startswith("#"):
        return None
    parts = line.strip().split("\t")
    if len(parts) != len(fields):
        return None
    # Map field names to values, stripping parentheses from names like cs(Host)
    field_map = {
        k.replace("(", "").replace(")", ""): v
        for k, v in zip(fields, parts)
    }
    contents = []
    # Put date and time first
    for key in ("date", "time"):
        if key in field_map:
            contents.append((key, field_map[key]))
    # Then the remaining fields, excluding date and time
    for k in fields:
        k_clean = k.replace("(", "").replace(")", "")
        if k_clean not in ("date", "time"):
            contents.append((k_clean, field_map[k_clean]))
    # Record which file the line came from
    contents.append(("filename", source_file))
    timestamp = int(time.time())
    # Keyword arguments so this does not depend on the SDK's positional parameter order
    return LogItem(timestamp=timestamp, contents=contents)

def send_logs_to_sls(log_items, topic="cloudfront", source="cloudfront_s3"):
    if not log_items:
        return
    try:
        req = PutLogsRequest(
            SLS_PROJECT,
            SLS_LOGSTORE,
            topic,
            source,
            log_items
        )
        res = client.put_logs(req)
        res.log_print()
    except Exception as e:
        log(f"Failed to upload to SLS: {e}")

def parse_and_push():
    # One marker file per day recording which local files have already been pushed
    marker_dir = os.path.join(LOCAL_SYNC_DIR, ".processed_files")
    os.makedirs(marker_dir, exist_ok=True)
    marker_file = os.path.join(marker_dir, f"{time.strftime('%Y-%m-%d')}.txt")
    processed_files = set()
    if os.path.exists(marker_file):
        with open(marker_file, "r") as f:
            processed_files = set(line.strip() for line in f)
    now = datetime.now()
    cutoff = now - timedelta(hours=PROCESS_LOOKBACK_HOURS)
    for root, _, files in os.walk(LOCAL_SYNC_DIR):
        for filename in files:
            filepath = os.path.join(root, filename)
            if '.processed_files' in filepath or filepath in processed_files:
                continue
            # Only process files modified within the lookback window
            mtime = datetime.fromtimestamp(os.path.getmtime(filepath))
            if mtime < cutoff:
                continue
            log(f"Processing {filepath}")
            try:
                opener = gzip.open if filepath.endswith(".gz") else open
                with opener(filepath, "rt", encoding="utf-8", errors="ignore") as f:
                    # First pass: find the #Fields: header that names the columns
                    fields = []
                    for line in f:
                        if line.startswith("#Fields:"):
                            fields = line.replace("#Fields:", "").strip().split()
                            break
                    if not fields:
                        log(f"No #Fields found in {filepath}, skipping.")
                        continue
                    f.seek(0)
                    batch = []
                    # Use the path relative to the data root as the filename field value
                    relative_path = os.path.relpath(filepath, LOCAL_SYNC_DIR)
                    for line in f:
                        item = parse_line_to_log_item(line, fields, relative_path)
                        if item:
                            batch.append(item)
                        if len(batch) >= 100:
                            send_logs_to_sls(batch)
                            batch = []
                    if batch:
                        send_logs_to_sls(batch)
                # Mark the file as processed
                with open(marker_file, "a") as f:
                    f.write(filepath + "\n")
                processed_files.add(filepath)
            except Exception as e:
                log(f"Failed to process {filepath}: {e}")
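A quick local check of the parser, run from the project directory; the field list and sample line below are made up for illustration:
from parse_and_push import parse_line_to_log_item

fields = ["date", "time", "cs(Host)", "sc-status"]  # illustrative subset of the real #Fields list
line = "\t".join(["2025-01-01", "08:00:00", "dxxxx.cloudfront.net", "200"])
item = parse_line_to_log_item(line, fields, "2025-01-01/sample.gz")
print(item.get_contents())
# expected: [('date', '2025-01-01'), ('time', '08:00:00'),
#            ('csHost', 'dxxxx.cloudfront.net'), ('sc-status', '200'),
#            ('filename', '2025-01-01/sample.gz')]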
- run.py
import time
from sync_s3 import sync_from_s3
from parse_and_push import parse_and_push
from utils import log
def main():
    log("=== Start syncing from S3 ===")
    sync_from_s3()
    log("=== Start parsing and pushing to SLS ===")
    parse_and_push()
    time.sleep(10)

if __name__ == "__main__":
    main()
- How it runs
crontab -l
*/5 * * * * cd /data/apps/cloudfront_s3_to_sls;/root/anaconda3/bin/python3 run.py >> run.log 2>&1
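If a run ever takes longer than 5 minutes (for example when catching up on a backlog), overlapping cron runs can be avoided with flock. A possible variant of the same entry (the lock file path is arbitrary):
*/5 * * * * cd /data/apps/cloudfront_s3_to_sls && flock -n /tmp/cloudfront_s3_to_sls.lock /root/anaconda3/bin/python3 run.py >> run.log 2>&1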