Scheduled upload of data to OBS with data-integrity verification

一. Data preparation

1. A crawler downloads the data on a schedule, verifies it, and re-downloads anything that fails the check

2. Data processing

3. Local data management

 

二. Data upload

1. Depends on the Huawei OBS Python SDK (installation shown below)

2. AK/SK-based, customizable upload strategy: a configuration table maps buckets to local paths, and a thread pool uploads the data of multiple buckets concurrently
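
The SDK dependency from item 1 is published on PyPI as esdk-obs-python; it provides the obs module imported by the script below:

pip install esdk-obs-python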

 

# -*- encoding: utf-8 -*-
import hashlib
import time
import os
import yaml
import argparse
import concurrent.futures
from obs import ObsClient
from datetime import timedelta, date

# Authentication settings
# Access key
AK = ''

# Secret key
SK = ''

# Endpoint of the region in use (the post targets South China - Shenzhen)
endpoint = ''

# Bucket name
#bucket_name = ''

# Location constraint matching the endpoint's region
location = ''

# Folder to upload
#folder_to_upload = '/data/nasalog/'

# Bucket names and their local paths (key:value pairs separated by a colon; extend the list as needed)
members = ['original:/data/nasalog', 'processed:/data/nasalog', 'fishinglog:/data/nasalog']

# Delete a local file after upload only if it was modified more than this many days ago
before_days = 5

# Thread pool size
max_workers = 10

# Local path prefixes stripped when building object keys
dataPaths = []

"""
此函数用于加载 YAML 配置文件
:param file_path: YAML 文件的路径
:return: 包含配置信息的字典
"""
def load_yaml_config(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as file:
# 使用 yaml.safe_load 安全地加载 YAML 文件内容
return yaml.safe_load(file)
except FileNotFoundError:
print(f"错误:未找到配置文件 {file_path}。")
except yaml.YAMLError as e:
print(f"错误:解析 YAML 文件时出错 - {e}。")
return None


# Compute the MD5 of a local file
def calculate_md5(file_path):
    # Create an MD5 hash object
    hash_md5 = hashlib.md5()
    try:
        # Open the file in binary mode
        with open(file_path, "rb") as f:
            # Read the file in 4096-byte chunks
            for chunk in iter(lambda: f.read(4096), b""):
                # Feed each chunk into the hash
                hash_md5.update(chunk)
        # Return the final MD5 digest as hex
        return hash_md5.hexdigest()
    except FileNotFoundError:
        print(f"Error: file {file_path} not found.")
    except Exception as e:
        print(f"Unexpected error: {e}")
    return None

# Check whether a file was last modified more than before_days days ago
def is_file_modified_more_than_n_days_ago(file_path, before_days):
    try:
        # Modification time of the file, in seconds since the epoch
        modification_time = os.path.getmtime(file_path)

        # Current time in seconds
        current_time = time.time()

        # N days expressed in seconds
        n_days_in_seconds = before_days * 24 * 60 * 60

        # True if the file is older than N days
        return (current_time - modification_time) > n_days_in_seconds
    except FileNotFoundError:
        print(f"Error: file {file_path} not found.")
        return False
    except Exception as e:
        print(f"Error: unexpected error {e}")
        return False


# Delete a local file
def delete_file(filename):
    if os.path.exists(filename):
        os.remove(filename)
        print(f"Upload finished, {filename} deleted.")
    else:
        print(f"File {filename} does not exist.")

# Fetch the MD5 (ETag) of an object already stored in OBS
def get_filemd5_from_obs(client, bucket_name, object_key):
    try:
        # Fetch the object metadata
        resp = client.getObjectMetadata(bucket_name, object_key)
        if resp.status < 300:
            # For objects uploaded in a single PUT the ETag is the file's MD5;
            # multipart uploads produce a different ETag and would never match
            etag = resp.body.etag
            # Strip the quotes that may wrap the ETag
            md5_value = etag.strip('"')
            return md5_value
        else:
            print(f"No MD5 found in OBS for {object_key}; the file needs to be uploaded")
    except Exception as e:
        print(f"Error while fetching the MD5 of an OBS object: {e}")
    return None

# Create a bucket if it does not already exist
def createBucket(bucket_name, client):
    try:
        # headBucket checks whether the bucket exists
        resp = client.headBucket(bucket_name)
        if resp.status < 300:
            print(f"Bucket {bucket_name} already exists, processing files")
        else:
            # Create the bucket
            resp = client.createBucket(bucketName=bucket_name, location=location)
            if resp.status < 300:
                print(f"Bucket {bucket_name} created")
            else:
                print(f"Failed to create bucket {bucket_name}, error: {resp.errorMessage}")
    except Exception as e:
        print(f"Exception while creating the bucket: {e}")


# Upload every file under folder_path into the given bucket
def upload_files(bucket_name, folder_path):
    # Create the OBS client
    client = ObsClient(access_key_id=AK, secret_access_key=SK, server=endpoint)
    try:
        #createBucket(bucket_name, client)  # create the bucket if needed
        if not os.path.exists(folder_path):
            raise FileNotFoundError(f"Path does not exist: {folder_path}")
        # Walk every file in the folder
        for root, dirs, files in os.walk(folder_path):
            print(f"Current directory {root}, files {files}")
            for file in files:
                file_path = os.path.join(root, file)
                # Build the object key by stripping the configured local path prefixes
                #object_key = os.path.relpath(file_path, dataPath)
                object_key = file_path
                for dataPath in dataPaths:
                    object_key = object_key.replace(dataPath, '')
                local_md5 = calculate_md5(file_path)
                remote_md5 = get_filemd5_from_obs(client, bucket_name, object_key)
                if local_md5 is not None and remote_md5 is not None and local_md5 == remote_md5:
                    print(f"Bucket {bucket_name}: {file_path} already exists in OBS, skipping upload")
                    continue
                try:
                    # Upload the file
                    resp = client.putFile(bucketName=bucket_name, objectKey=object_key, file_path=file_path)
                    if resp.status < 300:
                        print(f"Bucket {bucket_name}: file {file_path} uploaded, objectKey: {object_key}")
                        # Delete the local copy once it is older than before_days
                        if is_file_modified_more_than_n_days_ago(file_path, before_days):
                            delete_file(file_path)
                    else:
                        print(f"Bucket {bucket_name}: upload of {file_path} failed, error: {resp.errorMessage}")
                        continue
                except Exception as e:
                    print(f"Bucket {bucket_name}: exception while uploading {file_path}: {e}")
                    continue
    except FileNotFoundError as e:
        print(f"Error: the given path does not exist - {e}")
    except PermissionError as e:
        print(f"Error: permission denied for the path - {e}")
    except Exception as e:
        print(f"Global error during upload: {e}")
    finally:
        # Close the client connection
        client.close()

# A simple function executed as a thread-pool task: one config entry, one bucket upload
def task_obj(string):
    result = string.split(':')
    today = date.today()
    yesterday = today - timedelta(days=1)
    bucket_name = result[0]
    # Yesterday's date, with underscores instead of dashes, is appended to the local path
    folder_path = result[1] + str(yesterday).replace('-', '_')
    print(folder_path)
    upload_files(bucket_name, folder_path)


def get_config():
    # Build the command-line parser
    parser = argparse.ArgumentParser(description='Run the program with a YAML config file')
    # Path to the config file
    parser.add_argument('config_file', type=str, help='path to the YAML config file')
    # Parse the command line
    args = parser.parse_args()
    # Load the config file
    config = load_yaml_config(args.config_file)

    if config:
        # Apply the configuration to the module-level settings
        global AK, SK, endpoint, location, members, before_days, max_workers, dataPaths
        sys_config = config.get('obsinfo', {})
        AK = sys_config.get('AK')
        SK = sys_config.get('SK')
        endpoint = sys_config.get('endpoint')
        location = sys_config.get('location')
        dataPaths = sys_config.get('dataPaths')
        members = sys_config.get('members')
        print(members)
        before_days = sys_config.get('before_days')
        max_workers = sys_config.get('max_workers')

if __name__ == "__main__":
get_config()
# 创建一个包含 x 个线程的线程池
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
executor.map(task_obj, members)
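
For reference, a minimal sketch of the YAML file that get_config expects. The key names come straight from the code above; every value below is a placeholder to adapt (task_obj appends yesterday's date in YYYY_MM_DD form to each member path):

# tyobs_config.yaml -- all values are placeholders
obsinfo:
  AK: "your-access-key"
  SK: "your-secret-key"
  endpoint: "https://obs.your-region.myhuaweicloud.com"
  location: "your-region"
  # local prefixes stripped from file paths when building object keys
  dataPaths:
    - /data/nasalog
  # bucket:local-path pairs; each entry becomes one thread-pool task
  members:
    - "original:/data/nasalog"
    - "processed:/data/nasalog"
    - "fishinglog:/data/nasalog"
  # delete local files only if modified more than this many days ago
  before_days: 5
  # thread pool size
  max_workers: 10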

三. Data verification

1. Create an MD5 sidecar file locally and upload it along with the data (a sketch follows below)

2. Verify uploads against the MD5 (ETag) in the OBS object metadata; note that the ETag equals the file's MD5 only for objects uploaded in a single PUT, so multipart uploads will never match and would be uploaded again
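
A minimal sketch of step 1, reusing calculate_md5 from the script above; the <file>.md5 sidecar naming is an assumed convention, not something the post prescribes:

# Hypothetical helper: writes <file>.md5 next to the data file so the pair
# can be uploaded together and the digest checked independently of ETags.
def write_md5_sidecar(file_path):
    digest = calculate_md5(file_path)  # MD5 helper defined in the script above
    if digest is None:
        return None
    sidecar_path = file_path + '.md5'  # assumed naming convention
    with open(sidecar_path, 'w', encoding='utf-8') as f:
        f.write(digest + '\n')
    return sidecar_path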

 

四. Scheduled task

1. Prepare the script to be executed, upload.py

2. Grant execute permission: chmod +x upload.py

3. Create upload.service under /etc/systemd/system/

For example:

[Unit]
Description=Python upload script service
After=network.target

[Service]
Type=oneshot
ExecStart=/usr/local/bin/python3.12 /workspaces/work/fisherycube-data-process/upload_obs/tyobs_upload_data.py /workspaces/work/fisherycube-data-process/upload_obs/tyobs_config.yaml
#WorkingDirectory=/workspaces/work # check this line carefully: the path must exist with correct permissions
#Restart=on-failure

[Install]
WantedBy=multi-user.target

4. Create upload.timer under /etc/systemd/system/

For example:

[Unit]
Description=Run upload.py at 08:00 daily

[Timer]
# Simplified calendar syntax: fires every day at 08:00
OnCalendar=08:00
# Persistent=true catches up a run missed while the machine was off
Persistent=true
# The service unit this timer activates
Unit=upload.service

[Install]
WantedBy=timers.target
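
Before enabling the timer, the OnCalendar expression can be sanity-checked; systemd-analyze prints its normalized form and the next trigger time:

systemd-analyze calendar 08:00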

5. Reload the systemd daemon, then start the service and the timer

# Reload systemd so it picks up the newly created service and timer

sudo systemctl daemon-reload

# Start the service (runs one upload immediately)

sudo systemctl start upload.service

# Start the timer

sudo systemctl start upload.timer

# Check the timer status to confirm it is active

sudo systemctl status upload.timer

# Check the service status to confirm the last run succeeded

sudo systemctl status upload.service

# Enable the service and the timer at boot

sudo systemctl enable upload.service

sudo systemctl enable upload.timer
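
Two further commands help with troubleshooting: journalctl shows the script's output from past runs, and list-timers shows when the timer will fire next:

sudo journalctl -u upload.service -n 50

systemctl list-timers upload.timer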

 
