## 分布转换!!
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
import os
def DistributionTransform(datasets_path= "../../mydatasets_3b"):
    """Add a linearly rescaled ``mapped_score`` column to every parquet file.

    Each regular file directly inside ``datasets_path`` is read with
    ``pd.read_parquet``, its ``score`` column is mapped from the source
    distribution (mu1, std1) onto the target distribution (mu2, std2) via

        mapped = mu2 + (std2 / std1) * (score - mu1)

    and the frame is written back to the same path (the input file is
    overwritten in place). A verification plot is shown and the mean and
    standard deviation of the mapped column are printed for each file.

    Args:
        datasets_path: Directory whose files are processed. Every file is
            expected to be a parquet file containing a ``score`` column.

    Returns:
        None.
    """
    # Parameters of the source and target distributions. They do not depend
    # on the file being processed, so they are bound once, outside the loop.
    mu1, std1 = -5.739378720116549, 4.796716135366644  # protein scoring
    mu2, std2 = -6.680654048919678, 3.8557145595550537  # DNA scoring

    # Sorted so that files are processed in a reproducible order.
    for filename in sorted(os.listdir(datasets_path)):
        input_path = os.path.join(datasets_path, filename)
        # A sub-directory cannot be overwritten in place as a single parquet
        # file, so skip it instead of aborting the whole batch.
        if not os.path.isfile(input_path):
            print(f"{input_path} skipped (not a regular file)")
            continue

        print(f"{input_path} processing")
        pdf = pd.read_parquet(input_path)

        # --- Mapping ---
        # Shift and rescale `score` onto the target distribution.
        pdf['mapped_score'] = mu2 + (std2 / std1) * (pdf['score'] - mu1)
        # Persist once, before plotting, so the result is saved even if the
        # visual check below is interrupted.
        pdf.to_parquet(input_path, index=False)

        # --- Visual verification ---
        _plot_score_mapping(pdf, mu2, std2)

        # Report the statistics of the mapped column next to their targets.
        print(f"映射后样本均值: {pdf['mapped_score'].mean():.4f} (目标: {mu2:.4f})")
        print(f"映射后样本标准差: {pdf['mapped_score'].std():.4f} (目标: {std2:.4f})")
        print(f"{input_path} added mapped_score.")


def _plot_score_mapping(frame, target_mean, target_std):
    """Show original vs. mapped score densities against the target normal PDF."""
    # Keep a handle on the figure so it can be released explicitly; one figure
    # is created per processed file and would otherwise accumulate on backends
    # where plt.show() does not destroy it.
    fig = plt.figure(figsize=(12, 6))
    try:
        # 1. Density of the original scores (blue).
        sns.kdeplot(frame['score'], label='Original Score (Input)', color='blue', fill=True, alpha=0.1)
        # 2. Density of the mapped scores (green).
        sns.kdeplot(frame['mapped_score'], label='Mapped Score (Result)', color='green', lw=3, fill=True, alpha=0.2)
        # 3. Theoretical PDF of the target distribution (red dashed) for comparison.
        x = np.linspace(frame['mapped_score'].min(), frame['mapped_score'].max(), 200)
        plt.plot(x, norm.pdf(x, target_mean, target_std), 'r--', label='Target Theoretical PDF', alpha=0.8)
        plt.title('Sample Distribution Mapping: Original vs Mapped', fontsize=14)
        plt.xlabel('Score Value')
        plt.ylabel('Density')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()
    finally:
        plt.close(fig)
# Run the transform over the dataset directory, naming the argument explicitly.
DistributionTransform(datasets_path="../../mydatasets_3b")
# ![image]()  (empty image link; presumably the plot output of the call above, which is not embedded here)