Z分布转换

## 分布转换：将蛋白质打分线性映射到 DNA 打分的分布（均值与标准差对齐）

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

import os

def _plot_mapping_check(pdf, mu2, std2):
    """Plot original vs. mapped score densities against the target normal PDF.

    Args:
        pdf: DataFrame holding both the ``score`` and ``mapped_score`` columns.
        mu2: Mean of the target normal distribution drawn as reference.
        std2: Standard deviation of the target normal distribution.
    """
    fig = plt.figure(figsize=(12, 6))
    try:
        # 1. Original score distribution (blue).
        sns.kdeplot(pdf['score'], label='Original Score (Input)', color='blue', fill=True, alpha=0.1)

        # 2. Mapped score distribution (green).
        sns.kdeplot(pdf['mapped_score'], label='Mapped Score (Result)', color='green', lw=3, fill=True, alpha=0.2)

        # 3. Theoretical curve of the target distribution (red dashed) for comparison.
        x = np.linspace(pdf['mapped_score'].min(), pdf['mapped_score'].max(), 200)
        plt.plot(x, norm.pdf(x, mu2, std2), 'r--', label='Target Theoretical PDF', alpha=0.8)

        plt.title('Sample Distribution Mapping: Original vs Mapped', fontsize=14)
        plt.xlabel('Score Value')
        plt.ylabel('Density')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()
    finally:
        # One figure is created per file; close it so figures do not pile up
        # in memory while looping over a large dataset directory.
        plt.close(fig)


def DistributionTransform(
    datasets_path="../../mydatasets_3b",
    *,
    source_stats=(-5.739378720116549, 4.796716135366644),
    target_stats=(-6.680654048919678, 3.8557145595550537),
    show_plot=True,
):
    """Add a ``mapped_score`` column to every parquet file in a directory.

    Each file is read, its ``score`` column is linearly rescaled from the
    source distribution onto the target distribution with

        mapped = mu2 + (std2 / std1) * (score - mu1)

    and the file is overwritten in place with the extra column.

    Args:
        datasets_path: Directory whose files are read with ``pd.read_parquet``.
            Entries that are not regular files are skipped.
        source_stats: ``(mean, std)`` of the original score distribution.
            Defaults to the protein-score statistics.
        target_stats: ``(mean, std)`` of the distribution to map onto.
            Defaults to the DNA-score statistics.
        show_plot: When true, display a verification plot for every file.

    Raises:
        ValueError: If either standard deviation is not strictly positive.
        KeyError: If a file has no ``score`` column.
    """
    mu1, std1 = source_stats  # original distribution (default: protein scores)
    mu2, std2 = target_stats  # target distribution (default: DNA scores)
    if std1 <= 0 or std2 <= 0:
        raise ValueError(
            f"standard deviations must be positive, got std1={std1}, std2={std2}"
        )

    # Loop-invariant scale factor of the linear map.
    scale = std2 / std1

    # Sorted so that processing order does not depend on the filesystem.
    for filename in sorted(os.listdir(datasets_path)):
        input_path = os.path.join(datasets_path, filename)
        if not os.path.isfile(input_path):
            # e.g. notebook checkpoint folders; they cannot be rewritten in place.
            print(f"{input_path} skipped (not a regular file)")
            continue

        print(f"{input_path} processing")
        pdf = pd.read_parquet(input_path)

        # Map score onto the target distribution.
        pdf['mapped_score'] = mu2 + scale * (pdf['score'] - mu1)

        # Persist once, before plotting, so a plotting failure cannot lose the result.
        pdf.to_parquet(input_path, index=False)

        if show_plot:
            _plot_mapping_check(pdf, mu2, std2)

        # Report the statistics of the mapped sample next to the targets.
        print(f"映射后样本均值: {pdf['mapped_score'].mean():.4f} (目标: {mu2:.4f})")
        print(f"映射后样本标准差: {pdf['mapped_score'].std():.4f} (目标: {std2:.4f})")
        print(f"{input_path} added mapped_score.")
        
# Run the transformation over every file in the dataset directory.
DistributionTransform(datasets_path="../../mydatasets_3b")

image

posted @ 2025-12-28 20:29  ylifs  阅读(5)  评论(0)    收藏  举报