A Guide to Identifying Data Distribution Differences
1. Basic statistical feature distributions
2. Content topic and domain distributions
3. Language style and structural distributions
4. Entity and vocabulary distributions
5. Task / function type distributions
6. Correlation analysis (a supplementary dimension)
Summary
Code implementation:
import json
import csv
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from scipy.stats import describe
# Download NLTK resources (note: punkt_tab must be downloaded too)
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')  # fixes the missing-punkt_tab error on recent NLTK versions
nltk.download('stopwords')
nltk.download('wordnet')
# Matplotlib font configuration (SimSun allows CJK characters in plot labels)
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['SimSun']
plt.rcParams['axes.unicode_minus'] = False
# ----------------------
# 1. Data conversion and loading
# ----------------------
def convert_helpsteer_to_reward_format(input_path, output_path):
    """Convert HelpSteer2's raw preference.jsonl into a TSV file with prompt\tchosen\trejected columns."""
    with open(input_path, "r", encoding="utf-8") as fin, \
            open(output_path, "w", encoding="utf-8", newline="") as fout:
        tsv_writer = csv.writer(fout, delimiter='\t')
        for line_num, line in enumerate(fin, 1):
            try:
                data = json.loads(line)
                prompt = data["prompt"]
                if data["preference_strength"] > 0:
                    # positive strength: response_2 is preferred
                    chosen = data["response_2"]
                    rejected = data["response_1"]
                else:
                    # non-positive strength (including ties at 0): response_1 is treated as chosen
                    chosen = data["response_1"]
                    rejected = data["response_2"]
                tsv_writer.writerow([prompt, chosen, rejected])
            except Exception as e:
                print(f"Failed to process line {line_num}: {e}; skipped")
    print(f"Conversion finished, output file: {output_path}")
def load_dataset(file_path, dataset_name):
    """Load a dataset, parsing manually to avoid 'EOF inside string' errors from quoted fields."""
    data = []
    error_lines = []
    line_num = 0  # stays 0 if the file is empty
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        reader = csv.reader(
            f,
            delimiter='\t',
            quoting=csv.QUOTE_NONE,  # treat quote characters as literal text
            escapechar=None
        )
        for line_num, row in enumerate(reader, 1):
            try:
                if len(row) != 3:
                    raise ValueError(f"wrong column count (expected 3, got {len(row)})")
                prompt = re.sub(r'[\r\n]+', ' ', row[0].strip())
                chosen = re.sub(r'[\r\n]+', ' ', row[1].strip())
                rejected = re.sub(r'[\r\n]+', ' ', row[2].strip())
                if not (prompt and chosen and rejected):
                    raise ValueError("empty field")
                data.append({
                    'prompt': prompt,
                    'chosen': chosen,
                    'rejected': rejected
                })
            except Exception as e:
                error_lines.append(f"line {line_num}: {str(e)}")
                continue
    df = pd.DataFrame(data)
    print(f"Loaded {dataset_name}:")
    print(f"  valid samples: {len(df)}")
    print(f"  total lines: {line_num}, skipped error lines: {len(error_lines)}")
    if error_lines:
        print(f"  first 5 errors: {error_lines[:5]}")
    return df
def preprocess_text(text):
    """Text preprocessing: strip special characters, tokenize, remove stopwords, lemmatize."""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # keep only alphanumerics and whitespace
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)
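# Example (hypothetical input), illustrating stopword removal and lemmatization:
# preprocess_text("The cats are running!")  ->  "cat running"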
# ----------------------
# 2. Basic statistical features
# ----------------------
def analyze_basic_stats(df, dataset_name, save_dir="plots"):
    """Analyze basic statistics: length distributions, missing values, etc."""
    os.makedirs(save_dir, exist_ok=True)
    # Text length in words
    df['prompt_len'] = df['prompt'].apply(lambda x: len(word_tokenize(str(x))))
    df['chosen_len'] = df['chosen'].apply(lambda x: len(word_tokenize(str(x))))
    df['rejected_len'] = df['rejected'].apply(lambda x: len(word_tokenize(str(x))))
    # Print summary statistics
    print(f"\n--- {dataset_name} text length statistics ---")
    print("prompt length:", describe(df['prompt_len']))
    print("chosen length:", describe(df['chosen_len']))
    print("rejected length:", describe(df['rejected_len']))
    # Plot length histograms
    plt.figure(figsize=(15, 5))
    for i, col in enumerate(['prompt_len', 'chosen_len', 'rejected_len']):
        plt.subplot(1, 3, i+1)
        sns.histplot(df[col], kde=True, bins=50)
        plt.title(f"{col} distribution ({dataset_name})")
        plt.xlabel("Word Count")
        plt.ylabel("Frequency")
    plt.tight_layout()
    plt.savefig(f"{save_dir}/{dataset_name}_length_dist.png")
    plt.close()
    return df
# ----------------------
# 3. Preference label differences
# ----------------------
def analyze_preference_diff(df, dataset_name, save_dir="plots"):
    """Analyze differences between chosen and rejected: length gap and semantic similarity."""
    # Length difference (chosen - rejected)
    df['len_diff'] = df['chosen_len'] - df['rejected_len']
    # Semantic similarity via a pretrained sentence-embedding model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    chosen_emb = model.encode(df['chosen'].tolist(), show_progress_bar=True)
    rejected_emb = model.encode(df['rejected'].tolist(), show_progress_bar=True)
    df['similarity'] = [cosine_similarity([c], [r])[0][0] for c, r in zip(chosen_emb, rejected_emb)]
    # Visualization
    plt.figure(figsize=(12, 5))
    # Length-difference distribution
    plt.subplot(1, 2, 1)
    sns.histplot(df['len_diff'], kde=True, bins=50)
    plt.axvline(x=0, color='r', linestyle='--', label='0 (equal length)')
    plt.title(f"Chosen-Rejected Length Diff ({dataset_name})")
    plt.xlabel("Length Difference (words)")
    plt.legend()
    # Similarity distribution
    plt.subplot(1, 2, 2)
    sns.histplot(df['similarity'], kde=True, bins=50, color='green')
    plt.title(f"Chosen-Rejected Similarity ({dataset_name})")
    plt.xlabel("Cosine Similarity")
    plt.tight_layout()
    plt.savefig(f"{save_dir}/{dataset_name}_preference_diff.png")
    plt.close()
    # Print statistics
    print(f"\n--- {dataset_name} preference difference statistics ---")
    print("Length diff (chosen - rejected):", describe(df['len_diff']))
    print("Semantic similarity:", describe(df['similarity']))
    return df
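# Optional: the per-pair cosine_similarity loop above makes one Python call per row;
# since the embeddings are already stacked as 2-D arrays, the same values can be
# computed in one vectorized pass. A sketch (equivalent output, not a required change):
def rowwise_cosine(a, b):
    """Cosine similarity between corresponding rows of two 2-D arrays."""
    a = a / np.linalg.norm(a, axis=1, keepdims=True)
    b = b / np.linalg.norm(b, axis=1, keepdims=True)
    return np.sum(a * b, axis=1)
# Inside analyze_preference_diff this would replace the list comprehension:
# df['similarity'] = rowwise_cosine(chosen_emb, rejected_emb)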
# ----------------------
# 4. Topic distribution (TF-IDF keywords)
# ----------------------
def analyze_topic_dist(df, dataset_name, top_n=20, save_dir="plots"):
    """Analyze topic distribution via TF-IDF (top keywords)."""
    # Preprocess prompts
    df['prompt_clean'] = df['prompt'].apply(preprocess_text)
    # Extract keywords with TF-IDF
    tfidf = TfidfVectorizer(max_features=1000)
    tfidf_matrix = tfidf.fit_transform(df['prompt_clean'])
    keywords = tfidf.get_feature_names_out()
    tfidf_scores = np.sum(tfidf_matrix.toarray(), axis=0)
    top_keywords = pd.Series(tfidf_scores, index=keywords).nlargest(top_n)
    # Plot top keywords
    plt.figure(figsize=(12, 6))
    sns.barplot(x=top_keywords.values, y=top_keywords.index)
    plt.title(f"Top {top_n} Keywords in Prompt ({dataset_name})")
    plt.xlabel("TF-IDF Score Sum")
    plt.tight_layout()
    plt.savefig(f"{save_dir}/{dataset_name}_top_keywords.png")
    plt.close()
    print(f"\n--- {dataset_name} top keywords ---")
    print(top_keywords.index.tolist())
    return top_keywords
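# The outline's dimension 4 (entity and vocabulary distribution) is not covered by the
# code above. A minimal sketch, assuming spaCy and its en_core_web_sm model are
# installed (pip install spacy && python -m spacy download en_core_web_sm):
import spacy
from collections import Counter

def analyze_entity_dist(df, dataset_name, top_n=15):
    """Count named-entity label frequencies in prompts (sketch for outline dimension 4)."""
    nlp = spacy.load("en_core_web_sm")  # full pipeline; only the NER output is used
    label_counts = Counter()
    for doc in nlp.pipe(df['prompt'].astype(str).tolist(), batch_size=64):
        label_counts.update(ent.label_ for ent in doc.ents)
    top = pd.Series(label_counts).nlargest(top_n)
    print(f"\n--- {dataset_name} entity label distribution ---")
    print(top)
    return top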
# ----------------------
# 5. Cross-dataset comparison
# ----------------------
def compare_datasets(df1, df2, name1, name2, save_dir="plots"):
    """Compare the core distribution differences between two datasets."""
    # 1. Length distributions
    plt.figure(figsize=(15, 5))
    for i, col in enumerate(['prompt_len', 'chosen_len', 'rejected_len']):
        plt.subplot(1, 3, i+1)
        sns.kdeplot(df1[col], label=name1, fill=True)
        sns.kdeplot(df2[col], label=name2, fill=True)
        plt.title(f"{col} Distribution")
        plt.xlabel("Word Count")
        plt.legend()
    plt.tight_layout()
    plt.savefig(f"{save_dir}/length_compare.png")
    plt.close()
    # 2. Preference differences (length gap, similarity)
    plt.figure(figsize=(12, 5))
    # Length-difference comparison
    plt.subplot(1, 2, 1)
    sns.kdeplot(df1['len_diff'], label=name1, fill=True)
    sns.kdeplot(df2['len_diff'], label=name2, fill=True)
    plt.axvline(x=0, color='r', linestyle='--')
    plt.title("Chosen-Rejected Length Diff Comparison")
    plt.xlabel("Length Difference")
    plt.legend()
    # Similarity comparison
    plt.subplot(1, 2, 2)
    sns.kdeplot(df1['similarity'], label=name1, fill=True)
    sns.kdeplot(df2['similarity'], label=name2, fill=True)
    plt.title("Chosen-Rejected Similarity Comparison")
    plt.xlabel("Cosine Similarity")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"{save_dir}/preference_compare.png")
    plt.close()
    print("\nCross-dataset comparison plots saved")
def main():
    # 1. Load data (replace with your actual paths)
    stackexchange_path = "/data/team/lmq/prefenceModel/stack_exchange/data/reward/merged_reward_test.csv"
    helpsteer2_path = "/data/team/lmq/prefenceModel/HelpSteer2/preference/helpsteer_reward_data_test.csv"
    df_stack = load_dataset(stackexchange_path, "StackExchange")
    df_help = load_dataset(helpsteer2_path, "HelpSteer2")
    # 2. Basic statistics
    df_stack = analyze_basic_stats(df_stack, "StackExchange")
    df_help = analyze_basic_stats(df_help, "HelpSteer2")
    # 3. Preference label differences
    df_stack = analyze_preference_diff(df_stack, "StackExchange")
    df_help = analyze_preference_diff(df_help, "HelpSteer2")
    # 4. Topic distributions
    keywords_stack = analyze_topic_dist(df_stack, "StackExchange")
    keywords_help = analyze_topic_dist(df_help, "HelpSteer2")
    # 5. Cross-dataset comparison
    compare_datasets(df_stack, df_help, "StackExchange", "HelpSteer2")
    # 6. Topic correlation (keyword overlap)
    common_keywords = set(keywords_stack.index) & set(keywords_help.index)
    print(f"\n--- Topic correlation analysis ---")
    print(f"Number of shared top keywords: {len(common_keywords)}")
    print(f"Shared keywords: {list(common_keywords)[:10]}...")
    print(f"Topic overlap (Jaccard): {len(common_keywords)/len(set(keywords_stack.index)|set(keywords_help.index)):.2f}")

if __name__ == "__main__":
    main()
What the code does
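The script first converts HelpSteer2's preference.jsonl into the prompt/chosen/rejected TSV format, then loads that file and the StackExchange reward file with a fault-tolerant TSV parser. For each dataset it runs four analyses: word-count statistics and histograms for prompt/chosen/rejected; chosen-vs-rejected length gaps and sentence-embedding cosine similarity; TF-IDF keyword extraction over prompts; and KDE-based cross-dataset comparisons. All plots are saved to the plots/ directory, and a keyword-overlap (Jaccard) score quantifies topic similarity between the two datasets.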
How to interpret the results
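Some directions grounded in the outputs above: large gaps between the two datasets' length KDEs indicate a shift in basic statistical features (dimension 1); a len_diff distribution centered well above zero suggests the preference labels correlate with verbosity, a length bias a reward model can exploit; low chosen-rejected cosine similarity means preference pairs differ in content rather than phrasing; and a low keyword-overlap (Jaccard) score indicates the datasets cover different topics (dimension 2), so a model trained on one may transfer poorly to the other.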

This post is from cnblogs (博客园), author: limingqi. Please credit the original link when reposting: https://www.cnblogs.com/limingqi/p/19009033
