A Guide to Identifying Data Distribution Differences
1. Basic Statistical Feature Distribution
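The first check is whether two corpora differ in surface statistics such as word counts. Below is a minimal sketch of the idea on invented sample texts (`texts_a` / `texts_b` are placeholders, not real data); the full script at the end of this post applies the same measurement to the actual datasets.

```python
# Minimal sketch: compare word-count statistics of two tiny text samples.
# texts_a / texts_b are invented examples, not real dataset contents.
import numpy as np
from scipy.stats import describe

texts_a = ["How do I sort a list in Python?",
           "Why does my nginx config fail to reload?",
           "Explain the TCP three-way handshake."]
texts_b = ["Write a short poem about autumn rain.",
           "Summarize the article below in one sentence.",
           "Brainstorm five names for a coffee shop."]

len_a = np.array([len(t.split()) for t in texts_a])  # crude whitespace tokenization
len_b = np.array([len(t.split()) for t in texts_b])

# describe() reports n, min/max, mean, variance, skewness, kurtosis
print("sample A:", describe(len_a))
print("sample B:", describe(len_b))
```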
2. Content Topic and Domain Distribution
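Topic and domain drift can be probed by extracting each corpus's dominant vocabulary. The sketch below applies TF-IDF to an invented three-document corpus; the `analyze_topic_dist` function in the full script uses the same technique on the real `prompt` column.

```python
# Sketch: surface a corpus's dominant topic vocabulary with TF-IDF.
# The three documents are invented stand-ins for real prompts.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "how to configure an nginx reverse proxy",
    "python pandas dataframe groupby example",
    "best hiking trails near the city in autumn",
]
tfidf = TfidfVectorizer(max_features=50)
matrix = tfidf.fit_transform(corpus)
scores = np.asarray(matrix.sum(axis=0)).ravel()  # aggregate TF-IDF per term
print(pd.Series(scores, index=tfidf.get_feature_names_out()).nlargest(5))
```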
3. Language Style and Structure Distribution
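Style and structure differences can be approximated with cheap hand-crafted features: average sentence length, type-token ratio, question density, and the like. The feature set below is illustrative only; the main script at the end does not compute these.

```python
import re

def style_features(text: str) -> dict:
    """Crude style/structure features; all choices here are illustrative."""
    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
    tokens = text.lower().split()
    return {
        "avg_sentence_len": len(tokens) / max(len(sentences), 1),
        "type_token_ratio": len(set(tokens)) / max(len(tokens), 1),
        "questions_per_sentence": text.count("?") / max(len(sentences), 1),
    }

print(style_features("How does DNS work? It maps names to IP addresses."))
```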
4. Entity and Vocabulary Distribution
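At the vocabulary level, a quick shift signal is the Jaccard overlap between the two corpora's word sets. Entity distributions would normally require an NER model (spaCy, for instance), which this sketch deliberately leaves out.

```python
def vocab_jaccard(texts_a, texts_b):
    """Jaccard overlap of lowercased word sets; a rough vocabulary-shift signal."""
    vocab_a = {w for t in texts_a for w in t.lower().split()}
    vocab_b = {w for t in texts_b for w in t.lower().split()}
    return len(vocab_a & vocab_b) / len(vocab_a | vocab_b)

# Invented examples: 3 shared words out of 6 distinct -> 0.5
print(vocab_jaccard(["sort a list in python"], ["reverse a python list"]))
```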
5. Task / Function Type Distribution
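This dimension asks what fraction of prompts belong to each task family (coding, writing, factual QA, and so on). As a hedged illustration, the sketch below uses invented keyword rules; a serious analysis would use a trained classifier or manual annotation.

```python
# Hypothetical keyword rules; categories and trigger words are invented.
TASK_RULES = {
    "coding": ("code", "function", "python", "bug"),
    "writing": ("write", "essay", "poem", "story"),
    "qa": ("what", "why", "how", "who"),
}

def task_type(prompt: str) -> str:
    p = prompt.lower()
    for task, triggers in TASK_RULES.items():
        if any(w in p for w in triggers):
            return task
    return "other"

prompts = ["Write a poem about rain", "Why is the sky blue?", "Fix this python bug"]
print([task_type(p) for p in prompts])  # ['writing', 'qa', 'coding']
```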
6. Correlation Analysis (Supplementary Dimension)
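Beyond eyeballing plots, the gap between two distributions can be scored numerically. One option, not used in the script below, is the Jensen-Shannon distance between normalized histograms; the length samples here are synthetic stand-ins.

```python
import numpy as np
from scipy.spatial.distance import jensenshannon

# Synthetic stand-ins for two word-count distributions
lens_a = np.random.default_rng(0).poisson(20, 500)
lens_b = np.random.default_rng(1).poisson(35, 500)

bins = np.arange(0, 80, 5)  # arbitrary binning, for illustration only
p, _ = np.histogram(lens_a, bins=bins, density=True)
q, _ = np.histogram(lens_b, bins=bins, density=True)
# With base=2 the distance lies in [0, 1]: 0 = identical, 1 = disjoint
print("Jensen-Shannon distance:", jensenshannon(p, q, base=2))
```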
Summary
Code Implementation:
```python
import json
import csv
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from scipy.stats import describe

# Download NLTK resources (note the added punkt_tab download)
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')  # key: fixes the "missing punkt_tab" error on newer NLTK versions
nltk.download('stopwords')
nltk.download('wordnet')

# Matplotlib font setup (SimSun provides CJK glyphs for Chinese plot labels)
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['SimSun']
plt.rcParams['axes.unicode_minus'] = False


def convert_helpsteer_to_reward_format(input_path, output_path):
    """Convert HelpSteer2's raw preference.jsonl into a prompt/chosen/rejected TSV file."""
    with open(input_path, "r", encoding="utf-8") as fin, \
            open(output_path, "w", encoding="utf-8", newline="") as fout:
        tsv_writer = csv.writer(fout, delimiter='\t')
        for line_num, line in enumerate(fin, 1):
            try:
                data = json.loads(line)
                prompt = data["prompt"]
                # preference_strength > 0 means response_2 is preferred
                if data["preference_strength"] > 0:
                    chosen = data["response_2"]
                    rejected = data["response_1"]
                else:
                    chosen = data["response_1"]
                    rejected = data["response_2"]
                tsv_writer.writerow([prompt, chosen, rejected])
            except Exception as e:
                print(f"Failed to process line {line_num}: {e}; skipped")
    print(f"Conversion finished, output file: {output_path}")


def load_dataset(file_path, dataset_name):
    """Load a dataset (parsed manually to avoid 'EOF inside string' errors)."""
    data = []
    error_lines = []
    line_num = 0  # so the summary below also works for an empty file
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE, escapechar=None)
        for line_num, row in enumerate(reader, 1):
            try:
                if len(row) != 3:
                    raise ValueError(f"wrong column count (expected 3, got {len(row)})")
                prompt = re.sub(r'[\r\n]+', ' ', row[0].strip())
                chosen = re.sub(r'[\r\n]+', ' ', row[1].strip())
                rejected = re.sub(r'[\r\n]+', ' ', row[2].strip())
                if not (prompt and chosen and rejected):
                    raise ValueError("empty field")
                data.append({'prompt': prompt, 'chosen': chosen, 'rejected': rejected})
            except Exception as e:
                error_lines.append(f"line {line_num}: {str(e)}")
                continue
    df = pd.DataFrame(data)
    print(f"Loaded {dataset_name}:")
    print(f"  valid samples: {len(df)}")
    print(f"  total lines: {line_num}, skipped error lines: {len(error_lines)}")
    if error_lines:
        print(f"  first 5 errors: {error_lines[:5]}")
    return df


def preprocess_text(text):
    """Text preprocessing: strip special characters, tokenize, drop stopwords, lemmatize."""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)


def analyze_basic_stats(df, dataset_name, save_dir="plots"):
    """Analyze basic statistics: text length distributions."""
    os.makedirs(save_dir, exist_ok=True)
    # Text lengths in words
    df['prompt_len'] = df['prompt'].apply(lambda x: len(word_tokenize(str(x))))
    df['chosen_len'] = df['chosen'].apply(lambda x: len(word_tokenize(str(x))))
    df['rejected_len'] = df['rejected'].apply(lambda x: len(word_tokenize(str(x))))
    # Summary statistics
    print(f"\n--- {dataset_name} text length statistics ---")
    print("prompt length:", describe(df['prompt_len']))
    print("chosen length:", describe(df['chosen_len']))
    print("rejected length:", describe(df['rejected_len']))
    # Length distribution histograms
    plt.figure(figsize=(15, 5))
    for i, col in enumerate(['prompt_len', 'chosen_len', 'rejected_len']):
        plt.subplot(1, 3, i + 1)
        sns.histplot(df[col], kde=True, bins=50)
        plt.title(f"{col} distribution ({dataset_name})")
        plt.xlabel("Word Count")
        plt.ylabel("Frequency")
    plt.tight_layout()
    plt.savefig(f"{save_dir}/{dataset_name}_length_dist.png")
    plt.close()
    return df
```
```python
# ----------------------
# 3. Preference label difference analysis
# ----------------------
def analyze_preference_diff(df, dataset_name, save_dir="plots"):
    """Analyze chosen vs. rejected differences: length gap and semantic similarity."""
    # Length difference (chosen - rejected)
    df['len_diff'] = df['chosen_len'] - df['rejected_len']
    # Semantic similarity via a pretrained sentence-embedding model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    chosen_emb = model.encode(df['chosen'].tolist(), show_progress_bar=True)
    rejected_emb = model.encode(df['rejected'].tolist(), show_progress_bar=True)
    df['similarity'] = [cosine_similarity([c], [r])[0][0]
                        for c, r in zip(chosen_emb, rejected_emb)]
    # Visualization
    plt.figure(figsize=(12, 5))
    # Length difference distribution
    plt.subplot(1, 2, 1)
    sns.histplot(df['len_diff'], kde=True, bins=50)
    plt.axvline(x=0, color='r', linestyle='--', label='0 (equal length)')
    plt.title(f"Chosen-Rejected Length Diff ({dataset_name})")
    plt.xlabel("Length Difference (words)")
    plt.legend()
    # Similarity distribution
    plt.subplot(1, 2, 2)
    sns.histplot(df['similarity'], kde=True, bins=50, color='green')
    plt.title(f"Chosen-Rejected Similarity ({dataset_name})")
    plt.xlabel("Cosine Similarity")
    plt.tight_layout()
    plt.savefig(f"{save_dir}/{dataset_name}_preference_diff.png")
    plt.close()
    # Summary statistics
    print(f"\n--- {dataset_name} preference difference statistics ---")
    print("length diff (chosen - rejected):", describe(df['len_diff']))
    print("semantic similarity:", describe(df['similarity']))
    return df


# ----------------------
# 4. Topic distribution analysis (TF-IDF keywords)
# ----------------------
def analyze_topic_dist(df, dataset_name, top_n=20, save_dir="plots"):
    """Analyze topic distribution via TF-IDF (high-frequency keywords)."""
    # Preprocess prompts
    df['prompt_clean'] = df['prompt'].apply(preprocess_text)
    # Extract keywords with TF-IDF
    tfidf = TfidfVectorizer(max_features=1000)
    tfidf_matrix = tfidf.fit_transform(df['prompt_clean'])
    keywords = tfidf.get_feature_names_out()
    tfidf_scores = np.sum(tfidf_matrix.toarray(), axis=0)
    top_keywords = pd.Series(tfidf_scores, index=keywords).nlargest(top_n)
    # Visualize top keywords
    plt.figure(figsize=(12, 6))
    sns.barplot(x=top_keywords.values, y=top_keywords.index)
    plt.title(f"Top {top_n} Keywords in Prompt ({dataset_name})")
    plt.xlabel("TF-IDF Score Sum")
    plt.tight_layout()
    plt.savefig(f"{save_dir}/{dataset_name}_top_keywords.png")
    plt.close()
    print(f"\n--- {dataset_name} top keywords ---")
    print(top_keywords.index.tolist())
    return top_keywords


# ----------------------
# 5. Cross-dataset comparison
# ----------------------
def compare_datasets(df1, df2, name1, name2, save_dir="plots"):
    """Compare the core differences between two datasets."""
    # 1. Length distribution comparison
    plt.figure(figsize=(15, 5))
    for i, col in enumerate(['prompt_len', 'chosen_len', 'rejected_len']):
        plt.subplot(1, 3, i + 1)
        sns.kdeplot(df1[col], label=name1, fill=True)
        sns.kdeplot(df2[col], label=name2, fill=True)
        plt.title(f"{col} Distribution")
        plt.xlabel("Word Count")
        plt.legend()
    plt.tight_layout()
    plt.savefig(f"{save_dir}/length_compare.png")
    plt.close()
    # 2. Preference difference comparison (length diff, similarity)
    plt.figure(figsize=(12, 5))
    # Length diff
    plt.subplot(1, 2, 1)
    sns.kdeplot(df1['len_diff'], label=name1, fill=True)
    sns.kdeplot(df2['len_diff'], label=name2, fill=True)
    plt.axvline(x=0, color='r', linestyle='--')
    plt.title("Chosen-Rejected Length Diff Comparison")
    plt.xlabel("Length Difference")
    plt.legend()
    # Similarity
    plt.subplot(1, 2, 2)
    sns.kdeplot(df1['similarity'], label=name1, fill=True)
    sns.kdeplot(df2['similarity'], label=name2, fill=True)
    plt.title("Chosen-Rejected Similarity Comparison")
    plt.xlabel("Cosine Similarity")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"{save_dir}/preference_compare.png")
    plt.close()
    print("\nCross-dataset comparison plots saved")
```
```python
def main():
    # 1. Load data (replace the paths with your own)
    stackexchange_path = "/data/team/lmq/prefenceModel/stack_exchange/data/reward/merged_reward_test.csv"
    helpsteer2_path = "/data/team/lmq/prefenceModel/HelpSteer2/preference/helpsteer_reward_data_test.csv"
    df_stack = load_dataset(stackexchange_path, "StackExchange")
    df_help = load_dataset(helpsteer2_path, "HelpSteer2")
    # 2. Basic statistics
    df_stack = analyze_basic_stats(df_stack, "StackExchange")
    df_help = analyze_basic_stats(df_help, "HelpSteer2")
    # 3. Preference label difference analysis
    df_stack = analyze_preference_diff(df_stack, "StackExchange")
    df_help = analyze_preference_diff(df_help, "HelpSteer2")
    # 4. Topic distribution analysis
    keywords_stack = analyze_topic_dist(df_stack, "StackExchange")
    keywords_help = analyze_topic_dist(df_help, "HelpSteer2")
    # 5. Cross-dataset comparison
    compare_datasets(df_stack, df_help, "StackExchange", "HelpSteer2")
    # 6. Topic correlation (keyword overlap)
    common_keywords = set(keywords_stack.index) & set(keywords_help.index)
    all_keywords = set(keywords_stack.index) | set(keywords_help.index)
    print("\n--- Topic correlation analysis ---")
    print(f"Shared top keywords: {len(common_keywords)}")
    print(f"Shared keywords: {list(common_keywords)[:10]}...")
    print(f"Topic overlap ratio: {len(common_keywords) / len(all_keywords):.2f}")


if __name__ == "__main__":
    main()
```
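`compare_datasets` above is purely visual. As an optional follow-up, a two-sample Kolmogorov-Smirnov test can attach a number to each KDE comparison. The helper below is my addition, not part of the original script, and assumes the `df_stack` / `df_help` frames produced in `main()`.

```python
from scipy.stats import ks_2samp

def ks_report(df1, df2,
              cols=("prompt_len", "chosen_len", "rejected_len", "len_diff", "similarity")):
    """Two-sample KS test per column; a larger statistic means a bigger distribution gap."""
    for col in cols:
        stat, p = ks_2samp(df1[col], df2[col])
        print(f"{col}: KS statistic={stat:.3f}, p-value={p:.3g}")

# e.g. call ks_report(df_stack, df_help) after step 5 inside main()
```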
Code Functionality Overview

- convert_helpsteer_to_reward_format: turns HelpSteer2's raw preference.jsonl into a prompt/chosen/rejected TSV, choosing the preferred response by the sign of preference_strength.
- load_dataset: parses the TSV manually, skipping malformed rows and reporting errors.
- analyze_basic_stats: word-count statistics and length histograms per dataset.
- analyze_preference_diff: chosen-rejected length gaps and sentence-embedding cosine similarity.
- analyze_topic_dist: top TF-IDF keywords in the prompts.
- compare_datasets / main: side-by-side KDE comparisons and the keyword-overlap ratio between StackExchange and HelpSteer2.
How to Interpret the Results

A length-difference distribution shifted away from zero suggests a length bias in the preference labels; a low topic overlap ratio between the two datasets signals a domain shift that any model trained on one and evaluated on the other will have to bridge.