kraken2mpa
#!/bin/bash
# 直接从Kraken2报告生成合并的MPA格式文件
# 此脚本跳过中间步骤,直接创建包含所有分类层级和所有样本的MPA格式文件
# 设置路径变量
BASE_DIR="/data/khanh/hurmicro"
KRAKEN_DIR="${BASE_DIR}/5.annotation/kraken2"
MPA_DIR="${BASE_DIR}/5.annotation/kraken2/mpa"
LOG_DIR="${MPA_DIR}/logs"
# 创建输出目录
mkdir -p "$MPA_DIR"
mkdir -p "$LOG_DIR"
# 日志文件
LOG_FILE="${LOG_DIR}/direct_mpa_generation_$(date +%Y%m%d_%H%M%S).log"
echo "===== 开始直接生成合并MPA格式文件 =====" | tee -a "$LOG_FILE"
echo "开始时间: $(date)" | tee -a "$LOG_FILE"
# 创建Python脚本
PYTHON_SCRIPT="${MPA_DIR}/generate_combined_mpa.py"
echo "创建Python处理脚本" | tee -a "$LOG_FILE"
cat > "$PYTHON_SCRIPT" << 'EOF'
#!/usr/bin/env python3
"""
直接从Kraken2报告生成合并的MPA格式文件
"""
import os
import sys
import glob
import argparse
from collections import defaultdict
def parse_kraken_report(report_file):
"""解析单个Kraken2报告文件,提取分类信息"""
sample_name = os.path.basename(report_file).replace('_kraken2.report', '')
taxonomy_dict = {}
lineage_dict = {}
# 分类级别代码映射
rank_codes = {
'D': 'k__', # 域/超界 (Domain)
'P': 'p__', # 门 (Phylum)
'C': 'c__', # 纲 (Class)
'O': 'o__', # 目 (Order)
'F': 'f__', # 科 (Family)
'G': 'g__', # 属 (Genus)
'S': 's__', # 种 (Species)
}
try:
with open(report_file, 'r') as f:
for line in f:
parts = line.strip().split('\t')
if len(parts) < 6:
continue
percentage = float(parts[0])
reads = int(parts[1])
rank_code = parts[3]
taxid = parts[4]
name = parts[5].lstrip()
# 跳过未分类和根
if taxid == '0' or taxid == '1':
continue
# 仅处理主要分类级别
if rank_code in rank_codes:
taxonomy_dict[taxid] = {
'name': name,
'rank': rank_code,
'reads': reads,
'percentage': percentage
}
# 第二次遍历,构建完整的分类谱系
with open(report_file, 'r') as f:
current_lineage = []
for line in f:
parts = line.strip().split('\t')
if len(parts) < 6:
continue
rank_code = parts[3]
taxid = parts[4]
name = parts[5].lstrip()
indent = len(parts[5]) - len(name)
# 跳过未分类和根
if taxid == '0' or taxid == '1':
continue
# 根据缩进调整当前谱系
while len(current_lineage) > 0 and current_lineage[-1]['indent'] >= indent:
current_lineage.pop()
# 添加当前分类
current_entry = {
'taxid': taxid,
'name': name,
'rank': rank_code,
'indent': indent
}
current_lineage.append(current_entry)
# 只为主要分类级别构建谱系字符串
if rank_code in rank_codes and taxid in taxonomy_dict:
lineage = []
for entry in current_lineage:
if entry['rank'] in rank_codes:
lineage.append(f"{rank_codes[entry['rank']]}{entry['name']}")
if lineage:
lineage_str = '|'.join(lineage)
lineage_dict[lineage_str] = taxonomy_dict[taxid]['reads']
return sample_name, lineage_dict
except Exception as e:
print(f"处理文件 {report_file} 时出错: {e}")
return sample_name, {}
def generate_combined_mpa(kraken_dir, output_file):
"""从多个Kraken2报告生成合并的MPA格式文件"""
# 获取所有Kraken2报告文件
report_files = glob.glob(os.path.join(kraken_dir, '*_kraken2.report'))
if not report_files:
print(f"错误: 未找到任何Kraken2报告文件")
return False
print(f"找到 {len(report_files)} 个Kraken2报告文件")
# 存储所有样本的数据
all_data = defaultdict(dict)
samples = []
# 处理每个报告文件
for report_file in sorted(report_files):
try:
sample_name, lineage_dict = parse_kraken_report(report_file)
if sample_name and lineage_dict:
samples.append(sample_name)
# 添加该样本的数据
for lineage, count in lineage_dict.items():
all_data[lineage][sample_name] = count
print(f"成功处理样本: {sample_name}")
else:
print(f"警告: 样本 {os.path.basename(report_file)} 未返回有效数据")
except Exception as e:
print(f"处理文件 {report_file} 时出错: {e}")
# 检查是否有任何样本数据
if not samples:
print("错误: 未找到任何有效样本数据")
return False
print(f"共处理了 {len(samples)} 个样本,找到 {len(all_data)} 个分类单元")
# 写入合并文件
try:
with open(output_file, 'w') as out:
# 写入标题行
out.write('#Classification\t' + '\t'.join(samples) + '\n')
# 写入数据行
for lineage in sorted(all_data.keys()):
row = [lineage]
for sample in samples:
row.append(str(all_data[lineage].get(sample, 0)))
out.write('\t'.join(row) + '\n')
print(f"成功创建合并文件: {output_file}")
return True
except Exception as e:
print(f"创建合并文件时出错: {e}")
return False
def main():
parser = argparse.ArgumentParser(description='直接从Kraken2报告生成合并的MPA格式文件')
parser.add_argument('-i', '--input', required=True, help='包含Kraken2报告的目录')
parser.add_argument('-o', '--output', required=True, help='输出合并MPA文件的路径')
args = parser.parse_args()
if generate_combined_mpa(args.input, args.output):
print("成功生成合并的MPA格式文件")
return 0
else:
print("生成合并的MPA格式文件失败")
return 1
if __name__ == '__main__':
sys.exit(main())
EOF
# 添加执行权限
chmod +x "$PYTHON_SCRIPT"
# 备份当前的合并文件
if [ -f "${MPA_DIR}/combined_mpa.txt" ]; then
echo "备份当前的合并文件" | tee -a "$LOG_FILE"
mv "${MPA_DIR}/combined_mpa.txt" "${MPA_DIR}/combined_mpa.txt.bak.$(date +%Y%m%d_%H%M%S)"
fi
# 运行Python脚本直接生成合并的MPA文件
echo "开始直接生成合并MPA文件..." | tee -a "$LOG_FILE"
python "$PYTHON_SCRIPT" -i "$KRAKEN_DIR" -o "${MPA_DIR}/combined_mpa.txt" 2>&1 | tee -a "$LOG_FILE"
# 检查结果
if [ -f "${MPA_DIR}/combined_mpa.txt" ]; then
echo "合并MPA文件创建成功" | tee -a "$LOG_FILE"
# 获取文件大小
FILE_SIZE=$(du -h "${MPA_DIR}/combined_mpa.txt" | cut -f1)
echo "合并文件大小: $FILE_SIZE" | tee -a "$LOG_FILE"
# 获取样本数和分类单元数
HEADER=$(head -1 "${MPA_DIR}/combined_mpa.txt")
SAMPLE_COUNT=$(echo "$HEADER" | awk -F'\t' '{print NF-1}')
TAXONOMY_COUNT=$(wc -l < "${MPA_DIR}/combined_mpa.txt")
echo "合并文件包含 $SAMPLE_COUNT 个样本和 $((TAXONOMY_COUNT-1)) 个分类单元" | tee -a "$LOG_FILE"
else
echo "错误: 合并MPA文件创建失败" | tee -a "$LOG_FILE"
exit 1
fi
echo "完成时间: $(date)" | tee -a "$LOG_FILE"
本文来自博客园,作者:Hahntoh,转载请注明原文链接:https://www.cnblogs.com/hahn/p/18803008
浙公网安备 33010602011771号