kraken2mpa

#!/bin/bash

# Generate a combined MPA-format file directly from Kraken2 reports.
# This script skips the intermediate per-sample conversion step and builds a single
# MPA-format file covering all taxonomic levels and all samples.
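#
# The output is a tab-separated table; an illustrative (hypothetical) excerpt:
#   #Classification                sample1   sample2
#   k__Bacteria                    120345    98321
#   k__Bacteria|p__Firmicutes       60412    41230
# Each row is a pipe-delimited lineage; each remaining column holds the clade read
# count of that lineage in one sample (0 if it was not observed).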

# Path variables
BASE_DIR="/data/khanh/hurmicro"
KRAKEN_DIR="${BASE_DIR}/5.annotation/kraken2"
MPA_DIR="${BASE_DIR}/5.annotation/kraken2/mpa"
LOG_DIR="${MPA_DIR}/logs"

# Create output directories
mkdir -p "$MPA_DIR"
mkdir -p "$LOG_DIR"

# Log file
LOG_FILE="${LOG_DIR}/direct_mpa_generation_$(date +%Y%m%d_%H%M%S).log"

echo "===== 开始直接生成合并MPA格式文件 =====" | tee -a "$LOG_FILE"
echo "开始时间: $(date)" | tee -a "$LOG_FILE"

# Write the embedded Python script
PYTHON_SCRIPT="${MPA_DIR}/generate_combined_mpa.py"

echo "创建Python处理脚本" | tee -a "$LOG_FILE"
cat > "$PYTHON_SCRIPT" << 'EOF'
#!/usr/bin/env python3
"""
Generate a combined MPA-format file directly from Kraken2 reports.
"""

import os
import sys
import glob
import argparse
from collections import defaultdict

def parse_kraken_report(report_file):
    """解析单个Kraken2报告文件,提取分类信息"""
    sample_name = os.path.basename(report_file).replace('_kraken2.report', '')
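    # e.g. (illustrative) "S01_kraken2.report" -> sample name "S01"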
    taxonomy_dict = {}
    lineage_dict = {}
    
    # Map Kraken2 rank codes to MPA-style prefixes
    rank_codes = {
        'D': 'k__',  # Domain (mapped to the k__ kingdom prefix used by MPA)
        'P': 'p__',  # Phylum
        'C': 'c__',  # Class
        'O': 'o__',  # Order
        'F': 'f__',  # Family
        'G': 'g__',  # Genus
        'S': 's__',  # Species
    }
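    # A standard Kraken2 report line has six tab-separated columns, e.g. (values
    # illustrative): "5.21<TAB>10432<TAB>120<TAB>P<TAB>1239<TAB>    Firmicutes",
    # i.e. percentage of reads, clade read count, direct read count, rank code,
    # NCBI taxid, and the taxon name indented by two spaces per level of depth.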
    
    try:
        with open(report_file, 'r') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) < 6:
                    continue
                    
                percentage = float(parts[0])
                reads = int(parts[1])
                rank_code = parts[3]
                taxid = parts[4]
                name = parts[5].lstrip()
                
                # Skip unclassified (taxid 0) and root (taxid 1)
                if taxid == '0' or taxid == '1':
                    continue
                    
                # Keep only the major taxonomic ranks
                if rank_code in rank_codes:
                    taxonomy_dict[taxid] = {
                        'name': name,
                        'rank': rank_code,
                        'reads': reads,
                        'percentage': percentage
                    }
        
        # Second pass: rebuild the full lineage of each taxon from the name indentation
        with open(report_file, 'r') as f:
            current_lineage = []
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) < 6:
                    continue
                    
                rank_code = parts[3]
                taxid = parts[4]
                name = parts[5].lstrip()
                indent = len(parts[5]) - len(name)
                
                # Skip unclassified (taxid 0) and root (taxid 1)
                if taxid == '0' or taxid == '1':
                    continue
                
                # Trim the lineage stack so it holds only ancestors of the current line
                while len(current_lineage) > 0 and current_lineage[-1]['indent'] >= indent:
                    current_lineage.pop()
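                # e.g. (illustrative): if "Firmicutes" was pushed at indent 4 and
                # "Bacilli" at indent 6, a new taxon arriving at indent 4 pops both,
                # leaving only entries shallower than the current line on the stack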
                
                # Push the current taxon onto the lineage stack
                current_entry = {
                    'taxid': taxid,
                    'name': name,
                    'rank': rank_code,
                    'indent': indent
                }
                current_lineage.append(current_entry)
                
                # Build the lineage string only for the major ranks
                if rank_code in rank_codes and taxid in taxonomy_dict:
                    lineage = []
                    for entry in current_lineage:
                        if entry['rank'] in rank_codes:
                            lineage.append(f"{rank_codes[entry['rank']]}{entry['name']}")
                    
                    if lineage:
                        lineage_str = '|'.join(lineage)
                        lineage_dict[lineage_str] = taxonomy_dict[taxid]['reads']
        
        return sample_name, lineage_dict
    
    except Exception as e:
        print(f"处理文件 {report_file} 时出错: {e}")
        return sample_name, {}

def generate_combined_mpa(kraken_dir, output_file):
    """从多个Kraken2报告生成合并的MPA格式文件"""
    # Collect all Kraken2 report files
    report_files = glob.glob(os.path.join(kraken_dir, '*_kraken2.report'))
    
    if not report_files:
        print(f"错误: 未找到任何Kraken2报告文件")
        return False
    
    print(f"找到 {len(report_files)} 个Kraken2报告文件")
    
    # Per-lineage counts for all samples
    all_data = defaultdict(dict)
    samples = []
    
    # Process each report file
    for report_file in sorted(report_files):
        try:
            sample_name, lineage_dict = parse_kraken_report(report_file)
            if sample_name and lineage_dict:
                samples.append(sample_name)
                
                # Record this sample's counts
                for lineage, count in lineage_dict.items():
                    all_data[lineage][sample_name] = count
                
                print(f"成功处理样本: {sample_name}")
            else:
                print(f"警告: 样本 {os.path.basename(report_file)} 未返回有效数据")
        except Exception as e:
            print(f"处理文件 {report_file} 时出错: {e}")
    
    # Make sure at least one sample produced data
    if not samples:
        print("错误: 未找到任何有效样本数据")
        return False
    
    print(f"共处理了 {len(samples)} 个样本,找到 {len(all_data)} 个分类单元")
    
    # Write the combined file
    try:
        with open(output_file, 'w') as out:
            # Header row
            out.write('#Classification\t' + '\t'.join(samples) + '\n')
            
            # One row per lineage, one column per sample (0 when absent)
            for lineage in sorted(all_data.keys()):
                row = [lineage]
                for sample in samples:
                    row.append(str(all_data[lineage].get(sample, 0)))
                out.write('\t'.join(row) + '\n')
        
        print(f"成功创建合并文件: {output_file}")
        return True
    except Exception as e:
        print(f"创建合并文件时出错: {e}")
        return False

def main():
    parser = argparse.ArgumentParser(description='Generate a combined MPA-format file directly from Kraken2 reports')
    parser.add_argument('-i', '--input', required=True, help='Directory containing the Kraken2 reports')
    parser.add_argument('-o', '--output', required=True, help='Path of the combined MPA output file')
    args = parser.parse_args()
    
    if generate_combined_mpa(args.input, args.output):
        print("成功生成合并的MPA格式文件")
        return 0
    else:
        print("生成合并的MPA格式文件失败")
        return 1

if __name__ == '__main__':
    sys.exit(main())
EOF

# Make the script executable
chmod +x "$PYTHON_SCRIPT"

# Back up any existing combined file
if [ -f "${MPA_DIR}/combined_mpa.txt" ]; then
    echo "备份当前的合并文件" | tee -a "$LOG_FILE"
    mv "${MPA_DIR}/combined_mpa.txt" "${MPA_DIR}/combined_mpa.txt.bak.$(date +%Y%m%d_%H%M%S)"
fi

# Run the Python script to generate the combined MPA file
echo "Generating the combined MPA file..." | tee -a "$LOG_FILE"
python3 "$PYTHON_SCRIPT" -i "$KRAKEN_DIR" -o "${MPA_DIR}/combined_mpa.txt" 2>&1 | tee -a "$LOG_FILE"

# Check the result
if [ -f "${MPA_DIR}/combined_mpa.txt" ]; then
    echo "合并MPA文件创建成功" | tee -a "$LOG_FILE"
    
    # Report the file size
    FILE_SIZE=$(du -h "${MPA_DIR}/combined_mpa.txt" | cut -f1)
    echo "合并文件大小: $FILE_SIZE" | tee -a "$LOG_FILE"
    
    # Report the number of samples and taxa
    HEADER=$(head -1 "${MPA_DIR}/combined_mpa.txt")
    SAMPLE_COUNT=$(echo "$HEADER" | awk -F'\t' '{print NF-1}')
    TAXONOMY_COUNT=$(wc -l < "${MPA_DIR}/combined_mpa.txt")
    
    echo "合并文件包含 $SAMPLE_COUNT 个样本和 $((TAXONOMY_COUNT-1)) 个分类单元" | tee -a "$LOG_FILE"
else
    echo "错误: 合并MPA文件创建失败" | tee -a "$LOG_FILE"
    exit 1
fi
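
# Optional sanity check (a minimal sketch, not part of the original pipeline):
# preview the header and count the species-level rows in the combined table.
# head -1 "${MPA_DIR}/combined_mpa.txt" | cut -f1-3
# grep -c "|s__" "${MPA_DIR}/combined_mpa.txt"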

echo "完成时间: $(date)" | tee -a "$LOG_FILE"

posted on 2025-03-31 21:19  Hahntoh