mpandbracken

#!/bin/bash

# 将MPA格式的分类信息添加到Bracken合并结果文件中
# 创建只包含到对应级别分类信息的增强版Bracken结果

# 设置路径变量
BASE_DIR="/data/khanh/hurmicro"
KRAKEN_DIR="${BASE_DIR}/5.annotation/kraken2"
MPA_DIR="${BASE_DIR}/5.annotation/kraken2/mpa"
ENHANCED_DIR="${BASE_DIR}/5.annotation/bracken/enhanced"
LOG_DIR="${MPA_DIR}/logs"

# 创建输出目录
mkdir -p "$ENHANCED_DIR"
mkdir -p "$LOG_DIR"

# 日志文件
LOG_FILE="${LOG_DIR}/enhance_bracken_$(date +%Y%m%d_%H%M%S).log"

echo "===== 开始增强Bracken结果文件 =====" | tee -a "$LOG_FILE"
echo "开始时间: $(date)" | tee -a "$LOG_FILE"

# 检查所需文件是否存在
MPA_FILE="${MPA_DIR}/combined_mpa.txt"
if [ ! -f "$MPA_FILE" ]; then
    echo "错误: MPA文件不存在 ($MPA_FILE)" | tee -a "$LOG_FILE"
    echo "请先运行create_mpa_format.sh脚本生成MPA文件" | tee -a "$LOG_FILE"
    exit 1
fi

# 分类级别及对应的合并文件
declare -A LEVEL_FILES
LEVEL_FILES=(
    ["species"]="${KRAKEN_DIR}/combined_bracken_species.txt"
    ["genus"]="${KRAKEN_DIR}/combined_bracken_genus.txt"
    ["family"]="${KRAKEN_DIR}/combined_bracken_family.txt"
    ["order"]="${KRAKEN_DIR}/combined_bracken_order.txt"
    ["class"]="${KRAKEN_DIR}/combined_bracken_class.txt"
    ["phylum"]="${KRAKEN_DIR}/combined_bracken_phylum.txt"
    ["domain"]="${KRAKEN_DIR}/combined_bracken_domain.txt"
)

# 分类级别映射及排序
declare -A LEVEL_INDICES
LEVEL_INDICES=(
    ["domain"]=0
    ["phylum"]=1
    ["class"]=2
    ["order"]=3
    ["family"]=4
    ["genus"]=5
    ["species"]=6
)

# 创建Python脚本来处理数据
PYTHON_SCRIPT="${ENHANCED_DIR}/enhance_bracken.py"

echo "创建Python处理脚本" | tee -a "$LOG_FILE"
cat > "$PYTHON_SCRIPT" << 'EOF'
#!/usr/bin/env python3
"""
将MPA格式的分类信息添加到Bracken合并结果文件中
但只包含到对应级别的分类信息
"""

import os
import sys
import argparse
import pandas as pd
from collections import defaultdict

def parse_mpa_file(mpa_file):
    """解析MPA文件,提取分类信息"""
    taxonomy_map = {}
    taxonomy_levels = {
        'k__': 'domain',
        'p__': 'phylum',
        'c__': 'class', 
        'o__': 'order',
        'f__': 'family',
        'g__': 'genus',
        's__': 'species'
    }
    
    with open(mpa_file, 'r') as f:
        next(f)  # 跳过标题行
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue
            
            lineage = parts[0]
            lineage_parts = lineage.split('|')
            
            # 获取该行的最后一个分类单元
            last_part = lineage_parts[-1]
            for prefix, level in taxonomy_levels.items():
                if last_part.startswith(prefix):
                    # 提取名称和级别
                    name = last_part[len(prefix):]
                    tax_level = level
                    
                    # 构建完整分类谱系字典
                    full_taxonomy = {}
                    for part in lineage_parts:
                        for p, l in taxonomy_levels.items():
                            if part.startswith(p):
                                full_taxonomy[l] = part[len(p):]
                                break
                    
                    # 保存数据,使用名称和税号作为键 (适配Bracken输出)
                    if tax_level == 'species' and name:
                        taxonomy_map[name] = full_taxonomy
                    
                    break
    
    return taxonomy_map

def enhance_bracken_file(bracken_file, taxonomy_map, output_file, level):
    """增强Bracken文件,只添加到对应级别的分类信息"""
    try:
        # 分类级别对应的列名和索引
        level_columns = {
            'domain': 'Domain',
            'phylum': 'Phylum',
            'class': 'Class',
            'order': 'Order',
            'family': 'Family',
            'genus': 'Genus',
            'species': 'Species'
        }
        
        # 级别索引,用于确定显示哪些列
        level_indices = {
            'domain': 0,
            'phylum': 1,
            'class': 2,
            'order': 3,
            'family': 4,
            'genus': 5,
            'species': 6
        }
        
        # 验证级别
        if level not in level_indices:
            print(f"错误: 无效的分类级别 '{level}'")
            return False
        
        # 获取当前级别索引
        current_index = level_indices[level]
        
        # 读取Bracken文件
        df = pd.read_csv(bracken_file, sep='\t')
        
        # 只创建到当前级别的列
        for l, col_name in level_columns.items():
            if level_indices[l] <= current_index:
                df[col_name] = ""
        
        # 填充分类信息
        for index, row in df.iterrows():
            name = row['name']
            
            # 对于非物种级别,尝试直接匹配
            if level != 'species' and name in taxonomy_map:
                tax_info = taxonomy_map[name]
                for l, col_name in level_columns.items():
                    if level_indices[l] <= current_index and l in tax_info:
                        df.at[index, col_name] = tax_info[l]
            else:
                # 尝试找到匹配的分类信息
                found = False
                
                # 对于物种级别,直接查找
                if level == 'species' and name in taxonomy_map:
                    tax_info = taxonomy_map[name]
                    for l, col_name in level_columns.items():
                        if level_indices[l] <= current_index and l in tax_info:
                            df.at[index, col_name] = tax_info[l]
                    found = True
                
                # 对于其他级别,使用部分匹配
                if not found and level != 'species':
                    for tax_name, tax_info in taxonomy_map.items():
                        if level in tax_info and tax_info[level] == name:
                            for l, col_name in level_columns.items():
                                if level_indices[l] <= current_index and l in tax_info:
                                    df.at[index, col_name] = tax_info[l]
                            found = True
                            break
        
        # 重新排列列,将分类信息放在前面
        cols = df.columns.tolist()
        taxonomy_cols = [level_columns[l] for l in level_columns if level_indices[l] <= current_index]
        other_cols = [c for c in cols if c not in list(level_columns.values())]
        new_cols = taxonomy_cols + other_cols
        
        # 保存结果
        df = df[new_cols]
        df.to_csv(output_file, sep='\t', index=False)
        
        print(f"已创建增强版文件,只包含到 {level} 级别的分类信息")
        return True
    except Exception as e:
        print(f"错误处理文件 {bracken_file}: {e}")
        return False

def main():
    parser = argparse.ArgumentParser(description='增强Bracken结果文件,添加完整分类信息')
    parser.add_argument('-m', '--mpa', required=True, help='MPA格式文件')
    parser.add_argument('-b', '--bracken', required=True, help='Bracken合并结果文件')
    parser.add_argument('-o', '--output', required=True, help='输出文件')
    parser.add_argument('-l', '--level', required=True, help='分类级别')
    args = parser.parse_args()
    
    # 解析MPA文件
    print(f"解析MPA文件: {args.mpa}")
    taxonomy_map = parse_mpa_file(args.mpa)
    print(f"提取了 {len(taxonomy_map)} 个分类单元的信息")
    
    # 增强Bracken文件
    print(f"处理Bracken文件: {args.bracken}")
    result = enhance_bracken_file(args.bracken, taxonomy_map, args.output, args.level)
    
    if result:
        print(f"成功创建增强版结果文件: {args.output}")
    else:
        print(f"处理失败")
        return 1
    
    return 0

if __name__ == '__main__':
    sys.exit(main())
EOF

# 添加执行权限
chmod +x "$PYTHON_SCRIPT"

# 处理每个分类级别的Bracken结果
for LEVEL in "${!LEVEL_FILES[@]}"; do
    BRACKEN_FILE="${LEVEL_FILES[$LEVEL]}"
    
    # 检查文件是否存在
    if [ ! -f "$BRACKEN_FILE" ]; then
        echo "警告: Bracken结果文件不存在 ($BRACKEN_FILE)" | tee -a "$LOG_FILE"
        continue
    fi
    
    ENHANCED_FILE="${ENHANCED_DIR}/enhanced_bracken_${LEVEL}.txt"
    echo "处理 $LEVEL 级别的Bracken结果" | tee -a "$LOG_FILE"
    
    # 运行Python脚本增强Bracken结果
    python "$PYTHON_SCRIPT" \
        -m "$MPA_FILE" \
        -b "$BRACKEN_FILE" \
        -o "$ENHANCED_FILE" \
        -l "$LEVEL" 2>&1 | tee -a "${LOG_DIR}/enhance_${LEVEL}.log"
    
    if [ $? -eq 0 ]; then
        echo "成功创建增强版 $LEVEL 结果: $ENHANCED_FILE" | tee -a "$LOG_FILE"
    else
        echo "错误: 处理 $LEVEL 级别失败" | tee -a "$LOG_FILE"
    fi
done

echo "===== 增强Bracken结果完成 =====" | tee -a "$LOG_FILE"
echo "结果保存在: $ENHANCED_DIR" | tee -a "$LOG_FILE"
echo "完成时间: $(date)" | tee -a "$LOG_FILE"

posted on 2025-03-31 21:21  Hahntoh  阅读(65)  评论(0)    收藏  举报

导航