mpandbracken
#!/bin/bash
# 将MPA格式的分类信息添加到Bracken合并结果文件中
# 创建只包含到对应级别分类信息的增强版Bracken结果
# 设置路径变量
BASE_DIR="/data/khanh/hurmicro"
KRAKEN_DIR="${BASE_DIR}/5.annotation/kraken2"
MPA_DIR="${BASE_DIR}/5.annotation/kraken2/mpa"
ENHANCED_DIR="${BASE_DIR}/5.annotation/bracken/enhanced"
LOG_DIR="${MPA_DIR}/logs"
# 创建输出目录
mkdir -p "$ENHANCED_DIR"
mkdir -p "$LOG_DIR"
# 日志文件
LOG_FILE="${LOG_DIR}/enhance_bracken_$(date +%Y%m%d_%H%M%S).log"
echo "===== 开始增强Bracken结果文件 =====" | tee -a "$LOG_FILE"
echo "开始时间: $(date)" | tee -a "$LOG_FILE"
# 检查所需文件是否存在
MPA_FILE="${MPA_DIR}/combined_mpa.txt"
if [ ! -f "$MPA_FILE" ]; then
echo "错误: MPA文件不存在 ($MPA_FILE)" | tee -a "$LOG_FILE"
echo "请先运行create_mpa_format.sh脚本生成MPA文件" | tee -a "$LOG_FILE"
exit 1
fi
# 分类级别及对应的合并文件
declare -A LEVEL_FILES
LEVEL_FILES=(
["species"]="${KRAKEN_DIR}/combined_bracken_species.txt"
["genus"]="${KRAKEN_DIR}/combined_bracken_genus.txt"
["family"]="${KRAKEN_DIR}/combined_bracken_family.txt"
["order"]="${KRAKEN_DIR}/combined_bracken_order.txt"
["class"]="${KRAKEN_DIR}/combined_bracken_class.txt"
["phylum"]="${KRAKEN_DIR}/combined_bracken_phylum.txt"
["domain"]="${KRAKEN_DIR}/combined_bracken_domain.txt"
)
# 分类级别映射及排序
declare -A LEVEL_INDICES
LEVEL_INDICES=(
["domain"]=0
["phylum"]=1
["class"]=2
["order"]=3
["family"]=4
["genus"]=5
["species"]=6
)
# 创建Python脚本来处理数据
PYTHON_SCRIPT="${ENHANCED_DIR}/enhance_bracken.py"
echo "创建Python处理脚本" | tee -a "$LOG_FILE"
cat > "$PYTHON_SCRIPT" << 'EOF'
#!/usr/bin/env python3
"""
将MPA格式的分类信息添加到Bracken合并结果文件中
但只包含到对应级别的分类信息
"""
import os
import sys
import argparse
import pandas as pd
from collections import defaultdict
def parse_mpa_file(mpa_file):
"""解析MPA文件,提取分类信息"""
taxonomy_map = {}
taxonomy_levels = {
'k__': 'domain',
'p__': 'phylum',
'c__': 'class',
'o__': 'order',
'f__': 'family',
'g__': 'genus',
's__': 'species'
}
with open(mpa_file, 'r') as f:
next(f) # 跳过标题行
for line in f:
parts = line.strip().split('\t')
if len(parts) < 2:
continue
lineage = parts[0]
lineage_parts = lineage.split('|')
# 获取该行的最后一个分类单元
last_part = lineage_parts[-1]
for prefix, level in taxonomy_levels.items():
if last_part.startswith(prefix):
# 提取名称和级别
name = last_part[len(prefix):]
tax_level = level
# 构建完整分类谱系字典
full_taxonomy = {}
for part in lineage_parts:
for p, l in taxonomy_levels.items():
if part.startswith(p):
full_taxonomy[l] = part[len(p):]
break
# 保存数据,使用名称和税号作为键 (适配Bracken输出)
if tax_level == 'species' and name:
taxonomy_map[name] = full_taxonomy
break
return taxonomy_map
def enhance_bracken_file(bracken_file, taxonomy_map, output_file, level):
"""增强Bracken文件,只添加到对应级别的分类信息"""
try:
# 分类级别对应的列名和索引
level_columns = {
'domain': 'Domain',
'phylum': 'Phylum',
'class': 'Class',
'order': 'Order',
'family': 'Family',
'genus': 'Genus',
'species': 'Species'
}
# 级别索引,用于确定显示哪些列
level_indices = {
'domain': 0,
'phylum': 1,
'class': 2,
'order': 3,
'family': 4,
'genus': 5,
'species': 6
}
# 验证级别
if level not in level_indices:
print(f"错误: 无效的分类级别 '{level}'")
return False
# 获取当前级别索引
current_index = level_indices[level]
# 读取Bracken文件
df = pd.read_csv(bracken_file, sep='\t')
# 只创建到当前级别的列
for l, col_name in level_columns.items():
if level_indices[l] <= current_index:
df[col_name] = ""
# 填充分类信息
for index, row in df.iterrows():
name = row['name']
# 对于非物种级别,尝试直接匹配
if level != 'species' and name in taxonomy_map:
tax_info = taxonomy_map[name]
for l, col_name in level_columns.items():
if level_indices[l] <= current_index and l in tax_info:
df.at[index, col_name] = tax_info[l]
else:
# 尝试找到匹配的分类信息
found = False
# 对于物种级别,直接查找
if level == 'species' and name in taxonomy_map:
tax_info = taxonomy_map[name]
for l, col_name in level_columns.items():
if level_indices[l] <= current_index and l in tax_info:
df.at[index, col_name] = tax_info[l]
found = True
# 对于其他级别,使用部分匹配
if not found and level != 'species':
for tax_name, tax_info in taxonomy_map.items():
if level in tax_info and tax_info[level] == name:
for l, col_name in level_columns.items():
if level_indices[l] <= current_index and l in tax_info:
df.at[index, col_name] = tax_info[l]
found = True
break
# 重新排列列,将分类信息放在前面
cols = df.columns.tolist()
taxonomy_cols = [level_columns[l] for l in level_columns if level_indices[l] <= current_index]
other_cols = [c for c in cols if c not in list(level_columns.values())]
new_cols = taxonomy_cols + other_cols
# 保存结果
df = df[new_cols]
df.to_csv(output_file, sep='\t', index=False)
print(f"已创建增强版文件,只包含到 {level} 级别的分类信息")
return True
except Exception as e:
print(f"错误处理文件 {bracken_file}: {e}")
return False
def main():
parser = argparse.ArgumentParser(description='增强Bracken结果文件,添加完整分类信息')
parser.add_argument('-m', '--mpa', required=True, help='MPA格式文件')
parser.add_argument('-b', '--bracken', required=True, help='Bracken合并结果文件')
parser.add_argument('-o', '--output', required=True, help='输出文件')
parser.add_argument('-l', '--level', required=True, help='分类级别')
args = parser.parse_args()
# 解析MPA文件
print(f"解析MPA文件: {args.mpa}")
taxonomy_map = parse_mpa_file(args.mpa)
print(f"提取了 {len(taxonomy_map)} 个分类单元的信息")
# 增强Bracken文件
print(f"处理Bracken文件: {args.bracken}")
result = enhance_bracken_file(args.bracken, taxonomy_map, args.output, args.level)
if result:
print(f"成功创建增强版结果文件: {args.output}")
else:
print(f"处理失败")
return 1
return 0
if __name__ == '__main__':
sys.exit(main())
EOF
# 添加执行权限
chmod +x "$PYTHON_SCRIPT"
# 处理每个分类级别的Bracken结果
for LEVEL in "${!LEVEL_FILES[@]}"; do
BRACKEN_FILE="${LEVEL_FILES[$LEVEL]}"
# 检查文件是否存在
if [ ! -f "$BRACKEN_FILE" ]; then
echo "警告: Bracken结果文件不存在 ($BRACKEN_FILE)" | tee -a "$LOG_FILE"
continue
fi
ENHANCED_FILE="${ENHANCED_DIR}/enhanced_bracken_${LEVEL}.txt"
echo "处理 $LEVEL 级别的Bracken结果" | tee -a "$LOG_FILE"
# 运行Python脚本增强Bracken结果
python "$PYTHON_SCRIPT" \
-m "$MPA_FILE" \
-b "$BRACKEN_FILE" \
-o "$ENHANCED_FILE" \
-l "$LEVEL" 2>&1 | tee -a "${LOG_DIR}/enhance_${LEVEL}.log"
if [ $? -eq 0 ]; then
echo "成功创建增强版 $LEVEL 结果: $ENHANCED_FILE" | tee -a "$LOG_FILE"
else
echo "错误: 处理 $LEVEL 级别失败" | tee -a "$LOG_FILE"
fi
done
echo "===== 增强Bracken结果完成 =====" | tee -a "$LOG_FILE"
echo "结果保存在: $ENHANCED_DIR" | tee -a "$LOG_FILE"
echo "完成时间: $(date)" | tee -a "$LOG_FILE"
本文来自博客园,作者:Hahntoh,转载请注明原文链接:https://www.cnblogs.com/hahn/p/18803012
浙公网安备 33010602011771号