序列重复项检查

地址:D:\1CAAS\Lab\songqianlin\Cas新蛋白\Cas12\cas12_lmnopq_fasta\重复项去除

点击查看代码
from collections import defaultdict

# Default input FASTA used when this file is run as a script;
# replace with the path to your own FASTA file.
fasta_file = r"D:\1CAAS\Lab\生物信息操作\结构域划分\PLV\IscB\PLV_IscB_vs_IscB_21-25.fasta"

def _iter_fasta_records(lines):
    """Yield ``(name, sequence)`` for each record in an iterable of FASTA lines.

    The name is the first whitespace-delimited token after ``>`` (an empty
    string when the header is a bare ``>``). The sequence is the record's
    non-blank lines, stripped, upper-cased and concatenated.
    """
    name, chunks = None, []
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        if line.startswith(">"):
            if name is not None:
                yield name, "".join(chunks)
            # A bare ">" has no tokens; use "" instead of raising IndexError.
            tokens = line[1:].split()
            name = tokens[0] if tokens else ""
            chunks = []
        elif name is not None:
            # Lines seen before the first header belong to no record.
            chunks.append(line.upper())
    # Emit the final record, which has no following header to trigger it.
    if name is not None:
        yield name, "".join(chunks)


def check_fasta_duplicates(fasta_file):
    """Report duplicated record names and duplicated sequences in a FASTA file.

    Records are compared by the first token of their header line and by
    their upper-cased sequence. A summary is printed to stdout, followed by
    up to 10 duplicated names and every group of names sharing a sequence.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A ``(duplicate_names, duplicate_seqs)`` tuple: a list of names that
        occur on more than one record, and a dict mapping each sequence that
        occurs more than once to the list of names carrying it.

    Raises:
        OSError: If the file cannot be opened.
    """
    seq_dict = defaultdict(list)   # sequence -> names of records carrying it
    name_dict = defaultdict(list)  # name -> sequences recorded under it
    total_records = 0

    with open(fasta_file, 'r') as f:
        for name, sequence in _iter_fasta_records(f):
            total_records += 1
            seq_dict[sequence].append(name)
            name_dict[name].append(sequence)

    # Names that appear on more than one record.
    duplicate_names = [k for k, v in name_dict.items() if len(v) > 1]

    # Sequences shared by more than one record.
    duplicate_seqs = {k: v for k, v in seq_dict.items() if len(v) > 1}

    print("🧬 检查结果:")
    # Count records, not distinct names: len(name_dict) under-reports the
    # total precisely when duplicate names are present.
    print(f"总序列数: {total_records}")
    print(f"重复名称数: {len(duplicate_names)}")
    print(f"重复序列数: {len(duplicate_seqs)}\n")

    if duplicate_names:
        print("⚠️ 重复名称示例:")
        for dup_name in duplicate_names[:10]:
            print(" ", dup_name)
        print()

    if duplicate_seqs:
        print("⚠️ 重复序列示例:")
        for seq, names in duplicate_seqs.items():
            print(f" 序列长度: {len(seq)}, 对应名称: {', '.join(names)}")
        print()

    return duplicate_names, duplicate_seqs

# Script entry point: check the module-level default FASTA path.
if __name__ == "__main__":
    check_fasta_duplicates(fasta_file=fasta_file)

posted @ 2025-12-01 16:23  Zarinan  阅读(0)  评论(0)    收藏  举报