去除重复序列

D:\1CAAS\Lab\songqianlin\Cas新蛋白\Cas12\cas12_lmnopq_fasta\重复项去除
脚本：
点击查看代码
import os
from collections import defaultdict

def read_fasta(fasta_file):
    """Parse a FASTA file into a list of ``(header, sequence)`` tuples.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A list of ``(header, sequence)`` tuples in file order. ``header`` is
        the full header line including the leading ``>`` (trailing
        whitespace removed); ``sequence`` is the concatenation of all lines
        that follow it up to the next header, each with trailing whitespace
        removed. A header with no sequence lines yields an empty string.
        Lines that appear before the first header are ignored.
    """
    records = []
    header = None
    seq_lines = []

    # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also strips a
    # leading byte-order mark. With plain "utf-8" the BOM stays glued to the
    # first ">" so the first header is not recognised and its record is lost.
    with open(fasta_file, "r", encoding="utf-8-sig") as f:
        for line in f:
            line = line.rstrip()
            if line.startswith(">"):
                # A new header closes the record collected so far.
                if header is not None:
                    records.append((header, "".join(seq_lines)))
                header = line
                seq_lines = []
            elif header is not None:
                seq_lines.append(line)

    # Flush the last record, which has no following header to close it.
    if header is not None:
        records.append((header, "".join(seq_lines)))

    return records


def manual_dedup(records):
    """Interactively remove records whose sequences are exactly identical.

    Records are grouped by their sequence string (exact, case-sensitive
    match). Groups with a single header are kept as-is. For every group with
    several headers the user is shown the headers and asked on stdin which
    one to keep.

    Args:
        records: Iterable of ``(header, sequence)`` tuples.

    Returns:
        A list of ``(header, sequence)`` tuples. Groups appear in order of
        the first occurrence of their sequence. For a duplicated group, only
        the chosen header is kept; on empty or invalid input every header of
        the group is kept.
    """
    seq_dict = defaultdict(list)
    for h, s in records:
        seq_dict[s].append(h)

    kept_records = []

    for seq, headers in seq_dict.items():
        if len(headers) == 1:
            kept_records.append((headers[0], seq))
            continue

        print("\n⚠️ 发现重复序列（序列完全一致）：")
        for i, h in enumerate(headers):
            print(f"  [{i}] {h}")

        print("选择要【保留】的序列编号（如 0）")
        print("或直接回车 → 保留全部")

        choice = input("你的选择: ").strip()

        selected = None
        if choice:
            try:
                idx = int(choice)
            except ValueError:
                idx = None
            # Check the range explicitly: plain indexing would accept
            # negative numbers (e.g. -1 silently picks the last header),
            # which are not among the indices offered to the user.
            if idx is not None and 0 <= idx < len(headers):
                selected = headers[idx]
            else:
                print("输入无效，默认保留全部")

        if selected is None:
            # Empty or invalid answer: keep every header of the group.
            kept_records.extend((h, seq) for h in headers)
        else:
            kept_records.append((selected, seq))

    return kept_records


def write_fasta(records, output_fasta):
    """Write ``(header, sequence)`` records to a FASTA file.

    Each record produces two lines: the header exactly as given, then the
    whole sequence on a single line. The file is written as UTF-8 and any
    existing file at that path is overwritten.

    Args:
        records: Iterable of ``(header, sequence)`` string tuples.
        output_fasta: Path of the file to create.
    """

    def _fasta_lines():
        # Yield header and sequence separately so they are emitted in the
        # same order they appear in the record.
        for title, residues in records:
            yield title + "\n"
            yield residues + "\n"

    with open(output_fasta, "w", encoding="utf-8") as handle:
        handle.writelines(_fasta_lines())


if __name__ == "__main__":
    # Script entry point: read the input FASTA, resolve exact duplicates
    # interactively, write the result and print a short summary.
    # Both paths are hard-coded; edit them before running on other data.
    input_fasta = r"D:\1CAAS\Lab\生物信息操作\进化树\22_CM_PLV_TnpB_IscB_Cas12\CM_PLV_TNPB_IscB_Cas12_Fanzor.manual_dedup.fasta"
    output_fasta = r"D:\1CAAS\Lab\生物信息操作\进化树\22_CM_PLV_TnpB_IscB_Cas12\CM_PLV_TNPB_IscB_Cas12_Fanzor.manual_dedup2.fasta"

    final_records = manual_dedup(read_fasta(input_fasta))
    write_fasta(final_records, output_fasta)

    kept_count = len(final_records)
    print(f"\n完成！最终保留 {kept_count} 条序列")
    print(f"输出文件：{output_fasta}")
posted @ 2026-01-15 20:27 Zarinan 阅读(2) 评论(0) 收藏举报
刷新页面返回顶部
Zarinan

去除重复序列

公告