去除重复序列

D:\1CAAS\Lab\songqianlin\Cas新蛋白\Cas12\cas12_lmnopq_fasta\重复项去除
检测序列完全相同的记录(不论 ID 是否相同),并手动选择要保留的条目。
脚本:

点击查看代码
import os
from collections import defaultdict

def read_fasta(fasta_file):
    """Parse a FASTA file into a list of ``(header, sequence)`` tuples.

    Each header is the full ``>`` line with trailing whitespace removed.
    The sequence is every following line up to the next header, stripped of
    trailing whitespace and concatenated, so wrapped sequences come back as
    one string. Lines that appear before the first header are discarded.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A list of ``(header, sequence)`` tuples in file order.
    """
    records = []
    header = None
    seq_lines = []

    # "utf-8-sig" reads plain UTF-8 unchanged but also swallows a leading
    # byte-order mark. With plain "utf-8" a BOM stays glued to the first
    # ">" so that line is not recognised as a header and the first record
    # is silently lost.
    with open(fasta_file, "r", encoding="utf-8-sig") as f:
        for line in f:
            line = line.rstrip()
            if line.startswith(">"):
                # A new header closes the record collected so far.
                if header is not None:
                    records.append((header, "".join(seq_lines)))
                header = line
                seq_lines = []
            else:
                seq_lines.append(line)

    # Flush the final record, which has no following header to close it.
    if header is not None:
        records.append((header, "".join(seq_lines)))

    return records


def manual_dedup(records):
    """Interactively resolve records whose sequences are exactly identical.

    Records are grouped by sequence string. A sequence carried by a single
    header is kept as is. For a sequence shared by several headers the
    headers are listed and the user is asked which one to keep; pressing
    Enter, or giving an answer that is not one of the listed numbers, keeps
    all of them.

    Args:
        records: Iterable of ``(header, sequence)`` tuples.

    Returns:
        A list of ``(header, sequence)`` tuples, grouped by sequence in order
        of each sequence's first appearance.
    """
    seq_dict = defaultdict(list)
    for h, s in records:
        seq_dict[s].append(h)

    kept_records = []

    for seq, headers in seq_dict.items():
        if len(headers) == 1:
            kept_records.append((headers[0], seq))
            continue

        print("\n⚠️ 发现重复序列(序列完全一致):")
        for i, h in enumerate(headers):
            print(f"  [{i}] {h}")

        print("选择要【保留】的序列编号(如 0)")
        print("或直接回车 → 保留全部")

        choice = input("你的选择: ").strip()

        keep = headers  # default: keep every header of this group
        if choice:
            try:
                idx = int(choice)
            except ValueError:
                idx = None
            # Explicit range check: plain indexing would accept negative
            # numbers and silently pick a header counted from the end, which
            # the menu never offered.
            if idx is not None and 0 <= idx < len(headers):
                keep = [headers[idx]]
            else:
                print("输入无效,默认保留全部")

        kept_records.extend((h, seq) for h in keep)

    return kept_records


def write_fasta(records, output_fasta):
    """Write ``(header, sequence)`` pairs to a FASTA file.

    Every record becomes two lines: the header exactly as given, then the
    whole sequence on a single unwrapped line. The target file is created or
    overwritten and encoded as UTF-8.

    Args:
        records: Iterable of ``(header, sequence)`` string tuples.
        output_fasta: Path of the file to write.
    """

    def _fasta_lines():
        # Yield the header line and the sequence line of each record in turn.
        for header, sequence in records:
            yield header + "\n"
            yield sequence + "\n"

    with open(output_fasta, "w", encoding="utf-8") as handle:
        handle.writelines(_fasta_lines())


if __name__ == "__main__":
    # Hard-coded locations of the FASTA file to clean and of the result file.
    output_fasta = r"D:\1CAAS\Lab\生物信息操作\进化树\22_CM_PLV_TnpB_IscB_Cas12\CM_PLV_TNPB_IscB_Cas12_Fanzor.manual_dedup2.fasta"
    input_fasta = r"D:\1CAAS\Lab\生物信息操作\进化树\22_CM_PLV_TnpB_IscB_Cas12\CM_PLV_TNPB_IscB_Cas12_Fanzor.manual_dedup.fasta"

    # Load the records, let the user resolve identical sequences, then save.
    final_records = manual_dedup(read_fasta(input_fasta))
    write_fasta(final_records, output_fasta)

    # Report how many records were kept and where they were written.
    print(f"\n完成!最终保留 {len(final_records)} 条序列")
    print(f"输出文件:{output_fasta}")

先去除序列完全相同的记录,再检测 ID 相同但序列不同的记录并手动重命名。
去除重复序列2A:

点击查看代码
from collections import defaultdict


def read_fasta(fasta_file):
    """Read a FASTA file and return its records as ``(header, sequence)`` tuples.

    The header is the complete ``>`` line without trailing whitespace. The
    sequence is the concatenation of all lines between that header and the
    next one, each stripped of trailing whitespace. Anything that precedes
    the first header is dropped.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A list of ``(header, sequence)`` tuples in file order.
    """
    records = []
    header = None
    seq_lines = []

    # Decode with "utf-8-sig" so an optional byte-order mark is removed.
    # Otherwise the BOM hides the ">" of the very first header and that
    # record never makes it into the result. Files without a BOM are read
    # exactly as with "utf-8".
    with open(fasta_file, "r", encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.rstrip()
            if not line.startswith(">"):
                seq_lines.append(line)
                continue

            # Reaching a header finishes the previous record, if there is one.
            if header is not None:
                records.append((header, "".join(seq_lines)))
            header = line
            seq_lines = []

    # The last record is not followed by a header, so store it here.
    if header is not None:
        records.append((header, "".join(seq_lines)))

    return records


def extract_id(header):
    """Return the sequence ID of a FASTA header line.

    The ID is the first whitespace-delimited token after the leading
    character (normally ``>``).

    Args:
        header: A FASTA header line such as ``">seq1 some description"``.

    Returns:
        The ID string, or ``""`` when nothing but whitespace follows the
        leading character.
    """
    tokens = header[1:].split(maxsplit=1)
    # A bare ">" has no tokens; return an empty ID rather than raising
    # IndexError so one malformed header cannot abort the whole run.
    return tokens[0] if tokens else ""


def manual_dedup_by_sequence(records):
    """Step 1: let the user resolve records with exactly identical sequences.

    Records are grouped by sequence string. A sequence that occurs under one
    header only is kept unchanged. When several headers share a sequence they
    are listed and the user picks the one to keep; an empty answer, or an
    answer that is not one of the listed numbers, keeps all of them.

    Args:
        records: Iterable of ``(header, sequence)`` tuples.

    Returns:
        A list of ``(header, sequence)`` tuples, grouped by sequence in order
        of each sequence's first appearance.
    """
    seq_dict = defaultdict(list)
    for h, s in records:
        seq_dict[s].append(h)

    kept_records = []

    for seq, headers in seq_dict.items():
        if len(headers) == 1:
            kept_records.append((headers[0], seq))
            continue

        print("\n⚠️ 发现完全相同的序列:")
        for i, h in enumerate(headers):
            print(f"  [{i}] {h}")

        print("选择要【保留】的编号(如 0)")
        print("或直接回车 → 保留全部")

        choice = input("你的选择: ").strip()

        keep = headers  # default: keep every header of this group
        if choice:
            try:
                idx = int(choice)
            except ValueError:
                idx = None
            # Reject numbers outside the displayed menu. Without this check a
            # negative number would index from the end of the list and keep
            # a header the user did not knowingly choose.
            if idx is not None and 0 <= idx < len(headers):
                keep = [headers[idx]]
            else:
                print("输入无效,默认保留全部")

        kept_records.extend((h, seq) for h in keep)

    return kept_records


def interactive_rename_same_id_diff_seq(records):
    """Step 2: interactively rename records that share an ID.

    Intended to run after exact duplicate sequences have been removed, so
    that records still sharing an ID are expected to differ in sequence.
    Records are grouped by the ID returned by ``extract_id``. For every group
    with more than one record, each record is shown together with a suggested
    ID and the user may type a replacement ID. The typed ID replaces the
    whole header line; an empty answer keeps the original header.

    Args:
        records: Iterable of ``(header, sequence)`` tuples.

    Returns:
        A list of ``(header, sequence)`` tuples, grouped by original ID in
        order of each ID's first appearance.
    """
    id_groups = defaultdict(list)
    for h, s in records:
        id_groups[extract_id(h)].append((h, s))

    new_records = []

    for seq_id, items in id_groups.items():
        # A single record under this ID cannot conflict with anything.
        if len(items) == 1:
            new_records.extend(items)
            continue

        print("\n⚠️ 发现 ID 相同但序列不同(已排除完全重复序列)")
        print(f"ID: {seq_id}")

        for i, (h, s) in enumerate(items):
            # The suggestion is only displayed; it is not applied on Enter.
            suggested = f"{seq_id}_v{i+1}"

            print(f"\n[{i}]")
            print(f"原 header: {h}")
            print(f"序列长度: {len(s)}")
            print(f"建议 ID: {suggested}")

            user_input = input("请输入新的 ID(不含 >,回车表示不改): ").strip()
            # The prompt asks for an ID without ">", but tolerate one being
            # typed anyway so the header does not end up starting with ">>".
            user_input = user_input.lstrip(">").strip()

            new_h = ">" + user_input if user_input else h
            new_records.append((new_h, s))

    return new_records


def write_fasta(records, output_fasta):
    """Save records to *output_fasta* as two-line FASTA entries.

    For each ``(header, sequence)`` pair the header line is written first,
    followed by the full sequence on one line. The file is opened in write
    mode with UTF-8 encoding, so existing content is replaced.

    Args:
        records: Iterable of ``(header, sequence)`` string tuples.
        output_fasta: Path of the file to write.
    """
    line_end = "\n"
    with open(output_fasta, "w", encoding="utf-8") as sink:
        for record in records:
            title, residues = record
            sink.write(title + line_end)
            sink.write(residues + line_end)


if __name__ == "__main__":
    # Hard-coded locations of the FASTA file to clean and of the result file.
    output_fasta = r"D:\1CAAS\Lab\生物信息操作\结构域划分\PLV\PLV_results\PV_TnpB\PV_CM_all_gRNA-V2_TnpB_output.cleaned.fasta"
    input_fasta = r"D:\1CAAS\Lab\生物信息操作\结构域划分\PLV\PLV_results\PV_TnpB\PV_CM_all_gRNA-V2_TnpB_output.fasta"

    # Order matters: identical sequences are resolved first (Step 1) so the
    # same-ID check (Step 2) only runs on what is left.
    deduplicated = manual_dedup_by_sequence(read_fasta(input_fasta))
    records = interactive_rename_same_id_diff_seq(deduplicated)

    write_fasta(records, output_fasta)

    # Report how many records were kept and where they were written.
    print(f"\n✅ 完成!最终保留 {len(records)} 条序列")
    print(f"📄 输出文件:{output_fasta}")

posted @ 2026-01-15 20:27  Zarinan  阅读(12)  评论(0)    收藏  举报