去除重复序列
D:\1CAAS\Lab\songqianlin\Cas新蛋白\Cas12\cas12_lmnopq_fasta\重复项去除
脚本:
点击查看代码
import os
from collections import defaultdict
def read_fasta(fasta_file):
    """Parse a FASTA file into a list of ``(header, sequence)`` tuples.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A list of ``(header, sequence)`` tuples in file order. ``header`` is
        the full header line including the leading ``>`` (trailing
        whitespace removed); ``sequence`` is the concatenation of all
        sequence lines that follow it, with surrounding whitespace removed
        from each line. A header with no sequence lines yields ``""``.
        Text that appears before the first header is ignored.
    """
    records = []
    header = None
    seq_lines = []
    # "utf-8-sig" drops a leading byte-order mark if present; with plain
    # "utf-8" the first header would start with "\ufeff" instead of ">" and
    # the first record would be silently lost.
    with open(fasta_file, "r", encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.rstrip()
            if line.startswith(">"):
                # A new header closes the record collected so far.
                if header is not None:
                    records.append((header, "".join(seq_lines)))
                header = line
                seq_lines = []
            elif header is not None:
                # Strip both sides so stray indentation cannot make two
                # otherwise identical sequences compare as different.
                seq_lines.append(line.strip())
    # Flush the final record, which has no following header to close it.
    if header is not None:
        records.append((header, "".join(seq_lines)))
    return records
def manual_dedup(records):
    """Interactively remove records whose sequences are exactly identical.

    Records are grouped by sequence string. A sequence that occurs once is
    kept as-is. For a sequence shared by several headers, the headers are
    listed on stdout and the user is asked (via ``input``) which one to keep.

    Args:
        records: Iterable of ``(header, sequence)`` tuples.

    Returns:
        A list of ``(header, sequence)`` tuples. Groups appear in the order
        in which each distinct sequence was first seen; when a whole group
        is kept, its headers appear together in their original order.

    User input per duplicate group:
        * empty line           -> keep every header of the group
        * index ``0..n-1``     -> keep only that header
        * anything else        -> reported as invalid, keep every header
    """
    seq_dict = defaultdict(list)
    for h, s in records:
        seq_dict[s].append(h)

    kept_records = []
    for seq, headers in seq_dict.items():
        if len(headers) == 1:
            kept_records.append((headers[0], seq))
            continue

        print("\n⚠️ 发现重复序列(序列完全一致):")
        for i, h in enumerate(headers):
            print(f" [{i}] {h}")
        print("选择要【保留】的序列编号(如 0)")
        print("或直接回车 → 保留全部")
        choice = input("你的选择: ").strip()

        # Default outcome (empty or invalid answer): keep the whole group.
        selected = headers
        if choice:
            try:
                idx = int(choice)
            except ValueError:
                idx = -1
            # Explicit range check: plain list indexing would accept a
            # negative number such as -1 and silently keep the last header,
            # although only the indices 0..n-1 are offered in the prompt.
            if 0 <= idx < len(headers):
                selected = [headers[idx]]
            else:
                print("输入无效,默认保留全部")
        kept_records.extend((h, seq) for h in selected)
    return kept_records
def write_fasta(records, output_fasta):
    """Write ``(header, sequence)`` records to a FASTA file.

    Args:
        records: Iterable of ``(header, sequence)`` string tuples. Each
            header is written verbatim, so it must already contain the
            leading ``>``.
        output_fasta: Destination path; the file is created or overwritten
            and encoded as UTF-8.

    Each record is emitted as two lines: the header, then the whole
    sequence on a single line.
    """
    with open(output_fasta, "w", encoding="utf-8") as handle:
        for header, sequence in records:
            handle.writelines((header, "\n", sequence, "\n"))
if __name__ == "__main__":
    # Hard-coded locations of the FASTA file to clean and of the result file.
    input_fasta = r"D:\1CAAS\Lab\生物信息操作\进化树\22_CM_PLV_TnpB_IscB_Cas12\CM_PLV_TNPB_IscB_Cas12_Fanzor.manual_dedup.fasta"
    output_fasta = r"D:\1CAAS\Lab\生物信息操作\进化树\22_CM_PLV_TnpB_IscB_Cas12\CM_PLV_TNPB_IscB_Cas12_Fanzor.manual_dedup2.fasta"

    # Pipeline: parse -> interactive de-duplication -> write result.
    final_records = manual_dedup(read_fasta(input_fasta))
    write_fasta(final_records, output_fasta)

    # Report how many records were kept and where they were written.
    print(
        f"\n完成!最终保留 {len(final_records)} 条序列",
        f"输出文件:{output_fasta}",
        sep="\n",
    )

浙公网安备 33010602011771号