去除重复序列
D:\1CAAS\Lab\songqianlin\Cas新蛋白\Cas12\cas12_lmnopq_fasta\重复项去除
方法一:检测序列完全相同的记录(不论 ID 是否相同),由用户手动选择保留哪一条
脚本:
点击查看代码
import os
from collections import defaultdict
def read_fasta(fasta_file):
    """Parse a FASTA file into a list of ``(header, sequence)`` tuples.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A list of ``(header, sequence)`` tuples in file order. ``header`` is
        the full ``>`` line with trailing whitespace removed; ``sequence`` is
        the concatenation of all following lines up to the next header.
        Lines that appear before the first header are ignored.
    """
    records = []
    header = None
    seq_lines = []
    # "utf-8-sig" drops a leading byte-order mark if one is present. With
    # plain "utf-8" the BOM stays glued to the first line, that line no longer
    # starts with ">", and the first record is silently lost.
    with open(fasta_file, "r", encoding="utf-8-sig") as f:
        for line in f:
            line = line.rstrip()
            if line.startswith(">"):
                # A new header closes the record collected so far.
                if header is not None:
                    records.append((header, "".join(seq_lines)))
                header = line
                seq_lines = []
            else:
                seq_lines.append(line)
    # Flush the last record: no header follows it to trigger the append.
    if header is not None:
        records.append((header, "".join(seq_lines)))
    return records
def manual_dedup(records, prompt_fn=None):
    """Interactively collapse records whose sequences are exactly identical.

    Records are grouped by sequence string (exact, case-sensitive match).
    A sequence that occurs once is kept as is. For a sequence shared by
    several headers the headers are listed and the user is asked which one
    to keep.

    Args:
        records: Iterable of ``(header, sequence)`` tuples.
        prompt_fn: Optional callable with the signature of ``input`` used to
            ask the question. Defaults to the built-in ``input``; pass a
            different callable to drive the function non-interactively.

    Returns:
        A list of ``(header, sequence)`` tuples. Output is ordered by first
        occurrence of each sequence, so records sharing a sequence end up
        next to each other.
    """
    ask = input if prompt_fn is None else prompt_fn

    seq_dict = defaultdict(list)
    for h, s in records:
        seq_dict[s].append(h)

    kept_records = []
    for seq, headers in seq_dict.items():
        if len(headers) == 1:
            kept_records.append((headers[0], seq))
            continue

        print("\n⚠️ 发现重复序列(序列完全一致):")
        for i, h in enumerate(headers):
            print(f" [{i}] {h}")
        print("选择要【保留】的序列编号(如 0)")
        print("或直接回车 → 保留全部")
        try:
            choice = ask("你的选择: ").strip()
        except EOFError:
            # stdin is closed: fall back to the non-destructive default
            # (same as pressing Enter) instead of aborting the whole run.
            choice = ""

        selected = headers
        if choice:
            try:
                idx = int(choice)
            except ValueError:
                idx = -1
            # Explicit range check: a bare headers[idx] would accept negative
            # numbers and silently pick an entry counted from the end.
            if 0 <= idx < len(headers):
                selected = [headers[idx]]
            else:
                print("输入无效,默认保留全部")
        kept_records.extend((h, seq) for h in selected)
    return kept_records
def write_fasta(records, output_fasta):
    """Write ``(header, sequence)`` records to a FASTA file.

    Each record becomes two lines: the header exactly as given, followed by
    the whole sequence on a single line (no wrapping).

    Args:
        records: Iterable of ``(header, sequence)`` tuples.
        output_fasta: Destination path; an existing file is overwritten.
    """
    with open(output_fasta, "w", encoding="utf-8") as out:
        # One buffered writelines call instead of two write calls per record.
        out.writelines(f"{h}\n{s}\n" for h, s in records)
if __name__ == "__main__":
    import sys

    # Usage: script.py [INPUT_FASTA OUTPUT_FASTA]
    # With no arguments the original hard-coded paths below are used, so
    # running the script the old way behaves exactly as before.
    if len(sys.argv) >= 3:
        input_fasta, output_fasta = sys.argv[1], sys.argv[2]
    elif len(sys.argv) == 1:
        input_fasta = r"D:\1CAAS\Lab\生物信息操作\进化树\22_CM_PLV_TnpB_IscB_Cas12\CM_PLV_TNPB_IscB_Cas12_Fanzor.manual_dedup.fasta"
        output_fasta = r"D:\1CAAS\Lab\生物信息操作\进化树\22_CM_PLV_TnpB_IscB_Cas12\CM_PLV_TNPB_IscB_Cas12_Fanzor.manual_dedup2.fasta"
    else:
        # A lone input path is ambiguous (where should the output go?).
        sys.exit(f"usage: {sys.argv[0]} [INPUT_FASTA OUTPUT_FASTA]")

    # Read -> interactively de-duplicate identical sequences -> write.
    records = read_fasta(input_fasta)
    final_records = manual_dedup(records)
    write_fasta(final_records, output_fasta)
    print(f"\n完成!最终保留 {len(final_records)} 条序列")
    print(f"输出文件:{output_fasta}")
方法二:先检测序列完全相同的记录,再检测 ID 相同但序列不同的记录
去除重复序列 2A:
点击查看代码
from collections import defaultdict
def read_fasta(fasta_file):
    """Read a FASTA file and return its records as ``(header, sequence)`` tuples.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A list of ``(header, sequence)`` tuples in file order. ``header`` is
        the complete ``>`` line without trailing whitespace; ``sequence`` is
        every following line, up to the next header, joined together.
        Text that precedes the first header is discarded.
    """
    records = []
    header = None
    seq_lines = []
    # Open as "utf-8-sig" so a byte-order mark at the start of the file is
    # removed; otherwise the first line does not start with ">" and the first
    # record is dropped without any warning.
    with open(fasta_file, "r", encoding="utf-8-sig") as f:
        for line in f:
            line = line.rstrip()
            if line.startswith(">"):
                # Starting a new record: store the one collected so far.
                if header is not None:
                    records.append((header, "".join(seq_lines)))
                header = line
                seq_lines = []
            else:
                seq_lines.append(line)
    # The final record has no following header, so store it here.
    if header is not None:
        records.append((header, "".join(seq_lines)))
    return records
def extract_id(header):
    """Return the sequence ID of a FASTA header line.

    The ID is the first whitespace-delimited token after the leading ``>``.

    Args:
        header: A FASTA header such as ``">sp|P12345 some description"``.
            A string without the leading ``>`` is accepted as well.

    Returns:
        The ID string, or ``""`` when the header carries no ID at all
        (for example ``">"`` or ``">   "``).
    """
    # Only drop the first character when it really is the ">" marker.
    body = header[1:] if header.startswith(">") else header
    tokens = body.split(None, 1)
    # An empty header used to raise IndexError on tokens[0].
    return tokens[0] if tokens else ""
def manual_dedup_by_sequence(records, prompt_fn=None):
    """Step 1: resolve records with exactly identical sequences by hand.

    Records are grouped by sequence string (exact, case-sensitive match).
    Unique sequences pass through untouched. For every sequence shared by
    more than one header, the headers are listed and the user chooses which
    one to keep.

    Args:
        records: Iterable of ``(header, sequence)`` tuples.
        prompt_fn: Optional callable with the signature of ``input`` used to
            ask the question. Defaults to the built-in ``input``.

    Returns:
        A list of ``(header, sequence)`` tuples ordered by first occurrence
        of each sequence. Note that duplicates survive when the user chooses
        to keep all of them.
    """
    ask = input if prompt_fn is None else prompt_fn

    seq_dict = defaultdict(list)
    for h, s in records:
        seq_dict[s].append(h)

    kept_records = []
    for seq, headers in seq_dict.items():
        if len(headers) == 1:
            kept_records.append((headers[0], seq))
            continue

        print("\n⚠️ 发现完全相同的序列:")
        for i, h in enumerate(headers):
            print(f" [{i}] {h}")
        print("选择要【保留】的编号(如 0)")
        print("或直接回车 → 保留全部")
        try:
            choice = ask("你的选择: ").strip()
        except EOFError:
            # No more input available: behave like Enter (keep everything)
            # rather than crashing before anything is written.
            choice = ""

        selected = headers
        if choice:
            try:
                idx = int(choice)
            except ValueError:
                idx = -1
            # Reject negative numbers explicitly; Python indexing would
            # otherwise accept them and pick an entry from the end.
            if 0 <= idx < len(headers):
                selected = [headers[idx]]
            else:
                print("输入无效,默认保留全部")
        kept_records.extend((h, seq) for h in selected)
    return kept_records
def interactive_rename_same_id_diff_seq(records, prompt_fn=None):
    """Step 2: interactively rename records that share an ID.

    Intended to run after exact-duplicate sequences have been removed, so
    that records sharing an ID normally differ in sequence. Records are
    grouped by ``extract_id(header)``; for every group with more than one
    member each record is shown with a suggested ID (``<id>_v<n>``) and the
    user may type a replacement.

    Args:
        records: Iterable of ``(header, sequence)`` tuples.
        prompt_fn: Optional callable with the signature of ``input`` used to
            ask for the new ID. Defaults to the built-in ``input``.

    Returns:
        A list of ``(header, sequence)`` tuples ordered by first occurrence
        of each ID. A renamed record gets the header ``">" + new_id``; the
        rest of the original header line is not carried over.

    Note:
        The suggested ID is only displayed; pressing Enter keeps the original
        header. New IDs are not checked against IDs already in use.
    """
    ask = input if prompt_fn is None else prompt_fn

    id_groups = defaultdict(list)
    for h, s in records:
        id_groups[extract_id(h)].append((h, s))

    new_records = []
    for seq_id, items in id_groups.items():
        # A single record under this ID cannot conflict with anything.
        if len(items) == 1:
            new_records.extend(items)
            continue

        print("\n⚠️ 发现 ID 相同但序列不同(已排除完全重复序列)")
        print(f"ID: {seq_id}")
        for i, (h, s) in enumerate(items):
            suggested = f"{seq_id}_v{i+1}"
            print(f"\n[{i}]")
            print(f"原 header: {h}")
            print(f"序列长度: {len(s)}")
            print(f"建议 ID: {suggested}")
            try:
                user_input = ask("请输入新的 ID(不含 >,回车表示不改): ").strip()
            except EOFError:
                # stdin is closed: keep the header unchanged instead of
                # aborting and losing the work done so far.
                user_input = ""
            # Tolerate a typed leading ">" so the header never becomes ">>id".
            user_input = user_input.lstrip(">").strip()
            new_h = ">" + user_input if user_input else h
            new_records.append((new_h, s))
    return new_records
def write_fasta(records, output_fasta):
    """Save ``(header, sequence)`` records as a FASTA file.

    Every record is written as two lines: the header exactly as given, then
    the full sequence on one line (sequences are not wrapped).

    Args:
        records: Iterable of ``(header, sequence)`` tuples.
        output_fasta: Path of the file to create; existing content is replaced.
    """
    with open(output_fasta, "w", encoding="utf-8") as out:
        # Hand all lines to the buffered writer in a single call.
        out.writelines(f"{h}\n{s}\n" for h, s in records)
if __name__ == "__main__":
    import sys

    # Usage: script.py [INPUT_FASTA OUTPUT_FASTA]
    # Without arguments the original hard-coded paths are used, so existing
    # ways of launching the script keep working unchanged.
    if len(sys.argv) >= 3:
        input_fasta, output_fasta = sys.argv[1], sys.argv[2]
    elif len(sys.argv) == 1:
        input_fasta = r"D:\1CAAS\Lab\生物信息操作\结构域划分\PLV\PLV_results\PV_TnpB\PV_CM_all_gRNA-V2_TnpB_output.fasta"
        output_fasta = r"D:\1CAAS\Lab\生物信息操作\结构域划分\PLV\PLV_results\PV_TnpB\PV_CM_all_gRNA-V2_TnpB_output.cleaned.fasta"
    else:
        # Only one path given: refuse rather than guess the other one.
        sys.exit(f"usage: {sys.argv[0]} [INPUT_FASTA OUTPUT_FASTA]")

    records = read_fasta(input_fasta)
    # Order matters: remove identical sequences first, so that the ID check
    # afterwards is not triggered by records that were plain duplicates.
    records = manual_dedup_by_sequence(records)  # Step 1
    records = interactive_rename_same_id_diff_seq(records)  # Step 2
    write_fasta(records, output_fasta)
    print(f"\n✅ 完成!最终保留 {len(records)} 条序列")
    print(f"📄 输出文件:{output_fasta}")

浙公网安备 33010602011771号