序列重复项检查
地址:D:\1CAAS\Lab\songqianlin\Cas新蛋白\Cas12\cas12_lmnopq_fasta\重复项去除
点击查看代码
from collections import defaultdict
# Input FASTA file to check; replace with the path to your own file.
fasta_file = r"D:\1CAAS\Lab\生物信息操作\结构域划分\PLV\IscB\PLV_IscB_vs_IscB_21-25.fasta"
def check_fasta_duplicates(fasta_file):
    """Report duplicated record names and duplicated sequences in a FASTA file.

    Each record's name is the first whitespace-delimited token after ``>``
    (an empty string if the header has no text). Sequence lines are
    upper-cased and concatenated, so sequence comparison ignores case and
    line wrapping. A summary is printed to stdout.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A tuple ``(duplicate_names, duplicate_seqs)`` where
        ``duplicate_names`` is a list of names that occur on more than one
        record, and ``duplicate_seqs`` maps each sequence shared by more than
        one record to the list of record names carrying it.

    Raises:
        OSError: Propagated from ``open`` if the file cannot be read.
    """
    seq_dict = defaultdict(list)   # sequence -> names of records carrying it
    name_dict = defaultdict(list)  # name -> sequences recorded under it

    def _store(record_name, chunks):
        # Register one finished record in both lookup tables.
        sequence = "".join(chunks)
        seq_dict[sequence].append(record_name)
        name_dict[record_name].append(sequence)

    name, seq = None, []
    with open(fasta_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):  # FASTA header line
                # `is not None` (rather than truthiness) so that a record
                # whose ID is empty is still stored.
                if name is not None:
                    _store(name, seq)
                # Keep only the first token so trailing descriptions are
                # ignored; a bare ">" header yields an empty name instead
                # of raising IndexError.
                fields = line[1:].split()
                name = fields[0] if fields else ""
                seq = []
            else:
                # Sequence text seen before any header is dropped when the
                # first header resets `seq`.
                seq.append(line.upper())
    # Flush the final record, which has no following header to trigger it.
    if name is not None:
        _store(name, seq)

    # Count records, not distinct names: len(name_dict) under-reports the
    # total whenever a name is duplicated.
    total_records = sum(len(seqs) for seqs in name_dict.values())
    duplicate_names = [k for k, v in name_dict.items() if len(v) > 1]
    duplicate_seqs = {k: v for k, v in seq_dict.items() if len(v) > 1}

    print("🧬 检查结果:")
    print(f"总序列数: {total_records}")
    print(f"重复名称数: {len(duplicate_names)}")
    print(f"重复序列数: {len(duplicate_seqs)}\n")
    if duplicate_names:
        print("⚠️ 重复名称示例:")
        # Only the first ten names are shown to keep the report short.
        for dup_name in duplicate_names[:10]:
            print(" ", dup_name)
        print()
    if duplicate_seqs:
        print("⚠️ 重复序列示例:")
        for dup_seq, names in duplicate_seqs.items():
            print(f" 序列长度: {len(dup_seq)}, 对应名称: {', '.join(names)}")
        print()
    return duplicate_names, duplicate_seqs
if __name__ == "__main__":
    # Run the duplicate check only when executed as a script, not on import.
    check_fasta_duplicates(fasta_file)

浙公网安备 33010602011771号