蛋白多序列比对美化
1、用 SnapGene 进行多序列比对,导出 aln 文件
2、用python进行多序列比对美化
点击查看代码
import html
import os

from Bio import AlignIO
# ====== User parameters ======
# Input alignment file (FASTA or Clustal) and the format name handed to AlignIO.
alignment_file = "比对.fa"
alignment_format = "fasta"
html_output = "msa_ruvc_all.html"

# Endpoints of the blue background gradient used to show conservation
# (hex RGB without a leading "#").
light_blue = "e6f3ff"
dark_blue = "08306b"

# RUVC1/2/3 definitions: sequence id -> list of (start, end) positions,
# 1-based and counted on the ungapped sequence.
RUVC1 = {
    "FnCas12a_6I1K_1": [(892, 953)],
    "LbCas12a_5ID6_1": [(809, 858)],
    "LbCas12a_6NME_1": [(808, 872)],
    "Lb2Cas12a_8I54_1": [(792, 852)],
    "ReChb_Cas12a_1": [(853, 914)],
}

RUVC2 = {
    "FnCas12a_6I1K_1": [(971, 1078)],
    "LbCas12a_5ID6_1": [(890, 1011)],
    "LbCas12a_6NME_1": [(890, 997)],
    "Lb2Cas12a_8I54_1": [(869, 992)],
    "ReChb_Cas12a_1": [(930, 1044)],
}

RUVC3 = {
    "FnCas12a_6I1K_1": [(1254, 1300)],
    "LbCas12a_5ID6_1": [(1138, 1228)],
    "LbCas12a_6NME_1": [(1179, 1228)],
    "Lb2Cas12a_8I54_1": [(1151, 1206)],
    "ReChb_Cas12a_1": [(1215, 1261)],
}

# RUVC styling: one text colour shared by all three regions, optionally italic.
RUVC_color = "#FEBC28"
RUVC_italic = False
# ====== Read the alignment ======
alignment = AlignIO.read(alignment_file, alignment_format)
seq_len = alignment.get_alignment_length()

# Conservation of a column = share of its non-gap residues taken by the most
# frequent residue; a column that contains only gaps scores 0.0.
conservation = []
for col in range(seq_len):
    residues = [rec.seq[col] for rec in alignment if rec.seq[col] != "-"]
    if not residues:
        conservation.append(0.0)
        continue
    conservation.append(
        max(residues.count(aa) / len(residues) for aa in set(residues))
    )
# ====== Helper functions ======
def hex_to_rgb(hexstr):
    """Convert a six-digit hex colour string into an ``(r, g, b)`` tuple.

    Args:
        hexstr: Colour such as ``"e6f3ff"``. A single leading ``#`` is
            accepted and ignored, so ``"#e6f3ff"`` gives the same result.

    Returns:
        Tuple of three ints, one per channel, each parsed from two hex digits.

    Raises:
        ValueError: If any two-character channel slice is not valid hex
            (including when the string is too short).
    """
    # Tolerate the CSS-style "#rrggbb" spelling as well as bare "rrggbb".
    digits = hexstr[1:] if hexstr.startswith("#") else hexstr
    return int(digits[0:2], 16), int(digits[2:4], 16), int(digits[4:6], 16)
def rgb_to_hex(r, g, b):
    """Format three colour channels as a six-digit lowercase hex string.

    Args:
        r, g, b: Channel values. Each is rounded to the nearest integer and
            clamped to 0..255, so the result is always exactly six hex digits.

    Returns:
        String such as ``"e6f3ff"`` (no leading ``#``).
    """
    # Without clamping, 300 would format as "12c" and -1 as "-1", producing a
    # string that is not a valid colour.
    r, g, b = (min(255, max(0, int(round(c)))) for c in (r, g, b))
    return f"{r:02x}{g:02x}{b:02x}"
# Decode both gradient endpoints into integer RGB channels once, up front.
(lr, lg, lb), (dr, dg, db) = map(hex_to_rgb, (light_blue, dark_blue))
# Map ungapped (per-sequence) coordinates onto alignment columns
def build_ruvc_aligned(ruvc_dict, alignment):
    """Translate 1-based ungapped residue ranges into 0-based alignment columns.

    Args:
        ruvc_dict: Maps a sequence id to a list of inclusive ``(start, end)``
            residue ranges, 1-based and counted on the ungapped sequence.
        alignment: Iterable of records exposing ``id`` and ``seq``; ``"-"``
            marks a gap.

    Returns:
        Dict mapping sequence id to a list of inclusive
        ``(start_col, end_col)`` alignment column indices (0-based). Ids that
        are absent from the alignment, or whose ranges are all empty after
        clamping, are omitted.
    """
    result = {}
    for rec in alignment:
        ranges = ruvc_dict.get(rec.id)
        if not ranges:
            continue
        # residue_cols[k] is the alignment column holding the (k+1)-th residue.
        residue_cols = [col for col, ch in enumerate(str(rec.seq)) if ch != "-"]
        n_residues = len(residue_cols)
        aligned = []
        for start, end in ranges:
            # Clamp both ends into [1, n_residues]. Without the lower clamp a
            # start or end below 1 becomes a negative index and silently wraps
            # to the end of the sequence.
            start = max(start, 1)
            end = min(end, n_residues)
            if start > end:
                continue
            aligned.append((residue_cols[start - 1], residue_cols[end - 1]))
        if aligned:
            result[rec.id] = aligned
    return result
# Convert each RUVC table from ungapped positions to alignment columns.
RUVC1_aligned, RUVC2_aligned, RUVC3_aligned = (
    build_ruvc_aligned(table, alignment) for table in (RUVC1, RUVC2, RUVC3)
)
# Decide whether a position belongs to any RUVC region
def in_ruvc(seq_id, pos, aa):
    """Return True if column ``pos`` of ``seq_id`` lies inside any RUVC range.

    Args:
        seq_id: Sequence id used as the key into the aligned RUVC tables.
        pos: Alignment column index to test (range ends are inclusive).
        aa: Character at that column; a gap (``"-"``) is never in a region.

    Returns:
        True when ``pos`` falls within a range of RUVC1, RUVC2 or RUVC3 for
        this sequence, otherwise False.
    """
    if aa == "-":
        return False
    for table in (RUVC1_aligned, RUVC2_aligned, RUVC3_aligned):
        if any(lo <= pos <= hi for lo, hi in table.get(seq_id, [])):
            return True
    return False
# ====== Generate the HTML report ======
# The background colour depends only on a column's conservation, so compute it
# once per column instead of once per residue of every sequence.
column_bg = []
for cons in conservation:
    r = int(round(lr + (dr - lr) * cons))
    g = int(round(lg + (dg - lg) * cons))
    b = int(round(lb + (db - lb) * cons))
    column_bg.append("#" + rgb_to_hex(r, g, b))

# Extra inline style applied to RUVC residues (empty unless italics are on).
ruvc_extra = "font-style:italic;" if RUVC_italic else ""

with open(html_output, "w", encoding="utf-8") as out:
    out.write("<!doctype html><html lang='zh-CN'><head><meta charset='utf-8'>\n")
    out.write("<title>MSA - RUVC 高亮</title>\n")
    out.write("<style>\n")
    out.write("body{font-family: Consolas, monospace; padding:16px}\n")
    out.write("table { border-collapse: collapse; }\n")
    out.write("td.id { vertical-align: top; padding:4px 8px; white-space: nowrap; }\n")
    out.write("td.seq { vertical-align: top; padding:4px 8px; white-space: pre; }\n")
    out.write("span.res { display:inline-block; padding:0 1px; }\n")
    out.write("</style></head><body>\n")
    # The file name and the sequence ids come from outside the script; escape
    # them so characters such as "<" or "&" cannot break the markup.
    safe_name = html.escape(os.path.basename(alignment_file))
    out.write(f"<h2>多序列比对(RUVC 高亮) — {safe_name}</h2>\n")
    out.write("<div style='overflow-x:auto'><table>\n")
    for rec in alignment:
        seq_id = rec.id
        cells = []
        for i, aa in enumerate(str(rec.seq)):
            if aa == "-":
                # Gaps are drawn plain: white background, black text.
                bg, color, style_extra = "#ffffff", "#000000", ""
            elif in_ruvc(seq_id, i, aa):
                bg, color, style_extra = column_bg[i], RUVC_color, ruvc_extra
            else:
                bg, color, style_extra = column_bg[i], "#000000", ""
            cells.append(
                f"<span class='res' style='background-color:{bg};color:{color};{style_extra}'>{aa}</span>"
            )
        row = "".join(cells)
        # One write per sequence row rather than one per residue.
        out.write(
            f"<tr><td class='id'>{html.escape(seq_id)}</td><td class='seq'>{row}</td></tr>\n"
        )
    out.write("</table></div></body></html>\n")

print(f"已生成:{html_output}")

浙公网安备 33010602011771号