可视化结构域序列并提取序列

1、可视化

点击查看代码
from Bio import AlignIO
import os

# ====== 用户参数 ======
alignment_file = "比对.fa"    # 输入比对文件(fasta/clustal)
alignment_format = "fasta"
html_output = "msa_ruvc_all.html"

# 背景渐变蓝色(保守性)
light_blue = "ffffff"
dark_blue  = "ffff66"

# RUVC1/2/3 定义(基于 ungapped 序列位置,1-based)
RUVC1 = {
    "FnCas12a_6I1K_1": [(892, 953)],
    "LbCas12a_5ID6_1": [(809, 858)],
    "LbCas12a_6NME_1": [(808, 872)],
    "Lb2Cas12a_8I54_1": [(792, 852)],
    "ReChb_Cas12a_1": [(853, 914)],
}
RUVC2 = {
    "FnCas12a_6I1K_1": [(971, 1078)],
    "LbCas12a_5ID6_1": [(890, 1011)],
    "LbCas12a_6NME_1": [(890, 997)],
    "Lb2Cas12a_8I54_1": [(869, 992)],
    "ReChb_Cas12a_1": [(930, 1044)],
}
RUVC3 = {
    "FnCas12a_6I1K_1": [(1254, 1300)],
    "LbCas12a_5ID6_1": [(1138, 1228)],
    "LbCas12a_6NME_1": [(1179, 1228)],
    "Lb2Cas12a_8I54_1": [(1151, 1206)],
    "ReChb_Cas12a_1": [(1215, 1261)],
}

# RUVC 样式
RUVC_color = "#ff0000"  # 所有 RUVC 使用同一颜色
RUVC_italic = False     # 是否斜体

# ====== 读取比对 ======
alignment = AlignIO.read(alignment_file, alignment_format)
seq_len = alignment.get_alignment_length()

# 计算保守性
conservation = []
for i in range(seq_len):
    column = [rec.seq[i] for rec in alignment]
    chars = [aa for aa in column if aa != "-"]
    freq = max([chars.count(aa)/len(chars) for aa in set(chars)]) if chars else 0.0
    conservation.append(freq)

# ====== 辅助函数 ======
def hex_to_rgb(hexstr):
    return int(hexstr[0:2], 16), int(hexstr[2:4], 16), int(hexstr[4:6], 16)

def rgb_to_hex(r, g, b):
    return f"{r:02x}{g:02x}{b:02x}"

lr, lg, lb = hex_to_rgb(light_blue)
dr, dg, db = hex_to_rgb(dark_blue)

# 将 ungapped 座标映射到 alignment 座标
def build_ruvc_aligned(ruvc_dict, alignment):
    result = {}
    for rec in alignment:
        seq_id = rec.id
        seq = str(rec.seq)
        mapping = [i for i, ch in enumerate(seq) if ch != "-"]
        if seq_id in ruvc_dict:
            newranges = []
            seq_len = len(mapping)
            for s, e in ruvc_dict[seq_id]:
                if s > seq_len:
                    continue
                start_al = mapping[s-1]
                end_al   = mapping[min(e, seq_len)-1]
                if start_al <= end_al:
                    newranges.append((start_al, end_al))
            if newranges:
                result[seq_id] = newranges
    return result

RUVC1_aligned = build_ruvc_aligned(RUVC1, alignment)
RUVC2_aligned = build_ruvc_aligned(RUVC2, alignment)
RUVC3_aligned = build_ruvc_aligned(RUVC3, alignment)

# 判断某位置是否属于任意 RUVC
def in_ruvc(seq_id, pos, aa):
    if aa == "-":
        return False
    for ruvc_map in [RUVC1_aligned, RUVC2_aligned, RUVC3_aligned]:
        for start, end in ruvc_map.get(seq_id, []):
            if start <= pos <= end:
                return True
    return False

# ====== 生成 HTML ======
with open(html_output, "w", encoding="utf-8") as out:
    out.write("<!doctype html><html lang='zh-CN'><head><meta charset='utf-8'>\n")
    out.write("<title>MSA - RUVC 高亮</title>\n")
    out.write("""
    <style>
    body{font-family:Consolas,monospace;padding:16px}
    table{border-collapse:collapse;}
    td.id{vertical-align:top;padding:4px 8px;white-space:nowrap;}
    td.seq{vertical-align:top;padding:4px 8px;white-space:pre;font-size:16px;}
    span.res{display:inline-block;padding:0 1px;cursor:pointer;}
    #zoom-controls{margin-bottom:10px;}
    </style>
    <script>
    let currentZoom = 1.0;
    function zoomIn(){currentZoom += 0.1; updateZoom();}
    function zoomOut(){currentZoom = Math.max(0.5, currentZoom - 0.1); updateZoom();}
    function updateZoom(){
        document.querySelectorAll('td.seq').forEach(e=>{
            e.style.fontSize = (16 * currentZoom) + 'px';
        });
    }

    function showInfo(seqId, alignedPos, ungappedPos, aa){
        alert(`序列: ${seqId}\\n氨基酸: ${aa}\\n比对位置: ${alignedPos+1}\\n原序列位置: ${ungappedPos}`);
    }
    </script>
    </head><body>
    <h2>多序列比对(RUVC 高亮) — """ + os.path.basename(alignment_file) + """</h2>
    <div id='zoom-controls'>
        <button onclick='zoomIn()'>放大</button>
        <button onclick='zoomOut()'>缩小</button>
    </div>
    <div style='overflow-x:auto'><table>
    """)

    for rec in alignment:
        seq_id = rec.id
        seq = str(rec.seq)
        out.write("<tr>")
        out.write(f"<td class='id'>{seq_id}</td>")
        out.write("<td class='seq'>")

        ungapped_pos = 0
        for i, aa in enumerate(seq):
            if aa != "-":
                ungapped_pos += 1

            # 背景渐变白→荧光黄
            if aa == "-":
                bg = "#ffffff"
                color = "#000000"
                style_extra = ""
            else:
                cons = conservation[i]
                r = int(round(lr + (dr - lr) * cons))
                g = int(round(lg + (dg - lg) * cons))
                b = int(round(lb + (db - lb) * cons))
                bg = "#" + rgb_to_hex(r, g, b)
                # RUVC 标注
                if in_ruvc(seq_id, i, aa):
                    color = RUVC_color
                    style_extra = "font-style:italic;" if RUVC_italic else ""
                else:
                    color = "#000000"
                    style_extra = ""

            out.write(
                f"<span class='res' style='background-color:{bg};color:{color};{style_extra}' "
                f"onclick=\"showInfo('{seq_id}', {i}, {ungapped_pos if aa!='-' else 'null'}, '{aa}')\">{aa}</span>"
            )
        out.write("</td></tr>\n")

    out.write("</table></div></body></html>\n")

print(f"已生成:{html_output}")

2、提取结构域区域序列

点击查看代码
from Bio import AlignIO

# ====== 用户参数 ======
alignment_file = "比对.fa"    # 输入多序列比对文件
alignment_format = "fasta"
output_fasta = "Cas12a_RUVC1.fasta"  # 输出文件
start_pos = 1610
end_pos = 1865

# ====== 读取比对 ======
alignment = AlignIO.read(alignment_file, alignment_format)
seq_len = alignment.get_alignment_length()

print(f"总比对长度:{seq_len}")
if end_pos > seq_len:
    raise ValueError(f"区间 {end_pos} 超出比对长度 {seq_len}")

# ====== 提取区间并去掉空位符 ======
records = []
for rec in alignment:
    seq_id = rec.id
    seq_str = str(rec.seq)

    # 截取对应区间(1-based → Python 0-based)
    sub_seq = seq_str[start_pos-1:end_pos]
    # 去掉空位符
    sub_seq_nogap = sub_seq.replace("-", "")

    records.append((seq_id, sub_seq_nogap))

# ====== 输出为 FASTA ======
with open(output_fasta, "w", encoding="utf-8") as f:
    for seq_id, seq in records:
        f.write(f">{seq_id}\n{seq}\n")

print(f"✅ 已生成文件:{output_fasta}")
print(f"提取区间:{start_pos}-{end_pos},共 {len(records)} 条序列。")

posted @ 2025-11-06 12:38  Zarinan  阅读(3)  评论(0)    收藏  举报