从基因组里面按照染色体和索引提取基因序列
简单而言，就是 pos 要减 1：数据里的 pos 是从 1 开始计数的坐标，而 pyfaidx 的整数索引是从 0 开始的。
pos=row["pos"]-1
seq = genome[chrom][pos]
from pyensembl import EnsemblRelease
from pyfaidx import Fasta
# Sanity check: for every variant row in `df`, fetch the base at (chrom, pos)
# from the local reference genome and compare it with the row's `ref` allele.
# Prints the fraction of rows whose `ref` matches the genome.
ensembl = EnsemblRelease(110)
# Load the local genome FASTA file.
genome = Fasta('../datasets_make/Homo_sapiens.GRCh38.dna.primary_assembly.fa')

sum_num = 0  # number of rows checked
suc = 0      # number of rows whose `ref` equals the genome base

# NOTE(review): `df` is defined outside this snippet; it is expected to
# provide "chrom", "pos" and "ref" columns -- confirm against the caller.
for idx, row in df.iterrows():
    # Use the chromosome label as a string key so that an integer-typed
    # column is looked up by name rather than by record position.
    chrom = str(row["chrom"])
    # `pos` is presumably a 1-based coordinate; sequence indexing is
    # 0-based, hence the -1. int() normalizes pandas/numpy scalars.
    pos = int(row["pos"]) - 1
    seq = genome[chrom][pos]
    # Case-insensitive comparison: the FASTA may contain soft-masked
    # (lower-case) bases.
    if str(seq).upper() == row["ref"].upper():
        suc += 1
    sum_num += 1

# Guard against an empty `df`, which would otherwise divide by zero.
if sum_num:
    print(suc / sum_num)
else:
    print("No rows to check.")

浙公网安备 33010602011771号