Python: fetching a base sequence from chromosome start/stop coordinates

At work we sometimes have a chromosome number together with start and stop coordinates, and we want to know what bases that interval contains.
Normally we would look the region up in the UCSC genome browser, but we can also query UCSC's DAS interface directly and parse the page it returns to extract the sequence.
For example, for chr17:7676091,7676196 we only need to build the following URL: http://genome.ucsc.edu/cgi-bin/das/hg38/dna?segment=chr17:7676091,7676196
Here hg38 can be swapped for hg19, and the part after dna?segment= can be changed following the same format; the server will then return the sequence we want. The response comes back as XML, which we can parse with Python.
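Before the full script below (which also writes the result into an Excel file), here is a minimal sketch of the same idea using nothing but requests and the standard-library xml.etree.ElementTree. The function name fetch_sequence is only illustrative and not part of the original script; the SEQUENCE/DNA tag names are the ones visible in the XML output further down.

import requests
import xml.etree.ElementTree as ET

def fetch_sequence(assembly, chrom, start, end):
    # Build the DAS URL from assembly, chromosome and coordinates
    url = (f"http://genome.ucsc.edu/cgi-bin/das/{assembly}/dna"
           f"?segment=chr{chrom}:{start},{end}")
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    # The reply looks like <DASDNA><SEQUENCE ...><DNA length="...">acgt...</DNA></SEQUENCE></DASDNA>
    root = ET.fromstring(r.text)
    seq = root.find("SEQUENCE/DNA").text
    return seq.replace("\n", "").upper()  # strip newlines, convert to uppercase

# Example: fetch_sequence("hg38", 17, 7676091, 7676196)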

import os
import re

import requests
from bs4 import BeautifulSoup
import xlwt
import xlrd
from xlutils.copy import copy

cwd = os.getcwd()

def getHTMLText(url):
    # Fetch the DAS page; return its text, or an empty string on any request error
    try:
        header = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, headers=header, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""

def fillDNAList(dnalist, html):
    # Use a regex to grab the header of the DNA record (the SEQUENCE element attributes)
    match = re.search(r'SEQUENCE([\s\S]*?version="1.00")', html)
    print("match ok")
    if match:
        # store the SEQUENCE header in the list
        dnalist.append(match.group())

    # Grab the DNA length attribute
    match = re.search(r'DNA.*?length="(\d*)"', html)
    if match:
        dnalist.append(match.group())

    # Use BeautifulSoup to pull out the string content of the <dna> tag
    print("Using BeautifulSoup to extract the string content of the <dna> tag")
    soup = BeautifulSoup(html, 'html.parser')
    seq = soup.dna.string
    seq = seq.replace('\n', '').upper()  # drop newlines and convert to uppercase
    # store the sequence in the list
    dnalist.append(seq)
    print("final======>", dnalist)
    return dnalist

def write_excell(dnalist, chrnum, pos):
    head = '>hg19' + ' ' + dnalist[0] + dnalist[1]
    f = xlwt.Workbook(encoding='utf-8', style_compression=0)  # create a new Excel workbook
    sheet = f.add_sheet('test8', cell_overwrite_ok=True)      # create a new sheet
    # write the header into the first row
    sheet.write(0, 0, head)
    # then, starting from the second row, write 50 characters per row
    dna = dnalist[2]
    print('=====', dna, type(dna))
    for i in range(0, len(dna), 50):
        sheet.write(i // 50 + 1, 0, dna[i:i + 50])

    out_file = 'chromosome%s_%s.xls' % (chrnum, pos)
    f.save(out_file)
    out_file_dir = os.path.join(cwd, out_file)
    return out_file_dir

def modify_excell(out_file_dir, chrnum, pos):
    '''
    Edit the Excel file (using the xlutils module)
    :return:
    '''
    rb = xlrd.open_workbook(out_file_dir)  # open the .xls file and create a workbook instance
    sheet = rb.sheet_by_index(0)
    nrow11 = sheet.cell_value(10, 0)  # row 11, column 1 (indices 10, 0)
    # prepend the opening bracket to the original cell content
    new_nrow11 = '[' + nrow11
    # do the same for row 12, appending the closing bracket
    nrow12 = sheet.cell_value(11, 0)
    new_nrow12 = nrow12 + ']'

    wb = copy(rb)
    ws = wb.get_sheet(0)
    # write the new concatenated strings back into the cells
    ws.write(10, 0, new_nrow11)
    ws.write(11, 0, new_nrow12)
    modify_file = 'new_chromosome%s_%s.xls' % (chrnum, pos)
    wb.save(modify_file)

def main():
    assembly = "hg19"
    chrnum = 17
    pos = 7676091
    start = pos - 500
    end = pos + 500
    position_DNA_list = []
    url = f"http://genome.ucsc.edu/cgi-bin/das/{assembly}/dna?segment=chr{chrnum}:{start},{end}"

    print(url)
    html = getHTMLText(url)
    dnalist = fillDNAList(position_DNA_list, html)
    out_file_dir = write_excell(dnalist, chrnum, pos)
    modify_excell(out_file_dir, chrnum, pos)

if __name__ == "__main__":
    main()

The output looks like this:

http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment=chr17:7675591,7676591

match ok
Using BeautifulSoup to extract the string content of the <dna> tag
final======> ['SEQUENCE id="chr17" start="7675591" stop="7676591" version="1.00"', 'DNA length="1001"', 'CCCAAGAGCCTTCAGTATACACATCAATAAAAATAATTTTAATTATTCTGATAAAAGATAAACATGAAAAGTTATGGTATGCAAAGTTGAATGACAACAACTGATACTATTTGAAATAATTGACAGAATTATATTCCGTAACAATTTATAAGCAAAGCCAAAAAAACAATGATCCCTTTGTTGAATGCACAGAACAAATCCATCTTGTCCACGGCTACTGAGCATGCCTGTGATCTCCAGGGGTCACTCAGGTTTGACTCAAAGGATCCAACAGCCTGTAGACCCTGTGCTTGAAGGCATGAGGGTCACCTCTGAGTTCACACTCACTAGTGTCCCTCCTTTCTTCAGAAAGCTAGGAACTGGGAAGACAAGGGGAAAATCAATCAAGGCCTGAGGTATGGGGCTGTAGGCTGGGAGGAAACTAACATTATTGAGAAGCTACTGATGTGAATACATTTCAATTACTACTCACATTGGTTTTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTTTTAAGACGGAGTTTTGCTCTCGTTGCCCAGGCTGGAGTGCAATGGAATGATCTAAGGTCACCACAACCTCCACCTCCCGGTTCAAGCAATTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGACTACAGGCGTGTGCCACCACACCCAGCTAAGTTTGTATTTTTTTAGTAGAGACGGTGTTTCACCATGTTGGTCAGGCTGGTCTCGAACTCCTGACCTCAAGTGATCCACCCACCTCGGCCTCCCAAAGTGCTGGGATTATAGGCATGAGCCACCACACCCAGCCTCACGTTGGTTTTTGAGATGGATTTTATTGCCATTTTGTACACAAAAAGGTCAAAACTCAGTGAGGTGAATTGACATGACAGTAAGTGAAAGAACTACTATCTGATTGGGGGTCTTCTGCCGCCTGCTCTGGGACTCTTTCTGCTATGACATGAAGGACATTGGCAACCCCAGTCCTTGCAGATTTCTTTCACTGTGTGC']
===== CCCAAGAGCCTTCAGTATACACATCAATAAAAATAATTTTAATTATTCTGATAAAAGATAAACATGAAAAGTTATGGTATGCAAAGTTGAATGACAACAACTGATACTATTTGAAATAATTGACAGAATTATATTCCGTAACAATTTATAAGCAAAGCCAAAAAAACAATGATCCCTTTGTTGAATGCACAGAACAAATCCATCTTGTCCACGGCTACTGAGCATGCCTGTGATCTCCAGGGGTCACTCAGGTTTGACTCAAAGGATCCAACAGCCTGTAGACCCTGTGCTTGAAGGCATGAGGGTCACCTCTGAGTTCACACTCACTAGTGTCCCTCCTTTCTTCAGAAAGCTAGGAACTGGGAAGACAAGGGGAAAATCAATCAAGGCCTGAGGTATGGGGCTGTAGGCTGGGAGGAAACTAACATTATTGAGAAGCTACTGATGTGAATACATTTCAATTACTACTCACATTGGTTTTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTTTTAAGACGGAGTTTTGCTCTCGTTGCCCAGGCTGGAGTGCAATGGAATGATCTAAGGTCACCACAACCTCCACCTCCCGGTTCAAGCAATTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGACTACAGGCGTGTGCCACCACACCCAGCTAAGTTTGTATTTTTTTAGTAGAGACGGTGTTTCACCATGTTGGTCAGGCTGGTCTCGAACTCCTGACCTCAAGTGATCCACCCACCTCGGCCTCCCAAAGTGCTGGGATTATAGGCATGAGCCACCACACCCAGCCTCACGTTGGTTTTTGAGATGGATTTTATTGCCATTTTGTACACAAAAAGGTCAAAACTCAGTGAGGTGAATTGACATGACAGTAAGTGAAAGAACTACTATCTGATTGGGGGTCTTCTGCCGCCTGCTCTGGGACTCTTTCTGCTATGACATGAAGGACATTGGCAACCCCAGTCCTTGCAGATTTCTTTCACTGTGTGC <class 'str'>
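
If a plain FASTA text file would do instead of an .xls sheet, the same header and 50-bases-per-line layout used in write_excell above can be written straight to disk; save_fasta below is only an illustrative sketch, not part of the original script.

def save_fasta(dnalist, chrnum, pos, build="hg19"):
    # dnalist is the list produced by fillDNAList: [SEQUENCE header, DNA length, sequence]
    header = ">" + build + " " + dnalist[0] + dnalist[1]
    seq = dnalist[2]
    out_file = "chromosome%s_%s.fa" % (chrnum, pos)
    with open(out_file, "w") as fh:
        fh.write(header + "\n")
        for i in range(0, len(seq), 50):  # 50 bases per line, same as write_excell
            fh.write(seq[i:i + 50] + "\n")
    return out_file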

  
