关于DNA和蛋白质的对应

  1. 如果是都在编码区,那么肯定是严格的翻译关系。
  2. 对于DNA我们需要确定阅读框在哪,可能是0,1,2。
  3. DNA是有反链的,也有可能是反链的0,1,2,也有可能对应上。
  4. 最后比较重合部分,也就是最长公共子串。
    image
def get_reverse_complement(dna_sequence):
    """
    获取DNA序列的反向互补链
    
    参数:
        dna_sequence: DNA序列字符串
    
    返回:
        反向互补链序列字符串
    """
    complement_map = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    reverse_seq = dna_sequence[::-1]  # 反向
    reverse_complement = ''.join(complement_map.get(base, base) for base in reverse_seq.upper())
    return reverse_complement

def translate_dna(dna_sequence, start_index=0):
    """
    从指定起始位置翻译DNA序列为蛋白质序列
    
    参数:
        dna_sequence: DNA序列字符串
        start_index: 起始位置(0, 1, 2)
    
    返回:
        蛋白质序列字符串
    """
    # 密码子表 (标准遗传密码)
    codon_table = {
        'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
        'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
        'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
        'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
        'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
        'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
        'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
        'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
        'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
        'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
        'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
        'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*',
        'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W',
    }
    
    protein = ""
    dna_upper = dna_sequence.upper()
    
    # 从指定起始位置开始,每次取3个碱基
    for i in range(start_index, len(dna_upper) - 2, 3):
        codon = dna_upper[i:i+3]
        if len(codon) == 3:
            amino_acid = codon_table.get(codon, '?')
            protein += amino_acid
    
    return protein

def find_overlap(seq1, seq2):
    """
    查找两个序列的最长公共子串
    
    参数:
        seq1: 序列1
        seq2: 序列2
    
    返回:
        最长公共子串,起始位置(seq1中),起始位置(seq2中),长度
    """
    m = len(seq1)
    n = len(seq2)
    max_len = 0
    end_pos_seq1 = 0
    end_pos_seq2 = 0
    
    # 创建动态规划表
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if seq1[i-1] == seq2[j-1]:
                dp[i][j] = dp[i-1][j-1] + 1
                if dp[i][j] > max_len:
                    max_len = dp[i][j]
                    end_pos_seq1 = i
                    end_pos_seq2 = j
    
    if max_len == 0:
        return "", 0, 0, 0
    
    # 提取最长公共子串
    lcs = seq1[end_pos_seq1 - max_len:end_pos_seq1]
    start_seq1 = end_pos_seq1 - max_len
    start_seq2 = end_pos_seq2 - max_len
    
    return lcs, start_seq1, start_seq2, max_len

def find_all_matches(seq1, seq2, min_length=3):
    """
    查找两个序列之间所有长度≥min_length的匹配子串
    
    参数:
        seq1: 序列1
        seq2: 序列2
        min_length: 最小匹配长度
    
    返回:
        匹配子串列表,每个元素为字典,包含起始位置、长度和序列
    """
    matches = []
    
    # 使用滑动窗口方法查找所有匹配
    for i in range(len(seq1)):
        for j in range(len(seq2)):
            k = 0
            while (i + k < len(seq1) and 
                   j + k < len(seq2) and 
                   seq1[i+k] == seq2[j+k]):
                k += 1
            if k >= min_length:
                matches.append({
                    'seq1_start': i,
                    'seq2_start': j,
                    'length': k,
                    'sequence': seq1[i:i+k]
                })
    
    # 如果没有找到匹配,返回空列表
    if not matches:
        return []
    
    # 去重,保留最长的匹配
    matches.sort(key=lambda x: x['length'], reverse=True)
    unique_matches = []
    seen_positions = set()
    for match in matches:
        pos_key = (match['seq1_start'], match['seq2_start'])
        if pos_key not in seen_positions:
            seen_positions.add(pos_key)
            unique_matches.append(match)
    
    return unique_matches

def compare_all_frames(dna_sequence, protein_sequence):
    """
    主函数:翻译6个阅读框并与给定蛋白质序列比较
    
    参数:
        dna_sequence: DNA序列
        protein_sequence: 蛋白质序列
    
    返回:
        包含比较结果的字典
    """
    results = {}
    
    # 获取反向互补链
    reverse_complement = get_reverse_complement(dna_sequence)
    
    # 定义链名称
    strands = [
        ("正向链", dna_sequence),
        ("反向互补链", reverse_complement)
    ]
    
    # 对每条链的3个阅读框进行翻译
    for strand_name, strand_seq in strands:
        for frame in range(3):
            key = f"{strand_name}_阅读框{frame}"
            translated_protein = translate_dna(strand_seq, frame)
            
            # 计算与目标蛋白质序列的最长公共子串
            longest_overlap, start_trans, start_target, length = find_overlap(translated_protein, protein_sequence)
            
            # 计算相似度
            similarity = 0
            if len(translated_protein) > 0 and len(protein_sequence) > 0:
                min_len = min(len(translated_protein), len(protein_sequence))
                similarity = (length / min_len) * 100 if min_len > 0 else 0
            
            # 查找所有重合部分
            all_matches = find_all_matches(translated_protein, protein_sequence, min_length=3)
            
            # 存储结果
            results[key] = {
                'translated_protein': translated_protein,
                'longest_overlap': longest_overlap,
                'overlap_length': length,
                'similarity_percent': similarity,
                'all_matches': all_matches,
                'frame': frame,
                'strand': strand_name,
                'translation_start': start_trans if longest_overlap else None,
                'target_start': start_target if longest_overlap else None
            }
    
    return results

def print_detailed_results(dna_seq, protein_seq, results):
    """
    打印详细的比较结果
    
    参数:
        dna_seq: DNA序列
        protein_seq: 蛋白质序列
        results: 比较结果字典
    """
    print("=" * 80)
    print("DNA序列分析结果 (6个阅读框)")
    print("=" * 80)
    print(f"DNA序列长度: {len(dna_seq)}")
    print(f"DNA序列: {dna_seq}")
    print(f"DNA反向互补序列: {get_reverse_complement(dna_seq)}")
    print(f"目标蛋白质序列长度: {len(protein_seq)}")
    print(f"目标蛋白质序列: {protein_seq}")
    print()
    
    # 按链分组显示结果
    strands = ["正向链", "反向互补链"]
    
    for strand in strands:
        print(f"{strand}:")
        print("-" * 40)
        
        for frame in range(3):
            key = f"{strand}_阅读框{frame}"
            data = results[key]
            
            print(f"  阅读框 {frame}:")
            print(f"    翻译的蛋白质序列: {data['translated_protein']}")
            print(f"    翻译序列长度: {len(data['translated_protein'])}")
            
            if data['longest_overlap']:
                print(f"    最长重合部分: {data['longest_overlap']}")
                print(f"    重合长度: {data['overlap_length']}")
                print(f"    在翻译序列中起始位置: {data['translation_start']}")
                print(f"    在目标序列中起始位置: {data['target_start']}")
                print(f"    相似度: {data['similarity_percent']:.2f}%")
            else:
                print(f"    最长重合部分: 无")
                print(f"    重合长度: 0")
                print(f"    相似度: 0%")
            
            if data['all_matches']:
                print(f"    所有重合部分 (长度≥3):")
                for i, match in enumerate(data['all_matches'][:3], 1):  # 只显示前3个
                    print(f"      {i}. 位置: 翻译序列[{match['seq1_start']}]-目标序列[{match['seq2_start']}], "
                          f"长度: {match['length']}, 序列: {match['sequence']}")
                if len(data['all_matches']) > 3:
                    print(f"      ... 以及 {len(data['all_matches']) - 3} 个其他匹配")
            else:
                print(f"    所有重合部分 (长度≥3): 无")
            print()

def find_best_match(results, protein_sequence):
    """
    在所有阅读框中找到与目标蛋白质序列最佳匹配的结果
    
    参数:
        results: 比较结果字典
        protein_sequence: 目标蛋白质序列
    
    返回:
        最佳匹配的结果
    """
    best_match = None
    best_length = 0
    
    for key, data in results.items():
        if data['overlap_length'] > best_length:
            best_length = data['overlap_length']
            best_match = (key, data)
    
    return best_match

def print_summary(dna_seq, protein_seq, results):
    """
    打印简要总结,突出显示最佳匹配
    
    参数:
        dna_seq: DNA序列
        protein_seq: 蛋白质序列
        results: 比较结果字典
    """
    print("=" * 60)
    print("简要总结")
    print("=" * 60)
    
    # 找到最佳匹配
    best_match_info = find_best_match(results, protein_seq)
    
    if best_match_info:
        best_key, best_data = best_match_info
        print(f"最佳匹配: {best_key}")
        print(f"匹配长度: {best_data['overlap_length']}")
        print(f"匹配序列: {best_data['longest_overlap']}")
        print(f"相似度: {best_data['similarity_percent']:.2f}%")
        
        # 显示翻译框架的详细信息
        if best_data['strand'] == "正向链":
            print(f"翻译框架: 从DNA位置{best_data['frame']}开始")
        else:
            print(f"翻译框架: 反向互补链,从位置{best_data['frame']}开始")
    else:
        print("未找到匹配")
    
    print()
    
    # 显示所有阅读框的匹配长度
    print("所有阅读框匹配情况:")
    print("-" * 40)
    
    strands = ["正向链", "反向互补链"]
    for strand in strands:
        print(f"{strand}:")
        for frame in range(3):
            key = f"{strand}_阅读框{frame}"
            data = results[key]
            print(f"  阅读框{frame}: 匹配长度={data['overlap_length']}, 相似度={data['similarity_percent']:.2f}%")

# 示例使用
if __name__ == "__main__":
    # 示例1: 短序列
    print("示例1: 短序列分析")
    print("=" * 60)
    #|
    # 示例DNA序列
    dna_example = "GGACCAGCCGTTTTGGTTGTCATGCAGCCGGTAGCGGCAGGTGTAGTGACCTCCATCCCCCAGAGCCACCGCGTTCAGGTGAAAGAAGATGCGATCTGGGCTGGTGCTGCTCCTGGGTACCAGCAGCT"
    
    # 示例蛋白质序列
    protein_example = "PPPPVLMHHGESSQVLHPGNKVTLTCVAPLSGVDFQLRRGEKELLVPRSSTSPDRIFFHLNAVALGDGGHYTCRYRLHDNQNGWSGDSAPVELILSDETLPAPEFSPEPESGRALRLRCLAPLEGARF"
    
    # 比较所有6个阅读框
    comparison_results = compare_all_frames(dna_example, protein_example)
    
    # 打印详细结果
    print_detailed_results(dna_example, protein_example, comparison_results)
    
    # 打印简要总结
    print_summary(dna_example, protein_example, comparison_results)
    
   
posted @ 2025-12-16 21:43  ylifs  阅读(2)  评论(0)    收藏  举报