寒假学习笔记1.18

一、 编译器前端:词法分析与语法分析

  1. 词法分析器(Lexer)
    词法单元定义
    python
    import re
    from enum import Enum

class TokenType(Enum):
    """Kinds of lexical tokens recognised in the assembly source.

    Member values are grouped into numeric ranges by category so related
    kinds stay adjacent.
    """

    # Identifiers and literal constants
    IDENTIFIER = 1
    INTEGER = 2
    HEX = 3
    STRING = 4

    # Instructions, pseudo-instructions and labels
    INSTRUCTION = 10
    PSEUDO_OP = 11
    LABEL = 12

    # Registers
    REGISTER = 20

    # Punctuation
    COMMA = 30
    COLON = 31
    LBRACKET = 32
    RBRACKET = 33
    PLUS = 34
    MINUS = 35

    # Keywords
    SECTION = 40
    GLOBAL = 41
    EXTERN = 42

    # End-of-input and end-of-line markers
    EOF = 100
    NEWLINE = 101

词法分析实现
python
class Lexer:
    """Character-level scanner that turns assembly source text into tokens.

    The scanner keeps a cursor (``position``) plus 1-based ``line`` and
    ``column`` counters, and classifies words against three keyword sets
    (instructions, registers, pseudo-ops).

    ``Token``, ``TokenType`` and ``LexerError`` are not defined in this class;
    they must be provided by the surrounding module.
    """

    # Single-character punctuation mapped to the *name* of its TokenType
    # member.  The member is looked up when a token is produced, so defining
    # this class does not require TokenType to exist yet.
    _PUNCTUATION = {
        ',': 'COMMA',
        ':': 'COLON',
        '[': 'LBRACKET',
        ']': 'RBRACKET',
        '+': 'PLUS',
        '-': 'MINUS',
    }

    def __init__(self, source_code):
        """Start scanning ``source_code`` at its first character."""
        self.source = source_code
        self.position = 0
        self.line = 1
        self.column = 1
        self.current_char = self.source[0] if self.source else None

        # Instruction mnemonics
        self.instructions = {
            'MOV', 'ADD', 'SUB', 'MUL', 'DIV', 'AND', 'OR', 'NOT',
            'PUSH', 'POP', 'CALL', 'RET', 'JMP', 'JE', 'JNE', 'JG',
            'CMP', 'INT', 'IRET', 'IN', 'OUT', 'HLT'
        }

        # Register names
        self.registers = {
            'AX', 'BX', 'CX', 'DX', 'SI', 'DI', 'BP', 'SP',
            'CS', 'DS', 'SS', 'ES', 'IP', 'FLAGS'
        }

        # Pseudo-instructions (assembler directives)
        self.pseudo_ops = {
            'DB', 'DW', 'DD', 'DQ', 'TIMES', 'EQU', 'RESB', 'RESW',
            'SECTION', 'GLOBAL', 'EXTERN', 'BITS', 'ORG'
        }

    def advance(self):
        """Move the cursor forward by one character.

        The line counter is bumped when the cursor *leaves* a newline, so a
        newline character is attributed to the line it terminates and a
        newline at the very start of the input is counted as well.
        """
        if self.current_char == '\n':
            self.line += 1
            self.column = 1
        else:
            self.column += 1

        self.position += 1
        if self.position >= len(self.source):
            self.current_char = None
        else:
            self.current_char = self.source[self.position]

    def peek(self):
        """Return the character after the current one without advancing.

        Returns ``None`` when there is no such character.
        """
        pos = self.position + 1
        return self.source[pos] if pos < len(self.source) else None

    def skip_whitespace(self):
        """Skip whitespace up to, but not including, the next newline.

        Newlines are significant: they are turned into NEWLINE tokens by
        ``get_next_token`` so the parser can see where a statement ends.
        """
        while (self.current_char is not None
               and self.current_char != '\n'
               and self.current_char.isspace()):
            self.advance()

    def skip_comment(self):
        """Skip a ``;`` comment; the terminating newline is left in place."""
        if self.current_char == ';':
            while self.current_char is not None and self.current_char != '\n':
                self.advance()

    def get_next_token(self):
        """Scan and return the next token.

        Returns a ``Token`` of type EOF once the input is exhausted.
        Raises ``LexerError`` for a character that starts no known token.
        """
        while self.current_char is not None:
            ch = self.current_char

            # End of line terminates a statement, so it becomes a token.
            if ch == '\n':
                line, column = self.line, self.column
                self.advance()
                return Token(TokenType.NEWLINE, '\n', line, column)

            # Skip blanks and comments
            if ch.isspace():
                self.skip_whitespace()
                continue

            if ch == ';':
                self.skip_comment()
                continue

            # Identifiers and keywords; '.' is accepted so that section names
            # such as ".text" can be written.
            if ch.isalpha() or ch in '_.':
                return self.get_identifier()

            # Numbers
            if ch.isdigit():
                return self.get_number()

            # String literals
            if ch == '"':
                return self.get_string()

            # Punctuation: remember where the token starts before advancing.
            kind = self._PUNCTUATION.get(ch)
            if kind is not None:
                line, column = self.line, self.column
                self.advance()
                return Token(TokenType[kind], ch, line, column)

            # Unknown character
            raise LexerError(f"未知字符: {ch}", self.line, self.column)

        return Token(TokenType.EOF, '', self.line, self.column)

    def get_identifier(self):
        """Scan a word and classify it.

        Instructions, pseudo-ops and registers are matched case-insensitively
        and returned upper-cased; any other word is an IDENTIFIER returned
        with its original spelling.
        """
        start_pos = self.position
        start_line, start_col = self.line, self.column

        while (self.current_char is not None
               and (self.current_char.isalnum() or self.current_char in '_.')):
            self.advance()

        value = self.source[start_pos:self.position]
        keyword = value.upper()

        if keyword in self.instructions:
            return Token(TokenType.INSTRUCTION, keyword, start_line, start_col)

        if keyword in self.pseudo_ops:
            return Token(TokenType.PSEUDO_OP, keyword, start_line, start_col)

        if keyword in self.registers:
            return Token(TokenType.REGISTER, keyword, start_line, start_col)

        return Token(TokenType.IDENTIFIER, value, start_line, start_col)

    def get_number(self):
        """Scan a decimal integer or a ``0x``-prefixed hexadecimal literal.

        The token value is the literal's source text (including ``0x``).
        Raises ``LexerError`` when ``0x`` is not followed by a hex digit.
        """
        start_pos = self.position
        start_line, start_col = self.line, self.column

        # Hexadecimal literal
        if self.current_char == '0' and (self.peek() or '').lower() == 'x':
            self.advance()  # skip '0'
            self.advance()  # skip 'x'
            digits_start = self.position

            while (self.current_char is not None
                   and (self.current_char.isdigit()
                        or self.current_char.lower() in 'abcdef')):
                self.advance()

            value = self.source[start_pos:self.position]
            if self.position == digits_start:
                # A bare "0x" cannot be converted to a number later on.
                raise LexerError(f"无效的十六进制数: {value}", start_line, start_col)
            return Token(TokenType.HEX, value, start_line, start_col)

        # Decimal literal
        while self.current_char is not None and self.current_char.isdigit():
            self.advance()

        value = self.source[start_pos:self.position]
        return Token(TokenType.INTEGER, value, start_line, start_col)

    def get_string(self):
        """Scan a double-quoted string literal and decode its escapes.

        Recognised escapes are ``\\n``, ``\\t``, ``\\"`` and ``\\\\``; any
        other escaped character stands for itself.
        Raises ``LexerError`` when the closing quote is missing, including
        when the input ends right after a backslash.
        """
        start_line, start_col = self.line, self.column
        escapes = {'n': '\n', 't': '\t', '"': '"', '\\': '\\'}

        self.advance()  # skip the opening quote

        parts = []
        while self.current_char is not None and self.current_char != '"':
            if self.current_char == '\\':  # escape sequence
                self.advance()
                if self.current_char is None:
                    # Input ended inside the escape; reported below.
                    break
                parts.append(escapes.get(self.current_char, self.current_char))
            else:
                parts.append(self.current_char)
            self.advance()

        if self.current_char != '"':
            raise LexerError("未终止的字符串", start_line, start_col)

        self.advance()  # skip the closing quote

        return Token(TokenType.STRING, ''.join(parts), start_line, start_col)
  2. 语法分析器(Parser)
    抽象语法树节点
    python
    class ASTNode:
        """Base class of every AST node; stores the node's type tag."""

        def __init__(self, node_type):
            self.node_type = node_type

        def __repr__(self):
            return f"{self.__class__.__name__}()"


    class ProgramNode(ASTNode):
        """Root node holding the program's statements in source order."""

        def __init__(self, statements):
            super().__init__("PROGRAM")
            self.statements = statements

        def __repr__(self):
            return f"ProgramNode({len(self.statements)} statements)"


    class InstructionNode(ASTNode):
        """A machine instruction: opcode, operand list and size value."""

        def __init__(self, opcode, operands=None, size=0):
            super().__init__("INSTRUCTION")
            self.opcode = opcode
            # A falsy argument (None or an empty list) becomes a fresh list,
            # so instances never share a default operand list.
            self.operands = operands or []
            self.size = size

        def __repr__(self):
            operands_str = ', '.join(str(op) for op in self.operands)
            return f"InstructionNode({self.opcode} {operands_str})"


    class LabelNode(ASTNode):
        """A label definition identified by its name."""

        def __init__(self, name):
            super().__init__("LABEL")
            self.name = name

        def __repr__(self):
            return f"LabelNode({self.name})"


    class DirectiveNode(ASTNode):
        """A pseudo-instruction (directive) together with its arguments."""

        def __init__(self, directive, args=None):
            super().__init__("DIRECTIVE")
            self.directive = directive
            self.args = args or []

        def __repr__(self):
            args_str = ', '.join(str(arg) for arg in self.args)
            return f"DirectiveNode({self.directive} {args_str})"


    class OperandNode(ASTNode):
        """An instruction operand: its kind, value and addressing mode."""

        def __init__(self, operand_type, value, addressing_mode=None):
            super().__init__("OPERAND")
            self.operand_type = operand_type
            self.value = value
            self.addressing_mode = addressing_mode

        def __repr__(self):
            return f"OperandNode({self.operand_type}, {self.value}, {self.addressing_mode})"

递归下降语法分析器
python
class Parser:
    """Recursive-descent parser that builds an AST from lexer tokens.

    While parsing, the parser also keeps a running ``current_address`` and
    records the address of every label in ``symbol_table``.

    ``TokenType``, ``ParserError`` and the AST node classes are not defined
    in this class; they must be provided by the surrounding module.
    """

    def __init__(self, lexer):
        """Bind the parser to ``lexer`` and read the first token."""
        self.lexer = lexer
        self.current_token = self.lexer.get_next_token()
        self.symbol_table = {}
        self.current_address = 0

    def eat(self, token_type):
        """Consume the current token if it has ``token_type``.

        Raises ``ParserError`` when the current token has another type.
        """
        if self.current_token.type == token_type:
            self.current_token = self.lexer.get_next_token()
        else:
            raise ParserError(
                f"期望 {token_type},但得到 {self.current_token.type}",
                self.current_token.line,
                self.current_token.column
            )

    def parse(self):
        """Parse the whole program and return a ``ProgramNode``."""
        statements = []

        while self.current_token.type != TokenType.EOF:
            # Skip blank lines
            while self.current_token.type == TokenType.NEWLINE:
                self.eat(TokenType.NEWLINE)

            if self.current_token.type == TokenType.EOF:
                break

            statement = self.parse_statement()
            if statement:
                statements.append(statement)

        return ProgramNode(statements)

    def parse_statement(self):
        """Parse one statement: a label, a directive or an instruction.

        Raises ``ParserError`` when the current token starts none of them.
        """
        # A label is an identifier immediately followed by a colon, so one
        # token of look-ahead is needed to tell it from other statements.
        if (self.current_token.type == TokenType.IDENTIFIER
                and self.peek_next_non_newline_token().type == TokenType.COLON):
            return self.parse_label()

        if self.current_token.type == TokenType.PSEUDO_OP:
            return self.parse_directive()

        if self.current_token.type == TokenType.INSTRUCTION:
            return self.parse_instruction()

        raise ParserError(
            f"无效语句开始: {self.current_token.value}",
            self.current_token.line,
            self.current_token.column
        )

    def parse_label(self):
        """Parse ``name:`` and record the label at the current address."""
        name = self.current_token.value
        self.eat(TokenType.IDENTIFIER)
        self.eat(TokenType.COLON)

        self.symbol_table[name] = self.current_address

        # A label occupies no space but still gets a node in the AST.
        return LabelNode(name)

    def parse_instruction(self):
        """Parse an instruction and its comma-separated operands."""
        opcode = self.current_token.value
        self.eat(TokenType.INSTRUCTION)

        operands = []
        if self.current_token.type not in [TokenType.NEWLINE, TokenType.EOF]:
            operands.append(self.parse_operand())

            while self.current_token.type == TokenType.COMMA:
                self.eat(TokenType.COMMA)
                operands.append(self.parse_operand())

        # Simplified size estimate, used to advance the address counter.
        # NOTE(review): SYMBOL operands add nothing here while the code
        # generator emits four bytes for them - confirm which is intended.
        size = self.calculate_instruction_size(opcode, operands)
        self.current_address += size

        if self.current_token.type == TokenType.NEWLINE:
            self.eat(TokenType.NEWLINE)

        return InstructionNode(opcode, operands, size)

    def parse_operand(self):
        """Parse a register, immediate, symbol or memory operand.

        Raises ``ParserError`` when the current token starts no operand.
        """
        # Register
        if self.current_token.type == TokenType.REGISTER:
            reg = self.current_token.value
            self.eat(TokenType.REGISTER)
            return OperandNode("REGISTER", reg, "REG_DIRECT")

        # Immediate value
        if self.current_token.type in [TokenType.INTEGER, TokenType.HEX]:
            value = self.current_token.value
            self.eat(self.current_token.type)

            # A '[' right after the number makes the number the base of a
            # memory operand; it was already consumed, so pass it along.
            if self.current_token.type == TokenType.LBRACKET:
                self.eat(TokenType.LBRACKET)
                return self.parse_memory_address(value)

            return OperandNode("IMMEDIATE", value, "IMMEDIATE")

        # Identifier (a label or another symbol)
        if self.current_token.type == TokenType.IDENTIFIER:
            value = self.current_token.value
            self.eat(TokenType.IDENTIFIER)

            if self.current_token.type == TokenType.LBRACKET:
                self.eat(TokenType.LBRACKET)
                return self.parse_memory_address(value)

            return OperandNode("SYMBOL", value, "SYMBOL")

        # Bracketed memory operand
        if self.current_token.type == TokenType.LBRACKET:
            self.eat(TokenType.LBRACKET)
            return self.parse_memory_address()

        raise ParserError(
            f"无效操作数: {self.current_token.value}",
            self.current_token.line,
            self.current_token.column
        )

    def parse_memory_address(self, base_value=None):
        """Parse the inside of a memory operand, after its ``[``.

        ``base_value`` is a base that the caller has already consumed; when
        it is ``None`` the base is read from the token stream.  An optional
        ``+``/``-`` numeric displacement may follow, then the closing ``]``.
        Raises ``ParserError`` on a malformed address.
        """
        # Base
        if base_value is not None:
            base = base_value
        elif self.current_token.type == TokenType.REGISTER:
            base = self.current_token.value
            self.eat(TokenType.REGISTER)
        elif self.current_token.type in [TokenType.IDENTIFIER, TokenType.INTEGER, TokenType.HEX]:
            base = self.current_token.value
            self.eat(self.current_token.type)
        else:
            raise ParserError("无效内存地址", self.current_token.line, self.current_token.column)

        # Optional displacement
        offset = None
        if self.current_token.type in [TokenType.PLUS, TokenType.MINUS]:
            sign = 1 if self.current_token.type == TokenType.PLUS else -1
            self.eat(self.current_token.type)

            if self.current_token.type not in [TokenType.INTEGER, TokenType.HEX]:
                raise ParserError("偏移量必须是数字", self.current_token.line, self.current_token.column)

            # Base 0 lets int() honour the "0x" prefix of HEX tokens.
            radix = 0 if self.current_token.type == TokenType.HEX else 10
            offset = sign * int(self.current_token.value, radix)
            self.eat(self.current_token.type)

        # Closing bracket
        self.eat(TokenType.RBRACKET)

        # Addressing mode depends on whether the base is a register.
        if isinstance(base, str) and base.upper() in self.lexer.registers:
            addressing_mode = "REG_INDIRECT"
            if offset is not None:
                addressing_mode = "REG_INDIRECT_DISPLACEMENT"
        else:
            addressing_mode = "MEM_DIRECT"
            if offset is not None:
                addressing_mode = "MEM_DIRECT_DISPLACEMENT"

        return OperandNode("MEMORY", {"base": base, "offset": offset}, addressing_mode)

    def parse_directive(self):
        """Parse a pseudo-instruction and its arguments up to end of line.

        Strings and identifiers are kept as ``str``; numbers become ``int``.
        Raises ``ParserError`` for any other argument token.
        """
        directive = self.current_token.value
        self.eat(TokenType.PSEUDO_OP)

        args = []

        while self.current_token.type not in [TokenType.NEWLINE, TokenType.EOF]:
            if self.current_token.type == TokenType.STRING:
                args.append(self.current_token.value)
                self.eat(TokenType.STRING)
            elif self.current_token.type == TokenType.INTEGER:
                args.append(int(self.current_token.value))
                self.eat(TokenType.INTEGER)
            elif self.current_token.type == TokenType.HEX:
                args.append(int(self.current_token.value, 16))
                self.eat(TokenType.HEX)
            elif self.current_token.type == TokenType.IDENTIFIER:
                args.append(self.current_token.value)
                self.eat(TokenType.IDENTIFIER)
            elif self.current_token.type == TokenType.COMMA:
                self.eat(TokenType.COMMA)
            else:
                raise ParserError(
                    f"无效伪指令参数: {self.current_token.value}",
                    self.current_token.line,
                    self.current_token.column
                )

        # Advance the address counter by the space the directive takes.
        size = self.calculate_directive_size(directive, args)
        self.current_address += size

        if self.current_token.type == TokenType.NEWLINE:
            self.eat(TokenType.NEWLINE)

        return DirectiveNode(directive, args)

    def calculate_instruction_size(self, opcode, operands):
        """Return a simplified size estimate for an instruction, in bytes.

        Two bytes for the opcode, plus one per register operand, four per
        immediate, four per memory operand and two more when the memory
        operand is register-indirect with a displacement.
        """
        size = 2  # the opcode is assumed to take two bytes

        for operand in operands:
            if operand.operand_type == "REGISTER":
                size += 1
            elif operand.operand_type == "IMMEDIATE":
                size += 4  # immediates are assumed to take four bytes
            elif operand.operand_type == "MEMORY":
                size += 4  # the address takes four bytes
                if operand.addressing_mode == "REG_INDIRECT_DISPLACEMENT":
                    size += 2  # displacement

        return size

    def calculate_directive_size(self, directive, args):
        """Return the number of bytes a directive occupies.

        DB counts one byte per number and ``len + 1`` per string (for the
        terminating zero); DW/DD/DQ count 2/4/8 bytes per argument;
        RESB/RESW reserve ``count`` bytes/words.  Other directives take no
        space.  Raises ``ParserError`` when a RESB/RESW count is not a
        number.
        """
        if directive == "DB":
            return sum(len(arg) + 1 if isinstance(arg, str) else 1
                       for arg in args)

        unit_sizes = {"DW": 2, "DD": 4, "DQ": 8}
        if directive in unit_sizes:
            return len(args) * unit_sizes[directive]

        reserve_units = {"RESB": 1, "RESW": 2}
        if directive in reserve_units:
            count = args[0] if args else 0
            if not isinstance(count, int):
                # Identifiers arrive as str and cannot be added to an address.
                raise ParserError(
                    f"保留数量必须是数字: {count}",
                    self.current_token.line,
                    self.current_token.column
                )
            return count * reserve_units[directive]

        # All other directives occupy no space.
        return 0

    def peek_next_non_newline_token(self):
        """Return the token after the current one, skipping NEWLINE tokens.

        The lexer is run ahead and then restored to its saved state, so
        neither the lexer nor ``current_token`` is changed by the call.
        """
        lexer = self.lexer
        saved_state = (lexer.position, lexer.line, lexer.column, lexer.current_char)

        try:
            token = lexer.get_next_token()
            while token.type == TokenType.NEWLINE:
                token = lexer.get_next_token()
        finally:
            # Restore even when scanning ahead fails, so the error surfaces
            # again at the right place during normal parsing.
            (lexer.position, lexer.line,
             lexer.column, lexer.current_char) = saved_state

        return token

二、 编译器后端:代码生成与优化

  1. 代码生成器
    机器码生成
    python
    class CodeGenerator:
        """Emit bytes for a parsed program into code/data/bss byte lists.

        Symbol references are emitted as zero placeholders and recorded in
        ``relocation_table``; ``finalize_code`` patches them afterwards.
        ``self.address`` counts every emitted byte across all segments.
        Relocations are located through that counter, which assumes the code
        segment is emitted before any data that needs patching.

        ``CodeGenError`` and the AST node classes are not defined in this
        class; they must be provided by the surrounding module.
        """

        def __init__(self, symbol_table):
            """Create a generator that resolves names via ``symbol_table``."""
            self.symbol_table = symbol_table
            self.code_segment = []  # code segment
            self.data_segment = []  # initialised data segment
            self.bss_segment = []  # uninitialised data segment
            self.current_segment = "CODE"
            self.address = 0
            self.relocation_table = []  # relocation entries

            # Opcode table (simplified)
            self.opcode_map = {
                'MOV': 0x88,
                'ADD': 0x00,
                'SUB': 0x28,
                'MUL': 0xF6,
                'DIV': 0xF6,
                'PUSH': 0x50,
                'POP': 0x58,
                'CALL': 0xE8,
                'RET': 0xC3,
                'JMP': 0xE9,
                'JE': 0x74,
                'JNE': 0x75,
                'INT': 0xCD,
                'HLT': 0xF4,
            }

            # Register encodings
            self.register_map = {
                'AX': 0b000, 'CX': 0b001, 'DX': 0b010, 'BX': 0b011,
                'SP': 0b100, 'BP': 0b101, 'SI': 0b110, 'DI': 0b111,
            }

        @staticmethod
        def _parse_int(text):
            """Convert a numeric literal to int.

            Prefixed literals such as ``0x10`` are handled by base 0; plain
            decimals with leading zeros (rejected by base 0) fall back to
            base 10.  Raises ``ValueError`` for non-numeric text.
            """
            try:
                return int(text, 0)
            except ValueError:
                return int(text, 10)

        def _emit_le(self, segment, value, width=4):
            """Append ``value`` as ``width`` little-endian bytes."""
            segment.extend((value >> (8 * i)) & 0xFF for i in range(width))
            self.address += width

        def generate(self, ast):
            """Generate code for every statement of ``ast``.

            Returns the dictionary produced by ``finalize_code``.
            """
            for statement in ast.statements:
                if isinstance(statement, DirectiveNode):
                    self.generate_directive(statement)
                elif isinstance(statement, InstructionNode):
                    self.generate_instruction(statement)
                elif isinstance(statement, LabelNode):
                    # Labels were already entered into the symbol table.
                    continue

            return self.finalize_code()

        def generate_instruction(self, instr):
            """Emit the bytes of one instruction.

            An opcode missing from ``opcode_map`` is emitted as a single NOP
            byte (0x90).
            """
            opcode = instr.opcode
            operands = instr.operands

            # Instructions outside the code section go to the data segment.
            if self.current_segment == "CODE":
                segment = self.code_segment
            else:
                segment = self.data_segment

            if opcode not in self.opcode_map:
                segment.append(0x90)  # NOP
                self.address += 1
                return

            segment.append(self.opcode_map[opcode])
            self.address += 1

            if not operands:
                return

            # ModR/M byte first, then immediates / addresses / symbols.
            self.generate_modrm(segment, opcode, operands)

            for operand in operands:
                if operand.operand_type == "IMMEDIATE":
                    self.generate_immediate(segment, operand.value)
                elif operand.operand_type == "MEMORY":
                    self.generate_memory_operand(segment, operand)
                elif operand.operand_type == "SYMBOL":
                    # Symbol reference, patched later through a relocation.
                    self.generate_symbol_reference(segment, operand.value)

        def generate_modrm(self, segment, opcode, operands):
            """Emit a simplified ModR/M byte for ``operands``.

            The first operand's register code goes to bits 3-5; a second
            register operand goes to bits 0-2, a second memory operand sets
            those bits to 0b110.  Nothing is emitted without operands.
            """
            if len(operands) == 0:
                return

            modrm = 0

            first = operands[0]
            if first.operand_type == "REGISTER":
                modrm |= self.register_map.get(first.value, 0) << 3

            if len(operands) >= 2:
                second = operands[1]
                if second.operand_type == "REGISTER":
                    modrm |= self.register_map.get(second.value, 0)
                elif second.operand_type == "MEMORY":
                    modrm |= 0b00000110  # assumes [SI] addressing

            segment.append(modrm)
            self.address += 1

        def generate_immediate(self, segment, value):
            """Emit a 4-byte little-endian immediate.

            A string naming a known symbol emits a zero placeholder plus an
            ABS32 relocation.  Raises ``CodeGenError`` for a string that is
            neither a known symbol nor a number.
            """
            if not isinstance(value, str):
                self._emit_le(segment, int(value))
                return

            if value in self.symbol_table:
                self.relocation_table.append({
                    'type': 'ABS32',
                    'address': self.address,
                    'symbol': value
                })
                self._emit_le(segment, 0)  # placeholder
                return

            try:
                num = self._parse_int(value)
            except ValueError as err:
                raise CodeGenError(f"未定义的符号: {value}") from err
            self._emit_le(segment, num)

        def generate_memory_operand(self, segment, operand):
            """Emit the 4-byte address of a direct memory operand.

            A symbolic base emits a placeholder plus an ABS32 relocation
            (any displacement is stored as the relocation's ``addend``); a
            numeric base is emitted with the displacement added.  Raises
            ``CodeGenError`` for other addressing modes and for a base that
            is neither a known symbol nor a number.
            """
            if operand.addressing_mode not in ("MEM_DIRECT", "MEM_DIRECT_DISPLACEMENT"):
                # Register-indirect modes are not implemented yet.
                raise CodeGenError(f"不支持的寻址模式: {operand.addressing_mode}")

            base = operand.value['base']
            offset = operand.value['offset'] or 0

            if isinstance(base, str) and base in self.symbol_table:
                entry = {
                    'type': 'ABS32',
                    'address': self.address,
                    'symbol': base
                }
                if offset:
                    entry['addend'] = offset
                self.relocation_table.append(entry)
                self._emit_le(segment, 0)  # placeholder
                return

            if isinstance(base, str):
                try:
                    addr = self._parse_int(base)
                except ValueError as err:
                    raise CodeGenError(f"未定义的符号: {base}") from err
            else:
                addr = base
            self._emit_le(segment, addr + offset)

        def generate_symbol_reference(self, segment, symbol):
            """Emit a placeholder and a REL32 relocation for ``symbol``.

            Raises ``CodeGenError`` when the symbol is not in the table.
            """
            if symbol not in self.symbol_table:
                raise CodeGenError(f"未定义的符号: {symbol}")

            self.relocation_table.append({
                'type': 'REL32',
                'address': self.address,
                'symbol': symbol
            })
            self._emit_le(segment, 0)  # placeholder

        def generate_directive(self, directive):
            """Handle a pseudo-instruction.

            SECTION switches the current segment; DB/DW/DD/DQ emit data of
            1/2/4/8 bytes per argument (DB strings get a terminating zero);
            RESB/RESW reserve zero-filled bytes/words.  Other directives
            emit nothing.
            """
            name = directive.directive

            if name == "SECTION":
                section_name = str(directive.args[0]) if directive.args else ".text"
                if section_name.startswith(".text"):
                    self.current_segment = "CODE"
                elif section_name.startswith(".data"):
                    self.current_segment = "DATA"
                elif section_name.startswith(".bss"):
                    self.current_segment = "BSS"
                return

            if name == "DB":
                segment = self.get_current_segment()
                for arg in directive.args:
                    if isinstance(arg, str):
                        # String: one entry per character plus a terminator.
                        data = [ord(char) for char in arg]
                        data.append(0)
                    else:
                        data = [arg & 0xFF]
                    segment.extend(data)
                    self.address += len(data)
                return

            widths = {"DW": 2, "DD": 4, "DQ": 8}
            if name in widths:
                segment = self.get_current_segment()
                for arg in directive.args:
                    num = self._parse_int(arg) if isinstance(arg, str) else arg
                    self._emit_le(segment, num, widths[name])
                return

            reserve_units = {"RESB": 1, "RESW": 2}
            if name in reserve_units:
                segment = self.get_current_segment()
                count = directive.args[0] if directive.args else 0
                total = count * reserve_units[name]
                segment.extend([0] * total)
                self.address += total

        def get_current_segment(self):
            """Return the byte list of the currently selected segment."""
            if self.current_segment == "CODE":
                return self.code_segment
            elif self.current_segment == "DATA":
                return self.data_segment
            else:
                return self.bss_segment

        def finalize_code(self):
            """Patch all relocations and return the generated program.

            Returns a dict with keys ``code``, ``data``, ``bss``,
            ``symbols`` and ``relocations``.  Raises ``CodeGenError`` when a
            relocation names an unknown symbol or points outside the
            emitted bytes.
            """
            # Simplified: final symbol addresses equal the parsed ones.
            symbol_addresses = dict(self.symbol_table)
            code_len = len(self.code_segment)

            for reloc in self.relocation_table:
                symbol_addr = symbol_addresses.get(reloc['symbol'])
                if symbol_addr is None:
                    raise CodeGenError(f"重定位失败: 未定义符号 {reloc['symbol']}")

                if reloc['type'] == 'ABS32':
                    # Absolute address (plus optional displacement)
                    target_addr = symbol_addr + reloc.get('addend', 0)
                elif reloc['type'] == 'REL32':
                    # Relative to the end of the 4-byte field
                    target_addr = symbol_addr - (reloc['address'] + 4)
                else:
                    continue

                # Addresses past the code segment fall into the data
                # segment and must be rebased to index into it.
                offset = reloc['address']
                if offset < code_len:
                    segment = self.code_segment
                else:
                    segment = self.data_segment
                    offset -= code_len

                if offset + 4 > len(segment):
                    raise CodeGenError(f"重定位失败: 地址越界 {reloc['address']}")

                # Write the value in little-endian order.
                for i in range(4):
                    segment[offset + i] = (target_addr >> (i * 8)) & 0xFF

            return {
                'code': self.code_segment,
                'data': self.data_segment,
                'bss': self.bss_segment,
                'symbols': symbol_addresses,
                'relocations': self.relocation_table
            }
    
  2. 优化器
    窥孔优化
    python
    class PeepholeOptimizer:
        """Rewrite short instruction sequences using fixed textual patterns.

        Instructions are plain strings and are compared for exact equality
        with the pattern entries.
        """

        def __init__(self):
            """Set up the (pattern, replacement) rewrite rules."""
            self.patterns = [
                # Redundant move elimination
                (['MOV AX, AX'], []),  # MOV AX, AX -> nothing
                (['MOV AX, BX', 'MOV BX, AX'], ['MOV AX, BX']),  # drop the copy back

                # Constant propagation
                (['MOV AX, 5', 'ADD BX, AX'], ['MOV AX, 5', 'ADD BX, 5']),

                # Dead code elimination
                (['MOV AX, 5', 'MOV AX, 10'], ['MOV AX, 10']),  # value overwritten

                # Strength reduction
                (['MUL AX, 2'], ['ADD AX, AX']),  # multiply -> add
                (['MUL AX, 4'], ['SHL AX, 2']),   # multiply -> shift

                # Branch optimisation
                (['JMP LABEL1', 'LABEL1:'], []),  # jump to the next instruction
                (['JE LABEL1', 'JMP LABEL2', 'LABEL1:'], ['JNE LABEL2', 'LABEL1:']),  # inverted condition
            ]

        def optimize(self, instructions):
            """Return a new list with the rules applied until none matches.

            The input sequence is not modified.
            """
            optimized = list(instructions)
            changed = True

            # Repeat whole passes: one rewrite can expose another match.
            while changed:
                changed = False
                index = 0

                while index < len(optimized):
                    # The first rule matching at this position wins.
                    for pattern, replacement in self.patterns:
                        if self.match_pattern(optimized, index, pattern):
                            optimized[index:index + len(pattern)] = replacement
                            changed = True
                            break

                    index += 1

            return optimized

        def match_pattern(self, instructions, start, pattern):
            """Return True when ``pattern`` occurs in ``instructions`` at ``start``."""
            if start + len(pattern) > len(instructions):
                return False

            return all(instructions[start + i] == expected
                       for i, expected in enumerate(pattern))
    

数据流分析
python
class DataFlowAnalyzer:
def init(self):
self.defs = {} # 变量定义点
self.uses = {} # 变量使用点
self.liveness = {} # 活跃变量分析

def analyze(self, instructions):
    """执行数据流分析"""
    # 构建控制流图(简化)
    cfg = self.build_cfg(instructions)
    
    # 活跃变量分析
    self.liveness_analysis(cfg, instructions)
    
    # 到达定义分析
    self.reaching_definitions(cfg, instructions)
    
    return {
        'liveness': self.liveness,
        'reaching_defs': self.defs,
        'uses': self.uses
    }

def build_cfg(self, instructions):
    """构建控制流图(简化版)"""
    cfg = {}
    
    for i, instr in enumerate(instructions):
        cfg[i] = []
        
        # 检查是否为跳转指令
        if isinstance(instr, InstructionNode) and instr.opcode.startswith('J'):
            # 寻找目标标签
            target_label = instr.operands[0].value if instr.operands else None
            if target_label:
                # 查找标签位置(简化)
                for j, instr2 in enumerate(instructions):
                    if isinstance(instr2, LabelNode) and instr2.name == target_label:
                        cfg[i].append(j)
        
        # 如果不是无条件跳转,则添加下一条指令
        if not (isinstance(instr, InstructionNode) and instr.opcode == 'JMP'):
            if i + 1 < len(instructions):
                cfg[i].append(i + 1)
    
    return cfg

def liveness_analysis(self, cfg, instructions):
    """活跃变量分析"""
    # 初始化
    n = len(instructions)
    live_in = [set() for _ in range(n)]
    live_out = [set() for() for _ in range(n)]
    
    changed = True
    while changed:
        changed = False
        
        # 逆序遍历基本块
        for i in range(n-1, -1, -1):
            instr = instructions[i]
            
            # 计算use和def集合
            use_set = self.get_uses(instr)
            def_set = self.get_defs(instr)
            
            # 计算live_in和live_out
            old_live_in = live_in[i].copy()
            old_live_out = live_out[i].copy()
            
            # live_out = union of live_in of successors
            live_out[i] = set()
            for succ in cfg.get(i, []):
                live_out[i].update(live_in[succ])
            
            # live_in = use ∪ (live_out - def)
            live_in[i] = use_set.union(live_out[i] - def_set)
            
            if live_in[i] != old_live_in or live_out[i] != old_live_out:
                changed = True
    
    # 存储结果
    for i in range(n):
        self.liveness[i] = {
            'in': live_in[i],
            'out': live_out[i]
        }

def get_uses(self, instr):
    """获取指令使用的变量"""
    uses = set()
    
    if isinstance(instr, InstructionNode):
        for operand in instr.operands:
            if operand.operand_type == "REGISTER":
                uses.add(operand.value)
            elif operand.operand_type == "SYMBOL":
                # 变量使用
                uses.add(operand.value)
    
    return uses

def get_defs(self, instr):
    """Return the set of names an instruction writes.

    Only the first operand of an ``InstructionNode`` is considered, on the
    basis that most instructions use it as the destination. It is reported
    when its ``operand_type`` is "REGISTER" or "SYMBOL". Nodes of any other
    kind, or instructions without operands, yield an empty set.
    """
    written = set()

    if not isinstance(instr, InstructionNode) or not instr.operands:
        return written

    target = instr.operands[0]
    if target.operand_type in ("REGISTER", "SYMBOL"):
        written.add(target.value)

    return written

三、 链接器

  1. 目标文件格式
    python
    class ObjectFile:
        """In-memory object file: section contents, symbols and relocations.

        Attributes:
            sections: Maps a section name to a dict with keys ``'data'``
                (list of items appended via ``add_section_data``),
                ``'address'`` and ``'flags'``.
            symbols: Symbol table, keyed by symbol name.
            relocations: List of relocation records.
            imports: Names this object expects other objects to define.
            exports: Names this object makes visible to other objects.
        """

        def __init__(self):
            self.sections = {
                '.text': {'data': [], 'address': 0, 'flags': 'RX'},
                '.data': {'data': [], 'address': 0, 'flags': 'RW'},
                '.bss': {'data': [], 'address': 0, 'flags': 'RW'},
                '.rodata': {'data': [], 'address': 0, 'flags': 'R'},
            }

            self.symbols = {}  # symbol table
            self.relocations = []  # relocation table
            self.imports = []  # imported symbol names
            self.exports = []  # exported symbol names

        def add_section_data(self, section_name, data):
            """Append ``data`` to a section; unknown section names are ignored."""
            if section_name in self.sections:
                self.sections[section_name]['data'].extend(data)

        def add_symbol(self, name, value, section, type='LOCAL'):
            """Add or replace a symbol.

            Args:
                name: Symbol name (key in ``self.symbols``).
                value: Symbol value stored under ``'value'``.
                section: Name of the section the symbol belongs to.
                type: Symbol kind stored under ``'type'``; defaults to
                    ``'LOCAL'``.
            """
            self.symbols[name] = {
                'value': value,
                'section': section,
                'type': type,
                'size': 0  # symbol size in bytes
            }

        def add_relocation(self, section, offset, symbol, type):
            """Record a relocation against ``symbol`` at ``offset`` in ``section``."""
            self.relocations.append({
                'section': section,
                'offset': offset,
                'symbol': symbol,
                'type': type
            })

  2. 链接器实现
    python
    class Linker:
        """Merge object files into one memory image.

        ``link()`` runs five steps in order: collect exported symbols,
        resolve relocation targets, merge sections and assign addresses,
        patch relocations, and build the final image.

        Input objects must provide ``sections``, ``symbols``, ``relocations``,
        ``imports`` and ``exports`` attributes.

        NOTE(review): ``LinkerError`` is raised below but is not defined in
        this class; it is assumed to be defined elsewhere in the file.
        NOTE(review): only exported symbols enter ``symbol_table``, so a
        relocation against a purely local symbol is reported as undefined.
        """

        def __init__(self):
            self.objects = []  # input object files
            self.symbol_table = {}  # global symbol table
            self.section_addresses = {}  # section base addresses
            self.output_sections = {}  # merged output sections
            # (object index, section name) -> offset of that object's data
            # inside the merged output section.
            self.section_offsets = {}

            # Memory layout (simplified)
            self.layout = {
                '.text': 0x1000,   # code section base
                '.data': 0x2000,   # data section base
                '.rodata': 0x3000,  # read-only data section base
                '.bss': 0x4000,    # BSS section base
            }

        def add_object(self, obj_file):
            """Queue an object file for linking."""
            self.objects.append(obj_file)

        def link(self):
            """Run all link steps and return the executable image dict."""
            # Step 1: collect every exported symbol
            self.collect_symbols()

            # Step 2: resolve symbol references
            self.resolve_symbols()

            # Step 3: assign final addresses
            self.assign_addresses()

            # Step 4: apply relocations
            self.apply_relocations()

            # Step 5: build the executable image
            return self.generate_executable()

        def collect_symbols(self):
            """Rebuild the global symbol table from all exported symbols.

            Raises:
                LinkerError: If two objects export the same name.
            """
            self.symbol_table.clear()

            for obj_idx, obj in enumerate(self.objects):
                for name, sym_info in obj.symbols.items():
                    if sym_info['type'] != 'EXPORT' and name not in obj.exports:
                        continue

                    if name in self.symbol_table:
                        # Duplicate definition of an exported symbol
                        raise LinkerError(f"符号重复定义: {name}")

                    self.symbol_table[name] = {
                        'object': obj_idx,
                        'value': sym_info['value'],
                        'section': sym_info['section'],
                        'type': sym_info['type']
                    }

        def resolve_symbols(self):
            """Check that every relocation target can be resolved.

            Raises:
                LinkerError: If a relocation names a symbol that is neither in
                    the global table nor declared as an import, or if an
                    imported symbol is not exported by any object.
            """
            unresolved = set()

            for obj in self.objects:
                for reloc in obj.relocations:
                    symbol = reloc['symbol']
                    if symbol in self.symbol_table:
                        continue

                    if symbol not in obj.imports:
                        # Undefined symbol
                        raise LinkerError(f"未定义符号: {symbol}")

                    # Declared import: look for a definition below
                    unresolved.add(symbol)

            # Try to satisfy the imports from the other objects' exports.
            for symbol in list(unresolved):
                for obj_idx, obj in enumerate(self.objects):
                    # The name must also be present in obj.symbols; an export
                    # entry without a definition cannot supply a value.
                    if symbol in obj.exports and symbol in obj.symbols:
                        sym_info = obj.symbols[symbol]
                        self.symbol_table[symbol] = {
                            'object': obj_idx,
                            'value': sym_info['value'],
                            'section': sym_info['section'],
                            'type': 'IMPORT'
                        }
                        unresolved.discard(symbol)
                        break

            if unresolved:
                raise LinkerError(f"无法解析的符号: {unresolved}")

        def assign_addresses(self):
            """Merge input sections and compute final symbol addresses.

            Each object's section data is appended to the output section of
            the same name. The offset at which it lands is recorded in
            ``self.section_offsets`` so that symbols and relocations from the
            second and later objects are placed correctly.
            """
            # Start from fresh output sections for every section in the layout.
            self.output_sections = {
                name: {'address': base, 'data': [], 'size': 0}
                for name, base in self.layout.items()
            }
            self.section_offsets = {}

            for obj_idx, obj in enumerate(self.objects):
                for section_name, section_info in obj.sections.items():
                    output_section = self.output_sections.get(section_name)
                    if output_section is None:
                        # Sections absent from the layout are based at 0.
                        output_section = {'address': 0, 'data': [], 'size': 0}
                        self.output_sections[section_name] = output_section

                    # Where this object's contribution starts in the merge.
                    base_offset = output_section['size']
                    self.section_offsets[(obj_idx, section_name)] = base_offset

                    data = section_info['data']
                    output_section['data'].extend(data)
                    output_section['size'] += len(data)

                    for name, sym_info in obj.symbols.items():
                        if sym_info['section'] != section_name:
                            continue
                        sym_entry = self.symbol_table.get(name)
                        # Only the object that owns the global definition may
                        # set its address; a same-named local symbol in
                        # another object must not overwrite it.
                        if sym_entry is None or sym_entry['object'] != obj_idx:
                            continue
                        # final = section base + object's offset + symbol offset
                        sym_entry['final_address'] = (
                            output_section['address']
                            + base_offset
                            + sym_info['value']
                        )

            # BSS holds no initial contents: zero-fill it to its merged size.
            bss = self.output_sections.get('.bss')
            if bss is not None:
                bss['data'] = [0] * bss['size']

        def apply_relocations(self):
            """Patch relocation sites in the merged section data.

            ``ABS32`` writes the symbol's final address; ``REL32`` writes the
            address relative to the end of the 4-byte field. Values are stored
            little-endian, one list element per byte. Relocations whose symbol
            or section is unknown, or whose type is not one of the two above,
            are skipped, as are bytes that fall outside the section data.
            """
            for obj_idx, obj in enumerate(self.objects):
                for reloc in obj.relocations:
                    section = reloc['section']
                    reloc_type = reloc['type']

                    sym_info = self.symbol_table.get(reloc['symbol'])
                    if sym_info is None:
                        continue

                    output_section = self.output_sections.get(section)
                    if output_section is None:
                        continue

                    symbol_addr = sym_info.get('final_address', 0)

                    # Translate the object-local offset into the merged section.
                    reloc_idx = (
                        self.section_offsets.get((obj_idx, section), 0)
                        + reloc['offset']
                    )
                    reloc_addr = output_section['address'] + reloc_idx

                    if reloc_type == 'ABS32':
                        # Absolute address
                        value = symbol_addr
                    elif reloc_type == 'REL32':
                        # Relative = target - (address of field + 4)
                        value = symbol_addr - (reloc_addr + 4)
                    else:
                        continue

                    data = output_section['data']
                    for i in range(4):
                        if reloc_idx + i < len(data):
                            # Masking also handles negative REL32 values.
                            data[reloc_idx + i] = (value >> (i * 8)) & 0xFF

        def generate_executable(self):
            """Build the result dict describing the linked image.

            Returns:
                dict with keys ``'entry_point'`` (final address of ``_start``
                if known, otherwise the ``.text`` base from the layout),
                ``'memory_image'`` (address -> byte), ``'sections'`` and
                ``'symbol_table'``.
            """
            # Lay sections out in ascending address order.
            sorted_sections = sorted(
                self.output_sections.items(),
                key=lambda item: item[1]['address']
            )

            memory_image = {}
            for _name, section_info in sorted_sections:
                base = section_info['address']
                for i, byte in enumerate(section_info['data']):
                    memory_image[base + i] = byte

            # Entry point: the _start symbol when it has a final address.
            entry_point = None
            if '.text' in self.output_sections:
                entry_point = self.symbol_table.get('_start', {}).get('final_address')
            if entry_point is None:
                # Test for None explicitly so a legitimate address of 0 is kept.
                entry_point = self.layout['.text']

            return {
                'entry_point': entry_point,
                'memory_image': memory_image,
                'sections': self.output_sections,
                'symbol_table': self.symbol_table
            }
    
posted @ 2026-01-20 21:12  头发少的文不识  阅读(0)  评论(0)    收藏  举报