寒假学习笔记1.18

一、编译器前端：词法分析与语法分析

词法分析器（Lexer）
词法单元定义
python
import re
from enum import Enum

class TokenType(Enum):
    """Kinds of tokens produced by the assembly lexer.

    The numeric values are grouped by category (1-4 literals, 10-12
    statements, 20 registers, 30-35 punctuation, 40-42 keywords,
    100-101 stream control).
    """

    # Identifiers and constants
    IDENTIFIER = 1
    INTEGER = 2
    HEX = 3
    STRING = 4

    # Instructions and pseudo-instructions
    INSTRUCTION = 10
    PSEUDO_OP = 11
    LABEL = 12

    # Registers
    REGISTER = 20

    # Punctuation
    COMMA = 30
    COLON = 31
    LBRACKET = 32
    RBRACKET = 33
    PLUS = 34
    MINUS = 35

    # Keywords
    SECTION = 40
    GLOBAL = 41
    EXTERN = 42

    # Stream control
    EOF = 100
    NEWLINE = 101

词法分析实现
python
class Lexer:
    """Hand-written scanner that converts assembly source text into tokens.

    ``Token`` (built as ``Token(type, value, line, column)``) and
    ``LexerError`` (built as ``LexerError(message, line, column)``) are
    expected to be defined elsewhere in the module.

    Every token carries the 1-based line/column of its first character.
    Line breaks are reported as ``TokenType.NEWLINE`` tokens because the
    parser uses them as statement terminators.
    """

    # Punctuation mapped to TokenType member *names*; resolved at call time
    # so the class body does not need TokenType when it is defined.
    _PUNCTUATION = {
        ',': 'COMMA',
        ':': 'COLON',
        '[': 'LBRACKET',
        ']': 'RBRACKET',
        '+': 'PLUS',
        '-': 'MINUS',
    }
    _HEX_DIGITS = '0123456789abcdefABCDEF'

    def __init__(self, source_code):
        """Prepare to scan *source_code* (a string, possibly empty)."""
        self.source = source_code
        self.position = 0
        self.line = 1
        self.column = 1
        self.current_char = self.source[0] if self.source else None

        # Instruction mnemonics (matched case-insensitively)
        self.instructions = {
            'MOV', 'ADD', 'SUB', 'MUL', 'DIV', 'AND', 'OR', 'NOT',
            'PUSH', 'POP', 'CALL', 'RET', 'JMP', 'JE', 'JNE', 'JG',
            'CMP', 'INT', 'IRET', 'IN', 'OUT', 'HLT'
        }

        # Register names
        self.registers = {
            'AX', 'BX', 'CX', 'DX', 'SI', 'DI', 'BP', 'SP',
            'CS', 'DS', 'SS', 'ES', 'IP', 'FLAGS'
        }

        # Pseudo-instructions (assembler directives)
        self.pseudo_ops = {
            'DB', 'DW', 'DD', 'DQ', 'TIMES', 'EQU', 'RESB', 'RESW',
            'SECTION', 'GLOBAL', 'EXTERN', 'BITS', 'ORG'
        }

    def advance(self):
        """Consume the current character and move to the next one.

        The line counter is bumped only after a newline has been consumed,
        so the newline itself is still reported on the line it ends.
        Calling this at end of input is a no-op.
        """
        if self.current_char is None:
            return

        if self.current_char == '\n':
            self.line += 1
            self.column = 1
        else:
            self.column += 1

        self.position += 1
        if self.position < len(self.source):
            self.current_char = self.source[self.position]
        else:
            self.current_char = None

    def peek(self):
        """Return the character after the current one without consuming it."""
        pos = self.position + 1
        return self.source[pos] if pos < len(self.source) else None

    def skip_whitespace(self):
        """Skip blanks up to, but not including, the next newline.

        Newlines are left in place so get_next_token can turn them into
        NEWLINE tokens.
        """
        while (self.current_char is not None
               and self.current_char != '\n'
               and self.current_char.isspace()):
            self.advance()

    def skip_comment(self):
        """Skip a ``;`` comment up to (not including) the end of the line."""
        if self.current_char == ';':
            while self.current_char and self.current_char != '\n':
                self.advance()

    def get_next_token(self):
        """Return the next token, or an EOF token once input is exhausted.

        Raises:
            LexerError: on a character that cannot start any token.
        """
        while self.current_char is not None:
            ch = self.current_char

            if ch == '\n':
                token = Token(TokenType.NEWLINE, '\n', self.line, self.column)
                self.advance()
                return token

            if ch.isspace():
                self.skip_whitespace()
                continue

            if ch == ';':
                self.skip_comment()
                continue

            # '.' may start a name so that section names such as ".text"
            # can be written as directive arguments.
            if ch.isalpha() or ch == '_' or ch == '.':
                return self.get_identifier()

            if ch.isdigit():
                return self.get_number()

            if ch == '"':
                return self.get_string()

            kind = self._PUNCTUATION.get(ch)
            if kind is not None:
                # Capture the position before advancing so the token points
                # at the punctuation character itself.
                token = Token(TokenType[kind], ch, self.line, self.column)
                self.advance()
                return token

            raise LexerError(f"未知字符: {ch}", self.line, self.column)

        return Token(TokenType.EOF, '', self.line, self.column)

    def get_identifier(self):
        """Scan a name and classify it.

        Classification order: instruction, pseudo-op, register, plain
        identifier. The first three are matched case-insensitively and
        their token value is upper-cased.
        """
        start_pos = self.position
        start_line = self.line
        start_col = self.column

        while (self.current_char is not None
               and (self.current_char.isalnum() or self.current_char in '_.')):
            self.advance()

        value = self.source[start_pos:self.position]
        upper = value.upper()

        if upper in self.instructions:
            return Token(TokenType.INSTRUCTION, upper, start_line, start_col)
        if upper in self.pseudo_ops:
            return Token(TokenType.PSEUDO_OP, upper, start_line, start_col)
        if upper in self.registers:
            return Token(TokenType.REGISTER, upper, start_line, start_col)
        return Token(TokenType.IDENTIFIER, value, start_line, start_col)

    def get_number(self):
        """Scan a decimal integer or a ``0x``-prefixed hexadecimal literal.

        The token value is the literal's source text (prefix included).

        Raises:
            LexerError: when ``0x`` is not followed by a hex digit.
        """
        start_pos = self.position
        start_line = self.line
        start_col = self.column

        next_char = self.peek()
        if self.current_char == '0' and next_char and next_char.lower() == 'x':
            self.advance()  # skip '0'
            self.advance()  # skip 'x'

            digits_start = self.position
            while (self.current_char is not None
                   and self.current_char in self._HEX_DIGITS):
                self.advance()

            if self.position == digits_start:
                raise LexerError("无效的十六进制数", start_line, start_col)

            value = self.source[start_pos:self.position]
            return Token(TokenType.HEX, value, start_line, start_col)

        while self.current_char and self.current_char.isdigit():
            self.advance()

        value = self.source[start_pos:self.position]
        return Token(TokenType.INTEGER, value, start_line, start_col)

    def get_string(self):
        """Scan a double-quoted string, decoding backslash escapes.

        Handles the escapes n, t, double quote and backslash; any other
        escaped character stands for itself.

        Raises:
            LexerError: when input ends before the closing quote.
        """
        start_line = self.line
        start_col = self.column
        escapes = {'n': '\n', 't': '\t', '"': '"', '\\': '\\'}

        self.advance()  # skip the opening quote

        parts = []
        while self.current_char is not None and self.current_char != '"':
            if self.current_char == '\\':
                self.advance()
                if self.current_char is None:
                    # Backslash was the last character of the input.
                    break
                parts.append(escapes.get(self.current_char, self.current_char))
            else:
                parts.append(self.current_char)
            self.advance()

        if self.current_char != '"':
            raise LexerError("未终止的字符串", start_line, start_col)

        self.advance()  # skip the closing quote

        return Token(TokenType.STRING, ''.join(parts), start_line, start_col)

语法分析器（Parser）
抽象语法树节点
python
class ASTNode:
    """Base class for every node of the assembler's syntax tree."""

    def __init__(self, node_type):
        # Short upper-case tag naming the node category (e.g. "LABEL").
        self.node_type = node_type

    def __repr__(self):
        return f"{self.__class__.__name__}()"


class ProgramNode(ASTNode):
    """Root node holding the program's statements in source order."""

    def __init__(self, statements):
        super().__init__("PROGRAM")
        self.statements = statements

    def __repr__(self):
        return f"ProgramNode({len(self.statements)} statements)"


class InstructionNode(ASTNode):
    """A machine instruction: mnemonic, operand nodes and estimated size."""

    def __init__(self, opcode, operands=None, size=0):
        super().__init__("INSTRUCTION")
        self.opcode = opcode
        self.operands = operands or []
        self.size = size

    def __repr__(self):
        operands_str = ', '.join(str(op) for op in self.operands)
        return f"InstructionNode({self.opcode} {operands_str})"


class LabelNode(ASTNode):
    """A label definition (``name:``)."""

    def __init__(self, name):
        super().__init__("LABEL")
        self.name = name

    def __repr__(self):
        return f"LabelNode({self.name})"


class DirectiveNode(ASTNode):
    """An assembler directive (pseudo-instruction) with its arguments."""

    def __init__(self, directive, args=None):
        super().__init__("DIRECTIVE")
        self.directive = directive
        self.args = args or []

    def __repr__(self):
        args_str = ', '.join(str(arg) for arg in self.args)
        return f"DirectiveNode({self.directive} {args_str})"


class OperandNode(ASTNode):
    """One instruction operand: its kind, value and addressing mode."""

    def __init__(self, operand_type, value, addressing_mode=None):
        super().__init__("OPERAND")
        self.operand_type = operand_type
        self.value = value
        self.addressing_mode = addressing_mode

    def __repr__(self):
        return f"OperandNode({self.operand_type}, {self.value}, {self.addressing_mode})"

递归下降语法分析器
python
class Parser:
    """Recursive-descent parser that builds an AST from lexer tokens.

    While parsing it keeps a running address estimate and records each
    label's address in ``symbol_table``. ``ParserError`` (built as
    ``ParserError(message, line, column)``) is expected to be defined
    elsewhere in the module.
    """

    def __init__(self, lexer):
        self.lexer = lexer
        self.current_token = self.lexer.get_next_token()
        self.symbol_table = {}
        self.current_address = 0

    def eat(self, token_type):
        """Consume the current token if it has *token_type*.

        Raises:
            ParserError: when the current token is of a different type.
        """
        if self.current_token.type == token_type:
            self.current_token = self.lexer.get_next_token()
        else:
            raise ParserError(
                f"期望 {token_type}，但得到 {self.current_token.type}",
                self.current_token.line,
                self.current_token.column
            )

    def parse(self):
        """Parse the whole program and return its ProgramNode."""
        statements = []

        while self.current_token.type != TokenType.EOF:
            # Skip blank lines
            while self.current_token.type == TokenType.NEWLINE:
                self.eat(TokenType.NEWLINE)

            if self.current_token.type == TokenType.EOF:
                break

            statement = self.parse_statement()
            if statement:
                statements.append(statement)

        return ProgramNode(statements)

    def parse_statement(self):
        """Parse one label, directive or instruction.

        Raises:
            ParserError: when the current token cannot start a statement.
        """
        # A statement is a label when the token after the current one is ':'.
        if self.peek_next_non_newline_token().type == TokenType.COLON:
            return self.parse_label()

        if self.current_token.type == TokenType.PSEUDO_OP:
            return self.parse_directive()

        if self.current_token.type == TokenType.INSTRUCTION:
            return self.parse_instruction()

        raise ParserError(
            f"无效语句开始: {self.current_token.value}",
            self.current_token.line,
            self.current_token.column
        )

    def parse_label(self):
        """Parse ``name:`` and record the label at the current address."""
        name = self.current_token.value
        self.eat(TokenType.IDENTIFIER)
        self.eat(TokenType.COLON)

        self.symbol_table[name] = self.current_address

        # A label occupies no space but still gets a node in the AST.
        return LabelNode(name)

    def parse_instruction(self):
        """Parse a mnemonic with its comma-separated operands."""
        opcode = self.current_token.value
        self.eat(TokenType.INSTRUCTION)

        operands = []
        if self.current_token.type not in (TokenType.NEWLINE, TokenType.EOF):
            operands.append(self.parse_operand())

            while self.current_token.type == TokenType.COMMA:
                self.eat(TokenType.COMMA)
                operands.append(self.parse_operand())

        size = self.calculate_instruction_size(opcode, operands)
        self.current_address += size

        if self.current_token.type == TokenType.NEWLINE:
            self.eat(TokenType.NEWLINE)

        return InstructionNode(opcode, operands, size)

    def parse_operand(self):
        """Parse a register, immediate, symbol or memory operand.

        Raises:
            ParserError: when the current token cannot start an operand.
        """
        token = self.current_token

        if token.type == TokenType.REGISTER:
            self.eat(TokenType.REGISTER)
            return OperandNode("REGISTER", token.value, "REG_DIRECT")

        if token.type in (TokenType.INTEGER, TokenType.HEX):
            self.eat(token.type)

            # "value[ ... ]": the value already read becomes the base.
            if self.current_token.type == TokenType.LBRACKET:
                self.eat(TokenType.LBRACKET)
                return self.parse_memory_address(token.value)

            return OperandNode("IMMEDIATE", token.value, "IMMEDIATE")

        if token.type == TokenType.IDENTIFIER:
            self.eat(TokenType.IDENTIFIER)

            # "symbol[ ... ]": the symbol becomes the base.
            if self.current_token.type == TokenType.LBRACKET:
                self.eat(TokenType.LBRACKET)
                return self.parse_memory_address(token.value)

            return OperandNode("SYMBOL", token.value, "SYMBOL")

        if token.type == TokenType.LBRACKET:
            self.eat(TokenType.LBRACKET)
            return self.parse_memory_address()

        raise ParserError(
            f"无效操作数: {token.value}",
            token.line,
            token.column
        )

    def parse_memory_address(self, base_value=None):
        """Parse the inside of ``[base +/- offset]`` and the closing bracket.

        The opening bracket must already have been consumed.

        Args:
            base_value: base already read by the caller; when None the base
                is read from the token stream.

        Returns:
            A MEMORY OperandNode whose value is a dict with ``base`` and
            ``offset`` (None when no displacement was given).

        Raises:
            ParserError: on a missing base or a non-numeric offset.
        """
        if base_value is not None:
            base = base_value
        elif self.current_token.type == TokenType.REGISTER:
            base = self.current_token.value
            self.eat(TokenType.REGISTER)
        elif self.current_token.type in (TokenType.IDENTIFIER,
                                         TokenType.INTEGER, TokenType.HEX):
            base = self.current_token.value
            self.eat(self.current_token.type)
        else:
            raise ParserError("无效内存地址", self.current_token.line, self.current_token.column)

        offset = None
        if self.current_token.type in (TokenType.PLUS, TokenType.MINUS):
            sign = 1 if self.current_token.type == TokenType.PLUS else -1
            self.eat(self.current_token.type)

            if self.current_token.type not in (TokenType.INTEGER, TokenType.HEX):
                raise ParserError("偏移量必须是数字", self.current_token.line, self.current_token.column)

            # Base 0 lets int() honour the "0x" prefix of HEX tokens.
            radix = 0 if self.current_token.type == TokenType.HEX else 10
            offset = sign * int(self.current_token.value, radix)
            self.eat(self.current_token.type)

        self.eat(TokenType.RBRACKET)

        has_offset = offset is not None
        if isinstance(base, str) and base.upper() in self.lexer.registers:
            addressing_mode = "REG_INDIRECT_DISPLACEMENT" if has_offset else "REG_INDIRECT"
        else:
            addressing_mode = "MEM_DIRECT_DISPLACEMENT" if has_offset else "MEM_DIRECT"

        return OperandNode("MEMORY", {"base": base, "offset": offset}, addressing_mode)

    def parse_directive(self):
        """Parse a pseudo-instruction and its arguments up to end of line.

        Strings and identifiers are kept as str, numbers become int;
        commas between arguments are skipped.

        Raises:
            ParserError: on an argument token of an unsupported type.
        """
        directive = self.current_token.value
        self.eat(TokenType.PSEUDO_OP)

        args = []
        while self.current_token.type not in (TokenType.NEWLINE, TokenType.EOF):
            token = self.current_token
            if token.type in (TokenType.STRING, TokenType.IDENTIFIER):
                args.append(token.value)
            elif token.type == TokenType.INTEGER:
                args.append(int(token.value))
            elif token.type == TokenType.HEX:
                args.append(int(token.value, 16))
            elif token.type != TokenType.COMMA:
                raise ParserError(
                    f"无效伪指令参数: {token.value}",
                    token.line,
                    token.column
                )
            self.eat(token.type)

        size = self.calculate_directive_size(directive, args)
        self.current_address += size

        if self.current_token.type == TokenType.NEWLINE:
            self.eat(TokenType.NEWLINE)

        return DirectiveNode(directive, args)

    def calculate_instruction_size(self, opcode, operands):
        """Estimate an instruction's size in bytes (simplified model).

        Two bytes for the opcode, one per register operand, four per
        immediate or memory operand, plus two for a register-indirect
        displacement. SYMBOL operands add nothing.
        """
        # NOTE(review): this estimate is not derived from the code
        # generator's actual encoding; confirm the two agree before relying
        # on label addresses.
        size = 2
        for operand in operands:
            if operand.operand_type == "REGISTER":
                size += 1
            elif operand.operand_type == "IMMEDIATE":
                size += 4
            elif operand.operand_type == "MEMORY":
                size += 4
                if operand.addressing_mode == "REG_INDIRECT_DISPLACEMENT":
                    size += 2
        return size

    def calculate_directive_size(self, directive, args):
        """Return the number of bytes a directive contributes to the image."""
        if directive == "DB":
            # One byte per number; strings get a trailing NUL.
            return sum(len(arg) + 1 if isinstance(arg, str) else 1
                       for arg in args)
        if directive == "DW":
            return len(args) * 2
        if directive == "DD":
            return len(args) * 4
        if directive == "RESB":
            return args[0] if args else 0
        if directive == "RESW":
            return (args[0] if args else 0) * 2
        # Every other directive occupies no space.
        return 0

    def peek_next_non_newline_token(self):
        """Return the next non-NEWLINE token after the current one.

        The lexer is run ahead and then restored to its saved state, so
        neither the lexer position nor ``current_token`` changes.
        """
        lexer = self.lexer
        saved_state = (lexer.position, lexer.current_char,
                       lexer.line, lexer.column)
        try:
            token = lexer.get_next_token()
            while token.type == TokenType.NEWLINE:
                token = lexer.get_next_token()
        finally:
            (lexer.position, lexer.current_char,
             lexer.line, lexer.column) = saved_state
        return token

二、编译器后端：代码生成与优化

代码生成器
机器码生成
python
class CodeGenerator:
    """Translate an AST into byte lists for the code, data and bss segments.

    Symbol references are recorded in ``relocation_table`` and patched by
    finalize_code. ``CodeGenError`` and the AST node classes are expected
    to be defined elsewhere in the module.
    """

    def __init__(self, symbol_table):
        self.symbol_table = symbol_table
        self.code_segment = []   # code segment bytes
        self.data_segment = []   # initialised data bytes
        self.bss_segment = []    # uninitialised data bytes
        self.current_segment = "CODE"
        self.address = 0         # bytes emitted so far, over all segments
        self.relocation_table = []

        # Opcode table (simplified: one byte per mnemonic)
        self.opcode_map = {
            'MOV': 0x88,
            'ADD': 0x00,
            'SUB': 0x28,
            'MUL': 0xF6,
            'DIV': 0xF6,
            'PUSH': 0x50,
            'POP': 0x58,
            'CALL': 0xE8,
            'RET': 0xC3,
            'JMP': 0xE9,
            'JE': 0x74,
            'JNE': 0x75,
            'INT': 0xCD,
            'HLT': 0xF4,
        }

        # 3-bit register numbers
        self.register_map = {
            'AX': 0b000, 'CX': 0b001, 'DX': 0b010, 'BX': 0b011,
            'SP': 0b100, 'BP': 0b101, 'SI': 0b110, 'DI': 0b111,
        }

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_int(text):
        """Convert a numeric literal to int (prefix-aware, else decimal)."""
        try:
            return int(text, 0)
        except ValueError:
            # int(x, 0) rejects decimals with leading zeros such as "010".
            return int(text, 10)

    def _segment_name(self, segment):
        """Return 'code'/'data'/'bss' for one of our lists, else None."""
        for name, candidate in (('code', self.code_segment),
                                ('data', self.data_segment),
                                ('bss', self.bss_segment)):
            if candidate is segment:
                return name
        return None

    def _emit_le32(self, segment, value):
        """Append *value* as four little-endian bytes."""
        segment.extend((value >> shift) & 0xFF for shift in (0, 8, 16, 24))
        self.address += 4

    def _emit_relocation(self, segment, reloc_type, symbol, addend=0):
        """Record a relocation at the current position and emit a placeholder.

        The entry also stores which segment it belongs to and its offset
        there, so finalize_code can patch the right bytes.
        """
        entry = {
            'type': reloc_type,
            'address': self.address,
            'symbol': symbol,
            'segment': self._segment_name(segment),
            'offset': len(segment),
        }
        if addend:
            entry['addend'] = addend
        self.relocation_table.append(entry)
        self._emit_le32(segment, 0)

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def generate(self, ast):
        """Generate code for every statement in *ast*; return the result dict."""
        for statement in ast.statements:
            if isinstance(statement, DirectiveNode):
                self.generate_directive(statement)
            elif isinstance(statement, InstructionNode):
                self.generate_instruction(statement)
            # LabelNode: nothing to emit, labels live in the symbol table.

        return self.finalize_code()

    def generate_instruction(self, instr):
        """Emit opcode, ModR/M byte and operand bytes for one instruction.

        Mnemonics missing from ``opcode_map`` are emitted as one NOP (0x90).
        """
        opcode = instr.opcode
        operands = instr.operands

        # Instructions outside the code section go to the data segment.
        if self.current_segment == "CODE":
            segment = self.code_segment
        else:
            segment = self.data_segment

        if opcode not in self.opcode_map:
            segment.append(0x90)
            self.address += 1
            return

        segment.append(self.opcode_map[opcode])
        self.address += 1

        if not operands:
            return

        self.generate_modrm(segment, opcode, operands)

        for operand in operands:
            if operand.operand_type == "IMMEDIATE":
                self.generate_immediate(segment, operand.value)
            elif operand.operand_type == "MEMORY":
                self.generate_memory_operand(segment, operand)
            elif operand.operand_type == "SYMBOL":
                self.generate_symbol_reference(segment, operand.value)

    def generate_modrm(self, segment, opcode, operands):
        """Emit a simplified ModR/M byte (nothing when there are no operands).

        A register first operand fills the reg field (bits 5-3); a register
        second operand fills the r/m field (bits 2-0). The mod field is
        left at 00.
        """
        if len(operands) == 0:
            return

        modrm = 0

        first = operands[0]
        if first.operand_type == "REGISTER":
            modrm |= self.register_map.get(first.value, 0) << 3

        if len(operands) >= 2:
            second = operands[1]
            if second.operand_type == "REGISTER":
                modrm |= self.register_map.get(second.value, 0)
            elif second.operand_type == "MEMORY":
                # r/m = 110; with mod = 00 in 16-bit addressing this selects
                # a direct 16-bit displacement, not [SI].
                modrm |= 0b00000110

        segment.append(modrm)
        self.address += 1

    def generate_immediate(self, segment, value):
        """Emit a 32-bit little-endian immediate.

        A string naming a known symbol becomes an ABS32 relocation with a
        zero placeholder; any other string is parsed as a number.

        Raises:
            CodeGenError: when a string is neither a symbol nor a number.
        """
        if isinstance(value, str):
            if value in self.symbol_table:
                self._emit_relocation(segment, 'ABS32', value)
                return
            try:
                num = self._parse_int(value)
            except ValueError:
                raise CodeGenError(f"未定义的符号: {value}") from None
        else:
            num = int(value)

        self._emit_le32(segment, num)

    def generate_memory_operand(self, segment, operand):
        """Emit the 32-bit address of a direct memory operand.

        Supports MEM_DIRECT and MEM_DIRECT_DISPLACEMENT. A symbolic base
        becomes an ABS32 relocation, with the displacement as its addend.

        Raises:
            CodeGenError: for other addressing modes or an unknown base.
        """
        if operand.addressing_mode not in ("MEM_DIRECT", "MEM_DIRECT_DISPLACEMENT"):
            raise CodeGenError(f"不支持的寻址模式: {operand.addressing_mode}")

        base = operand.value['base']
        offset = operand.value.get('offset') or 0

        if isinstance(base, str) and base in self.symbol_table:
            self._emit_relocation(segment, 'ABS32', base, addend=offset)
            return

        if isinstance(base, str):
            try:
                addr = self._parse_int(base)
            except ValueError:
                raise CodeGenError(f"未定义的符号: {base}") from None
        else:
            addr = base

        self._emit_le32(segment, addr + offset)

    def generate_symbol_reference(self, segment, symbol):
        """Emit a REL32 relocation with a zero placeholder for *symbol*.

        Raises:
            CodeGenError: when the symbol is not in the symbol table.
        """
        if symbol not in self.symbol_table:
            raise CodeGenError(f"未定义的符号: {symbol}")

        self._emit_relocation(segment, 'REL32', symbol)

    def generate_directive(self, directive):
        """Process SECTION, DB, DW, DD, RESB and RESW; ignore the rest."""
        name = directive.directive
        args = directive.args

        if name == "SECTION":
            section_name = args[0] if args else ".text"
            if section_name.startswith(".text"):
                self.current_segment = "CODE"
            elif section_name.startswith(".data"):
                self.current_segment = "DATA"
            elif section_name.startswith(".bss"):
                self.current_segment = "BSS"

        elif name == "DB":
            segment = self.get_current_segment()
            for arg in args:
                if isinstance(arg, str):
                    # String: one element per character plus a NUL.
                    segment.extend(ord(char) for char in arg)
                    segment.append(0)
                    self.address += len(arg) + 1
                else:
                    segment.append(arg & 0xFF)
                    self.address += 1

        elif name in ("DW", "DD"):
            width = 2 if name == "DW" else 4
            segment = self.get_current_segment()
            for arg in args:
                num = self._parse_int(arg) if isinstance(arg, str) else arg
                # little-endian
                segment.extend((num >> (8 * i)) & 0xFF for i in range(width))
                self.address += width

        elif name in ("RESB", "RESW"):
            unit = 1 if name == "RESB" else 2
            segment = self.get_current_segment()
            count = (args[0] if args else 0) * unit
            segment.extend([0] * count)
            self.address += count

    def get_current_segment(self):
        """Return the byte list of the section currently being filled."""
        if self.current_segment == "CODE":
            return self.code_segment
        elif self.current_segment == "DATA":
            return self.data_segment
        else:
            return self.bss_segment

    def finalize_code(self):
        """Patch all relocations and return the generated image.

        Returns:
            dict with keys 'code', 'data', 'bss', 'symbols', 'relocations'.

        Raises:
            CodeGenError: when a relocation names an unknown symbol.
        """
        # Simplified: final symbol addresses are the symbol table's values.
        symbol_addresses = dict(self.symbol_table)

        segments = {
            'code': self.code_segment,
            'data': self.data_segment,
            'bss': self.bss_segment,
        }

        for reloc in self.relocation_table:
            symbol_addr = symbol_addresses.get(reloc['symbol'])
            if symbol_addr is None:
                raise CodeGenError(f"重定位失败: 未定义符号 {reloc['symbol']}")

            if reloc['type'] == 'ABS32':
                target_addr = symbol_addr
            elif reloc['type'] == 'REL32':
                # relative = target - (address of the field + 4)
                target_addr = symbol_addr - (reloc['address'] + 4)
            else:
                continue
            target_addr += reloc.get('addend', 0)

            segment = segments.get(reloc.get('segment'))
            index = reloc.get('offset')
            if segment is None or index is None:
                # Entry without segment info (appended by outside code):
                # assume code bytes come first, then data bytes.
                if reloc['address'] < len(self.code_segment):
                    segment = self.code_segment
                    index = reloc['address']
                else:
                    segment = self.data_segment
                    index = reloc['address'] - len(self.code_segment)

            for i in range(4):
                segment[index + i] = (target_addr >> (i * 8)) & 0xFF

        return {
            'code': self.code_segment,
            'data': self.data_segment,
            'bss': self.bss_segment,
            'symbols': symbol_addresses,
            'relocations': self.relocation_table
        }

优化器
窥孔优化
python
class PeepholeOptimizer:
    """Rewrite short instruction sequences using literal text patterns.

    Instructions are compared for equality with the pattern entries, so a
    pattern matches only the exact operands written in it.
    """

    def __init__(self):
        # Each entry is (sequence to match, replacement sequence).
        self.patterns = [
            # Redundant move elimination
            (['MOV AX, AX'], []),  # MOV AX, AX -> nothing
            (['MOV AX, BX', 'MOV BX, AX'], ['MOV AX, BX']),  # copy back

            # Constant propagation
            (['MOV AX, 5', 'ADD BX, AX'], ['MOV AX, 5', 'ADD BX, 5']),

            # Dead store elimination
            (['MOV AX, 5', 'MOV AX, 10'], ['MOV AX, 10']),  # overwritten value

            # Strength reduction
            (['MUL AX, 2'], ['ADD AX, AX']),  # multiply -> add
            (['MUL AX, 4'], ['SHL AX, 2']),   # multiply -> shift

            # Branch optimisation
            (['JMP LABEL1', 'LABEL1:'], []),  # jump to the next instruction
            (['JE LABEL1', 'JMP LABEL2', 'LABEL1:'], ['JNE LABEL2', 'LABEL1:']),  # inverted condition
        ]

    def optimize(self, instructions):
        """Return a new list with the patterns applied until none matches.

        The input list is not modified.
        """
        optimized = instructions[:]
        changed = True

        while changed:
            changed = False
            i = 0

            while i < len(optimized):
                for pattern, replacement in self.patterns:
                    if self.match_pattern(optimized, i, pattern):
                        optimized[i:i + len(pattern)] = replacement
                        changed = True
                        break

                # Matches skipped by this step are picked up by the next
                # full pass of the outer loop.
                i += 1

        return optimized

    def match_pattern(self, instructions, start, pattern):
        """Return True when *pattern* occurs in *instructions* at *start*."""
        end = start + len(pattern)
        if end > len(instructions):
            return False

        return all(instructions[start + offset] == expected
                   for offset, expected in enumerate(pattern))

数据流分析
python
class DataFlowAnalyzer:
    """Liveness and reaching-definition analysis over an instruction list.

    Each instruction is treated as its own node of the control-flow graph.
    ``InstructionNode`` and ``LabelNode`` are expected to be defined
    elsewhere in the module.
    """

    # Opcodes whose first operand is only read, never written.
    _NO_DEST_OPCODES = frozenset({
        'CMP', 'PUSH', 'JMP', 'JE', 'JNE', 'JG', 'CALL', 'INT', 'OUT',
    })
    # Opcodes that overwrite their first operand without reading it.
    _WRITE_ONLY_DEST_OPCODES = frozenset({'MOV', 'POP', 'IN'})

    def __init__(self):
        self.defs = {}      # index -> {'in', 'out'} sets of (variable, def index)
        self.uses = {}      # index -> variables read by that instruction
        self.liveness = {}  # index -> {'in', 'out'} sets of live variables

    def analyze(self, instructions):
        """Run both analyses and return their results.

        Returns:
            dict with keys 'liveness', 'reaching_defs' and 'uses'.
        """
        cfg = self.build_cfg(instructions)

        self.liveness_analysis(cfg, instructions)
        self.reaching_definitions(cfg, instructions)

        return {
            'liveness': self.liveness,
            'reaching_defs': self.defs,
            'uses': self.uses
        }

    def build_cfg(self, instructions):
        """Build a simplified CFG mapping each index to its successors.

        An instruction whose opcode starts with 'J' gets an edge to every
        matching label; everything except 'JMP' also falls through.
        """
        # Index all labels once instead of rescanning for every jump.
        label_positions = {}
        for index, instr in enumerate(instructions):
            if isinstance(instr, LabelNode):
                label_positions.setdefault(instr.name, []).append(index)

        cfg = {}
        last_index = len(instructions) - 1
        for index, instr in enumerate(instructions):
            successors = []
            is_instruction = isinstance(instr, InstructionNode)

            if is_instruction and instr.opcode.startswith('J'):
                target_label = instr.operands[0].value if instr.operands else None
                if target_label:
                    successors.extend(label_positions.get(target_label, []))

            if not (is_instruction and instr.opcode == 'JMP'):
                if index < last_index:
                    successors.append(index + 1)

            cfg[index] = successors

        return cfg

    def liveness_analysis(self, cfg, instructions):
        """Compute live-in/live-out sets by backward fixed-point iteration.

        Results go to ``self.liveness[i] = {'in': set, 'out': set}``.
        """
        n = len(instructions)
        use_sets = [self.get_uses(instr) for instr in instructions]
        def_sets = [self.get_defs(instr) for instr in instructions]
        live_in = [set() for _ in range(n)]
        live_out = [set() for _ in range(n)]

        changed = True
        while changed:
            changed = False

            # Reverse order converges faster for a backward problem.
            for i in range(n - 1, -1, -1):
                # live_out = union of live_in over all successors
                new_out = set()
                for succ in cfg.get(i, []):
                    new_out |= live_in[succ]

                # live_in = use ∪ (live_out - def)
                new_in = use_sets[i] | (new_out - def_sets[i])

                if new_in != live_in[i] or new_out != live_out[i]:
                    live_in[i] = new_in
                    live_out[i] = new_out
                    changed = True

        self.liveness.clear()
        for i in range(n):
            self.liveness[i] = {
                'in': live_in[i],
                'out': live_out[i]
            }

    def reaching_definitions(self, cfg, instructions):
        """Compute reaching definitions by forward fixed-point iteration.

        A definition is a ``(variable, instruction index)`` pair. Results
        go to ``self.defs[i] = {'in': set, 'out': set}``; ``self.uses[i]``
        receives the variables read by instruction *i*.
        """
        n = len(instructions)
        predecessors = [[] for _ in range(n)]
        for src, successors in cfg.items():
            for dst in successors:
                predecessors[dst].append(src)

        killed = [self.get_defs(instr) for instr in instructions]
        generated = [{(var, i) for var in killed[i]} for i in range(n)]
        reach_in = [set() for _ in range(n)]
        reach_out = [set(gen) for gen in generated]

        changed = True
        while changed:
            changed = False
            for i in range(n):
                incoming = set()
                for pred in predecessors[i]:
                    incoming |= reach_out[pred]

                # out = gen ∪ (in minus definitions of redefined variables)
                outgoing = generated[i] | {
                    d for d in incoming if d[0] not in killed[i]
                }

                if incoming != reach_in[i] or outgoing != reach_out[i]:
                    reach_in[i] = incoming
                    reach_out[i] = outgoing
                    changed = True

        self.defs.clear()
        self.uses.clear()
        for i, instr in enumerate(instructions):
            self.defs[i] = {'in': reach_in[i], 'out': reach_out[i]}
            self.uses[i] = self.get_uses(instr)

    def get_uses(self, instr):
        """Return the variables (registers/symbols) read by *instr*.

        The destination of a write-only opcode (MOV, POP, IN) is not a
        use. The base register of a register-indirect memory operand is.
        """
        uses = set()
        if not isinstance(instr, InstructionNode):
            return uses

        write_only_dest = instr.opcode in self._WRITE_ONLY_DEST_OPCODES
        for position, operand in enumerate(instr.operands):
            if operand.operand_type in ("REGISTER", "SYMBOL"):
                if position == 0 and write_only_dest:
                    continue
                uses.add(operand.value)
            elif (operand.operand_type == "MEMORY"
                  and str(operand.addressing_mode).startswith("REG_INDIRECT")):
                uses.add(operand.value["base"])

        return uses

    def get_defs(self, instr):
        """Return the variables written by *instr*.

        For opcodes that write a result, the first operand is the
        destination; memory destinations define no variable.
        """
        defs = set()
        if not isinstance(instr, InstructionNode):
            return defs
        if instr.opcode in self._NO_DEST_OPCODES or not instr.operands:
            return defs

        dest = instr.operands[0]
        if dest.operand_type in ("REGISTER", "SYMBOL"):
            defs.add(dest.value)

        return defs

三、链接器

目标文件格式
python
class ObjectFile:
    """In-memory relocatable object: sections, symbols and relocations."""

    def __init__(self):
        # Each section has its byte list, a load address and access flags.
        self.sections = {
            '.text': {'data': [], 'address': 0, 'flags': 'RX'},
            '.data': {'data': [], 'address': 0, 'flags': 'RW'},
            '.bss': {'data': [], 'address': 0, 'flags': 'RW'},
            '.rodata': {'data': [], 'address': 0, 'flags': 'R'},
        }

        self.symbols = {}      # symbol table: name -> info dict
        self.relocations = []  # relocation entries
        self.imports = []      # symbols expected from other objects
        self.exports = []      # symbols offered to other objects

    def add_section_data(self, section_name, data):
        """Append *data* to an existing section; unknown names are ignored."""
        if section_name in self.sections:
            self.sections[section_name]['data'].extend(data)

    def add_symbol(self, name, value, section, type='LOCAL'):
        """Define or replace symbol *name*.

        Args:
            name: symbol name.
            value: the symbol's value (the linker adds it to the section
                base address).
            section: name of the section the symbol belongs to.
            type: symbol kind; defaults to 'LOCAL'.
        """
        self.symbols[name] = {
            'value': value,
            'section': section,
            'type': type,
            'size': 0  # symbol size in bytes
        }

    def add_relocation(self, section, offset, symbol, type):
        """Record that *symbol* must be patched in at *offset* of *section*."""
        self.relocations.append({
            'section': section,
            'offset': offset,
            'symbol': symbol,
            'type': type
        })

链接器实现
python
class Linker:
    """Merge object files into one memory image with fixed section bases.

    Input objects must expose ``sections``, ``symbols``, ``relocations``,
    ``imports`` and ``exports`` in the shape used by ObjectFile.
    ``LinkerError`` is expected to be defined elsewhere in the module.
    """

    def __init__(self):
        self.objects = []            # input object files
        self.symbol_table = {}       # global (exported) symbols
        self.section_addresses = {}  # section base addresses
        self.output_sections = {}    # merged output sections
        # (object index, section name) -> offset of that object's data
        # inside the merged section.
        self._object_offsets = {}

        # Memory layout (simplified)
        self.layout = {
            '.text': 0x1000,    # code
            '.data': 0x2000,    # initialised data
            '.rodata': 0x3000,  # read-only data
            '.bss': 0x4000,     # uninitialised data
        }

    def add_object(self, obj_file):
        """Queue an object file for linking."""
        self.objects.append(obj_file)

    def link(self):
        """Run all link steps and return the executable description."""
        self.collect_symbols()      # 1. gather exported symbols
        self.resolve_symbols()      # 2. check every reference resolves
        self.assign_addresses()     # 3. merge sections, fix addresses
        self.apply_relocations()    # 4. patch references
        return self.generate_executable()  # 5. build the image

    def collect_symbols(self):
        """Rebuild the global symbol table from every object's exports.

        Raises:
            LinkerError: when two objects export the same name.
        """
        self.symbol_table.clear()

        for obj_idx, obj in enumerate(self.objects):
            for name, sym_info in obj.symbols.items():
                if sym_info['type'] != 'EXPORT' and name not in obj.exports:
                    continue

                if name in self.symbol_table:
                    raise LinkerError(f"符号重复定义: {name}")

                self.symbol_table[name] = {
                    'object': obj_idx,
                    'value': sym_info['value'],
                    'section': sym_info['section'],
                    'type': sym_info['type']
                }

    def resolve_symbols(self):
        """Verify that every relocation names a resolvable symbol.

        A symbol resolves when it is in the global table or is defined by
        the referencing object itself (and is not one of its imports).

        Raises:
            LinkerError: on an undefined or unresolvable symbol.
        """
        unresolved = set()

        for obj in self.objects:
            for reloc in obj.relocations:
                symbol = reloc['symbol']

                if symbol in self.symbol_table:
                    continue
                if symbol in obj.symbols and symbol not in obj.imports:
                    continue  # object-local definition

                if symbol in obj.imports:
                    unresolved.add(symbol)
                else:
                    raise LinkerError(f"未定义符号: {symbol}")

        # Try to satisfy imports from another object's export list.
        for symbol in list(unresolved):
            for obj_idx, obj in enumerate(self.objects):
                if symbol in obj.exports and symbol in obj.symbols:
                    sym_info = obj.symbols[symbol]
                    self.symbol_table[symbol] = {
                        'object': obj_idx,
                        'value': sym_info['value'],
                        'section': sym_info['section'],
                        'type': 'IMPORT'
                    }
                    unresolved.remove(symbol)
                    break

        if unresolved:
            raise LinkerError(f"无法解析的符号: {unresolved}")

    def assign_addresses(self):
        """Merge all input sections and compute final symbol addresses.

        Object data is appended in input order. Sections missing from
        ``layout`` get base address 0.
        """
        # Start from a clean slate so repeated link() calls do not
        # accumulate data.
        self.output_sections.clear()
        self._object_offsets = {}
        for section, base in self.layout.items():
            self.output_sections[section] = {
                'address': base,
                'data': [],
                'size': 0
            }

        for obj_idx, obj in enumerate(self.objects):
            for section_name, section_info in obj.sections.items():
                output_section = self.output_sections.setdefault(
                    section_name, {'address': 0, 'data': [], 'size': 0})

                # Where this object's contribution starts in the merge.
                base_offset = output_section['size']
                self._object_offsets[(obj_idx, section_name)] = base_offset

                data = section_info['data']
                output_section['data'].extend(data)
                output_section['size'] += len(data)

                for name, sym_info in obj.symbols.items():
                    if sym_info['section'] != section_name:
                        continue
                    sym_entry = self.symbol_table.get(name)
                    # Only the defining object may set the address.
                    if sym_entry is None or sym_entry['object'] != obj_idx:
                        continue
                    # final = section base + object offset + symbol offset
                    sym_entry['final_address'] = (
                        output_section['address'] + base_offset + sym_info['value'])

        # The .bss image is all zeros.
        bss = self.output_sections.get('.bss')
        if bss is not None:
            bss['data'] = [0] * bss['size']

    def _symbol_address(self, obj_idx, obj, symbol):
        """Return the final address of *symbol* as seen from *obj*, or None.

        The object's own definition takes precedence over the global table.
        """
        local = obj.symbols.get(symbol)
        if local is not None and symbol not in obj.imports:
            output_section = self.output_sections.get(local['section'])
            if output_section is not None:
                base_offset = self._object_offsets.get(
                    (obj_idx, local['section']), 0)
                return output_section['address'] + base_offset + local['value']

        sym_info = self.symbol_table.get(symbol)
        if sym_info is None:
            return None
        return sym_info.get('final_address', 0)

    def apply_relocations(self):
        """Patch every relocation site with its 32-bit little-endian value.

        ABS32 stores the symbol address; REL32 stores the distance from
        the end of the 4-byte field to the symbol. Unknown symbols or
        sections are skipped, as are bytes past the end of the section.
        """
        for obj_idx, obj in enumerate(self.objects):
            for reloc in obj.relocations:
                section = reloc['section']
                reloc_type = reloc['type']

                symbol_addr = self._symbol_address(obj_idx, obj, reloc['symbol'])
                if symbol_addr is None:
                    continue

                output_section = self.output_sections.get(section)
                if not output_section:
                    continue

                # Translate the object-relative offset into the merge.
                reloc_idx = (self._object_offsets.get((obj_idx, section), 0)
                             + reloc['offset'])
                reloc_addr = output_section['address'] + reloc_idx

                if reloc_type == 'ABS32':
                    value = symbol_addr
                elif reloc_type == 'REL32':
                    # relative = target - (address of the field + 4)
                    value = symbol_addr - (reloc_addr + 4)
                else:
                    continue

                data = output_section['data']
                for i in range(4):
                    if reloc_idx + i < len(data):
                        data[reloc_idx + i] = (value >> (i * 8)) & 0xFF

    def generate_executable(self):
        """Build the final memory image.

        Returns:
            dict with 'entry_point' (address of ``_start`` or, failing
            that, the .text base), 'memory_image' (address -> byte),
            'sections' and 'symbol_table'.
        """
        sorted_sections = sorted(
            self.output_sections.items(),
            key=lambda item: item[1]['address']
        )

        memory_image = {}
        for _section_name, section_info in sorted_sections:
            addr = section_info['address']
            for i, byte in enumerate(section_info['data']):
                memory_image[addr + i] = byte

        # The entry point is the _start label when one is exported.
        entry_point = None
        if '.text' in self.output_sections and '_start' in self.symbol_table:
            entry_point = self.symbol_table['_start'].get('final_address')

        return {
            'entry_point': entry_point or self.layout['.text'],
            'memory_image': memory_image,
            'sections': self.output_sections,
            'symbol_table': self.symbol_table
        }

posted @ 2026-01-20 21:12 头发少的文不识阅读(0) 评论(0) 收藏举报

刷新页面返回顶部

寒假学习笔记1.18

公告