寒假学习笔记1.18
一、 编译器前端:词法分析与语法分析
- 词法分析器(Lexer)
词法单元定义
python
import re
from enum import Enum
class TokenType(Enum):
    """Kinds of lexical tokens recognised in the assembly-like source language."""

    # Identifiers and literal constants
    IDENTIFIER = 1
    INTEGER = 2
    HEX = 3
    STRING = 4

    # Instructions and pseudo-instructions
    INSTRUCTION = 10
    PSEUDO_OP = 11
    LABEL = 12

    # Registers
    REGISTER = 20

    # Punctuation
    COMMA = 30
    COLON = 31
    LBRACKET = 32
    RBRACKET = 33
    PLUS = 34
    MINUS = 35

    # Keywords
    SECTION = 40
    GLOBAL = 41
    EXTERN = 42

    # End markers
    EOF = 100
    NEWLINE = 101
词法分析实现
python
class Lexer:
    """Hand-written scanner that turns assembly source text into tokens.

    Each line break is reported as a ``NEWLINE`` token so that a parser can
    use it as a statement terminator; all other whitespace and ``;`` comments
    are skipped.

    ``Token``, ``TokenType`` and ``LexerError`` are not defined in this class;
    they are expected to be available at module level.
    """

    # Single-character punctuation mapped to the TokenType member name.
    # Names (not members) are stored so the class body does not need
    # TokenType at definition time.
    _PUNCTUATION = {
        ',': 'COMMA',
        ':': 'COLON',
        '[': 'LBRACKET',
        ']': 'RBRACKET',
        '+': 'PLUS',
        '-': 'MINUS',
    }

    # Escape sequences understood inside string literals; any other escaped
    # character stands for itself.
    _ESCAPES = {'n': '\n', 't': '\t', '"': '"', '\\': '\\'}

    _HEX_DIGITS = frozenset('0123456789abcdefABCDEF')

    def __init__(self, source_code):
        """Start scanning ``source_code`` at line 1, column 1."""
        self.source = source_code
        self.position = 0
        self.line = 1
        self.column = 1
        self.current_char = self.source[0] if self.source else None
        # Instruction mnemonics
        self.instructions = {
            'MOV', 'ADD', 'SUB', 'MUL', 'DIV', 'AND', 'OR', 'NOT',
            'PUSH', 'POP', 'CALL', 'RET', 'JMP', 'JE', 'JNE', 'JG',
            'CMP', 'INT', 'IRET', 'IN', 'OUT', 'HLT'
        }
        # Register names
        self.registers = {
            'AX', 'BX', 'CX', 'DX', 'SI', 'DI', 'BP', 'SP',
            'CS', 'DS', 'SS', 'ES', 'IP', 'FLAGS'
        }
        # Pseudo-instructions
        self.pseudo_ops = {
            'DB', 'DW', 'DD', 'DQ', 'TIMES', 'EQU', 'RESB', 'RESW',
            'SECTION', 'GLOBAL', 'EXTERN', 'BITS', 'ORG'
        }

    def advance(self):
        """Move forward one character, keeping ``line``/``column`` in sync.

        The line counter is bumped when a newline is *left*, so the newline
        itself is still reported on the line it terminates and the first
        character of the next line is at column 1.
        """
        if self.current_char is None:
            return  # already at end of input
        if self.current_char == '\n':
            self.line += 1
            self.column = 1
        else:
            self.column += 1
        self.position += 1
        if self.position >= len(self.source):
            self.current_char = None
        else:
            self.current_char = self.source[self.position]

    def peek(self):
        """Return the character after the current one without consuming it."""
        pos = self.position + 1
        return self.source[pos] if pos < len(self.source) else None

    def skip_whitespace(self):
        """Skip whitespace up to, but not including, the next newline."""
        while (self.current_char is not None
               and self.current_char != '\n'
               and self.current_char.isspace()):
            self.advance()

    def skip_comment(self):
        """Skip a ``;`` comment; the terminating newline is left in place."""
        if self.current_char == ';':
            while self.current_char is not None and self.current_char != '\n':
                self.advance()

    def get_next_token(self):
        """Return the next token, or an ``EOF`` token at end of input.

        Raises:
            LexerError: on a character that starts no known token.
        """
        while self.current_char is not None:
            ch = self.current_char
            if ch == '\n':
                # Build the token before advancing so it carries the
                # position of the newline itself.
                token = Token(TokenType.NEWLINE, '\n', self.line, self.column)
                self.advance()
                return token
            if ch.isspace():
                self.skip_whitespace()
                continue
            if ch == ';':
                self.skip_comment()
                continue
            # '.' is accepted so section names such as ".text" lex as
            # identifiers.
            if ch.isalpha() or ch == '_' or ch == '.':
                return self.get_identifier()
            if ch.isdigit():
                return self.get_number()
            if ch == '"':
                return self.get_string()
            member_name = self._PUNCTUATION.get(ch)
            if member_name is not None:
                token = Token(TokenType[member_name], ch, self.line, self.column)
                self.advance()
                return token
            raise LexerError(f"未知字符: {self.current_char}", self.line, self.column)
        return Token(TokenType.EOF, '', self.line, self.column)

    def get_identifier(self):
        """Scan an identifier and classify it.

        Instruction, pseudo-op and register names are matched
        case-insensitively and returned upper-cased; anything else is an
        ``IDENTIFIER`` with its original spelling.
        """
        start_pos = self.position
        start_line = self.line
        start_col = self.column
        while (self.current_char is not None
               and (self.current_char.isalnum() or self.current_char in ('_', '.'))):
            self.advance()
        value = self.source[start_pos:self.position]
        upper = value.upper()
        if upper in self.instructions:
            return Token(TokenType.INSTRUCTION, upper, start_line, start_col)
        if upper in self.pseudo_ops:
            return Token(TokenType.PSEUDO_OP, upper, start_line, start_col)
        if upper in self.registers:
            return Token(TokenType.REGISTER, upper, start_line, start_col)
        return Token(TokenType.IDENTIFIER, value, start_line, start_col)

    def get_number(self):
        """Scan a decimal or ``0x``-prefixed hexadecimal literal.

        The token value is the literal's source text (prefix included).

        Raises:
            LexerError: when ``0x`` is not followed by at least one hex digit.
        """
        start_pos = self.position
        start_line = self.line
        start_col = self.column
        next_char = self.peek()
        if self.current_char == '0' and next_char is not None and next_char.lower() == 'x':
            self.advance()  # skip '0'
            self.advance()  # skip 'x'
            digits_start = self.position
            while self.current_char is not None and self.current_char in self._HEX_DIGITS:
                self.advance()
            if self.position == digits_start:
                raise LexerError("无效的十六进制数", start_line, start_col)
            value = self.source[start_pos:self.position]
            return Token(TokenType.HEX, value, start_line, start_col)
        while self.current_char is not None and self.current_char.isdigit():
            self.advance()
        value = self.source[start_pos:self.position]
        return Token(TokenType.INTEGER, value, start_line, start_col)

    def get_string(self):
        """Scan a double-quoted string literal, decoding backslash escapes.

        Raises:
            LexerError: when the input ends before the closing quote.
        """
        start_line = self.line
        start_col = self.column
        self.advance()  # skip the opening quote
        chars = []
        while self.current_char is not None and self.current_char != '"':
            if self.current_char == '\\':
                self.advance()
                if self.current_char is None:
                    break  # dangling backslash at EOF: reported below
                chars.append(self._ESCAPES.get(self.current_char, self.current_char))
            else:
                chars.append(self.current_char)
            self.advance()
        if self.current_char != '"':
            raise LexerError("未终止的字符串", start_line, start_col)
        self.advance()  # skip the closing quote
        return Token(TokenType.STRING, ''.join(chars), start_line, start_col)
- 语法分析器(Parser)
抽象语法树节点
python
class ASTNode:
    """Base class for AST nodes; ``node_type`` is a string tag for the node kind."""

    def __init__(self, node_type):
        self.node_type = node_type

    def __repr__(self):
        return f"{self.__class__.__name__}()"


class ProgramNode(ASTNode):
    """Root node holding the list of top-level statements."""

    def __init__(self, statements):
        super().__init__("PROGRAM")
        self.statements = statements

    def __repr__(self):
        return f"ProgramNode({len(self.statements)} statements)"


class InstructionNode(ASTNode):
    """One instruction: its opcode, operand nodes and size (0 by default)."""

    def __init__(self, opcode, operands=None, size=0):
        super().__init__("INSTRUCTION")
        self.opcode = opcode
        self.operands = operands or []
        self.size = size

    def __repr__(self):
        operands_str = ', '.join(str(op) for op in self.operands)
        return f"InstructionNode({self.opcode} {operands_str})"


class LabelNode(ASTNode):
    """A label definition, identified by name."""

    def __init__(self, name):
        super().__init__("LABEL")
        self.name = name

    def __repr__(self):
        return f"LabelNode({self.name})"


class DirectiveNode(ASTNode):
    """A pseudo-instruction together with its argument list."""

    def __init__(self, directive, args=None):
        super().__init__("DIRECTIVE")
        self.directive = directive
        self.args = args or []

    def __repr__(self):
        args_str = ', '.join(str(arg) for arg in self.args)
        return f"DirectiveNode({self.directive} {args_str})"


class OperandNode(ASTNode):
    """An instruction operand: its kind, value and addressing mode."""

    def __init__(self, operand_type, value, addressing_mode=None):
        super().__init__("OPERAND")
        self.operand_type = operand_type
        self.value = value
        self.addressing_mode = addressing_mode

    def __repr__(self):
        return f"OperandNode({self.operand_type}, {self.value}, {self.addressing_mode})"
递归下降语法分析器
python
class Parser:
    """Recursive-descent parser that builds an AST from a lexer's tokens.

    While parsing it also tracks a running address and records each label's
    address in ``symbol_table``.

    ``TokenType``, ``ParserError`` and the AST node classes are not defined
    in this class; they are expected to be available at module level.
    """

    def __init__(self, lexer):
        """Prime the parser with the first token from ``lexer``."""
        self.lexer = lexer
        self.current_token = self.lexer.get_next_token()
        self.symbol_table = {}
        self.current_address = 0

    def eat(self, token_type):
        """Consume the current token if it has ``token_type``, then fetch the next.

        Raises:
            ParserError: when the current token has a different type.
        """
        if self.current_token.type == token_type:
            self.current_token = self.lexer.get_next_token()
        else:
            raise ParserError(
                f"期望 {token_type},但得到 {self.current_token.type}",
                self.current_token.line,
                self.current_token.column
            )

    def parse(self):
        """Parse the whole program and return a ``ProgramNode``."""
        statements = []
        while self.current_token.type != TokenType.EOF:
            # Blank lines carry no statement.
            if self.current_token.type == TokenType.NEWLINE:
                self.eat(TokenType.NEWLINE)
                continue
            statement = self.parse_statement()
            if statement:
                statements.append(statement)
        return ProgramNode(statements)

    def parse_statement(self):
        """Parse one statement: a label, a pseudo-instruction or an instruction.

        Raises:
            ParserError: when the current token cannot start a statement.
        """
        # A label is an identifier immediately followed by ':'.
        if (self.current_token.type == TokenType.IDENTIFIER
                and self._peek_token().type == TokenType.COLON):
            return self.parse_label()
        if self.current_token.type == TokenType.PSEUDO_OP:
            return self.parse_directive()
        if self.current_token.type == TokenType.INSTRUCTION:
            return self.parse_instruction()
        raise ParserError(
            f"无效语句开始: {self.current_token.value}",
            self.current_token.line,
            self.current_token.column
        )

    def parse_label(self):
        """Parse ``name:`` and record the label at the current address."""
        name = self.current_token.value
        self.eat(TokenType.IDENTIFIER)
        self.eat(TokenType.COLON)
        self.symbol_table[name] = self.current_address
        # A label occupies no space; the node is returned for the AST only.
        return LabelNode(name)

    def parse_instruction(self):
        """Parse an instruction with its comma-separated operands."""
        opcode = self.current_token.value
        self.eat(TokenType.INSTRUCTION)
        operands = []
        if self.current_token.type not in [TokenType.NEWLINE, TokenType.EOF]:
            operands.append(self.parse_operand())
            while self.current_token.type == TokenType.COMMA:
                self.eat(TokenType.COMMA)
                operands.append(self.parse_operand())
        # Simplified size estimate, used to advance the running address.
        size = self.calculate_instruction_size(opcode, operands)
        self.current_address += size
        if self.current_token.type == TokenType.NEWLINE:
            self.eat(TokenType.NEWLINE)
        return InstructionNode(opcode, operands, size)

    def parse_operand(self):
        """Parse a register, immediate, symbol or bracketed memory operand.

        Raises:
            ParserError: when the current token cannot start an operand.
        """
        # Register
        if self.current_token.type == TokenType.REGISTER:
            reg = self.current_token.value
            self.eat(TokenType.REGISTER)
            return OperandNode("REGISTER", reg, "REG_DIRECT")
        # Immediate; the value keeps its source text.
        if self.current_token.type in [TokenType.INTEGER, TokenType.HEX]:
            value = self.current_token.value
            token_type = self.current_token.type
            self.eat(token_type)
            # An immediate directly followed by '[' is re-read as the base
            # of a memory operand (simplified handling).
            if self.current_token.type == TokenType.LBRACKET:
                self.eat(TokenType.LBRACKET)
                return self.parse_memory_address(value)
            return OperandNode("IMMEDIATE", value, "IMMEDIATE")
        # Identifier: a label or other symbol.
        if self.current_token.type == TokenType.IDENTIFIER:
            value = self.current_token.value
            self.eat(TokenType.IDENTIFIER)
            if self.current_token.type == TokenType.LBRACKET:
                self.eat(TokenType.LBRACKET)
                return self.parse_memory_address(value)
            return OperandNode("SYMBOL", value, "SYMBOL")
        # Bracketed memory operand
        if self.current_token.type == TokenType.LBRACKET:
            self.eat(TokenType.LBRACKET)
            return self.parse_memory_address()
        raise ParserError(
            f"无效操作数: {self.current_token.value}",
            self.current_token.line,
            self.current_token.column
        )

    def parse_memory_address(self, base_value=None):
        """Parse the inside of ``[...]`` after the opening bracket was consumed.

        Args:
            base_value: base already read by the caller, or ``None`` to read
                the base from the token stream.

        Returns:
            An ``OperandNode`` of kind ``"MEMORY"`` whose value is
            ``{"base": ..., "offset": ...}``.

        Raises:
            ParserError: on a missing base, a non-numeric offset or a missing
                closing bracket.
        """
        if base_value is not None:
            base = base_value
        elif self.current_token.type == TokenType.REGISTER:
            base = self.current_token.value
            self.eat(TokenType.REGISTER)
        elif self.current_token.type in [TokenType.IDENTIFIER, TokenType.INTEGER, TokenType.HEX]:
            base = self.current_token.value
            self.eat(self.current_token.type)
        else:
            raise ParserError("无效内存地址", self.current_token.line, self.current_token.column)
        # Optional "+ n" / "- n" displacement
        offset = None
        if self.current_token.type in [TokenType.PLUS, TokenType.MINUS]:
            sign = 1 if self.current_token.type == TokenType.PLUS else -1
            self.eat(self.current_token.type)
            if self.current_token.type not in [TokenType.INTEGER, TokenType.HEX]:
                raise ParserError("偏移量必须是数字", self.current_token.line, self.current_token.column)
            # Base 0 lets int() honour the "0x" prefix of hex literals.
            offset_value = int(self.current_token.value,
                               0 if self.current_token.type == TokenType.HEX else 10)
            offset = sign * offset_value
            self.eat(self.current_token.type)
        self.eat(TokenType.RBRACKET)
        # Addressing mode depends on whether the base is a register.
        if isinstance(base, str) and base.upper() in self.lexer.registers:
            addressing_mode = "REG_INDIRECT"
            if offset is not None:
                addressing_mode = "REG_INDIRECT_DISPLACEMENT"
        else:
            addressing_mode = "MEM_DIRECT"
            if offset is not None:
                addressing_mode = "MEM_DIRECT_DISPLACEMENT"
        return OperandNode("MEMORY", {"base": base, "offset": offset}, addressing_mode)

    def parse_directive(self):
        """Parse a pseudo-instruction and its arguments up to end of line.

        Integer and hex arguments are converted to ``int``; strings and
        identifiers are kept as ``str``. Commas are accepted as separators.

        Raises:
            ParserError: on an argument token of any other type.
        """
        directive = self.current_token.value
        self.eat(TokenType.PSEUDO_OP)
        args = []
        while self.current_token.type not in [TokenType.NEWLINE, TokenType.EOF]:
            if self.current_token.type == TokenType.STRING:
                args.append(self.current_token.value)
                self.eat(TokenType.STRING)
            elif self.current_token.type == TokenType.INTEGER:
                args.append(int(self.current_token.value))
                self.eat(TokenType.INTEGER)
            elif self.current_token.type == TokenType.HEX:
                args.append(int(self.current_token.value, 16))
                self.eat(TokenType.HEX)
            elif self.current_token.type == TokenType.IDENTIFIER:
                args.append(self.current_token.value)
                self.eat(TokenType.IDENTIFIER)
            elif self.current_token.type == TokenType.COMMA:
                self.eat(TokenType.COMMA)
            else:
                raise ParserError(
                    f"无效伪指令参数: {self.current_token.value}",
                    self.current_token.line,
                    self.current_token.column
                )
        size = self.calculate_directive_size(directive, args)
        self.current_address += size
        if self.current_token.type == TokenType.NEWLINE:
            self.eat(TokenType.NEWLINE)
        return DirectiveNode(directive, args)

    def calculate_instruction_size(self, opcode, operands):
        """Estimate an instruction's size in bytes (simplified model).

        The opcode counts as 2 bytes, a register operand as 1, and an
        immediate, memory or symbol operand as 4; a register-indirect
        displacement adds 2 more. ``opcode`` itself is not consulted.
        """
        size = 2
        for operand in operands:
            if operand.operand_type == "REGISTER":
                size += 1
            elif operand.operand_type in ("IMMEDIATE", "SYMBOL"):
                # Symbol references are emitted as 4-byte placeholders.
                size += 4
            elif operand.operand_type == "MEMORY":
                size += 4
                if operand.addressing_mode == "REG_INDIRECT_DISPLACEMENT":
                    size += 2
        return size

    def calculate_directive_size(self, directive, args):
        """Return how many bytes a pseudo-instruction occupies.

        ``DB`` counts each string as its length plus a terminating zero and
        each other argument as one byte. ``DW``/``DD``/``DQ`` use 2/4/8
        bytes per argument. ``RESB``/``RESW`` reserve ``args[0]`` bytes or
        words. Every other directive occupies no space.
        """
        if directive == "DB":
            return sum(len(arg) + 1 if isinstance(arg, str) else 1 for arg in args)
        widths = {"DW": 2, "DD": 4, "DQ": 8}
        if directive in widths:
            return len(args) * widths[directive]
        if directive == "RESB":
            return args[0] if args else 0
        if directive == "RESW":
            return (args[0] if args else 0) * 2
        return 0

    def _peek_token(self, skip_newlines=False):
        """Return the token after the current one without consuming it.

        The lexer's scan state is saved and restored around the lookahead,
        also when the lexer raises.
        """
        lexer = self.lexer
        saved = (lexer.position, lexer.line, lexer.column, lexer.current_char)
        try:
            token = lexer.get_next_token()
            while skip_newlines and token.type == TokenType.NEWLINE:
                token = lexer.get_next_token()
            return token
        finally:
            lexer.position, lexer.line, lexer.column, lexer.current_char = saved

    def peek_next_non_newline_token(self):
        """Preview the next token after the current one, skipping newlines."""
        return self._peek_token(skip_newlines=True)
二、 编译器后端:代码生成与优化
- 代码生成器
机器码生成
python
class CodeGenerator:
    """Emit simplified machine code bytes from an AST.

    Bytes are collected per segment (code, data, bss) as lists of ints.
    References to symbols are emitted as 4-byte placeholders and patched in
    ``finalize_code`` from ``symbol_table``.

    ``CodeGenError`` and the AST node classes are not defined in this class;
    they are expected to be available at module level.

    NOTE(review): symbol addresses come from ``symbol_table`` as supplied by
    the caller, while relocation addresses use this generator's own running
    ``address``; the two only agree if both use the same size model —
    confirm against the producer of ``symbol_table``.
    """

    def __init__(self, symbol_table):
        self.symbol_table = symbol_table
        self.code_segment = []   # code segment bytes
        self.data_segment = []   # initialised data segment bytes
        self.bss_segment = []    # uninitialised data segment bytes
        self.current_segment = "CODE"
        self.address = 0         # running address across all emitted bytes
        self.relocation_table = []
        # Opcode table (simplified: one base byte per mnemonic)
        self.opcode_map = {
            'MOV': 0x88, 'ADD': 0x00, 'SUB': 0x28,
            'MUL': 0xF6, 'DIV': 0xF6,
            'PUSH': 0x50, 'POP': 0x58,
            'CALL': 0xE8, 'RET': 0xC3,
            'JMP': 0xE9, 'JE': 0x74, 'JNE': 0x75,
            'INT': 0xCD, 'HLT': 0xF4,
        }
        # Register encodings
        self.register_map = {
            'AX': 0b000, 'CX': 0b001, 'DX': 0b010, 'BX': 0b011,
            'SP': 0b100, 'BP': 0b101, 'SI': 0b110, 'DI': 0b111,
        }

    def generate(self, ast):
        """Generate code for every statement of ``ast`` and finalize it.

        Returns:
            The dictionary produced by ``finalize_code``.
        """
        for statement in ast.statements:
            if isinstance(statement, DirectiveNode):
                self.generate_directive(statement)
            elif isinstance(statement, InstructionNode):
                self.generate_instruction(statement)
            # Labels emit nothing: their addresses are already in the
            # symbol table.
        return self.finalize_code()

    def generate_instruction(self, instr):
        """Emit the bytes of one instruction into the active segment.

        An opcode missing from ``opcode_map`` is emitted as a single NOP
        byte (0x90).
        """
        opcode = instr.opcode
        operands = instr.operands
        # Instructions outside the code segment (e.g. inline data) go to
        # the data segment.
        segment = self.code_segment if self.current_segment == "CODE" else self.data_segment
        if opcode not in self.opcode_map:
            segment.append(0x90)
            self.address += 1
            return
        segment.append(self.opcode_map[opcode])
        self.address += 1
        if operands:
            self.generate_modrm(segment, opcode, operands)
        # Immediates, addresses and symbol references follow the ModR/M byte.
        for operand in operands:
            if operand.operand_type == "IMMEDIATE":
                self.generate_immediate(segment, operand.value)
            elif operand.operand_type == "MEMORY":
                self.generate_memory_operand(segment, operand)
            elif operand.operand_type == "SYMBOL":
                self.generate_symbol_reference(segment, operand.value)

    def generate_modrm(self, segment, opcode, operands):
        """Emit a simplified ModR/M byte; does nothing without operands.

        The first operand's register code goes to bits 3-5 and the second
        operand's to bits 0-2. Registers missing from ``register_map``
        encode as 0. ``opcode`` is not consulted.
        """
        if len(operands) == 0:
            return
        modrm = 0
        first = operands[0]
        if first.operand_type == "REGISTER":
            modrm |= self.register_map.get(first.value, 0) << 3
        if len(operands) >= 2:
            second = operands[1]
            if second.operand_type == "REGISTER":
                modrm |= self.register_map.get(second.value, 0)
            elif second.operand_type == "MEMORY":
                # Fixed r/m field for every memory operand.
                # NOTE(review): the original note says this assumes [SI];
                # in 16-bit x86 encoding r/m=110 with mod=00 selects a
                # direct 16-bit address instead — confirm intent.
                modrm |= 0b00000110
        segment.append(modrm)
        self.address += 1

    def generate_immediate(self, segment, value):
        """Emit a 4-byte little-endian immediate.

        A string naming a known symbol becomes an ``ABS32`` relocation with
        a zero placeholder; any other string is parsed as a number with
        automatic base detection.

        Raises:
            CodeGenError: when a string is neither a symbol nor a number.
        """
        if isinstance(value, str):
            if value in self.symbol_table:
                self._add_relocation(segment, 'ABS32', value)
                return
            try:
                number = int(value, 0)
            except ValueError as err:
                raise CodeGenError(f"未定义的符号: {value}") from err
        else:
            number = int(value)
        self._emit_le(segment, number)

    def generate_memory_operand(self, segment, operand):
        """Emit the 4-byte address of a direct memory operand.

        Handles ``MEM_DIRECT`` and ``MEM_DIRECT_DISPLACEMENT``; the
        displacement is added to the address, or stored as the relocation's
        addend when the base is a symbol.

        Raises:
            CodeGenError: for any other addressing mode, or when the base is
                neither a known symbol nor a number.
        """
        if operand.addressing_mode not in ("MEM_DIRECT", "MEM_DIRECT_DISPLACEMENT"):
            raise CodeGenError(f"不支持的寻址模式: {operand.addressing_mode}")
        base = operand.value['base']
        offset = operand.value['offset'] or 0
        if isinstance(base, str) and base in self.symbol_table:
            self._add_relocation(segment, 'ABS32', base, addend=offset)
            return
        if isinstance(base, str):
            try:
                address = int(base, 0)
            except ValueError as err:
                raise CodeGenError(f"未定义的符号: {base}") from err
        else:
            address = base
        self._emit_le(segment, address + offset)

    def generate_symbol_reference(self, segment, symbol):
        """Emit a ``REL32`` relocation placeholder for ``symbol``.

        Raises:
            CodeGenError: when ``symbol`` is not in the symbol table.
        """
        if symbol not in self.symbol_table:
            raise CodeGenError(f"未定义的符号: {symbol}")
        self._add_relocation(segment, 'REL32', symbol)

    def generate_directive(self, directive):
        """Handle a pseudo-instruction.

        ``SECTION`` switches the active segment by name prefix. ``DB``,
        ``DW``, ``DD`` and ``DQ`` emit data. ``RESB`` and ``RESW`` emit zero
        bytes. Other directives emit nothing.
        """
        name = directive.directive
        if name == "SECTION":
            section_name = directive.args[0] if directive.args else ".text"
            if section_name.startswith(".text"):
                self.current_segment = "CODE"
            elif section_name.startswith(".data"):
                self.current_segment = "DATA"
            elif section_name.startswith(".bss"):
                self.current_segment = "BSS"
            return
        segment = self.get_current_segment()
        if name == "DB":
            for arg in directive.args:
                if isinstance(arg, str):
                    # One value per character, then a zero terminator.
                    for char in arg:
                        segment.append(ord(char))
                        self.address += 1
                    segment.append(0)
                    self.address += 1
                else:
                    segment.append(arg & 0xFF)
                    self.address += 1
        elif name in ("DW", "DD", "DQ"):
            width = {"DW": 2, "DD": 4, "DQ": 8}[name]
            for arg in directive.args:
                number = int(arg, 0) if isinstance(arg, str) else arg
                self._emit_le(segment, number, width)
        elif name in ("RESB", "RESW"):
            count = directive.args[0] if directive.args else 0
            if name == "RESW":
                count *= 2
            segment.extend([0] * count)
            self.address += count

    def get_current_segment(self):
        """Return the byte list of the active segment."""
        if self.current_segment == "CODE":
            return self.code_segment
        elif self.current_segment == "DATA":
            return self.data_segment
        else:
            return self.bss_segment

    def finalize_code(self):
        """Patch all relocations and return the generated image.

        ``ABS32`` writes the symbol address. ``REL32`` writes the symbol
        address minus the address just past the 4-byte field. Relocations of
        any other type are left untouched.

        Returns:
            A dict with keys ``code``, ``data``, ``bss``, ``symbols`` and
            ``relocations``.

        Raises:
            CodeGenError: when a relocation names an unknown symbol.
        """
        symbol_addresses = dict(self.symbol_table)
        for reloc in self.relocation_table:
            symbol_addr = symbol_addresses.get(reloc['symbol'])
            if symbol_addr is None:
                raise CodeGenError(f"重定位失败: 未定义符号 {reloc['symbol']}")
            if reloc['type'] == 'ABS32':
                target_addr = symbol_addr
            elif reloc['type'] == 'REL32':
                target_addr = symbol_addr - (reloc['address'] + 4)
            else:
                continue
            target_addr += reloc.get('addend', 0)
            segment, offset = self._locate_relocation(reloc)
            # Masking yields two's-complement bytes for negative values.
            for i in range(4):
                segment[offset + i] = (target_addr >> (i * 8)) & 0xFF
        return {
            'code': self.code_segment,
            'data': self.data_segment,
            'bss': self.bss_segment,
            'symbols': symbol_addresses,
            'relocations': self.relocation_table
        }

    def _emit_le(self, segment, value, width=4):
        """Append ``value`` as ``width`` little-endian bytes."""
        for i in range(width):
            segment.append((value >> (i * 8)) & 0xFF)
        self.address += width

    def _segment_name(self, segment):
        """Return the segment name for a byte list, matched by identity."""
        if segment is self.code_segment:
            return "CODE"
        if segment is self.data_segment:
            return "DATA"
        if segment is self.bss_segment:
            return "BSS"
        return self.current_segment

    def _add_relocation(self, segment, reloc_type, symbol, addend=0):
        """Record a relocation at the end of ``segment`` and emit a placeholder."""
        self.relocation_table.append({
            'type': reloc_type,
            'address': self.address,
            'symbol': symbol,
            # Where the placeholder lives, so patching does not depend on
            # the global running address.
            'segment': self._segment_name(segment),
            'offset': len(segment),
            'addend': addend,
        })
        segment.extend([0x00, 0x00, 0x00, 0x00])
        self.address += 4

    def _locate_relocation(self, reloc):
        """Return ``(segment, offset)`` of the bytes a relocation patches."""
        segments = {
            'CODE': self.code_segment,
            'DATA': self.data_segment,
            'BSS': self.bss_segment,
        }
        if 'segment' in reloc and 'offset' in reloc:
            return segments[reloc['segment']], reloc['offset']
        # Entries without a recorded location: addresses below the code
        # size index the code segment, the rest index the data segment.
        if reloc['address'] < len(self.code_segment):
            return self.code_segment, reloc['address']
        return self.data_segment, reloc['address'] - len(self.code_segment)
优化器
窥孔优化
python
class PeepholeOptimizer:
    """Rewrite short instruction sequences using literal text patterns.

    Instructions are compared as whole strings, so a rule only fires on the
    exact spelling listed in ``patterns``.
    """

    def __init__(self):
        # Each entry is (sequence to match, replacement sequence).
        self.patterns = [
            # Redundant move elimination
            (['MOV AX, AX'], []),                              # self-move is a no-op
            (['MOV AX, BX', 'MOV BX, AX'], ['MOV AX, BX']),    # move-back elimination
            # Constant propagation
            (['MOV AX, 5', 'ADD BX, AX'], ['MOV AX, 5', 'ADD BX, 5']),
            # Dead code elimination
            (['MOV AX, 5', 'MOV AX, 10'], ['MOV AX, 10']),     # earlier value is overwritten
            # Strength reduction
            (['MUL AX, 2'], ['ADD AX, AX']),                   # multiply -> add
            (['MUL AX, 4'], ['SHL AX, 2']),                    # multiply -> shift
            # Branch optimisation
            # Only the jump is dropped; the label stays because other
            # jumps may still target it.
            (['JMP LABEL1', 'LABEL1:'], ['LABEL1:']),
            (['JE LABEL1', 'JMP LABEL2', 'LABEL1:'], ['JNE LABEL2', 'LABEL1:']),  # condition inversion
        ]

    def optimize(self, instructions):
        """Apply the patterns repeatedly until a full pass changes nothing.

        Args:
            instructions: sequence of instruction strings; it is not modified.

        Returns:
            A new list with all matching sequences rewritten.
        """
        optimized = list(instructions)
        changed = True
        while changed:
            changed = False
            i = 0
            while i < len(optimized):
                for pattern, replacement in self.patterns:
                    if self.match_pattern(optimized, i, pattern):
                        optimized[i:i + len(pattern)] = replacement
                        changed = True
                        break
                i += 1
        return optimized

    def match_pattern(self, instructions, start, pattern):
        """Return True when ``pattern`` occurs in ``instructions`` at ``start``.

        An empty pattern never matches; otherwise it would match everywhere
        and keep ``optimize`` from terminating.
        """
        if not pattern or start + len(pattern) > len(instructions):
            return False
        return list(instructions[start:start + len(pattern)]) == list(pattern)
数据流分析
python
class DataFlowAnalyzer:
    """Liveness and reaching-definitions analysis over a flat statement list.

    Every list index is treated as its own control-flow node. "Variables"
    are register names and symbol names taken from operand values.

    ``InstructionNode`` and ``LabelNode`` are not defined in this class;
    they are expected to be available at module level.
    """

    # Opcodes whose first operand is only read (jumps are matched by their
    # leading 'J' instead of being listed).
    _READ_ONLY_FIRST = frozenset({'CMP', 'PUSH', 'CALL', 'INT', 'OUT'})
    # Opcodes that overwrite their first operand without reading it.
    _WRITE_ONLY_FIRST = frozenset({'MOV', 'POP', 'IN'})
    # Opcodes after which control never continues with the next statement.
    _NO_FALLTHROUGH = frozenset({'JMP', 'RET', 'IRET'})

    def __init__(self):
        self.defs = {}      # reaching definitions per node
        self.uses = {}      # variables read per node
        self.liveness = {}  # live-in / live-out sets per node

    def analyze(self, instructions):
        """Run both analyses and return their results.

        Returns:
            A dict with keys ``liveness``, ``reaching_defs`` and ``uses``,
            each mapping a statement index to its result.
        """
        cfg = self.build_cfg(instructions)
        self.liveness_analysis(cfg, instructions)
        self.reaching_definitions(cfg, instructions)
        self.uses = {i: self.get_uses(instr) for i, instr in enumerate(instructions)}
        return {
            'liveness': self.liveness,
            'reaching_defs': self.defs,
            'uses': self.uses
        }

    def build_cfg(self, instructions):
        """Build a simplified control-flow graph.

        Returns:
            A dict mapping each statement index to its successor indices:
            jump targets first, then the fall-through successor if any.
        """
        # Index label positions once instead of rescanning for every jump.
        label_positions = {}
        for index, node in enumerate(instructions):
            if isinstance(node, LabelNode):
                label_positions.setdefault(node.name, []).append(index)
        cfg = {}
        last = len(instructions) - 1
        for i, instr in enumerate(instructions):
            successors = []
            is_instruction = isinstance(instr, InstructionNode)
            if is_instruction and instr.opcode.startswith('J'):
                target_label = instr.operands[0].value if instr.operands else None
                if target_label:
                    successors.extend(label_positions.get(target_label, []))
            falls_through = not (is_instruction and instr.opcode in self._NO_FALLTHROUGH)
            if falls_through and i < last:
                successors.append(i + 1)
            cfg[i] = successors
        return cfg

    def liveness_analysis(self, cfg, instructions):
        """Compute live-in/live-out sets by backward iteration to a fixpoint.

        Results are stored in ``self.liveness[i]`` as
        ``{'in': set, 'out': set}``.
        """
        n = len(instructions)
        live_in = [set() for _ in range(n)]
        live_out = [set() for _ in range(n)]
        use_sets = [self.get_uses(instr) for instr in instructions]
        def_sets = [self.get_defs(instr) for instr in instructions]
        changed = True
        while changed:
            changed = False
            # Visiting nodes in reverse order speeds up convergence.
            for i in range(n - 1, -1, -1):
                # live_out = union of live_in over all successors
                new_out = set()
                for succ in cfg.get(i, []):
                    new_out.update(live_in[succ])
                # live_in = use ∪ (live_out − def)
                new_in = use_sets[i] | (new_out - def_sets[i])
                if new_in != live_in[i] or new_out != live_out[i]:
                    live_in[i] = new_in
                    live_out[i] = new_out
                    changed = True
        for i in range(n):
            self.liveness[i] = {
                'in': live_in[i],
                'out': live_out[i]
            }

    def reaching_definitions(self, cfg, instructions):
        """Compute reaching definitions by forward iteration to a fixpoint.

        A definition is a ``(variable, statement_index)`` pair. Results are
        stored in ``self.defs[i]`` as ``{'in': set, 'out': set}``.
        """
        n = len(instructions)
        predecessors = {i: [] for i in range(n)}
        for node, successors in cfg.items():
            for succ in successors:
                predecessors.setdefault(succ, []).append(node)
        generated = [
            {(var, i) for var in self.get_defs(instr)}
            for i, instr in enumerate(instructions)
        ]
        reach_in = [set() for _ in range(n)]
        reach_out = [set(gen) for gen in generated]
        changed = True
        while changed:
            changed = False
            for i in range(n):
                new_in = set()
                for pred in predecessors.get(i, []):
                    new_in.update(reach_out[pred])
                # A new definition of a variable kills all older ones.
                redefined = {var for var, _ in generated[i]}
                survivors = {d for d in new_in if d[0] not in redefined}
                new_out = generated[i] | survivors
                if new_in != reach_in[i] or new_out != reach_out[i]:
                    reach_in[i] = new_in
                    reach_out[i] = new_out
                    changed = True
        for i in range(n):
            self.defs[i] = {
                'in': reach_in[i],
                'out': reach_out[i]
            }

    def get_uses(self, instr):
        """Return the set of variables read by ``instr``.

        Register and symbol operands count as reads, except the first
        operand of a write-only opcode. The base register of a
        register-indirect memory operand counts as a read as well.
        """
        uses = set()
        if not isinstance(instr, InstructionNode):
            return uses
        for index, operand in enumerate(instr.operands):
            kind = operand.operand_type
            if kind in ("REGISTER", "SYMBOL"):
                if index == 0 and instr.opcode in self._WRITE_ONLY_FIRST:
                    continue
                uses.add(operand.value)
            elif kind == "MEMORY":
                mode = operand.addressing_mode or ""
                if mode.startswith("REG_INDIRECT") and isinstance(operand.value, dict):
                    uses.add(operand.value.get('base'))
        return uses

    def get_defs(self, instr):
        """Return the set of variables written by ``instr``.

        The first operand is taken as the destination unless the opcode
        only reads it (comparisons, pushes, calls, jumps, ...).
        """
        if not isinstance(instr, InstructionNode) or not instr.operands:
            return set()
        opcode = instr.opcode
        if opcode.startswith('J') or opcode in self._READ_ONLY_FIRST:
            return set()
        dest = instr.operands[0]
        if dest.operand_type in ("REGISTER", "SYMBOL"):
            return {dest.value}
        return set()
三、 链接器
- 目标文件格式
python
class ObjectFile:
    """In-memory object file: section bytes, symbols and relocations."""

    def __init__(self):
        # Each section has a byte list, an address and a permission string.
        self.sections = {
            '.text': {'data': [], 'address': 0, 'flags': 'RX'},
            '.data': {'data': [], 'address': 0, 'flags': 'RW'},
            '.bss': {'data': [], 'address': 0, 'flags': 'RW'},
            '.rodata': {'data': [], 'address': 0, 'flags': 'R'},
        }
        self.symbols = {}      # symbol table
        self.relocations = []  # relocation entries
        self.imports = []      # imported symbol names
        self.exports = []      # exported symbol names

    def add_section_data(self, section_name, data):
        """Append ``data`` to a section; unknown section names are ignored."""
        section = self.sections.get(section_name)
        if section is not None:
            section['data'].extend(data)

    def add_symbol(self, name, value, section, type='LOCAL'):
        """Add or replace a symbol; its recorded size is always 0."""
        self.symbols[name] = {
            'value': value,
            'section': section,
            'type': type,
            'size': 0  # symbol size in bytes
        }

    def add_relocation(self, section, offset, symbol, type):
        """Append a relocation entry for ``symbol`` at ``offset`` in ``section``."""
        self.relocations.append({
            'section': section,
            'offset': offset,
            'symbol': symbol,
            'type': type
        })
链接器实现
python
class Linker:
    """Merge object files into one memory image at fixed section addresses.

    Objects are expected to expose ``sections``, ``symbols``,
    ``relocations``, ``imports`` and ``exports``. ``LinkerError`` is not
    defined in this class; it is expected to be available at module level.
    """

    def __init__(self):
        self.objects = []            # input object files
        self.symbol_table = {}       # global symbol table
        self.section_addresses = {}  # section base addresses
        self.output_sections = {}    # merged output sections
        # Memory layout (simplified)
        self.layout = {
            '.text': 0x1000,    # code
            '.data': 0x2000,    # initialised data
            '.rodata': 0x3000,  # read-only data
            '.bss': 0x4000,     # uninitialised data
        }
        # Per object: where each of its sections starts inside the merged
        # section, and the final address of each symbol it defines.
        self._object_offsets = []
        self._object_symbols = []

    def add_object(self, obj_file):
        """Add an object file to the link."""
        self.objects.append(obj_file)

    def link(self):
        """Run all link steps and return the executable image."""
        self.collect_symbols()     # 1. gather exported symbols
        self.resolve_symbols()     # 2. check every reference has a definition
        self.assign_addresses()    # 3. merge sections, fix symbol addresses
        self.apply_relocations()   # 4. patch references
        return self.generate_executable()  # 5. build the image

    def collect_symbols(self):
        """Rebuild the global symbol table from all exported symbols.

        Raises:
            LinkerError: when two objects export the same name.
        """
        self.symbol_table.clear()
        for obj_idx, obj in enumerate(self.objects):
            for name, sym_info in obj.symbols.items():
                if sym_info['type'] != 'EXPORT' and name not in obj.exports:
                    continue
                if name in self.symbol_table:
                    raise LinkerError(f"符号重复定义: {name}")
                self.symbol_table[name] = {
                    'object': obj_idx,
                    'value': sym_info['value'],
                    'section': sym_info['section'],
                    'type': sym_info['type']
                }

    def resolve_symbols(self):
        """Check that every relocation refers to a defined symbol.

        A symbol is resolved when it is in the global table or defined by
        the referencing object itself. Declared imports that are still
        missing are looked up among the other objects' exports.

        Raises:
            LinkerError: for an undefined or unresolvable symbol.
        """
        unresolved = set()
        for obj in self.objects:
            for reloc in obj.relocations:
                symbol = reloc['symbol']
                if symbol in self.symbol_table or self._defines_locally(obj, symbol):
                    continue
                if symbol in obj.imports:
                    unresolved.add(symbol)
                else:
                    raise LinkerError(f"未定义符号: {symbol}")
        for symbol in list(unresolved):
            for obj_idx, obj in enumerate(self.objects):
                sym_info = obj.symbols.get(symbol)
                if symbol in obj.exports and sym_info is not None:
                    self.symbol_table[symbol] = {
                        'object': obj_idx,
                        'value': sym_info['value'],
                        'section': sym_info['section'],
                        'type': 'IMPORT'
                    }
                    unresolved.remove(symbol)
                    break
        if unresolved:
            raise LinkerError(f"无法解析的符号: {unresolved}")

    def assign_addresses(self):
        """Merge sections and compute each symbol's final address.

        A symbol's final address is the output section base, plus the start
        of its object's contribution inside that section, plus the symbol's
        own offset.
        """
        # Start from a clean slate so repeated links do not accumulate data.
        self.output_sections.clear()
        for section, base in self.layout.items():
            self.output_sections[section] = {
                'address': base,
                'data': [],
                'size': 0
            }
        self._object_offsets = []
        self._object_symbols = []
        for obj_idx, obj in enumerate(self.objects):
            offsets = {}
            for section_name, section_info in obj.sections.items():
                # Sections outside the layout are placed at address 0.
                output_section = self.output_sections.setdefault(
                    section_name, {'address': 0, 'data': [], 'size': 0})
                offsets[section_name] = output_section['size']
                data = section_info['data']
                output_section['data'].extend(data)
                output_section['size'] += len(data)
            addresses = {}
            for name, sym_info in obj.symbols.items():
                section_name = sym_info['section']
                if section_name not in offsets:
                    continue
                final_addr = (self.output_sections[section_name]['address']
                              + offsets[section_name]
                              + sym_info['value'])
                addresses[name] = final_addr
                entry = self.symbol_table.get(name)
                # Only the defining object may set a global's address.
                if entry is not None and entry.get('object') == obj_idx:
                    entry['final_address'] = final_addr
            self._object_offsets.append(offsets)
            self._object_symbols.append(addresses)
        # The BSS section is materialised as zero bytes.
        bss = self.output_sections.get('.bss')
        if bss is not None:
            bss['data'] = [0] * bss['size']

    def apply_relocations(self):
        """Patch every relocation in the merged section data.

        ``ABS32`` stores the symbol address. ``REL32`` stores the symbol
        address minus the address just past the 4-byte field. Relocations
        with an unknown symbol, section or type are skipped, as are bytes
        that would fall outside the section data.
        """
        object_offsets = getattr(self, '_object_offsets', [])
        object_symbols = getattr(self, '_object_symbols', [])
        for obj_idx, obj in enumerate(self.objects):
            offsets = object_offsets[obj_idx] if obj_idx < len(object_offsets) else {}
            own = object_symbols[obj_idx] if obj_idx < len(object_symbols) else {}
            for reloc in obj.relocations:
                symbol = reloc['symbol']
                # An object's own definition takes precedence over a
                # global of the same name.
                if symbol in own and symbol not in obj.imports:
                    symbol_addr = own[symbol]
                elif symbol in self.symbol_table:
                    symbol_addr = self.symbol_table[symbol].get('final_address', 0)
                else:
                    continue
                output_section = self.output_sections.get(reloc['section'])
                if not output_section:
                    continue
                reloc_idx = offsets.get(reloc['section'], 0) + reloc['offset']
                reloc_addr = output_section['address'] + reloc_idx
                if reloc['type'] == 'ABS32':
                    patch = symbol_addr
                elif reloc['type'] == 'REL32':
                    patch = symbol_addr - (reloc_addr + 4)
                else:
                    continue
                data = output_section['data']
                for i in range(4):
                    if reloc_idx + i < len(data):
                        data[reloc_idx + i] = (patch >> (i * 8)) & 0xFF

    def generate_executable(self):
        """Build the final image.

        Returns:
            A dict with ``entry_point`` (address of ``_start`` when known,
            else the ``.text`` base), ``memory_image`` (address -> byte),
            ``sections`` and ``symbol_table``.
        """
        sorted_sections = sorted(
            self.output_sections.items(),
            key=lambda item: item[1]['address']
        )
        memory_image = {}
        for _section_name, section_info in sorted_sections:
            base = section_info['address']
            for i, byte in enumerate(section_info['data']):
                memory_image[base + i] = byte
        entry_point = None
        if '.text' in self.output_sections and '_start' in self.symbol_table:
            entry_point = self.symbol_table['_start'].get('final_address')
        if entry_point is None:
            entry_point = self.layout['.text']
        return {
            'entry_point': entry_point,
            'memory_image': memory_image,
            'sections': self.output_sections,
            'symbol_table': self.symbol_table
        }

    @staticmethod
    def _defines_locally(obj, symbol):
        """Return True when ``obj`` itself defines ``symbol``."""
        return symbol in obj.symbols and symbol not in obj.imports
浙公网安备 33010602011771号