Cornell cs3110 - Chapter9 Lessons

使用 Menhir 构建 SimPL 的编译器

Lexer and Parser 语法分析模块

Lexer, Parser, AST 是三个依次耦合的模块,可以这么描述三者的关系:

Lexer ---tokens--> Parser ---nodes--> AST

与上图的数据流方向相反,cs3110 按 AST → Parser → Lexer 的顺序,反过来构建整个 Lexer 和 Parser 的结构

ast.ml 中,定义了 AST 上节点 node 的类型:

(** The binary operators of SimPL. *)
type bop = 
  | Add (** integer addition, produced for the PLUS token *)
  | Mul (** integer multiplication, produced for the TIMES token *)
  | Leq (** integer less-than-or-equal comparison, produced for the LEQ token *)

(** The abstract syntax tree of a SimPL expression. *)
type expr = 
  | Var of string (** a variable, identified by its name *)
  | Int of int (** an integer literal *)
  | Bool of bool (** a boolean literal *)
  | Binop of bop * expr * expr (** [Binop (op, e1, e2)] is [e1 op e2] *)
  | If of expr * expr * expr (** [If (e1, e2, e3)] is [if e1 then e2 else e3] *)
  | Let of string * expr * expr (** [Let (x, e1, e2)] is [let x = e1 in e2] *)

parser.mly 中构建的 parser 接受 TOKEN 序列,并将其翻译成 AST 上的节点

(* Header: bring the AST constructors into scope for the semantic actions. *)
%{
open Ast
%}

(* Tokens that carry a value. *)
%token <int> INT
%token <string> ID

(* Boolean literals. *)
%token TRUE FALSE

(* Operators and parentheses. *)
%token LEQ TIMES PLUS
%token LPAREN RPAREN

(* Keywords and the binding sign. *)
%token LET EQUALS IN
%token IF THEN ELSE

(* End of input. *)
%token EOF

(* Precedence and associativity, listed from lowest to highest precedence:
   IN and ELSE bind loosest, so the body of a [let] or the [else] branch of an
   [if] extends as far as possible; the binary operators are left-associative,
   with TIMES binding tighter than PLUS, and PLUS tighter than LEQ. *)
%nonassoc IN
%nonassoc ELSE
%left LEQ
%left PLUS
%left TIMES

(* Start symbol: parsing begins at [prog] and yields an [Ast.expr]. *)
%start <Ast.expr> prog

%%

(* A program is a single expression followed by the end of input. *)
prog:
    | ast = expr; EOF { ast }
    ;

(* Expressions. Each production builds the matching [Ast.expr] node.
   The terminals used here must be the tokens declared with %token
   (PLUS, TIMES, LEQ, ...); [Add], [Mul] and [Leq] are AST constructors and
   may only appear inside the semantic actions. Ambiguities between the
   binary operators, [let] and [if] are resolved by the precedence
   declarations in the declaration section. *)
expr:
    | i = INT { Int i } (* integer literal *)
    | x = ID { Var x } (* variable *)
    | TRUE { Bool true } (* boolean literals *)
    | FALSE { Bool false }
    | e1 = expr; PLUS; e2 = expr { Binop (Add, e1, e2) } (* binary operators *)
    | e1 = expr; TIMES; e2 = expr { Binop (Mul, e1, e2) }
    | e1 = expr; LEQ; e2 = expr { Binop (Leq, e1, e2) }
    | LET; x = ID; EQUALS; e1 = expr; IN; e2 = expr { Let (x, e1, e2) } (* let expressions *)
    | IF; e1 = expr; THEN; e2 = expr; ELSE; e3 = expr { If (e1, e2, e3) } (* if expressions *)
    | LPAREN; e = expr; RPAREN { e } (* parentheses *)
    ;

lexer.mll 接受输入的字符序列(也即从字符串角度看代码)并将其划分成一个个词素,返回 parser.mly 中早就定义好的 TOKEN

{
    open Parser
}

(* Named regular expressions used by the [read] rule. *)

(* A run of spaces and/or tabs. *)
let white = (' ' | '\t')+
(* A single decimal digit. *)
let digit = ['0'-'9']
(* An integer literal: an optional leading minus sign, then digits. *)
let int = '-'? digit+
(* A single ASCII letter. *)
let letter = ['A'-'Z' 'a'-'z']
(* An identifier: a letter followed by any number of letters or digits. *)
let id = letter (letter | digit)*

(* [read lexbuf] returns the next token found in [lexbuf].
   The keyword rules are listed before [id]: a keyword also matches [id]
   with the same length, and on such a tie the earlier rule is selected. *)
rule read = parse
    (* Blanks produce no token: lex again from the current position. *)
    | white { read lexbuf }
    (* Operators and punctuation. *)
    | "<=" { LEQ }
    | '*' { TIMES }
    | '+' { PLUS }
    | '(' { LPAREN }
    | ')' { RPAREN }
    | '=' { EQUALS }
    (* Keywords and boolean literals. *)
    | "true" { TRUE }
    | "false" { FALSE }
    | "let" { LET }
    | "in" { IN }
    | "if" { IF }
    | "then" { THEN }
    | "else" { ELSE }
    (* Identifiers carry the matched text. *)
    | id as name { ID name }
    (* Integer literals carry the matched text converted with [int_of_string]. *)
    | int as digits { INT (int_of_string digits) }
    | eof { EOF }

对于 Menhir 工具来说,这样的构建顺序是合理的;毕竟看上去 lexer.mll 中定义 TOKEN 并不是那么方便 😃

语义分析模块

考虑小步语义风格,这方便我们在后面加入类型检查:

open Ast

(** [parse s] turns the source text [s] into its AST by running the
    [Parser.prog] entry point over a lexing buffer built from [s], with
    [Lexer.read] supplying the tokens. *)
let parse (s : string) : expr =
  s |> Lexing.from_string |> Parser.prog Lexer.read

(** [is_value e] is [true] exactly when [e] is an integer or boolean
    literal, i.e. an expression that cannot be evaluated any further. *)
let is_value = function
  | Int _ | Bool _ -> true
  | Var _ | Binop _ | If _ | Let _ -> false

(** [subst e v x] is intended to be [e{v/x}], the substitution of [v] for
    [x] in [e]. It is currently a placeholder: all three arguments are
    ignored and the call always fails.
    @raise Failure always, with the message "See next section". *)
let subst _e _v _x = failwith "See next section"

(** [step e] is the [-->] relation, that is, a single step of evaluation
    of [e].
    - A binary operation steps its left operand first, then its right
      operand, and is computed by [step_bop] once both are values.
    - [Let (x, e1, e2)] steps [e1] until it is a value, then substitutes it
      for [x] in [e2] with [subst].
    - [If (e1, e2, e3)] steps the guard [e1] until it is a boolean, then
      selects [e2] or [e3].
    @raise Failure if [e] is already a value, is an unbound variable, or
    has an integer as the guard of an [if]. *)
let rec step : expr -> expr = function
  | Int _ | Bool _ -> failwith "Does not step"
  | Var _ -> failwith "Unbound variable"
  | Binop (bop, e1, e2) when is_value e1 && is_value e2 ->
    step_bop bop e1 e2
  | Binop (bop, e1, e2) when is_value e1 ->
    Binop (bop, e1, step e2)
  | Binop (bop, e1, e2) -> Binop (bop, step e1, e2)
  | Let (x, e1, e2) when is_value e1 -> subst e2 e1 x
  | Let (x, e1, e2) -> Let (x, step e1, e2)
  | If (Bool true, e2, _) -> e2
  | If (Bool false, _, e3) -> e3
  | If (Int _, _, _) -> failwith "Guard of if must have type bool"
  | If (e1, e2, e3) -> If (step e1, e2, e3)

(** [step_bop bop v1 v2] implements the primitive operation
    [v1 bop v2].  Requires: [v1] and [v2] are both values.
    The multiplication constructor is [Mul], as declared in [Ast.bop].
    @raise Failure if the operands are not both integers. *)
and step_bop bop e1 e2 = match bop, e1, e2 with
  | Add, Int a, Int b -> Int (a + b)
  | Mul, Int a, Int b -> Int (a * b)
  | Leq, Int a, Int b -> Bool (a <= b)
  | _ -> failwith "Operator and operand type mismatch"
posted @ 2024-10-08 22:06  sysss  阅读(44)  评论(0)    收藏  举报