|
@@ -196,6 +196,7 @@ ISBN:
|
|
|
|
|
|
%\listoftables
|
|
|
|
|
|
+
|
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
|
\chapter*{Preface}
|
|
|
\addcontentsline{toc}{fmbm}{Preface}
|
|
@@ -247,7 +248,7 @@ concepts and algorithms used in compilers.
|
|
|
the fundamental tools of compiler construction: \emph{abstract
|
|
|
syntax trees} and \emph{recursive functions}.
|
|
|
{\if\edition\pythonEd
|
|
|
-\item In Chapter~\ref{ch:parsing-Lvar} we learn how to use the Lark
|
|
|
+\item In Chapter~\ref{ch:parsing} we learn how to use the Lark
|
|
|
parser generator to create a parser for the language of integer
|
|
|
arithmetic and local variables. We learn about the parsing
|
|
|
algorithms inside Lark, including Earley and LALR(1).
|
|
@@ -307,14 +308,13 @@ programming, data structures and algorithms, and discrete
|
|
|
mathematics.
|
|
|
%
|
|
|
At the beginning of the course, students form groups of two to four
|
|
|
-people. The groups complete one chapter every two weeks, starting
|
|
|
-with chapter~\ref{ch:Lvar} and finishing with
|
|
|
-chapter~\ref{ch:Llambda}. Many chapters include a challenge problem
|
|
|
-that we assign to the graduate students. The last two weeks of the
|
|
|
+people. The groups complete approximately one chapter every two
|
|
|
+weeks, starting with chapter~\ref{ch:Lvar}. The last two weeks of the
|
|
|
course involve a final project in which students design and implement
|
|
|
a compiler extension of their choosing. The last few chapters can be
|
|
|
-used in support of these projects. For compiler courses at
|
|
|
-universities on the quarter system (about ten weeks in length), we
|
|
|
+used in support of these projects. Many chapters include a challenge
|
|
|
+problem that we assign to the graduate students. For compiler courses
|
|
|
+at universities on the quarter system (about ten weeks in length), we
|
|
|
recommend completing the course through chapter~\ref{ch:Lvec} or
|
|
|
chapter~\ref{ch:Lfun} and providing some scaffolding code to the
|
|
|
students for each compiler pass.
|
|
@@ -337,7 +337,6 @@ State University, Portland State University, Rose–Hulman Institute of
|
|
|
Technology, University of Freiburg, University of Massachusetts
|
|
|
Lowell, and the University of Vermont.
|
|
|
|
|
|
-
|
|
|
\begin{figure}[tp]
|
|
|
\begin{tcolorbox}[colback=white]
|
|
|
{\if\edition\racketEd
|
|
@@ -370,32 +369,35 @@ Lowell, and the University of Vermont.
|
|
|
\fi}
|
|
|
{\if\edition\pythonEd
|
|
|
\begin{tikzpicture}[baseline=(current bounding box.center)]
|
|
|
- \node (C1) at (0,1.5) {\small Ch.~\ref{ch:trees-recur} Preliminaries};
|
|
|
- \node (C2) at (4,1.5) {\small Ch.~\ref{ch:Lvar} Variables};
|
|
|
- \node (C3) at (8,1.5) {\small Ch.~\ref{ch:register-allocation-Lvar} Registers};
|
|
|
- \node (C4) at (0,0) {\small Ch.~\ref{ch:Lif} Conditionals};
|
|
|
- \node (C5) at (4,0) {\small Ch.~\ref{ch:Lvec} Tuples};
|
|
|
- \node (C6) at (8,0) {\small Ch.~\ref{ch:Lfun} Functions};
|
|
|
- \node (C9) at (0,-1.5) {\small Ch.~\ref{ch:Lwhile} Loops};
|
|
|
- \node (C8) at (4,-1.5) {\small Ch.~\ref{ch:Ldyn} Dynamic};
|
|
|
+ \node (Prelim) at (0,1.5) {\small Ch.~\ref{ch:trees-recur} Preliminaries};
|
|
|
+ \node (Var) at (4,1.5) {\small Ch.~\ref{ch:Lvar} Variables};
|
|
|
+ \node (Parse) at (8,1.5) {\small Ch.~\ref{ch:parsing} Parsing};
|
|
|
+ \node (Reg) at (0,0) {\small Ch.~\ref{ch:register-allocation-Lvar} Registers};
|
|
|
+ \node (Cond) at (4,0) {\small Ch.~\ref{ch:Lif} Conditionals};
|
|
|
+ \node (Loop) at (8,0) {\small Ch.~\ref{ch:Lwhile} Loops};
|
|
|
+ \node (Fun) at (0,-1.5) {\small Ch.~\ref{ch:Lfun} Functions};
|
|
|
+ \node (Tuple) at (4,-1.5) {\small Ch.~\ref{ch:Lvec} Tuples};
|
|
|
+ \node (Dyn) at (8,-1.5) {\small Ch.~\ref{ch:Ldyn} Dynamic};
|
|
|
% \node (CO) at (0,-3) {\small Ch.~\ref{ch:Lobject} Objects};
|
|
|
- \node (C7) at (8,-1.5) {\small Ch.~\ref{ch:Llambda} Lambda};
|
|
|
- \node (C10) at (4,-3) {\small Ch.~\ref{ch:Lgrad} Gradual Typing};
|
|
|
- \node (C11) at (8,-3) {\small Ch.~\ref{ch:Lpoly} Generics};
|
|
|
-
|
|
|
- \path[->] (C1) edge [above] node {} (C2);
|
|
|
- \path[->] (C2) edge [above] node {} (C3);
|
|
|
- \path[->] (C3) edge [above] node {} (C4);
|
|
|
- \path[->] (C4) edge [above] node {} (C5);
|
|
|
- \path[->,style=dotted] (C5) edge [above] node {} (C6);
|
|
|
- \path[->] (C5) edge [above] node {} (C7);
|
|
|
- \path[->] (C6) edge [above] node {} (C7);
|
|
|
- \path[->] (C4) edge [above] node {} (C8);
|
|
|
- \path[->] (C4) edge [above] node {} (C9);
|
|
|
- \path[->] (C7) edge [above] node {} (C10);
|
|
|
- \path[->] (C8) edge [above] node {} (C10);
|
|
|
-% \path[->] (C8) edge [above] node {} (CO);
|
|
|
- \path[->] (C10) edge [above] node {} (C11);
|
|
|
+ \node (Lam) at (0,-3) {\small Ch.~\ref{ch:Llambda} Lambda};
|
|
|
+ \node (Gradual) at (4,-3) {\small Ch.~\ref{ch:Lgrad} Gradual Typing};
|
|
|
+ \node (Generic) at (8,-3) {\small Ch.~\ref{ch:Lpoly} Generics};
|
|
|
+
|
|
|
+ \path[->] (Prelim) edge [above] node {} (Var);
|
|
|
+ \path[->] (Var) edge [above] node {} (Reg);
|
|
|
+ \path[->] (Var) edge [above] node {} (Parse);
|
|
|
+ \path[->] (Reg) edge [above] node {} (Cond);
|
|
|
+ \path[->] (Cond) edge [above] node {} (Tuple);
|
|
|
+ \path[->,style=dotted] (Tuple) edge [above] node {} (Fun);
|
|
|
+ \path[->] (Cond) edge [above] node {} (Fun);
|
|
|
+ \path[->] (Tuple) edge [above] node {} (Lam);
|
|
|
+ \path[->] (Fun) edge [above] node {} (Lam);
|
|
|
+ \path[->] (Cond) edge [above] node {} (Dyn);
|
|
|
+ \path[->] (Cond) edge [above] node {} (Loop);
|
|
|
+ \path[->] (Lam) edge [above] node {} (Gradual);
|
|
|
+ \path[->] (Dyn) edge [above] node {} (Gradual);
|
|
|
+% \path[->] (Dyn) edge [above] node {} (CO);
|
|
|
+ \path[->] (Gradual) edge [above] node {} (Generic);
|
|
|
\end{tikzpicture}
|
|
|
\fi}
|
|
|
\end{tcolorbox}
|
|
@@ -506,9 +508,11 @@ perform.\index{subject}{concrete syntax}\index{subject}{abstract
|
|
|
syntax}\index{subject}{abstract syntax
|
|
|
tree}\index{subject}{AST}\index{subject}{program}\index{subject}{parse}
|
|
|
The process of translating from concrete syntax to abstract syntax is
|
|
|
-called \emph{parsing}~\citep{Aho:2006wb}\python{ and is studied in
|
|
|
- chapter~\ref{ch:parsing-Lvar}}.
|
|
|
-\racket{This book does not cover the theory and implementation of parsing.}%
|
|
|
+called \emph{parsing}\python{ and is studied in
|
|
|
+ chapter~\ref{ch:parsing}}.
|
|
|
+\racket{This book does not cover the theory and implementation of parsing.
|
|
|
+ We refer readers who are interested in parsing to the thorough
|
|
|
+ treatment of the subject by \citet{Aho:2006wb}.}%
|
|
|
%
|
|
|
\racket{A parser is provided in the support code for translating from
|
|
|
concrete to abstract syntax.}%
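+{\if\edition\pythonEd
+To give a first taste of this translation, the following sketch (which
+uses only the Python standard library and is not necessarily how the
+support code performs parsing) asks Python's \code{ast} module to parse
+a short arithmetic expression:
+\begin{lstlisting}
+import ast
+# Translate concrete syntax (a string) into an abstract syntax tree.
+tree = ast.parse("1 + 2", mode='eval')
+print(ast.dump(tree))
+# prints (roughly): Expression(body=BinOp(left=Constant(value=1),
+#                     op=Add(), right=Constant(value=2)))
+\end{lstlisting}
+\fi}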
|
|
@@ -4090,23 +4094,23 @@ all, fast code is useless if it produces incorrect results!
|
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
|
{\if\edition\pythonEd
|
|
|
\chapter{Parsing}
|
|
|
-\label{ch:parsing-Lvar}
|
|
|
+\label{ch:parsing}
|
|
|
\setcounter{footnote}{0}
|
|
|
\index{subject}{parsing}
|
|
|
|
|
|
In this chapter we learn how to use the Lark parser
|
|
|
-generator~\citep{shinan20:_lark_docs} to translate the concrete syntax
|
|
|
+framework~\citep{shinan20:_lark_docs} to translate the concrete syntax
|
|
|
of \LangInt{} (a sequence of characters) into an abstract syntax tree.
|
|
|
You will then be asked to use Lark to create a parser for \LangVar{}.
|
|
|
-We then learn about the parsing algorithms used inside Lark, studying
|
|
|
-the \citet{Earley:1970ly} and LALR algorithms.
|
|
|
+We also describe the parsing algorithms used inside Lark, studying the
|
|
|
+\citet{Earley:1970ly} and LALR(1) algorithms.
|
|
|
|
|
|
-A parser generator takes in a specification of the concrete syntax and
|
|
|
-produces a parser. Even though a parser generator does most of the
|
|
|
-work for us, using one properly requires some knowledge. In
|
|
|
-particular, we must learn about the specification languages used by
|
|
|
-parser generators and we must learn how to deal with ambiguity in our
|
|
|
-language specifications.
|
|
|
+A parser framework such as Lark takes in a specification of the
|
|
|
+concrete syntax together with an input program and produces a parse tree. Even
|
|
|
+though a parser framework does most of the work for us, using one
|
|
|
+properly requires some knowledge. In particular, we must learn about
|
|
|
+its specification languages and we must learn how to deal with
|
|
|
+ambiguity in our language specifications.
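+For example, the following sketch (the grammar here is made up for
+illustration and is not part of the support code) uses Lark's
+\code{Lark} class and its \code{parse} method to turn a string into a
+parse tree:
+\begin{lstlisting}
+from lark import Lark
+
+# A tiny grammar for sums of integers, just for illustration.
+grammar = """
+exp: exp "+" exp   -> add
+   | INT           -> int
+%import common.INT
+%import common.WS
+%ignore WS
+"""
+parser = Lark(grammar, start='exp')   # Earley is the default algorithm
+tree = parser.parse("1 + 2")
+print(tree.pretty())
+\end{lstlisting}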
|
|
|
|
|
|
The process of parsing is traditionally subdivided into two phases:
|
|
|
\emph{lexical analysis} (also called scanning) and \emph{syntax
|
|
@@ -4119,16 +4123,16 @@ language. The reason for the subdivision into two phases is to enable
|
|
|
the use of a faster but less powerful algorithm for lexical analysis
|
|
|
and the use of a slower but more powerful algorithm for parsing.
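+As a sketch of what lexical analysis does (this tokenizer is
+hand-rolled for illustration; it is not how Lark implements its
+lexers), the following function splits a string of characters into a
+list of tokens using regular expressions:
+\begin{lstlisting}
+import re
+
+# Token kinds paired with the regular expression that matches them.
+token_spec = [('INT', r'[0-9]+'),
+              ('NAME', r'[A-Za-z_][A-Za-z_0-9]*'),
+              ('OP', r'[-+*/()=]'),
+              ('SKIP', r'\s+')]
+pattern = '|'.join('(?P<%s>%s)' % pair for pair in token_spec)
+
+def tokenize(text):
+    tokens = []
+    for m in re.finditer(pattern, text):
+        if m.lastgroup != 'SKIP':      # drop whitespace
+            tokens.append((m.lastgroup, m.group()))
+    return tokens
+
+print(tokenize("print(1 + 42)"))
+# [('NAME', 'print'), ('OP', '('), ('INT', '1'), ('OP', '+'),
+#  ('INT', '42'), ('OP', ')')]
+\end{lstlisting}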
|
|
|
%
|
|
|
-Likewise, parser generators typical come in pairs, with separate
|
|
|
-generators for the lexical analyzer (or lexer for short) and for the
|
|
|
-parser. A paricularly influential pair of generators were
|
|
|
-\texttt{lex} and \texttt{yacc}. The \texttt{lex} generator was written
|
|
|
-by \citet{Lesk:1975uq} at Bell Labs. The \texttt{yacc} generator was
|
|
|
-written by \citet{Johnson:1979qy} at AT\&T and stands for Yet Another
|
|
|
-Compiler Compiler.
|
|
|
-
|
|
|
-The Lark parse generator that we use in this chapter includes both a
|
|
|
-lexical analyzer and a parser. The next section discusses lexical
|
|
|
+%% Likewise, parser generators typically come in pairs, with separate
|
|
|
+%% generators for the lexical analyzer (or lexer for short) and for the
|
|
|
+%% parser. A particularly influential pair of generators were
|
|
|
+%% \texttt{lex} and \texttt{yacc}. The \texttt{lex} generator was written
|
|
|
+%% by \citet{Lesk:1975uq} at Bell Labs. The \texttt{yacc} generator was
|
|
|
+%% written by \citet{Johnson:1979qy} at AT\&T and stands for Yet Another
|
|
|
+%% Compiler Compiler.
|
|
|
+%
|
|
|
+The Lark parser framework that we use in this chapter includes both
|
|
|
+lexical analyzers and parsers. The next section discusses lexical
|
|
|
analysis and the remainder of the chapter discusses parsing.
|
|
|
|
|
|
|
|
@@ -4522,10 +4526,13 @@ section~\ref{sec:lalr} we learn about the LALR algorithm, which is
|
|
|
more efficient but can only handle a subset of the context-free
|
|
|
grammars.
|
|
|
|
|
|
-The Earley algorithm uses a data structure called a
|
|
|
-\emph{chart}\index{subject}{chart} to keep track of its progress. The
|
|
|
-chart is an array with one slot for each position in the input string,
|
|
|
-where position $0$ is before the first character and position $n$ is
|
|
|
+The Earley algorithm can be viewed as an interpreter; it treats the
|
|
|
+grammar as the program being interpreted and it treats the concrete
|
|
|
+syntax of the program-to-be-parsed as its input. The Earley algorithm
|
|
|
+uses a data structure called a \emph{chart}\index{subject}{chart} to
|
|
|
+keep track of its progress and to memoize its results. The chart is an
|
|
|
+array with one slot for each position in the input string, where
|
|
|
+position $0$ is before the first character and position $n$ is
|
|
|
immediately after the last character. So the array has length $n+1$
|
|
|
for an input string of length $n$. Each slot in the chart contains a
|
|
|
set of \emph{dotted rules}. A dotted rule is simply a grammar rule
|
|
@@ -4553,8 +4560,8 @@ grammar in figure~\ref{fig:Lint-lark-grammar}, we place
|
|
|
\begin{lstlisting}
|
|
|
lang_int: . stmt_list (0)
|
|
|
\end{lstlisting}
|
|
|
-in slot $0$ of the chart. The algorithm then proceeds to its
|
|
|
-\emph{prediction} phase in which it adds more dotted rules to the
|
|
|
+in slot $0$ of the chart. The algorithm then proceeds with
|
|
|
+\emph{prediction} actions in which it adds more dotted rules to the
|
|
|
chart based on which nonterminals come after a period. In the above,
|
|
|
the nonterminal \code{stmt\_list} appears after a period, so we add all
|
|
|
the rules for \code{stmt\_list} to slot $0$, with a period at the
|
|
@@ -4767,13 +4774,15 @@ use with even the largest of input files.
|
|
|
\section{The LALR(1) Algorithm}
|
|
|
\label{sec:lalr}
|
|
|
|
|
|
-The LALR(1) algorithm consists of a finite automata and a stack to
|
|
|
-record its progress in parsing the input string. Each element of the
|
|
|
-stack is a pair: a state number and a grammar symbol (a terminal or
|
|
|
-nonterminal). The symbol characterizes the input that has been parsed
|
|
|
-so-far and the state number is used to remember how to proceed once
|
|
|
-the next symbol-worth of input has been parsed. Each state in the
|
|
|
-finite automata represents where the parser stands in the parsing
|
|
|
+The LALR(1) algorithm can be viewed as a two-phase approach in which
|
|
|
+it first compiles the grammar into a state machine and then runs the
|
|
|
+state machine to parse the input string. The state machine also uses
|
|
|
+a stack to record its progress in parsing the input string. Each
|
|
|
+element of the stack is a pair: a state number and a grammar symbol (a
|
|
|
+terminal or nonterminal). The symbol characterizes the input that has
|
|
|
+been parsed so far and the state number is used to remember how to
|
|
|
+proceed once the next symbol's worth of input has been parsed. Each
|
|
|
+state in the machine represents where the parser stands in the parsing
|
|
|
process with respect to certain grammar rules. In particular, each
|
|
|
state is associated with a set of dotted rules.
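+To make the stack concrete, the following sketch (hypothetical, for
+illustration only; it is not Lark's implementation) represents it as a
+list of pairs and shows how a pair is pushed when a token is consumed
+and how pairs are popped when a grammar rule has been completed:
+\begin{lstlisting}
+# Each stack element pairs a state number with a grammar symbol.
+stack = [(0, None)]               # begin in the start state
+
+def shift(next_state, token):
+    # consume one input token and move to the state from the table
+    stack.append((next_state, token))
+
+def reduce(nonterminal, rhs_length, goto):
+    # pop one pair for each symbol on the rule's right-hand side,
+    # then use the exposed state to look up where to go next
+    for _ in range(rhs_length):
+        stack.pop()
+    state, _ = stack[-1]
+    stack.append((goto[(state, nonterminal)], nonterminal))
+\end{lstlisting}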
|
|
|
|
|
@@ -4797,7 +4806,7 @@ rule 1 with a period after the \code{PRINT} token and before the
|
|
|
\emph{item}. There are several rules that could apply next, namely rules
|
|
|
2 and 3, so state 1 also shows those rules with a period at the
|
|
|
beginning of their right-hand sides. The edges between states indicate
|
|
|
-which transitions the automata should make depending on the next input
|
|
|
+which transitions the machine should make depending on the next input
|
|
|
token. So, for example, if the next input token is \code{INT} then the
|
|
|
parser will push \code{INT} and the target state 4 on the stack and
|
|
|
transition to state 4. Suppose we are now at the end of the input. In
|
|
@@ -10155,7 +10164,7 @@ arguments may not be used at all. For example, consider the case for
|
|
|
the constant \TRUE{} in \code{explicate\_pred}, in which we discard the
|
|
|
\code{els} continuation.
|
|
|
%
|
|
|
- {\if\edition\racketEd
|
|
|
+{\if\edition\racketEd
|
|
|
The following example program falls into this
|
|
|
case, and it creates two unused blocks.
|
|
|
\begin{center}
|
|
@@ -10277,11 +10286,12 @@ return a \code{Goto} to the new label.
|
|
|
[else
|
|
|
(let ([label (gensym 'block)])
|
|
|
(set! basic-blocks (cons (cons label t) basic-blocks))
|
|
|
- (Goto label))]))
|
|
|
+ (Goto label))])))
|
|
|
\end{lstlisting}
|
|
|
\end{minipage}
|
|
|
\end{center}
|
|
|
\fi}
|
|
|
+
|
|
|
{\if\edition\pythonEd
|
|
|
%
|
|
|
Here is the new version of the \code{create\_block} auxiliary function
|
|
@@ -20663,6 +20673,7 @@ class TypeCheckLgrad(TypeCheckLlambda):
|
|
|
|
|
|
\fi}
|
|
|
|
|
|
+
|
|
|
\clearpage
|
|
|
|
|
|
\section{Interpreting \LangCast{}}
|
|
@@ -20780,7 +20791,7 @@ For the first \code{vector-set!}, the proxy casts a tagged \code{1}
|
|
|
from \CANYTY{} to \INTTY{}.
|
|
|
}
|
|
|
\python{
|
|
|
- For the subscript \code{v[i]} in \code{f([v[i])} of \code{map\_inplace},
|
|
|
+ For the subscript \code{v[i]} in \code{f(v[i])} of \code{map\_inplace},
|
|
|
the proxy casts the integer from \INTTY{} to \CANYTY{}.
|
|
|
For the subscript on the left of the assignment,
|
|
|
the proxy casts the tagged value from \CANYTY{} to \INTTY{}.
|