Browse Source

more explanation in parsing

Jeremy Siek 2 years ago
parent
commit
dbd950a748
3 changed files with 89 additions and 95 deletions
  1. 1 1
      Makefile
  2. 4 21
      book.bib
  3. 84 73
      book.tex

+ 1 - 1
Makefile

@@ -4,7 +4,7 @@
 LATEXMK= latexmk -pdf
 LATEXMK= latexmk -pdf
 
 
 all:
 all:
-	$(LATEXMK) book
+	$(LATEXMK) -f book
 
 
 cont: continuous
 cont: continuous
 continuous:
 continuous:

+ 4 - 21
book.bib

@@ -1,36 +1,22 @@
 @book{Tomita:1985qr,
 @book{Tomita:1985qr,
-	address = {Norwell, MA, USA},
 	author = {Masaru Tomita},
 	author = {Masaru Tomita},
-	date-added = {2008-12-02 14:16:33 -0700},
-	date-modified = {2008-12-02 14:16:39 -0700},
-	isbn = {0898382025},
 	publisher = {Kluwer Academic Publishers},
 	publisher = {Kluwer Academic Publishers},
 	title = {Efficient Parsing for Natural Language: A Fast Algorithm for Practical Systems},
 	title = {Efficient Parsing for Natural Language: A Fast Algorithm for Practical Systems},
 	year = {1985}}
 	year = {1985}}
 
 
 @article{Earley:1970ly,
 @article{Earley:1970ly,
-	acmid = {362035},
-	address = {New York, NY, USA},
 	author = {Earley, Jay},
 	author = {Earley, Jay},
-	date-added = {2011-05-28 11:31:46 -0600},
-	date-modified = {2011-05-28 11:31:48 -0600},
-	doi = {http://doi.acm.org/10.1145/362007.362035},
-	issn = {0001-0782},
 	issue = {2},
 	issue = {2},
 	journal = {Commun. ACM},
 	journal = {Commun. ACM},
-	keywords = {compilers, computational complexity, context-free grammar, parsing, syntax analysis},
 	month = {February},
 	month = {February},
 	numpages = {9},
 	numpages = {9},
 	pages = {94--102},
 	pages = {94--102},
 	publisher = {ACM},
 	publisher = {ACM},
 	title = {An efficient context-free parsing algorithm},
 	title = {An efficient context-free parsing algorithm},
-	url = {http://doi.acm.org/10.1145/362007.362035},
 	volume = {13},
 	volume = {13},
-	year = {1970},
-	Bdsk-File-1 = {YnBsaXN0MDDRAQJccmVsYXRpdmVQYXRoXnA5NC1lYXJsZXkucGRmCAsYAAAAAAAAAQEAAAAAAAAAAwAAAAAAAAAAAAAAAAAAACc=},
-	Bdsk-Url-1 = {http://doi.acm.org/10.1145/362007.362035}}
+	year = {1970}}
 
 
-@Book{Hopcroft06:_automata,
+@book{Hopcroft06:_automata,
   author = 	 {John Hopcroft and Rajeev Motwani and Jeffrey Ullman},
   author = 	 {John Hopcroft and Rajeev Motwani and Jeffrey Ullman},
   title = 	 {Introduction to Automata Theory, Languages, and Computation},
   title = 	 {Introduction to Automata Theory, Languages, and Computation},
   publisher = 	 {Pearson},
   publisher = 	 {Pearson},
@@ -38,15 +24,12 @@
 
 
 @techreport{Lesk:1975uq,
 @techreport{Lesk:1975uq,
 	author = {M. E. Lesk and E. Schmidt},
 	author = {M. E. Lesk and E. Schmidt},
-	date-added = {2007-08-27 13:37:27 -0600},
-	date-modified = {2009-08-25 22:28:17 -0600},
 	institution = {Bell Laboratories},
 	institution = {Bell Laboratories},
 	month = {July},
 	month = {July},
 	title = {Lex - A Lexical Analyzer Generator},
 	title = {Lex - A Lexical Analyzer Generator},
-	year = {1975},
-	Bdsk-File-1 = {YnBsaXN0MDDRAQJccmVsYXRpdmVQYXRoV2xleC5wZGYICxgAAAAAAAABAQAAAAAAAAADAAAAAAAAAAAAAAAAAAAAIA==}}
+	year = {1975}}
 
 
-@Misc{shinan20:_lark_docs,
+@misc{shinan20:_lark_docs,
   author = 	 {Erez Shinan},
   author = 	 {Erez Shinan},
   title = 	 {Lark Documentation},
   title = 	 {Lark Documentation},
   url = {https://lark-parser.readthedocs.io/en/latest/index.html},
   url = {https://lark-parser.readthedocs.io/en/latest/index.html},

+ 84 - 73
book.tex

@@ -196,6 +196,7 @@ ISBN:
 
 
 %\listoftables
 %\listoftables
 
 
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \chapter*{Preface}
 \chapter*{Preface}
 \addcontentsline{toc}{fmbm}{Preface}
 \addcontentsline{toc}{fmbm}{Preface}
@@ -247,7 +248,7 @@ concepts and algorithms used in compilers.
   the fundamental tools of compiler construction: \emph{abstract
   the fundamental tools of compiler construction: \emph{abstract
     syntax trees} and \emph{recursive functions}. 
     syntax trees} and \emph{recursive functions}. 
 {\if\edition\pythonEd
 {\if\edition\pythonEd
-\item In Chapter~\ref{ch:parsing-Lvar} we learn how to use the Lark
+\item In Chapter~\ref{ch:parsing} we learn how to use the Lark
   parser generator to create a parser for the language of integer
   parser generator to create a parser for the language of integer
   arithmetic and local variables. We learn about the parsing
   arithmetic and local variables. We learn about the parsing
   algorithms inside Lark, including Earley and LALR(1).
   algorithms inside Lark, including Earley and LALR(1).
@@ -307,14 +308,13 @@ programming, data structures and algorithms, and discrete
 mathematics.
 mathematics.
 %
 %
 At the beginning of the course, students form groups of two to four
 At the beginning of the course, students form groups of two to four
-people.  The groups complete one chapter every two weeks, starting
-with chapter~\ref{ch:Lvar} and finishing with
-chapter~\ref{ch:Llambda}. Many chapters include a challenge problem
-that we assign to the graduate students. The last two weeks of the
+people.  The groups complete approximately one chapter every two
+weeks, starting with chapter~\ref{ch:Lvar}. The last two weeks of the
 course involve a final project in which students design and implement
 course involve a final project in which students design and implement
 a compiler extension of their choosing.  The last few chapters can be
 a compiler extension of their choosing.  The last few chapters can be
-used in support of these projects.  For compiler courses at
-universities on the quarter system (about ten weeks in length), we
+used in support of these projects.  Many chapters include a challenge
+problem that we assign to the graduate students. For compiler courses
+at universities on the quarter system (about ten weeks in length), we
 recommend completing the course through chapter~\ref{ch:Lvec} or
 recommend completing the course through chapter~\ref{ch:Lvec} or
 chapter~\ref{ch:Lfun} and providing some scaffolding code to the
 chapter~\ref{ch:Lfun} and providing some scaffolding code to the
 students for each compiler pass.
 students for each compiler pass.
@@ -337,7 +337,6 @@ State University, Portland State University, Rose–Hulman Institute of
 Technology, University of Freiburg, University of Massachusetts
 Technology, University of Freiburg, University of Massachusetts
 Lowell, and the University of Vermont.
 Lowell, and the University of Vermont.
 
 
-
 \begin{figure}[tp]
 \begin{figure}[tp]
 \begin{tcolorbox}[colback=white]
 \begin{tcolorbox}[colback=white]
   {\if\edition\racketEd
   {\if\edition\racketEd
@@ -370,32 +369,35 @@ Lowell, and the University of Vermont.
 \fi}
 \fi}
 {\if\edition\pythonEd
 {\if\edition\pythonEd
 \begin{tikzpicture}[baseline=(current  bounding  box.center)]
 \begin{tikzpicture}[baseline=(current  bounding  box.center)]
-  \node (C1) at (0,1.5) {\small Ch.~\ref{ch:trees-recur} Preliminaries};
-  \node (C2) at (4,1.5) {\small Ch.~\ref{ch:Lvar} Variables};
-  \node (C3) at (8,1.5) {\small Ch.~\ref{ch:register-allocation-Lvar} Registers};
-  \node (C4) at (0,0) {\small Ch.~\ref{ch:Lif} Conditionals};
-  \node (C5) at (4,0) {\small Ch.~\ref{ch:Lvec} Tuples};
-  \node (C6) at (8,0) {\small Ch.~\ref{ch:Lfun} Functions};
-  \node (C9) at (0,-1.5) {\small Ch.~\ref{ch:Lwhile} Loops};
-  \node (C8) at (4,-1.5) {\small Ch.~\ref{ch:Ldyn} Dynamic};
+  \node (Prelim) at (0,1.5) {\small Ch.~\ref{ch:trees-recur} Preliminaries};
+  \node (Var) at (4,1.5) {\small Ch.~\ref{ch:Lvar} Variables};
+  \node (Parse) at (8,1.5) {\small Ch.~\ref{ch:parsing} Parsing};
+  \node (Reg) at (0,0) {\small Ch.~\ref{ch:register-allocation-Lvar} Registers};
+  \node (Cond) at (4,0) {\small Ch.~\ref{ch:Lif} Conditionals};
+  \node (Loop) at (8,0) {\small Ch.~\ref{ch:Lwhile} Loops};
+  \node (Fun) at (0,-1.5) {\small Ch.~\ref{ch:Lfun} Functions};
+  \node (Tuple) at (4,-1.5) {\small Ch.~\ref{ch:Lvec} Tuples};
+  \node (Dyn) at (8,-1.5) {\small Ch.~\ref{ch:Ldyn} Dynamic};
 %  \node (CO) at (0,-3) {\small Ch.~\ref{ch:Lobject} Objects};
 %  \node (CO) at (0,-3) {\small Ch.~\ref{ch:Lobject} Objects};
-  \node (C7) at (8,-1.5) {\small Ch.~\ref{ch:Llambda} Lambda};
-  \node (C10) at (4,-3) {\small Ch.~\ref{ch:Lgrad} Gradual Typing};
-  \node (C11) at (8,-3) {\small Ch.~\ref{ch:Lpoly} Generics};
-
-  \path[->] (C1) edge [above] node {} (C2);
-  \path[->] (C2) edge [above] node {} (C3);
-  \path[->] (C3) edge [above] node {} (C4);
-  \path[->] (C4) edge [above] node {} (C5);
-  \path[->,style=dotted] (C5) edge [above] node {} (C6);
-  \path[->] (C5) edge [above] node {} (C7);
-  \path[->] (C6) edge [above] node {} (C7);
-  \path[->] (C4) edge [above] node {} (C8);
-  \path[->] (C4) edge [above] node {} (C9);
-  \path[->] (C7) edge [above] node {} (C10);
-  \path[->] (C8) edge [above] node {} (C10);
-%  \path[->] (C8) edge [above] node {} (CO);
-  \path[->] (C10) edge [above] node {} (C11);
+  \node (Lam) at (0,-3) {\small Ch.~\ref{ch:Llambda} Lambda};
+  \node (Gradual) at (4,-3) {\small Ch.~\ref{ch:Lgrad} Gradual Typing};
+  \node (Generic) at (8,-3) {\small Ch.~\ref{ch:Lpoly} Generics};
+
+  \path[->] (Prelim) edge [above] node {} (Var);
+  \path[->] (Var) edge [above] node {} (Reg);
+  \path[->] (Var) edge [above] node {} (Parse);
+  \path[->] (Reg) edge [above] node {} (Cond);
+  \path[->] (Cond) edge [above] node {} (Tuple);
+  \path[->,style=dotted] (Tuple) edge [above] node {} (Fun);
+  \path[->] (Cond) edge [above] node {} (Fun);
+  \path[->] (Tuple) edge [above] node {} (Lam);
+  \path[->] (Fun) edge [above] node {} (Lam);
+  \path[->] (Cond) edge [above] node {} (Dyn);
+  \path[->] (Cond) edge [above] node {} (Loop);
+  \path[->] (Lam) edge [above] node {} (Gradual);
+  \path[->] (Dyn) edge [above] node {} (Gradual);
+%  \path[->] (Dyn) edge [above] node {} (CO);
+  \path[->] (Gradual) edge [above] node {} (Generic);
 \end{tikzpicture}
 \end{tikzpicture}
 \fi}
 \fi}
 \end{tcolorbox}
 \end{tcolorbox}
@@ -506,9 +508,11 @@ perform.\index{subject}{concrete syntax}\index{subject}{abstract
   syntax}\index{subject}{abstract syntax
   syntax}\index{subject}{abstract syntax
   tree}\index{subject}{AST}\index{subject}{program}\index{subject}{parse}
   tree}\index{subject}{AST}\index{subject}{program}\index{subject}{parse}
 The process of translating from concrete syntax to abstract syntax is
 The process of translating from concrete syntax to abstract syntax is
-called \emph{parsing}~\citep{Aho:2006wb}\python{ and is studied in
-  chapter~\ref{ch:parsing-Lvar}}.
-\racket{This book does not cover the theory and implementation of parsing.}%
+called \emph{parsing}\python{ and is studied in
+  chapter~\ref{ch:parsing}}.
+\racket{This book does not cover the theory and implementation of parsing.
+  We refer the readers interested in parsing to the thorough treatment
+  of parsing by \citet{Aho:2006wb}.}%
 %
 %
 \racket{A parser is provided in the support code for translating from
 \racket{A parser is provided in the support code for translating from
   concrete to abstract syntax.}%
   concrete to abstract syntax.}%
@@ -4090,23 +4094,23 @@ all, fast code is useless if it produces incorrect results!
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 {\if\edition\pythonEd
 {\if\edition\pythonEd
 \chapter{Parsing}
 \chapter{Parsing}
-\label{ch:parsing-Lvar}
+\label{ch:parsing}
 \setcounter{footnote}{0}
 \setcounter{footnote}{0}
 \index{subject}{parsing}
 \index{subject}{parsing}
 
 
 In this chapter we learn how to use the Lark parser
 In this chapter we learn how to use the Lark parser
-generator~\citep{shinan20:_lark_docs} to translate the concrete syntax
+framework~\citep{shinan20:_lark_docs} to translate the concrete syntax
 of \LangInt{} (a sequence of characters) into an abstract syntax tree.
 of \LangInt{} (a sequence of characters) into an abstract syntax tree.
 You will then be asked to use Lark to create a parser for \LangVar{}.
 You will then be asked to use Lark to create a parser for \LangVar{}.
-We then learn about the parsing algorithms used inside Lark, studying
-the \citet{Earley:1970ly} and LALR algorithms.
+We also describe the parsing algorithms used inside Lark, studying the
+\citet{Earley:1970ly} and LALR(1) algorithms.
 
 
-A parser generator takes in a specification of the concrete syntax and
-produces a parser. Even though a parser generator does most of the
-work for us, using one properly requires some knowledge.  In
-particular, we must learn about the specification languages used by
-parser generators and we must learn how to deal with ambiguity in our
-language specifications.
+A parser framework such as Lark takes in a specification of the
+concrete syntax and the input program and produces a parse tree. Even
+though a parser framework does most of the work for us, using one
+properly requires some knowledge.  In particular, we must learn about
+its specification languages and we must learn how to deal with
+ambiguity in our language specifications.
 
 
 The process of parsing is traditionally subdivided into two phases:
 The process of parsing is traditionally subdivided into two phases:
 \emph{lexical analysis} (also called scanning) and \emph{syntax
 \emph{lexical analysis} (also called scanning) and \emph{syntax
@@ -4119,16 +4123,16 @@ language. The reason for the subdivision into two phases is to enable
 the use of a faster but less powerful algorithm for lexical analysis
 the use of a faster but less powerful algorithm for lexical analysis
 and the use of a slower but more powerful algorithm for parsing.
 and the use of a slower but more powerful algorithm for parsing.
 %
 %
-Likewise, parser generators typical come in pairs, with separate
-generators for the lexical analyzer (or lexer for short) and for the
-parser.  A paricularly influential pair of generators were
-\texttt{lex} and \texttt{yacc}. The \texttt{lex} generator was written
-by \citet{Lesk:1975uq} at Bell Labs. The \texttt{yacc} generator was
-written by \citet{Johnson:1979qy} at AT\&T and stands for Yet Another
-Compiler Compiler.
-
-The Lark parse generator that we use in this chapter includes both a
-lexical analyzer and a parser. The next section discusses lexical
+%% Likewise, parser generators typical come in pairs, with separate
+%% generators for the lexical analyzer (or lexer for short) and for the
+%% parser.  A paricularly influential pair of generators were
+%% \texttt{lex} and \texttt{yacc}. The \texttt{lex} generator was written
+%% by \citet{Lesk:1975uq} at Bell Labs. The \texttt{yacc} generator was
+%% written by \citet{Johnson:1979qy} at AT\&T and stands for Yet Another
+%% Compiler Compiler.
+%
+The Lark parser framework that we use in this chapter includes both
+lexical analyzers and parsers. The next section discusses lexical
 analysis and the remainder of the chapter discusses parsing.
 analysis and the remainder of the chapter discusses parsing.
 
 
 
 
@@ -4522,10 +4526,13 @@ section~\ref{sec:lalr} we learn about the LALR algorithm, which is
 more efficient but can only handle a subset of the context-free
 more efficient but can only handle a subset of the context-free
 grammars.
 grammars.
 
 
-The Earley algorithm uses a data structure called a
-\emph{chart}\index{subject}{chart} to keep track of its progress.  The
-chart is an array with one slot for each position in the input string,
-where position $0$ is before the first character and position $n$ is
+The Earley algorithm can be viewed as an interpreter; it treats the
+grammar as the program being interpreted and it treats the concrete
+syntax of the program-to-be-parsed as its input.  The Earley algorithm
+uses a data structure called a \emph{chart}\index{subject}{chart} to
+keep track of its progress and to memoize its results. The chart is an
+array with one slot for each position in the input string, where
+position $0$ is before the first character and position $n$ is
 immediately after the last character. So the array has length $n+1$
 immediately after the last character. So the array has length $n+1$
 for an input string of length $n$. Each slot in the chart contains a
 for an input string of length $n$. Each slot in the chart contains a
 set of \emph{dotted rules}. A dotted rule is simply a grammar rule
 set of \emph{dotted rules}. A dotted rule is simply a grammar rule
@@ -4553,8 +4560,8 @@ grammar in figure~\ref{fig:Lint-lark-grammar}, we place
 \begin{lstlisting}
 \begin{lstlisting}
   lang_int: . stmt_list         (0)
   lang_int: . stmt_list         (0)
 \end{lstlisting}
 \end{lstlisting}
-in slot $0$ of the chart. The algorithm then proceeds to its
-\emph{prediction} phase in which it adds more dotted rules to the
+in slot $0$ of the chart. The algorithm then proceeds with
+\emph{prediction} actions in which it adds more dotted rules to the
 chart based on which nonterminals come after a period. In the above,
 chart based on which nonterminals come after a period. In the above,
 the nonterminal \code{stmt\_list} appears after a period, so we add all
 the nonterminal \code{stmt\_list} appears after a period, so we add all
 the rules for \code{stmt\_list} to slot $0$, with a period at the
 the rules for \code{stmt\_list} to slot $0$, with a period at the
@@ -4767,13 +4774,15 @@ use with even the largest of input files.
 \section{The LALR(1) Algorithm}
 \section{The LALR(1) Algorithm}
 \label{sec:lalr}
 \label{sec:lalr}
 
 
-The LALR(1) algorithm consists of a finite automata and a stack to
-record its progress in parsing the input string.  Each element of the
-stack is a pair: a state number and a grammar symbol (a terminal or
-nonterminal). The symbol characterizes the input that has been parsed
-so-far and the state number is used to remember how to proceed once
-the next symbol-worth of input has been parsed.  Each state in the
-finite automata represents where the parser stands in the parsing
+The LALR(1) algorithm can be viewed as a two-phase approach in which
+it first compiles the grammar into a state machine and then runs the
+state machine to parse the input string.  The state machine also uses
+a stack to record its progress in parsing the input string.  Each
+element of the stack is a pair: a state number and a grammar symbol (a
+terminal or nonterminal). The symbol characterizes the input that has
+been parsed so far and the state number is used to remember how to
+proceed once the next symbol-worth of input has been parsed.  Each
+state in the machine represents where the parser stands in the parsing
 process with respect to certain grammar rules. In particular, each
 process with respect to certain grammar rules. In particular, each
 state is associated with a set of dotted rules.
 state is associated with a set of dotted rules.
 
 
@@ -4797,7 +4806,7 @@ rule 1 with a period after the \code{PRINT} token and before the
 \emph{item}. There are several rules that could apply next, both rule
 \emph{item}. There are several rules that could apply next, both rule
 2 and 3, so state 1 also shows those rules with a period at the
 2 and 3, so state 1 also shows those rules with a period at the
 beginning of their right-hand sides. The edges between states indicate
 beginning of their right-hand sides. The edges between states indicate
-which transitions the automata should make depending on the next input
+which transitions the machine should make depending on the next input
 token. So, for example, if the next input token is \code{INT} then the
 token. So, for example, if the next input token is \code{INT} then the
 parser will push \code{INT} and the target state 4 on the stack and
 parser will push \code{INT} and the target state 4 on the stack and
 transition to state 4.  Suppose we are now at the end of the input. In
 transition to state 4.  Suppose we are now at the end of the input. In
@@ -10155,7 +10164,7 @@ arguments may not be used at all. For example, consider the case for
 the constant \TRUE{} in \code{explicate\_pred}, in which we discard the
 the constant \TRUE{} in \code{explicate\_pred}, in which we discard the
 \code{els} continuation.
 \code{els} continuation.
 %
 %
- {\if\edition\racketEd
+{\if\edition\racketEd
 The following example program falls into this
 The following example program falls into this
 case, and it creates two unused blocks.       
 case, and it creates two unused blocks.       
 \begin{center}
 \begin{center}
@@ -10277,11 +10286,12 @@ return a \code{Goto} to the new label.
       [else
       [else
         (let ([label (gensym 'block)])
         (let ([label (gensym 'block)])
           (set! basic-blocks (cons (cons label t) basic-blocks))
           (set! basic-blocks (cons (cons label t) basic-blocks))
-          (Goto label))]))
+          (Goto label))])))
 \end{lstlisting}
 \end{lstlisting}
 \end{minipage}
 \end{minipage}
 \end{center}
 \end{center}
 \fi}
 \fi}
+
 {\if\edition\pythonEd
 {\if\edition\pythonEd
 %
 %
 Here is the new version of the \code{create\_block} auxiliary function
 Here is the new version of the \code{create\_block} auxiliary function
@@ -20663,6 +20673,7 @@ class TypeCheckLgrad(TypeCheckLlambda):
 
 
 \fi}
 \fi}
 
 
+
 \clearpage
 \clearpage
 
 
 \section{Interpreting \LangCast{}}
 \section{Interpreting \LangCast{}}
@@ -20780,7 +20791,7 @@ For the first \code{vector-set!}, the proxy casts a tagged \code{1}
 from \CANYTY{} to \INTTY{}.
 from \CANYTY{} to \INTTY{}.
 }
 }
 \python{
 \python{
-  For the subscript \code{v[i]} in \code{f([v[i])} of \code{map\_inplace},
+  For the subscript \code{v[i]} in \code{f(v[i])} of \code{map\_inplace},
   the proxy casts the integer from \INTTY{} to \CANYTY{}.
   the proxy casts the integer from \INTTY{} to \CANYTY{}.
   For the subscript on the left of the assignment,
   For the subscript on the left of the assignment,
   the proxy casts the tagged value from \CANYTY{} to \INTTY{}.
   the proxy casts the tagged value from \CANYTY{} to \INTTY{}.