Jeremy Siek 3 years ago
commit 915e6828b2
2 changed files with 121 additions and 18 deletions
  1. book.bib: 28 additions, 0 deletions
  2. book.tex: 93 additions, 18 deletions

+ 28 - 0
book.bib

@@ -1,3 +1,31 @@
+@article{Earley:1970ly,
+	acmid = {362035},
+	address = {New York, NY, USA},
+	author = {Earley, Jay},
+	date-added = {2011-05-28 11:31:46 -0600},
+	date-modified = {2011-05-28 11:31:48 -0600},
+	doi = {http://doi.acm.org/10.1145/362007.362035},
+	issn = {0001-0782},
+	issue = {2},
+	journal = {Commun. ACM},
+	keywords = {compilers, computational complexity, context-free grammar, parsing, syntax analysis},
+	month = {February},
+	numpages = {9},
+	pages = {94--102},
+	publisher = {ACM},
+	title = {An efficient context-free parsing algorithm},
+	url = {http://doi.acm.org/10.1145/362007.362035},
+	volume = {13},
+	year = {1970},
+	Bdsk-Url-1 = {http://doi.acm.org/10.1145/362007.362035}}
+
+@Book{Hopcroft06:_automata,
+  author = 	 {John Hopcroft and Rajeev Motwani and Jeffrey Ullman},
+  title = 	 {Introduction to Automata Theory, Languages, and Computation},
+  publisher = 	 {Pearson},
+  year = 	 2006}
+
 @techreport{Lesk:1975uq,
 	author = {M. E. Lesk and E. Schmidt},
 	date-added = {2007-08-27 13:37:27 -0600},

+ 93 - 18
book.tex

@@ -782,13 +782,14 @@ A programming language can be thought of as a \emph{set} of programs.
 The set is infinite (that is, one can always create larger programs),
 so one cannot simply describe a language by listing all the
 programs in the language. Instead we write down a set of rules, a
-\emph{grammar}, for building programs. Grammars are often used to
+\emph{context-free grammar}, for building programs. Grammars are often used to
 define the concrete syntax of a language, but they can also be used to
 describe the abstract syntax. We write our rules in a variant of
 Backus-Naur form (BNF)~\citep{Backus:1960aa,Knuth:1964aa}.
 \index{subject}{Backus-Naur form}\index{subject}{BNF} As an example,
 we describe a small language, named \LangInt{}, that consists of
-integers and arithmetic operations.  \index{subject}{grammar}
+integers and arithmetic operations.\index{subject}{grammar}
+\index{subject}{context-free grammar}

 The first grammar rule for the abstract syntax of \LangInt{} says that an
 instance of the \racket{\code{Int} structure}\python{\code{Constant} class} is an expression:
@@ -4086,7 +4087,7 @@ all, fast code is useless if it produces incorrect results!

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 {\if\edition\pythonEd
-\chapter{Parser Generation}
+\chapter{Parsing}
 \label{ch:parsing-Lvar}
 \setcounter{footnote}{0}
 \index{subject}{parsing}
@@ -4095,6 +4096,9 @@ In this chapter we learn how to use the Lark parser
 generator~\citep{shinan20:_lark_docs} to translate the concrete syntax
 of \LangInt{} (a sequence of characters) into an abstract syntax tree.
 You will then be asked to use Lark to create a parser for \LangVar{}.
+We then learn about the parsing algorithms used inside Lark, studying
+the \citet{Earley:1970ly} and LALR algorithms.
+
 A parser generator takes in a specification of the concrete syntax and
 produces a parser. Even though a parser generator does most of the
 work for us, using one properly requires some knowledge.  In
@@ -4274,10 +4278,10 @@ exp: INT
   | exp "-" exp
   | "(" exp ")"

-stmt: "print" "(" exp ")"
-    | exp
+stmt_list:
+    | stmt NEWLINE stmt_list
 
 
-lang_int: (stmt NEWLINE)*
+lang_int: stmt_list
 \end{lstlisting}
 \end{minipage}
 \end{center}
@@ -4350,7 +4354,10 @@ exp: INT                    -> int
 stmt: "print" "(" exp ")"   -> print
     | exp                   -> expr

-lang_int: (stmt NEWLINE)*   -> module
+stmt_list:                   -> empty_stmt
+    | stmt NEWLINE stmt_list -> add_stmt
+
+lang_int: stmt_list          -> module
 \end{lstlisting}
 \end{minipage}
 \end{center}
@@ -4439,7 +4446,10 @@ exp_hi: INT                 -> int
 stmt: "print" "(" exp ")"  -> print
     | exp                  -> expr

-lang_int: (stmt NEWLINE)*  -> module
+stmt_list:                   -> empty_stmt
+    | stmt NEWLINE stmt_list -> add_stmt
+
+lang_int: stmt_list          -> module
 \end{lstlisting}
 \end{tcolorbox}
 \caption{An unambiguous Lark grammar for \LangInt{}.}
@@ -4476,14 +4486,14 @@ def parse_tree_to_ast(e):

 \begin{exercise}
   \normalfont\normalsize
-
-  Use Lark to create a lexer and parser for \LangVar{}.  We recommend
-  using Lark's default parsing algorithm (Earley) with the
-  \code{ambiguity} option set to \code{'explicit'} so that if your
-  grammar is ambiguous, the output will include multiple parse trees
-  which will indicate to you that there is a problem with your
-  grammar. Your parser should ignore white space so we
-  recommend using Lark's \code{\%ignore} directive as follows.
+%
+  Use Lark to create a lexer and parser for \LangVar{}.  Use Lark's
+  default parsing algorithm (Earley) with the \code{ambiguity} option
+  set to \code{'explicit'} so that if your grammar is ambiguous, the
+  output will include multiple parse trees, which will indicate to you
+  that there is a problem with your grammar. Your parser should ignore
+  white space, so we recommend using Lark's \code{\%ignore} directive
+  as follows.
 \begin{lstlisting}
 WS: /[ \t\f\r\n]/+
 %ignore WS
@@ -4493,20 +4503,85 @@ Lark-generated parser instead of using the \code{parse} function from
 the \code{ast} module. Test your compiler on all of the \LangVar{}
 programs that you have created and create four additional programs
 that would reveal ambiguities in your grammar.
-
 \end{exercise}
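+
+To give a concrete picture of what the preceding exercise involves,
+the following is a minimal sketch of constructing and invoking a Lark
+parser with the Earley algorithm and explicit ambiguity. The grammar
+file name \code{var\_grammar.lark}, the start symbol \code{lang\_var},
+and the example program are hypothetical; substitute the names used
+in your own grammar.
+\begin{lstlisting}
+from lark import Lark
+
+# Hypothetical grammar file for LVar; use your own file name.
+grammar = open('var_grammar.lark').read()
+
+# Lark's default parsing algorithm is Earley. Setting
+# ambiguity='explicit' makes Lark return every parse tree (wrapped
+# in _ambig nodes) instead of silently choosing one, which exposes
+# ambiguities in the grammar.
+parser = Lark(grammar, start='lang_var', ambiguity='explicit')
+
+# Parse the text of a program and inspect the resulting parse tree.
+source = 'x = 40\nprint(x + 2)\n'
+print(parser.parse(source).pretty())
+\end{lstlisting}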
 
 
 
 
 \section{The Earley Algorithm}
+\label{sec:earley}
+
+In this section we discuss the parsing algorithm of
+\citet{Earley:1970ly}, which is the default algorithm used by Lark.
+The algorithm is powerful in that it can handle any context-free
+grammar, which makes it easy to use. However, it is not the most
+efficient parsing algorithm: it is $O(n^3)$ for ambiguous grammars and
+$O(n^2)$ for unambiguous grammars~\citep{Hopcroft06:_automata}.  In
+section~\ref{sec:lalr} we learn about the LALR algorithm, which is
+more efficient but can only handle a subset of the context-free
+grammars.
+
+The Earley algorithm uses a data structure called a
+\emph{chart}\index{subject}{chart} to keep track of its progress.  The
+chart is an array with one slot for each position in the input string,
+where position $0$ is before the first character and position $n$ is
+immediately after the last character. So the array has length $n+1$
+for an input string of length $n$. Each slot in the chart contains a
+set of \emph{dotted rules}. A dotted rule is simply a grammar rule
+with a period indicating how much of its right-hand side has already
+been parsed. For example, the dotted rule
+\begin{lstlisting}
+exp: exp "+" . exp_hi
+\end{lstlisting}
+represents a partial parse that has matched an expression followed by
+\code{+}, but has not yet parsed an expression to the right of
+\code{+}.
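+
+To make the chart and dotted rules concrete, here is one way (a
+simplified sketch, not Lark's actual implementation) to represent
+them in Python: a dotted rule records a grammar rule together with
+the number of right-hand-side symbols to the left of the period, and
+the chart is a list of sets, one per input position.
+\begin{lstlisting}
+from dataclasses import dataclass
+
+@dataclass(frozen=True)
+class DottedRule:
+    lhs: str    # left-hand side, e.g. 'exp'
+    rhs: tuple  # right-hand side symbols, e.g. ('exp', '"+"', 'exp_hi')
+    dot: int    # number of right-hand side symbols already parsed
+
+    def next_symbol(self):
+        # The symbol just after the period, or None if the period
+        # is at the end of the right-hand side.
+        return self.rhs[self.dot] if self.dot < len(self.rhs) else None
+
+def empty_chart(n):
+    # One set of dotted rules per position 0, 1, ..., n.
+    return [set() for _ in range(n + 1)]
+\end{lstlisting}
+For example, the dotted rule displayed above corresponds to
+\code{DottedRule('exp', ('exp', '"+"', 'exp\_hi'), 2)}: two of the
+three symbols on the right-hand side have been parsed.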
+
+The algorithm begins by creating dotted rules for all the grammar
+rules whose left-hand side is the start symbol and placing then in
+slot $0$ of the chart.  For example, given the grammar in
+figure~\ref{fig:Lint-lark-grammar}, we would place
+\begin{lstlisting}
+  lang_int: . stmt_list
+\end{lstlisting}
+in slot $0$ of the chart. The algorithm then proceeds to its
+\emph{prediction} phase in which it adds more dotted rules to the
+chart based on which nonterminals come after a period. In the rule above,
+the nonterminal \code{stmt\_list} appears after a period, so we add all
+the rules for \code{stmt\_list} to slot $0$, with a period at the
+beginning of their right-hand sides, as follows:
+\begin{lstlisting}
+stmt_list:  . 
+stmt_list:  .  stmt  NEWLINE  stmt_list
+\end{lstlisting}
+The prediction phase continues to add dotted rules as more
+opportunities arise. For example, the \code{stmt} nonterminal now
+appears after a period, so we add all the rules for \code{stmt}.
+\begin{lstlisting}
+stmt:  .  "print" "("  exp ")"
+stmt:  .  exp
+\end{lstlisting}
+To finish the prediction phase, we add the grammar rules for
+\code{exp} and \code{exp\_hi}.
+\begin{lstlisting}[escapechar=$]
+exp: . exp "+" exp_hi
+exp: . exp "-" exp_hi
+exp: . exp_hi
+exp_hi: . INT
+exp_hi: . "input_int" "(" ")"
+exp_hi: . "-" exp_hi
+exp_hi: . "(" exp ")"
+\end{lstlisting}
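+
+The prediction phase amounts to a straightforward fixed-point
+computation over a chart slot. The following sketch (again, not
+Lark's implementation) uses the \code{DottedRule} representation
+sketched earlier and assumes the grammar is given as a dictionary
+that maps each nonterminal to a list of its right-hand sides.
+\begin{lstlisting}
+def predict(slot, grammar):
+    # slot: a set of DottedRule; grammar: dict mapping a nonterminal
+    # to a list of right-hand sides (tuples of symbols).
+    changed = True
+    while changed:
+        changed = False
+        for rule in list(slot):
+            sym = rule.next_symbol()
+            if sym in grammar:  # the period is in front of a nonterminal
+                for rhs in grammar[sym]:
+                    new_rule = DottedRule(sym, rhs, 0)
+                    if new_rule not in slot:
+                        slot.add(new_rule)
+                        changed = True
+\end{lstlisting}
+Starting from the dotted rule for \code{lang\_int} in slot $0$, this
+loop would add the dotted rules for \code{stmt\_list}, \code{stmt},
+\code{exp}, and \code{exp\_hi} shown above.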
 
 
 
 
 \section{The LALR Algorithm}
-
+\label{sec:lalr}
 
 
 \section{Further Reading}
 
 
 UNDER CONSTRUCTION
 
 
+finite automata
+
+