From 2346342261e3e3a584266a680d1f6e4151ffb69f Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Tue, 20 Aug 2024 11:36:01 +0100 Subject: [PATCH 01/46] Initial sphinx setup --- site/Makefile | 20 ++++++++++++++++ site/make.bat | 35 +++++++++++++++++++++++++++ site/source/conf.py | 55 +++++++++++++++++++++++++++++++++++++++++++ site/source/index.rst | 20 ++++++++++++++++ 4 files changed, 130 insertions(+) create mode 100644 site/Makefile create mode 100644 site/make.bat create mode 100644 site/source/conf.py create mode 100644 site/source/index.rst diff --git a/site/Makefile b/site/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/site/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/site/make.bat b/site/make.bat new file mode 100644 index 0000000..6fcf05b --- /dev/null +++ b/site/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. 
Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/site/source/conf.py b/site/source/conf.py new file mode 100644 index 0000000..c2b7ba4 --- /dev/null +++ b/site/source/conf.py @@ -0,0 +1,55 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'CompilerProgramming' +copyright = '2024, Dibyendu Majumdar' +author = 'Dibyendu Majumdar' + +# The full version, including alpha/beta/rc tags +release = '0.1' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. 
+exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] \ No newline at end of file diff --git a/site/source/index.rst b/site/source/index.rst new file mode 100644 index 0000000..c05e5cd --- /dev/null +++ b/site/source/index.rst @@ -0,0 +1,20 @@ +.. CompilerProgramming documentation master file, created by + sphinx-quickstart on Tue Aug 20 11:35:03 2024. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to CompilerProgramming's documentation! +=============================================== + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` From cb8957ae2dbbefe78aed3ef86a7431b274da95d8 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Tue, 20 Aug 2024 16:12:10 +0100 Subject: [PATCH 02/46] Start on the over all structure of the project --- .gitignore | 1 + site/source/compiler-books.rst | 4 ++ site/source/conf.py | 6 +- site/source/index.rst | 100 ++++++++++++++++++++++++++++----- 4 files changed, 96 insertions(+), 15 deletions(-) create mode 100644 .gitignore create mode 100644 site/source/compiler-books.rst diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9e37d03 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +site/build \ No newline at end of file diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst new file mode 100644 index 0000000..e8b45a9 --- /dev/null +++ b/site/source/compiler-books.rst @@ -0,0 +1,4 @@ +Compiler Books +============== + +I own a bunch of compiler books that I have purchased over the years. diff --git a/site/source/conf.py b/site/source/conf.py index c2b7ba4..2a3e49e 100644 --- a/site/source/conf.py +++ b/site/source/conf.py @@ -47,9 +47,11 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = 'agogo' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] \ No newline at end of file +html_static_path = ['_static'] + +html_title = 'Compiler Programming' \ No newline at end of file diff --git a/site/source/index.rst b/site/source/index.rst index c05e5cd..df1e680 100644 --- a/site/source/index.rst +++ b/site/source/index.rst @@ -1,20 +1,94 @@ -.. 
CompilerProgramming documentation master file, created by - sphinx-quickstart on Tue Aug 20 11:35:03 2024. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. +================================ +Welcome to Compiler Programming! +================================ -Welcome to CompilerProgramming's documentation! -=============================================== +This site aims to bring together practical knowledge regarding the design and implementation of optimizing compilers +and interpreters for Programming Languages. + +This site is not about programming language design! Instead, our focus is on compiler implementation techniques. + +There are a number of books on Compilers and Interpreters however only a very few of them are accompanied by +source code that implements the topics covered by the book. See below for a list of useful +learning projects that do include source code. + +In recent years, thanks to LLVM, new programming language design has become a fertile space. New Language implementations +tend to focus on the language front-end, leveraging LLVM as the backend for code optimization and code generation. +While this is beneficial if you only care about the language design aspects, it is unhelpful for the industry +as a whole, because the backend of an optimizing compiler is a very interesting component, with a rich history of +algorithms and data structures, and is a subject worthy of study on its own. + +We will cover both frontend and backend techniques. We will implement a small scale language as a way +to learn various techniques, see what the common challenges are and how to address them. +Language design not being our goal, we will keep the language as simple as possible so that it allows us to +focus on important implementation issues. + +Initially we will start with a procedural language. 
Later we will add features such as closures from functional languages +and classes and objects from OOP languages. We will also look at advanced front end techniques such as type inference and +generics. + +The language will be statically typed to start with because this allows us to investigate the traditional compiler +optimization pipeline. Dynamically typed languages have their own interesting engineering problems. +We will eventually look at gradual typing and dynamic typing. + +Preliminaries +============= + +* Implementation language + +Basic Front-End techniques +========================== + +* Lexical analysis +* Parsing +* Abstract Syntax Trees +* Type Systems +* Semantic Analysis + + +Basic Back-end techniques +========================= + +* Stack based vs register based Intermediate Representation +* Control flow graphs and Basic Blocks +* Bytecode VM with simple garbage collection + +Basic Optimization techniques +============================= + +* Data Flow Analysis, Type Lattices, Abstract Interpretation +* Peephole Optimizations +* Static Single Assignment +* Sea of Nodes Representation +* Code generation and Register Allocation + +Language Tools +============== + +* Debuggers +* Language IDEs + +Advanced Front-end techniques +============================= + +* Type inference +* Classes and objects +* Closures +* Exception handling +* Gradual typing +* Generics + + +Some Useful Projects +==================== + + +Book Reviews +============ .. 
toctree:: :maxdepth: 2 - :caption: Contents: - + :caption: Reviews + compiler-books -Indices and tables -================== -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` From 7d201ac1270bf520ef2d0736446e86b0b0f9626c Mon Sep 17 00:00:00 2001 From: dibyendumajumdar Date: Tue, 20 Aug 2024 21:03:37 +0100 Subject: [PATCH 03/46] Added section on implementation language choice --- site/source/index.rst | 14 +++++++++----- site/source/prelim-impl-lang.rst | 27 +++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 5 deletions(-) create mode 100644 site/source/prelim-impl-lang.rst diff --git a/site/source/index.rst b/site/source/index.rst index df1e680..c84c59f 100644 --- a/site/source/index.rst +++ b/site/source/index.rst @@ -12,12 +12,12 @@ source code that implements the topics covered by the book. See below for a list learning projects that do include source code. In recent years, thanks to LLVM, new programming language design has become a fertile space. New Language implementations -tend to focus on the language front-end, leveraging LLVM as the backend for code optimization and code generation. +tend to focus on the language front-end, leveraging LLVM as the back-end for code optimization and code generation. While this is beneficial if you only care about the language design aspects, it is unhelpful for the industry -as a whole, because the backend of an optimizing compiler is a very interesting component, with a rich history of +as a whole, because the back-end of an optimizing compiler is a very interesting component, with a rich history of algorithms and data structures, and is a subject worthy of study on its own. -We will cover both frontend and backend techniques. We will implement a small scale language as a way +We will cover both front-end and back-end techniques. We will implement a small scale language as a way to learn various techniques, see what the common challenges are and how to address them. 
Language design not being our goal, we will keep the language as simple as possible so that it allows us to focus on important implementation issues. @@ -33,7 +33,11 @@ We will eventually look at gradual typing and dynamic typing. Preliminaries ============= -* Implementation language +.. toctree:: + :maxdepth: 2 + :caption: Preliminaries + + prelim-impl-lang Basic Front-End techniques ========================== @@ -44,7 +48,6 @@ Basic Front-End techniques * Type Systems * Semantic Analysis - Basic Back-end techniques ========================= @@ -55,6 +58,7 @@ Basic Back-end techniques Basic Optimization techniques ============================= +* Dominators and Control Flow Graph * Data Flow Analysis, Type Lattices, Abstract Interpretation * Peephole Optimizations * Static Single Assignment diff --git a/site/source/prelim-impl-lang.rst b/site/source/prelim-impl-lang.rst new file mode 100644 index 0000000..5165133 --- /dev/null +++ b/site/source/prelim-impl-lang.rst @@ -0,0 +1,27 @@ +Implementation Language +======================= + +A compiler can be implemented in any language we choose. For a pedagogical project it is more convenient +to choose a language that is widely used, has garbage collection, and comes with excellent tools such +as IDEs and Debuggers. + +Production quality compilers are often written in C, C++ or Rust. For us these languages are too difficult +to work with. + +Lisp and Python appear to be popular languages in teaching projects. Lisp is not as widely used +as we would like our implementation language to be, and dynamically typed languages such as Python are +harder to work with as the project grows. + +Compared to C, C++ and Rust, the programming language D appears to be much more suitable for this project, +from a technical standpoint, that is. It is a garbage collection language that has less friction and is pleasant to +work with. 
The main negatives are that it is not a popular language, and the tooling is not up to +the standards of other languages. + +Go would be a good candidate except that its an opinionated language that forces a certain programming model, +whereas we would like a language that offers least resistance. + +Java and C# seem like good candidates. Java has some limitations that make it harder to write memory optimized +code that is often necessary in a production compiler, but we don't care so much about that. Between the two, +Java is both more popular and has a wider set of tools available. + +I see no other sane choice but Java. My first choice would have been D if it was purely a technical question. From 5cd59a09d2af74f4b7c6e438e7c3d11f553c53f8 Mon Sep 17 00:00:00 2001 From: dibyendumajumdar Date: Sun, 1 Sep 2024 10:01:47 +0100 Subject: [PATCH 04/46] Added Swift and Kotlin to the list of candidate implementation languages --- site/source/index.rst | 2 -- site/source/prelim-impl-lang.rst | 12 ++++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/site/source/index.rst b/site/source/index.rst index c84c59f..1c9f9ee 100644 --- a/site/source/index.rst +++ b/site/source/index.rst @@ -5,8 +5,6 @@ Welcome to Compiler Programming! This site aims to bring together practical knowledge regarding the design and implementation of optimizing compilers and interpreters for Programming Languages. -This site is not about programming language design! Instead, our focus is on compiler implementation techniques. - There are a number of books on Compilers and Interpreters however only a very few of them are accompanied by source code that implements the topics covered by the book. See below for a list of useful learning projects that do include source code. 
diff --git a/site/source/prelim-impl-lang.rst b/site/source/prelim-impl-lang.rst index 5165133..ee6c764 100644 --- a/site/source/prelim-impl-lang.rst +++ b/site/source/prelim-impl-lang.rst @@ -20,8 +20,12 @@ the standards of other languages. Go would be a good candidate except that its an opinionated language that forces a certain programming model, whereas we would like a language that offers least resistance. -Java and C# seem like good candidates. Java has some limitations that make it harder to write memory optimized -code that is often necessary in a production compiler, but we don't care so much about that. Between the two, -Java is both more popular and has a wider set of tools available. +Java, Kotlin, Swift and C# seem like good candidates. Java has some limitations that make it harder to write memory optimized +code that is often necessary in a production compiler, but we don't care so much about that. -I see no other sane choice but Java. My first choice would have been D if it was purely a technical question. +I decided to use Java because it is the language I am most familiar with, has great tooling, and despite some +shortcomings, is widely understood by developers around the world. My first choice would have been D if it was +purely a question of technical preference. + +The use of Java biases the implementation towards using some Object Orientation; this is just a consequence of the +most comfortable way of expressing some designs in Java. 
From 1d96da54b1d688c27f28d72134e5d97ad7b61398 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 10:20:16 +0100 Subject: [PATCH 05/46] Update compiler-books.rst --- site/source/compiler-books.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index e8b45a9..c040492 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -1,4 +1,5 @@ Compiler Books ============== -I own a bunch of compiler books that I have purchased over the years. +I have owned a bunch of compiler books that I have purchased over the years. + From b7ad227c4a8778a8988bedc97184722f71f32171 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 11:09:44 +0100 Subject: [PATCH 06/46] Update compiler-books.rst --- site/source/compiler-books.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index c040492..4678eff 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -1,5 +1,21 @@ +============== Compiler Books ============== I have owned a bunch of compiler books that I have purchased over the years. +Dragon Books +============ +I have 3 editions of these. + +* Principles of Compiler Design. Aho & Ullman, 1977. +* Compilers: Principles, Techniques and Tools. Aho, Sethi, Ullman, 1986. +* Compilers: Principles, Techniques and Tools, 2nd Ed. Aho, Lam, Sethi, Ullman, 2006. + +These books are criticised today because of the excessive focus on lexical analysis and parsing techniques. +While this is true, they do cover various aspects of the compiler such as intermediate representation and +optimization techniques such as peephole optimzation, data flow analysis, register allocation etc. + +The 2nd edition adopts a more mathematical presentation style, whereas the earlier editions present +algorithms using pseudo code. 
I think the 1986 edition is the best. + From 9edfd09b9ab855e5a4600053d9360cccb8d3510f Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 11:23:48 +0100 Subject: [PATCH 07/46] Update compiler-books.rst --- site/source/compiler-books.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index 4678eff..ed2cacf 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -19,3 +19,9 @@ optimization techniques such as peephole optimzation, data flow analysis, regis The 2nd edition adopts a more mathematical presentation style, whereas the earlier editions present algorithms using pseudo code. I think the 1986 edition is the best. +For a different take on 2nd edition see `Review of the second addition of the "Dragon Book"`_. + + +Other Book Reviews +================== +* `List of compiler books `_ From b583cd0d39bafefc39d949f5d2c167aeb2d5c0ee Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 11:24:26 +0100 Subject: [PATCH 08/46] Update compiler-books.rst --- site/source/compiler-books.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index ed2cacf..497fdfd 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -19,7 +19,7 @@ optimization techniques such as peephole optimzation, data flow analysis, regis The 2nd edition adopts a more mathematical presentation style, whereas the earlier editions present algorithms using pseudo code. I think the 1986 edition is the best. -For a different take on 2nd edition see `Review of the second addition of the "Dragon Book"`_. +For a different take on 2nd edition see `Review of the second addition of the "Dragon Book" `_. 
Other Book Reviews From a9c1b3bab71a9f7c7169f8c398c5bac9a84526b9 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 12:08:06 +0100 Subject: [PATCH 09/46] Update compiler-books.rst --- site/source/compiler-books.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index 497fdfd..fdff41f 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -19,7 +19,7 @@ optimization techniques such as peephole optimzation, data flow analysis, regis The 2nd edition adopts a more mathematical presentation style, whereas the earlier editions present algorithms using pseudo code. I think the 1986 edition is the best. -For a different take on 2nd edition see `Review of the second addition of the "Dragon Book" `_. +For a different take on 2nd edition see `Review of the second addition of the "Dragon Book" `_. Other Book Reviews From 9cd649d04d0b3b21163caf1e7ee1c2b7c984bd25 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 12:17:32 +0100 Subject: [PATCH 10/46] Update compiler-books.rst --- site/source/compiler-books.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index fdff41f..5f4a1e0 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -21,6 +21,14 @@ algorithms using pseudo code. I think the 1986 edition is the best. For a different take on 2nd edition see `Review of the second addition of the "Dragon Book" `_. +Engineering a Compiler, 2nd Ed. Cooper & Torczon. 2012. +======================================================= +This is a more modern version of the Dragon book, one could say. Its less focused on the lexical analysis / parsing +phases, and covers later phases in more detail. 
Exposition is similar to the Dragon book, mostly describes +techniques conceptually, with some high level algorithm descriptions, but like the Dragon book, does not +go into detailed descriptions of algorithms in general. + + Other Book Reviews ================== From 779bc31bd8dc7fd3fafec86906d5bf4519989d1a Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 12:29:47 +0100 Subject: [PATCH 11/46] Update compiler-books.rst --- site/source/compiler-books.rst | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index 5f4a1e0..7099c3c 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -2,7 +2,7 @@ Compiler Books ============== -I have owned a bunch of compiler books that I have purchased over the years. +I own a bunch of compiler books that I have purchased over the years. Dragon Books ============ @@ -28,6 +28,16 @@ phases, and covers later phases in more detail. Exposition is similar to the Dra techniques conceptually, with some high level algorithm descriptions, but like the Dragon book, does not go into detailed descriptions of algorithms in general. +Both this and the Dragon books describe ahead of time compilers and cover topics that are suited for procedural languages +such as C or traditional Pascal or Fortran. Interesting topics such as Object Orientation, Closures, Generics, +or Semantic analysis of languages without forward declarations, etc. are not covered in any detail. + +Modern Compiler Implementation in C. Appel. 1998. +================================================= +This book takes a more hands on approach to describing how to implement both a front end and back end of a compiler, +using a toy language called Tiger as an example. Algorithms are described in pseudo code in more detail. If I had to choose +between the Dragon book, Engineering a compiler, and this book, I would pick this one. 
+ Other Book Reviews From 05e83ae20496d7ca03f20bba4a4479617dafb195 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 13:34:29 +0100 Subject: [PATCH 12/46] Update compiler-books.rst --- site/source/compiler-books.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index 7099c3c..fd327ef 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -38,7 +38,18 @@ This book takes a more hands on approach to describing how to implement both a f using a toy language called Tiger as an example. Algorithms are described in pseudo code in more detail. If I had to choose between the Dragon book, Engineering a compiler, and this book, I would pick this one. +This book covers functional languages, closures, as well as Object Oriented languages such as Java. Type inference is +covered too. +Crafting a Compiler. Fischer, LeBlanc, Cytron. 2010. +==================================================== +The last couple of chapters are the most interesting -these focus on code generation and program optimization. + +The 2nd edition of the book (with Cytron as co author) has a description of Static Single assignment that is +perhaps the most complete in all the books I cover here. The 1st edition describes data flow analysis in more +detail. + +Apart from the final two chapters, the rest of the book is about parsing and semantic analysis. Other Book Reviews ================== From 1291ae9876df6caf59f53e26ae1d6edb6b0f216e Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 14:00:33 +0100 Subject: [PATCH 13/46] Update compiler-books.rst --- site/source/compiler-books.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index fd327ef..e17993a 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -51,6 +51,24 @@ detail. 
Apart from the final two chapters, the rest of the book is about parsing and semantic analysis. +Building an Optimizing Compiler. Bob Morgan. 1998. +================================================== +I have the kindle edition which is very poor and hard to read. I wish I had a paper copy. + +This book is almost completely about the backend of the compiler. + +Advanced Compiler Design & Implementation. Muchnick. 1997. +========================================================== +I have the kindle edition, which is again very poor and hard to read. + +My impression is that this book describes some algorithms in detail but then leaves out parts of the algorithm, +hence you have to figure things out that are not described. It has a reputation of containing many errors. + +This book describes the idea of multiple levels of intermediate representation, HIR, MIR and LIR. +I guess this has influenced many compiler implementations. + +Its coverage of SSA is rudimentary - I guess it was written when SSA was still very new. + Other Book Reviews ================== * `List of compiler books `_ From f7406a3b582d712f8f375285e8a9c9d680a9ed96 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 14:19:44 +0100 Subject: [PATCH 14/46] Update compiler-books.rst --- site/source/compiler-books.rst | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index e17993a..273e346 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -23,10 +23,10 @@ For a different take on 2nd edition see `Review of the second addition of the "D Engineering a Compiler, 2nd Ed. Cooper & Torczon. 2012. ======================================================= -This is a more modern version of the Dragon book, one could say. Its less focused on the lexical analysis / parsing +This is a more modern version of the Dragon book, one could say. 
It is less focused on the lexical analysis / parsing phases, and covers later phases in more detail. Exposition is similar to the Dragon book, mostly describes techniques conceptually, with some high level algorithm descriptions, but like the Dragon book, does not -go into detailed descriptions of algorithms in general. +go into detailed descriptions of algorithms in general. Both this and the Dragon books describe ahead of time compilers and cover topics that are suited for procedural languages such as C or traditional Pascal or Fortran. Interesting topics such as Object Orientation, Closures, Generics, @@ -43,7 +43,7 @@ covered too. Crafting a Compiler. Fischer, LeBlanc, Cytron. 2010. ==================================================== -The last couple of chapters are the most interesting -these focus on code generation and program optimization. +The last couple of chapters are the most interesting - these focus on code generation and program optimization. The 2nd edition of the book (with Cytron as co author) has a description of Static Single assignment that is perhaps the most complete in all the books I cover here. The 1st edition describes data flow analysis in more @@ -61,14 +61,28 @@ Advanced Compiler Design & Implementation. Muchnick. 1997. ========================================================== I have the kindle edition, which is again very poor and hard to read. -My impression is that this book describes some algorithms in detail but then leaves out parts of the algorithm, -hence you have to figure things out that are not described. It has a reputation of containing many errors. +This book is also mostly about the backend of a compiler. + +My impression is that this book describes many algorithms in detail but then leaves out parts of the algorithm, +hence you have to figure things out that are not described. This book describes the idea of multiple levels of intermediate representation, HIR, MIR and LIR. 
I guess this has influenced many compiler implementations. Its coverage of SSA is rudimentary - I guess it was written when SSA was still very new. +This book has a reputation of containing many errors. + +Retargetable C Compiler, A: Design and Implementation. Hanson & Fraser. 1995. +============================================================================= +Describes a production C compiler. Detailed description of the actual compiler code. + +Weak on theoretical aspects, and limited by features of the compiler being described. + +Program Flow Analysis: Theory and Applications. Editors Muchnick, Jones. 1981. +============================================================================== +Collection of essays on program analysis, by various authors. + Other Book Reviews ================== * `List of compiler books `_ From bbdc9ccb6337108baab4399d218cff2ef23c9d63 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 14:32:15 +0100 Subject: [PATCH 15/46] Create lexical-analysis.rst --- site/source/lexical-analysis.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 site/source/lexical-analysis.rst diff --git a/site/source/lexical-analysis.rst b/site/source/lexical-analysis.rst new file mode 100644 index 0000000..1333ed7 --- /dev/null +++ b/site/source/lexical-analysis.rst @@ -0,0 +1 @@ +TODO From b5e517a51b03e8f8c8b7840eea49f3f9c7372eb8 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 14:32:33 +0100 Subject: [PATCH 16/46] Create parsing.rst --- site/source/parsing.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 site/source/parsing.rst diff --git a/site/source/parsing.rst b/site/source/parsing.rst new file mode 100644 index 0000000..1333ed7 --- /dev/null +++ b/site/source/parsing.rst @@ -0,0 +1 @@ +TODO From d3687970a50ab9e954c85d1b4492a4115c3436e8 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 14:32:54 +0100 Subject: [PATCH 17/46] Create type-systems.rst --- 
site/source/type-systems.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 site/source/type-systems.rst diff --git a/site/source/type-systems.rst b/site/source/type-systems.rst new file mode 100644 index 0000000..1333ed7 --- /dev/null +++ b/site/source/type-systems.rst @@ -0,0 +1 @@ +TODO From 34908771a3bbeb57a9b50074ea0c427340346ab0 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 14:33:20 +0100 Subject: [PATCH 18/46] Create semantic-analysis.rst --- site/source/semantic-analysis.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 site/source/semantic-analysis.rst diff --git a/site/source/semantic-analysis.rst b/site/source/semantic-analysis.rst new file mode 100644 index 0000000..1333ed7 --- /dev/null +++ b/site/source/semantic-analysis.rst @@ -0,0 +1 @@ +TODO From b1dc3f3f3d9fe1055e9914e94100f0fdac0f584e Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 14:33:50 +0100 Subject: [PATCH 19/46] Create abstract-syntax-tree.rst --- site/source/abstract-syntax-tree.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 site/source/abstract-syntax-tree.rst diff --git a/site/source/abstract-syntax-tree.rst b/site/source/abstract-syntax-tree.rst new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/site/source/abstract-syntax-tree.rst @@ -0,0 +1 @@ + From 5bae0ed8385408a2abd65ea8b8d5c4214075254a Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 14:34:02 +0100 Subject: [PATCH 20/46] Update abstract-syntax-tree.rst --- site/source/abstract-syntax-tree.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/source/abstract-syntax-tree.rst b/site/source/abstract-syntax-tree.rst index 8b13789..1333ed7 100644 --- a/site/source/abstract-syntax-tree.rst +++ b/site/source/abstract-syntax-tree.rst @@ -1 +1 @@ - +TODO From 52b0c698be5ad6829a9d10e1478291764b4d549f Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 14:35:50 +0100 Subject: [PATCH 
21/46] Update index.rst --- site/source/index.rst | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/site/source/index.rst b/site/source/index.rst index 1c9f9ee..6e360a8 100644 --- a/site/source/index.rst +++ b/site/source/index.rst @@ -40,11 +40,15 @@ Preliminaries Basic Front-End techniques ========================== -* Lexical analysis -* Parsing -* Abstract Syntax Trees -* Type Systems -* Semantic Analysis +.. toctree:: + :maxdepth: 2 + :caption: Basic Front-end Techniques + + lexical-analysis + parsing + abstract-syntax-tree + type-systems + semantic-analysis Basic Back-end techniques ========================= From 2a1b1ea40c689cde7425b30e2c9ead8f0a4c6b7c Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 14:37:16 +0100 Subject: [PATCH 22/46] Update lexical-analysis.rst --- site/source/lexical-analysis.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/site/source/lexical-analysis.rst b/site/source/lexical-analysis.rst index 1333ed7..e905f85 100644 --- a/site/source/lexical-analysis.rst +++ b/site/source/lexical-analysis.rst @@ -1 +1,9 @@ +Lexical Analysis +================ + TODO + +Example Implementation in EZ Language +===================================== + +See `Lexer `_. From 15f9f0f70245661b3fdf5647a01a071e6106486e Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 14:38:25 +0100 Subject: [PATCH 23/46] Update parsing.rst --- site/source/parsing.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/site/source/parsing.rst b/site/source/parsing.rst index 1333ed7..7887307 100644 --- a/site/source/parsing.rst +++ b/site/source/parsing.rst @@ -1 +1,10 @@ +======= +Parsing +======= + TODO + +Example Implementation +====================== + +See `EZ Language Parser `_. 
From b12a157a21d01e77d458c083c35bfc35d0e1862a Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 14:39:49 +0100 Subject: [PATCH 24/46] Update type-systems.rst --- site/source/type-systems.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/site/source/type-systems.rst b/site/source/type-systems.rst index 1333ed7..701f68c 100644 --- a/site/source/type-systems.rst +++ b/site/source/type-systems.rst @@ -1 +1,10 @@ +============ +Type Systems +============ + TODO + +Example Implementation +====================== + +See `Type System in EZ Language `_. From bd0bfc9515eb0f4d259a53d8647248be57674381 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 13 Oct 2024 14:41:15 +0100 Subject: [PATCH 25/46] Update abstract-syntax-tree.rst --- site/source/abstract-syntax-tree.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/site/source/abstract-syntax-tree.rst b/site/source/abstract-syntax-tree.rst index 1333ed7..4bf867b 100644 --- a/site/source/abstract-syntax-tree.rst +++ b/site/source/abstract-syntax-tree.rst @@ -1 +1,10 @@ +==================== +Abstract Syntax Tree +==================== + TODO + +Example Implementation +====================== + +* See `AST in EZ Language `_. From 35e29ba5c892eaa276f64574f87b123355fbf861 Mon Sep 17 00:00:00 2001 From: dibyendumajumdar Date: Sun, 13 Oct 2024 18:28:49 +0100 Subject: [PATCH 26/46] Describe lexical analyser --- site/source/index.rst | 4 +- site/source/lexical-analysis.rst | 54 +++++++++++++++++-- site/source/semantic-analysis.rst | 6 +++ .../{parsing.rst => syntax-analysis.rst} | 6 +-- 4 files changed, 61 insertions(+), 9 deletions(-) rename site/source/{parsing.rst => syntax-analysis.rst} (75%) diff --git a/site/source/index.rst b/site/source/index.rst index 6e360a8..9a0bb73 100644 --- a/site/source/index.rst +++ b/site/source/index.rst @@ -42,10 +42,10 @@ Basic Front-End techniques .. 
toctree:: :maxdepth: 2 - :caption: Basic Front-end Techniques + :caption: Parsing Techniques lexical-analysis - parsing + syntax-analysis abstract-syntax-tree type-systems semantic-analysis diff --git a/site/source/lexical-analysis.rst b/site/source/lexical-analysis.rst index e905f85..ab1d26f 100644 --- a/site/source/lexical-analysis.rst +++ b/site/source/lexical-analysis.rst @@ -1,9 +1,55 @@ +================ Lexical Analysis ================ -TODO +When compiling a program we need to recognize the words and punctuations that make up the vocabulary of the language. +This part of the compiler is therefore known as "lexical" analysis. + +Usually a compiler is given one or more input programs, and the first thing it must do is read the program and +figure out what lexical elements appear in the program. + +Typically, these lexical elements are known as tokens. So for example, in the following snippet of code:: + + print('hi') + +We have a number of lexical elements / tokens: + +* ``print`` +* ``(`` +* ``'hi'`` +* ``)`` + +There are many different ways to implement a "lexer" - the name we give to this component of the compiler. + +* We can write this code by hand. This involves scanning the input program character by character and + deciding what tokens appear in the program. +* Or we can specify the lexical elements in a grammar and have a tool generate the code to process the input + program and give us the tokens that appear in the program. + +A lexical analyser can be designed to process input on demand, or it may be designed to translate the entire +input source to a set of tokens at the very beginning. + +Considerations +============== + +* Should comments in the input program be retained as tokens? Usually a lexer will discard comments, but in languages that + allow comments to be retained as documentation, the lexer must not discard them. +* Should end of line markers be retained? 
Typically lexers drop all intermediate space including line markers, + but if the language syntax depends on line markers then these may need to be retained. +* Should tokens copy the input text, convert them to another form, or retain pointers to the input itself? + Retaining the original form of the lexical token may be important in some cases, for example if the lexer + is used in a code formatter. +* How much can we peek ahead? During later stages of the compiler, depending on the complexity of the language grammar, + it may be necessary to allow the compiler to look ahead one or more tokens without consuming them. +* Ancillary information regarding tokens, such as line number and column number in the input source, is invaluable for + error reporting. + +Example Hand-Coded Implementation +================================= + +The `Lexer `_ module in the EZ language +implementation contains an example of a hand-coded lexical analyser written in Java. This implementation returns tokens +on demand. -Example Implementation in EZ Language -===================================== +Another example is the `Lua lexer `_. -See `Lexer `_. 
diff --git a/site/source/semantic-analysis.rst b/site/source/semantic-analysis.rst index 1333ed7..779c487 100644 --- a/site/source/semantic-analysis.rst +++ b/site/source/semantic-analysis.rst @@ -1 +1,7 @@ +================= +Semantic Analysis +================= + TODO + + diff --git a/site/source/parsing.rst b/site/source/syntax-analysis.rst similarity index 75% rename from site/source/parsing.rst rename to site/source/syntax-analysis.rst index 7887307..b14c1cb 100644 --- a/site/source/parsing.rst +++ b/site/source/syntax-analysis.rst @@ -1,6 +1,6 @@ -======= -Parsing -======= +=============== +Syntax Analysis +=============== TODO From d254f87c5f77ea73d673bd66817db9adee5327f8 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Tue, 22 Oct 2024 11:50:14 +0100 Subject: [PATCH 27/46] Update compiler-books.rst --- site/source/compiler-books.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index 273e346..f77a553 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -32,11 +32,11 @@ Both this and the Dragon books describe ahead of time compilers and cover topics such as C or traditional Pascal or Fortran. Interesting topics such as Object Orientation, Closures, Generics, or Semantic analysis of languages without forward declarations, etc. are not covered in any detail. -Modern Compiler Implementation in C. Appel. 1998. -================================================= -This book takes a more hands on approach to describing how to implement both a front end and back end of a compiler, -using a toy language called Tiger as an example. Algorithms are described in pseudo code in more detail. If I had to choose -between the Dragon book, Engineering a compiler, and this book, I would pick this one. +Modern Compiler Implementation in C. Appel. 1998. 
(Tiger book) +============================================================== +This book takes a hands on tutorial like approach to describing how to implement both the front-end and back-end +of a compiler, using a toy language called Tiger as an example. Algorithms are described in pseudo code. +If I had to choose between the Dragon book, Engineering a compiler, and this book, I would pick this one. This book covers functional languages, closures, as well as Object Oriented languages such as Java. Type inference is covered too. @@ -61,7 +61,7 @@ Advanced Compiler Design & Implementation. Muchnick. 1997. ========================================================== I have the kindle edition, which is again very poor and hard to read. -This book is also mostly about the backend of a compiler. +This book is also mostly about the backend of a compiler, focusing on optimization. My impression is that this book describes many algorithms in detail but then leaves out parts of the algorithm, hence you have to figure things out that are not described. From 3282d025725b420372bc3fc8b5c2895ac07fceec Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sat, 2 Nov 2024 10:35:51 +0000 Subject: [PATCH 28/46] Update compiler-books.rst --- site/source/compiler-books.rst | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index f77a553..4bf64a1 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -13,8 +13,9 @@ I have 3 editions of these. * Compilers: Principles, Techniques and Tools, 2nd Ed. Aho, Lam, Sethi, Ullman, 2006. These books are criticised today because of the excessive focus on lexical analysis and parsing techniques. -While this is true, they do cover various aspects of the compiler such as intermediate representation and -optimization techniques such as peephole optimzation, data flow analysis, register allocation etc. 
+While this is true, they do cover various other aspects of a compiler backend such as intermediate representation, and +optimization techniques including peephole optimzation, data flow analysis, register allocation etc. +I found the description of the lattice in a data flow analysis quite accessible. The 2nd edition adopts a more mathematical presentation style, whereas the earlier editions present algorithms using pseudo code. I think the 1986 edition is the best. @@ -23,14 +24,15 @@ For a different take on 2nd edition see `Review of the second addition of the "D Engineering a Compiler, 2nd Ed. Cooper & Torczon. 2012. ======================================================= -This is a more modern version of the Dragon book, one could say. It is less focused on the lexical analysis / parsing -phases, and covers later phases in more detail. Exposition is similar to the Dragon book, mostly describes +This is a more modern version of the Dragon book. It is less focused on the lexical analysis / parsing +phases, and covers the later phases of a compiler in more detail. Exposition is similar to the Dragon book, i.e. mostly describes techniques conceptually, with some high level algorithm descriptions, but like the Dragon book, does not -go into detailed descriptions of algorithms in general. +go into detailed descriptions of algorithms. Both this and the Dragon books describe ahead of time compilers and cover topics that are suited for procedural languages -such as C or traditional Pascal or Fortran. Interesting topics such as Object Orientation, Closures, Generics, -or Semantic analysis of languages without forward declarations, etc. are not covered in any detail. +such as C or traditional Pascal or Fortran. They cover both front-end and back-end techniques; however, on the front-end +side, interesting topics such as Object Orientation, Closures, Generics, +or Semantic analysis of more complex languages such as Java are not covered. 
Modern Compiler Implementation in C. Appel. 1998. (Tiger book) ============================================================== @@ -63,8 +65,8 @@ I have the kindle edition, which is again very poor and hard to read. This book is also mostly about the backend of a compiler, focusing on optimization. -My impression is that this book describes many algorithms in detail but then leaves out parts of the algorithm, -hence you have to figure things out that are not described. +My impression is that this book describes many algorithms in detail. But when I tried to implement one of the +simpler algorithms (18.1 Unreachable Code Elimination) I found that the description left out a part (No_Path) of the algorithm. This book describes the idea of multiple levels of intermediate representation, HIR, MIR and LIR. I guess this has influenced many compiler implementations. From caed648b76788c02877d8d4c08e3fd53e6ae66cd Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sat, 2 Nov 2024 13:17:19 +0000 Subject: [PATCH 29/46] Update compiler-books.rst --- site/source/compiler-books.rst | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index 4bf64a1..454cf2e 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -61,19 +61,23 @@ This book is almost completely about the backend of the compiler. Advanced Compiler Design & Implementation. Muchnick. 1997. ========================================================== -I have the kindle edition, which is again very poor and hard to read. +I have the kindle edition, which is very poor and hard to read. This book is also mostly about the backend of a compiler, focusing on optimization. My impression is that this book describes many algorithms in detail. But when I tried to implement one of the -simpler algorithms (18.1 Unreachable Code Elimination) I found that the description left out a part (No_Path) of the algorithm. 
+simpler algorithms (18.1 Unreachable Code Elimination) I found that the description left out a +part (No_Path) of the algorithm. This book describes the idea of multiple levels of intermediate representation, HIR, MIR and LIR. I guess this has influenced many compiler implementations. Its coverage of SSA is rudimentary - I guess it was written when SSA was still very new. -This book has a reputation of containing many errors. +This book has a reputation of containing many errors, although I assume the latest printings have the errors +fixed. + +Despite its faults, it is a must have book if you want to learn about compiler construction. Retargetable C Compiler, A: Design and Implementation. Hanson & Fraser. 1995. ============================================================================= @@ -83,7 +87,8 @@ Weak on theoretical aspects, and limited by features of the compiler being descr Program Flow Analysis: Theory and Applications. Editors Muchnick, Jones. 1981. ============================================================================== -Collection of essays on program analysis, by various authors. +Collection of essays on program analysis, by various authors. This is pre-SSA, hence a bit +dated. Other Book Reviews ================== From 47ffb455eedf79f956dd010c4ebc1799aee1c355 Mon Sep 17 00:00:00 2001 From: dibyendumajumdar Date: Mon, 9 Dec 2024 16:27:01 +0000 Subject: [PATCH 30/46] Fix typo --- site/source/compiler-books.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index 454cf2e..ff2a886 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -13,8 +13,8 @@ I have 3 editions of these. * Compilers: Principles, Techniques and Tools, 2nd Ed. Aho, Lam, Sethi, Ullman, 2006. These books are criticised today because of the excessive focus on lexical analysis and parsing techniques. 
-While this is true, they do cover various other aspects of a compiler backend such as intermediate representation, and -optimization techniques including peephole optimzation, data flow analysis, register allocation etc. +While this is true, they do cover various aspects of a compiler backend such as intermediate representations and +optimization techniques including peephole optimization, data flow analysis, register allocation etc. I found the description of the lattice in a data flow analysis quite accessible. The 2nd edition adopts a more mathematical presentation style, whereas the earlier editions present From 71a1559b34b42cdb1e34a3311de7e3e8de0ab364 Mon Sep 17 00:00:00 2001 From: dibyendumajumdar Date: Wed, 11 Dec 2024 13:12:28 +0000 Subject: [PATCH 31/46] Initial stab at learning resources --- site/source/index.rst | 26 +++++++++++++---- site/source/learning-resources.rst | 46 ++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 site/source/learning-resources.rst diff --git a/site/source/index.rst b/site/source/index.rst index 9a0bb73..a4a6d43 100644 --- a/site/source/index.rst +++ b/site/source/index.rst @@ -28,11 +28,19 @@ The language will be statically typed to start with because this allows us to in optimization pipeline. Dyamically typed languages have their own interesting engineering problems. We will eventually look at gradual typing and dynamic typing. +Implementation and Discussions +============================== + +* The `EeZee programming language implementation `_ will serve as the playground for exploring various compilation + techniques. +* This site is `maintained in github `_ too, and is generated using Sphinx. +* We have a `Discussion Forum `_. + Preliminaries ============= .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Preliminaries prelim-impl-lang @@ -41,7 +49,7 @@ Basic Front-End techniques ========================== .. 
toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Parsing Techniques lexical-analysis @@ -61,9 +69,9 @@ Basic Optimization techniques ============================= * Dominators and Control Flow Graph +* Static Single Assignment * Data Flow Analysis, Type Lattices, Abstract Interpretation * Peephole Optimizations -* Static Single Assignment * Sea of Nodes Representation * Code generation and Register Allocation @@ -84,9 +92,14 @@ Advanced Front-end techniques * Generics -Some Useful Projects -==================== +Some Useful Learning Resources +============================== +.. toctree:: + :maxdepth: 2 + :caption: Learning Resources + + learning-resources Book Reviews ============ @@ -97,4 +110,7 @@ Book Reviews compiler-books +Compiler Jobs +============= +* A listing of `compiler, language and runtime teams `_ for people looking for compiler jobs. \ No newline at end of file diff --git a/site/source/learning-resources.rst b/site/source/learning-resources.rst new file mode 100644 index 0000000..3f00d3f --- /dev/null +++ b/site/source/learning-resources.rst @@ -0,0 +1,46 @@ +================== +Learning Resources +================== + +Courses +======= + +CS 6120: Advanced Compilers: The Self-Guided Online Course +---------------------------------------------------------- + +* `CS 6120 `_ +* `BRIL `_ +* `github repo `_ + +CS 618: Program Analysis +------------------------ +* `CS 618 Video Lectures `_ +* `An Introduction to Program Analysis `_ + +Static Program Analysis +----------------------- +* `Static Program Analysis `_ +* `TIP `_ +* `Static Program Analysis Part 1 - PLISS 2019 `_ +* `Static Program Analysis Part 2 - PLISS 2019 `_ + +Papers And implementations +========================== + +Sea of Nodes +------------ +* `From Quads to Graphs: An Intermediate Representation's Journey `_ +* `Combining Analyses, Combining Optimizations `_ +* `A Simple Graph-Based Intermediate Representation `_ +* `Global Code Motion Global Value Numbering `_ +* `Simple Sea of 
Nodes Implementation `_ + +JikesRVM +-------- +* `Dynamic Optimization through the use of Automatic Runtime Specialization `_ +* `Implementation in JikesRVM `_ + +Others +====== + +* `Automatic Program Optimization, by Ron Cytron `_ From 93eda5638c4a1cd801c18882816a6df6e6336fd1 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Wed, 18 Dec 2024 21:17:32 +0000 Subject: [PATCH 32/46] Create intermediate-representations.rst --- site/source/intermediate-representations.rst | 44 ++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 site/source/intermediate-representations.rst diff --git a/site/source/intermediate-representations.rst b/site/source/intermediate-representations.rst new file mode 100644 index 0000000..fd4fb68 --- /dev/null +++ b/site/source/intermediate-representations.rst @@ -0,0 +1,44 @@ +============================ +Intermediate Representations +============================ + +In general terms, an input program in the source language may go through many intermediate representations within +the compiler before it is in a form ready for execution. + +One of the first such intermediate representations that we have seen is the +the Abstract Syntax Tree, which is mainly concerned with the grammar of the source language. + +From the AST, we generate a different kind of intermediate representation, one that is more amenable +to manipulations required during optimization and execution. + +In the EeZee Programming Language, we implement a stack-based intermediate representation, as well as two +variations of a register-based representation. + +Stack-Based IR +============== + +The stack based IR encodes stack operations as part of the intermediate representation. Let's look at a simple +example:: + + func foo(n: Int)->Int { + return n+1; + } + +Produces:: + + L0: + load 0 + pushi 1 + addi + jump L1 + L1: + +The stack based IR is so called because many of the instructions in the IR push values to an evaluation stack at +runtime. 
Above for example we have the following instructions: + +* ``load 0`` - this refers to loading the value of the input parameter ``n`` to the stack. +* ``pushi 1`` - pushes the constant ``1`` to the stack. +* ``addi`` - pops the two topmost values on the stack, and computes the sum and pushes this to the stack + +So at the end of the program we are left with the sum of ``n+1`` on the stack, and this forms the return +value of the function. From b5f56598d447fa0b81e84fad0bd144cb348ab309 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Wed, 18 Dec 2024 22:34:07 +0000 Subject: [PATCH 33/46] Update intermediate-representations.rst --- site/source/intermediate-representations.rst | 41 ++++++++++++++++---- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/site/source/intermediate-representations.rst b/site/source/intermediate-representations.rst index fd4fb68..f228047 100644 --- a/site/source/intermediate-representations.rst +++ b/site/source/intermediate-representations.rst @@ -2,17 +2,19 @@ Intermediate Representations ============================ -In general terms, an input program in the source language may go through many intermediate representations within -the compiler before it is in a form ready for execution. +An input program in the source language may go through many intermediate representations within +a compiler before it is in a form ready for execution. One of the first such intermediate representations that we have seen is the -the Abstract Syntax Tree, which is mainly concerned with the grammar of the source language. +the Abstract Syntax Tree (AST), which is mainly concerned with the grammer of the source language. From the AST, we generate a different kind of intermediate representation, one that is more amenable -to manipulations required during optimization and execution. - -In the EeZee Programming Language, we implement a stack-based intermediate representation, as well as two -variations of a register-based representation. 
+to the manipulations required during optimization and execution. There are many such representations; we will +limit ourselves to the following. + +* Stack based IR +* Register based IR +* Sea of Nodes IR Stack-Based IR ============== @@ -42,3 +44,28 @@ runtime. Above for example we have the following instructions: So at the end of the program we are left with the sum of ``n+1`` on the stack, and this forms the return value of the function. + +Advantages +---------- +* The IR is compact to represent in stored form, hence many languages choose to encode their compiled code in + this form. Examples are Java, C#, Web Assembly. +* The IR can be executed easily by an Interpreter. + +Disadvantages +------------- +* Not easy to implement optimizations +* Harder to analyze the IR, although there are methods available to do so. + +Examples +-------- +* Example implementation on EeZee Programming Language +* Java Specifications +* Approaches for Analysis + +Register Based IR +================= + + + + + From 8665da8e9a44011e5e0dde7bb34bd9954c6982df Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Wed, 18 Dec 2024 22:41:34 +0000 Subject: [PATCH 34/46] Update intermediate-representations.rst --- site/source/intermediate-representations.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/source/intermediate-representations.rst b/site/source/intermediate-representations.rst index f228047..9410080 100644 --- a/site/source/intermediate-representations.rst +++ b/site/source/intermediate-representations.rst @@ -6,7 +6,7 @@ An input program in the source language may go through many intermediate represe a compiler before it is in a form ready for execution. One of the first such intermediate representations that we have seen is the -the Abstract Syntax Tree (AST), which is mainly concerned with the grammer of the source language. +the Abstract Syntax Tree (AST), which is mainly concerned with the grammar of the source language. 
From the AST, we generate a different kind of intermediate representation, one that is more amenable to the manipulations required during optimization and execution. There are many such representations; we will From 8321374a0668c3dcebfcb679eff8d1b5e8568742 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Sun, 29 Dec 2024 10:23:07 +0000 Subject: [PATCH 35/46] Update compiler-books.rst --- site/source/compiler-books.rst | 35 +++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index ff2a886..f1da0a1 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -15,30 +15,38 @@ I have 3 editions of these. These books are criticised today because of the excessive focus on lexical analysis and parsing techniques. While this is true, they do cover various aspects of a compiler backend such as intermediate representations and optimization techniques including peephole optimization, data flow analysis, register allocation etc. -I found the description of the lattice in a data flow analysis quite accessible. +I found the description of the lattice in a data flow analysis quite accessible. The 2nd edition adopts a more mathematical presentation style, whereas the earlier editions present algorithms using pseudo code. I think the 1986 edition is the best. +The dragon books are a bit dated in that newer techniques such as Static Single Assignment or Graph +Coloring Register Allocation etc. are not covered in any detail. + For a different take on 2nd edition see `Review of the second addition of the "Dragon Book" `_. Engineering a Compiler, 2nd Ed. Cooper & Torczon. 2012. ======================================================= This is a more modern version of the Dragon book. It is less focused on the lexical analysis / parsing -phases, and covers the later phases of a compiler in more detail. Exposition is similar to the Dragon book, i.e. 
mostly describes -techniques conceptually, with some high level algorithm descriptions, but like the Dragon book, does not -go into detailed descriptions of algorithms. +phases, and covers the later phases of a compiler in more detail. Exposition is similar to the Dragon book, i.e. describes +techniques conceptually, and some algorithms are described in detail using a form of pseudo code. + +Defines an intermediate language called ILOC, but this does not have support for function calls. + +In practice, I found it useful for implementing Dominator algorithm and SSA transformation. However, liveness +analysis does not cover SSA form, and exiting out of SSA is described conceptually but the algorithm is not +described in detail, so I had to consult the paper by Preston Briggs on which it is based. Both this and the Dragon books describe ahead of time compilers and cover topics that are suited for procedural languages such as C or traditional Pascal or Fortran. They cover both front-end and back-end techniques; however, on the front-end -side, interesting topics such as Object Orientation, Closures, Generics, -or Semantic analysis of more complex languages such as Java are not covered. +side, interesting topics such as Object Orientation, Closures, Generics, Semantic analysis of more complex languages such as Java are not covered. Modern Compiler Implementation in C. Appel. 1998. (Tiger book) ============================================================== This book takes a hands on tutorial like approach to describing how to implement both the front-end and back-end of a compiler, using a toy language called Tiger as an example. Algorithms are described in pseudo code. -If I had to choose between the Dragon book, Engineering a compiler, and this book, I would pick this one. +If I had to choose between the Dragon book, Engineering a compiler, and this book, I would pick this one and +Engineering a Compiler. 
This book covers functional languages, closures, as well as Object Oriented languages such as Java. Type inference is covered too. @@ -47,9 +55,11 @@ Crafting a Compiler. Fischer, LeBlanc, Cytron. 2010. ==================================================== The last couple of chapters are the most interesting - these focus on code generation and program optimization. -The 2nd edition of the book (with Cytron as co author) has a description of Static Single assignment that is -perhaps the most complete in all the books I cover here. The 1st edition describes data flow analysis in more -detail. +The 2nd edition of the book (with Cytron as co author) has a description of Static Single assignment. However the +description is based on a statement level IR, rather than one that uses Basic Blocks. Also, the algorithm for exiting +SSA is not described. + +The 1st edition describes data flow analysis in more detail, but does not cover SSA. Apart from the final two chapters, the rest of the book is about parsing and semantic analysis. @@ -67,13 +77,16 @@ This book is also mostly about the backend of a compiler, focusing on optimizati My impression is that this book describes many algorithms in detail. But when I tried to implement one of the simpler algorithms (18.1 Unreachable Code Elimination) I found that the description left out a -part (No_Path) of the algorithm. +part (No_Path) of the algorithm. This book describes the idea of multiple levels of intermediate representation, HIR, MIR and LIR. I guess this has influenced many compiler implementations. Its coverage of SSA is rudimentary - I guess it was written when SSA was still very new. +The Graph Coloring register allocation algorithm is presented in detail and is based on the paper by +Preston Briggs. + This book has a reputation of containing many errors, although I assume the latest printings have the errors fixed. 
From 453211f2a50511b6415458726c3b18fbb47ad69a Mon Sep 17 00:00:00 2001 From: dibyendumajumdar Date: Sun, 12 Jan 2025 13:55:06 +0000 Subject: [PATCH 36/46] Write up on IR updated, and added link to K Cooper course --- site/source/intermediate-representations.rst | 49 ++++++++++++++++++-- site/source/learning-resources.rst | 4 ++ 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/site/source/intermediate-representations.rst b/site/source/intermediate-representations.rst index 9410080..6f94ea0 100644 --- a/site/source/intermediate-representations.rst +++ b/site/source/intermediate-representations.rst @@ -58,14 +58,55 @@ Disadvantages Examples -------- -* Example implementation on EeZee Programming Language +* Example implementation in EeZee Programming Language * Java Specifications -* Approaches for Analysis +* Web Assembly Specifications -Register Based IR -================= +Register Based IR or Three-Address IR +===================================== +This intermediate representation uses named slots called virtual registers in the Instruction when referencing +values. Lets look at the same example we saw above:: + func foo(n: Int)->Int { + return n+1; + } + +Produces:: + + L0: + %t1 = n+1 + ret %t1 + goto L1 + L1: + +The instructions above are as follows: + +* `%t1 = n+1` - is a typical three-address instruction of the form `result = value1 operator value2`. The name `%t1` + refers to a temporary, whereas `n` refers to the input argument `n`. +* `ret %t1` - is the return instruction, in this instance it references the temporary. +The virtual registers in the IR are so called because they do not map to real registers in the target physical machine. +Instead these are just named slots in the abstract machine responsible for executing the IR. Typically, the abstract machine +will assign each virtual register a unqiue location in its stack frame. 
So we still end up using the function's +stack frame, but the IR references locations within the stack frame via these virtual names, rather than implicitly +through push and pop instructions. +Advantages +---------- +* Each instruction has operands, hence representing the IR in serialized form takes more space. +* Readability: the flow of values is easier to trace, whereas with a stack IR you need to maintain a stack somewhere +* The IR can be executed easily by an Interpreter. +* Most optimization algorithms can be applied with this form of IR. + +Disadvantages +------------- +* Harder to generate the IR during compilation. We will look in detail one way of generating this IR. +* Serialized form can be larger. + +Examples +-------- +* Example implementation in EeZee Programming Language +* LLVM instruction set +* Android Dalvik IR diff --git a/site/source/learning-resources.rst b/site/source/learning-resources.rst index 3f00d3f..7cf7201 100644 --- a/site/source/learning-resources.rst +++ b/site/source/learning-resources.rst @@ -5,6 +5,10 @@ Learning Resources Courses ======= +COMP 512: Advanced Compiler Construction - Rice University, K. Cooper +--------------------------------------------------------------------- +* `COMP 512 Lectures _`. Nice bibliography of important papers related to optimization. 
+ CS 6120: Advanced Compilers: The Self-Guided Online Course ---------------------------------------------------------- From efb5697e19c070ea5786ef25ce1eda6ae68a8cb0 Mon Sep 17 00:00:00 2001 From: Dibyendu Majumdar Date: Mon, 20 Jan 2025 00:26:30 +0000 Subject: [PATCH 37/46] Update intermediate-representations.rst --- site/source/intermediate-representations.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/site/source/intermediate-representations.rst b/site/source/intermediate-representations.rst index 6f94ea0..f1a9ff0 100644 --- a/site/source/intermediate-representations.rst +++ b/site/source/intermediate-representations.rst @@ -88,21 +88,20 @@ The instructions above are as follows: The virtual registers in the IR are so called because they do not map to real registers in the target physical machine. Instead these are just named slots in the abstract machine responsible for executing the IR. Typically, the abstract machine -will assign each virtual register a unqiue location in its stack frame. So we still end up using the function's +will assign each virtual register a unique location in its stack frame. So we still end up using the function's stack frame, but the IR references locations within the stack frame via these virtual names, rather than implicitly through push and pop instructions. Advantages ---------- -* Each instruction has operands, hence representing the IR in serialized form takes more space. * Readability: the flow of values is easier to trace, whereas with a stack IR you need to maintain a stack somewhere * The IR can be executed easily by an Interpreter. * Most optimization algorithms can be applied with this form of IR. Disadvantages ------------- +* Each instruction has operands, hence representing the IR in serialized form takes more space. * Harder to generate the IR during compilation. We will look in detail one way of generating this IR. -* Serialized form can be larger. 
Examples -------- From 11e9423a8ce167e28ef57ea128bb8fa14c3a26c9 Mon Sep 17 00:00:00 2001 From: dibyendumajumdar Date: Tue, 1 Apr 2025 22:23:31 +0100 Subject: [PATCH 38/46] Add examples of control flow --- site/source/intermediate-representations.rst | 65 +++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/site/source/intermediate-representations.rst b/site/source/intermediate-representations.rst index f1a9ff0..a521838 100644 --- a/site/source/intermediate-representations.rst +++ b/site/source/intermediate-representations.rst @@ -45,11 +45,54 @@ runtime. Above for example we have the following instructions: So at the end of the program we are left with the sum of ``n+1`` on the stack, and this forms the return value of the function. +In this IR, control flow can be represented either using labels and branching instructions, or by grouping +instructions into basic blocks, and linking basic blocks through jump instructions. These two approaches are +are quite similar, you can think of a label as indicating the start of a basic block, and a jump as ending +a basic block. + +The idea is that inside a basic block, instructions are supposed to execute linearly one after the other. +Each basic block ends with a branching instruction, something like a goto or a conditional jump. + +Here is a simple example of input source code and the IR you might see:: + + func foo()->Int + { + return 1 == 1 && 2 == 2 + } + +This results in IR that may look like this:: + + L0: + pushi 1 + pushi 1 + eq + cbr L2 L3 + L2: + pushi 2 + pushi 2 + eq + jump L4 + L3: + pushi 0 + jump L4 + L4: + jump L1 + L1: + +Each basic block begins with a label, which is just the unique name of the block. + +* The ``jump`` instruction transfers control from a basic block to another. +* The ``cbr`` instruction is the conditional branch. 
It consumes the top most value from the stack, + and if this value is true, then control is transferred to the first block, else to the second block. +* The ``eq`` instruction compares two values on top of the stack, and replaces them with integer value + ``1`` or ``0``. + Advantages ---------- * The IR is compact to represent in stored form, hence many languages choose to encode their compiled code in this form. Examples are Java, C#, Web Assembly. * The IR can be executed easily by an Interpreter. +* Relatively easy to generate from an AST. Disadvantages ------------- @@ -92,11 +135,29 @@ will assign each virtual register a unique location in its stack frame. So we s stack frame, but the IR references locations within the stack frame via these virtual names, rather than implicitly through push and pop instructions. +Control flow is represented the same way as for the stack IR. Revisting the same example from above, we get following +IR:: + + L0: + %t0 = 1==1 + if %t0 goto L2 else goto L3 + L2: + %t0 = 2==2 + goto L4 + L3: + %t0 = 0 + goto L4 + L4: + ret %t0 + goto L1 + L1: + + Advantages ---------- * Readability: the flow of values is easier to trace, whereas with a stack IR you need to maintain a stack somewhere * The IR can be executed easily by an Interpreter. -* Most optimization algorithms can be applied with this form of IR. +* Most optimization algorithms can be applied to this form of IR. 
Disadvantages ------------- @@ -109,3 +170,5 @@ Examples * LLVM instruction set * Android Dalvik IR +Sea of Nodes IR +=============== From 3898c68455760f6a2342a8d970626aa63342a219 Mon Sep 17 00:00:00 2001 From: dibyendumajumdar Date: Wed, 2 Apr 2025 10:07:11 +0100 Subject: [PATCH 39/46] Updated book reviews --- site/source/compiler-books.rst | 52 ++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index f1da0a1..2aa304b 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -21,7 +21,8 @@ The 2nd edition adopts a more mathematical presentation style, whereas the earli algorithms using pseudo code. I think the 1986 edition is the best. The dragon books are a bit dated in that newer techniques such as Static Single Assignment or Graph -Coloring Register Allocation etc. are not covered in any detail. +Coloring Register Allocation etc. are not covered in any detail. I would even say that these books are not +useful if your goal is to work with SSA IR. For a different take on 2nd edition see `Review of the second addition of the "Dragon Book" `_. @@ -33,21 +34,33 @@ techniques conceptually, and some algorithms are described in detail using a for Defines an intermediate language called ILOC, but this does not have support for function calls. -In practice, I found it useful for implementing Dominator algorithm and SSA transformation. However, liveness -analysis does not cover SSA form, and exiting out of SSA is described conceptually but the algorithm is not -described in detail, so I had to consult the paper by Preston Briggs on which it is based. +In practice, I found this book helful when implementing Dominator algorithm and SSA transformation. However, it left out +important parts in its coverage of SSA which meant that the algorithms as described do not work. 
For instance: + +* The SSA construction algorithm inserts Phis in blocks where the original variable is dead (semi-pruned SSA). This then + causes the renaming phase to fail as there is no available definition of the variable. +* Liveness analysis does not cover SSA form and does not handle phis correctly. +* Exiting out of SSA is described conceptually but the algorithms are not + described in detail. + +In practice though it is easy to recommend Engineering a Compiler over the Dragon books. Both this and the Dragon books describe ahead of time compilers and cover topics that are suited for procedural languages such as C or traditional Pascal or Fortran. They cover both front-end and back-end techniques; however, on the front-end -side, interesting topics such as Object Orientation, Closures, Generics, Semantic analysis of more complex languages such as Java are not covered. +side, interesting topics such as Object Orientation, Closures, Generics, Semantic analysis of more complex languages +such as Java are not covered. Modern Compiler Implementation in C. Appel. 1998. (Tiger book) ============================================================== This book takes a hands on tutorial like approach to describing how to implement both the front-end and back-end of a compiler, using a toy language called Tiger as an example. Algorithms are described in pseudo code. -If I had to choose between the Dragon book, Engineering a compiler, and this book, I would pick this one and +If I had to choose from the Dragon book, Engineering a compiler, and this book, I would pick this one and Engineering a Compiler. +It covers a lot of techniques, and usually presents algorithms in pseudo code form. I consulted this book +when implementing SSA and SCCP, but the descriptions were not sufficiently comprehensive so that I had to +consult other material too. + This book covers functional languages, closures, as well as Object Oriented languages such as Java. Type inference is covered too. 
@@ -67,13 +80,16 @@ Building an Optimizing Compiler. Bob Morgan. 1998. ================================================== I have the kindle edition which is very poor and hard to read. I wish I had a paper copy. -This book is almost completely about the backend of the compiler. +This book is almost completely about the backend of the compiler. I consulted the description of SCCP and +based my implementation at least in part on the descriptions. In particular I found some discussion about how to +exploit local knowledge in conditional branches to handle null checks, which was useful, and not discussed in +other books. Advanced Compiler Design & Implementation. Muchnick. 1997. ========================================================== -I have the kindle edition, which is very poor and hard to read. +I have the kindle edition, which is very poor quality and hard to read. -This book is also mostly about the backend of a compiler, focusing on optimization. +This book is mostly about the backend of a compiler, focusing on optimization. My impression is that this book describes many algorithms in detail. But when I tried to implement one of the simpler algorithms (18.1 Unreachable Code Elimination) I found that the description left out a @@ -82,13 +98,14 @@ part (No_Path) of the algorithm. This book describes the idea of multiple levels of intermediate representation, HIR, MIR and LIR. I guess this has influenced many compiler implementations. -Its coverage of SSA is rudimentary - I guess it was written when SSA was still very new. +Its coverage of SSA is rudimentary - I guess it was written when SSA was still very new. Hence if you are +working with SSA IR then you will need to consult other material. The Graph Coloring register allocation algorithm is presented in detail and is based on the paper by Preston Briggs. This book has a reputation of containing many errors, although I assume the latest printings have the errors -fixed. +fixed. 
Despite its faults, it is a must have book if you want to learn about compiler construction. @@ -96,13 +113,24 @@ Retargetable C Compiler, A: Design and Implementation. Hanson & Fraser. 1995. ============================================================================= Describes a production C compiler. Detailed dsecription of the actual compiler code. -Weak on theoretical aspects, and limited by features of the compiler being described. +Weak on theoretical aspects, and limited by features of the compiler being described. The compiler +implementation is a single pass code generator, hence its optimizing capabilities are limited. +There is no coverage of data flow analysis or SSA as these weren't used by the implementation. + +In short this describes an old school C compiler that generates code fast, but lacks optimizations. Program Flow Analysis: Theory and Applications. Editors Muchnick, Jones. 1981. ============================================================================== Collection of essays on program analysis, by various authors. This is pre-SSA, hence a bit dated. +SSA-Based Compiler Design - various authors +=========================================== +An online version of this book is available `here `. +This book is a collection of articles on various topics related to SSA. As such it presents more +recent knowledge regarding SSA construction, optimizations based on SSA, and finally destruction and +register allocation. + Other Book Reviews ================== * `List of compiler books `_ From 578ee22cb1c6610ae2b6db9e5f1243bcfadfc75a Mon Sep 17 00:00:00 2001 From: dibyendumajumdar Date: Wed, 2 Apr 2025 10:28:48 +0100 Subject: [PATCH 40/46] Fix link --- site/source/compiler-books.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index 2aa304b..8eb7dc7 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -126,7 +126,7 @@ dated. 
SSA-Based Compiler Design - various authors =========================================== -An online version of this book is available `here `. +An online version of this book is available `here `_. This book is a collection of articles on various topics related to SSA. As such it presents more recent knowledge regarding SSA construction, optimizations based on SSA, and finally destruction and register allocation. From bb90893746edb43b1b30b8e5ace14a51d223831f Mon Sep 17 00:00:00 2001 From: dibyendumajumdar Date: Fri, 4 Apr 2025 21:19:23 +0100 Subject: [PATCH 41/46] More writeup --- site/source/compiler-books.rst | 15 ++-- site/source/intermediate-representations.rst | 78 ++++++++++++++------ 2 files changed, 61 insertions(+), 32 deletions(-) diff --git a/site/source/compiler-books.rst b/site/source/compiler-books.rst index 8eb7dc7..77fae24 100644 --- a/site/source/compiler-books.rst +++ b/site/source/compiler-books.rst @@ -30,18 +30,17 @@ Engineering a Compiler, 2nd Ed. Cooper & Torczon. 2012. ======================================================= This is a more modern version of the Dragon book. It is less focused on the lexical analysis / parsing phases, and covers the later phases of a compiler in more detail. Exposition is similar to the Dragon book, i.e. describes -techniques conceptually, and some algorithms are described in detail using a form of pseudo code. +techniques conceptually, and some algorithms are described in more detail using a form of pseudo code. -Defines an intermediate language called ILOC, but this does not have support for function calls. +Defines an intermediate language called ILOC, but this IR does not have support for function calls. -In practice, I found this book helful when implementing Dominator algorithm and SSA transformation. However, it left out +In practice, I found this book helpful when implementing the Dominator algorithm and SSA transformation. 
However, it left out important parts in its coverage of SSA which meant that the algorithms as described do not work. For instance: * The SSA construction algorithm inserts Phis in blocks where the original variable is dead (semi-pruned SSA). This then causes the renaming phase to fail as there is no available definition of the variable. * Liveness analysis does not cover SSA form and does not handle phis correctly. -* Exiting out of SSA is described conceptually but the algorithms are not - described in detail. +* Exiting out of SSA is described conceptually but the algorithms are not described in detail. In practice though it is easy to recommend Engineering a Compiler over the Dragon books. @@ -95,7 +94,7 @@ My impression is that this book describes many algorithms in detail. But when I simpler algorithms (18.1 Unreachable Code Elimination) I found that the description left out a part (No_Path) of the algorithm. -This book describes the idea of multiple levels of intermediate representation, HIR, MIR and LIR. +Introduces the idea of multiple levels of intermediate representation, HIR, MIR and LIR. I guess this has influenced many compiler implementations. Its coverage of SSA is rudimentary - I guess it was written when SSA was still very new. Hence if you are @@ -111,7 +110,7 @@ Despite its faults, it is a must have book if you want to learn about compiler c Retargetable C Compiler, A: Design and Implementation. Hanson & Fraser. 1995. ============================================================================= -Describes a production C compiler. Detailed dsecription of the actual compiler code. +Describes a production C compiler. Contains detailed walkthrough of the actual compiler code. Weak on theoretical aspects, and limited by features of the compiler being described. The compiler implementation is a single pass code generator, hence its optimizing capabilities are limited. 
@@ -129,7 +128,7 @@ SSA-Based Compiler Design - various authors An online version of this book is available `here `_. This book is a collection of articles on various topics related to SSA. As such it presents more recent knowledge regarding SSA construction, optimizations based on SSA, and finally destruction and -register allocation. +register allocation. I will have more to say about this book as I use it. Other Book Reviews ================== diff --git a/site/source/intermediate-representations.rst b/site/source/intermediate-representations.rst index a521838..12db133 100644 --- a/site/source/intermediate-representations.rst +++ b/site/source/intermediate-representations.rst @@ -35,10 +35,10 @@ Produces:: jump L1 L1: -The stack based IR is so called because many of the intructions in the IR push values to an evaluation stack at -runtime. Above for example we have the following instructions: +The stack based IR is so called because many of the instructions in the IR push and pop values to/from an evaluation stack at +runtime. Above for example, we have the following instructions: -* ``load 0`` - this refers to loading the value of the input parameter ``n`` to the stack. +* ``load 0`` - this pushes the value of the input parameter ``n`` to the stack. The ``0`` here identifies the location of the variable ``n``. * ``pushi 1`` - pushes the constant ``1`` to the stack. * ``addi`` - pops the two topmost values on the stack, and computes the sum and pushes this to the stack @@ -47,10 +47,10 @@ value of the function. In this IR, control flow can be represented either using labels and branching instructions, or by grouping instructions into basic blocks, and linking basic blocks through jump instructions. These two approaches are -are quite similar, you can think of a label as indicating the start of a basic block, and a jump as ending +equivalent, you can think of a label as indicating the start of a basic block, and a jump as ending a basic block. 
-The idea is that inside a basic block, instructions are supposed to execute linearly one after the other. +The idea is that inside a basic block, instructions executed linearly one after the other. Each basic block ends with a branching instruction, something like a goto or a conditional jump. Here is a simple example of input source code and the IR you might see:: @@ -84,31 +84,32 @@ Each basic block begins with a label, which is just the unique name of the block * The ``jump`` instruction transfers control from a basic block to another. * The ``cbr`` instruction is the conditional branch. It consumes the top most value from the stack, and if this value is true, then control is transferred to the first block, else to the second block. -* The ``eq`` instruction compares two values on top of the stack, and replaces them with integer value +* The ``eq`` instruction pops the topmost two values from the stack, and replaces them with integer value ``1`` or ``0``. Advantages ---------- -* The IR is compact to represent in stored form, hence many languages choose to encode their compiled code in +* The IR is compact to represent in stored form as most instructions do not take have operands. + This is a reason why many languages choose to encode their compiled code in this form. Examples are Java, C#, Web Assembly. * The IR can be executed easily by an Interpreter. -* Relatively easy to generate from an AST. +* Relatively easy to generate IR from an AST. Disadvantages ------------- -* Not easy to implement optimizations +* Not easy to implement optimizations. * Harder to analyze the IR, although there are methods available to do so. Examples -------- -* Example implementation in EeZee Programming Language -* Java Specifications -* Web Assembly Specifications +* `Example implementation in EeZee Programming Language `_. +* `Java Specifications `_. +* `Web Assembly Specifications `_. 
Register Based IR or Three-Address IR ===================================== -This intermediate representation uses named slots called virtual registers in the Instruction when referencing +This intermediate representation uses named slots called virtual registers in the instruction when referencing values. Lets look at the same example we saw above:: func foo(n: Int)->Int { @@ -125,17 +126,17 @@ Produces:: The instructions above are as follows: -* `%t1 = n+1` - is a typical three-address instruction of the form `result = value1 operator value2`. The name `%t1` - refers to a temporary, whereas `n` refers to the input argument `n`. -* `ret %t1` - is the return instruction, in this instance it references the temporary. +* ``%t1 = n+1`` - is a typical three-address instruction of the form ``result = value1 operator value2``. The name ``%t1`` + refers to a temporary, whereas ``n`` refers to the input argument ``n``. +* ``ret %t1`` - is the return instruction, in this instance it references the temporary. The virtual registers in the IR are so called because they do not map to real registers in the target physical machine. Instead these are just named slots in the abstract machine responsible for executing the IR. Typically, the abstract machine -will assign each virtual register a unique location in its stack frame. So we still end up using the function's +will assign each virtual register a unique location in its stack frame. So we still end up using the function's stack frame, but the IR references locations within the stack frame via these virtual names, rather than implicitly -through push and pop instructions. +through push and pop instructions. During optimization some of the virtual registers will end up in real hardware registers. -Control flow is represented the same way as for the stack IR. Revisting the same example from above, we get following +Control flow is represented the same way as for the stack IR. 
Revisiting the same source example from above, we get following IR:: L0: @@ -155,20 +156,49 @@ IR:: Advantages ---------- -* Readability: the flow of values is easier to trace, whereas with a stack IR you need to maintain a stack somewhere +* Readability: the flow of values is easier to trace, whereas with a stack IR you need to conceptualize a stack somewhere, + and track values being pushed and popped. * The IR can be executed easily by an Interpreter. * Most optimization algorithms can be applied to this form of IR. +* The IR can represent Static Single Assignment (SSA) in a natural way. Disadvantages ------------- * Each instruction has operands, hence representing the IR in serialized form takes more space. -* Harder to generate the IR during compilation. We will look in detail one way of generating this IR. +* Harder to generate the IR during compilation. Examples -------- -* Example implementation in EeZee Programming Language -* LLVM instruction set -* Android Dalvik IR +* `Example basic register IR in EeZee Programming Language `_. +* `Example register IR including SSA form and optimizations in EeZee Programming Language `_. +* `LLVM instruction set `_. +* `Android Dalvik IR `_. Sea of Nodes IR =============== +The final example we will look at is known as the Sea of Nodes IR. + +It is quite different from the IRs we described above. + +The key features of this IR are: + +* Instructions are NOT organized into Basic Blocks - instead, intructions form a graph, where + each instruction has as its inputs the definitions it uses. +* Instructions that produce data values are not directly bound to a Basic Block, instead they "float" around, + the order being defined purely in terms of the dependencies between the instructions. +* Control flow is also represented in the same way, and control flows between control flow + instructions. Dependencies between data instructions and control intructions occur at few well + defined places. 
+* The IR as described above cannot be readily executed, because to execute the IR, the instructions + must be scheduled, which you can think of a process by which the instructions are put into + a traditional register based IR such as one described earlier. + +Describing Sea of Nodes IR is quite involved. For now, I direct you to the `Simple project `_; this +is an ongoing effort to explain the Sea of Nodes IR representation and how to implement it. + +Beyond how the IR is represented, the main benefits of the Sea of Nodes IR are that: + +* It is an SSA IR +* Various optimizations such as peephole optimizations, value numbering and common subexpression elimination, + dead code elimination, occur as the IR is built. +* This makes the SoN IR suitable for quick optimizations, suitable for Just-In-Time (JIT) compilers. From dcc414298d047a7c9f13060c23d0b7e1c0828d0d Mon Sep 17 00:00:00 2001 From: dibyendumajumdar Date: Fri, 4 Apr 2025 22:55:39 +0100 Subject: [PATCH 42/46] Doc fixes --- site/source/index.rst | 8 +++++--- site/source/intermediate-representations.rst | 4 ++-- site/source/learning-resources.rst | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/site/source/index.rst b/site/source/index.rst index a4a6d43..a64f32d 100644 --- a/site/source/index.rst +++ b/site/source/index.rst @@ -61,9 +61,11 @@ Basic Front-End techniques Basic Back-end techniques ========================= -* Stack based vs register based Intermediate Representation -* Control flow graphs and Basic Blocks -* Bytecode VM with simple garbage collection +.. 
toctree:: + :maxdepth: 1 + :caption: Backend Basics + + intermediate-representations Basic Optimization techniques ============================= diff --git a/site/source/intermediate-representations.rst b/site/source/intermediate-representations.rst index 12db133..3698823 100644 --- a/site/source/intermediate-representations.rst +++ b/site/source/intermediate-representations.rst @@ -190,8 +190,8 @@ The key features of this IR are: instructions. Dependencies between data instructions and control intructions occur at few well defined places. * The IR as described above cannot be readily executed, because to execute the IR, the instructions - must be scheduled, which you can think of a process by which the instructions are put into - a traditional register based IR such as one described earlier. + must be scheduled; you can think of this as a process that puts the instructions into a traditional + Basic Block IR as described earlier. Describing Sea of Nodes IR is quite involved. For now, I direct you to the `Simple project `_; this is an ongoing effort to explain the Sea of Nodes IR representation and how to implement it. diff --git a/site/source/learning-resources.rst b/site/source/learning-resources.rst index 7cf7201..8f92714 100644 --- a/site/source/learning-resources.rst +++ b/site/source/learning-resources.rst @@ -7,7 +7,7 @@ Courses COMP 512: Advanced Compiler Construction - Rice University, K. Cooper --------------------------------------------------------------------- -* `COMP 512 Lectures _`. Nice bibliography of important papers related to optimization. +* `COMP 512 Lectures `_. Nice bibliography of important papers related to optimization. 
CS 6120: Advanced Compilers: The Self-Guided Online Course ---------------------------------------------------------- From e4d75cd06d25b3974c75a7a54e9a152802c3581b Mon Sep 17 00:00:00 2001 From: dibyendumajumdar Date: Tue, 8 Apr 2025 08:51:08 +0100 Subject: [PATCH 43/46] Some fixes --- site/source/intermediate-representations.rst | 26 +++++++++++--------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/site/source/intermediate-representations.rst b/site/source/intermediate-representations.rst index 3698823..45c1bec 100644 --- a/site/source/intermediate-representations.rst +++ b/site/source/intermediate-representations.rst @@ -50,7 +50,7 @@ instructions into basic blocks, and linking basic blocks through jump instructio equivalent, you can think of a label as indicating the start of a basic block, and a jump as ending a basic block. -The idea is that inside a basic block, instructions executed linearly one after the other. +The idea is that inside a basic block, instructions execute linearly one after the other. Each basic block ends with a branching instruction, something like a goto or a conditional jump. Here is a simple example of input source code and the IR you might see:: @@ -83,13 +83,14 @@ Each basic block begins with a label, which is just the unique name of the block * The ``jump`` instruction transfers control from a basic block to another. * The ``cbr`` instruction is the conditional branch. It consumes the top most value from the stack, - and if this value is true, then control is transferred to the first block, else to the second block. -* The ``eq`` instruction pops the topmost two values from the stack, and replaces them with integer value - ``1`` or ``0``. + and if this value is true (in this case, a non-zero value), then control is transferred + to the first block, else to the second block. +* The ``eq`` instruction pops the two topmost values from the stack, compares them and pushes a result: + ``1`` for true or ``0`` for false. 
Advantages ---------- -* The IR is compact to represent in stored form as most instructions do not take have operands. +* The IR is compact to represent in stored form as most instructions do not have operands. This is a reason why many languages choose to encode their compiled code in this form. Examples are Java, C#, Web Assembly. * The IR can be executed easily by an Interpreter. @@ -98,6 +99,8 @@ Advantages Disadvantages ------------- * Not easy to implement optimizations. +* For a reader it is hard to trace values as they flow through instructions, + as it requires tracking them through a conceptual stack. * Harder to analyze the IR, although there are methods available to do so. Examples @@ -127,13 +130,13 @@ Produces:: The instructions above are as follows: * ``%t1 = n+1`` - is a typical three-address instruction of the form ``result = value1 operator value2``. The name ``%t1`` - refers to a temporary, whereas ``n`` refers to the input argument ``n``. + refers to a temporary, whereas ``n`` refers to the input argument ``n``. Both of these names are virtual registers. * ``ret %t1`` - is the return instruction, in this instance it references the temporary. The virtual registers in the IR are so called because they do not map to real registers in the target physical machine. Instead these are just named slots in the abstract machine responsible for executing the IR. Typically, the abstract machine will assign each virtual register a unique location in its stack frame. So we still end up using the function's -stack frame, but the IR references locations within the stack frame via these virtual names, rather than implicitly +stack frame, but the IR references locations within the stack frame directly using these virtual names, rather than implicitly through push and pop instructions. During optimization some of the virtual registers will end up in real hardware registers. Control flow is represented the same way as for the stack IR. 
Revisiting the same source example from above, we get following @@ -157,7 +160,8 @@ IR:: Advantages ---------- * Readability: the flow of values is easier to trace, whereas with a stack IR you need to conceptualize a stack somewhere, - and track values being pushed and popped. + and track values being pushed and popped. +* Fewer instructions are needed compared to stack IR. * The IR can be executed easily by an Interpreter. * Most optimization algorithms can be applied to this form of IR. * The IR can represent Static Single Assignment (SSA) in a natural way. @@ -178,15 +182,15 @@ Sea of Nodes IR =============== The final example we will look at is known as the Sea of Nodes IR. -It is quite different from the IRs we described above. +This IR is quite different from the IRs we described above. The key features of this IR are: * Instructions are NOT organized into Basic Blocks - instead, intructions form a graph, where each instruction has as its inputs the definitions it uses. * Instructions that produce data values are not directly bound to a Basic Block, instead they "float" around, - the order being defined purely in terms of the dependencies between the instructions. -* Control flow is also represented in the same way, and control flows between control flow + the order being defined purely in terms of the dependencies between the instructions. +* Control flow is represented in a similar way, and control flows between control flow instructions. Dependencies between data instructions and control intructions occur at few well defined places. 
* The IR as described above cannot be readily executed, because to execute the IR, the instructions From 64eab4e1f2c499b4ebbcf8b2d9d05e00a6e73b68 Mon Sep 17 00:00:00 2001 From: dibyendumajumdar Date: Tue, 8 Apr 2025 09:05:32 +0100 Subject: [PATCH 44/46] More fixes --- site/source/intermediate-representations.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/site/source/intermediate-representations.rst b/site/source/intermediate-representations.rst index 45c1bec..0bbf344 100644 --- a/site/source/intermediate-representations.rst +++ b/site/source/intermediate-representations.rst @@ -204,5 +204,5 @@ Beyond how the IR is represented, the main benefits of the Sea of Nodes IR are t * It is an SSA IR * Various optimizations such as peephole optimizations, value numbering and common subexpressions elimination, - dead code elimitation, occur as the IR is built. -* This makes the SoN IR suitable for quick optimizations, suitable for Just-In-Time (JIT) compilers. + dead code elimination, occur as the IR is built. +* The SoN IR allows optimized code to be generated quickly, making it suitable for Just-In-Time (JIT) compilers. From c9b4d8b7a4bea9c6ef3f0a7959f5fab3a87cdc2f Mon Sep 17 00:00:00 2001 From: dibyendumajumdar Date: Sun, 13 Apr 2025 14:43:12 +0100 Subject: [PATCH 45/46] Add a writeup on EZ --- site/source/ez-lang.rst | 365 +++++++++++++++++++++++++++++++ site/source/index.rst | 1 + site/source/prelim-impl-lang.rst | 9 +- 3 files changed, 369 insertions(+), 6 deletions(-) create mode 100644 site/source/ez-lang.rst diff --git a/site/source/ez-lang.rst b/site/source/ez-lang.rst new file mode 100644 index 0000000..0817ecb --- /dev/null +++ b/site/source/ez-lang.rst @@ -0,0 +1,365 @@ +The EeZee Programming Language +============================== + +The EeZee programming language is a toy language with just enough features to allow +experimenting with various compiler techniques. + +The base language is intentionally very small. 
Eventually there will be extended versions +that allow functional and object-oriented paradigms. + +Language features +----------------- +* User-defined functions +* Integer type +* User-defined ``struct`` types +* One-dimensional arrays +* Basic control flow such as ``if`` and ``while`` statements + +Keywords +-------- +Following are keywords in the language:: + + func var int struct if else while break continue return + +Source Unit +----------- + +The EeZee language does not have the concept of modules or imports. Each source file must be +self-contained. + +There is no predefined ``main`` function in a source unit. The runtime should allow +any defined function to be invoked by supplying appropriate arguments. + +Types +----- + +The only primitive type in the language is the integer type ``Int``. +The size of this type is unspecified, the default implementation is 64-bit integers. + +There is not a distinct boolean type, non-zero integer values evaluate as true, and ``0`` evaluates as false. + +Users can define one-dimensional arrays and structs. + +Arrays and structs are implicitly reference types, i.e. instances of these types are +allocated on the heap. + +The language does not specify whether the heap is garbage collected or manually managed, it is +up to the implementation. + +A ``struct`` type is a named aggregate with one or more fields. Fields may of be of any supported +type. + +An array type is declared by enclosing the element type in brackets, i.e. ``[`` and ``]``. + +There is a ``Null`` type, with a predefined literal named ``null`` of this type. + +When declaring fields or variables of reference types, users may suffix the type name with ``?`` to +indicate a ``Nullable`` type. A ``Null`` is an implicit subtype of all ``Nullable`` types. + +Examples:: + + struct Tree { + var left: Tree? + var right: Tree? + } + struct Test { + var intArray: [Int] + } + struct TreeArray { + var array: [Tree?]? + } + +Struct types are nominal, i.e. 
each struct type is identified uniquely by its name. +Multiple definitions of struct types is not allowed. + +The language does not require forward declarations. + +Functions +--------- + +Users can declare functions, each function must have a unique name. + +Polymorphic functions are not supported. + +Functions can accept one or more arguments and may optionally return a result. + +The ``func`` keyword instroduces a function declaration. + +Examples:: + + func fib(n: Int)->Int { + var f1=1 + var f2=1 + var i=n + while( i>1 ){ + var temp = f1+f2 + f1=f2 + f2=temp + i=i-1 + } + return f2 + } + + func foo()->Int { + return fib(10) + } + +Variables and Fields +-------------------- + +The ``var`` keyword is used to introduce a new variable in the current lexical scope, +or to add a field to a struct. + +There are two forms of this: + +When introducing variables, you can supply an initializer; this removes the need to +specify a type. Examples:: + + var i = 1 + var j = foo() + +In this form the type of the variable is inferred from the initializer's type. + +The second form is more suited when declaring fields in a struct. In this form +a type is required - initializer cannot be set. + +Example:: + + struct T + { + var f: Int + var arry: [Int] + } + +Creating new instances of Arrays +-------------------------------- + +The ``new`` keyword is used to create array instances. + +It must be followed by an array type name, and optionally followed by an initializer. + +The array initializer must be a comma separated list of values, enclosed in ``{`` and ``}``. + +The array is sized based on the number of values in the initializer. + +Alternatively the array initializer may have a field named ``len`` that specifies the size of the +array, and a field named ``value`` to specify the value to use. + +Examples:: + + var arry = new [Int] {1,2,3} + var arry2 = new [Int] {len=10, value=0} + +The second example creates an array with 10 elements and sets the initial value to 0. 
+ +Creating new instances of structs +--------------------------------- + +The ``new`` keyword is used to create struct instances. + +It must be followed by the struct type name, and optionally followed by an initializer. + +The struct initializer must be a comma-separated list of field initializers, enclosed in ``{`` and ``}``. + +A field initializer has the form of a name followed by ``=`` followed by an expression. + +Examples:: + + var stats = new Stats { age=10, height=100 } + + +Control Flow +------------ + +The language is lexically scoped, and block structured. + +A block is enclosed in ``{`` and ``}`` and introduces a lexical scope. + +The ``if`` statement allows branching based on a condition. The condition must be an +integer expression; a value of ``0`` is false, any other value is ``true``. + +The ``if`` statement can have an optional ``else`` branch. + +The only looping construct is the ``while`` statement; this executes the sub-statement +as long as the supplied condition evaluates to a non-zero value. + +The ``break`` statement exits a loop. + +The ``continue`` statement branches to the beginning of the loop. + +The ``return`` statement takes an expression if the function is meant to return a value. +It causes the currently executing function to terminate. 
+ +Expressions +----------- + +Following table describes the available operators by their precedence (low to high): + ++------------+-----------------+----------+ +| Operator | Meaning | Type | +| | | | ++============+=================+==========+ +| ``||`` | logical or | Binary | ++------------+-----------------+----------+ +| ``&&`` | logical and | Binary | ++------------+-----------------+----------+ +| ``==`` | relational | Binary | +| ``!=`` | | | +| ``<`` | | | +| ``<=`` | | | +| ``>`` | | | +| ``>=`` | | | ++------------+-----------------+----------+ +| ``+`` | addition | Binary | +| ``-`` | | | ++------------+-----------------+----------+ +| ``*`` | multiplication | Binary | +| ``/`` | | | ++------------+-----------------+----------+ +| ``-`` | negate | Unary | +| ``!`` | | | ++------------+-----------------+----------+ +| ``(...)``, | function call, | Postfix | +| ``[]``, | array index, | | +| ``.`` ID | field access | | ++------------+-----------------+----------+ + + + +Grammar +------- + +The following grammar describes the language syntax:: + + program + : declaration+ EOF + ; + + declaration + : structDeclaration + | functionDeclaration + ; + + structDeclaration + : 'struct' IDENTIFIER '{' fields '}' + ; + + fields + : varDeclaration+ + ; + + varDeclaration + : 'var' IDENTIFIER ':' typeName ';'? + ; + + typeName + : simpleType + | arrayType + ; + + simpleType + : IDENTIFIER ('?')? + ; + + arrayType + : '[' simpleType ']' ('?')? + ; + + functionDeclaration + : 'func' IDENTIFIER '(' parameters? ')' ('->' typeName)? block + ; + + parameters + : parameter (',' parameter)* + ; + + parameter + : IDENTIFIER ':' typeName + ; + + block + : '{' statement* '}' + ; + + statement + : 'if' '(' expression ')' statement + | 'if' '(' expression ')' statement 'else' statement + | 'while' '(' expression ')' statement + | postfixExpression '=' expression ';'? + | block + | 'break' ';'? + | 'continue' ';'? + | varDeclaration + | 'var' IDENTIFIER '=' expression ';'? 
+ | 'return' orExpression? ';'? + | expression ';'? + ; + + expression + : orExpression + ; + + orExpression + : andExpression ('||' andExpression)* + ; + + andExpression + : relationalExpression ('&&' relationalExpression)* + ; + + relationalExpression + : additionExpression (('==' | '!='| '>'| '<'| '>='| '<=') additionExpression)* + ; + + additionExpression + : multiplicationExpression (('+' | '-') multiplicationExpression)* + ; + + multiplicationExpression + : unaryExpression (('*' | '/' ) unaryExpression)* + ; + + unaryExpression + : ('-' | '!') unaryExpression + | postfixExpression + ; + + postfixExpression + : primaryExpression (indexExpression | callExpression | fieldExpression)* + ; + + indexExpression + : '[' orExpression ']' + ; + + callExpression + : '(' arguments? ')' + ; + + arguments + : orExpression (',' orExpression)* + ; + + fieldExpression + : '.' IDENTIFIER + ; + + primaryExpression + : INTEGER_LITERAL + | IDENTIFIER + | '(' orExpression ')' + | 'new' typeName initExpression + ; + + initExpression + : '{' initializers? '}' + ; + + initializers + : initializer (',' initializer)* + ; + + initializer + : (IDENTIFIER '=')? orExpression + ; + diff --git a/site/source/index.rst b/site/source/index.rst index a64f32d..be92cb2 100644 --- a/site/source/index.rst +++ b/site/source/index.rst @@ -44,6 +44,7 @@ Preliminaries :caption: Preliminaries prelim-impl-lang + ez-lang Basic Front-End techniques ========================== diff --git a/site/source/prelim-impl-lang.rst b/site/source/prelim-impl-lang.rst index ee6c764..eef121c 100644 --- a/site/source/prelim-impl-lang.rst +++ b/site/source/prelim-impl-lang.rst @@ -1,5 +1,5 @@ -Implementation Language -======================= +Compiler Implementation Language +================================ A compiler can be implemented in any language we choose. 
For a pedagogical project it is more convenient to choose a language that is widely used, has garbage collection, and comes with excellent tools such @@ -17,10 +17,7 @@ from a technical standpoint, that is. It is a garbage collection language that h work with. The main negatives are that it is not a popular language, and the tooling is not up to the standards of other languages. -Go would be a good candidate except that its an opinionated language that forces a certain programming model, -whereas we would like a language that offers least resistance. - -Java, Kotlin, Swift and C# seem like good candidates. Java has some limitations that make it harder to write memory optimized +Go, Java, Kotlin, Swift and C# seem like good candidates. Java has some limitations that make it harder to write memory optimized code that is often necessary in a production compiler, but we don't care so much about that. I decided to use Java because it is the language I am most familar with, has great tooling, and despite some From 0c696465d5039709d5d46903a9849103015e6fc1 Mon Sep 17 00:00:00 2001 From: dibyendumajumdar Date: Mon, 14 Apr 2025 11:46:52 +0100 Subject: [PATCH 46/46] Fix typos and clarify some detail --- site/source/ez-lang.rst | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/site/source/ez-lang.rst b/site/source/ez-lang.rst index 0817ecb..21edbfc 100644 --- a/site/source/ez-lang.rst +++ b/site/source/ez-lang.rst @@ -19,7 +19,7 @@ Keywords -------- Following are keywords in the language:: - func var int struct if else while break continue return + func var int struct if else while break continue return null Source Unit ----------- @@ -36,7 +36,7 @@ Types The only primitive type in the language is the integer type ``Int``. The size of this type is unspecified, the default implementation is 64-bit integers. -There is not a distinct boolean type, non-zero integer values evaluate as true, and ``0`` evaluates as false. 
+There is no distinct boolean type; non-zero integer values evaluate as true, and zero evaluates as false. Users can define one-dimensional arrays and structs. @@ -47,7 +47,8 @@ The language does not specify whether the heap is garbage collected or manually up to the implementation. A ``struct`` type is a named aggregate with one or more fields. Fields may of be of any supported -type. +type. Struct types are nominal, i.e. each struct type is identified uniquely by its name. +Multiple definitions of a struct type are not allowed. An array type is declared by enclosing the element type in brackets, i.e. ``[`` and ``]``. @@ -69,9 +70,6 @@ Examples:: var array: [Tree?]? } -Struct types are nominal, i.e. each struct type is identified uniquely by its name. -Multiple definitions of struct types is not allowed. - The language does not require forward declarations. Functions @@ -79,11 +77,11 @@ Functions Users can declare functions, each function must have a unique name. -Polymorphic functions are not supported. +Functions cannot be overloaded. Functions are not closures. Functions can accept one or more arguments and may optionally return a result. -The ``func`` keyword instroduces a function declaration. +The ``func`` keyword introduces a function declaration. Examples:: @@ -104,6 +102,11 @@ Examples:: return fib(10) } +Literals +-------- + +The only literals are integer values and ``null``. + Variables and Fields -------------------- @@ -171,12 +174,12 @@ Examples:: Control Flow ------------ -The language is lexically scoped, and block structured. +The language is block structured. A block is enclosed in ``{`` and ``}`` and introduces a lexical scope. The ``if`` statement allows branching based on a condition. The condition must be an -integer expression; a value of ``0`` is false, any other value is ``true``. +integer expression; a value of zero is ``false``, any other value is ``true``. The ``if`` statement can have an optional ``else`` branch. 
@@ -253,16 +256,17 @@ The following grammar describes the language syntax:: ; typeName - : simpleType + : nominalType | arrayType ; - simpleType - : IDENTIFIER ('?')? + nominalType + : 'Int' + | IDENTIFIER ('?')? ; arrayType - : '[' simpleType ']' ('?')? + : '[' nominalType ']' ('?')? ; functionDeclaration