From 6a494ff7873af857b94ae733c3faa36d406c3817 Mon Sep 17 00:00:00 2001 From: Balazs Komuves Date: Mon, 6 Oct 2025 00:57:05 +0200 Subject: [PATCH] initial import --- .gitignore | 2 + LICENSE-APACHEv2 | 177 ++++++++++++ LICENSE-MIT | 18 ++ README.md | 17 ++ docs/FRI_details.md | 176 ++++++++++++ docs/Overview.md | 303 ++++++++++++++++++++ docs/Protocol.md | 9 + docs/Security.md | 50 ++++ reference/.gitignore | 1 + reference/LICENSE | 7 + reference/README.md | 24 ++ reference/src/FRI.hs | 21 ++ reference/src/FRI/LDE.hs | 55 ++++ reference/src/FRI/Matrix.hs | 56 ++++ reference/src/FRI/Prover.hs | 241 ++++++++++++++++ reference/src/FRI/Shared.hs | 78 +++++ reference/src/FRI/Types.hs | 166 +++++++++++ reference/src/FRI/Verifier.hs | 68 +++++ reference/src/Field/Encode.hs | 36 +++ reference/src/Field/Goldilocks.hs | 8 + reference/src/Field/Goldilocks/Extension.hs | 144 ++++++++++ reference/src/Field/Goldilocks/Slow.hs | 119 ++++++++ reference/src/Hash.hs | 11 + reference/src/Hash/Common.hs | 94 ++++++ reference/src/Hash/Duplex/Monad.hs | 103 +++++++ reference/src/Hash/Duplex/Pure.hs | 114 ++++++++ reference/src/Hash/Merkle.hs | 298 +++++++++++++++++++ reference/src/Hash/Monolith/Constants.hs | 83 ++++++ reference/src/Hash/Monolith/Permutation.hs | 64 +++++ reference/src/Hash/Permutations.hs | 16 ++ reference/src/Hash/Sponge.hs | 119 ++++++++ reference/src/Misc.hs | 177 ++++++++++++ reference/src/NTT.hs | 15 + reference/src/NTT/Poly.hs | 221 ++++++++++++++ reference/src/NTT/Slow.hs | 186 ++++++++++++ reference/src/NTT/Subgroup.hs | 90 ++++++ reference/src/testMain.hs | 95 ++++++ 37 files changed, 3462 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE-APACHEv2 create mode 100644 LICENSE-MIT create mode 100644 README.md create mode 100644 docs/FRI_details.md create mode 100644 docs/Overview.md create mode 100644 docs/Protocol.md create mode 100644 docs/Security.md create mode 100644 reference/.gitignore create mode 100644 reference/LICENSE create mode 100644 reference/README.md create mode 100644 reference/src/FRI.hs create mode 100644 reference/src/FRI/LDE.hs create mode 100644 reference/src/FRI/Matrix.hs create mode 100644 reference/src/FRI/Prover.hs create mode 100644 reference/src/FRI/Shared.hs create mode 100644 reference/src/FRI/Types.hs create mode 100644 reference/src/FRI/Verifier.hs create mode 100644 reference/src/Field/Encode.hs create mode 100644 reference/src/Field/Goldilocks.hs create mode 100644 reference/src/Field/Goldilocks/Extension.hs create mode 100644 reference/src/Field/Goldilocks/Slow.hs create mode 100644 reference/src/Hash.hs create mode 100644 reference/src/Hash/Common.hs create mode 100644 reference/src/Hash/Duplex/Monad.hs create mode 100644 reference/src/Hash/Duplex/Pure.hs create mode 100644 reference/src/Hash/Merkle.hs create mode 100644 reference/src/Hash/Monolith/Constants.hs create mode 100644 reference/src/Hash/Monolith/Permutation.hs create mode 100644 reference/src/Hash/Permutations.hs create mode 100644 reference/src/Hash/Sponge.hs create mode 100644 reference/src/Misc.hs create mode 100644 reference/src/NTT.hs create mode 100644 reference/src/NTT/Poly.hs create mode 100644 reference/src/NTT/Slow.hs create mode 100644 reference/src/NTT/Subgroup.hs create mode 100644 reference/src/testMain.hs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d4eb940 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.DS_Store +tmp diff --git a/LICENSE-APACHEv2 b/LICENSE-APACHEv2 new file mode 100644 index 0000000..f433b1a --- /dev/null +++ 
b/LICENSE-APACHEv2 @@ -0,0 +1,177 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000..6bf41db --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,18 @@ + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..f731b93 --- /dev/null +++ b/README.md @@ -0,0 +1,17 @@ +Trustless outsourcing of Reed-Solomon encoding of datasets +---------------------------------------------------------- + +This is intended to be an implementation of the following idea: + +A client owning a dataset (authenticated by a Merkle commitment) wants +to Reed-Solomon encode that data into an extended one (that is, extend it with parity data); +but instead of doing it themselves, they want an untrusted server to do it. + +The server should respond with a Merkle commitment of the encoded data, +and a proof that the encoding was done correctly and corresponds to (contains) +the original dataset. + +The core idea is to use the FRI (or similar) low-degree test as a proof of +correct Reed-Solomon encoding. + +See [docs/Overview.md](docs/Overview.md) for more details. diff --git a/docs/FRI_details.md b/docs/FRI_details.md new file mode 100644 index 0000000..f4aba9f --- /dev/null +++ b/docs/FRI_details.md @@ -0,0 +1,176 @@ +FRI row indexing and FFT details +-------------------------------- + +The motivation here was to try and work out how to index the rows in a Plonky2-style FRI protocol. More generally, other details like the FFT folding verifier are also worked out. + +### Evaluation domains + +Plonky2 uses cosets for evaluation domains (instead of subgroups). This is presumably the "out of domain sampling" idea (DEEP-FRI?). + +$$ \mathcal{D}_0 \;:=\; \mathbf{g}\cdot \langle \omega \rangle \;=\; \big\{\, \mathbf{g}\cdot\omega^i\;:\; 0\le i < N\cdot 2^r \,\big\}$$ + +where $\mathbf{g}\in\mathbb{F}^\times$ is a generator of $\mathbb{F}^\times$, and $\omega\in\mathbb{F}^\times$ is an $N\cdot 2^r=2^{(n+r)}$-th root of unity, where $n=\log_2(N)$ is the logarithm of the size $N$ of the original data, and $r=-\log_2(\rho)$ is the number of "rate bits". + +Then if we fold by a factor of $K=2^\kappa$, the next domain will be + +$$ \mathcal{D}_1 \;:=\; (\mathcal{D}_0)^K = \mathbf{g}^K \cdot \langle\omega^K \rangle \;=\; \big\{\, \mathbf{g}^K\cdot\omega^{Ki}\;:\; 0\le i < (N/K)\cdot 2^r \,\big\}$$ + +And so on.
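To illustrate, here is a minimal standalone Haskell sketch of this domain folding (a sketch only; the type and names are illustrative, not the reference implementation's API):

```haskell
-- A multiplicative coset  g * <omega>  of size 2^logSize.
data Coset f = MkCoset
  { cosetShift :: f     -- the coset shift g
  , cosetGen   :: f     -- a root of unity omega generating the subgroup
  , cosetLog2  :: Int   -- log2 of the coset size
  }

-- Folding by a factor K = 2^kappa maps the domain D to
-- D^K = g^K * <omega^K>, which is K times smaller.
foldDomain :: Num f => Int -> Coset f -> Coset f
foldDomain kappa (MkCoset g omega logSize) =
  MkCoset (g ^ bigK) (omega ^ bigK) (logSize - kappa)
  where
    bigK = 2 ^ kappa :: Integer
```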
Of course we don't have to use the same $K$ at every folding step (this is called "reduction strategies" in Plonky2). + +### Basic FRI verification workflow + +Given a query index $\mathsf{idx}$, we + +- first check the opened row against the Merkle root of the LDE (low-degree extended) matrix with a Merkle proof ("initial tree proof" in Plonky2 lingo) +- combine the opened row with $\alpha\in\widetilde{\mathbb{F}}$ to get the "upstream value" +- then repeatedly fold: + - open the "folding coset" from the commit phase Merkle tree (checking the corresponding Merkle proof) + - check the upstream value against the corresponding element in the coset + - calculate a small FFT on this coset (see below for details) + - combine the transformed values with the step's $\beta\in\widetilde{\mathbb{F}}$ (a new $\beta$ is sampled for each folding step, during the commit phase) + - this will be the next step's "upstream value" + - loop until the folded degree becomes small enough +- check the final polynomial (sent as coefficients) against the last upstream value by evaluating it at the given location. + +### Discrete Fourier Transform (DFT) + +Given a subgroup $H=\langle \omega \rangle\subset \mathbb{F}^\times$ of size $|H|=K$, the DFT and its inverse convert between the coefficient form and the evaluation form of a polynomial $f(x)\in\mathbb{F}^{<K}[x]$: + +$$ \hat{f}_i \;=\; f(\omega^i) \;=\; \sum_{j=0}^{K-1} f_j\,\omega^{ij}, \qquad f_j \;=\; \frac{1}{K}\sum_{i=0}^{K-1} \hat{f}_i\,\omega^{-ij} $$ + +Remark: Using more than one rate bit, $r>1$ (so $R>2$ or $\rho<1/2$), in the FRI protocol may make sense even if we only keep a smaller amount of the parity data at the end (to limit the storage overhead), as it may improve soundness or decrease proof size (while also making the proving time longer). + +#### Commit phase vectors ordering + +Here we open cosets of size corresponding to the folding step arity $K$ (which can be different in the different folding steps). + +So such cosets should be the leaves of the tree. If the size of the vector is $M$, then the cosets (in natural ordering) are indexed as: + +$$ \mathcal{C}_i = \big\{\, i ,\, i+M/K ,\, i + 2(M/K) ,\, \dots ,\, i + (K-1)(M/K) \,\big\} $$ + +for $0\le i < M/K$. Thus the Merkle tree should have $M/K$ leaves, with the $i$-th leaf being $\mathcal{C}_i$. + +Of course we could permute the order of the leaves as we want, but this seems to be the most natural order. + +Remark: Which coset we want to open is determined by the "upstream index" $0\le \mathsf{idx} < M$. To get the corresponding coset index we can simply compute +$i:= \mathsf{idx}\;\textrm{mod}\;M/K$. However, to match the above FFT computation, we also have to cyclically permute the coset, so that it starts at $\mathsf{idx}$. In practice this means rotating the opened coset by $\lfloor \mathsf{idx}/(M/K)\rfloor$. + +Alternatively, instead of rotating the coset, we can calculate which element of the coset should match the upstream value. This is maybe a little bit more intuitive. + +### Unique decoding, list decoding, etc + +TODO; also see the security document diff --git a/docs/Overview.md b/docs/Overview.md new file mode 100644 index 0000000..8406536 --- /dev/null +++ b/docs/Overview.md @@ -0,0 +1,303 @@ +Outsourcing local erasure coding +-------------------------------- + +The purpose of local erasure coding (we used to call this "slot-level EC" in old Codex) is to increase the strength of storage proofs based on random sampling. + +The core principle behind this idea is the distance amplification property of Reed-Solomon codes.
+ +The concept is simple: If we encode $K$ data symbols into $N$ code symbols, then for the data to be irrecoverably lost, you need to lose at least $N-K+1$ symbols (it's a bit more complicated with data corruption, but let's ignore that for now). In a typical case of $N=2K$, this means that checking just one randomly chosen symbol gives you approximately $p\approx 1/2$ probability of detecting data loss. + +### Outsourcing to the provider + +In "old Codex", this encoding (together with the network-level erasure coding) was done by the client before uploading. + +However, it would be preferable to outsource the local encoding to the providers, for several reasons: + +- the providers typically have more computational resources than the clients (especially if the client is for example a mobile phone) +- because the network chunks are held by different providers, the work could be distributed among several providers, further decreasing the per-provider work +- if it's the provider who does it, it can be postponed until enough data (possibly many small pieces from many different clients) is accumulated to make the resulting data unit size economical + +However, in general we don't want to trust the provider(s), but instead verify that they did it correctly. + +We need to verify 3 things: + +- the original data (authenticated by a Merkle root or similar commitment) is kept intact; +- the encoding is done correctly; +- the new Merkle root corresponds to the encoded data. + +Fortunately, there is a way to do all this. + +### FRI low-degree check + +The FRI protocol (short for "Fast Reed-Solomon Interactive Oracle Proofs of Proximity") is an interactive protocol in which the prover convinces the verifier that a Merkle-committed vector of finite field elements is _close to_ a Reed-Solomon codeword (with rate $\rho=2^{-r}$). + +Note that obviously we cannot do better than "close to" without checking every single element of the vector (which again obviously wouldn't be a useful approach), so "close to" must be an acceptable compromise. + +However, in the ideal situation, if the precise distance bound $\delta$ of the "close to" concept is small enough, then there is exactly 1 codeword within that radius ("unique decoding regime"). In that situation errors in the codeword can be corrected simply by replacing it with the closest codeword. A somewhat more complicated situation is the so-called "list decoding regime". + +As usual we can make this non-interactive via the Fiat-Shamir heuristic. + +This gives us a relatively simple plan of attack: + +- the client (uploader) calculates the Merkle root of the original data (assumed to have size $K=2^k$) +- the provider calculates the parity data (same amount, so the encoding has rate $\rho=1/2$), and its Merkle root +- the Merkle tree of the codeword (the concatenation of the original data and the parity data) is built by attaching these two subtrees to a single root node. That is, the Merkle root of the codeword is the hash of the pair of roots of the original data and of the parity data +- the provider executes the FRI protocol to prove that the codeword is in fact (close to) a Reed-Solomon codeword +- the provider also distributes the Merkle root of the parity data together with the FRI proof.
This is the proof (a singleton Merkle path) connecting the original data Merkle root and the codeword Merkle root (storage proofs will be validated against the latter) +- the metadata is updated: the new content ID will be the Merkle root of the codeword, against which storage proofs will be required in the future. Of course one will also need a mapping from the original content ID(s) to the new locations (root + pointer(s) inside the RS encoded data) + +Remark: If the 2x storage overhead is too big, after executing the protocol, we may try to trim some of the parity (say 50% of it). You can probably still prove some new Merkle root with a little bit of care, but non-power-of-two sizes make everything more complicated. + +Remark #2: There are also improved versions of the FRI protocol like STIR and WHIR. I believe those can be used in the same way. But as FRI is already rather complicated, for now let's concentrate on that. + +### Batching + +FRI is a relatively expensive protocol. I expect this proposal to work well for say up to 1 GB data sizes, and acceptably up to 10 GB of data. But directly executing FRI on such data sizes would presumably be very expensive. + +Fortunately, FRI can be batched, in a very simple way: Suppose you want to prove that $M$ vectors $v_i\in \mathbb{F}^N$ are all codewords. To do that, just consider a random linear combination (recall that Reed-Solomon is a linear code) + +$$ V := \sum_{i=1}^M \alpha^i v_i $$ + +with a randomly chosen $0\neq\alpha\in\mathbb{F}$ (chosen by the verifier or via Fiat-Shamir). Intuitively, it's very unlikely that any of the $v_i$ is _not a codeword_ while $V$ is (this can be quantified precisely). So it's enough to run the FRI on the combined vector $V$. + +Note: If the field is not big enough, you may need to either repeat this with several different $\alpha$-s, or consider a field extension. This is the case for example with the Goldilocks field, which has size $|\mathbb{F}|\approx 2^{64}$. Plonky2 for example chooses $\alpha\in\widetilde{\mathbb{F}}$ from a degree two field extension $\widetilde{\mathbb{F}}$ (so approx. 128 bits), which is big enough for all practical purposes. FRI is then executed in that bigger field. + +This approach has another nice consequence: Now instead of doing one big RS encoding, we have to do many smaller ones. This is good, because: + +- that's always faster (because of the $O(N\log(N))$ scaling of FFT) +- it needs much less memory +- it can be trivially parallelized + +This "wide" approach is basically the same as the one used in Plonky2 and Plonky3. + +### Field and hash choice + +We need to choose a prime field (but see below) for the Reed-Solomon encoding, and one (or more) hash functions to construct the Merkle tree. + +Just for executing the FRI protocol, the hash function could be any (cryptographic) hash, and we could even use different hash functions for constructing the row hashes and the Merkle tree. However, if in the future we want to do recursive proof aggregation, then since in that situation the Merkle path proofs need to be calculated inside ZK too, it's better to choose a ZK-friendly hash. + +With these in mind, a reasonable choice seems to be the Goldilocks field ($p=2^{64}-2^{32}+1$) and the Monolith hash function (which is one of the fastest ZK-friendly hashes). This way the Reed-Solomon encoding and the hash function use a compatible structure. + +Remark: While in principle both FFT and FRI should work with a binary field instead of a prime field (see eg.
FRI-Binius), I'm not at all familiar with those variations, so let's leave that for future work. Also, if we want to do recursive proof aggregation, again using prime fields for such proof systems is more common (but again, in principle that should be possible too with a binary field). + +### Data layout + +So the basic idea is to organize the data into a $2^n\times M$ matrix of field elements. + +Then extend each column via a rate $\rho=1/2$ Reed-Solomon code to a matrix of size $2^{n+1}\times M$, so that the top half is the original data, and the bottom half is parity. + +Then hash each row, and build a Merkle tree on top of the row hashes (this is the structure we need for the batched FRI argument). + +#### Row hashes + +However, we have a lot of freedom in how to construct the row hashes. The simplest is of course just a linear (sequential) hash. This is efficient (a linear sponge hash with $t=12$ should be faster than building a Merkle tree, as you can consume 8 field elements with a single permutation call, instead of an average of 4 with a binary Merkle tree); however a disadvantage is that a Merkle path proof needs to include a whole row, which can potentially be a large amount of data if $M$ is big. + +The other end of the spectrum is to use a Merkle tree over each row too. Then the Merkle path is really only a Merkle path. However, to make this reasonably simple, we need $M$ to be a power of two. + +We can also go anywhere in between: Split the row into say 8, 16 or 32 chunks; hash those individually; and build a tiny Merkle tree (of depth 3, 4 or 5, respectively) on top of them. The Merkle root of this small tree will be the row hash. This looks like a nice compromise with the good properties of both extreme cases, while also keeping the Merkle trees complete binary trees (that is, with a power-of-two number of leaves). + +A problem though with this approach is that a single Merkle path doesn't really "prove" the full row, except when you also include the full row data, which can be much bigger than the Merkle path itself... + +So maybe actually including full rows is the right approach, and we just have to accept larger proofs (eg. with 2048 byte rows and a 16GB dataset, we have a Merkle tree of depth 23; that is, a Merkle proof is 736 bytes, which together with the 2048 bytes of row data gives 2784 bytes of proof per sample). + +We also have the freedom to reorder the rows (except keeping the original data in the top half and the parity in the bottom; this is needed to be able to connect the Merkle root of the original data with the Merkle root of the encoded data). + +Such reorderings can help make the FRI protocol more efficient. + +#### Choice of the width + +We may need to constrain the width $M$ too. This is unfortunate, because the height is already constrained to be a power of two, so we may end up with too many constraints (requiring too much padding and thus making the system less efficient). + +One reason to constrain the width is if we want our Merkle tree to be compatible with the network block structure. Recall that from the network point of view, the datasets are organized in blocks of uniform size (currently 64kb blocks), with a SHA256 Merkle tree on top of them. + +Though I'm not sure at the moment whether this compatibility is required at all. + +But if we want this, then we want a row, or a small (power of two) number of rows to contain exactly a single network block's data.
+ +Recall that with the Goldilocks field we can pack either 7 bytes into a single field element, or somewhat better, 31 bytes into 4 field elements. + +So one possible arrangement could be for example to have rows of size $M=268$, then each row can fit $67\times 4$ field elements, that's $67\times 31 = 2077 > 2048$ bytes. Then 32 rows can hold 64kb. + +Another (wider) version could be $265\times 4$ field elements per row, fitting $265\times 31 = 8215 > 8192$ bytes, so that 8 rows hold 64kb. But see above about including a full row in the proof. + + +### On-disk layout + +Another subtle issue is how to order this data on the disk. Unfortunately spinning disks are very slow (150-300 MB/sec sequential at best, and really slow seeking: typically well below 1000 seeks per second). + +What are our typical access patterns? + +- to do the FFT encoding, we need to access all the columns, independently (and then they will be processed in parallel) +- to do the query phase of the FRI protocol, we need to access some randomly selected rows (maybe 50--100 of them) +- when doing the randomly sampled "storage proofs" (which is supposed to happen periodically!), again we need to access random rows + +Accessing rows efficiently and accessing columns efficiently are pretty much contradictory requirements... + +Maybe a compromise could be something like a "cache-oblivious" layout on disk: for example, partition the matrix into medium-sized squares, so that both row and column access are somewhat painful, but neither of them too painful. + +On the other hand, ideally the encoding and the FRI proof are done only once, while storage proofs are done periodically, so maybe row access should have priority? It's a bit hard to estimate the cost-benefit profile of this; it also depends on the average lifetime. We have a rather heavy one-time cost, and a rather light but periodically occurring cost... + +Btw, the hard drive _sequential_ speed limit means that unless the data is on an SSD or some other fast medium, the bottleneck will always be the spinning drive. For example, while the Goldilocks FFT has speeds similar to sequential hard drive reads/writes, it can be trivially parallelized (as we do many independent FFTs, one for each column). + +### Bundling of files + +The idea was that when we have to deal with lots of small files, we can start collecting and merging them, until this bundle reaches an economical size (we don't want too many proofs circulating in the network). + +When a good size is reached (or too much time has passed), we can then do the erasure coding, kind of "sealing" the bundle (though in principle we could collect even more and redo the encoding the same way). + +#### Discrepancy of lifetimes + +If we have a lot of small files (from potentially many users), they most probably have different expected lifetimes. + +However, I expect that typical small files have similar lifetimes (probably something on the order of 1 month). And keeping a file for a little bit longer shouldn't be a big issue. + +Furthermore it's the provider who chooses what to bundle together, so they can select for similar expiries (maybe building up several bundles at the same time). + +So I don't see this as a serious issue. + +#### Discrepancy of file sizes + +Files can also have different sizes, and non-power-of-two sizes. + +Merging two Merkle trees is the simplest when they both have the same number of leaves, which is also a power of two.
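The equal-size case can be sketched in a couple of lines of Haskell (an illustration only; `compress` stands in for the actual two-to-one hash compression function, and these names are not the reference implementation's API):

```haskell
-- A complete binary Merkle tree with 2^treeDepth leaves,
-- summarized by its root digest and its depth.
data MerkleRoot h = MkMerkleRoot
  { treeRoot  :: h
  , treeDepth :: Int
  }

-- Merging two trees of equal (power-of-two) size: the new root
-- is simply the hash of the pair of the two old roots.
mergeEqualTrees :: (h -> h -> h) -> MerkleRoot h -> MerkleRoot h -> MerkleRoot h
mergeEqualTrees compress (MkMerkleRoot r1 d1) (MkMerkleRoot r2 d2)
  | d1 == d2  = MkMerkleRoot (compress r1 r2) (d1 + 1)
  | otherwise = error "mergeEqualTrees: expected trees of equal depth"
```

Unequal sizes are trickier; see the packing example below.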
+ +We can in theory pad any file to have a power-of-two number of Merkle leaves, though this results in waste. + +But again, if we expect a distribution of (padded) file sizes, then we can try some heuristics to pack them nicely. + +For example: `(((1+1)+2)+4+4+4)+16 = 32` + +#### How does this interact with repair? + +Well obviously when such a bundle is lost, then the cost of repair is bigger than if a chunk of a small file is lost. + +However, we still have the information about where the different chunks of the files in the different parts of the bundle are (that is required just for downloading them anyway), so in principle the repair can be done, alas with a larger number of network connections (because it's unlikely that other chunks of the constituent files are located at the same providers). + +Another question is whether we want to repair the whole bundle, or the constituent files separately. By which I mean: when the chunks of the files have to be moved to other providers, should that be a single provider for the whole bundle, or different ones for the different files in that bundle? + +### The FRI protocol + +The FRI protocol (invented by Eli Ben-Sasson et al.) consists of two phases: + +- the commit phase +- and the query phase + +Note that somewhat confusingly, the commit phase is NOT calculating the Merkle commitment of the Reed-Solomon encoded data above; that's kind of "external" to the protocol, as we want to prove that an already committed vector is (close to) a codeword. + +We will first describe the simplest version, then describe optimizations which are commonly used. + +#### The commit phase + +First we calculate the combined vector (the linear combination of the columns); this is completely straightforward. + +This vector is interpreted as the values of a polynomial on a multiplicative subgroup (with initial size $2^{n}/\rho = 2^{n+1}$ as we use rate $\rho=1/2$; however with degree only $2^n-1$). + +Then the prover will repeatedly "fold" this polynomial, halving both the size and the degree, until it reaches a constant polynomial (which will be represented by a vector of size two, both values being the same). + +Each folded version is committed to using a Merkle tree, and at the very end the final singleton coefficient is sent in clear. In the query phase, we will then check (probabilistically) that the folding was done correctly. + +The folding step works as follows: Let $p_k(x)$ be the polynomial before the $k$-th folding, then if + +$$ p_k(x) = p_k^{\textrm{even}}(x^2) + x \cdot p_k^{\textrm{odd}}(x^2) $$ + +the next, folded polynomial will be + +$$ p_{k+1}(y) := p_{k}^{\textrm{even}}(y) + \beta_k\cdot p_{k}^{\textrm{odd}}(y) $$ + +with $\beta_k \in\widetilde{\mathbb{F}}$ chosen by the verifier (or in practice, via Fiat-Shamir); and we evaluate it on the half-sized domain (multiplicative subgroup) $D_{k+1} := (D_{k})^2$, generated by $\omega^2$ if the previous one was generated by $\omega$. + +So both the size of the vector and the degree of the polynomial (whose evaluations are the elements of the vectors) are halved in each such folding step. After $\log_2(\deg)$ steps we get a constant polynomial, represented (in this simplest version) by a vector consisting of two equal numbers. + +Remark: In practice the domains are usually shifted to be a coset instead of a subgroup (this is called the DEEP method?).
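In coefficient form, this folding step is just a two-term combination of the even- and odd-indexed coefficients. Here is a minimal standalone Haskell sketch (again illustrative, not the reference implementation's API; it assumes the number of coefficients is even, which holds for the power-of-two sizes used here):

```haskell
-- Coefficients are listed lowest-degree first, so in
-- p(x) = p_even(x^2) + x * p_odd(x^2) the even-indexed entries
-- are the coefficients of p_even, the odd-indexed ones of p_odd.
foldOnce :: Num f => f -> [f] -> [f]
foldOnce beta coeffs = zipWith (\e o -> e + beta * o) evens odds
  where
    evens = everyOther coeffs
    odds  = everyOther (drop 1 coeffs)
    everyOther (x:xs) = x : everyOther (drop 1 xs)
    everyOther []     = []
```

For example, `foldOnce beta [c0,c1,c2,c3]` gives `[c0 + beta*c1, c2 + beta*c3]`, halving the degree bound.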
+ +Note: By simple calculation, we have + +$$ +\begin{align*} +p_{\textrm{even}}(x^2) &= \;\;\frac{1}{2}\Big[ p(x) + p(-x) \Big] \\ +p_{\textrm{odd}}(x^2) &= \frac{1}{2 x}\Big[ p(x) - p(-x) +\Big] +\end{align*} +$$ + +This should be instantly recognizable as the "butterfly" step in the inverse FFT algorithm. + +#### The query phase + +In the query phase, we do several query rounds (enough to match the required security level); each round verifies the correctness of the folding from a randomly chosen starting point (with some probability of a false positive; but enough such checks drive the probability down to a negligible level). + +First, the verifier chooses a random + +$$x_{0}=\eta\cdot\omega^{\mathsf{idx}}\quad\in\quad D_{0}=\{\,\eta\cdot\omega^i\;:\; 0 \le i < 2^{n+1}\,\}$$ + +The corresponding full row (the $\mathsf{idx}$-th row of the LDE matrix) is opened with a Merkle proof, and the verifier computes the linear combination with the powers of $\alpha\in\widetilde{\mathbb{F}}^\times$. This will become the initial "upstream" value, against which we do the consistency check. + +Next the following folding consistency check is repeated, until we reach a constant (degree 0) polynomial. + +We want to check that for $x_{1}:=(x_0)^2=\eta^2\cdot\omega^{2\times\mathsf{idx}}$ we have $p_{1}(x_{1})$ as the expected value. From the above equations, the verifier can compute this from $p_{0}(x_{0})$ and $p_{0}(-x_{0})$. + +So to do this, the prover opens these two values (with Merkle proofs against the commit phase Merkle commitment). The first value $p_{0}(x_{0})$ is checked to match the "upstream" value. + +Then, using the above formulas, the verifier can compute the expected $p_{1}(x_{1})$: + +$$ +\begin{align*} +p_1(x_1) &= p_1(x_0^2) = p_0^{\textrm{even}}(x_0^2) + \beta_0\cdot p_0^{\textrm{odd}}(x_0^2) \\ + &= + \frac{1}{2}\Big[ p_0(x_0) + p_0(-x_0) \Big] + + \beta_0 \cdot \frac{1}{2 x_0}\Big[ p_0(x_0) - p_0(-x_0) + \Big] \\ + &= \left(\frac{1}{2}+\frac{\beta_0}{2x_0}\right)p_0(x_0) + + \left(\frac{1}{2}-\frac{\beta_0}{2x_0}\right)p_0(-x_0) +\end{align*} +$$ + +which becomes the "downstream" value, that is, the "upstream" value of the next folding step. + +The same kind of step is iterated with $x_{i+1}:=(x_{i})^2$, until we get down to degree zero. Finally, the last "downstream value" is checked against the pre-committed final constant value. + +Note: opening a full row only happens once, before the very first folding step. But that's required, otherwise we would have no connection between the data matrix and the combined vector(s). + +Remark: The verifier should sample the query locations for _all rounds_ before the first round - otherwise a malicious prover could try and cheat round-by-round. + +#### Optimizations + +_Merkle caps._ Instead of using Merkle roots to commit, we can use "Merkle caps". This simply means cutting the Merkle tree at a given level; so the commitment will be say 16 hashes instead of a single root hash. This is a tradeoff between commitment size and Merkle proof sizes. + +_Pruning Merkle paths_. Alternatively, we could merge Merkle paths: the top parts will have a lot of shared values. This however makes recursive proofs too complicated. + +_Folding in larger steps_ (eg. $16\mapsto 1$ or $8\mapsto 1$ instead of $2\mapsto 1$). In this case the computation of a single step will be a (very small) FFT. See the companion document for more details. + +_Opening full cosets_.
We can reorder the rows, so that when we need to open values (a small coset for folding), we can do that with a single Merkle path proof. + +_Early stopping at a low degree polynomial_. We can stop at say a degree 31 polynomial instead of a constant (degree 0) one. Sending the coefficients of this polynomial in clear is usually a win. + +_Grinding_. Between the commit phase and the query phase we can add a proof-of-work challenge. This ensures that a malicious prover can only brute-force the Fiat-Shamir challenges at a much slower rate. Note: the challenge (and response) must go into the Fiat-Shamir transcript. + +#### Why does this work? + +It's _complicated_, but some important observations are: + +- when folding, both the size of the vector and the degree of the polynomial decrease by the same factor; in particular, the rate of the folded code is the same as the rate of the original code +- if $f$ was $\delta$-far from the original code, then $\mathsf{Fold}(f,\beta)$ is also $\delta$-far from the folded code (with high probability, if $\beta$ was chosen randomly) + +#### Security + +The math behind all this is very complicated; however it's a somewhat intuitive conjecture that the security level (assuming big enough fields) should be approximately + +$$\lambda \approx \textrm{rate\_bits} \times \textrm{num\_query\_rounds} + \textrm{proof\_of\_work\_bits}$$ + +(TODO: justify this somehow?) + +A standard security target is 100 bits. In our case the $\textrm{rate\_bits} = -\log_2(\rho)$ is 1; so with 16 bits of grinding, we need 84 query rounds. The rate is thus more-or-less a tradeoff between prover speed and proof size. + +While we need at most a 2:1 expansion from the original data (otherwise the storage overhead would be too big), in theory we could try using a larger expansion (say $\rho=1/4$ or $\rho=1/8$), but then after executing the protocol, keeping only the same amount of parity data as the original data. This could result in a smaller proof size (fewer query rounds) but larger prover time. + +See also the companion security document for more details. + +### References + +- Eli Ben-Sasson, Iddo Bentov, Yinon Horesh and Michael Riabzev: _"Fast Reed-Solomon Interactive Oracle Proofs of Proximity"_ +- Eli Ben-Sasson, Lior Goldberg, Swastik Kopparty and Shubhangi Saraf: _"DEEP-FRI: Sampling Outside the Box Improves Soundness"_ +- Ulrich Haböck: _"A summary on the FRI low degree test"_ +- Alexander R. Block et al: _"Fiat-Shamir Security of FRI and Related SNARKs"_ diff --git a/docs/Protocol.md b/docs/Protocol.md new file mode 100644 index 0000000..1673776 --- /dev/null +++ b/docs/Protocol.md @@ -0,0 +1,9 @@ +The outsourcing protocol +------------------------ + +TODO: I should describe exactly what happens in the protocol. + +This is mostly modelled on [Plonky2](https://github.com/0xPolygonZero/plonky2), +which in turn is essentially the same as + +- E. Ben-Sasson, L. Goldberg, S. Kopparty, and S. Saraf: _"DEEP-FRI: Sampling outside the box improves soundness"_ \ No newline at end of file diff --git a/docs/Security.md b/docs/Security.md new file mode 100644 index 0000000..00591aa --- /dev/null +++ b/docs/Security.md @@ -0,0 +1,50 @@ +Security of FRI +--------------- + +Soundness properties of FRI are very complicated. Here I try to collect together some information.
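As a small concrete helper, the conjectured security formula from the Overview document can be turned into a parameter-choosing function (a sketch only; the formula itself is a heuristic conjecture, not a proven bound):

```haskell
-- Conjectured security level, in bits:
-- lambda ~ rate_bits * num_query_rounds + proof_of_work_bits
conjecturedSecurityBits :: Int -> Int -> Int -> Int
conjecturedSecurityBits rateBits nQueryRounds powBits =
  rateBits * nQueryRounds + powBits

-- The smallest number of query rounds reaching a target level;
-- e.g. queryRoundsNeeded 100 1 16 == 84, the example from the Overview.
queryRoundsNeeded :: Int -> Int -> Int -> Int
queryRoundsNeeded target rateBits powBits =
  ((target - powBits) + rateBits - 1) `div` rateBits   -- ceiling division
```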
+ +### Some basic notions + +We will note some basics about coding theory: + +- the _rate_ of a code $\mathcal{C}$ is $\rho=K/N$, where we encode $K$ data symbols into a size $N$ codeword (so the number of parity symbols is $N-K$) +- the _distance_ between two codewords (of size $N$) is understood as relative Hamming distance: $\Delta(u,v) = \frac{1}{N}|\{i\;:\;u_i\neq v_i\}|$ +- the minimum distance between two (distinct) codewords is denoted by $0\le\mu\le 1$. In case of Reed-Solomon (or more generally, an MDS code), $\mu=(N-K+1)/N\approx 1-\rho$ +- the _unique decoding radius_ is $\delta_{\textrm{unique}}=\mu/2$. Obviously if $\Delta(u,\mathcal{C})<\delta_{\textrm{unique}}$ then there is exactly 1 codeword "close to" $u$; more generally if $u$ is any string, there is _at most_ 1 codeword within radius $\delta_{\textrm{unique}}$ +- for Reed-Solomon, $\delta_{\textrm{unique}}\approx (1-\rho)/2$ + +After this point, it becomes a bit more involved: + +- the Johnson radius is $\delta_{\textrm{Johnson}} = 1 - \sqrt{1-\mu} \approx 1-\sqrt{\rho}$ +- within the Johnson radius, that is, $\delta<\delta_{\textrm{Johnson}}$, we have the Johnson bound for the number of codewords "closer than $\delta$": $|\textrm{List}|<1/\epsilon(\delta)$, where $\epsilon(\delta) = 2\sqrt{1-\mu}(1-\sqrt{1-\mu}-\delta) \approx 2\sqrt{\rho}(1-\sqrt{\rho}-\delta)$ +- the capacity radius is $\delta_{\textrm{capacity}} = 1-\rho$. Between the Johnson radius and the capacity radius, it's somewhat unknown territory, but there are conjectures that for "nice" codes (like RS), the number of close codewords is $\mathsf{poly}(N)$. +- above the capacity radius, the number of codewords is at least $|\mathbb{F}|$ + +In summary (for Reed-Solomon): Below the unique decoding radius $\delta < (1-\rho)/2$ everything is fine. + +Above it we are in the "list decoding regime". Below the Johnson radius $\delta < 1-\sqrt{\rho}$ we can bound the number of close codewords explicitly: + +$$ \Big|\big\{ v\in\mathcal{C} \;:\; \Delta(u,v)<\delta \;\big\}\Big| \le \frac{1}{2\sqrt{\rho}(1-\sqrt{\rho}-\delta)} $$ + +Finally, below the capacity radius we have some conjectures about the asymptotic growth of the number of close codewords. + +### Reed-Solomon IOP of proximity + +TODO; see Lecture 15 + +### Soundness of basic FRI + +TODO + +### Batch FRI + +TODO; see Haböck's paper. + +### FRI as polynomial commitment and out-of-domain sampling + +TODO + +### References + +- Eli Ben-Sasson, Dan Carmon, Yuval Ishai, Swastik Kopparty, and Shubhangi Saraf: _"Proximity gaps for Reed-Solomon codes"_ +- Ulrich Haböck: _"A summary on the FRI low degree test"_ diff --git a/reference/.gitignore b/reference/.gitignore new file mode 100644 index 0000000..12c1a12 --- /dev/null +++ b/reference/.gitignore @@ -0,0 +1 @@ +.ghc.environment.* diff --git a/reference/LICENSE b/reference/LICENSE new file mode 100644 index 0000000..61783bf --- /dev/null +++ b/reference/LICENSE @@ -0,0 +1,7 @@ +Dual licensed under MIT and Apache-V2 (see the corresponding documents). + +Significant portions of the code are taken from "zikkurat-algebra" (c) 2023-2025, Faulhorn Labs + +The rest is (c) 2025 Status Research & Development GmbH + +This is experimental software, no warranties whatsoever.
diff --git a/reference/README.md b/reference/README.md new file mode 100644 index 0000000..fd656d1 --- /dev/null +++ b/reference/README.md @@ -0,0 +1,24 @@ +Reference implementation in Haskell +----------------------------------- + +First we implement a slow but hopefully easier-to-understand version in Haskell, +to get more familiarity with all the details. + +The implementation is loosely based on (the FRI portion of) Plonky2, which in +turn is more-or-less the same as the DEEP-FRI paper. We use different conventions +though, as Plonky2 is rather over-complicated. + +See the [docs](../docs/) directory for details. + +### Improving performance + +We could significantly improve the speed of the Haskell implementation by binding C code +(from the [`zikkurat-algebra`](https://github.com/faulhornlabs/zikkurat-algebra/) library) +for some of the critical routines: Goldilocks field and extension, hashing, +fast Fourier transform. + +### References + +- E. Ben-Sasson, L. Goldberg, S. Kopparty, and S. Saraf: _"DEEP-FRI: Sampling outside the box improves soundness"_ - https://ia.cr/2019/336. +- Ulrich Haböck: _"A summary on the FRI low degree test"_ diff --git a/reference/src/FRI.hs b/reference/src/FRI.hs new file mode 100644 index 0000000..41986a0 --- /dev/null +++ b/reference/src/FRI.hs @@ -0,0 +1,21 @@ + +module FRI + ( module Field.Goldilocks + , module Field.Goldilocks.Extension + , module NTT.Subgroup + , module FRI.Prover + , module FRI.Verifier + , module FRI.LDE + , module FRI.Matrix + , module FRI.Types + ) + where + +import Field.Goldilocks +import Field.Goldilocks.Extension ( FExt ) +import NTT.Subgroup +import FRI.Prover +import FRI.Verifier +import FRI.LDE +import FRI.Matrix +import FRI.Types \ No newline at end of file diff --git a/reference/src/FRI/LDE.hs b/reference/src/FRI/LDE.hs new file mode 100644 index 0000000..86790db --- /dev/null +++ b/reference/src/FRI/LDE.hs @@ -0,0 +1,55 @@ + +-- | Low-degree extension, that is, Reed-Solomon encoding of the data columns + +{-# LANGUAGE RecordWildCards #-} +module FRI.LDE + ( module Field.Goldilocks + , module Field.Goldilocks.Extension + , module FRI.Matrix + , ldeEncodeMatrix + , ldeEncodeColumn + ) + where + +-------------------------------------------------------------------------------- + +import Data.Array ( Array ) +import Data.Array.IArray + +import Field.Goldilocks ( F ) +import Field.Goldilocks.Extension ( FExt ) + +import NTT +import Hash +import Misc + +import FRI.Matrix +import FRI.Types + +-------------------------------------------------------------------------------- + +-- | Reed-Solomon encode the columns of the input (data) matrix +ldeEncodeMatrix :: RSConfig -> Matrix F -> Matrix F +ldeEncodeMatrix rsConfig@(MkRSConfig{..}) dataMatrix + | n /= exp2_ rsDataSize = error "ldeEncodeMatrix: input data column dimension is not compatible with the RS configuration" + | otherwise = joinColumns ldeColumns + where + (n,m) = matrixDimensions dataMatrix + ldeColumns = amap (ldeEncodeColumn rsConfig) (matrixColumns dataMatrix) + +-- | Reed-Solomon encode a single column vector +ldeEncodeColumn :: RSConfig -> Vector F -> Vector F +ldeEncodeColumn rsConfig@(MkRSConfig{..}) dataVector + | n /= exp2_ rsDataSize = error "ldeEncodeColumn: input data column dimension is not compatible with the RS configuration" + | otherwise = ldeVector + where + n = vectorLength dataVector + sg1 = getSubgroup rsDataSize + sg2 = getSubgroup (rsDataSize + rsRateBits) + coset1 = MkCoset sg1 rsCosetShift + coset2 = MkCoset sg2 rsCosetShift + poly = 
cosetINTT coset1 dataVector + ldeVector = asymmetricCosetNTT coset2 poly + +-------------------------------------------------------------------------------- + diff --git a/reference/src/FRI/Matrix.hs b/reference/src/FRI/Matrix.hs new file mode 100644 index 0000000..15a96a5 --- /dev/null +++ b/reference/src/FRI/Matrix.hs @@ -0,0 +1,56 @@ + +module FRI.Matrix where + +-------------------------------------------------------------------------------- + +import Data.Array ( Array ) +import Data.Array.IArray + +import Misc + +-------------------------------------------------------------------------------- + +type Vector a = Array Int a +type Matrix a = Array (Int,Int) a + +-- type MatrixF = Matrix F +-- type VectorF = Vector F +-- type VectorExt = Vector FExt + +vectorLength :: Array Int a -> Int +vectorLength vector = n+1 where + (0,n) = bounds vector + +matrixDimensions :: Matrix a -> (Int,Int) +matrixDimensions matrix = (n+1,m+1) where + ((0,0),(n,m)) = bounds matrix + +extractColumn :: Matrix a -> Int -> Vector a +extractColumn matrix j + | 0 <= j && j < m = listArray (0,n-1) [ matrix!(i,j) | i<-[0..n-1] ] + | otherwise = error "extractColumn: column index out of range" + where + (n,m) = matrixDimensions matrix + +extractRow :: Matrix a -> Int -> Vector a +extractRow matrix i + | 0 <= i && i < n = listArray (0,m-1) [ matrix!(i,j) | j<-[0..m-1] ] + | otherwise = error "extractRow: row index out of range" + where + (n,m) = matrixDimensions matrix + +matrixColumns :: Matrix a -> Array Int (Vector a) +matrixColumns matrix = listArray (0,m-1) [ extractColumn matrix j | j<-[0..m-1] ] where + (n,m) = matrixDimensions matrix + +matrixRows :: Matrix a -> Array Int (Vector a) +matrixRows matrix = listArray (0,n-1) [ extractRow matrix i | i<-[0..n-1] ] where + (n,m) = matrixDimensions matrix + +joinColumns :: Array Int (Vector a) -> Matrix a +joinColumns columns = array ((0,0),(n-1,m-1)) entries where + m = arrayLength columns + n = vectorLength (columns!0) + entries = [ ( (i,j) , (columns!j)!i ) | i<-[0..n-1] , j<-[0..m-1] ] + +-------------------------------------------------------------------------------- diff --git a/reference/src/FRI/Prover.hs b/reference/src/FRI/Prover.hs new file mode 100644 index 0000000..ae60365 --- /dev/null +++ b/reference/src/FRI/Prover.hs @@ -0,0 +1,241 @@ + +{-# LANGUAGE RecordWildCards, StrictData #-} +module FRI.Prover where + +-------------------------------------------------------------------------------- + +import Data.Array +import Data.Bits +import Data.Word + +import Control.Monad.IO.Class +import System.Random + +import Field.Goldilocks ( F ) +import Field.Goldilocks.Extension ( FExt , scl ) +import Field.Encode + +import NTT +import Hash +import Misc + +import Hash.Duplex.Pure ( DuplexState ) +import Hash.Duplex.Monad hiding ( absorb ) +import qualified Hash.Duplex.Monad as Duplex + +import FRI.LDE +import FRI.Matrix +import FRI.Shared +import FRI.Types + +-------------------------------------------------------------------------------- + +data MerkleCommitments = MkMerkleCommitments + { _dataCommitment :: MerkleCap + , _ldeCommitment :: MerkleCap + } + deriving Show + +encodeAndProveFRI :: FriConfig -> Matrix F -> DuplexIO (MerkleCommitments, FriProof) +encodeAndProveFRI friConfig@(MkFriConfig{..}) dataMatrix = + do + absorb friConfig -- initialize Fiat-Shamir with the global parameters + absorb matrixCap -- absorb the matrix hash + alpha <- squeeze :: DuplexIO FExt -- row combining coefficient challenge alpha + let bigColumn = combineColumnsWith alpha ldeColumns 
-- combined column + let bigPoly = interpolate bigCoset bigColumn -- initial polynomial + (phases, finalPoly) <- repeatedlyFoldPoly + friConfig friReductionStrategy bigCoset bigPoly -- compute commit phases + absorb finalPoly -- absorb the final polynomial + MkPoW powWitness powResponse <- performGrinding friGrindingBits -- do the grinding + queryIndices <- generateQueryIndices nn friNQueryRounds -- generate query indices + let queries = map (singleQueryRound phases) queryIndices -- execute the query rounds + + -- only for debugging purposes + let challenges = MkFriChallenges + { friAlpha = alpha + , friBetas = map commitPhaseBeta phases + , friGrindResponse = powResponse + , friQueryIndices = queryIndices + } + duplexPPrint "challenges" challenges + + let friProof = MkFriProof + { proofFriConfig = friConfig + , proofCommitPhaseCaps = map commitPhaseMerkleCap phases + , proofFinalPoly = finalPoly + , proofQueryRounds = queries + , proofPowWitness = powWitness + } + + let commits = MkMerkleCommitments + { _dataCommitment = origDataCap + , _ldeCommitment = matrixCap + } + + return ( commits , friProof ) + where + + MkRSConfig{..} = friRSConfig + + origDataCap = calcMerkleCap friMerkleCapSize (elems $ matrixRows dataMatrix) + + ldeMatrix = ldeEncodeMatrix friRSConfig dataMatrix + ldeRows = matrixRows ldeMatrix + ldeColumns = matrixColumns ldeMatrix + + bigSubgroup = getSubgroup ldeSizeLog2 + bigCoset = MkCoset bigSubgroup rsCosetShift -- initial evaluation domain + + matrixTree = calcArrayMerkleTree' (fullDomainIndexMapBwd friRSConfig) ldeRows :: MerkleTree FRow + matrixCap = extractMerkleCap friMerkleCapSize matrixTree + nn = arraySize (ldeColumns!0) + ldeSizeLog2 = exactLog2__ nn + + -- interpolate from the LDE combined column into a smaller degree polynomial + interpolate :: Coset F -> Vector FExt -> Poly FExt + interpolate ldeCoset@(MkCoset ldeSubgroup shift) ldeVector = cosetINTTExt dataCoset dataVector where + dataVector = extractSubgroupArray (exp2_ rsRateBits) ldeVector + dataCoset = MkCoset (powSubgroup ldeSubgroup expFactor) shift + expFactor = exp2_ rsRateBits + + singleQueryRound :: [CommitPhaseData] -> Idx -> FriQueryRound + singleQueryRound phases queryIdx = result where + initialProof = extractMerkleProof' friMerkleCapSize matrixTree $ fullDomainIndexMapFwd friRSConfig queryIdx + result = MkFriQueryRound + { queryRow = _leafData initialProof + , queryInitialTreeProof = _merklePath initialProof + , querySteps = go phases queryIdx + } + + go :: [CommitPhaseData] -> Idx -> [FriQueryStep] + go [] _ = [] + go (MkCommitPhaseData{..} : rest) idx = thisStep : go rest idx' where + intArity = exp2_ commitPhaseArity + domainSize' = Prelude.div (cosetSize commitPhaseDomain) intArity -- N/K + idx' = Prelude.mod idx domainSize' -- idx' = idx mod (N/K) + proof = extractMerkleProof' friMerkleCapSize commitPhaseMerkleTree idx' + cosetValues = elems (_leafData proof) + thisStep = MkFriQueryStep + { queryEvals = cosetValues + , queryMerklePath = _merklePath proof + } + +-------------------------------------------------------------------------------- +-- * Grinding + +data ProofOfWork = MkPoW + { powWitness :: F + , powResponse :: F + } + deriving (Eq,Show) + +performGrinding :: Log2 -> DuplexIO ProofOfWork +performGrinding grindingBits = + do + oldState <- unsafeGetInnerState + worker oldState + where + worker :: DuplexState -> DuplexIO ProofOfWork + worker origState = do + witnessCandidate <- liftIO randomIO :: DuplexIO F + absorb witnessCandidate + responseCandidate <- squeeze + case 
checkGrindBits grindingBits responseCandidate of + True -> do + -- duplexPrint powCandidate + return $ MkPoW + { powWitness = witnessCandidate + , powResponse = responseCandidate + } + False -> do + unsafeSetInnerState origState + worker origState + +-------------------------------------------------------------------------------- +-- * Combining columns + +combineColumnsWith :: FExt -> Array Int (Array Int F) -> Array Int FExt +combineColumnsWith alpha columns = listArray (0,n-1) list where + m = arraySize columns + n = arraySize (columns!0) + alphas = powersOf m alpha + row i = [ (columns!j)!i | j<-[0..m-1] ] + list = [ sum (safeZipWith scl (row i) alphas) | i<-[0..n-1] ] + +-------------------------------------------------------------------------------- +-- * Folding + +data CommitPhaseData = MkCommitPhaseData + { commitPhasePoly :: Poly FExt -- ^ the polynomial before folding + , commitPhaseDomain :: Coset F -- ^ evaluation domain + , commitPhaseArity :: Log2 -- ^ the folding arity + , commitPhaseBeta :: FExt -- ^ folding coefficient beta + , commitPhaseMerkleTree :: MerkleTree (Array Int FExt) -- ^ Merkle tree over the folded cosets + , commitPhaseMerkleCap :: MerkleCap -- ^ commit phase Merkle cap + } + deriving Show + +-- | Starting from a polynomial of degree @N-1@, we repeatedly +-- +-- * evaluate it on the corresponding evaluation domain (coset) +-- +-- * commit to the values with a Merkle tree +-- +-- * generate a folding challenge (after absorbing the Merkle commitment) +-- +-- * fold the polynomial and also the domain +-- + +repeatedlyFoldPoly :: FriConfig -> [Arity] -> Coset F -> Poly FExt -> DuplexIO ( [CommitPhaseData] , Poly FExt ) +repeatedlyFoldPoly (MkFriConfig{..}) arities domain poly = go arities domain poly where + go [] domain poly = return ( [] , poly ) + go (arity:rest) domain poly = do + let intArity = exp2_ arity -- size of the folded coset + let values = asymmetricCosetNTTExt domain poly -- values on the evaluation domain + let stride = Prelude.div (cosetSize domain) intArity -- folding coset stride + let cosets = untangleArray' stride values -- cosets + let tree = calcArrayMerkleTree cosets -- Merkle tree on the top of the cosets + let cap = extractMerkleCap friMerkleCapSize tree + absorb cap -- commit to the Merkle cap + beta <- squeeze :: DuplexIO FExt -- generate folding challenge beta + -- duplexPrint "beta" beta + let thisPhase = MkCommitPhaseData + { commitPhasePoly = poly + , commitPhaseDomain = domain + , commitPhaseArity = arity + , commitPhaseBeta = beta + , commitPhaseMerkleTree = tree + , commitPhaseMerkleCap = cap + } + let poly' = foldPoly arity beta poly + -- duplexPutStrLn "" + -- duplexPrint "poly" poly + -- duplexPrint "poly'" poly' + let domain' = powCoset domain intArity + (phases, final) <- go rest domain' poly' + return ( thisPhase:phases , final ) + +{- +repeatedlyFoldPoly :: [(Arity,FExt)] -> Poly FExt -> ( [Poly FExt] , Poly FExt ) +repeatedlyFoldPoly = go where + go [] poly = ( [] , poly ) + go ((arity,beta):rest) poly = ( poly:phases , final ) where + poly' = foldPoly arity beta poly + (phases, final) = go rest poly' +-} + +foldPoly :: Arity -> FExt -> Poly FExt -> Poly FExt +foldPoly arity beta (Poly poly) + | r /= 0 = error $ "foldPoly: input polynomial's size " ++ show bigSize ++ " is not divisible by the arity " ++ show intArity + | otherwise = Poly foldedPoly + where + intArity = exp2_ arity + bigSize = arraySize poly + smallSize = q + (q,r) = divMod bigSize intArity + pieces = untangleArray' intArity poly + foldedPoly = 
makeArray q $ \i -> sum (safeZipWith (*) betaCoeffs [ (pieces!j)!i | j<-[0..intArity-1] ]) + betaCoeffs = powersOf intArity beta + +-------------------------------------------------------------------------------- diff --git a/reference/src/FRI/Shared.hs b/reference/src/FRI/Shared.hs new file mode 100644 index 0000000..622d536 --- /dev/null +++ b/reference/src/FRI/Shared.hs @@ -0,0 +1,78 @@ + +-- | Stuff shared between the prover and the verifier + +{-# LANGUAGE RecordWildCards #-} +module FRI.Shared where + +-------------------------------------------------------------------------------- + +import Data.Array +import Data.Bits +import Data.Word + +import Control.Monad.IO.Class +import System.Random + +import Field.Goldilocks ( F ) +import Field.Goldilocks.Extension ( FExt , scl ) +import Field.Encode + +import NTT +import Hash +import Misc + +import Hash.Duplex.Monad hiding ( absorb ) +import qualified Hash.Duplex.Monad as Duplex + +import FRI.Types + +-------------------------------------------------------------------------------- + +absorb :: FieldEncode a => a -> DuplexIO () +absorb = Duplex.absorb . fieldEncode + +-------------------------------------------------------------------------------- + +checkGrindBits :: Log2 -> F -> Bool +checkGrindBits (Log2 grindingBits) candidate = (fromF candidate .&. mask == 0) where + mask = shiftL 1 grindingBits - 1 :: Word64 + +-------------------------------------------------------------------------------- + +type Idx = Int + +generateQueryIndices :: Int -> Int -> DuplexIO [Idx] +generateQueryIndices nn nQueryRounds = do + felts <- squeezeN nQueryRounds :: DuplexIO [F] + return $ map (indexMapping nn) felts + +-- | Note: the bias is extremely small with Goldilocks, if @nn@ is a power of two +indexMapping :: Int -> F -> Idx +indexMapping nn x + = fromIntegral + $ mod (fromF x :: Word64) (fromIntegral nn :: Word64) + +-------------------------------------------------------------------------------- + +-- | Maps a natural index to the matrix Merkle tree leaf index +-- +-- (we have to reorder the leaves, so that the original data is a subtree, +-- and the parity data consists of subtrees to) +-- +fullDomainIndexMapFwd :: RSConfig -> Int -> Int +fullDomainIndexMapFwd (MkRSConfig{..}) naturalIdx = merkleIdx where + mm = exp2_ rsDataSize + kk = exp2_ rsRateBits + (q,j) = divMod naturalIdx kk + merkleIdx = q + mm * j + +-- | The inverse mapping +fullDomainIndexMapBwd :: RSConfig -> Int -> Int +fullDomainIndexMapBwd (MkRSConfig{..}) merkleIdx = naturalIdx where + mm = exp2_ rsDataSize + kk = exp2_ rsRateBits + (j,q) = divMod merkleIdx mm + naturalIdx = q * kk + j + +-------------------------------------------------------------------------------- + diff --git a/reference/src/FRI/Types.hs b/reference/src/FRI/Types.hs new file mode 100644 index 0000000..7d4f205 --- /dev/null +++ b/reference/src/FRI/Types.hs @@ -0,0 +1,166 @@ + +{-# LANGUAGE StrictData, RecordWildCards #-} +module FRI.Types where + +-------------------------------------------------------------------------------- + +import Field.Goldilocks +import Field.Goldilocks.Extension ( FExt , F2(..) 
)
+import Field.Encode
+
+import Hash.Merkle ( MerkleCap , MerkleProof , RawMerklePath , FRow )
+import NTT.Subgroup
+import NTT.Poly (Poly)
+import Misc
+
+--------------------------------------------------------------------------------
+
+type Indent = Int
+
+class Print a where
+  showWithIndent    :: Indent -> a -> [String]
+  showWithoutIndent :: a -> [String]
+
+  showWithIndent indent what = indentLines indent (showWithoutIndent what)
+  showWithoutIndent = showWithIndent 0
+
+printWithIndent :: Print a => Indent -> a -> IO ()
+printWithIndent indent what = putStrLn $ unlines (showWithIndent indent what)
+
+justPrint :: Print a => a -> IO ()
+justPrint = printWithIndent 0
+
+indentLines :: Int -> [String] -> [String]
+indentLines indent ls = map (prefix++) ls where prefix = replicate indent ' '
+
+--------------------------------------------------------------------------------
+
+-- | Reed-Solomon configuration
+data RSConfig = MkRSConfig
+  { rsRateBits   :: Log2   -- ^ @r = -log2(rho)@
+  , rsDataSize   :: Log2   -- ^ @n = log2(N)@
+  , rsCosetShift :: F      -- ^ the shift of the evaluation domain wrt. the subgroup
+  }
+  deriving (Eq,Show)
+
+exampleRSConfig :: RSConfig
+exampleRSConfig = MkRSConfig 8 3 theMultiplicativeGenerator
+
+instance FieldEncode RSConfig where
+  fieldEncode (MkRSConfig{..})
+    =  fieldEncode rsRateBits
+    ++ fieldEncode rsDataSize
+    ++ fieldEncode rsCosetShift
+
+rsEncodedSize :: RSConfig -> Log2
+rsEncodedSize cfg = rsDataSize cfg + rsRateBits cfg
+
+rsCosetSmall, rsCosetBig :: RSConfig -> Coset F
+rsCosetSmall cfg = MkCoset (getSubgroup $ rsDataSize cfg                 ) (rsCosetShift cfg)
+rsCosetBig   cfg = MkCoset (getSubgroup $ rsDataSize cfg + rsRateBits cfg) (rsCosetShift cfg)
+
+instance Print RSConfig where
+  showWithoutIndent (MkRSConfig{..}) =
+    [ " - rsRateBits   = " ++ show rsRateBits
+    , " - rsDataSize   = " ++ show rsDataSize
+    , " - rsCosetShift = " ++ show rsCosetShift
+    ]
+
+--------------------------------------------------------------------------------
+
+-- | Folding arity
+type Arity = Log2
+
+type ReductionStrategy = [Arity]
+
+-- | FRI configuration
+data FriConfig = MkFriConfig
+  { friRSConfig          :: RSConfig          -- ^ Reed-Solomon configuration
+  , friNColumns          :: Int               -- ^ number of columns (batch FRI width)
+  , friMerkleCapSize     :: Log2              -- ^ size of the Merkle caps
+  , friReductionStrategy :: ReductionStrategy -- ^ folding arities
+  , friNQueryRounds      :: Int               -- ^ number of query rounds
+  , friGrindingBits      :: Log2              -- ^ grinding hardness
+  }
+  deriving (Eq,Show)
+
+instance Print FriConfig where
+  showWithIndent indent (MkFriConfig{..}) =
+    [ " - friRSConfig\n" ++ unlines1 (showWithIndent (indent+2) friRSConfig)
+    , " - friNColumns          = " ++ show friNColumns
+    , " - friMerkleCapSize     = " ++ show friMerkleCapSize
+    , " - friReductionStrategy = " ++ show (map fromLog2 friReductionStrategy)
+    , " - friNQueryRounds      = " ++ show friNQueryRounds
+    , " - friGrindingBits      = " ++ show friGrindingBits
+    ]
+
+-- instance FieldEncode ReductionStrategy where
+--   fieldEncode = concatMap fieldEncode
+
+instance FieldEncode FriConfig where
+  fieldEncode (MkFriConfig{..}) = concat
+    [ fieldEncode friRSConfig
+    , fieldEncode friNColumns
+    , fieldEncode friMerkleCapSize
+    , fieldEncode friReductionStrategy
+    , fieldEncode friNQueryRounds
+    , fieldEncode friGrindingBits
+    ]
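+
+-- A small sanity example (the values are illustrative, not from the original
+-- source): the encoding is a flat concatenation of field elements, so we expect
+--
+-- > fieldEncode exampleRSConfig == [8, 3, 7]
+--
+-- (rate bits, data size, and the coset shift, which is the generator 7)
+
+--------------------------------------------------------------------------------
+
+data FriChallenges = MkFriChallenges
+  { friAlpha          :: FExt   -- ^ column linear combination coefficient
+  , friBetas          :: [FExt] -- ^ folding step betas
+  , 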
friGrindResponse :: F -- ^ the PoW response (computed via Fiat-Shamir), which should have predetermined bit patterns + , friQueryIndices :: [Int] -- ^ query indices + } + deriving (Eq,Show) + +-------------------------------------------------------------------------------- + +data ReductionStrategyParams = MkRedStratPars + { redStoppingDegree :: Log2 -- ^ stopping degree + , redFoldingArity :: Log2 -- ^ default folding arity + } + deriving (Eq,Show) + +-- | stop at degree 32 and folding arity 16 +defaultReductionStrategyParams :: ReductionStrategyParams +defaultReductionStrategyParams = MkRedStratPars + { redStoppingDegree = Log2 5 -- ^ stopping degree + , redFoldingArity = Log2 4 -- ^ default folding arity + } + +findReductionStrategy :: ReductionStrategyParams -> RSConfig -> ReductionStrategy +findReductionStrategy (MkRedStratPars{..}) (MkRSConfig{..}) = worker (rsDataSize + rsRateBits) where + worker k + | k <= redStoppingDegree = [] + | k >= redStoppingDegree + redFoldingArity = redFoldingArity : worker (k - redFoldingArity) + | otherwise = [ k - redStoppingDegree ] + +-------------------------------------------------------------------------------- + +data FriQueryStep = MkFriQueryStep + { queryEvals :: [FExt] + , queryMerklePath :: RawMerklePath + } + deriving (Eq,Show) + +data FriQueryRound = MkFriQueryRound + { queryRow :: FRow + , queryInitialTreeProof :: RawMerklePath + , querySteps :: [FriQueryStep] + } + deriving (Eq,Show) + +data FriProof = MkFriProof + { proofFriConfig :: FriConfig -- ^ the FRI configuration + , proofCommitPhaseCaps :: [MerkleCap] -- ^ commit phase Merkle caps + , proofFinalPoly :: Poly FExt -- ^ the final polynomial in coefficient form + , proofQueryRounds :: [FriQueryRound] -- ^ query rounds + , proofPowWitness :: F -- ^ witness showing that the prover did PoW + } + deriving Show + +-------------------------------------------------------------------------------- diff --git a/reference/src/FRI/Verifier.hs b/reference/src/FRI/Verifier.hs new file mode 100644 index 0000000..e00ee46 --- /dev/null +++ b/reference/src/FRI/Verifier.hs @@ -0,0 +1,68 @@ + +{-# LANGUAGE RecordWildCards, StrictData #-} +module FRI.Verifier where + +-------------------------------------------------------------------------------- + +import Data.Array +import Data.Bits +import Data.Word + +import Control.Monad + +import Field.Goldilocks ( F ) +import Field.Goldilocks.Extension ( FExt , scl ) +import Field.Encode + +import NTT +import Hash +import Misc + +import Hash.Duplex.Pure ( DuplexState ) +import Hash.Duplex.Monad hiding ( absorb ) +import qualified Hash.Duplex.Monad as Duplex + +import FRI.Shared +import FRI.Types + +-------------------------------------------------------------------------------- + +verifyFRI :: MerkleCap -> FriProof -> DuplexIO Bool +verifyFRI matrixCap friProof@(MkFriProof{..}) = do + + challenges <- computeFriChallenges matrixCap friProof + + duplexPPrint "verifier challenges" challenges + + return False + +computeFriChallenges :: MerkleCap -> FriProof -> DuplexIO FriChallenges +computeFriChallenges matrixCap (MkFriProof{..}) = do + + absorb proofFriConfig -- initialize Fiat-Shamir with the global parameters + absorb matrixCap -- absorb the matrix hash + alpha <- squeeze :: DuplexIO FExt -- row combining coefficient challenge alpha + betas <- forM proofCommitPhaseCaps $ \cap -> do + absorb cap -- commit to the (commit phase) Merkle cap + beta <- squeeze :: DuplexIO FExt -- generate folding challenge beta + return beta + absorb proofFinalPoly -- absorb the final 
polynomial
+  absorb proofPowWitness                                   -- absorb the grinding PoW witness
+  powResponse <- squeeze :: DuplexIO F                     -- generate the PoW response
+  queryIndices <- generateQueryIndices nn friNQueryRounds  -- generate query indices
+
+  return $ MkFriChallenges
+    { friAlpha          = alpha
+    , friBetas          = betas
+    , friGrindResponse  = powResponse
+    , friQueryIndices   = queryIndices
+    }
+
+  where
+
+    MkFriConfig{..} = proofFriConfig
+    MkRSConfig{..}  = friRSConfig
+
+    nn = exp2_ (rsDataSize + rsRateBits)
+
+--------------------------------------------------------------------------------
diff --git a/reference/src/Field/Encode.hs b/reference/src/Field/Encode.hs
new file mode 100644
index 0000000..9f31015
--- /dev/null
+++ b/reference/src/Field/Encode.hs
@@ -0,0 +1,36 @@
+
+-- | Encode stuff into field elements
+--
+-- This is to be able to hash the Fiat-Shamir transcript
+--
+
+module Field.Encode where
+
+--------------------------------------------------------------------------------
+
+import Data.Array
+
+import Field.Goldilocks
+import Field.Goldilocks.Extension ( FExt , F2(..) )
+
+import Misc
+
+--------------------------------------------------------------------------------
+
+-- | This is to be able to hash stuff for the Fiat-Shamir transcript
+class FieldEncode a where
+  fieldEncode :: a -> [F]
+
+instance FieldEncode F  where fieldEncode x        = [x]
+instance FieldEncode F2 where fieldEncode (F2 x y) = [x,y]
+
+instance FieldEncode Log2 where fieldEncode (Log2 k) = [fromIntegral k]
+instance FieldEncode Int  where fieldEncode n        = [fromIntegral n]
+
+instance FieldEncode a => FieldEncode [a] where
+  fieldEncode = concatMap fieldEncode
+
+instance FieldEncode a => FieldEncode (Array Int a) where
+  fieldEncode = concatMap fieldEncode . elems
+
+--------------------------------------------------------------------------------
diff --git a/reference/src/Field/Goldilocks.hs b/reference/src/Field/Goldilocks.hs
new file mode 100644
index 0000000..35787d0
--- /dev/null
+++ b/reference/src/Field/Goldilocks.hs
@@ -0,0 +1,8 @@
+
+module Field.Goldilocks
+  ( module Field.Goldilocks.Slow
+  )
+  where
+
+import Field.Goldilocks.Slow
+
diff --git a/reference/src/Field/Goldilocks/Extension.hs b/reference/src/Field/Goldilocks/Extension.hs
new file mode 100644
index 0000000..dbe4cfd
--- /dev/null
+++ b/reference/src/Field/Goldilocks/Extension.hs
@@ -0,0 +1,144 @@
+
+-- | Quadratic extension over the Goldilocks field
+--
+-- We use the irreducible polynomial @x^2 - 7@ to be compatible with Plonky3
+
+module Field.Goldilocks.Extension where
+
+--------------------------------------------------------------------------------
+
+import Prelude hiding ( div )
+
+import Data.Bits
+import Data.Ratio
+
+import System.Random
+
+import Field.Goldilocks ( F )
+
+--------------------------------------------------------------------------------
+
+type FExt = F2
+
+data F2 = F2
+  { real :: !F
+  , imag :: !F
+  }
+  deriving (Eq)
+
+instance Show F2 where
+  show (F2 r i) = "[ " ++ show r ++ " + j * " ++ show i ++ " ]"
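+
+-- A quick sanity check (an illustrative value, assuming @F2 r i@ represents
+-- @r + j*i@ with @j^2 = 7@):
+--
+-- > F2 0 1 * F2 0 1 == F2 7 0
+
+instance Num F2 where
+  fromInteger = inj . 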
fromIntegral + negate = neg + (+) = add + (-) = sub + (*) = mul + abs = id + signum _ = inj 1 + +instance Fractional F2 where + fromRational y = fromInteger (numerator y) `div` fromInteger (denominator y) + recip = inv + (/) = div + +instance Random F2 where + -- random :: RandomGen g => g -> (a, g) + random g = let (x,g' ) = random g + (y,g'') = random g' + in (F2 x y, g'') + randomR = error "randomR/F2: doesn't make any sense" + +-------------------------------------------------------------------------------- + +inj :: F -> F2 +inj r = F2 r 0 + +neg :: F2 -> F2 +neg (F2 r i) = F2 (negate r) (negate i) + +add :: F2 -> F2 -> F2 +add (F2 r1 i1) (F2 r2 i2) = F2 (r1 + r2) (i1 + i2) + +sub :: F2 -> F2 -> F2 +sub (F2 r1 i1) (F2 r2 i2) = F2 (r1 - r2) (i1 - i2) + +scl :: F -> F2 -> F2 +scl s (F2 r i) = F2 (s * r) (s * i) + +sqrNaive :: F2 -> F2 +sqrNaive (F2 r i) = F2 r3 i3 where + r3 = r*r + 7 * i*i + i3 = 2 * r*i + +mulNaive :: F2 -> F2 -> F2 +mulNaive (F2 r1 i1) (F2 r2 i2) = F2 r3 i3 where + r3 = r1 * r2 + 7 * i1 * i2 + i3 = r1 * i2 + i1 * r2 + +-- uses Karatsuba trick to have one less multiplications +mulKaratsuba :: F2 -> F2 -> F2 +mulKaratsuba (F2 r1 i1) (F2 r2 i2) = F2 r3 i3 where + u = r1*r2 + w = i1*i2 + v = (r1+i1)*(r2+i2) + r3 = u + 7*w + i3 = v - u - w + +sqr :: F2 -> F2 +sqr = sqrNaive + +mul :: F2 -> F2 -> F2 +mul = mulKaratsuba + +-------------------------------------------------------------------------------- +-- * inverse and division + +-- | We can solve the equation explicitly. +-- +-- > irred = x^2 + p*x + q +-- > (a*x + b) * (c*x + d) = (a*c)*x^2 + (a*d+b*c)*x + (b*d) +-- > = (a*d + b*c - a*c*p)*x + (b*d - a*c*q) +-- +-- and then we want to solve +-- +-- > b*d - a*c*q == 1 +-- > a*d + b*c - a*c*p == 0 +-- +-- which has the solution: +-- +-- > c = - a / (b^2 - a*b*p + a^2*q) +-- > d = (b - a*p) / (b^2 - a*b*p + a^2*q) +-- +-- Remark: It seems the denominator being zero would mean that our +-- defining polynomial is not irreducible. +-- +-- Note: we can optimize for the common case p=0; and also for q=1. +-- +inv :: F2 -> F2 +inv (F2 b a) = F2 d c where + denom = b*b - 7*a*a + c = - a / denom + d = b / denom + +div :: F2 -> F2 -> F2 +div u v = mul u (inv v) + +-------------------------------------------------------------------------------- + +pow_ :: F2 -> Int -> F2 +pow_ x e = pow x (fromIntegral e) + +pow :: F2 -> Integer -> F2 +pow x e + | e == 0 = 1 + | e < 0 = pow (inv x) (negate e) + | otherwise = go 1 x e + where + go !acc _ 0 = acc + go !acc !s !expo = case expo .&. 1 of + 0 -> go acc (sqr s) (shiftR expo 1) + _ -> go (acc*s) (sqr s) (shiftR expo 1) + +-------------------------------------------------------------------------------- diff --git a/reference/src/Field/Goldilocks/Slow.hs b/reference/src/Field/Goldilocks/Slow.hs new file mode 100644 index 0000000..d82a15a --- /dev/null +++ b/reference/src/Field/Goldilocks/Slow.hs @@ -0,0 +1,119 @@ + +-- | Reference (slow) implementation of the Goldilocks prime field + +{-# LANGUAGE BangPatterns, NumericUnderscores #-} +module Field.Goldilocks.Slow where + +-------------------------------------------------------------------------------- + +import Prelude hiding ( div ) +import qualified Prelude + +import Data.Bits +import Data.Word +import Data.Ratio + +import System.Random + +import Text.Printf + +-------------------------------------------------------------------------------- + +type F = Goldilocks + +fromF :: F -> Word64 +fromF (MkGoldilocks x) = fromInteger x + +toF :: Word64 -> F +toF = mkGoldilocks . 
fromIntegral + +intToF :: Int -> F +intToF = mkGoldilocks . fromIntegral + +-------------------------------------------------------------------------------- + +newtype Goldilocks + = MkGoldilocks Integer + deriving Eq + +instance Show Goldilocks where + show (MkGoldilocks k) = printf "0x%016x" k + +-------------------------------------------------------------------------------- + +instance Num Goldilocks where + fromInteger = mkGoldilocks + negate = neg + (+) = add + (-) = sub + (*) = mul + abs = id + signum _ = MkGoldilocks 1 + +instance Fractional Goldilocks where + fromRational y = fromInteger (numerator y) `div` fromInteger (denominator y) + recip = inv + (/) = div + +instance Random Goldilocks where + -- random :: RandomGen g => g -> (a, g) + random g = let (x,g') = randomR (0,goldilocksPrime-1) g in (MkGoldilocks x, g') + randomR = error "randomR/Goldilocks: doesn't make much sense" + +-------------------------------------------------------------------------------- + +-- | @p = 2^64 - 2^32 + 1@ +goldilocksPrime :: Integer +goldilocksPrime = 0x_ffff_ffff_0000_0001 + +modp :: Integer -> Integer +modp a = mod a goldilocksPrime + +mkGoldilocks :: Integer -> Goldilocks +mkGoldilocks = MkGoldilocks . modp + +-- | A fixed generator of the multiplicative subgroup of the field +theMultiplicativeGenerator :: Goldilocks +theMultiplicativeGenerator = mkGoldilocks 7 + +-------------------------------------------------------------------------------- + +neg :: Goldilocks -> Goldilocks +neg (MkGoldilocks k) = mkGoldilocks (negate k) + +add :: Goldilocks -> Goldilocks -> Goldilocks +add (MkGoldilocks a) (MkGoldilocks b) = mkGoldilocks (a+b) + +sub :: Goldilocks -> Goldilocks -> Goldilocks +sub (MkGoldilocks a) (MkGoldilocks b) = mkGoldilocks (a-b) + +sqr :: Goldilocks -> Goldilocks +sqr x = mul x x + +mul :: Goldilocks -> Goldilocks -> Goldilocks +mul (MkGoldilocks a) (MkGoldilocks b) = mkGoldilocks (a*b) + +inv :: Goldilocks -> Goldilocks +inv x = pow x (goldilocksPrime - 2) + +div :: Goldilocks -> Goldilocks -> Goldilocks +div a b = mul a (inv b) + +-------------------------------------------------------------------------------- + +pow_ :: Goldilocks -> Int -> Goldilocks +pow_ x e = pow x (fromIntegral e) + +pow :: Goldilocks -> Integer -> Goldilocks +pow x e + | e == 0 = 1 + | e < 0 = pow (inv x) (negate e) + | otherwise = go 1 x e + where + go !acc _ 0 = acc + go !acc !s !expo = case expo .&. 
1 of + 0 -> go acc (sqr s) (shiftR expo 1) + _ -> go (acc*s) (sqr s) (shiftR expo 1) + +-------------------------------------------------------------------------------- + diff --git a/reference/src/Hash.hs b/reference/src/Hash.hs new file mode 100644 index 0000000..5f33d81 --- /dev/null +++ b/reference/src/Hash.hs @@ -0,0 +1,11 @@ + +module Hash + ( module Hash.Common + , module Hash.Sponge + , module Hash.Merkle + ) + where + +import Hash.Common +import Hash.Sponge +import Hash.Merkle diff --git a/reference/src/Hash/Common.hs b/reference/src/Hash/Common.hs new file mode 100644 index 0000000..a7dfef8 --- /dev/null +++ b/reference/src/Hash/Common.hs @@ -0,0 +1,94 @@ + +module Hash.Common where + +-------------------------------------------------------------------------------- + +import Data.Array +import Data.Bits +import Data.Word + +import Field.Goldilocks +import Field.Encode + +-------------------------------------------------------------------------------- + +data Hash + = Monolith + deriving (Eq,Show) + +hashT :: Hash -> Int +hashT hash = case hash of + Monolith -> 12 + +-------------------------------------------------------------------------------- + +newtype Rate + = Rate Int + deriving (Eq,Ord,Show) + +hashRate :: Hash -> Rate +hashRate hash = case hash of + Monolith -> Rate 8 + +-------------------------------------------------------------------------------- + +type State = Array Int F + +listToState' :: Int -> [F] -> State +listToState' n = listArray (0,n-1) + +listToState :: Hash -> [F] -> State +listToState hash = listToState' (hashT hash) + +zeroState' :: Int -> State +zeroState' n = listToState' n (replicate n 0) + +zeroState :: Hash -> State +zeroState hash = zeroState' (hashT hash) + +-------------------------------------------------------------------------------- + +data Digest + = MkDigest !F !F !F !F + deriving (Eq,Show) + +instance FieldEncode Digest where + fieldEncode (MkDigest a b c d) = [a,b,c,d] + +zeroDigest :: Digest +zeroDigest = MkDigest 0 0 0 0 + +extractDigest :: State -> Digest +extractDigest state = case elems state of + (a:b:c:d:_) -> MkDigest a b c d + +listToDigest :: [F] -> Digest +listToDigest [a,b,c,d] = MkDigest a b c d + +digestToList :: Digest -> [F] +digestToList (MkDigest a b c d) = [a,b,c,d] + +-------------------------------------------------------------------------------- + +digestToWord64s :: Digest -> [Word64] +digestToWord64s (MkDigest a b c d) = [ fromF a, fromF b, fromF c, fromF d] + +digestToBytes :: Digest -> [Word8] +digestToBytes = concatMap bytesFromWord64LE . digestToWord64s + +-------------------------------------------------------------------------------- + +bytesFromWord64LE :: Word64 -> [Word8] +bytesFromWord64LE = go 0 where + go 8 _ = [] + go !k !w = fromIntegral (w .&. 0xff) : go (k+1) (shiftR w 8) + +bytesToWord64LE :: [Word8] -> Word64 +bytesToWord64LE = fromInteger . 
bytesToIntegerLE + +bytesToIntegerLE :: [Word8] -> Integer +bytesToIntegerLE = go where + go [] = 0 + go (this:rest) = fromIntegral this + 256 * go rest + +-------------------------------------------------------------------------------- diff --git a/reference/src/Hash/Duplex/Monad.hs b/reference/src/Hash/Duplex/Monad.hs new file mode 100644 index 0000000..8c2fb09 --- /dev/null +++ b/reference/src/Hash/Duplex/Monad.hs @@ -0,0 +1,103 @@ + +-- | Monadic interface to do Fiat-Shamir challenges + +{-# LANGUAGE StrictData, GeneralizedNewtypeDeriving #-} +module Hash.Duplex.Monad where + +-------------------------------------------------------------------------------- + +import Data.Array + +import Control.Monad +import Control.Monad.Identity +import qualified Control.Monad.State.Strict as S +import Control.Monad.IO.Class + +import Text.Show.Pretty + +import Field.Goldilocks +import Hash.Common +import Hash.Duplex.Pure ( DuplexState, Squeeze, Absorb , theHashFunction ) +import qualified Hash.Duplex.Pure as Pure + +-------------------------------------------------------------------------------- +-- * Monadic interface + +newtype DuplexT m a + = DuplexT (S.StateT DuplexState m a) + deriving (Functor,Applicative,Monad) + +type Duplex a = DuplexT Identity a + +runDuplexT :: Monad m => DuplexT m a -> State -> m a +runDuplexT (DuplexT action) ini = S.evalStateT action (Pure.duplexInitialState ini) + +runDuplex :: Duplex a -> State -> a +runDuplex action ini = runIdentity (runDuplexT action ini) + +absorb :: (Monad m, Absorb a) => a -> DuplexT m () +absorb x = DuplexT $ S.modify (Pure.absorb x) + +squeeze :: (Monad m, Squeeze a) => DuplexT m a +squeeze = DuplexT $ S.state Pure.squeeze + +squeezeN :: (Monad m, Squeeze a) => Int -> DuplexT m [a] +squeezeN n = DuplexT $ S.state (Pure.squeezeN n) + +-- | For debugging only +inspectDuplexState :: Monad m => DuplexT m (DuplexState) +inspectDuplexState = DuplexT S.get + +-------------------------------------------------------------------------------- +-- * Access to the internal state (so that we can implement grinding) + +unsafeGetInnerState :: Monad m => DuplexT m DuplexState +unsafeGetInnerState = DuplexT S.get + +unsafeSetInnerState :: Monad m => DuplexState -> DuplexT m () +unsafeSetInnerState s = DuplexT (S.put s) + +-------------------------------------------------------------------------------- +-- * Duplex in IO + +type DuplexIO a = DuplexT IO a + +instance MonadIO (DuplexT IO) where + liftIO action = DuplexT (liftIO action) + +duplexPutStrLn :: String -> DuplexIO () +duplexPutStrLn s = DuplexT (liftIO $ putStrLn s) + +duplexPrint_ :: Show a => a -> DuplexIO () +duplexPrint_ x = DuplexT (liftIO $ print x) + +duplexPrint :: Show a => String -> a -> DuplexIO () +duplexPrint n x = DuplexT (liftIO $ putStrLn $ n ++ " = " ++ show x) + +duplexPPrint :: Show a => String -> a -> DuplexIO () +duplexPPrint n x = DuplexT (liftIO $ putStrLn $ n ++ ":\n\n" ++ ppShow x ++ "\n") + +printDuplexState :: DuplexIO () +printDuplexState = duplexPrint "state" =<< inspectDuplexState + +runDuplexIO :: DuplexIO a -> State -> IO a +runDuplexIO = runDuplexT + +runDuplexIO_ :: DuplexIO a -> IO a +runDuplexIO_ action + = runDuplexIO action + $ zeroState theHashFunction + +-------------------------------------------------------------------------------- + +duplexTest :: Int -> IO () +duplexTest m = runDuplexT action (zeroState theHashFunction) where + + action :: DuplexIO () + action = do + forM_ [0..19] $ \(k :: Int) -> do + absorb (map intToF [1..k]) + ys <- squeezeN k :: DuplexIO 
[F] + duplexPrint_ ys + +-------------------------------------------------------------------------------- diff --git a/reference/src/Hash/Duplex/Pure.hs b/reference/src/Hash/Duplex/Pure.hs new file mode 100644 index 0000000..c8535a0 --- /dev/null +++ b/reference/src/Hash/Duplex/Pure.hs @@ -0,0 +1,114 @@ + +-- | Duplex sponge used for Fiat-Shamir challenges + +{-# LANGUAGE StrictData, GeneralizedNewtypeDeriving #-} +module Hash.Duplex.Pure + ( DuplexState + , duplexInitialState + , Absorb(..) + , Squeeze(..) + , squeezeN + , theHashFunction + ) + where + +-------------------------------------------------------------------------------- + +import Data.Array + +import Field.Goldilocks ( F ) +import Field.Goldilocks.Extension ( FExt , F2(..) ) +import Hash.Permutations +import Hash.Common + +-------------------------------------------------------------------------------- + +theHashFunction :: Hash +theHashFunction = Monolith + +-------------------------------------------------------------------------------- + +-- | Duplex sponge construction with overwrite mode +data DuplexState + = Absorbing { duplexOld :: State, duplexInp :: [F] } + | Squeezing { duplexOld :: State, duplexOut :: [F] } + deriving (Eq,Show) + +duplexInitialState :: State -> DuplexState +duplexInitialState state = Absorbing state [] + +overwrite :: [F] -> State -> State +overwrite new old = listToState theHashFunction $ new ++ drop (length new) (elems old) + +duplex :: [F] -> State -> State +duplex inp old = permute theHashFunction (overwrite inp old) + +extract :: State -> [F] +extract state = reverse $ take rate (elems state) where + rate = 8 + +freshSqueezing :: State -> DuplexState +freshSqueezing new = Squeezing new (extract new) + +-------------------------------------------------------------------------------- + +absorbFelt :: F -> DuplexState -> DuplexState +absorbFelt x mode = + case mode of + Squeezing old _ -> absorbFelt x (Absorbing old []) + Absorbing old inp -> if length inp < rate + then Absorbing old (inp ++ [x]) + else absorbFelt x $ Absorbing (duplex inp old) [] + where + rate = 8 + +squeezeFelt :: DuplexState -> (F, DuplexState) +squeezeFelt mode = + case mode of + Squeezing old out -> case out of + [] -> let new = permute theHashFunction old + in squeezeFelt $ freshSqueezing new + (y:ys) -> (y, Squeezing old ys) + Absorbing old inp -> case inp of + [] -> squeezeFelt $ freshSqueezing (permute theHashFunction old) + (x:xs) -> squeezeFelt $ freshSqueezing (duplex inp old) + +-------------------------------------------------------------------------------- + +class Absorb a where + absorb :: a -> DuplexState -> DuplexState + +instance Absorb F where + absorb = absorbFelt + +instance Absorb FExt where absorb (F2 a b) = absorb [a,b] + +instance Absorb a => Absorb [a] where + absorb [] = id + absorb (x:xs) = absorb xs . 
absorb x + +instance Absorb Digest where absorb h = absorb (digestToList h) + +-- instance Absorb MerkleCap where +-- absorb (MkMerkleCap digests) = absorb digests + +-------------------------------------------------------------------------------- + +class Squeeze a where + squeeze :: DuplexState -> (a, DuplexState) + +squeezeN :: Squeeze a => Int -> DuplexState -> ([a], DuplexState) +squeezeN 0 state0 = ([],state0) +squeezeN n state0 = let (x , state1) = squeeze state0 + (xs , state2) = squeezeN (n-1) state1 + in (x:xs, state2) + +instance Squeeze F where squeeze = squeezeFelt + +instance Squeeze FExt where + squeeze state0 = + let (x, state1) = squeeze state0 + (y, state2) = squeeze state1 + in (F2 x y, state2) + +-------------------------------------------------------------------------------- diff --git a/reference/src/Hash/Merkle.hs b/reference/src/Hash/Merkle.hs new file mode 100644 index 0000000..3126900 --- /dev/null +++ b/reference/src/Hash/Merkle.hs @@ -0,0 +1,298 @@ + +{-| Merkle tree construction (using a T=12 hash) + +Conventions: + + * we use a "keyed compression function" to avoid collisions for different inputs + + * when hashing the bottom-most layer, we use the key bit 0x01 + + * when hashing an odd layer, we pad with a single 0 hash and use the key bit 0x02 + + * when building a tree on a singleton input, we apply 1 round of compression + (with key 0x03, as it's both the bottom-most layer and odd) + +-} + +{-# LANGUAGE StrictData #-} +module Hash.Merkle where + +-------------------------------------------------------------------------------- + +import Data.Array +import Data.Bits + +import Field.Goldilocks +import Field.Goldilocks.Extension ( FExt , F2(..) ) +import Field.Encode + +import Hash.Permutations +import Hash.Common +import Hash.Sponge + +import Misc + +-------------------------------------------------------------------------------- + +type Key = Int + +theHashFunction :: Hash +theHashFunction = Monolith + +-------------------------------------------------------------------------------- + +type FRow = Array Int F + +hashFRow :: FRow -> Digest +hashFRow farr = hashFieldElems theHashFunction (elems farr) + +hashFExt :: FExt -> Digest +hashFExt (F2 x y) = hashFieldElems theHashFunction [x,y] + +{- +data LeafData + = RowData FRow + | Singleton FExt + deriving (Eq,Show) + +instance FieldEncode LeafData where + fieldEncode (RowData farr) = elems farr + fieldEncode (Singleton fext) = fieldEncode fext + +hashLeafData :: LeafData -> Digest +hashLeafData leaf = case leaf of + RowData frow -> hashFRow frow + Singleton fext -> hashFExt fext +-} + +hashAny :: FieldEncode a => a -> Digest +hashAny = hashFieldElems theHashFunction . 
fieldEncode + +-------------------------------------------------------------------------------- + +newtype MerkleCap + = MkMerkleCap { fromMerkleCap :: Array Int Digest } + deriving (Eq,Show) + +instance FieldEncode MerkleCap where + fieldEncode (MkMerkleCap arr) = concatMap fieldEncode (elems arr) + +merkleCapSize :: MerkleCap -> Int +merkleCapSize (MkMerkleCap ds) = (arrayLength ds) + +merkleCapLogSize :: MerkleCap -> Log2 +merkleCapLogSize (MkMerkleCap ds) = exactLog2__ (arrayLength ds) + +-- | Computes the root of a Merkle cap +-- +-- (we implicitly assume that the cap was not the bottom layer) +merkleCapRoot :: MerkleCap -> Digest +merkleCapRoot (MkMerkleCap hashArray) = + case elems hashArray of + [] -> error "merkleCapRoot: fatal: input is empty" + [z] -> keyedCompress theHashFunction (nodeKey BottomLayer OddNode) z zeroDigest + zs -> go zs + where + go :: [Digest] -> Digest + go [x] = x + go xs = go (map (evenOddCompressPair OtherLayer) $ eiPairs xs) + +-------------------------------------------------------------------------------- + +-- | Note: index 0 is the bottom (widest) layer +data MerkleTree a = MkMerkleTree + { _merkleTree :: Array Int (Array Int Digest) + , _merkleLeaves :: Array Int a -- LeafData + } + deriving Show + +-- | @log2( number-of-leaves )@. +-- +-- NOTE: this is one less than the actual number of layers! +-- However it equals to the length of a Merkle path +-- +merkleTreeDepth :: MerkleTree a -> Log2 +merkleTreeDepth = Log2 . merkleTreeDepth_ + +merkleTreeDepth_ :: MerkleTree a -> Int +merkleTreeDepth_ (MkMerkleTree outer _) = (b - a) where + (a,b) = bounds outer + +extractMerkleCap :: Log2 -> MerkleTree a -> MerkleCap +extractMerkleCap (Log2 capdepth) (MkMerkleTree layers _) = cap where + (0,n) = bounds layers + cap = MkMerkleCap (layers ! (n-capdepth)) + +treeBottomLayer :: MerkleTree a -> Array Int Digest +treeBottomLayer (MkMerkleTree outer _) = outer!0 + +-------------------------------------------------------------------------------- + +-- | Only the Merkle path (siblings) +newtype RawMerklePath + = MkRawMerklePath [Digest] + deriving (Eq,Show) + +instance FieldEncode RawMerklePath where + fieldEncode (MkRawMerklePath ds) = concatMap fieldEncode ds + +data MerkleProof a = MkMerkleProof + { _leafIndex :: Int -- ^ linear index of the leaf we prove, 0..dataSize-1 + , _leafData :: a -- ^ the data on the leaf + , _merklePath :: RawMerklePath -- ^ the path up the root + , _dataSize :: Int -- ^ number of leaves in the tree + } + deriving (Eq,Show) + +-- | Returns the leaf and Merkle path of the given leaf +extractMerkleProof :: MerkleTree a -> Int -> MerkleProof a +extractMerkleProof = extractMerkleProof' (Log2 0) + +-- | Returns the leaf and Merkle path of the given leaf, up to a given Merkle cap depth +extractMerkleProof' :: Log2 -> MerkleTree a -> Int -> MerkleProof a +extractMerkleProof' (Log2 capDepth) tree@(MkMerkleTree outer leaves) idx = MkMerkleProof idx leaf path size where + leaf = leaves!idx + size = arrayLength (outer!0) + depth = merkleTreeDepth_ tree + path = MkRawMerklePath $ takePrefix (worker depth idx) + + worker 0 0 = [] + worker 0 _ = error "extractMerkleProof: this should not happen" + worker level j = this : worker (level-1) (shiftR j 1) where + this = outer ! (depth - level) ! 
(j `xor` 1) + + takePrefix = take (depth - capDepth) + +-------------------------------------------------------------------------------- + +calcMerkleTree' :: [Digest] -> [Array Int Digest] +calcMerkleTree' input = + case input of + [] -> error "calcMerkleTree': input is empty" + [z] -> [ singletonArray $ keyedCompress theHashFunction (nodeKey BottomLayer OddNode) z zeroDigest ] + zs -> map listToArray (go layerFlags zs) + where + go :: [LayerFlag] -> [Digest] -> [[Digest]] + go _ [x] = [[x]] + go (f:fs) xs = xs : go fs (map (evenOddCompressPair f) $ eiPairs xs) + +calcMerkleTree :: FieldEncode a => [a] -> MerkleTree a +calcMerkleTree input = MkMerkleTree tree leafData where + tree = listToArray (calcMerkleTree' $ map hashAny input) + leafData = listToArray input + +calcArrayMerkleTree :: FieldEncode a => Array Int a -> MerkleTree a +calcArrayMerkleTree = calcMerkleTree . elems + +-- | Applies a permutation of the rows. +-- +-- We need the backward mapping (from Merkle tree indices to array indices) +calcArrayMerkleTree' :: FieldEncode a => (Int -> Int) -> Array Int a -> MerkleTree a +calcArrayMerkleTree' bwd arr = calcMerkleTree [ arr!(bwd i) | i<-[0..n-1] ] where + n = arraySize arr + +-------------------------------------------------------------------------------- + +reconstructMerkleRoot :: FieldEncode a => MerkleProof a -> Digest +reconstructMerkleRoot (MkMerkleProof idx leaf (MkRawMerklePath path) size) = digest where + + digest = go layerFlags size idx (hashAny leaf) path + + go :: [LayerFlag] -> Int -> Int -> Digest -> [Digest] -> Digest + go _ !sz 0 !h [] = h + go (f:fs) !sz !j !h !(p:ps) = case (j.&.1, j==sz-1) of + (0, False) -> go fs sz' j' (evenOddCompressPair f $ Right (h,p)) ps + (0, True ) -> go fs sz' j' (evenOddCompressPair f $ Left h ) ps + (1, _ ) -> go fs sz' j' (evenOddCompressPair f $ Right (p,h)) ps + where + sz' = shiftR (sz+1) 1 + j' = shiftR j 1 + +-------------------------------------------------------------------------------- + +compress :: Hash -> Digest -> Digest -> Digest +compress which (MkDigest a b c d) (MkDigest p q r s) = extractDigest output where + input = listArray (0,11) [ a,b,c,d , p,q,r,s , 0,0,0,0 ] + output = permute which input + +keyedCompress :: Hash -> Key -> Digest -> Digest -> Digest +keyedCompress which key (MkDigest a b c d) (MkDigest p q r s) = extractDigest output where + k = fromIntegral key :: F + input = listArray (0,11) [ a,b,c,d , p,q,r,s , k,0,0,0 ] + output = permute which input + +-------------------------------------------------------------------------------- + +-- | bit masks +keyBottom = 1 :: Key +keyOdd = 2 :: Key + +-------------------------------------------------------------------------------- + +data LayerFlag + = BottomLayer -- ^ it's the bottom (initial, widest) layer + | OtherLayer -- ^ it's not the bottom layer + deriving (Eq,Show) + +data NodeParity + = EvenNode -- ^ it has 2 children + | OddNode -- ^ it has 1 child + deriving (Eq,Show) + +-- | Key based on the node type: +-- +-- > bit0 := 1 if bottom layer, 0 otherwise +-- > bit1 := 1 if odd, 0 if even +-- +nodeKey :: LayerFlag -> NodeParity -> Key +nodeKey OtherLayer EvenNode = 0x00 +nodeKey BottomLayer EvenNode = 0x01 +nodeKey OtherLayer OddNode = 0x02 +nodeKey BottomLayer OddNode = 0x03 + +evenOddCompressPair :: LayerFlag -> Either Digest (Digest,Digest) -> Digest +evenOddCompressPair !lf (Right (x,y)) = keyedCompress theHashFunction (nodeKey lf EvenNode) x y +evenOddCompressPair !lf (Left x ) = keyedCompress theHashFunction (nodeKey lf OddNode ) x zeroDigest + 
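+-- A small worked example (the leaf hashes @h0..h4@ are hypothetical): a
+-- bottom layer of five nodes is paired up by 'eiPairs' (just below) as
+--
+-- > eiPairs [h0,h1,h2,h3,h4] == [ Right (h0,h1), Right (h2,h3), Left h4 ]
+--
+-- so the two full pairs are compressed with key 0x01 (bottom + even) and the
+-- trailing odd node with key 0x03 (bottom + odd), as per 'nodeKey' above.
+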
+eiPairs :: [a] -> [Either a (a,a)] +eiPairs [] = [] +eiPairs [x] = Left x : [] +eiPairs (x:y:rest) = Right (x,y) : eiPairs rest + +layerFlags :: [LayerFlag] +layerFlags = BottomLayer : repeat OtherLayer + +calcMerkleRoot' :: [Digest] -> Digest +calcMerkleRoot' input = + case input of + [] -> error "calcMerkleRoot: input is empty" + [z] -> keyedCompress theHashFunction (nodeKey BottomLayer OddNode) z zeroDigest + zs -> go layerFlags zs + where + go :: [LayerFlag] -> [Digest] -> Digest + go _ [x] = x + go (f:fs) xs = go fs (map (evenOddCompressPair f) $ eiPairs xs) + +calcMerkleRoot :: FieldEncode a => [a] -> Digest +calcMerkleRoot = calcMerkleRoot' . map hashAny -- hashLeafData + +-------------------------------------------------------------------------------- + +calcMerkleCap' :: Log2 -> [Digest] -> MerkleCap +calcMerkleCap' (Log2 capDepth) input = + case input of + [] -> error "calcMerkleRoot: input is empty" + [z] -> MkMerkleCap $ listToArray $ [ keyedCompress theHashFunction (nodeKey BottomLayer OddNode) z zeroDigest ] + zs -> MkMerkleCap $ listToArray $ select $ go layerFlags zs + where + go :: [LayerFlag] -> [Digest] -> [[Digest]] + go _ [x] = [[x]] + go (f:fs) xs = xs : go fs (map (evenOddCompressPair f) $ eiPairs xs) + + select :: [[Digest]] -> [Digest] + select xs = xs !! (length xs - 1 - capDepth) + +calcMerkleCap :: FieldEncode a => Log2 -> [a] -> MerkleCap +calcMerkleCap capDepth = calcMerkleCap' capDepth . map hashAny + +-------------------------------------------------------------------------------- diff --git a/reference/src/Hash/Monolith/Constants.hs b/reference/src/Hash/Monolith/Constants.hs new file mode 100644 index 0000000..1f597b0 --- /dev/null +++ b/reference/src/Hash/Monolith/Constants.hs @@ -0,0 +1,83 @@ + +module Hash.Monolith.Constants where + +-------------------------------------------------------------------------------- + +import Data.Array + +import Field.Goldilocks + +-------------------------------------------------------------------------------- + +monolithRoundConstants :: Array (Int,Int) F +monolithRoundConstants = listArray ((0,0),(5,11)) $ map toF $ concat + [ [ 0xbcaf2516e5926dcf + , 0x4ec5a76bce1e7676 + , 0x9d804725bebb56ab + , 0x2ec05fca215a5be3 + , 0xe16274e4acab86a0 + , 0x80b0fddcc3c4380f + , 0xc87c769ad77ffece + , 0x37f85ec9117d287c + , 0x3b8d825b014c458d + , 0xb7a01d0cb850d75e + , 0x1333b751bac704bd + , 0x7b7ef14183d47b6f + ] + , [ 0x2114517643e3b286 + , 0x542d15ea3cd12ade + , 0xe847d363f17a93e9 + , 0x24f0421c6ff41c56 + , 0x66e3eda93e2ca216 + , 0xfb88d475279cb568 + , 0x7f421c6269938a22 + , 0xdbb973acce857401 + , 0xe172409cb1563a6a + , 0x996f729f6340447d + , 0x925c579738b6fa4a + , 0x752e9ec9e0b34686 + ] + , [ 0xdb419e0bd38469bd + , 0xba41cee828bd26d8 + , 0xd6630f8f0969db39 + , 0x2340e955ae2f0d94 + , 0x282f553d35872e2e + , 0x77f7c3ff1ae496b3 + , 0xf5f2efab64bc5eef + , 0x47b23a00830284f4 + , 0xe18a2d2242486fa + , 0x3d101838a773dab0 + , 0x47d686fd16856524 + , 0x3eb2d254189b3534 + ] + , [ 0xfe886e291ca8c5bd + , 0xb97ec74df1e4b0b6 + , 0x574fdef3a600e370 + , 0x8ad61c6f132d4feb + , 0x41e69ca4ecc7e8c7 + , 0x151ad562e1f90ca4 + , 0x747c051439a5603c + , 0x990151d3e52d502c + , 0x532c7f258282ea12 + , 0x65e62cb34275dd5 + , 0x5288008954f5d0b2 + , 0xee7c3407cf3d6e02 + ] + , [ 0xda07029808bad5de + , 0x7bebdf38dcc7a673 + , 0x20a3f252688c312d + , 0x9c5248f7bbf8d188 + , 0xcf1cf778994382d4 + , 0x8c434b1738b8338c + , 0xfe504398813b67a8 + , 0xe879562fdef813b9 + , 0xd4666793b2a2f191 + , 0xd9096b87de22de01 + , 0xcaf4cea5f22abf34 + , 0x3128d1e75d0204fa + ] + , replicate 
12 0 + ] + +-------------------------------------------------------------------------------- + diff --git a/reference/src/Hash/Monolith/Permutation.hs b/reference/src/Hash/Monolith/Permutation.hs new file mode 100644 index 0000000..60da24a --- /dev/null +++ b/reference/src/Hash/Monolith/Permutation.hs @@ -0,0 +1,64 @@ + +-- | The Monolith permutation with @t = 12@ + +{-# LANGUAGE Strict #-} +module Hash.Monolith.Permutation where + +-------------------------------------------------------------------------------- + +import Data.Array (Array) +import Data.Array.IArray +import Data.Bits +import Data.Word + +import Field.Goldilocks +import Hash.Monolith.Constants +import Hash.Common + +-------------------------------------------------------------------------------- + +permutation :: State -> State +permutation + = foldr1 (.) (map monolithRound $ reverse [0..5]) + . linearDiffusion + +monolithRound :: Int -> State -> State +monolithRound ridx = concrete ridx . bricks . bars + +-------------------------------------------------------------------------------- + +sboxByte :: Word8 -> Word8 +sboxByte y = rol1 $ y `xor` (rol1 ny .&. rol2 y .&. rol3 y) where + ny = complement y + rol1 = flip rotateL 1 + rol2 = flip rotateL 2 + rol3 = flip rotateL 3 + +sboxField :: F -> F +sboxField = toF . bytesToWord64LE . map sboxByte . bytesFromWord64LE . fromF + +bars :: State -> State +bars old = case splitAt 4 (elems old) of + (four,eight) -> listToState' 12 (map sboxField four ++ eight) + +bricks :: State -> State +bricks old = listToState' 12 $ zipWith (+) (0 : map sqr xs) xs where xs = elems old + +concrete' :: [F] -> State -> State +concrete' rcs = listToState' 12 . zipWith (+) rcs . elems . linearDiffusion + +concrete :: Int -> State -> State +concrete ridx = concrete' [ monolithRoundConstants ! (ridx,j) | j<-[0..11] ] + +-------------------------------------------------------------------------------- + +circulantRow :: State +circulantRow = listToState' 12 [ 7, 23, 8, 26, 13, 10, 9, 7, 6, 22, 21, 8 ] + +linearDiffusion :: State -> State +linearDiffusion old = listToState' 12 + [ sum [ old!j * circulantRow!(mod (j-k) 12) | j<-[0..11] ] + | k <- [0..11] + ] + +-------------------------------------------------------------------------------- diff --git a/reference/src/Hash/Permutations.hs b/reference/src/Hash/Permutations.hs new file mode 100644 index 0000000..6fe3148 --- /dev/null +++ b/reference/src/Hash/Permutations.hs @@ -0,0 +1,16 @@ + +module Hash.Permutations where + +-------------------------------------------------------------------------------- + +import qualified Hash.Monolith.Permutation as Monolith + +import Hash.Common + +-------------------------------------------------------------------------------- + +permute :: Hash -> State -> State +permute hash = case hash of + Monolith -> Monolith.permutation + +-------------------------------------------------------------------------------- diff --git a/reference/src/Hash/Sponge.hs b/reference/src/Hash/Sponge.hs new file mode 100644 index 0000000..24fc7f8 --- /dev/null +++ b/reference/src/Hash/Sponge.hs @@ -0,0 +1,119 @@ + +{-# LANGUAGE ScopedTypeVariables, NumericUnderscores #-} + +{-| Sponge construction + +Conventions: + + * when hashing a sequence of field elements, we pad using the @10*@ padding + strategy to the next multiple of the rate + + * when hashing a sequence of bytes, we only allow a rate of 4 or 8; we pad + to a multiple of 31 or 62 bytes (depending on the rate) using again the + @10*@ strategy, but now with bytes. 
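(For example,
+    with rate 8, a 100-byte input becomes 101 bytes after the @0x01@ marker
+    and is then zero-padded to 124 bytes, i.e. two 62-byte blocks; an
+    illustrative count worked out from the rule above.)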
We don't do extra padding on the + resulting field element sequence, as it's unnecessary. + + * when converting 31 bytes to 4 field elements, we use 62 bits for each + field element, interpreting them as a little-endian 62 bit numbers. + + * when serializing a digest of four field elements, we interpret them + as 64 bit numbers (resulting in a 32 byte long hash digest) + +-} + +module Hash.Sponge where + +-------------------------------------------------------------------------------- + +import Data.Array +import Data.Bits +import Data.Word +import Data.List + +import Field.Goldilocks +import Hash.Permutations +import Hash.Common + +-------------------------------------------------------------------------------- + +-- | Pad with @10*@ strategy +splitAndPadSequence :: forall a. Num a => Int -> [a] -> [[a]] +splitAndPadSequence r xs = go xs1 where + xs1 = xs ++ [0x01] + go :: [a] -> [[a]] + go list = case splitAt r list of + (this,rest) -> case rest of + [] -> [this ++ replicate (r - length this) 0] + _ -> this : go rest + +-------------------------------------------------------------------------------- + +hashFieldElems :: Hash -> [F] -> Digest +hashFieldElems which = hashFieldElems' which (Rate 8) + +hashFieldElems' :: Hash -> Rate -> [F] -> Digest +hashFieldElems' which rate@(Rate r) fels + | r < 1 || r > 8 = error "the rate should be between 1 and 8" + | otherwise = internalSponge which 63 rate (splitAndPadSequence r fels) + +-- | @nbits@ is how many bits is the size of a single element of the original input sequence. +-- This is used for domain separation, which is encoded as @domSep = 65536*nbits + 256*t + r@. +-- +-- Some possible values: +-- +-- * 1 for bit sequence +-- +-- * 8 for byte sequence +-- +-- * 63 for field element sequence +-- +internalSponge :: Hash -> Int -> Rate -> [[F]] -> Digest +internalSponge which nbits (Rate r) blocks = extractDigest (loop blocks iv) where + iv = listArray (0,11) $ [ 0,0,0,0 , 0,0,0,0 , domSep,0,0,0 ] :: State + domSep = fromIntegral (65536*nbits + 256*t + r) :: F + t = 12 + + step :: [F] -> State -> State + step block state = permute which (addToState block state) + + loop :: [[F]] -> State -> State + loop list state = case list of + (this:rest) -> loop rest (step this state) + [] -> state + +addToState :: [F] -> State -> State +addToState xs arr = listArray (0,11) $ zipWith (+) (xs ++ repeat 0) (elems arr) + +-------------------------------------------------------------------------------- + +hashBytes :: Hash -> [Word8] -> Digest +hashBytes which = hashBytes' which (Rate 8) + +hashBytes' :: Hash -> Rate -> [Word8] -> Digest +hashBytes' which rate input = case rate of + Rate 4 -> internalSponge which nbits rate $ map decode31Bytes $ splitAndPadSequence 31 input + Rate 8 -> internalSponge which nbits rate $ map decode62Bytes $ splitAndPadSequence 62 input + _ -> error "for hashing of byte sequences, we only support rate = 4 or 8" + where + nbits = 8 + +-------------------------------------------------------------------------------- + +mask62bits :: Integer -> Word64 +mask62bits n = fromInteger (n .&. 
0x_3fff_ffff_ffff_ffff)
+
+decode31Bytes :: [Word8] -> [F]
+decode31Bytes input
+  | length input /= 31  = error "decode31Bytes: input is not exactly 31 bytes"
+  | otherwise           = [a,b,c,d]
+  where
+    a = toF $ mask62bits $                 bytesToIntegerLE input
+    b = toF $ mask62bits $ flip shiftR 6 $ bytesToIntegerLE $ drop  7 input
+    c = toF $ mask62bits $ flip shiftR 4 $ bytesToIntegerLE $ drop 15 input
+    d = toF $ mask62bits $ flip shiftR 2 $ bytesToIntegerLE $ drop 23 input
+
+decode62Bytes :: [Word8] -> [F]
+decode62Bytes input = decode31Bytes as ++ decode31Bytes bs where
+  (as,bs) = splitAt 31 input
+
+--------------------------------------------------------------------------------
diff --git a/reference/src/Misc.hs b/reference/src/Misc.hs
new file mode 100644
index 0000000..1a45253
--- /dev/null
+++ b/reference/src/Misc.hs
@@ -0,0 +1,177 @@
+
+module Misc where
+
+--------------------------------------------------------------------------------
+
+import Data.Bits
+import Data.Array
+import Data.List
+
+import qualified Data.Set as Set ; import Data.Set (Set)
+
+import Debug.Trace
+
+--------------------------------------------------------------------------------
+-- * Debug
+
+debug_ :: Show a => a -> b -> b
+debug_ x y = trace (">>> " ++ show x) y
+
+debug :: Show a => String -> a -> b -> b
+debug n x y = trace (">>> " ++ n ++ " = " ++ show x) y
+
+--------------------------------------------------------------------------------
+-- * Integers
+
+isEven :: Integer -> Bool
+isEven n = (n .&. 1) == 0
+
+isOdd :: Integer -> Bool
+isOdd n = (n .&. 1) /= 0
+
+--------------------------------------------------------------------------------
+-- * Strings
+
+-- | The difference from 'unlines' is that this one doesn't add a final newline
+unlines1 :: [String] -> String
+unlines1 = intercalate "\n"
+
+--------------------------------------------------------------------------------
+-- * Log2
+
+newtype Log2
+  = Log2 Int
+  deriving (Eq,Ord,Show,Num)
+
+fromLog2 :: Log2 -> Int
+fromLog2 (Log2 k) = k
+
+exp2 :: Log2 -> Integer
+exp2 (Log2 k) = shiftL 1 k
+
+exp2_ :: Log2 -> Int
+exp2_ (Log2 k) = shiftL 1 k
+
+-- | Smallest integer @k@ such that @2^k@ is greater than or equal to @n@
+ceilingLog2 :: Integer -> Log2
+ceilingLog2 = Log2 . wrapper where
+  wrapper 0 = 0
+  wrapper n = 1 + go (n-1) where
+    go 0 = -1
+    go k = 1 + go (shiftR k 1)
+
+exactLog2 :: Integer -> Maybe Log2
+exactLog2 n = if re == n then Just log2 else Nothing where
+  log2 = ceilingLog2 n
+  re   = exp2 log2
+
+exactLog2_ :: Integer -> Log2
+exactLog2_ n = case exactLog2 n of
+  Just log2 -> log2
+  Nothing   -> error "exactLog2_: not a power of two"
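+
+-- A few illustrative values (worked out by hand, not from the original source):
+--
+-- > exp2 (Log2 4)   == 16
+-- > ceilingLog2 100 == Log2 7
+-- > exactLog2 64    == Just (Log2 6)
+-- > exactLog2 100   == Nothing
+
+exactLog2__ :: Int -> Log2
+exactLog2__ = exactLog2_ . 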
fromIntegral + +-------------------------------------------------------------------------------- +-- * Lists + +safeZipWith :: (a -> b -> c) -> [a] -> [b] -> [c] +safeZipWith f = go where + go [] [] = [] + go (x:xs) (y:ys) = f x y : go xs ys + go _ _ = error "safeZipWith: incompatible lengths" + +interleave :: [a] -> [a] -> [a] +interleave (x:xs) (y:ys) = x:y:interleave xs ys +interleave [] [] = [] +interleave _ _ = error "interleave: expecting input lists of the same length" + +partitionIntoChunks :: Int -> [a] -> [[a]] +partitionIntoChunks k = go where + go [] = [] + go xs = take k xs : go (drop k xs) + +nubOrd :: Ord a => [a] -> [a] +nubOrd = worker Set.empty where + worker _ [] = [] + worker s (x:xs) + | Set.member x s = worker s xs + | otherwise = x : worker (Set.insert x s) xs + +-------------------------------------------------------------------------------- +-- * Arrays + +singletonArray :: a -> Array Int a +singletonArray x = listArray (0,0) [x] + +listToArray :: [a] -> Array Int a +listToArray xs = listArray (0, length xs - 1) xs + +makeArray :: Int -> (Int -> a) -> Array Int a +makeArray n fun = listArray (0,n-1) [ fun i | i<-[0..n-1] ] + +arrayLength :: Array Int a -> Int +arrayLength arr = b - a + 1 where (a,b) = bounds arr + +-- | Synonym for 'arrayLength' +arraySize :: Array Int a -> Int +arraySize = arrayLength + +-- | Returns the default value when out of range +safeIndex :: a -> Array Int a -> Int -> a +safeIndex def arr j + | j < a = def + | j > b = def + | otherwise = arr!j + where + (a,b) = bounds arr + +interleaveArrays' :: Array Int (Array Int a) -> Array Int a +interleaveArrays' arrs + | nubOrd (elems sizes) == [n] = big + | otherwise = error "interleaveArrays': incompatible array sizes" + where + m = arraySize arrs + sizes = fmap arrayLength arrs + n = sizes!0 + big = listArray (0,n*m-1) [ (arrs!j)!i | i<-[0..n-1] , j<-[0..m-1] ] + +interleaveArrays :: [Array Int a] -> Array Int a +interleaveArrays arrayList = interleaveArrays' (listToArray arrayList) + +-- | This is the inverse of @interleaveArrays@. The integer parameter is the number +-- of output vectors (or \"stride\") +untangleArray :: Int -> Array Int a -> [Array Int a] +untangleArray stride input + | r /= 0 = error "untangleArrays: input array's size is not divisible by the stride" + | otherwise = pieces + where + n = arraySize input + (q,r) = divMod n stride + pieces = [ extractCosetArray j stride input | j<-[0..stride-1] ] +{- + pieces = + [ listArray (0,q-1) [ input ! (j + i*stride) | i <- [0..q-1] ] + | j <- [0..stride-1] + ] +-} + +untangleArray' :: Int -> Array Int a -> Array Int (Array Int a) +untangleArray' stride = listToArray . untangleArray stride + +-- | This extracts a subarray with indices of the form @[ offset + i*stride | i<-[0..n-1] ]@ +extractCosetArray :: Int -> Int -> Array Int a -> Array Int a +extractCosetArray offset stride input + | r /= 0 = error "extractCosetArray: input array's size is not divisible by the stride" + | otherwise = piece + where + n = arraySize input + (q,r) = divMod n stride + piece = listArray (0,q-1) [ input ! 
+
+-- | This extracts a subarray with indices of the form @[ i*stride | i<-[0..n-1] ]@
+extractSubgroupArray :: Int -> Array Int a -> Array Int a
+extractSubgroupArray stride = extractCosetArray 0 stride
+
+--------------------------------------------------------------------------------
diff --git a/reference/src/NTT.hs b/reference/src/NTT.hs
new file mode 100644
index 0000000..8fe02a0
--- /dev/null
+++ b/reference/src/NTT.hs
@@ -0,0 +1,15 @@
+
+-- | Number-theoretic transform (FFT)
+
+module NTT
+  ( module Field.Goldilocks
+  , module NTT.Subgroup
+  , module NTT.Poly
+  , module NTT.Slow
+  ) where
+
+import Field.Goldilocks
+import NTT.Subgroup
+import NTT.Poly
+import NTT.Slow
+
diff --git a/reference/src/NTT/Poly.hs b/reference/src/NTT/Poly.hs
new file mode 100644
index 0000000..8a8f45d
--- /dev/null
+++ b/reference/src/NTT/Poly.hs
@@ -0,0 +1,221 @@
+
+-- | Dense univariate polynomials
+
+{-# LANGUAGE StrictData, BangPatterns, ScopedTypeVariables, DeriveFunctor #-}
+module NTT.Poly where
+
+--------------------------------------------------------------------------------
+
+import Data.List
+import Data.Array
+import Data.Array.ST (STArray)
+import Data.Array.MArray (newArray, readArray, writeArray, thaw, freeze)
+
+import Control.Monad
+import Control.Monad.ST.Strict
+
+import System.Random
+
+import Field.Goldilocks
+import Field.Goldilocks.Extension ( FExt )
+import Field.Encode
+
+import Misc
+
+--------------------------------------------------------------------------------
+-- * Univariate polynomials
+
+-- | A dense univariate polynomial. The array index corresponds to the exponent.
+newtype Poly a
+  = Poly (Array Int a)
+  deriving (Show,Functor)
+
+instance (Num a, Eq a) => Eq (Poly a) where
+  p == q = polyIsZero (polySub p q)
+
+instance FieldEncode (Poly F) where
+  fieldEncode (Poly arr) = fieldEncode arr
+
+instance FieldEncode (Poly FExt) where
+  fieldEncode (Poly arr) = fieldEncode arr
+
+mkPoly :: [a] -> Poly a
+mkPoly coeffs = Poly $ listArray (0,length coeffs-1) coeffs
+
+-- | Degree of the polynomial
+polyDegree :: (Eq a, Num a) => Poly a -> Int
+polyDegree (Poly arr) = worker d0 where
+  (0,d0) = bounds arr
+  worker d
+    | d < 0      = -1
+    | arr!d /= 0 = d
+    | otherwise  = worker (d-1)
+
+-- | Size of the underlying coefficient array (this can be larger than @degree+1@)
+polySize :: (Eq a, Num a) => Poly a -> Int
+polySize (Poly p) = arraySize p
+
+polyIsZero :: (Eq a, Num a) => Poly a -> Bool
+polyIsZero (Poly arr) = all (==0) (elems arr)
+
+-- | Returns the coefficient of @x^k@
+polyCoeff :: Num a => Poly a -> Int -> a
+polyCoeff (Poly coeffs) k = safeIndex 0 coeffs k
+
+-- | Note: this can include zero coefficients above the actual degree!
+polyCoeffArray :: Poly a -> Array Int a
+polyCoeffArray (Poly coeffs) = coeffs
+
+-- | Note: this cuts off the potential extra zeros at the end.
+-- The order is little-endian (constant term first).
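+--
+-- An illustrative example (not part of the original docs): the trailing
+-- zero coefficient is dropped, so
+--
+-- > polyCoeffList (mkPoly [1,2,0])  ==  [1,2]
+--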
+polyCoeffList :: (Eq a, Num a) => Poly a -> [a] +polyCoeffList poly@(Poly arr) = take (polyDegree poly + 1) (elems arr) + +-------------------------------------------------------------------------------- +-- * Elementary polynomials + +-- | Constant polynomial +polyConst :: a -> Poly a +polyConst x = Poly $ listArray (0,0) [x] + +-- | Zero polynomial +polyZero :: Num a => Poly a +polyZero = polyConst 0 + +-- | The polynomial @f(x) = x@ +polyVarX :: Num a => Poly a +polyVarX = mkPoly [0,1] + +-- | @polyLinear (A,B)@ means the linear polynomial @f(x) = A*x + B@ +polyLinear :: (a,a) -> Poly a +polyLinear (a,b) = mkPoly [b,a] + +-- | The monomial @x^n@ +polyXpowN :: Num a => Int -> Poly a +polyXpowN n = Poly $ listArray (0,n) (replicate n 0 ++ [1]) + +-- | The binomial @(x^n - 1)@ +polyXpowNminus1 :: Num a => Int -> Poly a +polyXpowNminus1 n = Poly $ listArray (0,n) (-1 : replicate (n-1) 0 ++ [1]) + +-------------------------------------------------------------------------------- +-- * Evaluate polynomials + +polyEvalAt :: forall f. Fractional f => Poly f -> f -> f +polyEvalAt (Poly arr) x = go 0 1 0 where + (0,d) = bounds arr + go :: f -> f -> Int -> f + go !acc !y !i = if i > d + then acc + else go (acc + (arr!i)*y) (y*x) (i+1) + +polyEvalOnList :: forall f. Fractional f => Poly f -> [f] -> [f] +polyEvalOnList poly = map (polyEvalAt poly) + +polyEvalOnArray :: forall f. Fractional f => Poly f -> Array Int f -> Array Int f +polyEvalOnArray poly = fmap (polyEvalAt poly) + +-------------------------------------------------------------------------------- +-- * Basic arithmetic operations on polynomials + +polyNeg :: Num a => Poly a -> Poly a +polyNeg (Poly arr) = Poly $ fmap negate arr + +polyAdd :: Num a => Poly a -> Poly a -> Poly a +polyAdd (Poly arr1) (Poly arr2) = Poly $ listArray (0,d3) zs where + (0,d1) = bounds arr1 + (0,d2) = bounds arr2 + d3 = max d1 d2 + zs = zipWith (+) (elems arr1 ++ replicate (d3-d1) 0) + (elems arr2 ++ replicate (d3-d2) 0) + +polySub :: Num a => Poly a -> Poly a -> Poly a +polySub (Poly arr1) (Poly arr2) = Poly $ listArray (0,d3) zs where + (0,d1) = bounds arr1 + (0,d2) = bounds arr2 + d3 = max d1 d2 + zs = zipWith (-) (elems arr1 ++ replicate (d3-d1) 0) + (elems arr2 ++ replicate (d3-d2) 0) + +polyMul :: Num a => Poly a -> Poly a -> Poly a +polyMul (Poly arr1) (Poly arr2) = Poly $ listArray (0,d3) zs where + (0,d1) = bounds arr1 + (0,d2) = bounds arr2 + d3 = d1 + d2 + zs = [ f k | k<-[0..d3] ] + f !k = foldl' (+) 0 [ arr1!i * arr2!(k-i) | i<-[ max 0 (k-d2) .. min d1 k ] ] + +instance Num a => Num (Poly a) where + fromInteger = polyConst . fromInteger + negate = polyNeg + (+) = polyAdd + (-) = polySub + (*) = polyMul + abs = id + signum = \_ -> polyConst 1 + +polySum :: Num a => [Poly a] -> Poly a +polySum = foldl' polyAdd 0 + +polyProd :: Num a => [Poly a] -> Poly a +polyProd = foldl' polyMul 1 + +-------------------------------------------------------------------------------- +-- * Polynomial long division + +-- | @polyDiv f h@ returns @(q,r)@ such that @f = q*h + r@ and @deg r < deg h@ +polyDiv :: forall f. (Eq f, Fractional f) => Poly f -> Poly f -> (Poly f, Poly f) +polyDiv poly_f@(Poly arr_f) poly_h@(Poly arr_h) + | deg_q < 0 = (polyZero, poly_f) + | otherwise = runST action + where + deg_f = polyDegree poly_f + deg_h = polyDegree poly_h + deg_q = deg_f - deg_h + + -- inverse of the top coefficient of divisor + b_inv = recip (arr_h ! deg_h) + + action :: forall s. 
ST s (Poly f, Poly f)
+  action = do
+    p <- thaw arr_f          :: ST s (STArray s Int f)
+    q <- newArray (0,deg_q) 0 :: ST s (STArray s Int f)
+    forM_ [deg_q,deg_q-1..0] $ \k -> do
+      top <- readArray p (deg_h + k)
+      let y = b_inv * top
+      writeArray q k y
+      forM_ [0..deg_h] $ \j -> do
+        a <- readArray p (j+k)
+        writeArray p (j+k) (a - y*(arr_h!j))
+    qarr <- freeze q
+    rs   <- forM [0..deg_h-1] $ \i -> readArray p i
+    let rarr = listArray (0,deg_h-1) rs
+    return (Poly qarr, Poly rarr)
+
+-- | Returns only the quotient
+polyDivQuo :: (Eq f, Fractional f) => Poly f -> Poly f -> Poly f
+polyDivQuo f g = fst $ polyDiv f g
+
+-- | Returns only the remainder
+polyDivRem :: (Eq f, Fractional f) => Poly f -> Poly f -> Poly f
+polyDivRem f g = snd $ polyDiv f g
+
+--------------------------------------------------------------------------------
+-- * Sample random polynomials
+
+randomPoly :: (RandomGen g, Random a) => Int -> g -> (Poly a, g)
+randomPoly deg g0 =
+  let (coeffs,gfinal) = worker (deg+1) g0
+      poly = Poly (listArray (0,deg) coeffs)
+  in  (poly, gfinal)
+
+  where
+    worker 0 g = ([] , g)
+    worker n g = let (x ,g1) = random g
+                     (xs,g2) = worker (n-1) g1
+                 in  ((x:xs) , g2)
+
+randomPolyIO :: Random a => Int -> IO (Poly a)
+randomPolyIO deg = getStdRandom (randomPoly deg)
+
+--------------------------------------------------------------------------------
\ No newline at end of file
diff --git a/reference/src/NTT/Slow.hs b/reference/src/NTT/Slow.hs
new file mode 100644
index 0000000..77110ce
--- /dev/null
+++ b/reference/src/NTT/Slow.hs
@@ -0,0 +1,186 @@
+
+{-# LANGUAGE ScopedTypeVariables #-}
+module NTT.Slow where
+
+--------------------------------------------------------------------------------
+
+import Data.Array
+import Data.Bits
+
+import NTT.Poly
+import NTT.Subgroup
+
+import Field.Goldilocks
+import Field.Goldilocks.Extension ( FExt , scl , inj )
+
+import Misc
+
+--------------------------------------------------------------------------------
+
+-- | Evaluate the polynomial on a multiplicative subgroup /of the same size/, using FFT
+polyEvaluate :: Subgroup F -> Poly F -> Array Int F
+polyEvaluate = subgroupNTT
+
+-- | Interpolate the values on a multiplicative subgroup into a polynomial, using inverse FFT
+polyInterpolate :: Subgroup F -> Array Int F -> Poly F
+polyInterpolate = subgroupINTT
+
+--------------------------------------------------------------------------------
+
+-- | Evaluates a polynomial on a subgroup /of the same size/
+subgroupNTT :: Subgroup F -> Poly F -> Array Int F
+subgroupNTT subgroup (Poly coeffs)
+  | n1+1 /= subgroupOrder subgroup = error "ntt: input size does not match the subgroup order"
+  | n1 == 0   = listArray (0,0) [coeffs!0]
+  | otherwise = final
+  where
+    (0,n1) = bounds coeffs
+    n      = n1 + 1
+    hn     = Prelude.div n 2
+    hn1    = hn - 1
+    hsub   = halveSubgroup subgroup
+    g      = subgroupGen subgroup
+    v_even = elems $ subgroupNTT hsub $ Poly $ listArray (0,hn1) [ coeffs!(2*i  ) | i<-[0..hn1] ]
+    v_odd  = elems $ subgroupNTT hsub $ Poly $ listArray (0,hn1) [ coeffs!(2*i+1) | i<-[0..hn1] ]
+    gpows  = powersOf hn g
+    first  = zipWith3 (\gk x y -> (x + gk * y)) gpows v_even v_odd
+    second = zipWith3 (\gk x y -> (x - gk * y)) gpows v_even v_odd
+    final  = listArray (0,n1) (first ++ second)
+
+-- | Evaluates a polynomial on a subgroup /of the same size/
+subgroupNTTExt :: Subgroup F -> Poly FExt -> Array Int FExt
+subgroupNTTExt subgroup (Poly coeffs)
+  | n1+1 /= subgroupOrder subgroup = error "ntt: input size does not match the subgroup order"
+  | n1 == 0   = listArray (0,0) [coeffs!0]
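+  -- same radix-2 Cooley-Tukey recursion as 'subgroupNTT' above: split into
+  -- even and odd coefficients, recurse on the half-size subgroup, then
+  -- combine the halves with the butterfly @x +/- gk `scl` y@
+  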
| otherwise = final + where + (0,n1) = bounds coeffs + n = n1 + 1 + hn = Prelude.div n 2 + hn1 = hn - 1 + hsub = halveSubgroup subgroup + g = subgroupGen subgroup + v_even = elems $ subgroupNTTExt hsub $ Poly $ listArray (0,hn1) [ coeffs!(2*i ) | i<-[0..hn1] ] + v_odd = elems $ subgroupNTTExt hsub $ Poly $ listArray (0,hn1) [ coeffs!(2*i+1) | i<-[0..hn1] ] + gpows = powersOf hn g + first = zipWith3 (\gk x y -> (x + gk `scl` y)) gpows v_even v_odd + second = zipWith3 (\gk x y -> (x - gk `scl` y)) gpows v_even v_odd + final = listArray (0,n1) (first ++ second) + +---------------------------------------- + +-- | Interpolates values into a polynomial on a subgroup +subgroupINTT :: Subgroup F -> Array Int F -> Poly F +subgroupINTT subgroup values + | n1+1 /= subgroupOrder subgroup = error "intt: input size does not match the subgroup order" + | n1 == 0 = Poly $ listArray (0,0) [values!0] + | otherwise = final + where + (0,n1) = bounds values + n = n1 + 1 + hn = Prelude.div n 2 + hn1 = hn - 1 + hsub = halveSubgroup subgroup + g = subgroupGen subgroup + first = [ values!(i ) | i<-[0..hn1] ] + second = [ values!(i+hn) | i<-[0..hn1] ] + gpows = powersOf hn g + v_even = zipWith (\ x y -> (x + y) / 2 ) first second + v_odd = zipWith3 (\g x y -> (x - y) / (2*g)) gpows first second + p_even = elems $ polyCoeffArray $ subgroupINTT hsub $ listArray (0,hn1) v_even + p_odd = elems $ polyCoeffArray $ subgroupINTT hsub $ listArray (0,hn1) v_odd + final = Poly $ listArray (0,n1) (interleave p_even p_odd) + +-- | Interpolates values into a polynomial on a subgroup +subgroupINTTExt :: Subgroup F -> Array Int FExt -> Poly FExt +subgroupINTTExt subgroup values + | n1+1 /= subgroupOrder subgroup = error "intt: input size does not match the subgroup order" + | n1 == 0 = Poly $ listArray (0,0) [values!0] + | otherwise = final + where + (0,n1) = bounds values + n = n1 + 1 + hn = Prelude.div n 2 + hn1 = hn - 1 + hsub = halveSubgroup subgroup + g = subgroupGen subgroup + first = [ values!(i ) | i<-[0..hn1] ] + second = [ values!(i+hn) | i<-[0..hn1] ] + gpows = powersOf hn g + v_even = zipWith (\ x y -> (x + y) / 2 ) first second + v_odd = zipWith3 (\g x y -> (x - y) / (2*inj g)) gpows first second + p_even = elems $ polyCoeffArray $ subgroupINTTExt hsub $ listArray (0,hn1) v_even + p_odd = elems $ polyCoeffArray $ subgroupINTTExt hsub $ listArray (0,hn1) v_odd + final = Poly $ listArray (0,n1) (interleave p_even p_odd) + +-------------------------------------------------------------------------------- + +-- | Evaluates a polynomial on a coset /of the same size/ +cosetNTT :: Coset F -> Poly F -> Array Int F +cosetNTT (MkCoset subgroup offset) poly = subgroupNTT subgroup (shiftPolyCoeffs offset poly) + +-- | Evaluates a polynomial on a coset /of the same size/ +cosetNTTExt :: Coset F -> Poly FExt -> Array Int FExt +cosetNTTExt (MkCoset subgroup offset) poly = subgroupNTTExt subgroup (shiftPolyCoeffsExt offset poly) + +-- | Interpolates a polynomial from its values on a coset +cosetINTT :: Coset F -> Array Int F -> Poly F +cosetINTT (MkCoset subgroup offset) values = shiftPolyCoeffs (recip offset) (subgroupINTT subgroup values) + +-- | Interpolates a polynomial from its values on a coset +cosetINTTExt :: Coset F -> Array Int FExt -> Poly FExt +cosetINTTExt (MkCoset subgroup offset) values = shiftPolyCoeffsExt (recip offset) (subgroupINTTExt subgroup values) + +---------------------------------------- + +-- | multiplies the @k@-th coefficient with @eta^k@ +shiftPolyCoeffs :: F -> Poly F -> Poly F +shiftPolyCoeffs eta (Poly 
coeffs) = Poly (arrayPointWiseProduct (powersOf n eta) coeffs) where
+  n = arrayLength coeffs
+
+-- | Multiplies the @k@-th coefficient by @eta^k@
+shiftPolyCoeffsExt :: F -> Poly FExt -> Poly FExt
+shiftPolyCoeffsExt eta (Poly coeffs) = Poly (arrayPointWiseScale (powersOf n eta) coeffs) where
+  n = arrayLength coeffs
+
+-- | Pointwise product of a list and an array
+arrayPointWiseProduct :: [F] -> Array Int F -> Array Int F
+arrayPointWiseProduct list array
+  = listArray (bounds array)
+  $ safeZipWith (*) list (elems array)
+
+-- | Pointwise scaling of an array of extension field elements by a list of base field elements
+arrayPointWiseScale :: [F] -> Array Int FExt -> Array Int FExt
+arrayPointWiseScale list array
+  = listArray (bounds array)
+  $ safeZipWith scl list (elems array)
+
+--------------------------------------------------------------------------------
+
+-- | Evaluates a polynomial on a coset larger than the polynomial
+asymmetricCosetNTT :: Coset F -> Poly F -> Array Int F
+asymmetricCosetNTT coset@(MkCoset subgroup shift) poly
+  | r /= 0    = error "asymmetricCosetNTT: we expect a subgroup whose size is a multiple of the polynomial's size"
+  | otherwise = interleaveArrays pieces
+  where
+    n = subgroupOrder subgroup
+    g = subgroupGen   subgroup
+    m = polySize poly
+    (q,r) = divMod n m
+    smallSubgroup = powSubgroup subgroup q
+    pieces = [ cosetNTT (MkCoset smallSubgroup (shift * g^j)) poly | j <- [0..q-1] ]
+
+-- | Evaluates a polynomial on a coset larger than the polynomial
+asymmetricCosetNTTExt :: Coset F -> Poly FExt -> Array Int FExt
+asymmetricCosetNTTExt coset@(MkCoset subgroup shift) poly
+  | r /= 0    = error "asymmetricCosetNTTExt: we expect a subgroup whose size is a multiple of the polynomial's size"
+  | otherwise = interleaveArrays pieces
+  where
+    n = subgroupOrder subgroup
+    g = subgroupGen   subgroup
+    m = polySize poly
+    (q,r) = divMod n m
+    smallSubgroup = powSubgroup subgroup q
+    pieces = [ cosetNTTExt (MkCoset smallSubgroup (shift * g^j)) poly | j <- [0..q-1] ]
+
+--------------------------------------------------------------------------------
diff --git a/reference/src/NTT/Subgroup.hs b/reference/src/NTT/Subgroup.hs
new file mode 100644
index 0000000..6f86864
--- /dev/null
+++ b/reference/src/NTT/Subgroup.hs
@@ -0,0 +1,91 @@
+
+{-# LANGUAGE BangPatterns #-}
+module NTT.Subgroup where
+
+--------------------------------------------------------------------------------
+
+import Data.Bits
+
+import Field.Goldilocks
+import Misc
+
+--------------------------------------------------------------------------------
+
+-- | A cyclic subgroup (multiplicative)
+data Subgroup g = MkSubgroup
+  { subgroupGen   :: !g     -- ^ the cyclic generator
+  , subgroupOrder :: !Int   -- ^ size of the subgroup
+  }
+  deriving (Eq,Show)
+
+subgroupSize :: Subgroup g -> Int
+subgroupSize = subgroupOrder
+
+getSubgroup :: Log2 -> Subgroup F
+getSubgroup log2@(Log2 n)
+  | n<0       = error "getSubgroup: negative logarithm"
+  | n>32      = error "getSubgroup: we cannot fit a smooth subgroup larger than 2^32 into Goldilocks"
+  | otherwise = MkSubgroup
+      { subgroupGen   = pow theSubgroupGenSize32 (2^(32-n))
+      , subgroupOrder = exp2_ log2
+      }
+
+theSubgroupGenSize32 :: F
+theSubgroupGenSize32 = pow theMultiplicativeGenerator expo where
+  (expo,0) = Prelude.divMod (goldilocksPrime-1) (2^32)
+
+-- | Lists all elements of the (cyclic) subgroup
+subgroupElems :: forall g. Num g => Subgroup g -> [g]
+subgroupElems (MkSubgroup gen order) = go 1 order where
+  go :: g -> Int -> [g]
+  go _ 0 = []
+  go a n = a : go (a * gen) (n-1)
+
+halveSubgroup :: Num g => Subgroup g -> Subgroup g
+halveSubgroup (MkSubgroup gen size) = if (size .&. 1 == 0)
+  then MkSubgroup (gen * gen) (shiftR size 1)
+  else error "halveSubgroup: subgroup order not divisible by two"
+
+-- | Synonym for 'halveSubgroup'
+squareSubgroup :: Num g => Subgroup g -> Subgroup g
+squareSubgroup = halveSubgroup
+
+-- | Generalization of 'squareSubgroup'
+powSubgroup :: Subgroup F -> Int -> Subgroup F
+powSubgroup (MkSubgroup gen order) k
+  | r /= 0    = error $ "powSubgroup: size of the subgroup is not divisible by " ++ show k
+  | otherwise = MkSubgroup (pow_ gen k) q
+  where
+    (q,r) = divMod order k
+
+--------------------------------------------------------------------------------
+
+data Coset g = MkCoset
+  { cosetGroup  :: !(Subgroup g)
+  , cosetOffset :: !g
+  }
+  deriving (Eq,Show)
+
+cosetSize :: Coset g -> Int
+cosetSize (MkCoset sg ofs) = subgroupSize sg
+
+getCoset :: F -> Log2 -> Coset F
+getCoset shift size = MkCoset (getSubgroup size) shift
+
+squareCoset :: Num g => Coset g -> Coset g
+squareCoset (MkCoset subgroup ofs) = MkCoset subgroup' ofs' where
+  subgroup' = halveSubgroup subgroup
+  ofs'      = ofs * ofs
+
+powCoset :: Coset F -> Int -> Coset F
+powCoset (MkCoset subgroup offset) expo = MkCoset (powSubgroup subgroup expo) (pow_ offset expo)
+
+--------------------------------------------------------------------------------
+
+-- | The first @n@ powers of @g@, starting from @g^0 = 1@
+powersOf :: Num g => Int -> g -> [g]
+powersOf n g = go 1 n where
+  go _  0 = []
+  go !y !n = y : go (g*y) (n-1)
+
+--------------------------------------------------------------------------------
diff --git a/reference/src/testMain.hs b/reference/src/testMain.hs
new file mode 100644
index 0000000..233408e
--- /dev/null
+++ b/reference/src/testMain.hs
@@ -0,0 +1,95 @@
+
+module Main where
+
+--------------------------------------------------------------------------------
+
+import Data.Array
+import Text.Show.Pretty
+import System.Random
+
+import Hash.Duplex.Monad
+import FRI
+import Misc
+
+--------------------------------------------------------------------------------
+
+cosetShift :: F
+cosetShift = theMultiplicativeGenerator
+
+reductionStrategyParams :: ReductionStrategyParams
+reductionStrategyParams = MkRedStratPars
+  { redStoppingDegree = Log2 3
+  , redFoldingArity   = Log2 2
+  }
+
+rsConfig :: RSConfig
+rsConfig = MkRSConfig
+  { rsRateBits   = Log2 2
+  , rsDataSize   = Log2 8
+  , rsCosetShift = cosetShift
+  }
+
+friConfig :: FriConfig
+friConfig = MkFriConfig
+  { friRSConfig          = rsConfig
+  , friNColumns          = 3
+  , friMerkleCapSize     = Log2 2
+  , friReductionStrategy = findReductionStrategy reductionStrategyParams rsConfig
+  , friNQueryRounds      = 2 -- 10
+  , friGrindingBits      = Log2 5
+  }
+
+origCoset :: Coset F
+origCoset = getCoset cosetShift (rsDataSize rsConfig)
+
+ldeCoset :: Coset F
+ldeCoset = getCoset cosetShift (rsDataSize rsConfig + rsRateBits rsConfig)
+
+--------------------------------------------------------------------------------
+
+testData :: Matrix F
+testData = array ((0,0),(n-1,m-1)) (zip coords values) where
+  n = exp2_ (rsDataSize rsConfig)
+  m = friNColumns friConfig
+  coords = [ (i,j) | i<-[0..n-1], j<-[0..m-1] ]   -- row-major
+  values = take (n*m) $ genValues 31 55 107
+
+ldeTestData :: Matrix F
+ldeTestData = ldeEncodeMatrix rsConfig testData
+
+testValues :: [F]
+testValues = genValues 31 55 107
+
+genValues :: F -> F -> F -> [F]
+genValues a b c = d : genValues a' b' c' where
+  d  = a + 2*b + 3*c
+  aa = a*a
+  bb = b*b
+  cc = c*c
+  a' = aa + c*bb + c + 1003
+  b' = a  + bb - b*cc + 3137
+  c' = aa - b  + a*cc + 15222
+
+--------------------------------------------------------------------------------
+
+printSeparator :: IO ()
+printSeparator = putStrLn "----------------------------------------"
+
+main :: IO ()
+main = do
+  putStrLn "testMain (outsourcing Reed-Solomon with FRI)\n"
+  printSeparator
+
+  setStdGen (mkStdGen 1337)   -- make it deterministic
+
+  justPrint friConfig
+  printSeparator
+
+  (commits, friProof) <- runDuplexIO_ (encodeAndProveFRI friConfig testData)
+  pPrint commits
+  pPrint friProof
+
+  ok <- runDuplexIO_ (verifyFRI (_ldeCommitment commits) friProof)
+  putStrLn $ "FRI verification succeeded = " ++ show ok
+
+--------------------------------------------------------------------------------
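+
+-- A round-trip sanity check that can be tried in GHCi (an illustrative,
+-- hypothetical session; it assumes the NTT module above is in scope):
+--
+-- > sg = getSubgroup (Log2 3)
+-- > p  = mkPoly [1,2,3,4,5,6,7,8 :: F]
+-- > polyInterpolate sg (polyEvaluate sg p) == p
+-- True
+--
+-- To run this test driver itself, something like the following should work,
+-- assuming GHC with the 'random' and 'pretty-show' packages installed:
+--
+-- > cd reference/src && runghc testMain.hs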