e2store: avoid uint48 (#4625)

In SSZ, `uint32` is used for offsets, effectively limiting the size of an SSZ entry to 2**32 bytes. Also, `uint48` isn't a valid SSZ type, so the header was not correctly defined according to the SSZ spec - the extra 2 bytes are left for future expansion instead.
2025-02-24 20:28:33 +00:00 · 2023-02-15 14:51:17 +01:00 · 2023-02-15 14:51:17 +01:00 · 822c339607
commit 822c339607
parent 1ac7f1a47a
3 changed files with 25 additions and 10 deletions
--- a/docs/e2store.md
+++ b/docs/e2store.md
@ -17,10 +17,15 @@ The header corresponds to an SSZ object defined as such:
 ```python
 class Header(Container):
    type: Vector[byte, 2]
-    length: uint48
+    length: uint32
+    reserved: uint16
 ```

-The `length` is the length of the data that follows the header, not including the length of the header itself. For example, the entry with header type `[0x22, 0x32]`, the length `4` and the bytes `[0x01, 0x02, 0x03, 0x04]` will be stored as the byte sequence `[0x22, 0x32, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04]`.
+The `length` is the length of the data that follows the header, not including the length of the header itself.
+
+The `reserved` field must be set to `0`.
+
+For example, an entry with header type `[0x22, 0x32]`, length `4` and the content `[0x01, 0x02, 0x03, 0x04]` will be stored as the byte sequence `[0x22, 0x32, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04]`.

 `.e2s` files may freely be concatenated, and may contain out-of-order records.

@ -40,7 +45,7 @@ def read_entry(f):
  if not header: return None

  typ = header[0:2] # 2 bytes of type
-  dlen = struct.unpack("<q", header[2:8] + b"\0\0")[0] # 6 bytes of little-endian length
+  dlen = struct.unpack("<I", header[2:6])[0] # 4 bytes of unsigned little-endian length

  data = f.read(dlen)

@ -67,7 +72,6 @@ def print_stats(name):
      print("type", k.hex(), "bytes", v[0], "count", v[1], "average", v[0] / v[1])
 ```

-
 ## Writing

 `e2s` files are written record-by-record starting with a version record. Files may be concatenated freely, meaning that the version record may appear multiple times in the file and a single file may have multiple versions.
@ -340,3 +344,9 @@ In the end though, the applied block state is used throughout in the protocol -
 ## How can block roots be accessed without computing them?

 Each era file contains a full `BeaconState` object whose `block_roots` field corresponds to the block contents of the file. The easiest way to access the roots is to read the "header" of the `BeaconState` without reading all fields.
+
+## Why is length `uint32`?
+
+Offsets in SSZ are encoded as `uint32`; thus, from a practical point of view, a single SSZ object generally cannot exceed 2**32 bytes, and a `uint32` length is enough to describe it.
+
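+Should larger entries be needed, a future entry type can introduce chunking, or store the upper bytes of the length in `reserved`, effectively turning `length` and `reserved` together into a 48-bit little-endian length (`uint48` is not an SSZ type, so such an encoding would be specific to that entry type).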
--- a/ncli/e2store.nim
+++ b/ncli/e2store.nim
@ -20,8 +20,9 @@ const
  SnappyBeaconState* = [byte 0x02, 0x00]

  TypeFieldLen = 2
-  LengthFieldLen = 6
-  HeaderFieldLen = TypeFieldLen + LengthFieldLen
+  LengthFieldLen = 4
+  ReservedFieldLen = 2
+  HeaderFieldLen = TypeFieldLen + LengthFieldLen + ReservedFieldLen

  FAR_FUTURE_ERA* = Era(not 0'u64)

@ -71,10 +72,14 @@ proc append(f: IoHandle, data: openArray[byte]): Result[void, string] =
  ok()

 proc appendHeader(f: IoHandle, typ: Type, dataLen: int): Result[int64, string] =
+  if dataLen.uint64 > uint32.high:
+    return err("entry does not fit 32-bit length")
+
  let start = ? getFilePos(f).mapErr(toString)

  ? append(f, typ)
-  ? append(f, toBytesLE(dataLen.uint64).toOpenArray(0, 5))
+  ? append(f, toBytesLE(dataLen.uint32))
+  ? append(f, [0'u8, 0'u8])

  ok(start)

@ -137,9 +142,9 @@ proc readHeader(f: IoHandle): Result[Header, string] =
    typ: Type
  discard typ.copyFrom(buf)

-  # Cast safe because we had only 6 bytes of length data
+  # Cast safe because we had only 4 bytes of length data
  let
-    len = cast[int64](uint64.fromBytesLE(buf.toOpenArray(2, 9)))
+    len = cast[int64](uint32.fromBytesLE(buf.toOpenArray(2, 5)))

  # No point reading these..
  if len > int.high(): return err("header length exceeds int.high")
--- a/ncli/e2store.py
+++ b/ncli/e2store.py
@ -5,7 +5,7 @@ def read_entry(f):
  if not header: return (None, None)

  typ = header[0:2] # 2 bytes of type
-  dlen = struct.unpack("<q", header[2:8] + b"\0\0")[0] # 6 bytes of little-endian length
+  dlen = struct.unpack("<I", header[2:6])[0] # 4 bytes of unsigned little-endian length

  data = f.read(dlen)