mirror of
https://github.com/status-im/nim-stew.git
synced 2025-02-13 12:36:57 +00:00
Optimized and exception-less encoding/decoding procedures for decimal integers (#78)
* Optimized and exception-less encoding/decoding procedures for decimal integers. * Add tests. * Fix import path. * Fix review comments. * Code simplification. * Make toBytes() allocation free. * Do not perform conversion to signed type to avoid compiler's overflow checks.
This commit is contained in:
parent
42475fd2f1
commit
6bcb21184a
197
stew/base10.nim
Normal file
197
stew/base10.nim
Normal file
@ -0,0 +1,197 @@
|
||||
## Copyright (c) 2021 Status Research & Development GmbH
|
||||
## Licensed under either of
|
||||
## * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
|
||||
## * MIT license ([LICENSE-MIT](LICENSE-MIT))
|
||||
## at your option.
|
||||
## This file may not be copied, modified, or distributed except according to
|
||||
## those terms.
|
||||
|
||||
## This module implements BASE10 (decimal) encoding and decoding procedures.
|
||||
##
|
||||
## Encoding procedures are adapted versions of the C functions described here:
|
||||
## # https://www.facebook.com/notes/facebook-engineering/three-optimization-tips-for-c/10151361643253920
|
||||
import results
|
||||
export results
|
||||
|
||||
{.push raises: [Defect].}
|
||||
|
||||
type
  Base10* = object
    ## Field-less marker type. It is only ever passed as a ``typedesc``
    ## argument to select the BASE10 (decimal) procedures of this module.
|
||||
|
||||
func maxLen*(T: typedesc[Base10], I: type): int8 =
  ## Upper bound on the number of bytes (decimal digits) needed to encode
  ## any value of type ``I``.
  ##
  ## ``uint8``, ``uint16``, ``uint32`` and ``uint64`` yield 3, 5, 10 and 20.
  ## Any other type is sized by the width of the platform ``uint``:
  ## 10 when ``sizeof(uint) == 4``, otherwise 20.
  when I is uint8:
    3
  elif I is uint16:
    5
  elif (I is uint32) or ((I isnot uint64) and (sizeof(uint) == 4)):
    10
  else:
    20
|
||||
|
||||
type
  Base10Buf*[T: SomeUnsignedInt] = object
    ## Fixed-size buffer able to hold the decimal encoding of any value of
    ## type ``T``.
    data*: array[maxLen(Base10, T), byte]
      ## Storage sized by ``maxLen(Base10, T)``.
    len*: int8
      ## Number of bytes of ``data`` in use;
      ## >= 1 when holding valid unsigned integer.
|
||||
|
||||
proc decode*[A: byte|char](B: typedesc[Base10], T: typedesc[SomeUnsignedInt],
                           src: openarray[A]): Result[T, cstring] =
  ## Parse ``src`` (characters or bytes holding ASCII decimal digits) into an
  ## unsigned integer of type ``T``.
  ##
  ## An error is returned when ``src`` is empty, when it contains anything
  ## other than ``'0'..'9'``, or when the value does not fit into ``T``.
  const
    # Largest accumulator value that can still be multiplied by ten, and the
    # largest digit that may be appended once the accumulator equals it.
    Limit = T(high(T) div 10)
    LastDigit = T(high(T) - Limit * 10)

  if len(src) == 0:
    return err("Missing decimal value")

  var acc = T(0)
  for item in src:
    let code = when A is char: byte(item) else: item
    if not ((code >= ord('0')) and (code <= ord('9'))):
      return err("Non-decimal character encountered")
    let digit = T(code - ord('0'))
    # Reject before multiplying so the accumulator never wraps around.
    if (acc > Limit) or (acc == Limit and digit > LastDigit):
      return err("Integer overflow")
    # acc * 10 written as acc * 8 + acc * 2.
    acc = (acc shl 3) + (acc shl 1) + digit
  ok(acc)
|
||||
|
||||
proc encodedLength*(B: typedesc[Base10], value: SomeUnsignedInt): int8 =
  ## Procedure returns number of characters needed to encode integer ``value``.
  ##
  ## The digit count is found with a short cascade of comparisons against
  ## powers of ten instead of repeated division. The result is at least 1
  ## (the value ``0`` is encoded as a single digit).
  when type(value) is uint8:
    if value < 10'u8:
      return 1'i8
    if value < 100'u8:
      return 2'i8
    3'i8
  elif type(value) is uint16:
    if value < 10'u16:
      return 1'i8
    if value < 100'u16:
      return 2'i8
    if value < 1000'u16:
      return 3'i8
    if value < 10000'u16:
      return 4'i8
    5'i8
  elif type(value) is uint32:
    const
      P04 = 1_0000'u32
      P05 = 1_0000_0'u32
      P06 = 1_0000_00'u32
      P07 = 1_0000_000'u32
      P08 = 1_0000_0000'u32
      P09 = 1_0000_0000_0'u32
    if value < 10'u32:
      return 1'i8
    if value < 100'u32:
      return 2'i8
    if value < 1000'u32:
      return 3'i8
    if value < P08:
      if value < P06:
        if value < P04:
          return 4'i8
        return 5'i8 + (if value >= P05: 1'i8 else: 0'i8)
      return 7'i8 + (if value >= P07: 1'i8 else: 0'i8)
    9'i8 + (if value >= P09: 1'i8 else: 0'i8)
  elif type(value) is uint64:
    const
      P04 = 1_0000'u64
      P05 = 1_0000_0'u64
      P06 = 1_0000_00'u64
      P07 = 1_0000_000'u64
      P08 = 1_0000_0000'u64
      P09 = 1_0000_0000_0'u64
      P10 = 1_0000_0000_00'u64
      P11 = 1_0000_0000_000'u64
      P12 = 1_0000_0000_0000'u64
    if value < 10'u64:
      return 1'i8
    if value < 100'u64:
      return 2'i8
    if value < 1000'u64:
      return 3'i8
    if value < P12:
      if value < P08:
        if value < P06:
          if value < P04:
            return 4'i8
          return 5'i8 + (if value >= P05: 1'i8 else: 0'i8)
        return 7'i8 + (if value >= P07: 1'i8 else: 0'i8)
      if value < P10:
        return 9'i8 + (if value >= P09: 1'i8 else: 0'i8)
      return 11'i8 + (if value >= P11: 1'i8 else: 0'i8)
    # 13 digits or more: account for the low 12 digits and measure the rest.
    return 12'i8 + B.encodedLength(value div P12)
  else:
    # Platform-sized ``uint`` is a distinct type that matches none of the
    # branches above. Without this branch the proc body would be empty for
    # it and the implicit result (0) would be returned, which makes the
    # encoders index position -1. Dispatch on the size instead; widening to
    # ``uint64`` preserves the value for any other width.
    when sizeof(value) == 4:
      B.encodedLength(uint32(value))
    else:
      B.encodedLength(uint64(value))
|
||||
|
||||
proc encode[A: byte|char](B: typedesc[Base10], value: SomeUnsignedInt,
                          output: var openarray[A],
                          length: int8): Result[int8, cstring] =
  ## Write the decimal digits of ``value`` into ``output``, filling from
  ## position ``length - 1`` backwards, two digits per loop iteration.
  ##
  ## Callers in this module pass the digit count of ``value`` as ``length``.
  ## Returns ``length`` on success, or an error when ``output`` has fewer
  ## than ``length`` items.
  const Pairs = cstring(
    "0001020304050607080910111213141516171819" &
    "2021222324252627282930313233343536373839" &
    "4041424344454647484950515253545556575859" &
    "6061626364656667686970717273747576777879" &
    "8081828384858687888990919293949596979899"
  )

  template putPair(at: int8, idx: uint8) =
    # Store the digit pair ``Pairs[idx]``, ``Pairs[idx + 1]`` into
    # ``output[at - 1]``, ``output[at]``.
    when A is char:
      output[at] = Pairs[idx + 1]
      output[at - 1] = Pairs[idx]
    else:
      output[at] = byte(Pairs[idx + 1])
      output[at - 1] = byte(Pairs[idx])

  if len(output) < length:
    return err("Not enough space to store decimal value")

  let hundred = type(value)(100)
  var
    rest = value
    cursor = length - 1

  while rest >= hundred:
    # ``(rest mod 100) * 2`` is the offset of the pair inside ``Pairs``.
    let idx = uint8((rest mod hundred) shl 1)
    rest = rest div hundred
    putPair(cursor, idx)
    dec(cursor, 2)

  if rest >= type(value)(10):
    # Two digits left.
    putPair(cursor, uint8(rest) shl 1)
  else:
    # Single leading digit.
    when A is char:
      output[cursor] = char(ord('0') + (rest and type(value)(0x0F)))
    else:
      output[cursor] = byte('0') + byte(rest and type(value)(0x0F))
  ok(length)
|
||||
|
||||
proc encode*[A: byte|char](B: typedesc[Base10], value: SomeUnsignedInt,
                           output: var openarray[A]): Result[int8, cstring] =
  ## Encode ``value`` as decimal digits into the start of ``output``
  ## (characters or bytes).
  ##
  ## Returns the number of items written, or an error when ``output`` is
  ## shorter than ``encodedLength(value)``.
  let needed = B.encodedLength(value)
  B.encode(value, output, needed)
|
||||
|
||||
proc toString*(B: typedesc[Base10], value: SomeUnsignedInt): string =
  ## Return the decimal representation of ``value`` as a new string.
  result = newString(B.encodedLength(value))
  # The string was allocated with exactly the required length, so the
  # "not enough space" error cannot occur and the result is ignored.
  discard B.encode(value, result, int8(len(result)))
|
||||
|
||||
proc toBytes*[I: SomeUnsignedInt](B: typedesc[Base10], v: I): Base10Buf[I] {.
     noinit.} =
  ## Encode integer value ``v`` into a ``Base10Buf``.
  ##
  ## ``len`` is set to the number of digits written. The result is declared
  ## ``noinit``, so bytes of ``data`` beyond ``len`` are not set by this
  ## procedure.
  let needed = B.encodedLength(v)
  result.len = B.encode(v, result.data, needed).get()
|
||||
|
||||
proc toBytes*[I: SomeUnsignedInt](v: I, B: typedesc[Base10]): Base10Buf[I] {.
     noinit.} =
  ## Encode integer value ``v`` into a ``Base10Buf``.
  ##
  ## Same as ``Base10.toBytes(v)`` with the arguments swapped, which allows
  ## the ``v.toBytes(Base10)`` call form.
  B.toBytes(v)
|
@ -11,6 +11,7 @@ import
|
||||
ranges/all,
|
||||
test_assign2,
|
||||
test_arrayops,
|
||||
test_base10,
|
||||
test_base32,
|
||||
test_base58,
|
||||
test_base64,
|
||||
|
185
tests/test_base10.nim
Normal file
185
tests/test_base10.nim
Normal file
@ -0,0 +1,185 @@
|
||||
import unittest
|
||||
import ../stew/base10
|
||||
|
||||
when defined(nimHasUsed): {.used.}
|
||||
|
||||
const
  ## Test vectors: (decimal text, numeric value, expected encoded length).
  ## An expected length of 0 marks text that must be rejected for every
  ## integer type.
  DecVectors = [
    # 1 digit
    ("0", 0'u64, 1),
    ("1", 1'u64, 1),
    ("9", 9'u64, 1),
    # 2 digits
    ("10", 10'u64, 2),
    ("11", 11'u64, 2),
    ("99", 99'u64, 2),
    # 3 digits
    ("100", 100'u64, 3),
    ("101", 101'u64, 3),
    ("255", 255'u64, 3), # end of uint8
    ("256", 256'u64, 3),
    ("999", 999'u64, 3),
    # 4 digits
    ("1000", 1000'u64, 4),
    ("1001", 1001'u64, 4),
    ("9999", 9999'u64, 4),
    # 5 digits
    ("10000", 10000'u64, 5),
    ("10001", 10001'u64, 5),
    ("65535", 65535'u64, 5), # end of uint16
    ("65536", 65536'u64, 5),
    ("99999", 99999'u64, 5),
    # 6 digits
    ("100000", 100000'u64, 6),
    ("100001", 100001'u64, 6),
    ("999999", 999999'u64, 6),
    # 7 digits
    ("1000000", 1000000'u64, 7),
    ("1000001", 1000001'u64, 7),
    ("9999999", 9999999'u64, 7),
    # 8 digits
    ("10000000", 10000000'u64, 8),
    ("10000001", 10000001'u64, 8),
    ("99999999", 99999999'u64, 8),
    # 9 digits
    ("100000000", 100000000'u64, 9),
    ("100000001", 100000001'u64, 9),
    ("999999999", 999999999'u64, 9),
    # 10 digits
    ("1000000000", 1000000000'u64, 10),
    ("1000000001", 1000000001'u64, 10),
    ("4294967295", 4294967295'u64, 10), # end of uint32
    ("4294967296", 4294967296'u64, 10),
    ("9999999999", 9999999999'u64, 10),
    # 11 digits
    ("10000000000", 10000000000'u64, 11),
    ("10000000001", 10000000001'u64, 11),
    ("99999999999", 99999999999'u64, 11),
    # 12 digits
    ("100000000000", 100000000000'u64, 12),
    ("100000000001", 100000000001'u64, 12),
    ("999999999999", 999999999999'u64, 12),
    # 13 digits
    ("1000000000000", 1000000000000'u64, 13),
    ("1000000000001", 1000000000001'u64, 13),
    ("9999999999999", 9999999999999'u64, 13),
    # 14 digits
    ("10000000000000", 10000000000000'u64, 14),
    ("10000000000001", 10000000000001'u64, 14),
    ("99999999999999", 99999999999999'u64, 14),
    # 15 digits
    ("100000000000000", 100000000000000'u64, 15),
    ("100000000000001", 100000000000001'u64, 15),
    ("999999999999999", 999999999999999'u64, 15),
    # 16 digits
    ("1000000000000000", 1000000000000000'u64, 16),
    ("1000000000000001", 1000000000000001'u64, 16),
    ("9999999999999999", 9999999999999999'u64, 16),
    # 17 digits
    ("10000000000000000", 10000000000000000'u64, 17),
    ("10000000000000001", 10000000000000001'u64, 17),
    ("99999999999999999", 99999999999999999'u64, 17),
    # 18 digits
    ("100000000000000000", 100000000000000000'u64, 18),
    ("100000000000000001", 100000000000000001'u64, 18),
    ("999999999999999999", 999999999999999999'u64, 18),
    # 19 digits
    ("1000000000000000000", 1000000000000000000'u64, 19),
    ("1000000000000000001", 1000000000000000001'u64, 19),
    ("9999999999999999999", 9999999999999999999'u64, 19),
    # 20 digits
    ("10000000000000000000", 10000000000000000000'u64, 20),
    ("10000000000000000001", 10000000000000000001'u64, 20),
    ("18446744073709551615", 18446744073709551615'u64, 20), # end of uint64
    # beyond uint64
    ("18446744073709551616", 0'u64, 0),
    ("99999999999999999999", 0'u64, 0)
  ]
|
||||
|
||||
template testVectors(T: typedesc[SomeUnsignedInt]) =
  ## Run every entry of ``DecVectors`` against type ``T``.
  ##
  ## Vectors whose value fits into ``T`` must decode to that value, report
  ## the expected encoded length, encode back to the same text, and fail to
  ## encode into a buffer that is one item too short. All other vectors
  ## (value too large for ``T``, or marked with length 0) must be rejected
  ## by ``decode``.
  let limit = uint64(high(T))
  for item in DecVectors:
    if (item[1] <= limit) and (item[2] != 0):
      let value = T(item[1])
      let r1 = Base10.decode(T, item[0])
      let r2 = Base10.decode(T, cast[seq[byte]](item[0]))
      check:
        r1.isOk()
        r2.isOk()
        r1.get() == value
        r2.get() == value
        # Measure the length on ``T`` itself so that the type-specific
        # branch of ``encodedLength`` is the one being verified.
        Base10.encodedLength(value) == item[2]

      var outbuf = newSeq[byte](Base10.encodedLength(value))
      var outstr = newString(Base10.encodedLength(value))
      let r3 = Base10.encode(value, outbuf)
      let r4 = Base10.encode(value, outstr)

      check:
        r3.isOk()
        r4.isOk()
        r3.get() == Base10.encodedLength(value)
        r4.get() == Base10.encodedLength(value)
        cast[string](outbuf) == item[0]
        outstr == item[0]

      # One item short of the required size: encoding must be refused.
      var neoutbuf = newSeq[byte](Base10.encodedLength(value) - 1)
      var neoutstr = newString(Base10.encodedLength(value) - 1)
      let r5 = Base10.encode(value, neoutbuf)
      let r6 = Base10.encode(value, neoutstr)

      check:
        r5.isErr()
        r6.isErr()

    else:
      # The text does not fit into ``T``: decoding the vector itself (not an
      # empty input) must fail, which exercises the overflow detection.
      let r1 = Base10.decode(T, item[0])
      let r2 = Base10.decode(T, cast[seq[byte]](item[0]))
      check:
        r1.isErr()
        r2.isErr()
|
||||
|
||||
template testValues(T: typedesc[SomeUnsignedInt]) =
  ## Round-trip every value in ``0 .. min(high(T), 100000)`` through
  ## ``toString`` and both ``toBytes`` overloads, decoding each encoding
  ## back and comparing it with the original value.
  let count = int(min(uint64(high(T)), 100000'u64)) + 1
  for n in 0 ..< count:
    let
      text = Base10.toString(T(n))
      bytesA = Base10.toBytes(T(n))
      bytesB = T(n).toBytes(Base10)
      fromText = Base10.decode(T, text)
      fromBytesA =
        Base10.decode(T, bytesA.data.toOpenArray(0, bytesA.len - 1))
      fromBytesB =
        Base10.decode(T, bytesB.data.toOpenArray(0, bytesB.len - 1))
    check:
      fromText.isOk()
      fromBytesA.isOk()
      fromBytesB.isOk()
      fromText.get() == T(n)
      fromBytesA.get() == T(n)
      fromBytesB.get() == T(n)
|
||||
|
||||
|
||||
template testEdge(T: typedesc[SomeUnsignedInt]) =
  ## ``decode`` must reject empty input and every single-character input
  ## that is not a decimal digit, for both string and byte sources.
  var emptyStr: string
  var emptySeq: seq[byte]
  let fromEmptyStr = Base10.decode(T, emptyStr)
  let fromEmptySeq = Base10.decode(T, emptySeq)
  check:
    fromEmptyStr.isErr()
    fromEmptySeq.isErr()

  var oneChar = newString(1)
  var oneByte = newSeq[byte](1)
  for code in 0 ..< 256:
    let ch = char(code)
    if ch in {'0'..'9'}:
      continue
    oneChar[0] = ch
    oneByte[0] = byte(ch)
    let fromChar = Base10.decode(T, oneChar)
    let fromByte = Base10.decode(T, oneByte)
    check:
      fromChar.isErr()
      fromByte.isErr()
|
||||
|
||||
suite "Base10 (decimal) test suite":
  # Fixed vectors: decode, encoded length, encode, short-buffer rejection.
  test "[uint8] encode/decode/length test":
    testVectors(uint8)
  test "[uint16] encode/decode/length test":
    testVectors(uint16)
  test "[uint32] encode/decode/length test":
    testVectors(uint32)
  test "[uint64] encode/decode/length test":
    testVectors(uint64)

  # Round trips over a range of consecutive values.
  test "[uint8] all values comparison test":
    testValues(uint8)
  test "[uint16] all values comparison test":
    testValues(uint16)
  test "[uint32] 100,000 values comparison test":
    testValues(uint32)
  test "[uint64] 100,000 values comparison test":
    testValues(uint64)

  # Empty and non-decimal input.
  test "[uint8] edge cases":
    testEdge(uint8)
  test "[uint16] edge cases":
    testEdge(uint16)
  test "[uint32] edge cases":
    testEdge(uint32)
  test "[uint64] edge cases":
    testEdge(uint64)
|
Loading…
x
Reference in New Issue
Block a user