Optimized and exception-less encoding/decoding procedures for decimal integers (#78)

* Optimized and exception-less encoding/decoding procedures for decimal integers.

* Add tests.

* Fix import path.

* Fix review comments.

* Code simplification.

* Make toBytes() allocation free.

* Do not perform conversion to signed type to avoid compiler's overflow checks.
This commit is contained in:
Eugene Kabanov 2021-03-05 20:50:36 +02:00 committed by GitHub
parent 42475fd2f1
commit 6bcb21184a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 383 additions and 0 deletions

197
stew/base10.nim Normal file
View File

@ -0,0 +1,197 @@
## Copyright (c) 2021 Status Research & Development GmbH
## Licensed under either of
## * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
## * MIT license ([LICENSE-MIT](LICENSE-MIT))
## at your option.
## This file may not be copied, modified, or distributed except according to
## those terms.
## This module implements BASE10 (decimal) encoding and decoding procedures.
##
## Encoding procedures are adapted versions of C functions described here:
## # https://www.facebook.com/notes/facebook-engineering/three-optimization-tips-for-c/10151361643253920
import results
export results
{.push raises: [Defect].}
type Base10* = object
  ## Field-less tag type. It is never instantiated by this module; it is
  ## passed as ``typedesc[Base10]`` so that the decimal procedures can be
  ## called as ``Base10.decode(...)``, ``Base10.encode(...)`` and so on.
func maxLen*(T: typedesc[Base10], I: type): int8 =
  ## Upper bound on the number of decimal digits (one byte per digit) needed
  ## to encode any value of type ``I``.
  ##
  ## ``uint8``, ``uint16``, ``uint32`` and ``uint64`` are matched explicitly.
  ## Every other ``I`` is sized from the width of the platform's ``uint``:
  ## 10 digits when ``uint`` is 4 bytes wide, 20 digits otherwise.
  when I is uint8: 3
  elif I is uint16: 5
  elif I is uint32: 10
  elif I is uint64: 20
  elif sizeof(uint) == 4: 10
  else: 20
type Base10Buf*[T: SomeUnsignedInt] = object
  ## Fixed-capacity holder for the decimal text of a value of type ``T``.
  ## The capacity is ``maxLen(Base10, T)`` bytes, so no heap allocation is
  ## needed to keep an encoded value.
  data*: array[maxLen(Base10, T), byte]
    ## Digit storage; the encoded text occupies ``data[0 ..< len]``.
  len*: int8
    ## Number of bytes of ``data`` in use; at least 1 whenever the buffer
    ## holds a valid unsigned integer.
proc decode*[A: byte|char](B: typedesc[Base10], T: typedesc[SomeUnsignedInt],
                           src: openarray[A]): Result[T, cstring] =
  ## Parse ``src`` (characters or raw bytes holding ASCII decimal digits)
  ## into an unsigned integer of type ``T``.
  ##
  ## Returns an error, without raising, when
  ## * ``src`` is empty,
  ## * ``src`` contains anything other than ``'0'..'9'``,
  ## * the number does not fit into ``T``.
  const
    # Largest accumulator value that may still be multiplied by ten ...
    Cutoff = T(high(T) div 10)
    # ... and the largest digit that may be appended once it is reached.
    CutoffDigit = T(high(T) - Cutoff * 10)

  if len(src) == 0:
    return err("Missing decimal value")

  var acc = T(0)
  for item in src:
    let ch = when A is char: byte(item) else: item
    if (ch < byte('0')) or (ch > byte('9')):
      return err("Non-decimal character encountered")
    let digit = T(ch - byte('0'))
    # Reject the digit before it is appended, so ``acc`` never wraps around.
    if (acc > Cutoff) or (acc == Cutoff and digit > CutoffDigit):
      return err("Integer overflow")
    acc = acc * 10 + digit
  ok(acc)
proc encodedLength*(B: typedesc[Base10], value: SomeUnsignedInt): int8 =
  ## Returns the number of decimal digits (characters) needed to encode
  ## integer ``value``. The result is always at least 1 (``0`` encodes as
  ## a single digit).
  ##
  ## Every member of ``SomeUnsignedInt`` is supported. The platform-sized
  ## ``uint`` is a type of its own (it matches neither ``uint32`` nor
  ## ``uint64``), so it is forwarded to the fixed-width implementation of
  ## the same size instead of falling through with a zero result.
  when type(value) is uint8:
    if value < 10'u8:
      return 1'i8
    if value < 100'u8:
      return 2'i8
    3'i8
  elif type(value) is uint16:
    if value < 10'u16:
      return 1'i8
    if value < 100'u16:
      return 2'i8
    if value < 1000'u16:
      return 3'i8
    if value < 10000'u16:
      return 4'i8
    5'i8
  elif type(value) is uint32:
    const
      P04 = 1_0000'u32
      P05 = 1_0000_0'u32
      P06 = 1_0000_00'u32
      P07 = 1_0000_000'u32
      P08 = 1_0000_0000'u32
      P09 = 1_0000_0000_0'u32
    if value < 10'u32:
      return 1'i8
    if value < 100'u32:
      return 2'i8
    if value < 1000'u32:
      return 3'i8
    # Binary-search style narrowing over the remaining powers of ten.
    if value < P08:
      if value < P06:
        if value < P04:
          return 4'i8
        return 5'i8 + (if value >= P05: 1'i8 else: 0'i8)
      return 7'i8 + (if value >= P07: 1'i8 else: 0'i8)
    9'i8 + (if value >= P09: 1'i8 else: 0'i8)
  elif type(value) is uint64:
    const
      P04 = 1_0000'u64
      P05 = 1_0000_0'u64
      P06 = 1_0000_00'u64
      P07 = 1_0000_000'u64
      P08 = 1_0000_0000'u64
      P09 = 1_0000_0000_0'u64
      P10 = 1_0000_0000_00'u64
      P11 = 1_0000_0000_000'u64
      P12 = 1_0000_0000_0000'u64
    if value < 10'u64:
      return 1'i8
    if value < 100'u64:
      return 2'i8
    if value < 1000'u64:
      return 3'i8
    if value < P12:
      if value < P08:
        if value < P06:
          if value < P04:
            return 4'i8
          return 5'i8 + (if value >= P05: 1'i8 else: 0'i8)
        return 7'i8 + (if value >= P07: 1'i8 else: 0'i8)
      if value < P10:
        return 9'i8 + (if value >= P09: 1'i8 else: 0'i8)
      return 11'i8 + (if value >= P11: 1'i8 else: 0'i8)
    # 12 digits are accounted for; count the digits of what is left above.
    return 12'i8 + B.encodedLength(value div P12)
  else:
    # Only ``uint`` reaches this branch. Without it the procedure body would
    # be empty for ``uint`` and the default result ``0`` would be returned,
    # which makes the encoders index position ``-1`` of their output.
    when sizeof(value) == 4:
      return B.encodedLength(uint32(value))
    else:
      return B.encodedLength(uint64(value))
proc encode[A: byte|char](B: typedesc[Base10], value: SomeUnsignedInt,
                          output: var openarray[A],
                          length: int8): Result[int8, cstring] =
  ## Writes the decimal digits of ``value`` into ``output[0 ..< length]``,
  ## filling from the last position towards the first, two digits per step.
  ##
  ## ``length`` is expected to be the digit count of ``value``; the callers
  ## in this module derive it from ``encodedLength``. Returns ``length`` on
  ## success, or an error when ``output`` has fewer than ``length`` items.
  const Digits = cstring(
    "0001020304050607080910111213141516171819" &
    "2021222324252627282930313233343536373839" &
    "4041424344454647484950515253545556575859" &
    "6061626364656667686970717273747576777879" &
    "8081828384858687888990919293949596979899"
  )

  template storePair(at: int8, idx: uint8) =
    # ``Digits[idx]`` / ``Digits[idx + 1]`` are the tens / units characters
    # of one two-digit group; the units character lands at ``output[at]``.
    when A is char:
      output[at] = Digits[idx + 1]
      output[at - 1] = Digits[idx]
    else:
      output[at] = byte(Digits[idx + 1])
      output[at - 1] = byte(Digits[idx])

  if len(output) < length:
    return err("Not enough space to store decimal value")

  var
    rest = value
    pos = length - 1

  # Peel off two decimal digits at a time while at least three remain.
  while rest >= type(value)(100):
    let pairIdx = uint8((rest mod type(value)(100)) shl 1)
    rest = rest div type(value)(100)
    storePair(pos, pairIdx)
    dec(pos, 2)

  # One or two leading digits are left.
  if rest >= type(value)(10):
    let tailIdx = uint8(rest) shl 1
    storePair(pos, tailIdx)
  else:
    when A is char:
      output[pos] = char(ord('0') + (rest and type(value)(0x0F)))
    else:
      output[pos] = byte('0') + byte(rest and type(value)(0x0F))
  ok(length)
proc encode*[A: byte|char](B: typedesc[Base10], value: SomeUnsignedInt,
                           output: var openarray[A]): Result[int8, cstring] =
  ## Encode ``value`` as decimal text into the beginning of ``output``
  ## (an array of characters or of bytes).
  ##
  ## On success the result holds the number of items written. An error is
  ## returned when ``output`` is too short for the encoded value.
  let digits = B.encodedLength(value)
  encode(B, value, output, digits)
proc toString*(B: typedesc[Base10], value: SomeUnsignedInt): string =
  ## Returns the decimal text of ``value`` as a newly allocated string.
  result = newString(B.encodedLength(value))
  # The string was sized with encodedLength(), so the "not enough space"
  # error cannot occur and the encoder's result can be ignored.
  discard B.encode(value, result, int8(len(result)))
proc toBytes*[I: SomeUnsignedInt](B: typedesc[Base10], v: I): Base10Buf[I] {.
     noinit.} =
  ## Encode integer ``v`` as decimal text into a fixed-size ``Base10Buf``.
  ##
  ## The digits are stored in ``result.data[0 ..< result.len]``. The result
  ## is marked ``noinit`` and this procedure writes nothing past ``len``,
  ## so callers should only read the first ``len`` bytes.
  let digits = B.encodedLength(v)
  result.len = int8(B.encode(v, result.data, digits).get())
proc toBytes*[I: SomeUnsignedInt](v: I, B: typedesc[Base10]): Base10Buf[I] {.
     noinit.} =
  ## Value-first overload: encode integer ``v`` as decimal text into a
  ## fixed-size ``Base10Buf``, callable as ``v.toBytes(Base10)``.
  ##
  ## The digits are stored in ``result.data[0 ..< result.len]``. The result
  ## is marked ``noinit`` and this procedure writes nothing past ``len``,
  ## so callers should only read the first ``len`` bytes.
  let digits = B.encodedLength(v)
  result.len = int8(B.encode(v, result.data, digits).get())

View File

@ -11,6 +11,7 @@ import
ranges/all,
test_assign2,
test_arrayops,
test_base10,
test_base32,
test_base58,
test_base64,

185
tests/test_base10.nim Normal file
View File

@ -0,0 +1,185 @@
import unittest
import ../stew/base10
when defined(nimHasUsed): {.used.}
const
# Test vectors shared by all ``testVectors`` instantiations.
# Each entry is a tuple of:
#   [0] the decimal text,
#   [1] the value that text denotes, as ``uint64``,
#   [2] the expected number of digits (compared against ``encodedLength``).
# The values cluster around powers of ten and around the upper bounds of
# uint8/uint16/uint32/uint64.
# The final two entries exceed the ``uint64`` range; they carry a value and
# length of 0, and ``testVectors`` keeps entries with length 0 out of its
# encode/decode round-trip branch.
DecVectors = [
("0", 0'u64, 1),
("1", 1'u64, 1),
("9", 9'u64, 1),
("10", 10'u64, 2),
("11", 11'u64, 2),
("99", 99'u64, 2),
("100", 100'u64, 3),
("101", 101'u64, 3),
("255", 255'u64, 3), # end of uint8
("256", 256'u64, 3),
("999", 999'u64, 3),
("1000", 1000'u64, 4),
("1001", 1001'u64, 4),
("9999", 9999'u64, 4),
("10000", 10000'u64, 5),
("10001", 10001'u64, 5),
("65535", 65535'u64, 5), # end of uint16
("65536", 65536'u64, 5),
("99999", 99999'u64, 5),
("100000", 100000'u64, 6),
("100001", 100001'u64, 6),
("999999", 999999'u64, 6),
("1000000", 1000000'u64, 7),
("1000001", 1000001'u64, 7),
("9999999", 9999999'u64, 7),
("10000000", 10000000'u64, 8),
("10000001", 10000001'u64, 8),
("99999999", 99999999'u64, 8),
("100000000", 100000000'u64, 9),
("100000001", 100000001'u64, 9),
("999999999", 999999999'u64, 9),
("1000000000", 1000000000'u64, 10),
("1000000001", 1000000001'u64, 10),
("4294967295", 4294967295'u64, 10), # end of uint32
("4294967296", 4294967296'u64, 10),
("9999999999", 9999999999'u64, 10),
("10000000000", 10000000000'u64, 11),
("10000000001", 10000000001'u64, 11),
("99999999999", 99999999999'u64, 11),
("100000000000", 100000000000'u64, 12),
("100000000001", 100000000001'u64, 12),
("999999999999", 999999999999'u64, 12),
("1000000000000", 1000000000000'u64, 13),
("1000000000001", 1000000000001'u64, 13),
("9999999999999", 9999999999999'u64, 13),
("10000000000000", 10000000000000'u64, 14),
("10000000000001", 10000000000001'u64, 14),
("99999999999999", 99999999999999'u64, 14),
("100000000000000", 100000000000000'u64, 15),
("100000000000001", 100000000000001'u64, 15),
("999999999999999", 999999999999999'u64, 15),
("1000000000000000", 1000000000000000'u64, 16),
("1000000000000001", 1000000000000001'u64, 16),
("9999999999999999", 9999999999999999'u64, 16),
("10000000000000000", 10000000000000000'u64, 17),
("10000000000000001", 10000000000000001'u64, 17),
("99999999999999999", 99999999999999999'u64, 17),
("100000000000000000", 100000000000000000'u64, 18),
("100000000000000001", 100000000000000001'u64, 18),
("999999999999999999", 999999999999999999'u64, 18),
("1000000000000000000", 1000000000000000000'u64, 19),
("1000000000000000001", 1000000000000000001'u64, 19),
("9999999999999999999", 9999999999999999999'u64, 19),
("10000000000000000000", 10000000000000000000'u64, 20),
("10000000000000000001", 10000000000000000001'u64, 20),
("18446744073709551615", 18446744073709551615'u64, 20), # end of uint64
# Out of range for uint64: value and length are placeholders.
("18446744073709551616", 0'u64, 0),
("99999999999999999999", 0'u64, 0)
]
template testVectors(T: typedesc[SomeUnsignedInt]) =
  ## Runs every entry of ``DecVectors`` against type ``T``.
  ##
  ## * Entries that fit into ``T`` are decoded (from string and from bytes),
  ##   re-encoded into exactly sized buffers, and encoded into buffers that
  ##   are one item too short (which must fail).
  ## * Entries that do not fit into ``T`` - either larger than ``high(T)``
  ##   or flagged with length 0 because they exceed ``uint64`` - must be
  ##   rejected by ``decode``.
  let max = uint64(high(T))
  for item in DecVectors:
    if (item[1] <= max) and (item[2] != 0):
      let r1 = Base10.decode(T, item[0])
      let r2 = Base10.decode(T, cast[seq[byte]](item[0]))
      check:
        r1.isOk()
        r2.isOk()
        r1.get() == item[1]
        r2.get() == item[1]
        Base10.encodedLength(item[1]) == item[2]

      var outbuf = newSeq[byte](Base10.encodedLength(item[1]))
      var outstr = newString(Base10.encodedLength(item[1]))
      let r3 = Base10.encode(T(item[1]), outbuf)
      let r4 = Base10.encode(T(item[1]), outstr)
      check:
        r3.isOk()
        r4.isOk()
        r3.get() == Base10.encodedLength(item[1])
        r4.get() == Base10.encodedLength(item[1])
        cast[string](outbuf) == item[0]
        outstr == item[0]

      # Buffers one item too short must be refused.
      var neoutbuf = newSeq[byte](Base10.encodedLength(item[1]) - 1)
      var neoutstr = newString(Base10.encodedLength(item[1]) - 1)
      let r5 = Base10.encode(T(item[1]), neoutbuf)
      let r6 = Base10.encode(T(item[1]), neoutstr)
      check:
        r5.isErr()
        r6.isErr()
    else:
      # The vector's text denotes a number that does not fit into ``T``, so
      # decoding it has to report an error instead of a wrapped value.
      let tooBigStr = Base10.decode(T, item[0])
      let tooBigBytes = Base10.decode(T, cast[seq[byte]](item[0]))
      check:
        tooBigStr.isErr()
        tooBigBytes.isErr()

      # Empty input is rejected as well.
      var emptySeq: seq[byte]
      var emptyStr: string
      let r1 = Base10.decode(T, emptyStr)
      let r2 = Base10.decode(T, emptySeq)
      check:
        r1.isErr()
        r2.isErr()
template testValues(T: typedesc[SomeUnsignedInt]) =
  ## Round-trips every value from 0 up to ``min(high(T), 100000)`` through
  ## ``toString`` and both ``toBytes`` overloads, then decodes each encoding
  ## and checks that the original value comes back.
  let last = int(min(uint64(high(T)), 100000'u64))
  for n in 0 .. last:
    let
      value = T(n)
      text = Base10.toString(value)
      bufA = Base10.toBytes(value)
      bufB = value.toBytes(Base10)
      fromText = Base10.decode(T, text)
      fromBufA = Base10.decode(T, bufA.data.toOpenArray(0, bufA.len - 1))
      fromBufB = Base10.decode(T, bufB.data.toOpenArray(0, bufB.len - 1))
    check:
      fromText.isOk()
      fromBufA.isOk()
      fromBufB.isOk()
      fromText.get() == value
      fromBufA.get() == value
      fromBufB.get() == value
template testEdge(T: typedesc[SomeUnsignedInt]) =
  ## Checks the failure paths of ``decode`` for type ``T``: empty input and
  ## every one-character input that is not a decimal digit.
  const NonDigits = {low(char) .. high(char)} - {'0' .. '9'}

  let emptyStr = ""
  let emptySeq = newSeq[byte]()
  check:
    Base10.decode(T, emptyStr).isErr()
    Base10.decode(T, emptySeq).isErr()

  var oneChar = newString(1)
  var oneByte = newSeq[byte](1)
  for ch in NonDigits:
    oneChar[0] = ch
    oneByte[0] = byte(ch)
    check:
      Base10.decode(T, oneChar).isErr()
      Base10.decode(T, oneByte).isErr()
# Instantiates each test template once per fixed-width unsigned type
# (uint8, uint16, uint32, uint64).
suite "Base10 (decimal) test suite":
# Table-driven checks using DecVectors: decode, encode and encodedLength.
test "[uint8] encode/decode/length test":
testVectors(uint8)
test "[uint16] encode/decode/length test":
testVectors(uint16)
test "[uint32] encode/decode/length test":
testVectors(uint32)
test "[uint64] encode/decode/length test":
testVectors(uint64)
# Encode-then-decode round trips. testValues caps its range at 100000, so
# uint8 and uint16 are covered exhaustively while uint32 and uint64 cover
# the values 0 through 100000.
test "[uint8] all values comparison test":
testValues(uint8)
test "[uint16] all values comparison test":
testValues(uint16)
test "[uint32] 100,000 values comparison test":
testValues(uint32)
test "[uint64] 100,000 values comparison test":
testValues(uint64)
# Decoder rejection of empty input and of single non-digit characters.
test "[uint8] edge cases":
testEdge(uint8)
test "[uint16] edge cases":
testEdge(uint16)
test "[uint32] edge cases":
testEdge(uint32)
test "[uint64] edge cases":
testEdge(uint64)