From 6bcb21184aeb96ce6c62e187a64d678b74609f1e Mon Sep 17 00:00:00 2001
From: Eugene Kabanov
Date: Fri, 5 Mar 2021 20:50:36 +0200
Subject: [PATCH] Optimized and exception-less encoding/decoding procedures for decimal integers (#78)

* Optimized and exception-less encoding/decoding procedures for decimal integers.

* Add tests.

* Fix import path.

* Fix review comments.

* Code simplification.

* Make toBytes() allocation free.

* Do not perform conversion to signed type to avoid compiler's overflow checks.
---
 stew/base10.nim       | 197 ++++++++++++++++++++++++++++++++++++++++++
 tests/all_tests.nim   |   1 +
 tests/test_base10.nim | 185 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 383 insertions(+)
 create mode 100644 stew/base10.nim
 create mode 100644 tests/test_base10.nim

diff --git a/stew/base10.nim b/stew/base10.nim
new file mode 100644
index 0000000..4bcae38
--- /dev/null
+++ b/stew/base10.nim
@@ -0,0 +1,197 @@
+## Copyright (c) 2021 Status Research & Development GmbH
+## Licensed under either of
+##  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
+##  * MIT license ([LICENSE-MIT](LICENSE-MIT))
+## at your option.
+## This file may not be copied, modified, or distributed except according to
+## those terms.
+
+## This module implements BASE10 (decimal) encoding and decoding procedures.
+##
+## Encoding procedures are adapted versions of C functions described here:
+## # https://www.facebook.com/notes/facebook-engineering/three-optimization-tips-for-c/10151361643253920
+import results
+export results
+
+{.push raises: [Defect].}
+
+type
+  Base10* = object
+
+func maxLen*(T: typedesc[Base10], I: type): int8 =
+  ## The maximum number of bytes needed to encode any value of type I
+  when I is uint8:
+    3
+  elif I is uint16:
+    5
+  elif I is uint32:
+    10
+  elif I is uint64:
+    20
+  else:
+    when sizeof(uint) == 4:
+      10
+    else:
+      20
+
+type
+  Base10Buf*[T: SomeUnsignedInt] = object
+    data*: array[maxLen(Base10, T), byte]
+    len*: int8 # >= 1 when holding valid unsigned integer
+
+proc decode*[A: byte|char](B: typedesc[Base10], T: typedesc[SomeUnsignedInt],
+                           src: openarray[A]): Result[T, cstring] =
+  ## Convert base10 encoded string or array of bytes to unsigned integer.
+  const
+    MaxValue = T(high(T) div 10)
+    MaxNumber = T(high(T) - MaxValue * 10)
+
+  if len(src) == 0:
+    return err("Missing decimal value")
+  var v = T(0)
+  for i in 0 ..< len(src):
+    let ch = when A is char: byte(src[i]) else: src[i]
+    let d =
+      if (ch >= ord('0')) and (ch <= ord('9')):
+        T(ch - ord('0'))
+      else:
+        return err("Non-decimal character encountered")
+    if (v > MaxValue) or (v == MaxValue and T(d) > MaxNumber):
+      return err("Integer overflow")
+    v = (v shl 3) + (v shl 1) + T(d)
+  ok(v)
+
+proc encodedLength*(B: typedesc[Base10], value: SomeUnsignedInt): int8 =
+  ## Procedure returns number of characters needed to encode integer ``value``.
+  when type(value) is uint8:
+    if value < 10'u8:
+      return 1'i8
+    if value < 100'u8:
+      return 2'i8
+    3'i8
+  elif type(value) is uint16:
+    if value < 10'u16:
+      return 1'i8
+    if value < 100'u16:
+      return 2'i8
+    if value < 1000'u16:
+      return 3'i8
+    if value < 10000'u16:
+      return 4'i8
+    5'i8
+  elif type(value) is uint32:
+    const
+      P04 = 1_0000'u32
+      P05 = 1_0000_0'u32
+      P06 = 1_0000_00'u32
+      P07 = 1_0000_000'u32
+      P08 = 1_0000_0000'u32
+      P09 = 1_0000_0000_0'u32
+    if value < 10'u32:
+      return 1'i8
+    if value < 100'u32:
+      return 2'i8
+    if value < 1000'u32:
+      return 3'i8
+    if value < P08:
+      if value < P06:
+        if value < P04:
+          return 4'i8
+        return 5'i8 + (if value >= P05: 1'i8 else: 0'i8)
+      return 7'i8 + (if value >= P07: 1'i8 else: 0'i8)
+    9'i8 + (if value >= P09: 1'i8 else: 0'i8)
+  elif type(value) is uint64:
+    const
+      P04 = 1_0000'u64
+      P05 = 1_0000_0'u64
+      P06 = 1_0000_00'u64
+      P07 = 1_0000_000'u64
+      P08 = 1_0000_0000'u64
+      P09 = 1_0000_0000_0'u64
+      P10 = 1_0000_0000_00'u64
+      P11 = 1_0000_0000_000'u64
+      P12 = 1_0000_0000_0000'u64
+    if value < 10'u64:
+      return 1'i8
+    if value < 100'u64:
+      return 2'i8
+    if value < 1000'u64:
+      return 3'i8
+    if value < P12:
+      if value < P08:
+        if value < P06:
+          if value < P04:
+            return 4'i8
+          return 5'i8 + (if value >= P05: 1'i8 else: 0)
+        return 7'i8 + (if value >= P07: 1'i8 else: 0)
+      if value < P10:
+        return 9'i8 + (if value >= P09: 1'i8 else: 0)
+      return 11'i8 + (if value >= P11: 1'i8 else: 0)
+    return 12'i8 + B.encodedLength(value div P12)
+
+proc encode[A: byte|char](B: typedesc[Base10], value: SomeUnsignedInt,
+                          output: var openarray[A],
+                          length: int8): Result[int8, cstring] =
+  const Digits = cstring(
+    "0001020304050607080910111213141516171819" &
+    "2021222324252627282930313233343536373839" &
+    "4041424344454647484950515253545556575859" &
+    "6061626364656667686970717273747576777879" &
+    "8081828384858687888990919293949596979899"
+  )
+
+  if len(output) < length:
+    return err("Not enough space to store decimal value")
+
+  var v = value
+  var next = length - 1
+
+  while v >= type(value)(100):
+    let index = uint8((v mod type(value)(100)) shl 1)
+    v = v div type(value)(100)
+    when A is char:
+      output[next] = Digits[index + 1]
+      output[next - 1] = Digits[index]
+    else:
+      output[next] = byte(Digits[index + 1])
+      output[next - 1] = byte(Digits[index])
+    dec(next, 2)
+
+  if v < type(value)(10):
+    when A is char:
+      output[next] = char(ord('0') + (v and type(value)(0x0F)))
+    else:
+      output[next] = byte('0') + byte(v and type(value)(0x0F))
+  else:
+    let index = uint8(v) shl 1
+    when A is char:
+      output[next] = Digits[index + 1]
+      output[next - 1] = Digits[index]
+    else:
+      output[next] = byte(Digits[index + 1])
+      output[next - 1] = byte(Digits[index])
+  ok(length)
+
+proc encode*[A: byte|char](B: typedesc[Base10], value: SomeUnsignedInt,
+                           output: var openarray[A]): Result[int8, cstring] =
+  ## Encode integer value to array of characters or bytes.
+  B.encode(value, output, B.encodedLength(value))
+
+proc toString*(B: typedesc[Base10], value: SomeUnsignedInt): string =
+  ## Encode integer value ``value`` to string.
+  var buf = newString(B.encodedLength(value))
+  # Buffer of proper size is allocated, so error is not possible
+  discard B.encode(value, buf, int8(len(buf)))
+  buf
+
+proc toBytes*[I: SomeUnsignedInt](B: typedesc[Base10], v: I): Base10Buf[I] {.
+     noinit.} =
+  ## Encode integer value ``v`` to array of bytes.
+  let res = B.encode(v, result.data, B.encodedLength(v))
+  result.len = int8(res.get())
+
+proc toBytes*[I: SomeUnsignedInt](v: I, B: typedesc[Base10]): Base10Buf[I] {.
+     noinit.} =
+  ## Encode integer value ``v`` to array of bytes.
+  let res = B.encode(v, result.data, B.encodedLength(v))
+  result.len = int8(res.get())
diff --git a/tests/all_tests.nim b/tests/all_tests.nim
index 35e7923..7944902 100644
--- a/tests/all_tests.nim
+++ b/tests/all_tests.nim
@@ -11,6 +11,7 @@ import
   ranges/all,
   test_assign2,
   test_arrayops,
+  test_base10,
   test_base32,
   test_base58,
   test_base64,
diff --git a/tests/test_base10.nim b/tests/test_base10.nim
new file mode 100644
index 0000000..f69095b
--- /dev/null
+++ b/tests/test_base10.nim
@@ -0,0 +1,185 @@
+import unittest
+import ../stew/base10
+
+when defined(nimHasUsed): {.used.}
+
+const
+  DecVectors = [
+    ("0", 0'u64, 1),
+    ("1", 1'u64, 1),
+    ("9", 9'u64, 1),
+    ("10", 10'u64, 2),
+    ("11", 11'u64, 2),
+    ("99", 99'u64, 2),
+    ("100", 100'u64, 3),
+    ("101", 101'u64, 3),
+    ("255", 255'u64, 3), # end of uint8
+    ("256", 256'u64, 3),
+    ("999", 999'u64, 3),
+    ("1000", 1000'u64, 4),
+    ("1001", 1001'u64, 4),
+    ("9999", 9999'u64, 4),
+    ("10000", 10000'u64, 5),
+    ("10001", 10001'u64, 5),
+    ("65535", 65535'u64, 5), # end of uint16
+    ("65536", 65536'u64, 5),
+    ("99999", 99999'u64, 5),
+    ("100000", 100000'u64, 6),
+    ("100001", 100001'u64, 6),
+    ("999999", 999999'u64, 6),
+    ("1000000", 1000000'u64, 7),
+    ("1000001", 1000001'u64, 7),
+    ("9999999", 9999999'u64, 7),
+    ("10000000", 10000000'u64, 8),
+    ("10000001", 10000001'u64, 8),
+    ("99999999", 99999999'u64, 8),
+    ("100000000", 100000000'u64, 9),
+    ("100000001", 100000001'u64, 9),
+    ("999999999", 999999999'u64, 9),
+    ("1000000000", 1000000000'u64, 10),
+    ("1000000001", 1000000001'u64, 10),
+    ("4294967295", 4294967295'u64, 10), # end of uint32
+    ("4294967296", 4294967296'u64, 10),
+    ("9999999999", 9999999999'u64, 10),
+    ("10000000000", 10000000000'u64, 11),
+    ("10000000001", 10000000001'u64, 11),
+    ("99999999999", 99999999999'u64, 11),
+    ("100000000000", 100000000000'u64, 12),
+    ("100000000001", 100000000001'u64, 12),
+    ("999999999999", 999999999999'u64, 12),
+    ("1000000000000", 1000000000000'u64, 13),
+    ("1000000000001", 1000000000001'u64, 13),
+    ("9999999999999", 9999999999999'u64, 13),
+    ("10000000000000", 10000000000000'u64, 14),
+    ("10000000000001", 10000000000001'u64, 14),
+    ("99999999999999", 99999999999999'u64, 14),
+    ("100000000000000", 100000000000000'u64, 15),
+    ("100000000000001", 100000000000001'u64, 15),
+    ("999999999999999", 999999999999999'u64, 15),
+    ("1000000000000000", 1000000000000000'u64, 16),
+    ("1000000000000001", 1000000000000001'u64, 16),
+    ("9999999999999999", 9999999999999999'u64, 16),
+    ("10000000000000000", 10000000000000000'u64, 17),
+    ("10000000000000001", 10000000000000001'u64, 17),
+    ("99999999999999999", 99999999999999999'u64, 17),
+    ("100000000000000000", 100000000000000000'u64, 18),
+    ("100000000000000001", 100000000000000001'u64, 18),
+    ("999999999999999999", 999999999999999999'u64, 18),
+    ("1000000000000000000", 1000000000000000000'u64, 19),
+    ("1000000000000000001", 1000000000000000001'u64, 19),
+    ("9999999999999999999", 9999999999999999999'u64, 19),
+    ("10000000000000000000", 10000000000000000000'u64, 20),
+    ("10000000000000000001", 10000000000000000001'u64, 20),
+    ("18446744073709551615", 18446744073709551615'u64, 20), # end of uint64
+    ("18446744073709551616", 0'u64, 0),
+    ("99999999999999999999", 0'u64, 0)
+  ]
+
+template testVectors(T: typedesc[SomeUnsignedInt]) =
+  let max = uint64(high(T))
+  for item in DecVectors:
+    if (item[1] <= max) and (item[2] != 0):
+      let r1 = Base10.decode(T, item[0])
+      let r2 = Base10.decode(T, cast[seq[byte]](item[0]))
+      check:
+        r1.isOk()
+        r2.isOk()
+        r1.get() == item[1]
+        r2.get() == item[1]
+        Base10.encodedLength(item[1]) == item[2]
+      var outbuf = newSeq[byte](Base10.encodedLength(item[1]))
+      var outstr = newString(Base10.encodedLength(item[1]))
+      let r3 = Base10.encode(T(item[1]), outbuf)
+      let r4 = Base10.encode(T(item[1]), outstr)
+
+      check:
+        r3.isOk()
+        r4.isOk()
+        r3.get() == Base10.encodedLength(item[1])
+        r4.get() == Base10.encodedLength(item[1])
+        cast[string](outbuf) == item[0]
+        outstr == item[0]
+
+      var neoutbuf = newSeq[byte](Base10.encodedLength(item[1]) - 1)
+      var neoutstr = newString(Base10.encodedLength(item[1]) - 1)
+      let r5 = Base10.encode(T(item[1]), neoutbuf)
+      let r6 = Base10.encode(T(item[1]), neoutstr)
+
+      check:
+        r5.isErr()
+        r6.isErr()
+
+    else:
+      # The vector does not fit into ``T``, so decoding it must fail with an
+      # overflow error (empty input is covered by the edge case tests).
+      let r1 = Base10.decode(T, item[0])
+      let r2 = Base10.decode(T, cast[seq[byte]](item[0]))
+      check:
+        r1.isErr()
+        r2.isErr()
+
+template testValues(T: typedesc[SomeUnsignedInt]) =
+  let max = int(min(uint64(high(T)), 100000'u64)) + 1
+  for i in 0 ..< max:
+    let bufstr = Base10.toString(T(i))
+    let bufarr1 = Base10.toBytes(T(i))
+    let bufarr2 = T(i).toBytes(Base10)
+    let r1 = Base10.decode(T, bufstr)
+    let r2 = Base10.decode(T, bufarr1.data.toOpenArray(0, bufarr1.len - 1))
+    let r3 = Base10.decode(T, bufarr2.data.toOpenArray(0, bufarr2.len - 1))
+    check:
+      r1.isOk()
+      r2.isOk()
+      r3.isOk()
+      r1.get() == T(i)
+      r2.get() == T(i)
+      r3.get() == T(i)
+
+
+template testEdge(T: typedesc[SomeUnsignedInt]) =
+  var bufstr: string
+  var bufseq: seq[byte]
+  let r1 = Base10.decode(T, bufstr)
+  let r2 = Base10.decode(T, bufseq)
+  check:
+    r1.isErr()
+    r2.isErr()
+
+  var buf1str = newString(1)
+  var buf1seq = newSeq[byte](1)
+  for i in 0 ..< 256:
+    let ch = char(i)
+    if ch notin {'0'..'9'}:
+      buf1str[0] = ch
+      buf1seq[0] = byte(ch)
+      let r3 = Base10.decode(T, buf1str)
+      let r4 = Base10.decode(T, buf1seq)
+      check:
+        r3.isErr()
+        r4.isErr()
+
+suite "Base10 (decimal) test suite":
+  test "[uint8] encode/decode/length test":
+    testVectors(uint8)
+  test "[uint16] encode/decode/length test":
+    testVectors(uint16)
+  test "[uint32] encode/decode/length test":
+    testVectors(uint32)
+  test "[uint64] encode/decode/length test":
+    testVectors(uint64)
+  test "[uint8] all values comparison test":
+    testValues(uint8)
+  test "[uint16] all values comparison test":
+    testValues(uint16)
+  test "[uint32] 100,000 values comparison test":
+    testValues(uint32)
+  test "[uint64] 100,000 values comparison test":
+    testValues(uint64)
+  test "[uint8] edge cases":
+    testEdge(uint8)
+  test "[uint16] edge cases":
+    testEdge(uint16)
+  test "[uint32] edge cases":
+    testEdge(uint32)
+  test "[uint64] edge cases":
+    testEdge(uint64)