add utf-8 validator

2025-02-13 12:36:57 +00:00 · 2021-07-10 11:12:27 +07:00 · 2021-07-10 11:12:27 +07:00 · 92d5a8cc55
commit 92d5a8cc55
parent 3c91b8694e
2 changed files with 112 additions and 0 deletions
--- a/stew/utf.nim
+++ b/stew/utf.nim
@ -0,0 +1,45 @@
+## utf
+## Copyright (c) 2021 Status Research & Development GmbH
+## Licensed under either of
+##  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
+##  * MIT license ([LICENSE-MIT](LICENSE-MIT))
+## at your option.
+## This file may not be copied, modified, or distributed except according to
+## those terms.
+
+# DFA based UTF8 decoder/validator
+# See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+type
+  Utf8* = object
+    ## Marker type with no fields. It is passed as a `type` argument to
+    ## select the UTF-8 routines of this module, e.g. `Utf8.validate(text)`.
+
+const
+  UTF8_ACCEPT* = 0
+    ## DFA state in which every byte consumed so far belongs to a complete,
+    ## valid sequence. `validate` starts in state 0 and returns `true` only
+    ## when it also ends in this state.
+  UTF8_REJECT* = 12
+    ## DFA state for malformed input. Every transition stored in `utf8Table`
+    ## for this state leads back to it.
+
+# Lookup table of the UTF-8 DFA: 256 byte-to-class entries followed by
+# 9 rows of 12 state transitions (364 entries in total, typed `uint8` by
+# the first literal). State values are multiples of 12, i.e. the offset of
+# the state's row inside the transition part.
+const utf8Table = [
+  # The first part of the table maps each byte value (0x00..0xff) to one of
+  # 12 character classes (0..11). Grouping bytes into classes reduces the
+  # size of the transition table. Each source line covers 32 byte values.
+  0'u8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0   ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0   ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0   ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  1   ,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+  7   ,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  8   ,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  10  ,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+  # The second part is a transition table that maps a combination
+  # of a state of the automaton and a character class to a state.
+  # The next state is the entry at index `256 + state + class`; each group
+  # of 12 values below is the row of one state (0, 12, 24, ..., 96).
+  0 ,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+  12,36,12,12,12,12,12,12,12,12,12,12
+]
+
+func validate*[T: byte | char](_: type Utf8, text: openArray[T]): bool =
+  ## Returns `true` when `text`, taken as a whole, is well-formed UTF-8.
+  ##
+  ## Every element of `text` is fed through the DFA stored in `utf8Table`,
+  ## starting from `UTF8_ACCEPT`. Empty input is valid. Input that stops in
+  ## the middle of a multi-byte sequence is invalid, because the automaton
+  ## is then not back in the accept state.
+  ##
+  ## The first parameter only selects the overload (`Utf8.validate(text)`);
+  ## its value is not used.
+  var state = UTF8_ACCEPT
+  for c in text:
+    # Map the byte to its character class, then follow the transition for
+    # (state, class). State values are row offsets into the transition part
+    # of the table, so `state + charClass` can be used as an index directly.
+    let charClass = int(utf8Table[int(c)])
+    state = int(utf8Table[256 + state + charClass])
+    if state == UTF8_REJECT:
+      # Every transition out of the reject state leads back to it, so the
+      # result cannot change any more; skip the rest of the input.
+      return false
+  state == UTF8_ACCEPT
--- a/tests/test_utf.nim
+++ b/tests/test_utf.nim
@ -0,0 +1,67 @@
+## utf
+## Copyright (c) 2021 Status Research & Development GmbH
+## Licensed under either of
+##  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
+##  * MIT license ([LICENSE-MIT](LICENSE-MIT))
+## at your option.
+## This file may not be copied, modified, or distributed except according to
+## those terms.
+
+import
+  std/[unittest],
+  ../stew/utf
+
+# Exercises `Utf8.validate` with well-formed sequences of every length,
+# truncated sequences, invalid continuation bytes and overlong encodings.
+suite "UTF-8 DFA validator":
+  test "single octet":
+    check:
+      Utf8.validate("\x01")
+      Utf8.validate("\x32")
+      Utf8.validate("\x7f")
+      Utf8.validate("\x80") == false
+
+  test "two octets":
+    check:
+      Utf8.validate("\xc2\x80")
+      Utf8.validate("\xc4\x80")
+      Utf8.validate("\xdf\xbf")
+      # Lead byte followed by a byte that is not a continuation byte.
+      Utf8.validate("\xdf\xc0") == false
+      Utf8.validate("\xdf") == false
+
+  test "three octets":
+    check:
+      Utf8.validate("\xe0\xa0\x80")
+      Utf8.validate("\xe1\x80\x80")
+      Utf8.validate("\xef\xbf\xbf")
+      Utf8.validate("\xef\xbf\xc0") == false
+      Utf8.validate("\xef\xbf") == false
+
+  test "four octets":
+    check:
+      Utf8.validate("\xf0\x90\x80\x80")
+      Utf8.validate("\xf0\x92\x80\x80")
+      Utf8.validate("\xf0\x9f\xbf\xbf")
+      Utf8.validate("\xf0\x9f\xbf\xc0") == false
+      Utf8.validate("\xf0\x9f\xbf") == false
+
+  test "overlong sequence":
+    check:
+      Utf8.validate("\xc0\xaf") == false
+      Utf8.validate("\xe0\x80\xaf") == false
+      Utf8.validate("\xf0\x80\x80\xaf") == false
+      Utf8.validate("\xf8\x80\x80\x80\xaf") == false
+      Utf8.validate("\xfc\x80\x80\x80\x80\xaf") == false
+
+  test "max overlong sequence":
+    check:
+      Utf8.validate("\xc1\xbf") == false
+      Utf8.validate("\xe0\x9f\xbf") == false
+      Utf8.validate("\xf0\x8f\xbf\xbf") == false
+      Utf8.validate("\xf8\x87\xbf\xbf\xbf") == false
+      Utf8.validate("\xfc\x83\xbf\xbf\xbf\xbf") == false
+
+  test "distinct codepoint":
+    check:
+      Utf8.validate("foobar")
+      Utf8.validate("foob\xc3\xa6r")
+      Utf8.validate("foob\xf0\x9f\x99\x88r")
+
+  test "boundaries":
+    check:
+      # Empty input contains no malformed sequence.
+      Utf8.validate("")
+      # U+10FFFF is the last code point; U+110000 is out of range.
+      Utf8.validate("\xf4\x8f\xbf\xbf")
+      Utf8.validate("\xf4\x90\x80\x80") == false
+      # U+D7FF is valid; U+D800 starts the surrogate range and is not.
+      Utf8.validate("\xed\x9f\xbf")
+      Utf8.validate("\xed\xa0\x80") == false
+
+  test "byte input":
+    check:
+      Utf8.validate([0xc3'u8, 0xa6'u8])
+      Utf8.validate([0xc3'u8]) == false
+