mirror of
https://github.com/status-im/nim-stew.git
synced 2025-02-13 12:36:57 +00:00
add utf-8 validator
This commit is contained in:
parent
3c91b8694e
commit
92d5a8cc55
45
stew/utf.nim
Normal file
45
stew/utf.nim
Normal file
@ -0,0 +1,45 @@
|
||||
## utf
|
||||
## Copyright (c) 2021 Status Research & Development GmbH
|
||||
## Licensed under either of
|
||||
## * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
|
||||
## * MIT license ([LICENSE-MIT](LICENSE-MIT))
|
||||
## at your option.
|
||||
## This file may not be copied, modified, or distributed except according to
|
||||
## those terms.
|
||||
|
||||
# DFA based UTF8 decoder/validator
|
||||
# See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
||||
type
  # Marker type that only serves as a namespace: the helpers in this module
  # are called through it, e.g. `Utf8.validate(text)`.
  Utf8* = object

const
  # Start state of the automaton; being in it after the last byte means every
  # sequence seen so far was complete and well-formed.
  UTF8_ACCEPT* = 0
  # State entered on the first malformed byte. Every transition out of it
  # leads back to it, so once reached the input can never become valid.
  UTF8_REJECT* = 12

const utf8Table = [
  # The first part of the table (256 entries, indexed by byte value) maps
  # bytes to character classes to reduce the size of the transition table
  # and create bitmasks.
  0'u8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 00..1f
  0   ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 20..3f
  0   ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 40..5f
  0   ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 60..7f
  1   ,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, # 80..9f
  7   ,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, # a0..bf
  8   ,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, # c0..df
  10  ,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, # e0..ff

  # The second part is a transition table that maps a combination
  # of a state of the automaton and a character class to a state.
  # There are 12 character classes and states are stored pre-multiplied by
  # 12, so the next state lives at offset `256 + state + class`.
  0 ,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12
]

func validate*[T: byte | char](_: type Utf8, text: openArray[T]): bool =
  ## Returns `true` when `text` drives the UTF-8 DFA from `UTF8_ACCEPT` back
  ## to `UTF8_ACCEPT`, i.e. it consists solely of complete, well-formed UTF-8
  ## sequences. Empty input is valid. Input that ends in the middle of a
  ## multi-byte sequence, or that contains a byte the automaton rejects,
  ## yields `false`.
  ##
  ## `text` may be given as bytes or chars (`string`, `seq`, `array`, ...).
  var state = UTF8_ACCEPT
  for c in text:
    let charClass = utf8Table[c.int].int
    state = utf8Table[256 + state + charClass].int
    if state == UTF8_REJECT:
      # The reject state is absorbing, so the remaining bytes cannot change
      # the outcome - stop scanning right away.
      return false
  state == UTF8_ACCEPT
|
67
tests/test_utf.nim
Normal file
67
tests/test_utf.nim
Normal file
@ -0,0 +1,67 @@
|
||||
## utf
|
||||
## Copyright (c) 2021 Status Research & Development GmbH
|
||||
## Licensed under either of
|
||||
## * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
|
||||
## * MIT license ([LICENSE-MIT](LICENSE-MIT))
|
||||
## at your option.
|
||||
## This file may not be copied, modified, or distributed except according to
|
||||
## those terms.
|
||||
|
||||
import
|
||||
std/[unittest],
|
||||
../stew/utf
|
||||
|
||||
suite "UTF-8 DFA validator":
  # Each test feeds hand-picked byte sequences to `Utf8.validate` and checks
  # whether they are accepted or rejected.

  test "empty input":
    check:
      Utf8.validate("")

  test "single octet":
    check:
      Utf8.validate("\x01")
      Utf8.validate("\x32")
      Utf8.validate("\x7f")
      # A lone continuation byte is not a valid sequence.
      Utf8.validate("\x80") == false

  test "two octets":
    check:
      Utf8.validate("\xc2\x80")
      Utf8.validate("\xc4\x80")
      Utf8.validate("\xdf\xbf")
      # Lead byte followed by a non-continuation byte.
      Utf8.validate("\xdf\xc0") == false
      # Truncated sequence.
      Utf8.validate("\xdf") == false

  test "three octets":
    check:
      Utf8.validate("\xe0\xa0\x80")
      Utf8.validate("\xe1\x80\x80")
      Utf8.validate("\xef\xbf\xbf")
      Utf8.validate("\xef\xbf\xc0") == false
      Utf8.validate("\xef\xbf") == false

  test "four octets":
    check:
      Utf8.validate("\xf0\x90\x80\x80")
      Utf8.validate("\xf0\x92\x80\x80")
      Utf8.validate("\xf0\x9f\xbf\xbf")
      Utf8.validate("\xf0\x9f\xbf\xc0") == false
      Utf8.validate("\xf0\x9f\xbf") == false

  test "overlong sequence":
    check:
      Utf8.validate("\xc0\xaf") == false
      Utf8.validate("\xe0\x80\xaf") == false
      Utf8.validate("\xf0\x80\x80\xaf") == false
      Utf8.validate("\xf8\x80\x80\x80\xaf") == false
      Utf8.validate("\xfc\x80\x80\x80\x80\xaf") == false

  test "max overlong sequence":
    check:
      Utf8.validate("\xc1\xbf") == false
      Utf8.validate("\xe0\x9f\xbf") == false
      Utf8.validate("\xf0\x8f\xbf\xbf") == false
      Utf8.validate("\xf8\x87\xbf\xbf\xbf") == false
      Utf8.validate("\xfc\x83\xbf\xbf\xbf\xbf") == false

  test "surrogates and out of range":
    check:
      # U+D7FF and U+E000 sit right next to the surrogate block.
      Utf8.validate("\xed\x9f\xbf")
      Utf8.validate("\xee\x80\x80")
      # U+D800 and U+DFFF encoded as three octets.
      Utf8.validate("\xed\xa0\x80") == false
      Utf8.validate("\xed\xbf\xbf") == false
      # U+10FFFF is the last valid code point; U+110000 is past the end.
      Utf8.validate("\xf4\x8f\xbf\xbf")
      Utf8.validate("\xf4\x90\x80\x80") == false

  test "byte input":
    check:
      Utf8.validate([0xe2'u8, 0x82, 0xac])
      Utf8.validate([0xe2'u8, 0x82]) == false

  test "distinct codepoint":
    check:
      Utf8.validate("foobar")
      Utf8.validate("foob\xc3\xa6r")
      Utf8.validate("foob\xf0\x9f\x99\x88r")
|
||||
|
Loading…
x
Reference in New Issue
Block a user