implement practical UTF-8/16 codepoints module
This module is designed for the common cases where a Unicode text representation is converted to a Nim string or blob. It is typically used in a parser or in a validator for streams of Unicode bytes.
This commit is contained in:
parent
92d5a8cc55
commit
51e7e0ecfd
275
stew/utf.nim
275
stew/utf.nim
|
@ -9,33 +9,57 @@
|
|||
|
||||
# DFA based UTF8 decoder/validator
|
||||
# See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
||||
|
||||
import stew/ranges/ptr_arith
|
||||
import stew/results
|
||||
|
||||
type
|
||||
Utf8* = object
|
||||
Utf8* = object
|
||||
Utf16* = object
|
||||
Utf32* = object
|
||||
Utf* = uint32
|
||||
|
||||
const
|
||||
UTF8_ACCEPT* = 0
|
||||
UTF8_REJECT* = 12
|
||||
|
||||
const utf8Table = [
|
||||
# The first part of the table maps bytes to character classes
|
||||
# to reduce the size of the transition table and create bitmasks.
|
||||
0'u8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0 ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0 ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0 ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1 ,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
|
||||
7 ,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
|
||||
8 ,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||
10 ,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
|
||||
highBegin = 0xD800
|
||||
highEnd = 0xDBFF
|
||||
lowBegin = 0xDC00
|
||||
lowEnd = 0xDFFF
|
||||
|
||||
# The second part is a transition table that maps a combination
|
||||
# of a state of the automaton and a character class to a state.
|
||||
0 ,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
|
||||
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
|
||||
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
|
||||
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
|
||||
12,36,12,12,12,12,12,12,12,12,12,12
|
||||
]
|
||||
Utf16Shift = 10
|
||||
Utf16Base = 0x0010000
|
||||
Utf16Mask = 0x3FF
|
||||
Utf16Maxbmp= 0xFFFF
|
||||
MaxUtf = 0x10FFFF
|
||||
|
||||
DefaultReplacement* = 0xFFFD
|
||||
InvalidUTF8 = "invalid UTF-8 sequence"
|
||||
InvalidUTF16 = "invalid UTF-16 sequence"
|
||||
InvalidUTF32 = "invalid UTF-32 sequence"
|
||||
|
||||
const
|
||||
utf8Table = [
|
||||
# The first part of the table maps bytes to character classes
|
||||
# to reduce the size of the transition table and create bitmasks.
|
||||
0'u8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0 ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0 ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0 ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1 ,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
|
||||
7 ,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
|
||||
8 ,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||
10 ,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
|
||||
|
||||
# The second part is a transition table that maps a combination
|
||||
# of a state of the automaton and a character class to a state.
|
||||
0 ,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
|
||||
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
|
||||
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
|
||||
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
|
||||
12,36,12,12,12,12,12,12,12,12,12,12
|
||||
]
|
||||
|
||||
proc validate*[T: byte | char](_: type Utf8, text: openArray[T]): bool =
|
||||
var state = 0
|
||||
|
@ -43,3 +67,214 @@ proc validate*[T: byte | char](_: type Utf8, text: openArray[T]): bool =
|
|||
let x = utf8Table[c.int].int
|
||||
state = utf8Table[256 + state + x].int
|
||||
state == UTF8_ACCEPT
|
||||
|
||||
proc count*[T: byte | char](_: type Utf8,
                            text: openArray[T]): Result[int, string] =
  ## Counts the code points encoded in the UTF-8 `text`.
  ##
  ## Each byte is run through the `utf8Table` DFA; whenever the automaton is
  ## back in `UTF8_ACCEPT` a complete code point has been consumed.
  ## Returns `err(InvalidUTF8)` if the automaton does not end in
  ## `UTF8_ACCEPT` (malformed or truncated input).
  var
    dfa = UTF8_ACCEPT
    codepoints = 0
  for octet in text:
    # First 256 table entries: byte -> character class.
    let charClass = utf8Table[octet.int].int
    # Remaining entries: (state, class) -> next state.
    dfa = utf8Table[256 + dfa + charClass].int
    if dfa == UTF8_ACCEPT:
      inc codepoints
  if dfa != UTF8_ACCEPT:
    return err(InvalidUTF8)
  ok(codepoints)
|
||||
|
||||
proc highSurrogate*(_: type Utf16, c: int): bool =
  ## True when `c` lies in `highBegin .. highEnd` (0xD800 .. 0xDBFF),
  ## i.e. it is a UTF-16 high (leading) surrogate.
  c in highBegin .. highEnd
|
||||
|
||||
proc lowSurrogate*(_: type Utf16, c: int): bool =
  ## True when `c` lies in `lowBegin .. lowEnd` (0xDC00 .. 0xDFFF),
  ## i.e. it is a UTF-16 low (trailing) surrogate.
  c in lowBegin .. lowEnd
|
||||
|
||||
proc utf*(_: type Utf16, c1, c2: int): Utf =
  ## Combines the surrogate pair `c1` (high) / `c2` (low) into one code point.
  ##
  ## No validation is performed here: the arithmetic is applied to whatever
  ## values are passed in.
  let
    upper = (c1 - highBegin) shl Utf16Shift  # offset of the high unit, shifted by 10
    lower = c2 - lowBegin                    # offset of the low unit
  Utf(upper + lower + Utf16Base)
|
||||
|
||||
proc inc*(_: type Utf16, cp: int, res: var int): bool =
  ## Adds to `res` the number of UTF-16 code units needed to encode `cp`:
  ## 1 for a BMP code point, 2 for a supplementary one.
  ##
  ## Returns `false` and leaves `res` untouched when `cp` is a surrogate
  ## (`highBegin .. lowEnd`, i.e. 0xD800 .. 0xDFFF) or is above `MaxUtf`.
  if cp <= Utf16Maxbmp:
    # The whole surrogate block is unencodable, not just up to `lowBegin`.
    if cp >= highBegin and cp <= lowEnd:
      return false
    inc res
  elif cp > MaxUtf:
    return false
  else:
    inc res, 2

  return true
|
||||
|
||||
proc utf16Len*[T: byte | char](_: type Utf8,
                               text: openArray[T]): Result[int, string] =
  ## Computes how many UTF-16 code units are needed to hold the UTF-8 `text`.
  ##
  ## The input is decoded with the `utf8Table` DFA; every completed code
  ## point is sized with `Utf16.inc`. Returns `err(InvalidUTF8)` if the DFA
  ## does not finish in `UTF8_ACCEPT` or `Utf16.inc` refuses a code point.
  var
    state = UTF8_ACCEPT
    cp = 0
    res = 0
  for c in text:
    # Widen once so that all bit arithmetic below is plain `int` arithmetic
    # for both `byte` and `char` inputs.
    let
      b = c.int
      x = utf8Table[b].int
    cp =
      if state != UTF8_ACCEPT:
        # Continuation byte: append its low six bits.
        (b and 0x3f) or (cp shl 6)
      else:
        # Lead byte: mask off the prefix bits according to its class.
        (0xff shr x) and b
    state = utf8Table[256 + state + x].int
    if state == UTF8_ACCEPT:
      if not Utf16.inc(cp, res):
        return err(InvalidUTF8)
  if state == UTF8_ACCEPT:
    ok(res)
  else:
    err(InvalidUTF8)
|
||||
|
||||
proc inc*(_: type Utf8, cp: int, res: var int): bool =
  ## Adds to `res` the number of bytes `cp` occupies in UTF-8 (1 to 4).
  ##
  ## Returns `false` and leaves `res` untouched when `cp` is above `MaxUtf`.
  let width =
    if cp < 0x80: 1
    elif cp < 0x800: 2
    elif cp < 0x10000: 3
    elif cp <= MaxUtf: 4
    else: 0                 # 0 marks "not encodable"
  if width == 0:
    return false
  inc res, width
  true
|
||||
|
||||
proc utf8Len*(_: type Utf32, text: openArray[uint32]): Result[int, string] =
  ## Computes how many bytes are needed to encode the UTF-32 `text` as UTF-8.
  ##
  ## Returns `err(InvalidUTF32)` as soon as a code point is rejected.
  var res = 0
  for cp in text:
    # `Utf8.inc` takes an `int`; `uint32` is not implicitly convertible, so
    # range-check first (also keeps the conversion safe on 32-bit targets).
    if cp > uint32(MaxUtf) or not Utf8.inc(int(cp), res):
      return err(InvalidUTF32)
  ok(res)
|
||||
|
||||
proc utf16Len*(_: type Utf32, text: openArray[uint32]): Result[int, string] =
  ## Computes how many UTF-16 code units are needed to encode the UTF-32
  ## `text`.
  ##
  ## Returns `err(InvalidUTF32)` as soon as a code point is rejected.
  var res = 0
  for cp in text:
    # `Utf16.inc` takes an `int`; `uint32` is not implicitly convertible, so
    # range-check first (also keeps the conversion safe on 32-bit targets).
    if cp > uint32(MaxUtf) or not Utf16.inc(int(cp), res):
      return err(InvalidUTF32)
  ok(res)
|
||||
|
||||
proc utf8Len*(_: type Utf16, text: openArray[uint16]): Result[int, string] =
  ## Computes how many bytes are needed to encode the UTF-16 `text` as UTF-8.
  ##
  ## Returns `err(InvalidUTF16)` for a high surrogate that is not followed by
  ## a low surrogate, for a lone low surrogate, or when `Utf8.inc` rejects
  ## the code point.
  var
    i = 0
    res = 0
  while i < text.len:
    # Work on `int` values: the helpers below take `int` parameters.
    let c1 = int(text[i])
    inc i
    if c1 >= highBegin and c1 <= highEnd:
      # Surrogate pair: a low surrogate must follow.
      if i >= text.len:
        return err(InvalidUTF16)
      let c2 = int(text[i])
      inc i
      if c2 < lowBegin or c2 > lowEnd:
        return err(InvalidUTF16)
      let cp = int(Utf16.utf(c1, c2))
      if not Utf8.inc(cp, res):
        return err(InvalidUTF16)
    elif c1 >= lowBegin and c1 <= lowEnd:
      # Low surrogate without a preceding high surrogate.
      return err(InvalidUTF16)
    elif not Utf8.inc(c1, res):
      # Plain BMP unit; kept in its own branch so that a surrogate pair is
      # counted once and its high unit is not counted a second time.
      return err(InvalidUTF16)

  ok(res)
|
||||
|
||||
proc validate*(_: type Utf16, text: openArray[uint16]): bool =
  ## Checks that `text` is well-formed UTF-16: every high surrogate is
  ## immediately followed by a low surrogate and no low surrogate stands
  ## alone.
  var pos = 0
  while pos < text.len:
    let unit = text[pos]
    inc pos
    if unit >= lowBegin and unit <= lowEnd:
      # Low surrogate with no high surrogate before it.
      return false
    if unit >= highBegin and unit <= highEnd:
      # Need a trailing low surrogate to complete the pair.
      if pos >= text.len:
        return false
      let trail = text[pos]
      if trail < lowBegin or trail > lowEnd:
        return false
      inc pos
  true
|
||||
|
||||
proc validate*[T: byte | char](_: type Utf16, text: openArray[T]): bool =
  ## Validates a byte buffer as UTF-16.
  ##
  ## An empty buffer is valid; a buffer with an odd number of bytes is not.
  ## Otherwise the bytes are viewed as `text.len div 2` `uint16` units via
  ## `makeOpenArray` and handed to the `uint16` overload.
  ## NOTE(review): the units appear to be read in place in host byte order
  ## with no alignment adjustment; confirm against `makeOpenArray`.
  if text.len == 0:
    return true
  if text.len mod 2 != 0:
    return false
  let units = text.len div 2
  Utf16.validate(makeOpenArray(text[0].unsafeAddr, uint16, units))
|
||||
|
||||
proc append*(_: type Utf8, text: var (string | seq[byte]), cp: int): bool =
  ## Appends the UTF-8 encoding of `cp` to `text`.
  ##
  ## The byte width is obtained from `Utf8.inc`; when that rejects `cp` the
  ## proc returns `false` and `text` is left unchanged.
  var width = 0
  if not Utf8.inc(cp, width):
    return false

  # Element type of the destination container.
  when text is string:
    type Unit = char
  else:
    type Unit = byte

  let base = text.len
  text.setLen(base + width)

  case width
  of 1:
    text[base] = Unit(cp)
  of 2:
    text[base] = Unit(0xC0 + (cp shr 6))
    text[base + 1] = Unit(0x80 + (cp and 0x3f))
  of 3:
    text[base] = Unit(0xE0 + (cp shr 12))
    text[base + 1] = Unit(0x80 + ((cp shr 6) and 0x3F))
    text[base + 2] = Unit(0x80 + (cp and 0x3F))
  else:
    text[base] = Unit(0xF0 + (cp shr 18))
    text[base + 1] = Unit(0x80 + ((cp shr 12) and 0x3F))
    text[base + 2] = Unit(0x80 + ((cp shr 6) and 0x3F))
    text[base + 3] = Unit(0x80 + (cp and 0x3F))

  true
|
||||
|
||||
proc append*(_: type Utf8, text: var (string | seq[byte]), c1, c2: int): bool =
  ## Appends the UTF-8 encoding of the code point formed by the UTF-16
  ## surrogate pair `c1` (high) / `c2` (low) to `text`.
  ##
  ## The pair itself is not validated here; the result of `Utf16.utf` is
  ## passed on to the single code point overload, whose result is returned.
  # `Utf16.utf` yields a `Utf` (uint32); the overload below expects `int`.
  Utf8.append(text, int(Utf16.utf(c1, c2)))
|
||||
|
||||
proc append*(_: type Utf16, text: var seq[uint16], cp: int): bool =
  ## Appends the UTF-16 encoding of `cp` to `text`: one unit for a BMP code
  ## point, a surrogate pair for a supplementary one.
  ##
  ## Returns `false` and leaves `text` unchanged when `cp` is a surrogate
  ## (`highBegin .. lowEnd`, i.e. 0xD800 .. 0xDFFF) or is above `MaxUtf`.
  if cp <= Utf16Maxbmp:
    # The whole surrogate block is unencodable, not just up to `lowBegin`.
    if cp >= highBegin and cp <= lowEnd:
      return false
    text.add uint16(cp)
  elif cp > MaxUtf:
    return false
  else:
    # Supplementary plane: split the 20-bit offset into two 10-bit halves.
    let c = cp - Utf16Base
    text.add uint16((c shr Utf16Shift) + highBegin)
    text.add uint16((c and Utf16Mask) + lowBegin)

  return true
|
||||
|
||||
proc append*[T: byte | char](_: type Utf16,
                             res: var seq[uint16],
                             text: openArray[T]): Result[int, string] =
  ## Transcodes the UTF-8 `text` to UTF-16 and appends the units to `res`.
  ##
  ## The required length is first computed with `Utf8.utf16Len`; if that
  ## fails its error is returned and `res` is not modified. On success the
  ## returned value is the number of code units that were appended.
  let r = Utf8.utf16Len(text)
  if r.isErr:
    return r
  var pos = res.len
  res.setLen(pos + r.get())

  var
    state = UTF8_ACCEPT
    cp = 0
  for c in text:
    # Widen once so that all bit arithmetic below is plain `int` arithmetic
    # for both `byte` and `char` inputs.
    let
      b = c.int
      x = utf8Table[b].int
    cp =
      if state != UTF8_ACCEPT:
        # Continuation byte: append its low six bits.
        (b and 0x3f) or (cp shl 6)
      else:
        # Lead byte: mask off the prefix bits according to its class.
        (0xff shr x) and b
    state = utf8Table[256 + state + x].int
    if state == UTF8_ACCEPT:
      if cp <= Utf16Maxbmp:
        res[pos] = uint16(cp)
        inc pos
      else:
        # A surrogate pair encodes the offset from `Utf16Base`, not the raw
        # code point, so subtract the base before splitting.
        let v = cp - Utf16Base
        res[pos + 0] = uint16((v shr Utf16Shift) + highBegin)
        res[pos + 1] = uint16((v and Utf16Mask) + lowBegin)
        inc pos, 2

  return r
|
||||
|
|
|
@ -27,4 +27,5 @@ import
|
|||
test_sequtils2,
|
||||
test_results,
|
||||
test_varints,
|
||||
test_winacl
|
||||
test_winacl,
|
||||
test_utf
|
||||
|
|
|
@ -65,3 +65,32 @@ suite "UTF-8 DFA validator":
|
|||
Utf8.validate("foob\xc3\xa6r")
|
||||
Utf8.validate("foob\xf0\x9f\x99\x88r")
|
||||
|
||||
test "boundary test":
  # Multi-byte sample text.
  check Utf8.validate("κόσμε")

  # First sequence of each length; 5- and 6-byte forms must be rejected.
  check Utf8.validate("\xC2\x80")
  check Utf8.validate("\xE0\xA0\x80")
  check Utf8.validate("\xF0\x90\x80\x80")
  check not Utf8.validate("\xF8\x88\x80\x80\x80")
  check not Utf8.validate("\xFC\x84\x80\x80\x80\x80")

  # Last sequence of each length; anything past U+10FFFF must be rejected.
  check Utf8.validate("\x7F")
  check Utf8.validate("\xDF\xBF")
  check Utf8.validate("\xEF\xBF\xBF")
  check Utf8.validate("\xF4\x8F\xBF\xBF")
  check not Utf8.validate("\xF4\x90\x80\x80")
  check not Utf8.validate("\xFB\xBF\xBF\xBF\xBF")
  check not Utf8.validate("\xFD\xBF\xBF\xBF\xBF\xBF")

  # Other boundaries: U+D7FF, U+E000 and U+FFFD.
  check Utf8.validate("\xed\x9f\xbf")
  check Utf8.validate("\xee\x80\x80")
  check Utf8.validate("\xef\xbf\xbd")
|
||||
|
||||
#[
|
||||
import unicode, strutils
|
||||
func toHex(s: string): string =
|
||||
for c in s:
|
||||
result.add toHex(c.int, 2)
|
||||
|
||||
|
||||
echo toUTF8(0x110000.Rune).toHex
|
||||
|
||||
]#
|
Loading…
Reference in New Issue