mirror of
https://github.com/status-im/nim-stew.git
synced 2025-01-23 10:29:44 +00:00
Fix UTF-32 encoder/decoder.
Add tests for UTF-8 to UTF-32 and UTF-32 to UTF-8 encoders.
This commit is contained in:
parent
39fb71bcec
commit
1746bc0095
@ -355,7 +355,7 @@ elif defined(posix):
|
||||
if maxChars < len(wbuffer):
|
||||
wbuffer.setLen(maxChars)
|
||||
# Conversion of wide characters sequence to UTF-8 encoded string.
|
||||
let ures = wbuffer.wcharToUtf8()
|
||||
let ures = wbuffer.utf32toUtf8()
|
||||
if ures.isOk():
|
||||
ok(ures.get())
|
||||
else:
|
||||
|
193
stew/utf8.nim
193
stew/utf8.nim
@ -12,7 +12,8 @@ export results
|
||||
|
||||
type
|
||||
UResult*[T] = Result[T, cstring]
|
||||
Wides* = int16 | uint16 | int32 | uint32
|
||||
Wides32* = int32 | uint32
|
||||
Wides16* = int16 | uint16
|
||||
Bytes* = int8 | char | uint8 | byte
|
||||
|
||||
const
|
||||
@ -206,72 +207,200 @@ proc utf8Substr*[T: Bytes](data: openarray[T],
|
||||
inc(k)
|
||||
ok(res)
|
||||
|
||||
proc utf32toUtf8*[A: Wides32, B: Bytes](input: openarray[A],
                                        output: var openarray[B]): UResult[int] =
  ## Converts UTF-32 sequence ``input`` to UTF-8 array of octets ``output``.
  ##
  ## To obtain the required size of ``output``, pass a zero-length array as
  ## ``output``: nothing is stored and the number of octets needed to encode
  ## ``input`` is returned.
  ##
  ## On success the number of produced octets is returned. Possible errors:
  ##
  ## * ``ErrorInvalidSequence`` - ``input`` contains a surrogate
  ##   (U+D800 through U+DFFF), U+FFFE, U+FFFF or a value above U+10FFFF.
  ## * ``ErrorBufferOverflow`` - ``output`` is not empty, but too small to
  ##   hold the encoded ``input``.
  var offset = 0
  for item in input:
    # Reinterpret the bits instead of converting the value: a negative
    # ``int32`` limb becomes a value above U+10FFFF and is rejected by the
    # final ``else`` branch, independent of how signed -> unsigned
    # conversions are checked.
    let codepoint = cast[uint32](item)

    if (codepoint >= 0xD800'u32) and (codepoint <= 0xDFFF'u32):
      # High and low surrogates U+D800 through U+DFFF are prohibited in
      # UTF-32.
      return err(ErrorInvalidSequence)
    if (codepoint == 0xFFFE'u32) or (codepoint == 0xFFFF'u32):
      # These codes are intended for process-internal use and are not
      # Unicode characters.
      return err(ErrorInvalidSequence)

    # In every branch below a zero-length ``output`` means "size query":
    # only ``offset`` is advanced and nothing is written.
    if codepoint <= 0x7F'u32:
      # One octet: 0xxxxxxx
      if len(output) > 0:
        if offset < len(output):
          output[offset] = cast[B](byte(codepoint and 0x7F'u32))
        else:
          return err(ErrorBufferOverflow)
      inc(offset, 1)
    elif codepoint <= 0x7FF'u32:
      # Two octets: 110xxxxx 10xxxxxx
      if len(output) > 0:
        if offset + 1 < len(output):
          output[offset + 0] = cast[B](0xC0'u8 or
                                       byte((codepoint shr 6) and 0x1F'u32))
          output[offset + 1] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32))
        else:
          return err(ErrorBufferOverflow)
      inc(offset, 2)
    elif codepoint <= 0xFFFF'u32:
      # Three octets: 1110xxxx 10xxxxxx 10xxxxxx
      if len(output) > 0:
        if offset + 2 < len(output):
          output[offset + 0] = cast[B](0xE0'u8 or
                                       byte((codepoint shr 12) and 0x0F'u32))
          output[offset + 1] = cast[B](0x80'u8 or
                                       byte((codepoint shr 6) and 0x3F'u32))
          output[offset + 2] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32))
        else:
          return err(ErrorBufferOverflow)
      inc(offset, 3)
    elif codepoint <= 0x10FFFF'u32:
      # Four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      if len(output) > 0:
        if offset + 3 < len(output):
          output[offset + 0] = cast[B](0xF0'u8 or
                                       byte((codepoint shr 18) and 0x07'u32))
          output[offset + 1] = cast[B](0x80'u8 or
                                       byte((codepoint shr 12) and 0x3F'u32))
          output[offset + 2] = cast[B](0x80'u8 or
                                       byte((codepoint shr 6) and 0x3F'u32))
          output[offset + 3] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32))
        else:
          return err(ErrorBufferOverflow)
      inc(offset, 4)
    else:
      # Values above U+10FFFF are not Unicode code points.
      return err(ErrorInvalidSequence)
  ok(offset)
|
||||
|
||||
proc utf32toUtf8*[T: Wides32](input: openarray[T]): UResult[string] {.inline.} =
  ## Converts UTF-32 sequence ``input`` to UTF-8 encoded string.
  ##
  ## Any error reported by the array-based ``utf32toUtf8`` overload is
  ## propagated to the caller.
  # First pass with a zero-length buffer only computes the required size.
  var empty: array[0, char]
  let size = ? utf32toUtf8(input, empty)
  # Second pass encodes into a string of exactly that size.
  var output = newString(size)
  let res {.used.} = ? utf32toUtf8(input, output)
  ok(output)
|
||||
|
||||
proc utf8toUtf32*[A: Bytes, B: Wides32](input: openarray[A],
                                        output: var openarray[B]): UResult[int] =
  ## Convert UTF-8 encoded array of characters ``input`` to UTF-32 encoded
  ## sequence of 32bit limbs.
  ##
  ## To obtain the required size of ``output``, pass a zero-length array as
  ## ``output``; in that case the result of ``utf8Length(input)`` is returned
  ## and nothing is decoded.
  ##
  ## If the size of ``output`` is not zero and there is not enough space in
  ## ``output`` to store the whole decoded ``input``, error
  ## ``ErrorBufferOverflow`` is returned.
  ##
  ## ``ErrorInvalidSequence`` is returned for truncated sequences, invalid
  ## leading or continuation octets, overlong forms, encoded UTF-16
  ## surrogates, U+FFFE, U+FFFF and code points above U+10FFFF.
  ##
  ## On success the number of limbs stored in ``output`` is returned.
  template octet(pos: int): uint32 =
    # Reinterpret the element as an unsigned octet before widening, so a
    # signed element type (``int8``) with the high bit set is not
    # sign-extended; the exact-value comparisons below rely on this.
    uint32(cast[uint8](input[pos]))

  if len(output) == 0:
    return utf8Length(input)

  var index = 0
  var dindex = 0
  while index < len(input):
    let byte1 = octet(index)
    inc(index)

    if (byte1 and 0x80'u32) == 0x00'u32:
      # One-byte form (0xxxxxxx)
      if dindex < len(output):
        output[dindex] = B(byte1)
        inc(dindex)
      else:
        return err(ErrorBufferOverflow)

    elif (byte1 and 0xE0'u32) == 0xC0'u32:
      # Two-byte form (110xxxxx 10xxxxxx)
      if index >= len(input):
        return err(ErrorInvalidSequence)
      # Overlong sequence test: leading octets 0xC0 and 0xC1 would encode
      # values below U+0080.
      if (byte1 and 0xFE'u32) == 0xC0'u32:
        return err(ErrorInvalidSequence)

      let byte2 = octet(index)
      if (byte2 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)

      if dindex < len(output):
        output[dindex] = B(((byte1 and 0x1F'u32) shl 6) or
                           (byte2 and 0x3F'u32))
        inc(dindex)
      else:
        return err(ErrorBufferOverflow)
      inc(index)

    elif (byte1 and 0xF0'u32) == 0xE0'u32:
      # Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
      if (index + 1) >= len(input):
        return err(ErrorInvalidSequence)

      let byte2 = octet(index)
      if (byte2 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)
      # Overlong sequence test
      if (byte1 == 0xE0'u32) and ((byte2 and 0xE0'u32) == 0x80'u32):
        return err(ErrorInvalidSequence)
      # 0xD800-0xDFFF (UTF-16 surrogates) test
      if (byte1 == 0xED'u32) and ((byte2 and 0xE0'u32) == 0xA0'u32):
        return err(ErrorInvalidSequence)

      let byte3 = octet(index + 1)
      if (byte3 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)
      # U+FFFE or U+FFFF test (EF BF BE / EF BF BF)
      if (byte1 == 0xEF'u32) and (byte2 == 0xBF'u32) and
         ((byte3 and 0xFE'u32) == 0xBE'u32):
        return err(ErrorInvalidSequence)

      if dindex < len(output):
        output[dindex] = B(((byte1 and 0x0F'u32) shl 12) or
                           ((byte2 and 0x3F'u32) shl 6) or
                           (byte3 and 0x3F'u32))
        inc(dindex)
      else:
        return err(ErrorBufferOverflow)
      inc(index, 2)

    elif (byte1 and 0xF8'u32) == 0xF0'u32:
      # Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
      if (index + 2) >= len(input):
        return err(ErrorInvalidSequence)

      let byte2 = octet(index)
      if (byte2 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)
      # Overlong sequence test
      if (byte1 == 0xF0'u32) and ((byte2 and 0xF0'u32) == 0x80'u32):
        return err(ErrorInvalidSequence)
      # According to RFC 3629 no point above U+10FFFF should be used, which
      # limits characters to four bytes.
      if ((byte1 == 0xF4'u32) and (byte2 > 0x8F'u32)) or (byte1 > 0xF4'u32):
        return err(ErrorInvalidSequence)

      let byte3 = octet(index + 1)
      if (byte3 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)

      let byte4 = octet(index + 2)
      if (byte4 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)

      if dindex < len(output):
        output[dindex] = B(((byte1 and 0x07'u32) shl 18) or
                           ((byte2 and 0x3F'u32) shl 12) or
                           ((byte3 and 0x3F'u32) shl 6) or
                           (byte4 and 0x3F'u32))
        inc(dindex)
      else:
        return err(ErrorBufferOverflow)
      inc(index, 3)

    else:
      # Stray continuation octet (10xxxxxx) or leading octet 0xF8 and above.
      return err(ErrorInvalidSequence)

  ok(dindex)
|
||||
|
||||
proc utf8toUtf32*[A: Bytes, B: Wides32](et: typedesc[B],
                                        input: openarray[A]): UResult[seq[B]] =
  ## Decode UTF-8 encoded ``input`` into a freshly allocated sequence of
  ## 32bit limbs of type ``B`` and return it.
  ##
  ## Errors of the array-based ``utf8toUtf32`` overload are propagated.
  # Size query: a zero-length destination makes the callee report the count.
  var sizer: array[0, B]
  let count = ? utf8toUtf32(input, sizer)
  # Real decoding pass into a sequence of exactly ``count`` limbs.
  var limbs = newSeq[B](count)
  let stored {.used.} = ? utf8toUtf32(input, limbs)
  ok(limbs)
|
||||
|
||||
when defined(posix):
|
||||
|
@ -302,18 +302,40 @@ suite "UTF-8 validation test suite":
|
||||
utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 0).tryGet() == ""
|
||||
utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 100).tryGet() == ""
|
||||
|
||||
test "UTF-32 -> UTF-8 conversion test":
  # The upper bound is inclusive on purpose: 0x11_0000 is the first value
  # past the Unicode range, and its rejection is only exercised when the
  # loop actually reaches it.
  for i in 0 .. 0x11_0000:
    var data32 = [uint32(i)]
    if i >= 0xD800 and i <= 0xDFFF:
      # UTF-16 surrogates must be rejected.
      check utf32toUtf8(data32).isErr()
    elif i == 0xFFFE:
      check utf32toUtf8(data32).isErr()
    elif i == 0xFFFF:
      check utf32toUtf8(data32).isErr()
    elif i == 0x11_0000:
      # Above the last valid code point U+10FFFF.
      check utf32toUtf8(data32).isErr()
    else:
      # Every other value must encode to well-formed UTF-8.
      let res = utf32toUtf8(data32)
      check:
        res.isOk() == true
        utf8Validate(res.get()) == true
|
||||
|
||||
test "UTF-8 -> UTF-32 conversion test":
  # Round-trip every accepted value through UTF-8 and back; values the
  # encoder must refuse are only checked for the refusal.
  for i in 0 ..< 0x11_0001:
    var data32 = [uint32(i)]
    let rejected =
      (i >= 0xD800 and i <= 0xDFFF) or
      (i == 0xFFFE) or (i == 0xFFFF) or (i == 0x11_0000)
    if rejected:
      check utf32toUtf8(data32).isErr()
    else:
      let res8 = utf32toUtf8(data32)
      check res8.isOk()
      let res32 = utf8toUtf32(uint32, res8.get())
      check:
        res32.isOk()
        res32.get() == data32
|
||||
|
Loading…
x
Reference in New Issue
Block a user