Fix UTF-32 encoder/decoder.

Add tests for UTF-8 to UTF-32 and UTF-32 to UTF-8 encoders.
cheatfate 2020-10-14 12:25:33 +03:00
parent 39fb71bcec
commit 1746bc0095
3 changed files with 194 additions and 43 deletions

@@ -355,7 +355,7 @@ elif defined(posix):
     if maxChars < len(wbuffer):
       wbuffer.setLen(maxChars)
     # Conversion of wide characters sequence to UTF-8 encoded string.
-    let ures = wbuffer.wcharToUtf8()
+    let ures = wbuffer.utf32toUtf8()
    if ures.isOk():
      ok(ures.get())
    else:

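For reference, the call site above now relies on the string-returning utf32toUtf8 overload introduced in the next file. A minimal usage sketch (the buffer contents are illustrative, not taken from the commit):

    # On most POSIX systems wchar_t is 32 bits wide, so a wide-character
    # buffer can be fed to the UTF-32 encoder directly.
    var wbuffer = @[0x41'u32, 0x1F600'u32]    # 'A' followed by U+1F600
    let ures = wbuffer.utf32toUtf8()          # UResult[string]
    if ures.isOk():
      echo ures.get()                         # prints "A😀"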

@@ -12,7 +12,8 @@ export results

 type
   UResult*[T] = Result[T, cstring]
-  Wides* = int16 | uint16 | int32 | uint32
+  Wides32* = int32 | uint32
+  Wides16* = int16 | uint16
   Bytes* = int8 | char | uint8 | byte

 const
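The old Wides class mixed 16-bit and 32-bit limbs; splitting it into Wides32 and Wides16 lets the UTF-32 routines constrain their generic parameters to 32-bit limbs only. A small sketch of what the constraint accepts (using the procs defined below):

    doAssert utf32toUtf8([0x41'i32]).get() == "A"   # int32 satisfies Wides32
    doAssert utf32toUtf8([0x41'u32]).get() == "A"   # uint32 satisfies Wides32
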
@@ -206,72 +207,200 @@ proc utf8Substr*[T: Bytes](data: openarray[T],
       inc(k)
   ok(res)

-proc wcharToUtf8*[A: Wides, B: Bytes](input: openarray[A],
+proc utf32toUtf8*[A: Wides32, B: Bytes](input: openarray[A],
                                         output: var openarray[B]): UResult[int] =
-  ## Converts WCHAR sequence ``input`` to UTF-8 array of octets ``output``.
-  ##
-  ## Procedure supports 4-byte (Linux) and 2-byte sequences (Windows) as input.
+  ## Converts UTF-32 sequence ``input`` to UTF-8 array ``output``.
   var offset = 0
   for item in input:
-    let uitem = uint(item)
     let codepoint =
-      if uitem >= 0xD800'u and uitem <= 0xDBFF'u:
-        0x10000'u + ((uitem - 0xD800'u) shl 10)
-      else:
-        if uitem >= 0xDC00'u and uitem <= 0xDFFF'u:
-          uitem - 0xDC00'u
-        else:
-          uitem
-    if codepoint <= 0x7F'u:
+      block:
+        if (uint32(item) >= 0xD800'u32) and (uint32(item) <= 0xDFFF'u32):
+          # High and low surrogates U+D800 through U+DFFF are prohibited
+          # in UTF-32.
+          return err(ErrorInvalidSequence)
+        elif (uint32(item) == 0xFFFE'u32) or (uint32(item) == 0xFFFF'u32):
+          # These codes are intended for process-internal use and are not
+          # Unicode characters.
+          return err(ErrorInvalidSequence)
+        uint32(item)
+    if codepoint <= 0x7F'u32:
       if len(output) > 0:
         if offset < len(output):
-          output[offset] = cast[B](codepoint and 0x7F'u)
+          output[offset] = cast[B](codepoint and 0x7F'u32)
         else:
           return err(ErrorBufferOverflow)
       inc(offset, 1)
-    elif codepoint <= 0x7FF'u:
+    elif codepoint <= 0x7FF'u32:
       if len(output) > 0:
         if offset + 1 < len(output):
           output[offset + 0] = cast[B](0xC0'u8 or
-            byte((codepoint shr 6) and 0x1F'u))
-          output[offset + 1] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u))
+            byte((codepoint shr 6) and 0x1F'u32))
+          output[offset + 1] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32))
         else:
           return err(ErrorBufferOverflow)
       inc(offset, 2)
-    elif codepoint <= 0xFFFF'u:
+    elif codepoint <= 0xFFFF'u32:
       if len(output) > 0:
         if offset + 2 < len(output):
           output[offset + 0] = cast[B](0xE0'u8 or
-            byte((codepoint shr 12) and 0x0F'u))
+            byte((codepoint shr 12) and 0x0F'u32))
           output[offset + 1] = cast[B](0x80'u8 or
-            byte((codepoint shr 6) and 0x3F'u))
-          output[offset + 2] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u))
+            byte((codepoint shr 6) and 0x3F'u32))
+          output[offset + 2] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32))
         else:
           return err(ErrorBufferOverflow)
       inc(offset, 3)
-    elif codepoint <= 0x10FFFF'u:
+    elif codepoint <= 0x10FFFF'u32:
       if len(output) > 0:
         if offset + 3 < len(output):
           output[offset + 0] = cast[B](0xF0'u8 or
-            byte((codepoint shr 18) and 0x07'u))
+            byte((codepoint shr 18) and 0x07'u32))
           output[offset + 1] = cast[B](0x80'u8 or
-            byte((codepoint shr 12) and 0x3F'u))
+            byte((codepoint shr 12) and 0x3F'u32))
           output[offset + 2] = cast[B](0x80'u8 or
-            byte((codepoint shr 6) and 0x3F'u))
-          output[offset + 3] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u))
+            byte((codepoint shr 6) and 0x3F'u32))
+          output[offset + 3] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32))
         else:
-          return err("")
+          return err(ErrorBufferOverflow)
       inc(offset, 4)
     else:
       return err(ErrorInvalidSequence)
   ok(offset)

-proc wcharToUtf8*[T: Wides](input: openarray[T]): UResult[string] {.inline.} =
-  ## Converts wide character
+proc utf32toUtf8*[T: Wides32](input: openarray[T]): UResult[string] {.inline.} =
+  ## Converts wide character sequence ``input`` to UTF-8 encoded string.
   var empty: array[0, char]
-  let size = ? wcharToUtf8(input, empty)
+  let size = ? utf32ToUtf8(input, empty)
   var output = newString(size)
-  let res {.used.} = ? wcharToUtf8(input, output)
+  let res {.used.} = ? utf32ToUtf8(input, output)
   ok(output)
+proc utf8toUtf32*[A: Bytes, B: Wides32](input: openarray[A],
+                                        output: var openarray[B]): UResult[int] =
+  ## Converts UTF-8 encoded array of characters ``input`` to a UTF-32 encoded
+  ## sequence of 32-bit limbs.
+  ##
+  ## To obtain the required size of ``output``, pass ``output`` as a
+  ## zero-length array; the required size will be returned as the procedure's
+  ## result.
+  ##
+  ## If the size of ``output`` is not zero and there is not enough space in
+  ## ``output`` to store the whole ``input`` array, the error
+  ## ``ErrorBufferOverflow`` will be returned.
+  var index = 0
+  var dindex = 0
+  if len(output) == 0:
+    return utf8Length(input)
+  else:
+    while true:
+      if index >= len(input):
+        break
+      let byte1 = uint32(input[index])
+      inc(index)
+      if (byte1 and 0x80) == 0x00:
+        # One-byte form (0xxxxxxx)
+        if dindex < len(output):
+          output[dindex] = B(byte1)
+          inc(dindex)
+        else:
+          return err(ErrorBufferOverflow)
+      elif (byte1 and 0xE0'u32) == 0xC0'u32:
+        # Two-byte form (110xxxxx 10xxxxxx)
+        if index >= len(input):
+          return err(ErrorInvalidSequence)
+        # overlong sequence test
+        if (byte1 and 0xFE'u32) == 0xC0'u32:
+          return err(ErrorInvalidSequence)
+        let byte2 = uint32(input[index])
+        if (byte2 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+        if dindex < len(output):
+          output[dindex] = B(((byte1 and 0x1F'u32) shl 6) or
+                             (byte2 and 0x3F'u32))
+          inc(dindex)
+        else:
+          return err(ErrorBufferOverflow)
+        inc(index)
+      elif (byte1 and 0xF0'u32) == 0xE0'u32:
+        # Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
+        if (index + 1) >= len(input):
+          return err(ErrorInvalidSequence)
+        let byte2 = uint32(input[index])
+        if (byte2 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+        # overlong sequence test
+        if (byte1 == 0xE0'u32) and ((byte2 and 0xE0'u32) == 0x80'u32):
+          return err(ErrorInvalidSequence)
+        # 0xD800..0xDFFF (UTF-16 surrogates) test
+        if (byte1 == 0xED'u32) and ((byte2 and 0xE0'u32) == 0xA0'u32):
+          return err(ErrorInvalidSequence)
+        let byte3 = uint32(input[index + 1])
+        if (byte3 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+        # U+FFFE or U+FFFF test
+        if (byte1 == 0xEF'u32) and (byte2 == 0xBF'u32) and
+           ((byte3 and 0xFE'u32) == 0xBE'u32):
+          return err(ErrorInvalidSequence)
+        if dindex < len(output):
+          output[dindex] = B(((byte1 and 0x0F'u32) shl 12) or
+                             ((byte2 and 0x3F'u32) shl 6) or
+                             (byte3 and 0x3F'u32))
+          inc(dindex)
+        else:
+          return err(ErrorBufferOverflow)
+        inc(index, 2)
+      elif (byte1 and 0xF8'u32) == 0xF0'u32:
+        # Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+        if (index + 2) >= len(input):
+          return err(ErrorInvalidSequence)
+        let byte2 = uint32(input[index])
+        if (byte2 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+        # overlong sequence test
+        if (byte1 == 0xF0'u32) and ((byte2 and 0xF0'u32) == 0x80'u32):
+          return err(ErrorInvalidSequence)
+        # According to RFC 3629 no code point above U+10FFFF should be used,
+        # which limits characters to four bytes.
+        if ((byte1 == 0xF4'u32) and (byte2 > 0x8F'u32)) or (byte1 > 0xF4'u32):
+          return err(ErrorInvalidSequence)
+        let byte3 = uint32(input[index + 1])
+        if (byte3 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+        let byte4 = uint32(input[index + 2])
+        if (byte4 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+        if dindex < len(output):
+          output[dindex] = B(((byte1 and 0x07'u32) shl 18) or
+                             ((byte2 and 0x3F'u32) shl 12) or
+                             ((byte3 and 0x3F'u32) shl 6) or
+                             (byte4 and 0x3F'u32))
+          inc(dindex)
+        else:
+          return err(ErrorBufferOverflow)
+        inc(index, 3)
+      else:
+        return err(ErrorInvalidSequence)
+  ok(dindex)
+
+proc utf8toUtf32*[A: Bytes, B: Wides32](et: typedesc[B],
+                                        input: openarray[A]): UResult[seq[B]] =
+  ## Converts UTF-8 encoded array of characters ``input`` to a UTF-32 encoded
+  ## sequence of 32-bit limbs and returns it.
+  var empty: array[0, B]
+  let size = ? utf8toUtf32(input, empty)
+  var output = newSeq[B](size)
+  let res {.used.} = ? utf8toUtf32(input, output)
+  ok(output)
+
 when defined(posix):

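The encoder picks one of four byte forms from the magnitude of the code point, and a zero-length ``output`` turns either direction into a pure size query, giving a two-pass API. A hedged sketch of both behaviours (expected octets worked out by hand from the shift/mask logic above):

    var data = [0x41'u32, 0x20AC'u32, 0x1F600'u32]   # A, €, 😀 (1+3+4 octets)

    # Pass 1: a zero-length output only reports the required size.
    var empty: array[0, char]
    doAssert utf32toUtf8(data, empty).get() == 8

    # Pass 2: the convenience overload allocates and encodes in one call.
    let utf8 = utf32toUtf8(data).get()
    doAssert utf8 == "A\xE2\x82\xAC\xF0\x9F\x98\x80"

    # Decoding reverses the transformation exactly.
    doAssert utf8toUtf32(uint32, utf8).get() == @[0x41'u32, 0x20AC'u32,
                                                  0x1F600'u32]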
@@ -302,18 +302,40 @@ suite "UTF-8 validation test suite":
       utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 0).tryGet() == ""
       utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 100).tryGet() == ""

-  test "wcharToUtf8() tests":
+  test "UTF-32 -> UTF-8 conversion test":
     for i in 0 ..< 0x11_0000:
-      if i != 0xFFFE and i != 0xFFFF:
-        if i < 0x10000:
-          var data16 = [uint16(i)]
-          let res = wcharToUtf8(data16)
-          check:
-            res.isOk() == true
-            utf8Validate(res.get()) == true
+      var data32 = [uint32(i)]
+      if i >= 0xD800 and i <= 0xDFFF:
+        check utf32toUtf8(data32).isErr()
+      elif i == 0xFFFE:
+        check utf32toUtf8(data32).isErr()
+      elif i == 0xFFFF:
+        check utf32toUtf8(data32).isErr()
+      elif i == 0x11_0000:
+        check utf32toUtf8(data32).isErr()
       else:
-        var data32 = [uint32(i)]
-        let res = wcharToUtf8(data32)
+        let res = utf32toUtf8(data32)
         check:
           res.isOk() == true
           utf8Validate(res.get()) == true
+
+  test "UTF-8 -> UTF-32 conversion test":
+    for i in 0 ..< 0x11_0001:
+      var data32 = [uint32(i)]
+      if i >= 0xD800 and i <= 0xDFFF:
+        check utf32toUtf8(data32).isErr()
+      elif i == 0xFFFE:
+        check utf32toUtf8(data32).isErr()
+      elif i == 0xFFFF:
+        check utf32toUtf8(data32).isErr()
+      elif i == 0x11_0000:
+        check utf32toUtf8(data32).isErr()
+      else:
+        let res8 = utf32toUtf8(data32)
+        check res8.isOk()
+        let res32 = utf8toUtf32(uint32, res8.get())
+        check:
+          res32.isOk()
+          res32.get() == data32
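
Beyond the exhaustive code-point sweep above, the decoder's individual validation branches can be exercised directly. A small sketch of malformed inputs, chosen to match the checks inside utf8toUtf32 (byte values assumed for illustration, not taken from the test suite):

    var buf: array[4, uint32]
    # Overlong two-byte encoding of '/': caught by the
    # (byte1 and 0xFE) == 0xC0 overlong test.
    doAssert utf8toUtf32([0xC0'u8, 0xAF'u8], buf).isErr()
    # UTF-8-encoded UTF-16 surrogate U+D800: caught by the 0xED/0xA0 test.
    doAssert utf8toUtf32([0xED'u8, 0xA0'u8, 0x80'u8], buf).isErr()
    # Four-byte form above U+10FFFF: caught by the RFC 3629 range test.
    doAssert utf8toUtf32([0xF5'u8, 0x80'u8, 0x80'u8, 0x80'u8], buf).isErr()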