mirror of
https://github.com/status-im/nim-stew.git
synced 2025-02-02 15:23:49 +00:00
Add UTF-8 length procedure.
Add UTF-8 offset procedure. Add UTF-8 substr procedure. Add wchar_t to UTF-8 conversion procedure. Add multibyte to wchar_t conversion procedure (posix). Add UTF-8 tests. Fix password reader to validate utf-8 encoding when reading from pipe. Fix password reader to read utf-8 encoded strings from *nix console.
This commit is contained in:
parent
b0bbeb49d2
commit
39fb71bcec
@ -7,7 +7,7 @@
|
||||
## those terms.
|
||||
|
||||
## This module implements cross-platform console procedures.
|
||||
import io2
|
||||
import io2, utf8
|
||||
export io2
|
||||
|
||||
when defined(windows):
|
||||
@ -62,6 +62,7 @@ when defined(windows):
|
||||
ENABLE_PROCESSED_INPUT = 0x0001'u32
|
||||
ENABLE_ECHO_INPUT = 0x0004'u32
|
||||
FILE_TYPE_CHAR = 0x0002'u32
|
||||
ERROR_NO_UNICODE_TRANSLATION = 1113'u32
|
||||
|
||||
proc isConsoleRedirected*(hConsole: uint): bool =
|
||||
## Returns ``true`` if console handle was redirected.
|
||||
@ -73,7 +74,7 @@ when defined(windows):
|
||||
else:
|
||||
true
|
||||
|
||||
proc readConsoleInput(maxBytes: int): IoResult[string] =
|
||||
proc readConsoleInput(maxChars: int): IoResult[string] =
|
||||
let hConsoleInput =
|
||||
block:
|
||||
let res = getStdHandle(STD_INPUT_HANDLE)
|
||||
@ -94,8 +95,9 @@ when defined(windows):
|
||||
if setConsoleCP(CP_UTF8) == 0'i32:
|
||||
return err(ioLastError())
|
||||
|
||||
# Allocating buffer with size equal to `maxBytes` + len(CRLF)
|
||||
var buffer = newString(maxBytes + 2)
|
||||
# Allocating buffer with size equal to `(maxChars + len(CRLF)) * 4`,
|
||||
# where 4 is maximum expected size of one character (UTF8 encoding).
|
||||
var buffer = newString((maxChars + 2) * 4)
|
||||
let bytesToRead = uint32(len(buffer))
|
||||
var bytesRead: uint32
|
||||
let rres = readFile(hConsoleInput, cast[pointer](addr buffer[0]),
|
||||
@ -109,7 +111,7 @@ when defined(windows):
|
||||
return err(ioLastError())
|
||||
|
||||
# Truncate additional bytes from buffer.
|
||||
buffer.setLen(int(min(bytesRead, uint32(maxBytes))))
|
||||
buffer.setLen(int(bytesRead))
|
||||
|
||||
# Trim CR/CRLF from buffer.
|
||||
if len(buffer) > 0:
|
||||
@ -123,7 +125,13 @@ when defined(windows):
|
||||
buffer.setLen(len(buffer) - 1)
|
||||
elif buffer[^1] == char(0x0D):
|
||||
buffer.setLen(len(buffer) - 1)
|
||||
ok(buffer)
|
||||
|
||||
# Check if buffer is valid UTF-8 encoded string.
|
||||
if utf8Validate(buffer):
|
||||
# Cut result buffer to `maxChars` characters.
|
||||
ok(utf8Substr(buffer, 0, maxChars - 1).get())
|
||||
else:
|
||||
err(IoErrorCode(ERROR_NO_UNICODE_TRANSLATION))
|
||||
else:
|
||||
let prevMode =
|
||||
block:
|
||||
@ -147,8 +155,8 @@ when defined(windows):
|
||||
discard setConsoleCP(prevInputCP)
|
||||
return err(errCode)
|
||||
|
||||
# Allocating buffer with size equal to `maxBytes` + len(CRLF)
|
||||
var buffer = newSeq[Utf16Char](maxBytes + 2)
|
||||
# Allocating buffer with size equal to `maxChars` + len(CRLF).
|
||||
var buffer = newSeq[Utf16Char](maxChars + 2)
|
||||
let charsToRead = uint32(len(buffer))
|
||||
var charsRead: uint32
|
||||
let rres = readConsole(hConsoleInput, cast[pointer](addr buffer[0]),
|
||||
@ -170,7 +178,8 @@ when defined(windows):
|
||||
return err(ioLastError())
|
||||
|
||||
# Truncate additional bytes from buffer.
|
||||
buffer.setLen(int(min(charsRead, uint32(maxBytes))))
|
||||
buffer.setLen(int(min(charsRead, uint32(maxChars))))
|
||||
|
||||
# Truncate CRLF in result wide string.
|
||||
if len(buffer) > 0:
|
||||
if int16(buffer[^1]) == int16(0x0A):
|
||||
@ -184,7 +193,7 @@ when defined(windows):
|
||||
elif int16(buffer[^1]) == int16(0x0D):
|
||||
buffer.setLen(len(buffer) - 1)
|
||||
|
||||
# Convert Windows UTF-16 encoded string to UTF-8 encoded string.
|
||||
# Convert Windows UCS-2 encoded string to UTF-8 encoded string.
|
||||
if len(buffer) > 0:
|
||||
var pwd = ""
|
||||
let bytesNeeded = wideCharToMultiByte(CP_UTF8, 0'u32, addr buffer[0],
|
||||
@ -277,17 +286,37 @@ elif defined(posix):
|
||||
else:
|
||||
ok()
|
||||
|
||||
proc readConsoleInput(maxBytes: int): IoResult[string] =
|
||||
# Allocating buffer with size equal to `maxBytes` + len(LF)
|
||||
var buffer = newString(maxBytes + 1)
|
||||
let bytesRead =
|
||||
proc readConsoleInput(maxChars: int): IoResult[string] =
|
||||
# Allocating buffer with size equal to `(maxChars + len(LF)) * 4`, where
|
||||
# 4 is maximum expected size of one character (UTF8 encoding).
|
||||
var buffer = newString((maxChars + 1) * 4)
|
||||
|
||||
if isConsoleRedirected(STDIN_FILENO):
|
||||
let bytesRead =
|
||||
block:
|
||||
let res = posix.read(STDIN_FILENO, cast[pointer](addr buffer[0]),
|
||||
len(buffer))
|
||||
if res < 0:
|
||||
return err(ioLastError())
|
||||
res
|
||||
|
||||
# Truncate additional bytes from buffer.
|
||||
buffer.setLen(bytesRead)
|
||||
|
||||
# Trim LF in result string
|
||||
if len(buffer) > 0:
|
||||
if buffer[^1] == char(0x0A):
|
||||
buffer.setLen(len(buffer) - 1)
|
||||
|
||||
# Check if buffer is valid UTF-8 encoded string.
|
||||
if utf8Validate(buffer):
|
||||
# Cut result buffer to `maxChars` characters.
|
||||
ok(utf8Substr(buffer, 0, maxChars - 1).get())
|
||||
else:
|
||||
err(IoErrorCode(EILSEQ))
|
||||
else:
|
||||
let bytesRead =
|
||||
block:
|
||||
var cur, old: Termios
|
||||
if tcGetAttr(STDIN_FILENO, addr cur) != cint(0):
|
||||
return err(ioLastError())
|
||||
@ -310,28 +339,48 @@ elif defined(posix):
|
||||
res
|
||||
|
||||
# Truncate additional bytes from buffer.
|
||||
buffer.setLen(min(maxBytes, bytesRead))
|
||||
buffer.setLen(bytesRead)
|
||||
|
||||
# Trim LF in result string
|
||||
if len(buffer) > 0:
|
||||
if buffer[^1] == char(0x0A):
|
||||
buffer.setLen(len(buffer) - 1)
|
||||
ok(buffer)
|
||||
buffer.add(char(0x00))
|
||||
|
||||
# Conversion of console input into wide characters sequence.
|
||||
let wres = mbstowcs(uint32, buffer)
|
||||
if wres.isOk():
|
||||
# Trim wide character sequence to `maxChars` number of characters.
|
||||
var wbuffer = wres.get()
|
||||
if maxChars < len(wbuffer):
|
||||
wbuffer.setLen(maxChars)
|
||||
# Conversion of wide characters sequence to UTF-8 encoded string.
|
||||
let ures = wbuffer.wcharToUtf8()
|
||||
if ures.isOk():
|
||||
ok(ures.get())
|
||||
else:
|
||||
err(IoErrorCode(EILSEQ))
|
||||
else:
|
||||
err(IoErrorCode(EILSEQ))
|
||||
|
||||
proc readConsolePassword*(prompt: string,
|
||||
maxBytes = 32768): IoResult[string] =
|
||||
## Reads a password from stdin without printing it with length in bytes up to
|
||||
## ``maxBytes``.
|
||||
maxChars = 32768): IoResult[string] =
|
||||
## Reads a password from stdin without printing it with length in characters
|
||||
## up to ``maxChars``.
|
||||
##
|
||||
## This procedure supports reading of UTF-8 encoded passwords from console or
|
||||
## redirected pipe. But ``maxBytes`` will limit
|
||||
## redirected pipe.
|
||||
##
|
||||
## Before reading password ``prompt`` will be printed.
|
||||
##
|
||||
## Please note that ``maxBytes`` should be in range (0, 32768].
|
||||
doAssert(maxBytes > 0 and maxBytes <= 32768,
|
||||
"maxBytes should be integer in (0, 32768]")
|
||||
## Please note that ``maxChars`` should be in range (0, 32768].
|
||||
doAssert(maxChars > 0 and maxChars <= 32768,
|
||||
"maxChars should be integer in (0, 32768]")
|
||||
? writeConsoleOutput(prompt)
|
||||
let res = ? readConsoleInput(maxBytes)
|
||||
let res = ? readConsoleInput(maxChars)
|
||||
# `\p` is platform specific newline: CRLF on Windows, LF on Unix
|
||||
? writeConsoleOutput("\p")
|
||||
ok(res)
|
||||
|
||||
when isMainModule:
|
||||
echo readConsolePassword("Enter password: ", 4)
|
||||
|
248
stew/utf8.nim
248
stew/utf8.nim
@ -7,8 +7,21 @@
|
||||
## those terms.
|
||||
|
||||
## This module implements UTF-8 related procedures.
|
||||
import results, io2
|
||||
export results
|
||||
|
||||
proc validateUtf8*[T: byte|char](data: openarray[T]): bool =
|
||||
type
|
||||
UResult*[T] = Result[T, cstring]
|
||||
Wides* = int16 | uint16 | int32 | uint32
|
||||
Bytes* = int8 | char | uint8 | byte
|
||||
|
||||
const
|
||||
ErrorBufferOverflow* = cstring"Buffer is not large enough"
|
||||
ErrorInvalidSequence* = cstring"Invalid Unicode sequence found"
|
||||
ErrorInvalidLocale* = cstring"Could not obtain system locale"
|
||||
ErrorNotEnoughCharacters* = cstring"Not enough characters in string"
|
||||
|
||||
proc utf8Validate*[T: Bytes](data: openarray[T]): bool =
|
||||
## Returns ``true`` if ``data`` is correctly UTF-8 encoded string.
|
||||
var index = 0
|
||||
|
||||
@ -89,3 +102,236 @@ proc validateUtf8*[T: byte|char](data: openarray[T]): bool =
|
||||
|
||||
else:
|
||||
return false
|
||||
|
||||
proc utf8Length*[T: Bytes](data: openarray[T]): UResult[int] =
  ## Counts the UTF-8 encoded characters stored in ``data``.
  ##
  ## Only the leading octet of every character is inspected: it determines how
  ## many octets the character occupies and continuation octets are skipped
  ## without being checked.
  ##
  ## Returns ``ErrorInvalidSequence`` when an octet which can not start a
  ## character is found at a character boundary, or when the last character
  ## needs more octets than ``data`` contains.
  ##
  ## NOTE: Validate data with `utf8Validate()` before using this procedure,
  ## otherwise length returned by this procedure could be incorrect.
  let total = len(data)
  var
    pos = 0
    count = 0

  while pos < total:
    let lead = uint(data[pos])
    # Width of the character, derived from the bit pattern of its first octet.
    let width =
      if lead < 0x80:
        1
      elif (lead and 0xE0'u8) == 0xC0'u8:
        2
      elif (lead and 0xF0'u8) == 0xE0'u8:
        3
      elif (lead and 0xF8'u8) == 0xF0'u8:
        4
      else:
        return err(ErrorInvalidSequence)
    inc(pos, width)
    inc(count)

  # Walking past the end means the final character was truncated.
  if pos != total:
    return err(ErrorInvalidSequence)
  ok(count)
|
||||
|
||||
proc utf8Offset*[T: Bytes](data: openarray[T], index: int): UResult[int] =
  ## Return offset in UTF-8 encoded string ``data`` for character position
  ## ``index``.
  ##
  ## Any ``index`` BELOW or EQUAL to zero yields offset ``0``. An ``index``
  ## equal to the number of characters in ``data`` yields ``len(data)``.
  ##
  ## Only leading octets are inspected, continuation octets are skipped
  ## without validation.
  ##
  ## Errors:
  ## * ``ErrorInvalidSequence`` - an octet which can not start a character was
  ##   found at a character boundary, or a character claims more octets than
  ##   are left in ``data``.
  ## * ``ErrorNotEnoughCharacters`` - ``data`` holds less than ``index``
  ##   characters.
  if index <= 0:
    return ok(0)

  var byteIndex = 0
  var charIndex = 0

  while (byteIndex < len(data)) and (charIndex < index):
    let ch = uint(data[byteIndex])
    if ch < 0x80:
      inc(byteIndex, 1)
    elif (ch and 0xE0'u8) == 0xC0'u8:
      inc(byteIndex, 2)
    elif (ch and 0xF0'u8) == 0xE0'u8:
      inc(byteIndex, 3)
    elif (ch and 0xF8'u8) == 0xF0'u8:
      inc(byteIndex, 4)
    else:
      return err(ErrorInvalidSequence)

    # The leading octet promised more octets than the buffer contains; without
    # this check an offset pointing outside of ``data`` would be returned.
    if byteIndex > len(data):
      return err(ErrorInvalidSequence)

    inc(charIndex)

  if charIndex == index:
    ok(byteIndex)
  else:
    err(ErrorNotEnoughCharacters)
|
||||
|
||||
proc utf8Substr*[T: Bytes](data: openarray[T],
                           start, finish: int): UResult[string] =
  ## Substring string ``data`` using starting character (not byte) index
  ## ``start`` and terminating character (not byte) index ``finish`` and return
  ## result string. Both indices are inclusive.
  ##
  ## ``data`` should be correct UTF-8 encoded string, because only initial
  ## octets got validated.
  ##
  ## ``start`` - The starting index of the substring, any value BELOW or EQUAL
  ## to zero will be considered as zero. If ``start`` index is not present in
  ## string ``data`` empty string will be returned as result.
  ##
  ## ``finish`` - The terminating index of the substring, any value BELOW
  ## zero will be considered as `len(data)`. A ``finish`` which is not present
  ## in ``data`` selects everything up to the end of ``data``.
  ##
  ## If ``finish`` addresses a character located before ``start`` an empty
  ## string is returned.
  ##
  ## Errors reported by ``utf8Offset()``, other than
  ## ``ErrorNotEnoughCharacters``, are returned to the caller unchanged.
  let soffset =
    if start <= 0:
      0
    elif start >= len(data):
      # Number of characters never exceeds number of octets, so this index
      # can not exist.
      return ok("")
    else:
      let res = utf8Offset(data, start)
      if res.isErr():
        if res.error != ErrorNotEnoughCharacters:
          return err(res.error)
        return ok("")
      else:
        res.get()

  let eoffset =
    if finish < 0:
      len(data)
    elif finish >= len(data):
      len(data)
    else:
      # Offset of the character which follows ``finish`` is the exclusive end
      # of the range.
      let res = utf8Offset(data, finish + 1)
      if res.isErr():
        if res.error != ErrorNotEnoughCharacters:
          return err(res.error)
        len(data)
      else:
        # Never read beyond the end of ``data``, even when last character is
        # truncated.
        min(res.get(), len(data))

  # ``finish`` is located before ``start`` (or the range is empty): a negative
  # length must not reach ``newString()``.
  if eoffset <= soffset:
    return ok("")

  var res = newString(eoffset - soffset)
  var k = 0
  for i in soffset ..< eoffset:
    res[k] = cast[char](data[i])
    inc(k)
  ok(res)
|
||||
|
||||
proc wcharToUtf8*[A: Wides, B: Bytes](input: openarray[A],
                                      output: var openarray[B]): UResult[int] =
  ## Converts WCHAR sequence ``input`` to UTF-8 array of octets ``output``.
  ##
  ## Procedure supports 4-byte (Linux) and 2-byte sequences (Windows) as input.
  ##
  ## Returns number of octets needed to encode ``input``. When ``output`` is
  ## empty (``len(output) == 0``) nothing is stored, so the procedure can be
  ## used to calculate size of the required buffer.
  ##
  ## Errors:
  ## * ``ErrorBufferOverflow`` - ``output`` is not empty, but is too small to
  ##   hold the encoded result.
  ## * ``ErrorInvalidSequence`` - ``input`` holds value bigger than
  ##   ``0x10FFFF``.
  ##
  ## NOTE(review): values in surrogate ranges ``[0xD800, 0xDBFF]`` and
  ## ``[0xDC00, 0xDFFF]`` are remapped one by one and are not combined into
  ## pairs - confirm this is the intended handling of UTF-16 input.
  var offset = 0
  for item in input:
    let uitem = uint(item)
    let codepoint =
      if uitem >= 0xD800'u and uitem <= 0xDBFF'u:
        0x10000'u + ((uitem - 0xD800'u) shl 10)
      else:
        if uitem >= 0xDC00'u and uitem <= 0xDFFF'u:
          uitem - 0xDC00'u
        else:
          uitem
    if codepoint <= 0x7F'u:
      # One octet: 0xxxxxxx
      if len(output) > 0:
        if offset < len(output):
          output[offset] = cast[B](codepoint and 0x7F'u)
        else:
          return err(ErrorBufferOverflow)
      inc(offset, 1)
    elif codepoint <= 0x7FF'u:
      # Two octets: 110xxxxx 10xxxxxx
      if len(output) > 0:
        if offset + 1 < len(output):
          output[offset + 0] = cast[B](0xC0'u8 or
                                       byte((codepoint shr 6) and 0x1F'u))
          output[offset + 1] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u))
        else:
          return err(ErrorBufferOverflow)
      inc(offset, 2)
    elif codepoint <= 0xFFFF'u:
      # Three octets: 1110xxxx 10xxxxxx 10xxxxxx
      if len(output) > 0:
        if offset + 2 < len(output):
          output[offset + 0] = cast[B](0xE0'u8 or
                                       byte((codepoint shr 12) and 0x0F'u))
          output[offset + 1] = cast[B](0x80'u8 or
                                       byte((codepoint shr 6) and 0x3F'u))
          output[offset + 2] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u))
        else:
          return err(ErrorBufferOverflow)
      inc(offset, 3)
    elif codepoint <= 0x10FFFF'u:
      # Four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      if len(output) > 0:
        if offset + 3 < len(output):
          output[offset + 0] = cast[B](0xF0'u8 or
                                       byte((codepoint shr 18) and 0x07'u))
          output[offset + 1] = cast[B](0x80'u8 or
                                       byte((codepoint shr 12) and 0x3F'u))
          output[offset + 2] = cast[B](0x80'u8 or
                                       byte((codepoint shr 6) and 0x3F'u))
          output[offset + 3] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u))
        else:
          # Report the same error as the shorter encodings do, instead of an
          # empty error message.
          return err(ErrorBufferOverflow)
      inc(offset, 4)
    else:
      return err(ErrorInvalidSequence)
  ok(offset)
|
||||
|
||||
proc wcharToUtf8*[T: Wides](input: openarray[T]): UResult[string] {.inline.} =
  ## Converts wide character sequence ``input`` to UTF-8 encoded string.
  ##
  ## Conversion is made in two passes: first pass uses an empty output buffer
  ## to find out how many octets are needed, second pass encodes ``input``
  ## into a string of exactly that size.
  ##
  ## Any error reported by one of the passes is returned to the caller.
  var probe: array[0, char]
  let needed = ? wcharToUtf8(input, probe)
  var encoded = newString(needed)
  let written {.used.} = ? wcharToUtf8(input, encoded)
  ok(encoded)
|
||||
|
||||
when defined(posix):
  import posix

  type
    # Opaque conversion state object used by C library (`mbstate_t`).
    Mbstate {.importc: "mbstate_t",
              header: "<wchar.h>", pure, final.} = object

  # Binding for C library function `mbsrtowcs()` declared in <wchar.h>.
  proc mbsrtowcs(dest: pointer, src: pointer, n: csize_t,
                 ps: ptr Mbstate): csize_t {.
       importc, header: "<wchar.h>".}

  proc mbstowcs*[A: Bytes, B: Wides](t: typedesc[B],
                                     input: openarray[A]): UResult[seq[B]] =
    ## Converts multibyte encoded string to OS specific wide char string.
    ##
    ## Note, that `input` should be `0` terminated.
    ##
    ## Conversion is made using `mbsrtowcs`. When an invalid sequence is
    ## encountered in the middle of `input`, procedure returns all the
    ## characters which were decoded before the first invalid character.
    ##
    ## Errors:
    ## * ``ErrorInvalidLocale`` - `setlocale()` call has failed.
    ## * ``ErrorInvalidSequence`` - very first character of `input` could not
    ##   be decoded.

    # Locale must be set explicitly, otherwise `mbsrtowcs` will fail with
    # EILSEQ.
    # If locale is an empty string, "", each part of the locale that should
    # be modified is set according to the environment variables.
    let sres = setlocale(LC_ALL, cstring"")
    if isNil(sres):
      return err(ErrorInvalidLocale)

    # Result holds at most one wide character per input octet.
    var buffer = newSeq[B](len(input))
    if len(input) == 0:
      return ok(buffer)

    doAssert(input[^1] == A(0), "Input array should be zero-terminated")
    # Mutable copy of `input`, it could be truncated at first invalid octet.
    var data = @input
    var ostr = addr data[0]
    var pstr = ostr
    var mstate = Mbstate()

    while true:
      let res = mbsrtowcs(addr buffer[0], addr pstr, csize_t(len(buffer)),
                          addr mstate)
      if res == cast[csize_t](-1):
        # If invalid multibyte sequence has been encountered, ``pstr`` is left
        # pointing to the invalid multibyte sequence, ``-1`` is returned, and
        # errno is set to EILSEQ.
        let diff = cast[uint](pstr) - cast[uint](ostr)
        if diff == 0:
          return err(ErrorInvalidSequence)
        else:
          # We have partially decoded sequence, `diff` is position of first
          # invalid character in sequence. Terminate `data` at this position
          # and restart conversion from the beginning with fresh state.
          data[diff] = A(0x00)
          ostr = addr data[0]
          pstr = ostr
          mstate = Mbstate()
      else:
        # It's safe to convert `csize_t` to `int` here because `len(input)`
        # is also `int`.
        buffer.setLen(res)
        return ok(buffer)
|
||||
|
@ -32,29 +32,29 @@ proc toUTF1(value: uint32): array[1, byte] =
|
||||
suite "UTF-8 validation test suite":
|
||||
test "Values [U+0000, U+007F] are allowed":
|
||||
for i in 0x00'u32 .. 0x7F'u32:
|
||||
check validateUtf8(toUTF1(i)) == true
|
||||
check utf8Validate(toUTF1(i)) == true
|
||||
test "Values [U+0080, U+07FF] are allowed":
|
||||
for i in 0x80'u32 .. 0x7FF'u32:
|
||||
check validateUtf8(toUTF2(i)) == true
|
||||
check utf8Validate(toUTF2(i)) == true
|
||||
test "Values [U+0800, U+D7FF] are allowed":
|
||||
for i in 0x800'u32 .. 0xD7FF'u32:
|
||||
check validateUtf8(toUTF3(i)) == true
|
||||
check utf8Validate(toUTF3(i)) == true
|
||||
test "Values [U+D800, U+DFFF] (UTF-16 surrogates) are not allowed":
|
||||
for i in 0xD800'u32 .. 0xDFFF'u32:
|
||||
check validateUtf8(toUTF3(i)) == false
|
||||
check utf8Validate(toUTF3(i)) == false
|
||||
test "Values [U+E000, U+FFFD] are allowed":
|
||||
for i in 0xE000'u32 .. 0xFFFD'u32:
|
||||
check validateUtf8(toUTF3(i)) == true
|
||||
check utf8Validate(toUTF3(i)) == true
|
||||
test "Values U+FFFE and U+FFFF are not allowed":
|
||||
check:
|
||||
validateUtf8(toUTF3(0xFFFE'u32)) == false
|
||||
validateUtf8(toUTF3(0xFFFF'u32)) == false
|
||||
utf8Validate(toUTF3(0xFFFE'u32)) == false
|
||||
utf8Validate(toUTF3(0xFFFF'u32)) == false
|
||||
test "Values [U+10000, U10FFFF] are allowed":
|
||||
for i in 0x10000'u32 .. 0x10FFFF'u32:
|
||||
check validateUtf8(toUTF4(i)) == true
|
||||
check utf8Validate(toUTF4(i)) == true
|
||||
test "Values bigger U+10FFFF are not allowed":
|
||||
for i in 0x11_0000'u32 .. 0x1F_FFFF'u32:
|
||||
check validateUtf8(toUTF4(i)) == false
|
||||
check utf8Validate(toUTF4(i)) == false
|
||||
test "fastvalidate-utf-8 bad sequences":
|
||||
# https://github.com/lemire/fastvalidate-utf-8 test vectors
|
||||
const
|
||||
@ -95,9 +95,9 @@ suite "UTF-8 validation test suite":
|
||||
"\xef\xbf"
|
||||
]
|
||||
for item in BadSequences:
|
||||
check validateUtf8(item) == false
|
||||
check utf8Validate(item) == false
|
||||
for item in GoodSequences:
|
||||
check validateUtf8(item) == true
|
||||
check utf8Validate(item) == true
|
||||
test "UTF-8 decoder capability and stress test":
|
||||
# https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
||||
const Tests2 = [
|
||||
@ -184,10 +184,136 @@ suite "UTF-8 validation test suite":
|
||||
]
|
||||
|
||||
for item in Tests2:
|
||||
check validateUtf8(item[0]) == item[1]
|
||||
check utf8Validate(item[0]) == item[1]
|
||||
for item in Tests3:
|
||||
check validateUtf8(item[0]) == item[1]
|
||||
check utf8Validate(item[0]) == item[1]
|
||||
for item in Tests4:
|
||||
check validateUtf8(item[0]) == item[1]
|
||||
check utf8Validate(item[0]) == item[1]
|
||||
for item in Tests5:
|
||||
check validateUtf8(item[0]) == item[1]
|
||||
check utf8Validate(item[0]) == item[1]
|
||||
|
||||
test "UTF-8 length() test":
  # Same word written with escaped octets: 9 characters, 2 octets each.
  const
    Cyrillic = "\xd0\x9f\xd1\x80\xd0\xbe\xd0\xb3" &
               "\xd1\x80\xd0\xb0\xd0\xbc\xd0\xbc\xd0\xb0"
    # Pairs of (input string, expected number of characters).
    Vectors = [
      ("Программа", 9),
      ("Программ", 8),
      ("Програм", 7),
      ("Програ", 6),
      ("Прогр", 5),
      ("Прог", 4),
      ("Про", 3),
      ("Пр", 2),
      ("П", 1),
      ("", 0),
      ("П⠯🤗", 3),
      ("⠯🤗", 2),
      ("🤗", 1)
    ]

  for vector in Vectors:
    check utf8Length(vector[0]).tryGet() == vector[1]

  check utf8Length(Cyrillic).tryGet() == 9
  # Dropping the last octet leaves a truncated character at the end.
  check utf8Length(Cyrillic.toOpenArray(0, len(Cyrillic) - 2)).isErr() == true
|
||||
|
||||
test "UTF-8 substr() test":
  # Exercises `utf8Substr()` with character (not byte) indices over strings
  # made of 2-octet (Cyrillic), 3-octet (Braille) and 4-octet (emoji)
  # characters.
  check:
    # 2-octet characters: negative indices, growing ranges starting at zero
    # and `finish` values beyond the end of the string.
    utf8Substr("Программа", -1, -1).tryGet() == "Программа"
    utf8Substr("Программа", 0, 0).tryGet() == "П"
    utf8Substr("Программа", 0, 1).tryGet() == "Пр"
    utf8Substr("Программа", 0, 2).tryGet() == "Про"
    utf8Substr("Программа", 0, 3).tryGet() == "Прог"
    utf8Substr("Программа", 0, 4).tryGet() == "Прогр"
    utf8Substr("Программа", 0, 5).tryGet() == "Програ"
    utf8Substr("Программа", 0, 6).tryGet() == "Програм"
    utf8Substr("Программа", 0, 7).tryGet() == "Программ"
    utf8Substr("Программа", 0, 8).tryGet() == "Программа"
    utf8Substr("Программа", 0, 9).tryGet() == "Программа"
    utf8Substr("Программа", 0, 10).tryGet() == "Программа"
    utf8Substr("Программа", 0, 18).tryGet() == "Программа"
    utf8Substr("Программа", 0, 19).tryGet() == "Программа"
    utf8Substr("Программа", 0, 100).tryGet() == "Программа"
    # `start` beyond the end of the string gives empty result.
    utf8Substr("Программа", 100, 0).tryGet() == ""
    utf8Substr("Программа", 100, 100).tryGet() == ""
    # Single character ranges.
    utf8Substr("Программа", 1, 1).tryGet() == "р"
    utf8Substr("Программа", 2, 2).tryGet() == "о"
    utf8Substr("Программа", 3, 3).tryGet() == "г"
    utf8Substr("Программа", 4, 4).tryGet() == "р"
    utf8Substr("Программа", 5, 5).tryGet() == "а"
    utf8Substr("Программа", 6, 6).tryGet() == "м"
    utf8Substr("Программа", 7, 7).tryGet() == "м"
    utf8Substr("Программа", 8, 8).tryGet() == "а"
    utf8Substr("Программа", 9, 9).tryGet() == ""
    # Negative `finish` selects everything up to the end of the string.
    utf8Substr("Программа", 0, -1).tryGet() == "Программа"
    utf8Substr("Программа", 1, -1).tryGet() == "рограмма"
    utf8Substr("Программа", 2, -1).tryGet() == "ограмма"
    utf8Substr("Программа", 3, -1).tryGet() == "грамма"
    utf8Substr("Программа", 4, -1).tryGet() == "рамма"
    utf8Substr("Программа", 5, -1).tryGet() == "амма"
    utf8Substr("Программа", 6, -1).tryGet() == "мма"
    utf8Substr("Программа", 7, -1).tryGet() == "ма"
    utf8Substr("Программа", 8, -1).tryGet() == "а"
    utf8Substr("Программа", 9, -1).tryGet() == ""

    # 3-octet characters.
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", -1, -1).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶"
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 0).tryGet() == "⠯"
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 1).tryGet() == "⠯⠰"
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 2).tryGet() == "⠯⠰⠱"
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 3).tryGet() == "⠯⠰⠱⠲"
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 4).tryGet() == "⠯⠰⠱⠲⠳"
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 5).tryGet() == "⠯⠰⠱⠲⠳⠴"
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 6).tryGet() == "⠯⠰⠱⠲⠳⠴⠵"
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 7).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶"
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 8).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶"
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 9).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶"
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 23).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶"
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 24).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶"
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 100).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶"
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 100, 0).tryGet() == ""
    utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 100, 100).tryGet() == ""

    # 4-octet characters.
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", -1, -1).tryGet() ==
      "🤗🤘🤙🤚🤛🤜🤝🤞🤟"
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 0).tryGet() ==
      "🤗"
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 1).tryGet() ==
      "🤗🤘"
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 2).tryGet() ==
      "🤗🤘🤙"
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 3).tryGet() ==
      "🤗🤘🤙🤚"
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 4).tryGet() ==
      "🤗🤘🤙🤚🤛"
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 5).tryGet() ==
      "🤗🤘🤙🤚🤛🤜"
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 6).tryGet() ==
      "🤗🤘🤙🤚🤛🤜🤝"
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 7).tryGet() ==
      "🤗🤘🤙🤚🤛🤜🤝🤞"
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 8).tryGet() ==
      "🤗🤘🤙🤚🤛🤜🤝🤞🤟"
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 9).tryGet() ==
      "🤗🤘🤙🤚🤛🤜🤝🤞🤟"
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 31).tryGet() ==
      "🤗🤘🤙🤚🤛🤜🤝🤞🤟"
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 32).tryGet() ==
      "🤗🤘🤙🤚🤛🤜🤝🤞🤟"
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 100).tryGet() ==
      "🤗🤘🤙🤚🤛🤜🤝🤞🤟"
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 0).tryGet() == ""
    utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 100).tryGet() == ""
|
||||
|
||||
test "wcharToUtf8() tests":
  # Every value in [0, 0x10FFFF], except 0xFFFE and 0xFFFF, must be converted
  # into a string accepted by `utf8Validate()`.
  for value in 0 ..< 0x11_0000:
    if value == 0xFFFE or value == 0xFFFF:
      continue

    # Values which fit into 16 bits are also checked as 2-byte wide input.
    if value < 0x10000:
      var narrow = [uint16(value)]
      let encoded16 = wcharToUtf8(narrow)
      check:
        encoded16.isOk() == true
        utf8Validate(encoded16.get()) == true

    var wide = [uint32(value)]
    let encoded32 = wcharToUtf8(wide)
    check:
      encoded32.isOk() == true
      utf8Validate(encoded32.get()) == true
|
||||
|
Loading…
x
Reference in New Issue
Block a user