nim-stew/stew/utf8.nim

467 lines
16 KiB
Nim
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

## Copyright (c) 2020 Status Research & Development GmbH
## Licensed under either of
## * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
## * MIT license ([LICENSE-MIT](LICENSE-MIT))
## at your option.
## This file may not be copied, modified, or distributed except according to
## those terms.
## This module implements UTF-8 related procedures.
import results, io2
export results
type
  UResult*[T] = Result[T, cstring]
    ## Result type returned by the procedures of this module: either a value
    ## of type ``T`` or a ``cstring`` error message.
  Wides32* = int32 | uint32
    ## Type class matching the 32-bit integer types; used in this module for
    ## UTF-32 code units.
  Wides16* = int16 | uint16
    ## Type class matching the 16-bit integer types.
  Bytes* = int8 | char | uint8 | byte
    ## Type class matching the 8-bit types; used in this module for UTF-8
    ## octets.
const
  ErrorBufferOverflow* = cstring"Buffer is not large enough"
    ## Returned by the conversion procedures when a non-empty output buffer
    ## cannot hold the whole converted input.
  ErrorInvalidSequence* = cstring"Invalid Unicode sequence found"
    ## Returned when the input contains a sequence or code point that the
    ## procedures of this module reject.
  ErrorInvalidLocale* = cstring"Could not obtain system locale"
    ## Returned by ``mbstowcs`` when ``setlocale`` fails.
  ErrorNotEnoughCharacters* = cstring"Not enough characters in string"
    ## Returned by ``utf8Offset`` when the data holds fewer characters than
    ## the requested character index.
proc utf8Validate*[T: Bytes](data: openarray[T]): bool =
  ## Checks whether ``data`` is a well-formed UTF-8 encoded string.
  ##
  ## Returns ``true`` when every sequence in ``data`` is accepted (an empty
  ## ``data`` is accepted too) and ``false`` as soon as a malformed sequence
  ## is met. Besides structural errors the following are rejected: overlong
  ## encodings, UTF-16 surrogates (U+D800..U+DFFF), the code points U+FFFE and
  ## U+FFFF, and everything above U+10FFFF.
  template octetAt(i: int): byte =
    # Reads one element of `data` as a `byte`, whatever `Bytes` type `T` is.
    when T is byte: data[i] else: byte(data[i])

  template isContinuation(b: byte): bool =
    # Continuation octets have the form 10xxxxxx.
    (b and 0xC0'u8) == 0x80'u8

  let total = len(data)
  var pos = 0

  while pos < total:
    let lead = octetAt(pos)
    inc(pos)

    if lead < 0x80'u8:
      # Single octet (ASCII) character, nothing more to check.
      continue

    if (lead and 0xE0'u8) == 0xC0'u8:
      # Two-byte form (110xxxxx 10xxxxxx)
      if pos >= total:
        return false
      # Lead octets 0xC0 and 0xC1 can only start overlong sequences.
      if (lead and 0xFE'u8) == 0xC0'u8:
        return false
      if not isContinuation(octetAt(pos)):
        return false
      inc(pos)
    elif (lead and 0xF0'u8) == 0xE0'u8:
      # Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
      if (pos + 1) >= total:
        return false
      let second = octetAt(pos)
      let third = octetAt(pos + 1)
      if not isContinuation(second):
        return false
      # Overlong sequence test.
      if (lead == 0xE0'u8) and ((second and 0xE0'u8) == 0x80'u8):
        return false
      # U+D800..U+DFFF (UTF-16 surrogates) test.
      if (lead == 0xED'u8) and ((second and 0xE0'u8) == 0xA0'u8):
        return false
      if not isContinuation(third):
        return false
      # U+FFFE or U+FFFF test.
      if (lead == 0xEF'u8) and (second == 0xBF'u8) and
         ((third and 0xFE'u8) == 0xBE'u8):
        return false
      inc(pos, 2)
    elif (lead and 0xF8'u8) == 0xF0'u8:
      # Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
      if (pos + 2) >= total:
        return false
      let second = octetAt(pos)
      let third = octetAt(pos + 1)
      let fourth = octetAt(pos + 2)
      if not isContinuation(second):
        return false
      # Overlong sequence test.
      if (lead == 0xF0'u8) and ((second and 0xF0'u8) == 0x80'u8):
        return false
      # According to RFC 3629 no code point above U+10FFFF may be used.
      if ((lead == 0xF4'u8) and (second > 0x8F'u8)) or (lead > 0xF4'u8):
        return false
      if not isContinuation(third):
        return false
      if not isContinuation(fourth):
        return false
      inc(pos, 3)
    else:
      # Stray continuation octet or an octet that can never start a sequence.
      return false

  true
proc utf8Length*[T: Bytes](data: openarray[T]): UResult[int] =
  ## Counts the UTF-8 encoded characters stored in ``data``.
  ##
  ## Only the leading octet of every sequence is inspected, continuation
  ## octets are skipped without being checked.
  ##
  ## Returns ``ErrorInvalidSequence`` when an octet that cannot start a
  ## sequence is met at a sequence boundary, or when the last sequence
  ## extends past the end of ``data``.
  ##
  ## NOTE: Validate data with `utf8Validate()` before using this procedure,
  ## otherwise length returned by this procedure could be incorrect.
  let total = len(data)
  var pos = 0
  var count = 0

  while pos < total:
    let lead = uint(data[pos])
    # Number of octets announced by the leading octet.
    let width =
      if lead < 0x80: 1
      elif (lead and 0xE0'u8) == 0xC0'u8: 2
      elif (lead and 0xF0'u8) == 0xE0'u8: 3
      elif (lead and 0xF8'u8) == 0xF0'u8: 4
      else: return err(ErrorInvalidSequence)
    inc(pos, width)
    inc(count)

  if pos != total:
    # The last sequence announced more octets than `data` contains.
    return err(ErrorInvalidSequence)
  ok(count)
proc utf8Offset*[T: Bytes](data: openarray[T], index: int): UResult[int] =
  ## Return offset in UTF-8 encoded string ``data`` for character position
  ## ``index``.
  ##
  ## Any ``index`` below or equal to zero yields offset ``0``. An ``index``
  ## equal to the number of characters yields ``len(data)``.
  ##
  ## Only the leading octet of every sequence is inspected. Errors:
  ## * ``ErrorInvalidSequence`` - an octet that cannot start a sequence was
  ##   met at a sequence boundary, or a sequence extends past the end of
  ##   ``data``.
  ## * ``ErrorNotEnoughCharacters`` - ``data`` holds fewer than ``index``
  ##   characters.
  if index <= 0:
    return ok(0)

  let total = len(data)
  var byteIndex = 0
  var charIndex = 0
  while (byteIndex < total) and (charIndex < index):
    let ch = uint(data[byteIndex])
    if ch < 0x80:
      inc(byteIndex, 1)
    elif (ch and 0xE0'u8) == 0xC0'u8:
      inc(byteIndex, 2)
    elif (ch and 0xF0'u8) == 0xE0'u8:
      inc(byteIndex, 3)
    elif (ch and 0xF8'u8) == 0xF0'u8:
      inc(byteIndex, 4)
    else:
      return err(ErrorInvalidSequence)
    if byteIndex > total:
      # The leading octet announced more octets than are left in `data`;
      # never hand out an offset which points past the end of the data.
      return err(ErrorInvalidSequence)
    inc(charIndex)

  if charIndex == index:
    ok(byteIndex)
  else:
    err(ErrorNotEnoughCharacters)
proc utf8Substr*[T: Bytes](data: openarray[T],
                           start, finish: int): UResult[string] =
  ## Substring string ``data`` using starting character (not byte) index
  ## ``start`` and terminating character (not byte) index ``finish`` and return
  ## result string. The character at index ``finish`` is part of the result.
  ##
  ## ``data`` should be correct UTF-8 encoded string, because only initial
  ## octets got validated.
  ##
  ## ``start`` - The starting index of the substring, any value BELOW or EQUAL
  ## to zero will be considered as zero. If ``start`` index is not present in
  ## string ``data`` empty string will be returned as result.
  ##
  ## ``finish`` - The terminating index of the substring, any value BELOW
  ## zero will be considered as `len(data)`.
  ##
  ## If the requested range is empty (``finish`` is not negative and lies
  ## before ``start``) an empty string is returned.
  ##
  ## ``ErrorInvalidSequence`` is returned when an invalid initial octet is met
  ## or when a sequence extends past the end of ``data``.
  let soffset =
    if start <= 0:
      0
    elif start >= len(data):
      # Every character takes at least one octet, so `start` is out of range.
      return ok("")
    else:
      let res = utf8Offset(data, start)
      if res.isErr():
        if res.error != ErrorNotEnoughCharacters:
          return err(res.error)
        return ok("")
      else:
        res.get()
  let eoffset =
    if finish < 0:
      len(data)
    elif finish >= len(data):
      len(data)
    else:
      # `finish` is inclusive, so look up where the next character starts.
      let res = utf8Offset(data, finish + 1)
      if res.isErr():
        if res.error != ErrorNotEnoughCharacters:
          return err(res.error)
        len(data)
      else:
        res.get()

  if (soffset > len(data)) or (eoffset > len(data)):
    # A truncated trailing sequence produced an offset beyond the data,
    # copying it would read out of bounds.
    return err(ErrorInvalidSequence)
  if eoffset <= soffset:
    # Empty or inverted range, `newString` must not get a negative length.
    return ok("")

  var res = newString(eoffset - soffset)
  for i in soffset ..< eoffset:
    res[i - soffset] = cast[char](data[i])
  ok(res)
proc utf32toUtf8*[A: Wides32, B: Bytes](input: openarray[A],
                                        output: var openarray[B]): UResult[int] =
  ## Encodes the UTF-32 sequence ``input`` as UTF-8 into array ``output`` and
  ## returns the number of UTF-8 octets produced.
  ##
  ## When ``output`` is a zero-length array nothing is stored and only the
  ## required number of octets is computed and returned.
  ##
  ## Errors:
  ## * ``ErrorInvalidSequence`` - ``input`` contains a surrogate
  ##   (U+D800..U+DFFF), U+FFFE, U+FFFF or a value above U+10FFFF.
  ## * ``ErrorBufferOverflow`` - ``output`` is not empty, but too small.
  let capacity = len(output)
  var written = 0

  for wide in input:
    let cp = uint32(wide)
    if (cp >= 0xD800'u32) and (cp <= 0xDFFF'u32):
      # High and low surrogates U+D800 through U+DFFF prohibited in UTF-32.
      return err(ErrorInvalidSequence)
    if (cp == 0xFFFE'u32) or (cp == 0xFFFF'u32):
      # These codes are intended for process-internal uses, and are not
      # unicode characters.
      return err(ErrorInvalidSequence)

    # Number of octets needed to encode this code point.
    let width =
      if cp <= 0x7F'u32: 1
      elif cp <= 0x7FF'u32: 2
      elif cp <= 0xFFFF'u32: 3
      elif cp <= 0x10FFFF'u32: 4
      else: return err(ErrorInvalidSequence)

    if capacity > 0:
      if written + width > capacity:
        return err(ErrorBufferOverflow)
      case width
      of 1:
        output[written] = cast[B](cp and 0x7F'u32)
      of 2:
        output[written + 0] = cast[B](0xC0'u8 or
                                      byte((cp shr 6) and 0x1F'u32))
        output[written + 1] = cast[B](0x80'u8 or byte(cp and 0x3F'u32))
      of 3:
        output[written + 0] = cast[B](0xE0'u8 or
                                      byte((cp shr 12) and 0x0F'u32))
        output[written + 1] = cast[B](0x80'u8 or
                                      byte((cp shr 6) and 0x3F'u32))
        output[written + 2] = cast[B](0x80'u8 or byte(cp and 0x3F'u32))
      else:
        output[written + 0] = cast[B](0xF0'u8 or
                                      byte((cp shr 18) and 0x07'u32))
        output[written + 1] = cast[B](0x80'u8 or
                                      byte((cp shr 12) and 0x3F'u32))
        output[written + 2] = cast[B](0x80'u8 or
                                      byte((cp shr 6) and 0x3F'u32))
        output[written + 3] = cast[B](0x80'u8 or byte(cp and 0x3F'u32))

    inc(written, width)

  ok(written)
proc utf32toUtf8*[T: Wides32](input: openarray[T]): UResult[string] {.inline.} =
  ## Encodes the wide character sequence ``input`` as UTF-8 and returns the
  ## result as a new string.
  ##
  ## Any error reported by the buffer based ``utf32toUtf8`` is returned as is.
  # First pass with a zero-length buffer only measures the required size.
  var probe: array[0, char]
  let needed = ? utf32ToUtf8(input, probe)
  # Second pass performs the actual encoding into the exactly sized string.
  var encoded = newString(needed)
  discard ? utf32ToUtf8(input, encoded)
  ok(encoded)
proc utf8toUtf32*[A: Bytes, B: Wides32](input: openarray[A],
                                        output: var openarray[B]): UResult[int] =
  ## Convert UTF-8 encoded array of characters ``input`` to UTF-32 encoded
  ## sequences of 32bit limbs and return the number of limbs stored.
  ##
  ## To obtain required size of ``output`` you need to pass ``output`` as
  ## zero-length array, in such way required size will be returned as result of
  ## procedure. In this mode the size is computed with ``utf8Length``, which
  ## inspects leading octets only.
  ##
  ## If size of ``output`` is not zero, and there not enough space in ``output``
  ## array to store whole ``input`` array, error ``ErrorBufferOverflow`` will
  ## be returned.
  ##
  ## ``ErrorInvalidSequence`` is returned for malformed or truncated
  ## sequences, overlong encodings, UTF-16 surrogates, U+FFFE, U+FFFF and
  ## code points above U+10FFFF.
  template octet(i: int): uint32 =
    # Go through `byte` first: converting a negative `int8` straight to
    # `uint32` sign-extends it (0xE0 would become 0xFFFFFFE0) and the exact
    # comparisons below would then never match.
    uint32(byte(input[i]))

  if len(output) == 0:
    return utf8Length(input)

  var index = 0
  var dindex = 0
  while index < len(input):
    let byte1 = octet(index)
    inc(index)

    var codepoint: uint32
    if (byte1 and 0x80'u32) == 0x00'u32:
      # Single octet (ASCII) character.
      codepoint = byte1
    elif (byte1 and 0xE0'u32) == 0xC0'u32:
      # Two-byte form (110xxxxx 10xxxxxx)
      if index >= len(input):
        return err(ErrorInvalidSequence)
      # overlong sequence test
      if (byte1 and 0xFE'u32) == 0xC0'u32:
        return err(ErrorInvalidSequence)
      let byte2 = octet(index)
      if (byte2 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)
      codepoint = ((byte1 and 0x1F'u32) shl 6) or (byte2 and 0x3F'u32)
      inc(index)
    elif (byte1 and 0xF0'u32) == 0xE0'u32:
      # Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
      if (index + 1) >= len(input):
        return err(ErrorInvalidSequence)
      let byte2 = octet(index)
      if (byte2 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)
      # overlong sequence test
      if (byte1 == 0xE0'u32) and ((byte2 and 0xE0'u32) == 0x80'u32):
        return err(ErrorInvalidSequence)
      # U+D800..U+DFFF (UTF-16 surrogates) test
      if (byte1 == 0xED'u32) and ((byte2 and 0xE0'u32) == 0xA0'u32):
        return err(ErrorInvalidSequence)
      let byte3 = octet(index + 1)
      if (byte3 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)
      # U+FFFE or U+FFFF test
      if (byte1 == 0xEF'u32) and (byte2 == 0xBF'u32) and
         ((byte3 and 0xFE'u32) == 0xBE'u32):
        return err(ErrorInvalidSequence)
      codepoint = ((byte1 and 0x0F'u32) shl 12) or
                  ((byte2 and 0x3F'u32) shl 6) or
                  (byte3 and 0x3F'u32)
      inc(index, 2)
    elif (byte1 and 0xF8'u32) == 0xF0'u32:
      # Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
      if (index + 2) >= len(input):
        return err(ErrorInvalidSequence)
      let byte2 = octet(index)
      if (byte2 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)
      # overlong sequence test
      if (byte1 == 0xF0'u32) and ((byte2 and 0xF0'u32) == 0x80'u32):
        return err(ErrorInvalidSequence)
      # According to RFC 3629 no point above U+10FFFF should be used, which
      # limits characters to four bytes.
      if ((byte1 == 0xF4'u32) and (byte2 > 0x8F'u32)) or (byte1 > 0xF4'u32):
        return err(ErrorInvalidSequence)
      let byte3 = octet(index + 1)
      if (byte3 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)
      let byte4 = octet(index + 2)
      if (byte4 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)
      codepoint = ((byte1 and 0x07'u32) shl 18) or
                  ((byte2 and 0x3F'u32) shl 12) or
                  ((byte3 and 0x3F'u32) shl 6) or
                  (byte4 and 0x3F'u32)
      inc(index, 3)
    else:
      # Stray continuation octet or an octet that can never start a sequence.
      return err(ErrorInvalidSequence)

    # The sequence is valid, store the decoded code point.
    if dindex >= len(output):
      return err(ErrorBufferOverflow)
    output[dindex] = B(codepoint)
    inc(dindex)

  ok(dindex)
proc utf8toUtf32*[A: Bytes, B: Wides32](et: typedesc[B],
                                        input: openarray[A]): UResult[seq[B]] =
  ## Decodes the UTF-8 encoded array of characters ``input`` and returns the
  ## result as a new sequence of 32bit limbs of type ``B``.
  ##
  ## Any error reported by the buffer based ``utf8toUtf32`` is returned as is.
  # First pass with a zero-length buffer only measures the required size.
  var probe: array[0, B]
  let count = ? utf8toUtf32(input, probe)
  # Second pass performs the actual decoding into the exactly sized sequence.
  var decoded = newSeq[B](count)
  discard ? utf8toUtf32(input, decoded)
  ok(decoded)
when defined(posix):
  import posix
  type
    # Binding for the C type `mbstate_t` declared in `<wchar.h>`; it holds the
    # conversion state passed to `mbsrtowcs`.
    Mbstate {.importc: "mbstate_t",
              header: "<wchar.h>", pure, final.} = object
  # Binding for the C function `mbsrtowcs` declared in `<wchar.h>`.
  proc mbsrtowcs(dest: pointer, src: pointer, n: csize_t,
                 ps: ptr Mbstate): csize_t {.
       importc, header: "<wchar.h>".}
  proc mbstowcs*[A: Bytes, B: Wides32](t: typedesc[B],
                                       input: openarray[A]): UResult[seq[B]] =
    ## Converts multibyte encoded string to OS specific wide char string.
    ##
    ## Note that a non-empty `input` must be `0` terminated, otherwise an
    ## assertion fails. An empty `input` yields an empty sequence.
    ##
    ## Decoding is done using `mbsrtowcs`, so the procedure tolerates invalid
    ## sequences: it is able to decode all the characters which precede the
    ## first invalid character encountered. If the very first character is
    ## invalid, ``ErrorInvalidSequence`` is returned.
    ##
    ## ``ErrorInvalidLocale`` is returned when `setlocale` fails.
    ##
    ## NOTE(review): the result is stored in 32-bit elements, which presumably
    ## assumes a 32-bit `wchar_t` on the target platform - confirm.
    # The locale has to be set explicitly, otherwise `mbsrtowcs` will fail
    # with EILSEQ.
    # If locale is an empty string, "", each part of the locale that should
    # be modified is set according to the environment variables.
    let sres = setlocale(LC_ALL, cstring"")
    if isNil(sres):
      return err(ErrorInvalidLocale)
    # One output element per input byte is always enough room.
    var buffer = newSeq[B](len(input))
    if len(input) == 0:
      return ok(buffer)
    doAssert(input[^1] == A(0), "Input array should be zero-terminated")
    # Work on a private copy, because it gets truncated at the first invalid
    # sequence (see below).
    var data = @input
    var ostr = addr data[0]
    var pstr = ostr
    var mstate = Mbstate()
    while true:
      let res = mbsrtowcs(addr buffer[0], addr pstr, csize_t(len(buffer)),
                          addr mstate)
      if res == cast[csize_t](-1):
        # If an invalid multibyte sequence has been encountered, ``pstr`` is
        # left pointing to the invalid multibyte sequence, ``-1`` is returned,
        # and errno is set to EILSEQ.
        let diff = cast[uint](pstr) - cast[uint](ostr)
        if diff == 0:
          return err(ErrorInvalidSequence)
        else:
          # We have a partially decoded sequence, `diff` is the byte position
          # of the first invalid character. Terminate the copy right there
          # and run the conversion again from the start with a fresh state.
          data[diff] = A(0x00)
          ostr = addr data[0]
          pstr = ostr
          mstate = Mbstate()
      else:
        # It's safe to convert `csize_t` to `int` here because `len(input)`
        # is also `int`.
        buffer.setLen(res)
        return ok(buffer)