nim-stew/stew/utf8.nim

## Copyright (c) 2020 Status Research & Development GmbH
## Licensed under either of
##  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
##  * MIT license ([LICENSE-MIT](LICENSE-MIT))
## at your option.
## This file may not be copied, modified, or distributed except according to
## those terms.

## This module implements UTF-8 related procedures.
import results, io2
export results

type
  UResult*[T] = Result[T, cstring]
  Wides32* = int32 | uint32
  Wides16* = int16 | uint16
  Bytes* = int8 | char | uint8 | byte

const
  ErrorBufferOverflow* = cstring"Buffer is not large enough"
  ErrorInvalidSequence* = cstring"Invalid Unicode sequence found"
  ErrorInvalidLocale* = cstring"Could not obtain system locale"
  ErrorNotEnoughCharacters* = cstring"Not enough characters in string"

proc utf8Validate*[T: Bytes](data: openarray[T]): bool =
  ## Returns ``true`` if ``data`` is correctly UTF-8 encoded string.
  var index = 0

  while true:
    let byte1 =
      block:
        var b: byte
        while true:
          if index >= len(data):
            return true
          b = when T is byte: data[index] else: byte(data[index])
          inc(index)
          if b >= 0x80'u8:
            break
        b

    if (byte1 and 0xE0'u8) == 0xC0'u8:
      # Two-byte form (110xxxxx 10xxxxxx)
      if index >= len(data):
        return false
      # overlong sequence test
      if (byte1 and 0xFE'u8) == 0xC0'u8:
        return false

      let byte2 = when T is byte: data[index] else: byte(data[index])
      if (byte2 and 0xC0'u8) != 0x80'u8:
        return false
      inc(index)

    elif (byte1 and 0xF0'u8) == 0xE0'u8:
      # Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
      if (index + 1) >= len(data):
        return false

      let byte2 = when T is byte: data[index] else: byte(data[index])
      if (byte2 and 0xC0'u8) != 0x80'u8:
        return false
      # overlong sequence test
      if (byte1 == 0xE0'u8) and ((byte2 and 0xE0'u8) == 0x80'u8):
        return false
      #  0xD800–0xDFFF (UTF-16 surrogates) test
      if (byte1 == 0xED'u8) and ((byte2 and 0xE0'u8) == 0xA0'u8):
        return false

      let byte3 = when T is byte: data[index + 1] else: byte(data[index + 1])
      if (byte3 and 0xC0'u8) != 0x80'u8:
        return false
      # U+FFFE or U+FFFF test
      if (byte1 == 0xEF'u8) and (byte2 == 0xBF'u8) and
         ((byte3 and 0xFE'u8) == 0xBE'u8):
        return false
      inc(index, 2)

    elif (byte1 and 0xF8'u8) == 0xF0'u8:
      # Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
      if (index + 2) >= len(data):
        return false

      let byte2 = when T is byte: data[index] else: byte(data[index])
      if (byte2 and 0xC0'u8) != 0x80'u8:
        return false
      # overlong sequence test
      if (byte1 == 0xF0'u8) and ((byte2 and 0xF0'u8) == 0x80'u8):
        return false
      # According to RFC 3629 no point above U+10FFFF should be used, which
      # limits characters to four bytes.
      if ((byte1 == 0xF4'u8) and (byte2 > 0x8F'u8)) or (byte1 > 0xF4'u8):
        return false

      let byte3 = when T is byte: data[index + 1] else: byte(data[index + 1])
      if (byte3 and 0xC0'u8) != 0x80'u8:
        return false

      let byte4 = when T is byte: data[index + 2] else: byte(data[index + 2])
      if (byte4 and 0xC0'u8) != 0x80'u8:
        return false
      inc(index, 3)

    else:
      return false

proc utf8Length*[T: Bytes](data: openarray[T]): UResult[int] =
  ## Returns number of UTF-8 encoded characters in array ``data``.
  ##
  ## NOTE: Validate data with `utf8Validate()` before using this procedure,
  ## otherwise length returned by this procedure could be incorrect.
  var index = 0
  var size = 0
  while index < len(data):
    let ch = uint(data[index])
    if ch < 0x80:
      inc(index, 1)
    elif (ch and 0xE0'u8) == 0xC0'u8:
      inc(index, 2)
    elif (ch and 0xF0'u8) == 0xE0'u8:
      inc(index, 3)
    elif (ch and 0xF8'u8) == 0xF0'u8:
      inc(index, 4)
    else:
      return err(ErrorInvalidSequence)
    inc(size)
  if index == len(data):
    ok(size)
  else:
    err(ErrorInvalidSequence)

proc utf8Offset*[T: Bytes](data: openarray[T], index: int): UResult[int] =
  ## Return offset in UTF-8 encoded string ``data`` for character position
  ## ``index``.
  if index <= 0:
    return ok(0)

  var byteIndex = 0
  var charIndex = 0

  while (byteIndex < len(data)) and (charIndex < index):
    let ch = uint(data[byteIndex])
    if ch < 0x80:
      inc(byteIndex, 1)
    elif (ch and 0xE0'u8) == 0xC0'u8:
      inc(byteIndex, 2)
    elif (ch and 0xF0'u8) == 0xE0'u8:
      inc(byteIndex, 3)
    elif (ch and 0xF8'u8) == 0xF0'u8:
      inc(byteIndex, 4)
    else:
      return err(ErrorInvalidSequence)
    inc(charIndex)

  if charIndex == index:
    ok(byteIndex)
  else:
    err(ErrorNotEnoughCharacters)

proc utf8Substr*[T: Bytes](data: openarray[T],
                           start, finish: int): UResult[string] =
  ## Substring string ``data`` using starting character (not byte) index
  ## ``start`` and terminating character (not byte) index ``finish`` and return
  ## result string.
  ##
  ## ``data`` should be correct UTF-8 encoded string, because only initial
  ## octets got validated.
  ##
  ## ``start`` - The starting index of the substring, any value BELOW or EQUAL
  ## to zero will be considered as zero. If ``start`` index is not present in
  ## string ``data`` empty string will be returned as result.
  ##
  ## ``finish`` - The terminating index of the substring, any value BELOW
  ## zero will be considered as `len(data)`.
  let soffset =
    if start <= 0:
      0
    elif start >= len(data):
      return ok("")
    else:
      let res = utf8Offset(data, start)
      if res.isErr():
        if res.error != ErrorNotEnoughCharacters:
          return err(res.error)
        return ok("")
      else:
        res.get()

  let eoffset =
    if finish < 0:
      len(data)
    elif finish >= len(data):
      len(data)
    else:
      let res = utf8Offset(data, finish + 1)
      if res.isErr():
        if res.error != ErrorNotEnoughCharacters:
          return err(res.error)
        len(data)
      else:
        res.get()

  var res = newString(eoffset - soffset)
  var k = 0
  for i in soffset ..< eoffset:
    res[k] = cast[char](data[i])
    inc(k)
  ok(res)

proc utf32toUtf8*[A: Wides32, B: Bytes](input: openarray[A],
                                      output: var openarray[B]): UResult[int] =
  ## Converts UTF-32 sequence ``input`` to UTF-8 array ``output``.
  var offset = 0
  for item in input:
    let codepoint =
      block:
        if (uint32(item) >= 0xD800'u32) and (uint32(item) <= 0xDFFF'u32):
          # high and low surrogates U+D800 through U+DFFF prohibited in UTF-32.
          return err(ErrorInvalidSequence)
        elif (uint32(item) == 0xFFFE'u32) or (uint32(item) == 0xFFFF'u32):
          # these codes are intended for process-internal uses, and not a
          # unicode characters.
          return err(ErrorInvalidSequence)
        uint32(item)
    if codepoint <= 0x7F'u32:
      if len(output) > 0:
        if offset < len(output):
          output[offset] = cast[B](codepoint and 0x7F'u32)
        else:
          return err(ErrorBufferOverflow)
      inc(offset, 1)
    elif codepoint <= 0x7FF'u32:
      if len(output) > 0:
        if offset + 1 < len(output):
          output[offset + 0] = cast[B](0xC0'u8 or
                                       byte((codepoint shr 6) and 0x1F'u32))
          output[offset + 1] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32))
        else:
          return err(ErrorBufferOverflow)
      inc(offset, 2)
    elif codepoint <= 0xFFFF'u32:
      if len(output) > 0:
        if offset + 2 < len(output):
          output[offset + 0] = cast[B](0xE0'u8 or
                                       byte((codepoint shr 12) and 0x0F'u32))
          output[offset + 1] = cast[B](0x80'u8 or
                                       byte((codepoint shr 6) and 0x3F'u32))
          output[offset + 2] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32))
        else:
          return err(ErrorBufferOverflow)
      inc(offset, 3)
    elif codepoint <= 0x10FFFF'u32:
      if len(output) > 0:
        if offset + 3 < len(output):
          output[offset + 0] = cast[B](0xF0'u8 or
                                       byte((codepoint shr 18) and 0x07'u32))
          output[offset + 1] = cast[B](0x80'u8 or
                                       byte((codepoint shr 12) and 0x3F'u32))
          output[offset + 2] = cast[B](0x80'u8 or
                                       byte((codepoint shr 6) and 0x3F'u32))
          output[offset + 3] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32))
        else:
          return err(ErrorBufferOverflow)
      inc(offset, 4)
    else:
      return err(ErrorInvalidSequence)
  ok(offset)

proc utf32toUtf8*[T: Wides32](input: openarray[T]): UResult[string] {.inline.} =
  ## Converts wide character sequence ``input`` to UTF-8 encoded string.
  var empty: array[0, char]
  let size = ? utf32ToUtf8(input, empty)
  var output = newString(size)
  let res {.used.} = ? utf32ToUtf8(input, output)
  ok(output)

proc utf8toUtf32*[A: Bytes, B: Wides32](input: openarray[A],
                                       output: var openarray[B]): UResult[int] =
  ## Convert UTF-8 encoded array of characters ``input`` to UTF-32 encoded
  ## sequences of 32bit limbs.
  ##
  ## To obtain required size of ``output`` you need to pass ``output`` as
  ## zero-length array, in such way required size will be returned as result of
  ## procedure.
  ##
  ## If size of ``output`` is not zero, and there not enough space in ``output``
  ## array to store whole ``input`` array, error ``ErrorBufferOverflow`` will
  ## be returned.
  var index = 0
  var dindex = 0
  if len(output) == 0:
    return utf8Length(input)
  else:
    while true:
      if index >= len(input):
        break
      let byte1 = uint32(input[index])
      inc(index)

      if (byte1 and 0x80) == 0x00:
        if dindex < len(output):
          output[dindex] = B(byte1)
          inc(dindex)
        else:
          return err(ErrorBufferOverflow)
      elif (byte1 and 0xE0'u32) == 0xC0'u32:
        # Two-byte form (110xxxxx 10xxxxxx)
        if index >= len(input):
          return err(ErrorInvalidSequence)
        # overlong sequence test
        if (byte1 and 0xFE'u32) == 0xC0'u32:
          return err(ErrorInvalidSequence)

        let byte2 = uint32(input[index])
        if (byte2 and 0xC0'u32) != 0x80'u32:
          return err(ErrorInvalidSequence)

        if dindex < len(output):
          output[dindex] = B(((byte1 and 0x1F'u32) shl 6) or
                              (byte2 and 0x3F'u32))
          inc(dindex)
        else:
          return err(ErrorBufferOverflow)
        inc(index)
      elif (byte1 and 0xF0'u32) == 0xE0'u32:
        # Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
        if (index + 1) >= len(input):
          return err(ErrorInvalidSequence)

        let byte2 = uint32(input[index])
        if (byte2 and 0xC0'u32) != 0x80'u32:
          return err(ErrorInvalidSequence)
        # overlong sequence test
        if (byte1 == 0xE0'u32) and ((byte2 and 0xE0'u32) == 0x80'u32):
          return err(ErrorInvalidSequence)
        #  0xD800–0xDFFF (UTF-16 surrogates) test
        if (byte1 == 0xED'u32) and ((byte2 and 0xE0'u32) == 0xA0'u32):
          return err(ErrorInvalidSequence)

        let byte3 = uint32(input[index + 1])
        if (byte3 and 0xC0'u32) != 0x80'u32:
          return err(ErrorInvalidSequence)
        # U+FFFE or U+FFFF test
        if (byte1 == 0xEF'u32) and (byte2 == 0xBF'u32) and
           ((byte3 and 0xFE'u32) == 0xBE'u32):
          return err(ErrorInvalidSequence)

        if dindex < len(output):
          output[dindex] = B(((byte1 and 0x0F'u32) shl 12) or
                             ((byte2 and 0x3F'u32) shl 6) or
                              (byte3 and 0x3F'u32))
          inc(dindex)
        else:
          return err(ErrorBufferOverflow)
        inc(index, 2)

      elif (byte1 and 0xF8'u8) == 0xF0'u8:
        # Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
        if (index + 2) >= len(input):
          return err(ErrorInvalidSequence)

        let byte2 = uint32(input[index])
        if (byte2 and 0xC0'u32) != 0x80'u32:
          return err(ErrorInvalidSequence)
        # overlong sequence test
        if (byte1 == 0xF0'u32) and ((byte2 and 0xF0'u32) == 0x80'u32):
          return err(ErrorInvalidSequence)
        # According to RFC 3629 no point above U+10FFFF should be used, which
        # limits characters to four bytes.
        if ((byte1 == 0xF4'u32) and (byte2 > 0x8F'u32)) or (byte1 > 0xF4'u32):
          return err(ErrorInvalidSequence)

        let byte3 = uint32(input[index + 1])
        if (byte3 and 0xC0'u32) != 0x80'u32:
          return err(ErrorInvalidSequence)

        let byte4 = uint32(input[index + 2])
        if (byte4 and 0xC0'u32) != 0x80'u32:
          return err(ErrorInvalidSequence)

        if dindex < len(output):
          output[dindex] = B(((byte1 and 0x07'u32) shl 18) or
                             ((byte2 and 0x3F'u32) shl 12) or
                             ((byte3 and 0x3F'u32) shl 6) or
                              (byte4 and 0x3F'u32))
          inc(dindex)
        else:
          return err(ErrorBufferOverflow)
        inc(index, 3)

      else:
        return err(ErrorInvalidSequence)

    ok(dindex)

proc utf8toUtf32*[A: Bytes, B: Wides32](et: typedesc[B],
                                        input: openarray[A]): UResult[seq[B]] =
  ## Convert UTF-8 encoded array of characters ``input`` to UTF-32 encoded
  ## sequence of 32bit limbs and return it.
  var empty: array[0, B]
  let size = ? utf8toUtf32(input, empty)
  var output = newSeq[B](size)
  let res {.used.} = ? utf8toUtf32(input, output)
  ok(output)

when defined(posix):
  import posix

  type
    Mbstate {.importc: "mbstate_t",
              header: "<wchar.h>", pure, final.} = object

  proc mbsrtowcs(dest: pointer, src: pointer, n: csize_t,
                 ps: ptr Mbstate): csize_t {.
       importc, header: "<wchar.h>".}

  proc mbstowcs*[A: Bytes, B: Wides32](t: typedesc[B],
                                       input: openarray[A]): UResult[seq[B]] =
    ## Converts multibyte encoded string to OS specific wide char string.
    ##
    ## Note, that `input` should be `0` terminated.
    ##
    ## Encoding is made using `mbsrtowcs`, so procedure supports invalid
    ## sequences and able to decoded all the characters before first invalid
    ## character encountered.

    # Without explicitely setting locale because `mbsrtowcs` will fail with
    # EILSEQ.
    # If locale is an empty string, "", each part of the locale that should
    # be modified is set according to the environment variables.
    let sres = setlocale(LC_ALL, cstring"")
    if isNil(sres):
      return err(ErrorInvalidLocale)

    var buffer = newSeq[B](len(input))
    if len(input) == 0:
      return ok(buffer)

    doAssert(input[^1] == A(0), "Input array should be zero-terminated")
    var data = @input
    var ostr = addr data[0]
    var pstr = ostr
    var mstate = Mbstate()

    while true:
      let res = mbsrtowcs(addr buffer[0], addr pstr, csize_t(len(buffer)),
                          addr mstate)
      if res == cast[csize_t](-1):
        # If invalid multibyte sequence has been encountered, ``pstr`` is left
        ## pointing to the invalid multibyte sequence, ``-1`` is returned, and
        ## errno is set to EILSEQ.
        let diff = cast[uint](pstr) - cast[uint](ostr)
        if diff == 0:
          return err(ErrorInvalidSequence)
        else:
          # We have partially decoded sequence, `diff` is position of first
          # invalid character in sequence.
          data[diff] = A(0x00)
          ostr = addr data[0]
          pstr = ostr
          mstate = Mbstate()
      else:
        # Its safe to convert `csize_t` to `int` here because `len(input)`
        # is also `int`.
        buffer.setLen(res)
        return ok(buffer)