implement practical UTF-8/16 codepoints module

This module is designed for the common cases where a Unicode text representation is converted to a Nim string or blob. It is typically used in a parser or a Unicode byte-stream validator.
2021-07-22 10:39:13 +07:00 · 2021-07-22 10:39:13 +07:00 · 51e7e0ecfd
parent 92d5a8cc55
commit 51e7e0ecfd
3 changed files with 286 additions and 21 deletions
--- a/stew/utf.nim
+++ b/stew/utf.nim
@ -9,33 +9,57 @@
 # DFA based UTF8 decoder/validator
 # See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
 import stew/ranges/ptr_arith
 import stew/results
 type
-  Utf8* = object
+  Utf8*  = object
  Utf16* = object
  Utf32* = object
  Utf*   = uint32
 const
  UTF8_ACCEPT* = 0
  UTF8_REJECT* = 12
-const utf8Table = [
+  highBegin = 0xD800
-  # The first part of the table maps bytes to character classes that
+  highEnd   = 0xDBFF
-  # to reduce the size of the transition table and create bitmasks.
+  lowBegin  = 0xDC00
-  0'u8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  lowEnd    = 0xDFFF
  0   ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0   ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0   ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  1   ,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
  7   ,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  8   ,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10  ,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
-  # The second part is a transition table that maps a combination
+  Utf16Shift = 10
-  # of a state of the automaton and a character class to a state.
+  Utf16Base  = 0x0010000
-  0 ,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+  Utf16Mask  = 0x3FF
-  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+  Utf16Maxbmp= 0xFFFF
-  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+  MaxUtf     = 0x10FFFF
-  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+
-  12,36,12,12,12,12,12,12,12,12,12,12
+  DefaultReplacement* = 0xFFFD
-]
+  InvalidUTF8  = "invalid UTF-8 sequence"
  InvalidUTF16 = "invalid UTF-16 sequence"
  InvalidUTF32 = "invalid UTF-32 sequence"
 const
  # Combined lookup table for the DFA decoder. It is indexed two ways:
  #   utf8Table[byte]                -> character class of that byte
  #   utf8Table[256 + state + class] -> next state
  # States are multiples of 12 (one slot per character class 0..11), so
  # `state + class` addresses a single cell of the transition part.
  utf8Table = [
    # The first part of the table (256 entries) maps bytes to character
    # classes, which reduces the size of the transition table and lets the
    # class double as a shift count for masking the lead byte.
    0'u8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0   ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0   ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0   ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1   ,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
    7   ,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    8   ,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    10  ,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
    # The second part is a transition table that maps the combination of an
    # automaton state and a character class to the next state. Each source
    # line below holds two states (12 classes each); the row for state 12
    # (reject) contains only 12, so rejection is permanent.
    0 ,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
    12,36,12,12,12,12,12,12,12,12,12,12
  ]
 proc validate*[T: byte | char](_: type Utf8, text: openArray[T]): bool =
  var state = 0
@ -43,3 +67,214 @@ proc validate*[T: byte | char](_: type Utf8, text: openArray[T]): bool =
    let x = utf8Table[c.int].int
    state = utf8Table[256 + state + x].int
  state == UTF8_ACCEPT
 proc count*[T: byte | char](_: type Utf8,
                            text: openArray[T]): Result[int, string] =
  ## Counts the code points encoded in the UTF-8 `text`.
  ##
  ## Returns `ok(n)` when the decoder finishes in the accept state and
  ## `err(InvalidUTF8)` otherwise (malformed or truncated input).
  var
    dfaState   = UTF8_ACCEPT
    codepoints = 0
  for unit in text:
    # Byte -> character class, then (state, class) -> next state.
    let charClass = utf8Table[unit.int].int
    dfaState = utf8Table[256 + dfaState + charClass].int
    # Coming back to the accept state marks the end of one code point.
    if dfaState == UTF8_ACCEPT:
      codepoints += 1
  if dfaState != UTF8_ACCEPT:
    return err(InvalidUTF8)
  ok(codepoints)
 proc highSurrogate*(_: type Utf16, c: int): bool =
  ## True when `c` lies in the high (leading) surrogate range
  ## `highBegin..highEnd`.
  c in highBegin .. highEnd
 proc lowSurrogate*(_: type Utf16, c: int): bool =
  ## True when `c` lies in the low (trailing) surrogate range
  ## `lowBegin..lowEnd`.
  c in lowBegin .. lowEnd
 proc utf*(_: type Utf16, c1, c2: int): Utf =
  ## Combines the surrogate pair (`c1` high, `c2` low) into one code point.
  ##
  ## No validation is performed here; callers are expected to have checked
  ## that `c1` and `c2` are a high and a low surrogate respectively.
  let
    upperBits = (c1 - highBegin) shl Utf16Shift
    lowerBits = c2 - lowBegin
  Utf(upperBits + lowerBits + Utf16Base)
 proc inc*(_: type Utf16, cp: int, res: var int): bool =
  ## Adds to `res` the number of UTF-16 code units needed to encode `cp`
  ## (1 for the BMP, 2 for supplementary planes).
  ##
  ## Returns `false`, leaving `res` untouched, when `cp` is not a Unicode
  ## scalar value: negative, a surrogate (U+D800..U+DFFF) or above U+10FFFF.
  if cp < 0 or cp > MaxUtf:
    return false
  # The whole surrogate block is reserved. The upper bound must be `lowEnd`;
  # stopping at `lowBegin` would let U+DC01..U+DFFF through.
  if cp >= highBegin and cp <= lowEnd:
    return false
  if cp <= Utf16Maxbmp:
    inc res
  else:
    inc res, 2
  true
 proc utf16Len*[T: byte | char](_: type Utf8,
                               text: openArray[T]): Result[int, string] =
  ## Computes how many UTF-16 code units are needed to represent the UTF-8
  ## `text`.
  ##
  ## Returns `err(InvalidUTF8)` when `text` is malformed or truncated.
  var
    state = 0
    cp    = 0
    res   = 0
  for c in text:
    let
      # Convert the unit to `int` once: `char` has no bitwise operators and
      # `byte` cannot be mixed with the `int` accumulator.
      b = c.int
      x = utf8Table[b].int
    if state != UTF8_ACCEPT:
      # Continuation byte: append its low six payload bits.
      cp = (b and 0x3f) or (cp shl 6)
    else:
      # Lead byte: the character class doubles as the shift that masks off
      # the length-marker bits.
      cp = (0xff shr x) and b
    state = utf8Table[256 + state + x].int
    if state == UTF8_ACCEPT:
      if not Utf16.inc(cp, res):
        return err(InvalidUTF8)
    elif state == UTF8_REJECT:
      # The reject state is absorbing, so nothing later can recover.
      return err(InvalidUTF8)
  if state == UTF8_ACCEPT:
    ok(res)
  else:
    err(InvalidUTF8)
 proc inc*(_: type Utf8, cp: int, res: var int): bool =
  ## Adds to `res` the number of bytes needed to encode `cp` as UTF-8
  ## (1 to 4).
  ##
  ## Returns `false`, leaving `res` untouched, when `cp` is not a Unicode
  ## scalar value: negative, a surrogate (U+D800..U+DFFF) or above U+10FFFF.
  ## Surrogates are refused because their three-byte encoding is rejected by
  ## the UTF-8 validator in this module.
  if cp < 0 or cp > MaxUtf:
    return false
  if cp >= highBegin and cp <= lowEnd:
    return false
  if cp < 0x80:
    inc res
  elif cp < 0x800:
    inc res, 2
  elif cp < 0x10000:
    inc res, 3
  else:
    inc res, 4
  true
 proc utf8Len*(_: type Utf32, text: openArray[uint32]): Result[int, string] =
  ## Computes how many bytes are needed to encode the UTF-32 `text` as
  ## UTF-8.
  ##
  ## Returns `err(InvalidUTF32)` as soon as a value is above U+10FFFF or is
  ## refused by `Utf8.inc`.
  var res = 0
  for cp in text:
    # Range-check before narrowing: `uint32` does not convert implicitly to
    # `int`, and on 32-bit targets large values would not fit at all.
    if cp > uint32(MaxUtf) or not Utf8.inc(int(cp), res):
      return err(InvalidUTF32)
  ok(res)
 proc utf16Len*(_: type Utf32, text: openArray[uint32]): Result[int, string] =
  ## Computes how many UTF-16 code units are needed to encode the UTF-32
  ## `text`.
  ##
  ## Returns `err(InvalidUTF32)` as soon as a value is above U+10FFFF or is
  ## refused by `Utf16.inc`.
  var res = 0
  for cp in text:
    # Range-check before narrowing: `uint32` does not convert implicitly to
    # `int`, and on 32-bit targets large values would not fit at all.
    if cp > uint32(MaxUtf) or not Utf16.inc(int(cp), res):
      return err(InvalidUTF32)
  ok(res)
 proc utf8Len*(_: type Utf16, text: openArray[uint16]): Result[int, string] =
  ## Computes how many bytes are needed to encode the UTF-16 `text` as
  ## UTF-8.
  ##
  ## Returns `err(InvalidUTF16)` for a high surrogate that is not followed
  ## by a low surrogate, or for a low surrogate with no preceding high one.
  var
    i   = 0
    res = 0
  while i < text.len:
    # Work on `int` copies: the helpers below take `int` and `uint16` does
    # not convert implicitly.
    let c1 = text[i].int
    inc i
    if c1 >= highBegin and c1 <= highEnd:
      # Surrogate pair: the trailing unit must exist and be a low surrogate.
      if i >= text.len:
        return err(InvalidUTF16)
      let c2 = text[i].int
      inc i
      if c2 < lowBegin or c2 > lowEnd:
        return err(InvalidUTF16)
      if not Utf8.inc(int(Utf16.utf(c1, c2)), res):
        return err(InvalidUTF16)
    elif c1 >= lowBegin and c1 <= lowEnd:
      return err(InvalidUTF16)
    # Only a lone BMP unit is counted here; a pair was already counted
    # above and must not be counted a second time.
    elif not Utf8.inc(c1, res):
      return err(InvalidUTF16)
  ok(res)
 proc validate*(_: type Utf16, text: openArray[uint16]): bool =
  ## Checks that `text` is well-formed UTF-16: every high surrogate is
  ## immediately followed by a low surrogate and no low surrogate stands
  ## alone.
  var pos = 0
  while pos < text.len:
    let unit = text[pos]
    inc pos
    if unit >= lowBegin and unit <= lowEnd:
      # Low surrogate without a preceding high surrogate.
      return false
    if unit >= highBegin and unit <= highEnd:
      # A high surrogate needs a trailing unit ...
      if pos >= text.len:
        return false
      # ... and that unit has to be a low surrogate.
      let trail = text[pos]
      if not (trail >= lowBegin and trail <= lowEnd):
        return false
      inc pos
  true
 proc validate*[T: byte | char](_: type Utf16, text: openArray[T]): bool =
  ## Checks that the raw buffer `text` holds well-formed UTF-16.
  ##
  ## Empty input is valid; an odd number of bytes is not. Otherwise the
  ## buffer is handed to the `uint16` overload as `text.len div 2` units.
  # NOTE(review): `makeOpenArray` presumably reinterprets the memory in
  # place, which would mean host byte order and no alignment check -
  # confirm against stew/ranges/ptr_arith.
  if text.len == 0:
    return true
  if text.len mod 2 != 0:
    return false
  let units = text.len div 2
  Utf16.validate(makeOpenArray(text[0].unsafeAddr, uint16, units))
 proc append*(_: type Utf8, text: var (string | seq[byte]), cp: int): bool =
  ## Appends the UTF-8 encoding of code point `cp` to `text`.
  ##
  ## Returns `false`, leaving `text` unchanged, when `cp` is negative or is
  ## refused by `Utf8.inc`.
  # A negative value must never reach the `T(cp)` conversion below, where
  # it would raise a range defect instead of reporting failure.
  if cp < 0:
    return false
  var n = 0
  if not Utf8.inc(cp, n):
    return false
  when text is string:
    type T = char
  else:
    type T = byte
  let pos = text.len
  text.setLen(pos + n)
  if n == 1:
    text[pos + 0] = T(cp)
  elif n == 2:
    text[pos + 0] = T(0xC0 + (cp shr 6))
    text[pos + 1] = T(0x80 + (cp and 0x3F))
  elif n == 3:
    text[pos + 0] = T(0xE0 + ( cp shr 12))
    text[pos + 1] = T(0x80 + ((cp shr 6) and 0x3F))
    text[pos + 2] = T(0x80 + ( cp and 0x3F))
  else:
    text[pos + 0] = T(0xF0 + ( cp shr 18))
    text[pos + 1] = T(0x80 + ((cp shr 12) and 0x3F))
    text[pos + 2] = T(0x80 + ((cp shr 6)  and 0x3F))
    text[pos + 3] = T(0x80 + ( cp and 0x3F))
  true
 proc append*(_: type Utf8, text: var (string | seq[byte]), c1, c2: int): bool =
  ## Appends the UTF-8 encoding of the code point formed by the UTF-16
  ## surrogate pair (`c1` high, `c2` low) to `text`.
  ##
  ## Returns `false`, leaving `text` unchanged, when the two units are not a
  ## high surrogate followed by a low surrogate.
  if not (Utf16.highSurrogate(c1) and Utf16.lowSurrogate(c2)):
    return false
  # `Utf16.utf` returns `Utf` (uint32), which needs an explicit conversion
  # to reach the `int` overload.
  Utf8.append(text, int(Utf16.utf(c1, c2)))
 proc append*(_: type Utf16, text: var seq[uint16], cp: int): bool =
  ## Appends the UTF-16 encoding of code point `cp` to `text`: one unit for
  ## the BMP, a surrogate pair for supplementary planes.
  ##
  ## Returns `false`, leaving `text` unchanged, when `cp` is not a Unicode
  ## scalar value: negative, a surrogate (U+D800..U+DFFF) or above U+10FFFF.
  if cp < 0 or cp > MaxUtf:
    return false
  # The whole surrogate block is reserved. The upper bound must be `lowEnd`;
  # stopping at `lowBegin` would emit lone low surrogates.
  if cp >= highBegin and cp <= lowEnd:
    return false
  if cp <= Utf16Maxbmp:
    text.add uint16(cp)
  else:
    let c = cp - Utf16Base
    text.add uint16((c shr Utf16Shift) + highBegin)
    text.add uint16((c and Utf16Mask) + lowBegin)
  true
 proc append*[T: byte | char](_: type Utf16,
                             res: var seq[uint16],
                             text: openArray[T]): Result[int, string] =
  ## Transcodes the UTF-8 `text` to UTF-16 and appends it to `res`.
  ##
  ## Returns the number of code units appended, or the error from
  ## `Utf8.utf16Len` (leaving `res` unchanged) when `text` is not valid.
  # First pass validates the input and sizes the output, so the second
  # pass can write by index without further checks.
  let r = Utf8.utf16Len(text)
  if r.isErr:
    return r
  var pos = res.len
  res.setLen(pos + r.get())
  var
    state = 0
    cp    = 0
  for c in text:
    let
      # Convert the unit to `int` once: `char` has no bitwise operators and
      # `byte` cannot be mixed with the `int` accumulator.
      b = c.int
      x = utf8Table[b].int
    if state != UTF8_ACCEPT:
      # Continuation byte: append its low six payload bits.
      cp = (b and 0x3f) or (cp shl 6)
    else:
      # Lead byte: the character class doubles as the shift that masks off
      # the length-marker bits.
      cp = (0xff shr x) and b
    state = utf8Table[256 + state + x].int
    if state == UTF8_ACCEPT:
      if cp <= Utf16Maxbmp:
        res[pos] = uint16(cp)
        inc pos
      else:
        # Surrogates encode the offset from U+10000, not the code point
        # itself; without the subtraction the high unit is wrong.
        let v = cp - Utf16Base
        res[pos + 0] = uint16((v shr Utf16Shift) + highBegin)
        res[pos + 1] = uint16((v and Utf16Mask) + lowBegin)
        inc pos, 2
  return r
--- a/tests/all_tests.nim
+++ b/tests/all_tests.nim
@ -27,4 +27,5 @@ import
  test_sequtils2,
  test_results,
  test_varints,
-  test_winacl
+  test_winacl,
  test_utf
--- a/tests/test_utf.nim
+++ b/tests/test_utf.nim
@ -65,3 +65,32 @@ suite "UTF-8 DFA validator":
      Utf8.validate("foob\xc3\xa6r")
      Utf8.validate("foob\xf0\x9f\x99\x88r")
  test "boundary test":
    # Probes the first and last code point of each UTF-8 sequence length,
    # plus the values just around the surrogate gap.
    check:
      Utf8.validate("κόσμε")
      # Smallest code points of the 2-, 3- and 4-byte forms:
      # U+0080, U+0800 and U+10000.
      Utf8.validate("\xC2\x80")
      Utf8.validate("\xE0\xA0\x80")
      Utf8.validate("\xF0\x90\x80\x80")
      # 5- and 6-byte forms (lead bytes 0xF8 / 0xFC) are not valid UTF-8.
      Utf8.validate("\xF8\x88\x80\x80\x80") == false
      Utf8.validate("\xFC\x84\x80\x80\x80\x80") == false
      # Largest code points of the 1- to 4-byte forms:
      # U+007F, U+07FF, U+FFFF and U+10FFFF.
      Utf8.validate("\x7F")
      Utf8.validate("\xDF\xBF")
      Utf8.validate("\xEF\xBF\xBF")
      Utf8.validate("\xF4\x8F\xBF\xBF")
      # U+110000 is one past the last code point; the remaining two are
      # again 5- and 6-byte forms.
      Utf8.validate("\xF4\x90\x80\x80") == false
      Utf8.validate("\xFB\xBF\xBF\xBF\xBF") == false
      Utf8.validate("\xFD\xBF\xBF\xBF\xBF\xBF") == false
      # U+D7FF and U+E000 sit just below and just above the surrogate
      # range; U+FFFD is the replacement character.
      Utf8.validate("\xed\x9f\xbf")
      Utf8.validate("\xee\x80\x80")
      Utf8.validate("\xef\xbf\xbd")
 #[
 import unicode, strutils
 func toHex(s: string): string =
  for c in s:
    result.add toHex(c.int, 2)
 echo toUTF8(0x110000.Rune).toHex
 ]#