274 lines
6.8 KiB
Nim

import strtabs
import strutils
import tables
import unicode_data
import two_stage_table
import min_perfect_hash
import utils
# See http://www.unicode.org/versions/latest/ch04.pdf
# Section 4.8 ("Name")
const
  # Inclusive [first, last] code point ranges. `parseNames` leaves the code
  # points of these ranges out of the stored names because their names can
  # be auto generated. The first range is the Hangul syllables block.
  prefixRanges = [
    [0xAC00, 0xD7A3], [0x3400, 0x4DBF],
    [0x4E00, 0x9FFC], [0x20000, 0x2A6DD],
    [0x2A700, 0x2B734], [0x2B740, 0x2B81D],
    [0x2B820, 0x2CEA1], [0x2CEB0, 0x2EBE0],
    [0x30000, 0x3134A], [0x17000, 0x187F7],
    [0x18D00, 0x18D08], [0x18B00, 0x18CD5],
    [0x1B170, 0x1B2FB], [0xF900, 0xFA6D],
    [0xFA70, 0xFAD9], [0x2F800, 0x2FA1D]
  ]

proc isInPrefixRange(cp: int): bool =
  ## Return true when `cp` lies inside any of the inclusive ranges listed
  ## in `prefixRanges`, false otherwise.
  for bounds in prefixRanges:
    if cp in bounds[0] .. bounds[1]:
      return true
  return false
proc parseNames(namesRaw: seq[string]): seq[string] =
  ## Return a seq of the same length as `namesRaw` (indexed by code point)
  ## that keeps only the names which have to be stored; every other slot is
  ## left as an empty string.
  ##
  ## A name is left out when it is empty, when its code point is inside
  ## `prefixRanges` (those names can be auto generated), or when it is an
  ## un-named entry of the form `<...>`.
  result = newSeq[string](namesRaw.len)
  for cp, rawName in namesRaw:
    let
      generated = isInPrefixRange(cp)
      unnamed = rawName.startsWith("<") and rawName.endsWith(">")
    if rawName.len > 0 and not generated and not unnamed:
      result[cp] = rawName
# See http://www.unicode.org/versions/latest/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
const
  # Constants of the Hangul syllable arithmetic.
  SBase = 0xAC00  # first code point accepted by `hangulName`
  LBase = 0x1100
  VBase = 0x1161
  TBase = 0x11A7
  LCount = 19  # number of entries in `jamoLTable`
  VCount = 21  # number of entries in `jamoVTable`
  TCount = 28  # number of entries in `jamoTTable`
  NCount = VCount * TCount  # 588
  SCount = LCount * NCount  # 11172

  # Name fragments, indexed by the L, V and T indices computed in
  # `hangulName`.
  jamoLTable = [
    "G", "GG", "N", "D", "DD", "R", "M",
    "B", "BB", "S", "SS", "", "J", "JJ",
    "C", "K", "T", "P", "H"
  ]
  jamoVTable = [
    "A", "AE", "YA", "YAE", "EO", "E", "YEO",
    "YE", "O", "WA", "WAE", "OE", "YO", "U",
    "WEO", "WE", "WI", "YU", "EU", "YI", "I"
  ]
  jamoTTable = [
    "", "G", "GG", "GS", "N", "NJ", "NH",
    "D", "L", "LG", "LM", "LB", "LS", "LT",
    "LP", "LH", "M", "B", "BS", "S", "SS",
    "NG", "J", "C", "K", "T", "P", "H"
  ]

# todo: move hangulName from src/unicodedbpkg/names to src/unicodedbpkg/hangul
#       and import that, then remove this.
proc hangulName(s: int): string =
  ## Return the name of the Hangul syllable with code point `s`: the
  ## `HANGUL SYLLABLE ` prefix followed by one entry of each Jamo table,
  ## selected from the offset of `s` relative to `SBase`.
  ##
  ## Raises `ValueError` when `s` is outside `SBase ..< SBase + SCount`.
  let syllable = s - SBase
  if syllable notin 0 ..< SCount:
    raise newException(
      ValueError, "Not a Hangul Syllable: $#" % [intToStr(s)])
  let
    leading = syllable div NCount
    vowel = (syllable mod NCount) div TCount
    trailing = syllable mod TCount
  "HANGUL SYLLABLE " & jamoLTable[leading] & jamoVTable[vowel] &
    jamoTTable[trailing]
proc buildWords(names: seq[string]): seq[string] =
  ## Return every distinct word of `names`, in order of first appearance.
  ## Names are split on single spaces, the comparison is case sensitive and
  ## empty names are ignored.
  result = @[]
  # The string table is only used as a set of the words already emitted.
  var seen = newStringTable(modeCaseSensitive)
  for name in names:
    if name.len > 0:
      for word in name.split(' '):
        if not seen.hasKey(word):
          seen[word] = ""
          result.add(word)
type
  WordsTable = ref object
    # Character codes of all words back to back; each word is followed by
    # a terminating 0.
    words: seq[int]
    # offsets[i] is the index in `words` where the i-th word starts.
    offsets: seq[int]

proc buildWordsTable(words: seq[string]): WordsTable =
  ## Pack `words` into one flat array of null-terminated words together
  ## with an offset table recording where each word starts.
  var totalLen = 0
  for word in words:
    totalLen += word.len + 1  # +1 for the terminating 0
  result = WordsTable(
    words: newSeqOfCap[int](totalLen),
    offsets: newSeqOfCap[int](words.len))
  for word in words:
    # The next free slot of the flat array is where this word begins.
    result.offsets.add(result.words.len)
    for ch in word:
      result.words.add(ord(ch))
    result.words.add(0)
type
  NamesTable = ref object
    # Name records stored back to back. Each record is a header holding
    # `name.len` followed by one index into the words list per word.
    names: seq[int]
    # offsets[cp] is the index in `names` where the record of code point
    # `cp` starts, or -1 when `cp` has no stored name.
    offsets: seq[int]

proc buildNamesTable(names: seq[string], words: seq[string]): NamesTable =
  ## Return the table used to retrieve a name from a code point.
  ##
  ## `names` is indexed by code point (empty string = no name) and `words`
  ## is the list of unique words; every word of every name must be in it,
  ## otherwise a `KeyError` is raised.
  ##
  ## The result holds the records `[lenA, wordAIdx, wordBIdx, lenB, ...]`
  ## and an offset per code point. Code points sharing the same name share
  ## one record; the number of such shared names is echoed as `dedups`.
  ##
  ## NOTE(review): the record header is `name.len`, i.e. the length of the
  ## name in characters, not its number of words -- confirm this is what
  ## the reader of the generated `namesTable` expects.
  var maxNamesSize = 0
  for name in names:
    if name.len > 0:
      # One header slot plus one slot per space-separated word.
      maxNamesSize += name.count(' ') + 2
  result = NamesTable(
    names: newSeqOfCap[int](maxNamesSize),
    offsets: newSeq[int](names.len))
  for i in 0 ..< result.offsets.len:
    result.offsets[i] = -1

  # word -> index in `words`
  var wordIndex = initTable[string, int]()
  for i, word in words:
    wordIndex[word] = i
  # name -> first code point seen with that name
  var firstCp = initTable[string, int]()
  var dedupsCount = 0
  for cp, name in names:
    if name.len == 0:
      continue
    # De-duplicate name: point at the record already emitted for it.
    let seenAt = firstCp.getOrDefault(name, -1)
    if seenAt >= 0:
      inc dedupsCount
      result.offsets[cp] = result.offsets[seenAt]
      continue
    firstCp[name] = cp
    # The record starts at the next free slot of `names`.
    result.offsets[cp] = result.names.len
    result.names.add(name.len)
    for word in name.split(' '):
      result.names.add(wordIndex[word])
  echo "dedups: $#" % $dedupsCount
proc buildNameLookup(namesRaw: seq[string]): seq[Record[int]] =
  ## Return the records used to retrieve a code point from a name: one
  ## record per non-empty entry of `namesRaw` (indexed by code point),
  ## followed by one record per Hangul syllable in 0xAC00 .. 0xD7A3 whose
  ## name comes from `hangulName`.
  proc toRecord(name: string, cp: int): tuple[key: seq[int], value: int] =
    ## Pair the character codes of `name` (the key) with `cp` (the value).
    result = (key: newSeq[int](name.len), value: cp)
    for i, ch in name:
      result.key[i] = ord(ch)

  result = newSeqOfCap[Record[int]](namesRaw.len)
  for cp, name in namesRaw:
    if name.len > 0:
      result.add(toRecord(name, cp))
  # Add Hangul Syllables
  for cp in 0xAC00 .. 0xD7A3:
    result.add(toRecord(hangulName(cp), cp))
# Text of the generated data module. The eight `$#` placeholders are filled,
# in order, by the `%` call in the main block below: names offsets, names
# indices, names table, words offsets, words data, block size, names hashes
# and names values.
# NOTE(review): the entries after `const` are flush-left in this literal;
# a Nim `const` section needs indented entries -- confirm the leading
# whitespace of the on-disk template before relying on the generated file.
const namesTemplate = """## This is auto-generated. Do not modify it
const
namesOffsets* = [
$#
]
namesIndices* = [
$#
]
namesTable* = [
$#
]
wordsOffsets* = [
$#
]
wordsData* = [
$#
]
blockSize* = $#
namesHashes* = [
$#
]
namesValues* = [
$#
]
"""
when isMainModule:
  # Generator entry point: read the names from UnicodeData.txt, build the
  # words/names tables, the two stage table and the hash tables, then write
  # them into the generated names data module. Progress is echoed on the way.
  echo "unicode data"
  let names = parseNames(parseUDNames("./gen/UCD/UnicodeData.txt"))

  echo "words"
  let words = buildWords(names)

  echo "words Table"
  let wordsTbl = buildWordsTable(words)
  echo wordsTbl.words.len
  echo wordsTbl.offsets.len

  echo "names Table"
  let namesTbl = buildNamesTable(names, words)
  echo namesTbl.names.len
  echo namesTbl.offsets.len
  echo "done"

  echo "two stage table"
  let stages = findBestTable(namesTbl.offsets)
  assert stages.blockSize > 0
  echo stages.blockSize
  echo stages.stage1.len
  echo stages.stage2.len

  # todo: has to check the returned CP matches the given name (the reverse)
  let hashTables = mph(buildNameLookup(names))
  # Manual lookup check, kept for debugging:
  #var fooNameInts: seq[int] = @[]
  #for c in "LATIN CAPITAL LETTER C WITH DOT ABOVE":
  #  fooNameInts.add(ord(c))
  #echo mphLookup(hashTables.h, hashTables.v, fooNameInts)

  let outFile = open("./src/unicodedb/names_data.nim", fmWrite)
  try:
    # The argument order must match the `$#` placeholders of namesTemplate.
    let generated = namesTemplate % [
      prettyTable(stages.stage1, 15, "'u8"),
      prettyTable(stages.stage2, 5, "'i32"),
      prettyTable(namesTbl.names, 10, "'i16"),
      prettyTable(wordsTbl.offsets, 10, "'i32"),
      prettyTable(wordsTbl.words, 15, "'i8"),
      $stages.blockSize,
      prettyTable(hashTables.h, 15, "'u16"),
      prettyTable(hashTables.v, 10, "'i32")
    ]
    outFile.write(generated)
  finally:
    outFile.close()