mirror of
https://github.com/waku-org/nwaku.git
synced 2025-02-22 20:08:40 +00:00
177 lines
3.6 KiB
Nim
177 lines
3.6 KiB
Nim
# https://unicode.org/reports/tr29/
|
|
|
|
import strutils
|
|
|
|
import derived_data
|
|
import two_stage_table
|
|
import utils
|
|
|
|
# Integer codes for the word-break property classes handled by this
# generator. The values are stringified and substituted into the generated
# module, so they must stay stable and unique.
const
  # Default class and quotation marks.
  sgwOther = 0
  sgwDoubleQuote = 1
  sgwSingleQuote = 2
  sgwHebrewLetter = 3
  # Line terminators.
  sgwCr = 4
  sgwLf = 5
  sgwNewline = 6
  # Extending / formatting classes.
  sgwExtend = 7
  sgwRegionalIndicator = 8
  sgwFormat = 9
  # Letters, digits and the punctuation allowed between them.
  sgwKatakana = 10
  sgwAletter = 11
  sgwMidLetter = 12
  sgwMidNum = 13
  sgwMidNumLet = 14
  sgwNumeric = 15
  sgwExtendNumLet = 16
  # Joiner and segmentation whitespace.
  sgwZwj = 17
  sgwWsegSpace = 18
  # Assigned from the emoji data file rather than the word-break file.
  sgwExtendedPictographic = 19
|
|
|
|
func wordMap(s: string): int =
  ## Translates a word-break property name (e.g. `"ALetter"`) into the
  ## matching `sgw*` integer code.
  ##
  ## `"Extended_Pictographic"` is not handled here; it has its own mapping
  ## in `emojiMap`.
  ##
  ## Fails with an assertion error when `s` is not one of the known names.
  result = case s
    of "Other": sgwOther
    of "Double_Quote": sgwDoubleQuote
    of "Single_Quote": sgwSingleQuote
    of "Hebrew_Letter": sgwHebrewLetter
    of "CR": sgwCr
    of "LF": sgwLf
    of "Newline": sgwNewline
    of "Extend": sgwExtend
    of "Regional_Indicator": sgwRegionalIndicator
    of "Format": sgwFormat
    of "Katakana": sgwKatakana
    of "ALetter": sgwAletter
    of "MidLetter": sgwMidLetter
    of "MidNum": sgwMidNum
    of "MidNumLet": sgwMidNumLet
    of "Numeric": sgwNumeric
    of "ExtendNumLet": sgwExtendNumLet
    of "ZWJ": sgwZwj
    of "WSegSpace": sgwWsegSpace
    else:
      # `doAssert` (unlike `assert`) is never compiled out, so an
      # unrecognised property name can not silently leak the -99
      # placeholder into the generated table.
      doAssert false, "unknown word break property: " & s
      -99
|
|
|
|
func emojiMap(s: string): int =
  ## Translates an emoji property name into an `sgw*` code.
  ##
  ## Only `"Extended_Pictographic"` is recognised; every other name
  ## yields `-1`.
  if s == "Extended_Pictographic":
    result = sgwExtendedPictographic
  else:
    result = -1
|
|
|
|
# One `sgw*` code per table slot; `parseWordBreak` indexes it with the keys
# of the parsed data (named `cp` there).
type WordProps = seq[int]
|
|
|
|
proc parseWordBreak(filePath, filePathEmoji: string): WordProps =
  ## Builds the table of `sgw*` codes from two data files.
  ##
  ## `filePath` is parsed with `parseUDDNoDups` and its first field per
  ## entry is mapped through `wordMap`. `filePathEmoji` is parsed with
  ## `parseUDDEmoji`; entries whose first field `emojiMap` recognises
  ## overwrite whatever the first pass stored.
  let wordEntries = parseUDDNoDups(filePath)
  result = newSeq[int](wordEntries.len)
  # Entries without any property data keep the default class.
  for slot in result.mitems:
    slot = sgwOther
  for cp, props in wordEntries.pairs:
    if props.len > 0:
      result[cp] = wordMap(props[0])
  let emojiEntries = parseUDDEmoji(filePathEmoji)
  for cp, props in emojiEntries.pairs:
    if props.len > 0:
      let code = emojiMap(props[0])
      # -1 marks an emoji property this table does not track.
      if code != -1:
        result[cp] = code
|
|
|
|
func buildWordBreak(wordProps: WordProps): Stages[int] =
  ## Hands the property table to `buildTwoStageTable` and returns the
  ## resulting stages unchanged.
  result = wordProps.buildTwoStageTable
|
|
|
|
# Source text of the generated module. It is filled with `strutils.%`, so
# each `$#` consumes the next argument in order: first the 20 `sgw*` codes
# (same order as listed here), then the index table, the data table and the
# block size. The members of every `type`/`const` section must stay
# indented, otherwise the generated module is not valid Nim.
const dataTemplate = """## This is auto-generated. Do not modify it

type
  SgWord* = distinct int8

const
  sgwOther* = $#.SgWord
  sgwDoubleQuote* = $#.SgWord
  sgwSingleQuote* = $#.SgWord
  sgwHebrewLetter* = $#.SgWord
  sgwCr* = $#.SgWord
  sgwLf* = $#.SgWord
  sgwNewline* = $#.SgWord
  sgwExtend* = $#.SgWord
  sgwRegionalIndicator* = $#.SgWord
  sgwFormat* = $#.SgWord
  sgwKatakana* = $#.SgWord
  sgwAletter* = $#.SgWord
  sgwMidLetter* = $#.SgWord
  sgwMidNum* = $#.SgWord
  sgwMidNumLet* = $#.SgWord
  sgwNumeric* = $#.SgWord
  sgwExtendNumLet* = $#.SgWord
  sgwZwj* = $#.SgWord
  sgwWsegSpace* = $#.SgWord
  sgwExtendedPictographic* = $#.SgWord

const
  wordBreakIndices* = [
    $#
  ]
  wordBreakData* = [
    $#
  ]
  wordBreakBlockSize* = $#
"""
|
|
|
|
when isMainModule:
  # Parse both data files and turn the result into the staged table.
  let wordPropsTable = parseWordBreak(
    "./gen/UCD/auxiliary/WordBreakProperty.txt",
    "./gen/UCD/emoji/emoji-data.txt").buildWordBreak

  # The output file is opened before the template is rendered and is
  # closed even if rendering or writing fails.
  let outFile = open("./src/unicodedb/segmentation_data.nim", fmWrite)
  try:
    # Argument order must match the order of the `$#` placeholders in
    # `dataTemplate`: the 20 `sgw*` codes, then stage1, stage2, blockSize.
    let rendered = dataTemplate % [
      $sgwOther,
      $sgwDoubleQuote,
      $sgwSingleQuote,
      $sgwHebrewLetter,
      $sgwCr,
      $sgwLf,
      $sgwNewline,
      $sgwExtend,
      $sgwRegionalIndicator,
      $sgwFormat,
      $sgwKatakana,
      $sgwAletter,
      $sgwMidLetter,
      $sgwMidNum,
      $sgwMidNumLet,
      $sgwNumeric,
      $sgwExtendNumLet,
      $sgwZwj,
      $sgwWsegSpace,
      $sgwExtendedPictographic,
      # NOTE(review): 15 is presumably the number of items per output
      # line and the last argument a literal suffix - confirm in `utils`.
      prettyTable(wordPropsTable.stage1, 15, "'i16"),
      prettyTable(wordPropsTable.stage2, 15, "'i8"),
      $wordPropsTable.blockSize
    ]
    outFile.write(rendered)
  finally:
    outFile.close()
|