mirror of
https://github.com/waku-org/nwaku.git
synced 2025-02-22 20:08:40 +00:00
359 lines
8.2 KiB
Nim
359 lines
8.2 KiB
Nim
import unicode
|
|
import strutils
|
|
import sequtils
|
|
|
|
import ../src/unicodedb/properties
|
|
import ../src/unicodedb/compositions
|
|
import ../src/unicodedb/decompositions
|
|
import ../src/unicodedb/types
|
|
import ../src/unicodedb/casing
|
|
import ../src/unicodedb/segmentation
|
|
|
|
proc write(path: string, s: string) =
  ## Writes `s` to the file at `path`, which is opened in `fmWrite` mode.
  ## The handle is closed on every exit path, including when the write
  ## raises.
  let f = open(path, fmWrite)
  defer: close(f)
  f.write(s)
|
|
|
|
proc isAssigned(r: Rune): bool =
  ## Returns true unless the category reported for `r` by
  ## `unicodeCategory` is `ctgCn`.
  result = unicodeCategory(r) != ctgCn
|
|
|
|
# Inclusive upper bound of every `0 .. maxCP` code point scan in this file;
# 0x10FFFF is the highest code point in the Unicode code space.
const maxCP = 0x10FFFF
|
|
|
|
proc bidiData(): seq[tuple[
    cpFirst: int,
    cpLast: int,
    bi: string,
    assigned: bool]] =
  ## Walks the code points `0 .. maxCP` and groups consecutive ones that
  ## share both their `bidirectional` value and their `isAssigned` result.
  ## Each group becomes one entry holding the inclusive range
  ## `cpFirst .. cpLast` plus the shared values.
  result = @[]
  var
    runStart = 0
    runBidi = bidirectional(Rune(0))
    runAssigned = isAssigned(Rune(0))
  for cp in 0 .. maxCP:
    let
      r = Rune(cp)
      bidi = bidirectional(r)
      assigned = isAssigned(r)
    if bidi == runBidi and assigned == runAssigned:
      continue
    # A value changed: close the run that ended at the previous code point.
    result.add((
      cpFirst: runStart,
      cpLast: cp - 1,
      bi: runBidi,
      assigned: runAssigned))
    runBidi = bidi
    runAssigned = assigned
    runStart = cp
  # The last run is never closed inside the loop, so emit it here.
  result.add((
    cpFirst: runStart,
    cpLast: maxCP,
    bi: runBidi,
    assigned: runAssigned))
|
|
|
|
# Source text of the generated bidi test-data module; `$#` is replaced
# with the rendered rows.
const bidiTemplate =
  "const allBidis* = [\n" &
  "$#]\n"
|
|
|
|
proc categoryData(): seq[tuple[cpFirst: int, cpLast: int, cat: UnicodeCategory]] =
  ## Walks the code points `0 .. maxCP` and collapses consecutive ones with
  ## the same `unicodeCategory` value into inclusive
  ## `cpFirst .. cpLast` ranges.
  result = @[]
  var
    runStart = 0
    runCat = unicodeCategory(Rune(0))
  for cp in 0 .. maxCP:
    let current = unicodeCategory(Rune(cp))
    if current == runCat:
      continue
    # Category changed: the previous run ends one code point earlier.
    result.add((cpFirst: runStart, cpLast: cp - 1, cat: runCat))
    runCat = current
    runStart = cp
  # Emit the run that reaches the end of the scanned range.
  result.add((cpFirst: runStart, cpLast: maxCP, cat: runCat))
|
|
|
|
# Source text of the generated category test-data module; `$#` is
# replaced with the rendered rows.
const catTemplate =
  "const allCats* = [\n" &
  "$#]\n"
|
|
|
|
proc combiningData(): seq[tuple[
    cpFirst: int,
    cpLast: int,
    ccc: int,
    assigned: bool]] =
  ## Walks the code points `0 .. maxCP` and groups consecutive ones that
  ## share both their `combining` value and their `isAssigned` result.
  ## Each group becomes one entry holding the inclusive range
  ## `cpFirst .. cpLast` plus the shared values.
  result = @[]
  var
    runStart = 0
    runCcc = combining(Rune(0))
    runAssigned = isAssigned(Rune(0))
  for cp in 0 .. maxCP:
    let
      r = Rune(cp)
      ccc = combining(r)
      assigned = isAssigned(r)
    if ccc == runCcc and assigned == runAssigned:
      continue
    # A value changed: close the run that ended at the previous code point.
    result.add((
      cpFirst: runStart,
      cpLast: cp - 1,
      ccc: runCcc,
      assigned: runAssigned))
    runCcc = ccc
    runAssigned = assigned
    runStart = cp
  # The last run is never closed inside the loop, so emit it here.
  result.add((
    cpFirst: runStart,
    cpLast: maxCP,
    ccc: runCcc,
    assigned: runAssigned))
|
|
|
|
# Source text of the generated combining test-data module; `$#` is
# replaced with the rendered rows.
const combiningTemplate =
  "const allCombining* = [\n" &
  "$#]\n"
|
|
|
|
proc compositionData(): seq[array[3, int]] =
  ## Collects `[cp, first, second]` for every code point in `0 .. maxCP`
  ## whose `canonicalDecomposition` has exactly two runes and for which
  ## `composition` of those two runes succeeds.
  result = @[]
  var composed = Rune(0)
  for cp in 0 .. maxCP:
    let parts = canonicalDecomposition(Rune(cp))
    if parts.len != 2:
      continue
    if not composition(composed, parts[0], parts[1]):
      continue
    result.add([cp, int(parts[0]), int(parts[1])])
    # Sanity check: recomposing the pair must give back the code point.
    assert composed == Rune(cp)
|
|
|
|
# Source text of the generated compositions test-data module; `$#` is
# replaced with the rendered rows.
const compositionTemplate =
  "const allComps* = [\n" &
  "$#]\n"
|
|
|
|
proc decompositionData(): seq[tuple[
    cp: int,
    isCanonical: bool,
    dcp: seq[int]]] =
  ## Collects one entry for every code point in `0 .. maxCP` whose
  ## `decomposition` is non-empty. `dcp` holds the decomposition as
  ## integers; `isCanonical` is true when `canonicalDecomposition` of the
  ## same code point is non-empty as well.
  result = @[]
  for cp in 0 .. maxCP:
    let parts = decomposition(Rune(cp))
    if parts.len == 0:
      continue
    let asInts = mapIt(parts, int(it))
    result.add((
      cp: cp,
      isCanonical: canonicalDecomposition(Rune(cp)).len > 0,
      dcp: asInts))
|
|
|
|
# Source text of the generated decompositions test-data module; `$#` is
# replaced with the rendered rows.
const decompositionTemplate =
  "const allDecomps* = [\n" &
  "$#]\n"
|
|
|
|
proc typesData(): seq[tuple[
    first: int,
    last: int,
    de: bool,
    di: bool,
    nu: bool,
    lo: bool,
    up: bool,
    asig: bool]] =
  ## Walks the code points `0 .. maxCP` and groups consecutive ones with
  ## identical type flags into inclusive `first .. last` ranges. The flags
  ## recorded per range are decimal, digit, numeric, lowercase and
  ## uppercase membership in `unicodeTypes`, plus the `isAssigned` result.
  #
  # Flag values, kept from the original source for reference:
  #   utmDecimal = 1      utmDigit = 2         utmNumeric = 4
  #   utmLowercase = 8    utmUppercase = 16    utmCased = 32
  #   utmWhiteSpace = 64  utmWord = 128

  proc flagsAt(cp: int): array[6, bool] =
    ## Computes the six compared flags for a single code point.
    let kinds = unicodeTypes(Rune(cp))
    result = [
      utmDecimal in kinds,
      utmDigit in kinds,
      utmNumeric in kinds,
      utmLowercase in kinds,
      utmUppercase in kinds,
      isAssigned(Rune(cp))]

  result = @[]
  var
    runStart = 0
    runFlags = flagsAt(0)
  for cp in 0 .. maxCP:
    let flags = flagsAt(cp)
    if flags == runFlags:
      continue
    # Some flag changed: close the run that ended at the previous code point.
    result.add((
      first: runStart,
      last: cp - 1,
      de: runFlags[0],
      di: runFlags[1],
      nu: runFlags[2],
      lo: runFlags[3],
      up: runFlags[4],
      asig: runFlags[5]))
    runFlags = flags
    runStart = cp
  # The last run is never closed inside the loop, so emit it here.
  result.add((
    first: runStart,
    last: maxCP,
    de: runFlags[0],
    di: runFlags[1],
    nu: runFlags[2],
    lo: runFlags[3],
    up: runFlags[4],
    asig: runFlags[5]))
|
|
|
|
# Source text of the generated types test-data module; `$#` is replaced
# with the rendered rows.
const typesTemplate =
  "const allTypes* = [\n" &
  "$#]\n"
|
|
|
|
type
  Casing = tuple[cp: int, cps: seq[int]]
    ## One casing record: a source code point `cp` and the code points
    ## `cps` produced for it by a case conversion.
|
|
|
|
template casingData(conversion): untyped {.dirty.} =
  ## Body shared by the casing generators. For every code point in
  ## `0 .. maxCP` it materialises `conversion(rune)` and appends a
  ## `(cp, cps)` entry to the caller's `result` when the conversion yields
  ## more than one rune, or exactly one rune that differs from the input.
  ## Being `dirty`, the template writes straight into the enclosing
  ## proc's `result`.
  for cp in 0 .. maxCP:
    let converted = toSeq(conversion(Rune(cp)))
    if converted.len > 1 or
        (converted.len == 1 and converted[0] != Rune(cp)):
      result.add((cp: cp, cps: mapIt(converted, int(it))))
|
|
|
|
proc lowercaseData(): seq[Casing] =
  ## Casing entries produced by running `casingData` with the `lowerCase`
  ## conversion.
  result = @[]
  casingData(lowerCase)
|
|
|
|
proc uppercaseData(): seq[Casing] =
  ## Casing entries produced by running `casingData` with the `upperCase`
  ## conversion.
  result = @[]
  casingData(upperCase)
|
|
|
|
proc titlecaseData(): seq[Casing] =
  ## Casing entries produced by running `casingData` with the `titleCase`
  ## conversion.
  result = @[]
  casingData(titleCase)
|
|
|
|
proc casefoldData(): seq[Casing] =
  ## Casing entries produced by running `casingData` with the `caseFold`
  ## conversion.
  result = @[]
  casingData(caseFold)
|
|
|
|
# Source text of the generated casing test-data module. The four `$#`
# placeholders take, in order, the lowercase, uppercase, titlecase and
# casefold rows.
const casingTemplate =
  "const allLowercase* = [\n" &
  "$#]\n" &
  "const allUppercase* = [\n" &
  "$#]\n" &
  "const allTitlecase* = [\n" &
  "$#]\n" &
  "const allCasefold* = [\n" &
  "$#]\n"
|
|
|
|
type
  WordBreak = tuple[cpFirst: int, cpLast: int, prop: int]
    ## One word-break record: the inclusive code point range
    ## `cpFirst .. cpLast` and the integer value `prop` shared by it.
|
|
|
|
proc wordBreakData(): seq[WordBreak] =
  ## Walks the code points `0 .. maxCP` and collapses consecutive ones with
  ## the same `wordBreakProp` value (as an integer) into inclusive
  ## `cpFirst .. cpLast` ranges.
  result = @[]
  var
    runStart = 0
    runProp = int(wordBreakProp(Rune(0)))
  for cp in 0 .. maxCP:
    let current = int(wordBreakProp(Rune(cp)))
    if current == runProp:
      continue
    # Property changed: the previous run ends one code point earlier.
    result.add((cpFirst: runStart, cpLast: cp - 1, prop: runProp))
    runProp = current
    runStart = cp
  # Emit the run that reaches the end of the scanned range.
  result.add((cpFirst: runStart, cpLast: maxCP, prop: runProp))
|
|
|
|
# Source text of the generated word-break test-data module; `$#` is
# replaced with the rendered rows.
const wordBreakTemplate =
  "const allWordBreak* = [\n" &
  "$#]\n"
|
|
|
|
proc `$`(uctg: UnicodeCategory): string =
  ## Renders a `UnicodeCategory` as the decimal text of its integer value.
  result = $int(uctg)
|
|
|
|
when isMainModule:
  # Entry point: regenerates every test-data module under ./tests from the
  # generator procs above. Paths are relative to the current directory.

  proc renderRows[T](rows: openArray[T]): string =
    ## Renders each element of `rows` as one line of an array literal:
    ## two spaces, the element's `$` form, a comma and a line feed.
    ## Replaces the row-formatting loop that was previously repeated for
    ## every data set.
    result = ""
    for row in rows:
      result.add("  ")
      result.add($row)
      result.add(",\L")

  echo "Generating bidirectional data"
  write(
    "./tests/bidi_test_data.nim",
    bidiTemplate % renderRows(bidiData()))

  echo "Generating category data"
  write(
    "./tests/category_test_data.nim",
    catTemplate % renderRows(categoryData()))

  echo "Generating combining data"
  write(
    "./tests/combining_test_data.nim",
    combiningTemplate % renderRows(combiningData()))

  echo "Generating compositions data"
  write(
    "./tests/compositions_test_data.nim",
    compositionTemplate % renderRows(compositionData()))

  echo "Generating decompositions data"
  write(
    "./tests/decompositions_test_data.nim",
    decompositionTemplate % renderRows(decompositionData()))

  echo "Generating types data"
  write(
    "./tests/types_test_data.nim",
    typesTemplate % renderRows(typesData()))

  echo "Generating casing data"
  # Argument order must match the placeholder order in `casingTemplate`.
  write(
    "./tests/casing_test_data.nim", casingTemplate % [
      renderRows(lowercaseData()),
      renderRows(uppercaseData()),
      renderRows(titlecaseData()),
      renderRows(casefoldData())])

  # This step used to be the only one without a progress message.
  echo "Generating word break data"
  write(
    "./tests/word_break_test_data.nim",
    wordBreakTemplate % renderRows(wordBreakData()))
|