mirror of
https://github.com/waku-org/nwaku.git
synced 2025-02-21 03:18:32 +00:00
275 lines
7.3 KiB
Nim
275 lines
7.3 KiB
Nim
# Resources:
|
|
# http://www.unicode.org/versions/Unicode12.1.0/ch03.pdf
|
|
# 3.13 Default Case Algorithms
|
|
# http://www.unicode.org/versions/Unicode12.1.0/ch04.pdf#M9.27678.Heading.41.Case
|
|
# http://www.unicode.org/versions/Unicode12.1.0/ch05.pdf#G21180
|
|
# https://unicode.org/faq/casemap_charprop.html
|
|
|
|
import strutils
|
|
|
|
import unicode_data
|
|
import derived_data
|
|
import two_stage_table
|
|
import utils
|
|
|
|
type
|
|
Mapping = seq[int]
|
|
Casing = object
|
|
uppercase: Mapping
|
|
lowercase: Mapping
|
|
titlecase: Mapping
|
|
|
|
func parseSimpleCasing(rawData: seq[seq[string]]): seq[Casing] =
|
|
result = newSeq[Casing](rawData.len)
|
|
for cp, data in rawData.pairs:
|
|
if data.len == 0:
|
|
continue
|
|
if data[0].len > 0:
|
|
result[cp].uppercase.add(parseHexInt("0x$#" % data[0]))
|
|
if data[1].len > 0:
|
|
result[cp].lowercase.add(parseHexInt("0x$#" % data[1]))
|
|
if data[2].len > 0:
|
|
result[cp].titlecase.add(parseHexInt("0x$#" % data[2]))
|
|
# titlecase = uppercase if empty (simple mapping only, see tr44)
|
|
if result[cp].titlecase.len == 0:
|
|
result[cp].titlecase = result[cp].uppercase
|
|
|
|
proc parseSimpleCasing(filePath: string): seq[Casing] =
|
|
filePath
|
|
.parseUDCasing
|
|
.parseSimpleCasing
|
|
|
|
proc parseSpecialCasing(filePath: string): seq[Casing] =
|
|
# lower; title; upper; conditional?
|
|
let rawData = filePath.parseUDD
|
|
result = newSeq[Casing](rawData.len)
|
|
var i = 0
|
|
for cp, props in rawData.pairs:
|
|
if props.len == 0:
|
|
continue
|
|
i = 0
|
|
for p in props:
|
|
if p.len > 4: # conditional mapping
|
|
continue
|
|
assert p[3].strip.len == 0 # comment
|
|
for pp in 0 .. 2:
|
|
let casing = p[pp].split(' ')
|
|
let c0 = parseHexInt("0x$#" % casing[0])
|
|
if casing.len == 1 and cp == c0:
|
|
continue
|
|
for ca in casing:
|
|
assert ca.strip.len > 0
|
|
let caCp = parseHexInt("0x$#" % ca)
|
|
if pp == 0:
|
|
result[cp].lowercase.add(caCp)
|
|
elif pp == 1:
|
|
result[cp].titlecase.add(caCp)
|
|
elif pp == 2:
|
|
result[cp].uppercase.add(caCp)
|
|
else:
|
|
assert false
|
|
inc i
|
|
doAssert i <= 1
|
|
|
|
func mergeCasings(simple, special: seq[Casing]): seq[Casing] =
|
|
## overwrite simple casing by special casing
|
|
result = simple
|
|
for cp in 0 .. special.len-1:
|
|
if special[cp].uppercase.len > 0:
|
|
result[cp].uppercase = special[cp].uppercase
|
|
if special[cp].lowercase.len > 0:
|
|
result[cp].lowercase = special[cp].lowercase
|
|
if special[cp].titlecase.len > 0:
|
|
result[cp].titlecase = special[cp].titlecase
|
|
|
|
proc parse(simple, special: string): seq[Casing] =
|
|
let simpleCasings = simple.parseSimpleCasing
|
|
let specialCasings = special.parseSpecialCasing
|
|
result = mergeCasings(simpleCasings, specialCasings)
|
|
|
|
type
|
|
Folding = seq[int]
|
|
|
|
proc parseFolding(filePath: string): seq[Folding] =
|
|
let rawData = filePath.parseUDDFullCaseFolding
|
|
result = newSeq[Folding](rawData.len)
|
|
for cp, props in rawData.pairs:
|
|
if props.len == 0:
|
|
continue
|
|
for c in props[1].strip.split(' '):
|
|
result[cp].add parseHexInt("0x$#" % c)
|
|
|
|
type
|
|
CasingTable = object
|
|
cps: seq[uint32]
|
|
offsets: seq[int]
|
|
|
|
# Build table of values with dynamic len
|
|
func buildCasingTable(casings: seq[Mapping]): CasingTable =
|
|
var cpsSize = 0
|
|
for ca in casings:
|
|
if ca.len > 0:
|
|
cpsSize += len(ca)
|
|
|
|
result = CasingTable(
|
|
cps: newSeq[uint32](cpsSize),
|
|
offsets: newSeq[int](len(casings)))
|
|
for i in 0 ..< len(casings):
|
|
result.offsets[i] = -1
|
|
|
|
var offset = 0
|
|
for cp, ca in casings.pairs:
|
|
if ca.len == 0:
|
|
continue
|
|
result.offsets[cp] = offset
|
|
# Use ca[0] >> 2 to retrieve the first cp
|
|
# And ca[0] & 0x03 to retrieve the length
|
|
assert ca.len <= 0x03
|
|
assert ca[0] == ((ca[0].uint32 shl 2) shr 2).int64
|
|
result.cps[offset] = (ca[0].uint32 shl 2) + ca.len.uint32
|
|
inc offset
|
|
for i in 1 .. ca.len-1:
|
|
result.cps[offset] = ca[i].uint32
|
|
inc offset
|
|
assert offset == result.cps.len
|
|
|
|
type
|
|
MultiStageTable = object
|
|
data: seq[uint32]
|
|
stage1: seq[int]
|
|
stage2: seq[int]
|
|
blockSize: int
|
|
|
|
func buildMultiStageTable(casingTable: CasingTable): MultiStageTable =
|
|
let stageTable = findBestTable(casingTable.offsets)
|
|
assert stageTable.blockSize > 0
|
|
result = MultiStageTable(
|
|
data: casingTable.cps,
|
|
stage1: stageTable.stage1,
|
|
stage2: stageTable.stage2,
|
|
blockSize: stageTable.blockSize)
|
|
|
|
proc buildUpperCase(casings: seq[Casing]): MultiStageTable =
|
|
var uppercase = newSeq[Mapping](casings.len)
|
|
for i in 0 .. casings.len-1:
|
|
uppercase[i] = casings[i].uppercase
|
|
result = uppercase
|
|
.buildCasingTable
|
|
.buildMultiStageTable
|
|
|
|
func buildTitleCase(casings: seq[Casing]): MultiStageTable =
|
|
var titlecase = newSeq[Mapping](casings.len)
|
|
for i in 0 .. casings.len-1:
|
|
titlecase[i] = casings[i].titlecase
|
|
result = titlecase
|
|
.buildCasingTable
|
|
.buildMultiStageTable
|
|
|
|
func buildLowerCase(casings: seq[Casing]): MultiStageTable =
|
|
var lowercase = newSeq[Mapping](casings.len)
|
|
for i in 0 .. casings.len-1:
|
|
lowercase[i] = casings[i].lowercase
|
|
result = lowercase
|
|
.buildCasingTable
|
|
.buildMultiStageTable
|
|
|
|
func buildCaseFolding(foldings: seq[Folding]): MultiStageTable =
|
|
result = foldings
|
|
.buildCasingTable
|
|
.buildMultiStageTable
|
|
|
|
#[
|
|
func buildLowerCase(casings: seq[Casing]): Stages[int] =
|
|
var data = newSeq[int](casings.len)
|
|
for i in 0 .. data.len-1:
|
|
data[i] = -1
|
|
for cp, ca in casings.pairs:
|
|
assert ca.lowercase.len <= 1
|
|
if ca.lowercase.len == 1:
|
|
assert data[cp] != ca.lowercase[0]
|
|
data[cp] = ca.lowercase[0]
|
|
result = buildTwoStageTable(data)
|
|
]#
|
|
|
|
const dataTemplate = """## This is auto-generated. Do not modify it
|
|
|
|
const
|
|
lowercaseOffsets* = [
|
|
$#
|
|
]
|
|
lowercaseIndices* = [
|
|
$#
|
|
]
|
|
lowercaseData* = [
|
|
$#
|
|
]
|
|
lowercaseBlockSize* = $#
|
|
|
|
uppercaseOffsets* = [
|
|
$#
|
|
]
|
|
uppercaseIndices* = [
|
|
$#
|
|
]
|
|
uppercaseData* = [
|
|
$#
|
|
]
|
|
uppercaseBlockSize* = $#
|
|
|
|
titlecaseOffsets* = [
|
|
$#
|
|
]
|
|
titlecaseIndices* = [
|
|
$#
|
|
]
|
|
titlecaseData* = [
|
|
$#
|
|
]
|
|
titlecaseBlockSize* = $#
|
|
|
|
casefoldOffsets* = [
|
|
$#
|
|
]
|
|
casefoldIndices* = [
|
|
$#
|
|
]
|
|
casefoldData* = [
|
|
$#
|
|
]
|
|
casefoldBlockSize* = $#
|
|
"""
|
|
|
|
when isMainModule:
|
|
let casings = parse(
|
|
"./gen/UCD/UnicodeData.txt",
|
|
"./gen/UCD/SpecialCasing.txt")
|
|
let lowerTable = casings.buildLowerCase
|
|
let upperTable = casings.buildUpperCase
|
|
let titleTable = casings.buildTitleCase
|
|
|
|
let foldings = parseFolding("./gen/UCD/CaseFolding.txt")
|
|
let foldingTable = foldings.buildCaseFolding
|
|
|
|
var f = open("./src/unicodedb/casing_data.nim", fmWrite)
|
|
try:
|
|
f.write(dataTemplate % [
|
|
prettyTable(lowerTable.stage1, 15, "'i8"),
|
|
prettyTable(lowerTable.stage2, 15, "'i16"),
|
|
prettyTable(lowerTable.data, 15, "'u32"),
|
|
$lowerTable.blockSize,
|
|
prettyTable(upperTable.stage1, 15, "'i8"),
|
|
prettyTable(upperTable.stage2, 15, "'i16"),
|
|
prettyTable(upperTable.data, 15, "'u32"),
|
|
$upperTable.blockSize,
|
|
prettyTable(titleTable.stage1, 15, "'i8"),
|
|
prettyTable(titleTable.stage2, 15, "'i16"),
|
|
prettyTable(titleTable.data, 15, "'u32"),
|
|
$titleTable.blockSize,
|
|
prettyTable(foldingTable.stage1, 15, "'i8"),
|
|
prettyTable(foldingTable.stage2, 15, "'i16"),
|
|
prettyTable(foldingTable.data, 15, "'u32"),
|
|
$foldingTable.blockSize
|
|
])
|
|
finally:
|
|
close(f)
|