NimYAML/private/fastparse.nim

1551 lines
48 KiB
Nim

type
  # States of the parser's main loop.
  FastParseState = enum
    fpInitial,
    fpBlockLineStart,
    fpBlockAfterObject,
    fpBlockAfterPlainScalar,
    fpBlockObjectStart,
    fpBlockContinueScalar,
    fpExpectDocEnd,
    fpFlow,
    fpFlowAfterObject

  # What kind of node a nesting level currently holds.
  FastParseLevelKind = enum
    fplUnknown,
    fplSequence,
    fplMapKey,
    fplMapValue,
    fplScalar

  # One nesting level: its kind plus its indentation (-1 is used by the
  # parser for "not determined yet").
  FastParseLevel = object
    kind: FastParseLevelKind
    indentation: int

  # Result of lexing a directive name.
  LexedDirective = enum
    ldYaml,
    ldTag,
    ldUnknown

  # Result of inspecting what follows a leading '-'.
  LexedPossibleDirectivesEnd = enum
    lpdeDirectivesEnd,
    lpdeSequenceItem,
    lpdeScalarContent

  # Context a plain scalar is lexed in.
  YamlContext = enum
    cFlowIn,
    cFlowOut,
    cFlowKey,
    cBlockKey,
    cBlockIn,
    cBlockOut

  # Lexer that additionally records where the current token started.
  FastLexer = object of BaseLexer
    tokenstart: int
const
  # Blank characters inside a line.
  space = [' ', '\t']
  # Characters that terminate a line (line feed, carriage return, end of
  # input).
  lineEnd = ['\x0A', '\c', EndOfFile]
  # Union of the two character classes above.
  spaceOrLineEnd = [' ', '\t', '\x0A', '\c', EndOfFile]
  # Decimal digits.
  digits = '0'..'9'
  # Characters with structural meaning inside flow collections.
  flowIndicators = ['[', ']', '{', '}', ',']
template debug(message: string) {.dirty.} =
  ## Writes `message` in blue to stdout, but only when compiled with
  ## `-d:yamlDebug`; otherwise expands to nothing. I/O errors are ignored.
  when defined(yamlDebug):
    try:
      styledWriteLine(stdout, fgBlue, message)
    except IOError:
      discard
template raiseError(message: string) {.dirty.} =
  ## Raises a `YamlParserError` with `message`, using `lexer.tokenstart` as
  ## the reported column.
  var e = newException(YamlParserError, message)
  e.column = lexer.tokenstart
  e.line = lexer.lineNumber
  # Current line, followed by a marker line with '^' under the position the
  # lexer has reached.
  e.lineContent = lexer.getCurrentLine(false)
  e.lineContent.add(repeat(' ', lexer.getColNumber(lexer.bufpos)))
  e.lineContent.add("^\n")
  raise e
template raiseError(message: string, col: int) {.dirty.} =
  ## Raises a `YamlParserError` with `message`, reporting `col` as the
  ## error column.
  var e = newException(YamlParserError, message)
  e.column = col
  e.line = lexer.lineNumber
  # Current line, followed by a marker line with '^' under the position the
  # lexer has reached.
  e.lineContent = lexer.getCurrentLine(false)
  e.lineContent.add(repeat(' ', lexer.getColNumber(lexer.bufpos)))
  e.lineContent.add("^\n")
  raise e
template yieldLevelEnd() {.dirty.} =
  ## Yields the event(s) that finish the current `level`.
  case level.kind
  of fplSequence:
    yield endSeqEvent()
  of fplMapKey:
    yield endMapEvent()
  of fplMapValue, fplScalar, fplUnknown:
    # A pending scalar yields its content; a missing map value or an
    # undetermined node yields an empty scalar instead.
    yield scalarEvent(if level.kind == fplScalar: content else: "",
                      tag, anchor)
    tag = yTagQuestionMark
    anchor = yAnchorNone
    if level.kind == fplMapValue:
      yield endMapEvent()
# Handles the character at the lexer's current position as a line terminator.
# `insideDocument` is evaluated at compile time (`when`); if it is true,
# closeEverything() is expanded for the end-of-input case.
template handleLineEnd(insideDocument: bool) {.dirty.} =
case lexer.buf[lexer.bufpos]
of '\x0A':
# Line feed: continue at the position handleLF returns.
lexer.bufpos = lexer.handleLF(lexer.bufpos)
of '\c':
# Carriage return: continue at the position handleCR returns.
lexer.bufpos = lexer.handleCR(lexer.bufpos)
of EndOfFile:
when insideDocument:
closeEverything()
# Leaves the enclosing routine (this template is expanded in place).
return
else:
# Any other character is left untouched.
discard
# Called after a node has been completed. With no ancestors left the parser
# switches to fpExpectDocEnd; otherwise the parent level is popped from
# `ancestry` and the parser continues in `nextState`.
template handleObjectEnd(nextState: FastParseState) {.dirty.} =
if ancestry.len == 0:
state = fpExpectDocEnd
else:
level = ancestry.pop()
state = nextState
# Node properties are reset to their defaults.
tag = yTagQuestionMark
anchor = yAnchorNone
case level.kind
of fplMapKey:
# A key was just completed, so the map now expects a value ...
level.kind = fplMapValue
of fplMapValue:
# ... and after a value it expects the next key.
level.kind = fplMapKey
of fplSequence:
discard
of fplUnknown, fplScalar:
# These kinds are not expected for a popped parent level.
assert(false)
template handleObjectStart(k: YamlStreamEventKind) {.dirty.} =
  ## Turns the current (still unknown) level into a map or a sequence,
  ## yields the matching start event and opens a fresh unknown child level.
  ## `k` is checked at compile time: `yamlStartMap` starts a map, anything
  ## else a sequence.
  assert(level.kind == fplUnknown)
  # A level without indentation inherits the current one.
  if level.indentation == -1:
    level.indentation = indentation
  when k == yamlStartMap:
    yield startMapEvent(tag, anchor)
    debug("started map at " & $level.indentation)
    level.kind = fplMapKey
  else:
    yield startSeqEvent(tag, anchor)
    debug("started sequence at " & $level.indentation)
    level.kind = fplSequence
  # The properties now belong to the collection that was just started.
  anchor = yAnchorNone
  tag = yTagQuestionmark
  ancestry.add(level)
  level = FastParseLevel(kind: fplUnknown, indentation: -1)
template closeMoreIndentedLevels() {.dirty.} =
  ## Ends levels for as long as the innermost ancestor is indented at least
  ## as far as the current `indentation`.
  while ancestry.len > 0 and
      ancestry[ancestry.high].indentation >= indentation:
    debug("Closing because parent.indentation (" &
          $ancestry[ancestry.high].indentation &
          ") >= indentation(" & $indentation & ")")
    yieldLevelEnd()
    handleObjectEnd(fpBlockAfterObject)
template closeEverything() {.dirty.} =
  ## Ends every open level and then the document itself.
  # With indentation 0 every ancestor with indentation >= 0 gets closed.
  indentation = 0
  closeMoreIndentedLevels()
  # The level that remains current is finished as well.
  yieldLevelEnd()
  yield endDocEvent()
template handleBlockSequenceIndicator() {.dirty.} =
  ## Handles a block sequence indicator: starts a new sequence or opens the
  ## next item of the current one, then skips whitespace and records the
  ## column reached as the new `indentation`.
  if level.kind == fplUnknown:
    handleObjectStart(yamlStartSequence)
  elif level.kind == fplSequence:
    if level.indentation != indentation:
      raiseError("Invalid indentation of block sequence indicator",
                 lexer.bufpos)
    # Next item: keep the sequence as ancestor, continue in a fresh level.
    ancestry.add(level)
    level = FastParseLevel(kind: fplUnknown, indentation: -1)
  else:
    raiseError("Illegal sequence item in map")
  lexer.skipWhitespace()
  indentation = lexer.getColNumber(lexer.bufpos)
template handleMapKeyIndicator() {.dirty.} =
  ## Handles an explicit map key indicator: starts a new map or opens the
  ## next key of the current one, then skips whitespace and records the
  ## column reached as the new `indentation`.
  case level.kind
  of fplUnknown:
    handleObjectStart(yamlStartMap)
  of fplMapKey, fplMapValue:
    if level.indentation != indentation:
      raiseError("Invalid indentation of map key indicator",
                 lexer.bufpos)
    if level.kind == fplMapValue:
      # The previous key never got a value; yield an empty one.
      yield scalarEvent("", yTagQuestionmark, yAnchorNone)
      level.kind = fplMapKey
    ancestry.add(level)
    level = FastParseLevel(kind: fplUnknown, indentation: -1)
  of fplSequence:
    raiseError("Unexpected map key indicator (expected '- ')")
  of fplScalar:
    raiseError("Unexpected map key indicator (expected multiline scalar end)")
  lexer.skipWhitespace()
  indentation = lexer.getColNumber(lexer.bufpos)
template handleMapValueIndicator() {.dirty.} =
  ## Handles a map value indicator: starts a new map with an empty key,
  ## supplies an empty key where none was given, or opens the value of the
  ## current key. Afterwards whitespace is skipped and the column reached
  ## becomes the new `indentation`.
  ##
  ## Raises `YamlParserError` if the indicator is wrongly indented or appears
  ## inside a sequence or a multiline scalar.
  case level.kind
  of fplUnknown:
    if level.indentation == -1:
      # No node started here yet: this starts a map whose first key is empty.
      handleObjectStart(yamlStartMap)
      yield scalarEvent("", yTagQuestionmark, yAnchorNone)
    else:
      # Pending properties belong to the (empty) key.
      yield scalarEvent("", tag, anchor)
    tag = yTagQuestionmark
    anchor = yAnchorNone
    # The key has been emitted directly, so the parent now awaits a value.
    ancestry[ancestry.high].kind = fplMapValue
  of fplMapKey:
    if level.indentation != indentation:
      raiseError("Invalid indentation of map value indicator",
                 lexer.bufpos)
    # No key was given for this value; yield an empty one.
    yield scalarEvent("", yTagQuestionmark, yAnchorNone)
    level.kind = fplMapValue
    ancestry.add(level)
    level = FastParseLevel(kind: fplUnknown, indentation: -1)
  of fplMapValue:
    if level.indentation != indentation:
      raiseError("Invalid indentation of map value indicator",
                 lexer.bufpos)
    ancestry.add(level)
    level = FastParseLevel(kind: fplUnknown, indentation: -1)
  of fplSequence:
    raiseError("Unexpected map value indicator (expected '- ')")
  of fplScalar:
    raiseError("Unexpected map value indicator (expected multiline scalar end)")
  lexer.skipWhitespace()
  indentation = lexer.getColNumber(lexer.bufpos)
template initDocValues() {.dirty.} =
  ## Resets all per-document parser state.
  level = FastParseLevel(kind: fplUnknown, indentation: -1)
  anchor = yAnchorNone
  tag = yTagQuestionmark
  # Anchor ids are numbered from zero again in every document.
  nextAnchorId = 0.AnchorId
  anchors = initTable[string, AnchorId]()
  # Only the two predefined tag shorthands are known initially.
  shorthands = initTable[string, string]()
  shorthands["!!"] = "tag:yaml.org,2002:"
  shorthands["!"] = "!"
template handleTagHandle() {.dirty.} =
  ## Lexes a tag handle, expands a shorthand to its full URI and stores the
  ## resulting tag id in `tag`, registering the URI with `tagLib` when it is
  ## not known yet.
  if level.kind != fplUnknown:
    raiseError("Unexpected token", lexer.bufpos)
  if tag != yTagQuestionmark:
    raiseError("Only one tag handle is allowed per node")
  var
    shorthandEnd: int
    tagUri: string
  content = ""
  lexer.tagHandle(content, shorthandEnd)
  if shorthandEnd == -1:
    # No shorthand to expand: the lexed content is used as the URI.
    shallowCopy(tagUri, content)
  else:
    try:
      # content[0..shorthandEnd] is the shorthand, the remainder the suffix.
      tagUri = shorthands[content[0..shorthandEnd]] &
               content[shorthandEnd + 1 .. ^1]
    except KeyError:
      raiseError("Undefined tag shorthand: " & content[0..shorthandEnd])
  try:
    tag = tagLib.tags[tagUri]
  except KeyError:
    tag = tagLib.registerUri(tagUri)
template handleAnchor() {.dirty.} =
  ## Lexes an anchor name, assigns it the next free anchor id and remembers
  ## that id in `anchor` for the node that follows.
  if level.kind != fplUnknown:
    raiseError("Unexpected token", lexer.bufpos)
  if anchor != yAnchorNone:
    raiseError("Only one anchor is allowed per node", lexer.bufpos)
  content = ""
  lexer.anchorName(content)
  anchors[content] = nextAnchorId
  anchor = nextAnchorId
  # Advance the id counter for the next anchor.
  nextAnchorId = cast[AnchorId](cast[int](nextAnchorId) + 1)
template handleAlias() {.dirty.} =
  ## Lexes an alias name, resolves it against the anchors seen so far and
  ## yields the matching alias event.
  if level.kind != fplUnknown:
    raiseError("Unexpected token", lexer.bufpos)
  if tag != yTagQuestionmark or anchor != yAnchorNone:
    raiseError("Alias may not have anchor or tag")
  content = ""
  lexer.anchorName(content)
  var id: AnchorId
  try:
    id = anchors[content]
  except KeyError:
    # The alias names an anchor that has not been defined.
    raiseError("Unknown anchor")
  yield aliasEvent(id)
  handleObjectEnd(fpBlockAfterObject)
template leaveFlowLevel() {.dirty.} =
  ## Closes the current flow collection; once the outermost one is closed
  ## the parser continues in block mode.
  dec(flowdepth)
  yieldLevelEnd()
  if flowdepth == 0:
    handleObjectEnd(fpBlockAfterObject)
  else:
    handleObjectEnd(fpFlowAfterObject)
template handlePossibleMapStart() {.dirty.} =
  ## For a level whose indentation is still undetermined, looks ahead on the
  ## current line (at most 1024 characters) for a ':' followed by whitespace
  ## or a line end outside of flow brackets; if found, a map is started.
  ## A level that still has no indentation afterwards gets the current one.
  if level.indentation == -1:
    var nesting = 0  # depth of flow brackets seen during the look-ahead
    for pos in lexer.bufpos .. lexer.bufpos + 1024:
      let ch = lexer.buf[pos]
      if ch == ':':
        if nesting == 0 and lexer.buf[pos + 1] in spaceOrLineEnd:
          handleObjectStart(yamlStartMap)
          break
      elif ch in lineEnd:
        break
      elif ch in {'[', '{'}:
        inc(nesting)
      elif ch in {'}', ']'}:
        dec(nesting)
      elif ch == '?':
        if nesting == 0: break
      elif ch == '#':
        # Only a '#' preceded by a blank ends the look-ahead.
        if lexer.buf[pos - 1] in space:
          break
    if level.indentation == -1:
      level.indentation = indentation
template handleBlockItemStart() {.dirty.} =
  ## Prepares the level structure for a node that starts at the current
  ## position in block context.
  case level.kind
  of fplUnknown:
    handlePossibleMapStart()
  of fplSequence:
    raiseError("Unexpected token (expected block sequence indicator)",
               lexer.bufpos)
  of fplMapKey, fplMapValue:
    if level.kind == fplMapValue:
      # A new item starts while a value was pending: yield an empty value
      # and go back to expecting a key.
      yield scalarEvent("", tag, anchor)
      tag = yTagQuestionmark
      anchor = yAnchorNone
      level.kind = fplMapKey
    ancestry.add(level)
    level = FastParseLevel(kind: fplUnknown, indentation: indentation)
  of fplScalar:
    assert(false)
template finishLine(lexer: FastLexer) =
  ## Advances the lexer to the next line end character without consuming it.
  debug("lex: finishLine")
  while lexer.buf[lexer.bufpos] notin lineEnd:
    inc(lexer.bufpos)
template skipWhitespace(lexer: FastLexer) =
  ## Advances the lexer past any spaces and tabs.
  debug("lex: skipWhitespace")
  while lexer.buf[lexer.bufpos] in space:
    inc(lexer.bufpos)
template skipWhitespaceAndNewlines(lexer: FastLexer) =
  ## Advances the lexer past spaces, tabs and line breaks.
  debug("lex: skipWhitespaceAndNewLines")
  while true:
    case lexer.buf[lexer.bufpos]
    of '\c':
      lexer.bufpos = lexer.handleCR(lexer.bufpos)
    of '\x0A':
      lexer.bufpos = lexer.handleLF(lexer.bufpos)
    of space:
      inc(lexer.bufpos)
    else:
      break
template skipIndentation(lexer: FastLexer) =
  ## Advances the lexer past leading spaces (tabs are not skipped).
  debug("lex: skipIndentation")
  while lexer.buf[lexer.bufpos] == ' ':
    inc(lexer.bufpos)
template directiveName(lexer: FastLexer, directive: var LexedDirective) =
  ## Lexes the name that follows the character at the current position and
  ## classifies it: `ldYaml` for "YAML", `ldTag` for "TAG", `ldUnknown`
  ## otherwise. The lexer ends up on the first blank or line end character
  ## after the name.
  debug("lex: directiveName")
  directive = ldUnknown
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  lexer.bufpos.inc()
  # The first character selects which known name to compare against.
  let expected =
    case lexer.buf[lexer.bufpos]
    of 'Y': "YAML"
    of 'T': "TAG"
    else: ""
  var matched = 0
  while matched < expected.len and
      lexer.buf[lexer.bufpos] == expected[matched]:
    lexer.bufpos.inc()
    matched.inc()
  # A known name only counts when it is matched in full and ends there.
  if expected.len > 0 and matched == expected.len and
      lexer.buf[lexer.bufpos] in spaceOrLineEnd:
    directive = (if expected == "YAML": ldYaml else: ldTag)
  # Skip whatever remains of the name.
  while lexer.buf[lexer.bufpos] notin spaceOrLineEnd:
    lexer.bufpos.inc()
template yamlVersion(lexer: FastLexer, o: var string) =
  ## Lexes a version number of the form `<digits>.<digits>` and appends the
  ## complete text (major part, dot and minor part) to `o`. Leading blanks
  ## are skipped; the lexer ends up on the character after the version.
  ##
  ## Raises `YamlParserError` if the text is not a well-formed version
  ## number or is not followed by a blank or a line end.
  debug("lex: yamlVersion")
  while lexer.buf[lexer.bufpos] in space:
    lexer.bufpos.inc()
  var c = lexer.buf[lexer.bufpos]
  if c notin digits:
    raiseError("Invalid YAML version number")
  o.add(c)
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  lexer.bufpos.inc()
  c = lexer.buf[lexer.bufpos]
  while c in digits:
    lexer.bufpos.inc()
    o.add(c)
    c = lexer.buf[lexer.bufpos]
  if c != '.':
    raiseError("Invalid YAML version number")
  # The dot and the minor part belong to the version as well; without them
  # a caller comparing against e.g. "1.2" could never get a match.
  o.add('.')
  lexer.bufpos.inc()
  c = lexer.buf[lexer.bufpos]
  if c notin digits:
    raiseError("Invalid YAML version number")
  while c in digits:
    o.add(c)
    lexer.bufpos.inc()
    c = lexer.buf[lexer.bufpos]
  if c notin spaceOrLineEnd:
    raiseError("Invalid YAML version number")
template lineEnding(lexer: FastLexer) =
  ## Advances the lexer to the line end character, allowing only blanks and
  ## an optional '#' comment in between. Anything else raises
  ## `YamlParserError`.
  debug("lex: lineEnding")
  if lexer.buf[lexer.bufpos] notin lineEnd:
    while lexer.buf[lexer.bufpos] in space:
      inc(lexer.bufpos)
    if lexer.buf[lexer.bufpos] == '#':
      # A comment runs up to the line end.
      while lexer.buf[lexer.bufpos] notin lineEnd:
        inc(lexer.bufpos)
    elif lexer.buf[lexer.bufpos] notin lineEnd:
      raiseError("Unexpected token (expected comment or line end)",
                 lexer.bufpos)
template tagShorthand(lexer: FastLexer, shorthand: var string) =
  ## Lexes a tag shorthand (`!`, `!!` or `!name!`) after optional blanks and
  ## appends it to `shorthand`.
  debug("lex: tagShorthand")
  while lexer.buf[lexer.bufpos] in space:
    inc(lexer.bufpos)
  if lexer.buf[lexer.bufpos] != '!':
    raiseError("Invalid tag shorthand")
  shorthand.add('!')
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  inc(lexer.bufpos)
  var ch = lexer.buf[lexer.bufpos]
  # A lone '!' followed by a blank or line end is already complete.
  if ch notin spaceOrLineEnd:
    while ch != '!':
      if ch in {'a' .. 'z', 'A' .. 'Z', '0' .. '9', '-'}:
        shorthand.add(ch)
        inc(lexer.bufpos)
        ch = lexer.buf[lexer.bufpos]
      else:
        raiseError("Illegal character in tag shorthand", lexer.bufpos)
    # `ch` is the closing '!' at this point.
    shorthand.add(ch)
    inc(lexer.bufpos)
    if lexer.buf[lexer.bufpos] notin spaceOrLineEnd:
      raiseError("Missing space after tag shorthand", lexer.bufpos)
template tagUri(lexer: FastLexer, uri: var string) =
  ## Lexes a tag URI after optional blanks and appends it to `uri`; the URI
  ## ends at the first blank or line end character. A character outside the
  ## accepted set raises `YamlParserError`.
  debug("lex: tagUri")
  while lexer.buf[lexer.bufpos] in space:
    inc(lexer.bufpos)
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  var ch = lexer.buf[lexer.bufpos]
  while ch notin spaceOrLineEnd:
    if ch in {'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':',
              '@', '&', '-', '=', '+', '$', ',', '_', '.', '~', '*', '\'',
              '(', ')'}:
      uri.add(ch)
      inc(lexer.bufpos)
      ch = lexer.buf[lexer.bufpos]
    else:
      raiseError("Invalid tag uri")
template directivesEnd(lexer: FastLexer,
                       token: var LexedPossibleDirectivesEnd) =
  ## Classifies what follows the character at the current position without
  ## consuming anything: two '-' and then a blank or line end give
  ## `lpdeDirectivesEnd`, a blank or line end right away gives
  ## `lpdeSequenceItem`, everything else `lpdeScalarContent`.
  debug("lex: directivesEnd")
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  let next = lexer.bufpos + 1
  token =
    if lexer.buf[next] in spaceOrLineEnd:
      lpdeSequenceItem
    elif lexer.buf[next] == '-' and lexer.buf[next + 1] == '-' and
        lexer.buf[next + 2] in spaceOrLineEnd:
      lpdeDirectivesEnd
    else:
      lpdeScalarContent
template documentEnd(lexer: var FastLexer, isDocumentEnd: var bool) =
  ## Sets `isDocumentEnd` to whether the character at the current position is
  ## followed by two '.' and then a blank or line end. Nothing is consumed.
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  let next = lexer.bufpos + 1
  # `and` short-circuits, so later characters are only inspected when the
  # earlier ones matched.
  isDocumentEnd = lexer.buf[next] == '.' and lexer.buf[next + 1] == '.' and
                  lexer.buf[next + 2] in spaceOrLineEnd
template singleQuotedScalar(lexer: FastLexer, content: var string) =
  ## Lexes a single quoted scalar starting at the opening quote and appends
  ## its text to `content`; a doubled quote stands for one literal quote.
  ## The lexer ends up on the character after the closing quote. Raises
  ## `YamlParserError` when the input ends before the closing quote.
  debug("lex: singleQuotedScalar")
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  lexer.bufpos.inc()
  while true:
    if lexer.buf[lexer.bufpos] == EndOfFile:
      raiseError("Unfinished single quoted string")
    elif lexer.buf[lexer.bufpos] == '\'':
      lexer.bufpos.inc()
      # A quote not followed by another quote closes the scalar.
      if lexer.buf[lexer.bufpos] != '\'':
        break
    # Either an ordinary character or the second quote of an escaped pair.
    content.add(lexer.buf[lexer.bufpos])
    lexer.bufpos.inc()
proc unicodeSequence(lexer: var FastLexer, length: int):
      string {.raises: [YamlParserError].} =
  ## Reads `length` hexadecimal digits following the current position and
  ## returns the UTF-8 encoding of the code point they denote. The lexer
  ## ends up on the last digit. Raises `YamlParserError` on end of input or
  ## on a non-hexadecimal character.
  debug("lex: unicodeSequence")
  var unicodeChar = 0.Rune
  let start = lexer.bufpos - 1
  # Digits arrive most significant first.
  for digitPosition in countdown(length - 1, 0):
    lexer.bufpos.inc()
    let c = lexer.buf[lexer.bufpos]
    var nibble: int
    case c
    of EndOfFile:
      raiseError("Unfinished unicode escape sequence", start)
    of '0' .. '9':
      nibble = cast[int](c) - 0x30
    of 'A' .. 'F':
      nibble = cast[int](c) - 0x37
    of 'a' .. 'f':
      nibble = cast[int](c) - 0x57
    else:
      raiseError("Invalid character in unicode escape sequence", lexer.bufpos)
    unicodeChar = unicodeChar or (nibble shl (digitPosition * 4))
  result = toUTF8(unicodeChar)
template processDoubleQuotedWhitespace(newlines: var int) {.dirty.} =
  ## Processes a run of whitespace inside a double quoted scalar, starting at
  ## the lexer's current position.
  ##
  ## Blanks followed by other content are appended to `content` unchanged.
  ## If the run contains a line break, the blanks around it are dropped and
  ## the break is folded: with `newlines` counting the line breaks (the
  ## caller passes the count to start from), a count of 1 appends a single
  ## space, a higher count appends `newlines - 1` line feeds and a count of
  ## 0 appends nothing. The lexer ends up on the first character that is
  ## neither a blank nor a line break.
  var
    after = ""
  block outer:
    # Phase 1: collect blanks up to the first line break.
    while true:
      case lexer.buf[lexer.bufpos]
      of ' ', '\t':
        after.add(lexer.buf[lexer.bufpos])
      of '\x0A':
        lexer.bufpos = lexer.handleLF(lexer.bufpos)
        break
      of '\c':
        # A carriage return has to go through handleCR, not handleLF.
        lexer.bufpos = lexer.handleCR(lexer.bufpos)
        break
      else:
        # No line break followed, so the blanks are literal content.
        content.add(after)
        break outer
      lexer.bufpos.inc()
    # Phase 2: skip blanks and count any further line breaks.
    while true:
      case lexer.buf[lexer.bufpos]
      of ' ', '\t':
        discard
      of '\x0A':
        lexer.bufpos = lexer.handleLF(lexer.bufpos)
        newlines.inc()
        # handleLF already returned the position to continue from; advancing
        # once more would swallow the first character of the next line.
        continue
      of '\c':
        lexer.bufpos = lexer.handleCR(lexer.bufpos)
        newlines.inc()
        continue
      else:
        if newlines == 0:
          discard
        elif newlines == 1:
          content.add(' ')
        else:
          content.add(repeat('\x0A', newlines - 1))
        break
      lexer.bufpos.inc()
template doubleQuotedScalar(lexer: FastLexer, content: var string) =
  ## Lexes a double quoted scalar starting at the opening quote and appends
  ## its text, with escape sequences resolved, to `content`. The lexer ends
  ## up on the character after the closing quote. Raises `YamlParserError`
  ## on unterminated input or an unknown escape character.
  debug("lex: doubleQuotedScalar")
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  inc(lexer.bufpos)
  while true:
    var ch = lexer.buf[lexer.bufpos]
    case ch
    of '"':
      inc(lexer.bufpos)
      break
    of EndOfFile:
      raiseError("Unfinished double quoted string")
    of '\x0A', '\c', '\t', ' ':
      # Unescaped whitespace, possibly containing a line break.
      var newlines = 1
      processDoubleQuotedWhitespace(newlines)
      continue
    of '\\':
      inc(lexer.bufpos)
      case lexer.buf[lexer.bufpos]
      of EndOfFile:
        raiseError("Unfinished escape sequence")
      of '0': content.add('\0')
      of 'a': content.add('\x07')
      of 'b': content.add('\x08')
      of '\t', 't': content.add('\t')
      of 'n': content.add('\x0A')
      of 'v': content.add('\v')
      of 'f': content.add('\f')
      of 'r': content.add('\r')
      of 'e': content.add('\e')
      of ' ': content.add(' ')
      of '"': content.add('"')
      of '/': content.add('/')
      of '\\': content.add('\\')
      of 'N': content.add(UTF8NextLine)
      of '_': content.add(UTF8NonBreakingSpace)
      of 'L': content.add(UTF8LineSeparator)
      of 'P': content.add(UTF8ParagraphSeparator)
      of 'x': content.add(lexer.unicodeSequence(2))
      of 'u': content.add(lexer.unicodeSequence(4))
      of 'U': content.add(lexer.unicodeSequence(8))
      of '\x0A', '\c':
        # Escaped line break: starts with a line break count of zero.
        var newlines = 0
        processDoubleQuotedWhitespace(newlines)
        continue
      else:
        raiseError("Illegal character in escape sequence")
    else:
      content.add(ch)
    inc(lexer.bufpos)
proc isPlainSafe(lexer: FastLexer, index: int, context: YamlContext): bool =
  ## Tells whether the character at buffer position `index` allows a plain
  ## scalar to continue in `context`.
  ##
  ## Blanks and line ends never do; flow indicators only do in the block
  ## contexts; every other character does.
  # Inspect the position the caller asked about rather than a position
  # derived from `lexer.bufpos`.
  case lexer.buf[index]
  of spaceOrLineEnd:
    result = false
  of flowIndicators:
    result = context in [cBlockIn, cBlockOut, cBlockKey]
  else:
    result = true
template plainScalar(lexer: FastLexer, content: var string,
                     context: YamlContext) =
  ## Lexes one line of a plain scalar. The character at the current position
  ## is appended to `content` unconditionally; following characters are
  ## appended until a line end, a '#' or an unsafe ':' is met, or a flow
  ## indicator is met in a non-block `context`. Blanks are only appended
  ## when more scalar content follows them on the same line.
  debug("lex: plainScalar")
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  content.add(lexer.buf[lexer.bufpos])
  block outer:
    while true:
      lexer.bufpos.inc()
      let c = lexer.buf[lexer.bufpos]
      case c
      of lineEnd:
        break
      of ' ', '\t':
        # Collect the blanks; they are kept only if content follows.
        var after = "" & c
        while true:
          lexer.bufpos.inc()
          let c2 = lexer.buf[lexer.bufpos]
          case c2
          of ' ', '\t':
            after.add(c2)
          of lineEnd:
            break outer
          of ':':
            if lexer.isPlainSafe(lexer.bufpos + 1, context):
              content.add(after & ':')
              # The blanks have been emitted; leave the whitespace loop so
              # they are not emitted again with the next character.
              break
            else:
              break outer
          of '#':
            break outer
          else:
            content.add(after)
            content.add(c2)
            break
      of flowIndicators:
        if context in [cBlockOut, cBlockIn, cBlockKey]:
          content.add(c)
        else:
          break
      of ':':
        if lexer.isPlainSafe(lexer.bufpos + 1, context):
          content.add(':')
        else:
          break outer
      of '#':
        break outer
      else:
        content.add(c)
template continueMultilineScalar() {.dirty.} =
  ## Appends the folded line break(s) counted in `newlines` and the next
  ## line of a multiline plain scalar to `content`.
  if newlines == 1:
    # A single line break folds into one space.
    content.add(" ")
  else:
    content.add(repeat('\x0A', newlines - 1))
  lexer.plainScalar(content, cBlockOut)
  state = fpBlockAfterPlainScalar
template handleFlowPlainScalar() {.dirty.} =
  ## Lexes a (possibly multiline) plain scalar in flow context, yields it as
  ## a scalar event and finishes the node.
  ##
  ## Line breaks between the lines are folded: one break becomes a space,
  ## `n > 1` breaks become `n - 1` line feeds, as in the other folding code
  ## of this file.
  content = ""
  lexer.plainScalar(content, cFlowOut)
  if lexer.buf[lexer.bufpos] in ['{', '}', '[', ']', ',', ':', '#']:
    discard
  else:
    # The first line ended without a terminating indicator; look for
    # continuation lines.
    var newlines = 0
    while true:
      case lexer.buf[lexer.bufpos]
      of ':':
        if lexer.isPlainSafe(lexer.bufpos + 1, cFlowOut):
          if newlines == 1:
            content.add(' ')
            newlines = 0
          elif newlines > 1:
            content.add(repeat('\x0A', newlines - 1))
            newlines = 0
          lexer.plainScalar(content, cFlowOut)
        elif explicitFlowKey:
          break
        else:
          raiseError("Multiline scalar is not allowed as implicit key")
      of '#', EndOfFile:
        break
      of '\x0A':
        lexer.bufpos = lexer.handleLF(lexer.bufpos)
        newlines.inc()
      of '\c':
        lexer.bufpos = lexer.handleCR(lexer.bufpos)
        newlines.inc()
      of flowIndicators:
        break
      of ' ', '\t':
        lexer.skipWhitespace()
      else:
        if newlines == 1:
          content.add(' ')
          newlines = 0
        elif newlines > 1:
          content.add(repeat('\x0A', newlines - 1))
          newlines = 0
        lexer.plainScalar(content, cFlowOut)
  yield scalarEvent(content, tag, anchor)
  handleObjectEnd(fpFlowAfterObject)
template ensureCorrectIndentation() {.dirty.} =
  ## Raises `YamlParserError` unless the current `indentation` equals the
  ## indentation of the current level.
  if indentation != level.indentation:
    raiseError(
      "Invalid indentation (expected indentation for " & $level.kind &
        " :" & $level.indentation & ")",
      lexer.bufpos)
# Lexes a tag handle that starts at the lexer's current position and appends
# its text to `content`. `shorthandEnd` starts at 0; it is set to -1 when a
# '<' directly follows the first character, and to the running index `i` in
# the '!' branch.
template tagHandle(lexer: var FastLexer, content: var string,
shorthandEnd: var int) =
debug("lex: tagHandle")
shorthandEnd = 0
lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
# The character at the current position is taken over unchecked.
content.add(lexer.buf[lexer.bufpos])
# `i` counts the characters consumed after that first one.
var i = 0
while true:
lexer.bufpos.inc()
i.inc()
let c = lexer.buf[lexer.bufpos]
case c
of spaceOrLineEnd:
# A blank or line end finishes the handle; -1 means a '<' is still open.
if shorthandEnd == -1:
raiseError("Unclosed verbatim tag")
break
of '!':
if shorthandEnd == -1 and i == 2:
content.add(c)
elif shorthandEnd != 0:
raiseError("Illegal character in tag suffix", lexer.bufpos)
shorthandEnd = i
content.add(c)
of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':', '@', '&',
'-', '=', '+', '$', ',', '_', '.', '~', '*', '\'', '(', ')':
content.add(c)
of '<':
# Only accepted directly after the first character; the content collected
# so far is discarded.
if i == 1:
shorthandEnd = -1
content = ""
else:
raiseError("Illegal character in tag handle", lexer.bufpos)
of '>':
# Only accepted after a '<' was seen, and must be followed by a blank or
# line end.
if shorthandEnd == -1:
lexer.bufpos.inc()
if lexer.buf[lexer.bufpos] notin spaceOrLineEnd:
raiseError("Missing space after verbatim tag handle", lexer.bufpos)
break
else:
raiseError("Illegal character in tag handle", lexer.bufpos)
else:
raiseError("Illegal character in tag handle", lexer.bufpos)
template anchorName(lexer: FastLexer, content: var string) =
  ## Lexes the name following the character at the current position and
  ## appends it to `content`. The name ends at a blank, a line end or a flow
  ## indicator, on which the lexer is left.
  debug("lex: anchorName")
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  # Step over the indicator character itself.
  lexer.bufpos.inc()
  while lexer.buf[lexer.bufpos] notin spaceOrLineEnd and
      lexer.buf[lexer.bufpos] notin flowIndicators:
    content.add(lexer.buf[lexer.bufpos])
    lexer.bufpos.inc()
# Lexes a block scalar whose header character ('|' or '>') is at the lexer's
# current position and stores its text in `content`. `stateAfter` receives the
# parser state to continue with once the scalar has ended.
template blockScalar(lexer: FastLexer, content: var string,
stateAfter: var FastParseState) =
type ChompType = enum
ctKeep, ctClip, ctStrip
var
literal: bool
blockIndent = 0
chomp: ChompType = ctClip
detectedIndent = false
# '|' selects literal style, '>' the other (folded) style.
case lexer.buf[lexer.bufpos]
of '|':
literal = true
of '>':
literal = false
else:
assert(false)
# Header: '+' selects ctKeep, '-' selects ctStrip, a digit 1-9 gives an
# explicit indentation; each may appear only once.
while true:
lexer.bufpos.inc()
case lexer.buf[lexer.bufpos]
of '+':
if chomp != ctClip:
raiseError("Only one chomping indicator is allowed", lexer.bufpos)
chomp = ctKeep
of '-':
if chomp != ctClip:
raiseError("Only one chomping indicator is allowed", lexer.bufpos)
chomp = ctStrip
of '1'..'9':
if detectedIndent:
raiseError("Only one indentation indicator is allowed", lexer.bufpos)
# '\x30' is '0', so this converts the digit character to its value.
blockIndent = int(lexer.buf[lexer.bufpos]) - int('\x30')
detectedIndent = true
of spaceOrLineEnd:
break
else:
raiseError("Illegal character in block scalar header", lexer.bufpos)
# Only blanks and a comment may follow the header on its line.
lexer.lineEnding()
case lexer.buf[lexer.bufpos]
of '\x0A':
lexer.bufpos = lexer.handleLF(lexer.bufpos)
of '\c':
lexer.bufpos = lexer.handleCR(lexer.bufpos)
of EndOfFile:
raiseError("Missing content of block scalar") # TODO: is this correct?
else:
assert(false)
# Line breaks seen but not yet written to `content`.
var newlines = 0
# Indentation of the innermost ancestor; `ancestry` is taken from the scope
# this template is expanded in, it is not a parameter.
let parentIndent = ancestry[ancestry.high].indentation
content = ""
block outer:
while true:
block inner:
# First consume the parent's indentation; any other character found there
# ends the scalar.
for i in countup(1, parentIndent):
case lexer.buf[lexer.bufpos]
of ' ':
discard
of '\x0A':
lexer.bufpos = lexer.handleLF(lexer.bufpos)
newlines.inc()
break inner
of '\c':
lexer.bufpos = lexer.handleCR(lexer.bufpos)
newlines.inc()
break inner
else:
stateAfter = if i == 1: fpBlockLineStart else: fpBlockObjectStart
break outer
lexer.bufpos.inc()
if detectedIndent:
# The scalar's own indentation is known: consume exactly that many spaces.
for i in countup(1, blockIndent):
case lexer.buf[lexer.bufpos]
of ' ':
discard
of '\x0A':
lexer.bufpos = lexer.handleLF(lexer.bufpos)
newlines.inc()
break inner
of '\c':
lexer.bufpos = lexer.handleCR(lexer.bufpos)
newlines.inc()
break inner
of EndOfFile:
stateAfter = fpBlockLineStart
break outer
of '#':
lexer.lineEnding()
case lexer.buf[lexer.bufpos]
of '\x0A':
lexer.bufpos = lexer.handleLF(lexer.bufpos)
of '\c':
lexer.bufpos = lexer.handleCR(lexer.bufpos)
else: discard
stateAfter = fpBlockLineStart
break outer
else:
raiseError("The text is less indented than expected")
lexer.bufpos.inc()
else:
# The indentation is not known yet: it is taken from the first line that
# contains something other than spaces.
while true:
case lexer.buf[lexer.bufpos]
of ' ':
discard
of '\x0A':
lexer.bufpos = lexer.handleLF(lexer.bufpos)
newlines.inc()
break inner
of '\c':
lexer.bufpos = lexer.handleCR(lexer.bufpos)
newlines.inc()
break inner
of EndOfFile:
stateAfter = fpBlockLineStart
break outer
else:
blockIndent = lexer.getColNumber(lexer.bufpos) - parentIndent
detectedIndent = true
break
lexer.bufpos.inc()
case lexer.buf[lexer.bufpos]
of '\x0A':
lexer.bufpos = lexer.handleLF(lexer.bufpos)
newlines.inc()
break inner
of '\c':
lexer.bufpos = lexer.handleCR(lexer.bufpos)
newlines.inc()
break inner
of EndOfFile:
stateAfter = fpBlockLineStart
break outer
else:
discard
# Write out the pending line breaks: all of them in literal style; otherwise
# one break becomes a space and n breaks become n - 1 line feeds.
if newlines > 0:
if literal:
content.add(repeat('\x0A', newlines))
elif newlines == 1:
content.add(' ')
else:
content.add(repeat('\x0A', newlines - 1))
newlines = 0
# Copy the rest of the line into `content`.
while true:
let c = lexer.buf[lexer.bufpos]
case c
of '\x0A':
lexer.bufpos = lexer.handleLF(lexer.bufpos)
newlines.inc()
break
of '\c':
lexer.bufpos = lexer.handleCR(lexer.bufpos)
newlines.inc()
break inner
of EndOfFile:
stateAfter = fpBlockLineStart
break outer
else:
content.add(c)
lexer.bufpos.inc()
# Chomping: ctClip appends one line feed, ctKeep appends all pending line
# breaks, ctStrip appends nothing.
case chomp
of ctClip:
content.add('\x0A')
of ctKeep:
content.add(repeat('\x0A', newlines))
of ctStrip:
discard
proc fastparse*(tagLib: TagLibrary, s: Stream): YamlStream =
result = iterator(): YamlStreamEvent =
var
lexer: FastLexer
state = fpInitial
shorthands: Table[string, string]
anchors: Table[string, AnchorId]
nextAnchorId: AnchorId
content: string
tag: TagId
anchor: AnchorId
ancestry = newSeq[FastParseLevel]()
level: FastParseLevel
indentation: int
newlines: int
flowdepth: int = 0
explicitFlowKey: bool
lexer.open(s)
initDocValues()
while true:
case state
of fpInitial:
debug("state: initial")
case lexer.buf[lexer.bufpos]
of '%':
var ld: LexedDirective
lexer.directiveName(ld)
case ld
of ldYaml:
var version = ""
lexer.yamlVersion(version)
if version != "1.2":
echo "version is not 1.2!"
# TODO: warning (unknown version)
discard
lexer.lineEnding()
handleLineEnd(false)
of ldTag:
var shorthand, uri = ""
lexer.tagShorthand(shorthand)
lexer.tagUri(uri)
shorthands.add(shorthand, uri)
lexer.lineEnding()
handleLineEnd(false)
of ldUnknown:
# TODO: warning (unknown directive)
lexer.finishLine()
handleLineEnd(false)
of ' ', '\t':
while true:
lexer.bufpos.inc()
case lexer.buf[lexer.bufpos]
of ' ', '\t':
discard
of '\x0A':
lexer.bufpos = lexer.handleLF(lexer.bufpos)
break
of '\c':
lexer.bufpos = lexer.handleCR(lexer.bufpos)
break
of '#', EndOfFile:
lexer.lineEnding()
handleLineEnd(false)
break
else:
indentation = lexer.getColNumber(lexer.bufpos)
yield startDocEvent()
state = fpBlockObjectStart
break
of '\x0A':
lexer.bufpos = lexer.handleLF(lexer.bufpos)
of '\c':
lexer.bufpos = lexer.handleCR(lexer.bufpos)
of EndOfFile:
return
of '#':
lexer.lineEnding()
handleLineEnd(false)
of '-':
var token: LexedPossibleDirectivesEnd
lexer.directivesEnd(token)
yield startDocEvent()
case token
of lpdeDirectivesEnd:
lexer.bufpos.inc(3)
state = fpBlockObjectStart
of lpdeSequenceItem:
indentation = 0
lexer.bufpos.inc()
handleBlockSequenceIndicator()
state = fpBlockObjectStart
of lpdeScalarContent:
content = ""
lexer.plainScalar(content, cBlockOut)
state = fpBlockAfterPlainScalar
else:
yield startDocEvent()
state = fpBlockLineStart
of fpBlockLineStart:
debug("state: blockLineStart")
case lexer.buf[lexer.bufpos]
of '-':
var token: LexedPossibleDirectivesEnd
lexer.directivesEnd(token)
case token
of lpdeDirectivesEnd:
lexer.bufpos.inc(3)
closeEverything()
initDocValues()
yield startDocEvent()
state = fpBlockObjectStart
of lpdeSequenceItem:
indentation = 0
closeMoreIndentedLevels()
lexer.bufpos.inc()
handleBlockSequenceIndicator()
state = fpBlockObjectStart
of lpdeScalarContent:
case level.kind
of fplScalar:
continueMultilineScalar()
of fplUnknown:
handlePossibleMapStart()
else:
ensureCorrectIndentation()
ancestry.add(level)
level = FastParseLevel(kind: fplUnknown, indentation: -1)
content = ""
lexer.plainScalar(content, cBlockOut)
state = fpBlockAfterPlainScalar
of '.':
var isDocumentEnd: bool
lexer.documentEnd(isDocumentEnd)
if isDocumentEnd:
lexer.bufpos.inc(3)
lexer.lineEnding()
handleLineEnd(true)
closeEverything()
initDocValues()
state = fpInitial
else:
indentation = 0
closeMoreIndentedLevels()
case level.kind
of fplUnknown:
handlePossibleMapStart()
of fplScalar:
continueMultilineScalar()
else:
ensureCorrectIndentation()
ancestry.add(level)
level = FastParseLevel(kind: fplUnknown, indentation: -1)
content = ""
lexer.plainScalar(content, cBlockOut)
state = fpBlockAfterPlainScalar
of ' ':
lexer.skipIndentation()
if lexer.buf[lexer.bufpos] in ['\t', '\x0A', '\c', '#']:
lexer.lineEnding()
handleLineEnd(true)
else:
indentation = lexer.getColNumber(lexer.bufpos)
closeMoreIndentedLevels()
case level.kind
of fplScalar:
state = fpBlockContinueScalar
of fplUnknown:
state = fpBlockObjectStart
else:
ensureCorrectIndentation()
state = fpBlockObjectStart
else:
indentation = 0
closeMoreIndentedLevels()
case level.kind
of fplScalar:
state = fpBlockContinueScalar
of fplUnknown:
state = fpBlockObjectStart
else:
ensureCorrectIndentation()
state = fpBlockObjectStart
of fpBlockContinueScalar:
debug("state: blockAfterPlainScalar")
lexer.skipWhitespace()
case lexer.buf[lexer.bufpos]
of '\x0A':
newlines.inc()
lexer.bufpos = lexer.handleLF(lexer.bufpos)
state = fpBlockLineStart
of '\c':
newlines.inc()
lexer.bufpos = lexer.handleCR(lexer.bufpos)
of ':':
if lexer.isPlainSafe(lexer.bufpos + 1, cBlockOut):
continueMultilineScalar()
else:
raiseError("Unexpected token", lexer.bufpos)
of '#':
yield scalarEvent(content, tag, anchor)
lexer.lineEnding()
handleLineEnd(true)
handleObjectEnd(fpBlockLineStart)
else:
continueMultilineScalar()
of fpBlockAfterPlainScalar:
debug("state: blockAfterPlainScalar")
lexer.skipWhitespace()
case lexer.buf[lexer.bufpos]
of '\x0A':
if level.kind notin [fplUnknown, fplScalar]:
raiseError("Unexpected scalar")
newlines = 1
level.kind = fplScalar
lexer.bufpos = lexer.handleLF(lexer.bufpos)
state = fpBlockLineStart
of '\c':
if level.kind notin [fplUnknown, fplScalar]:
raiseError("Unexpected scalar")
newlines = 1
level.kind = fplScalar
lexer.bufpos = lexer.handleCR(lexer.bufpos)
state = fpBlockLineStart
else:
yield scalarEvent(content, tag, anchor)
handleObjectEnd(fpBlockAfterObject)
of fpBlockAfterObject:
debug("state: blockAfterObject")
lexer.skipWhitespace()
case lexer.buf[lexer.bufpos]
of EndOfFile:
closeEverything()
break
of '\x0A':
state = fpBlockLineStart
lexer.bufpos = lexer.handleLF(lexer.bufpos)
of '\c':
state = fpBlockLineStart
lexer.bufpos = lexer.handleCR(lexer.bufpos)
of ':':
case level.kind
of fplUnknown:
handleObjectStart(yamlStartMap)
of fplMapKey:
yield scalarEvent("", yTagQuestionMark, yAnchorNone)
level.kind = fplMapValue
ancestry.add(level)
level = FastParseLevel(kind: fplUnknown, indentation: -1)
of fplMapValue:
level.kind = fplMapValue
ancestry.add(level)
level = FastParseLevel(kind: fplUnknown, indentation: -1)
of fplSequence:
raiseError("Illegal token (expected sequence item)")
of fplScalar:
raiseError("Multiline scalars may not be implicit map keys")
lexer.bufpos.inc()
lexer.skipWhitespace()
indentation = lexer.getColNumber(lexer.bufpos)
state = fpBlockObjectStart
of '#':
lexer.lineEnding()
handleLineEnd(true)
handleObjectEnd(fpBlockLineStart)
else:
raiseError("Illegal token (expected ':', comment or line end)",
lexer.bufpos)
of fpBlockObjectStart:
debug("state: blockObjectStart")
lexer.skipWhitespace()
indentation = lexer.getColNumber(lexer.bufpos)
let objectStart = lexer.getColNumber(lexer.bufpos)
case lexer.buf[lexer.bufpos]
of '\x0A':
lexer.bufpos = lexer.handleLF(lexer.bufpos)
state = fpBlockLineStart
level.indentation = -1
of '\c':
lexer.bufpos = lexer.handleCR(lexer.bufpos)
state = fpBlockLineStart
level.indentation = -1
of EndOfFile:
closeEverything()
return
of '#':
lexer.lineEnding()
handleLineEnd(true)
of '\'':
handleBlockItemStart()
content = ""
lexer.singleQuotedScalar(content)
if tag == yTagQuestionMark:
tag = yTagExclamationMark
yield scalarEvent(content, tag, anchor)
handleObjectEnd(fpBlockAfterObject)
of '"':
handleBlockItemStart()
content = ""
lexer.doubleQuotedScalar(content)
if tag == yTagQuestionMark:
tag = yTagExclamationMark
yield scalarEvent(content, tag, anchor)
handleObjectEnd(fpBlockAfterObject)
of '|', '>':
# TODO: this will scan for possible map start, which is not
# neccessary in this case
handleBlockItemStart()
var stateAfter: FastParseState
content = ""
lexer.blockScalar(content, stateAfter)
if tag == yTagQuestionmark:
tag = yTagExclamationmark
yield scalarEvent(content, tag, anchor)
handleObjectEnd(stateAfter)
of '-':
if lexer.isPlainSafe(lexer.bufpos + 1, cBlockOut):
handleBlockItemStart()
content = ""
lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
lexer.plainScalar(content, cBlockOut)
state = fpBlockAfterPlainScalar
else:
lexer.bufpos.inc()
handleBlockSequenceIndicator()
of '!':
handleBlockItemStart()
handleTagHandle()
of '&':
handleBlockItemStart()
handleAnchor()
of '*':
handleBlockItemStart()
handleAlias()
of '[', '{':
handleBlockItemStart()
state = fpFlow
of '?':
if lexer.isPlainSafe(lexer.bufpos + 1, cBlockOut):
handleBlockItemStart()
content = ""
lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
lexer.plainScalar(content, cBlockOut)
state = fpBlockAfterPlainScalar
else:
lexer.bufpos.inc()
handleMapKeyIndicator()
of ':':
if lexer.isPlainSafe(lexer.bufpos + 1, cBlockOut):
handleBlockItemStart()
content = ""
lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
lexer.plainScalar(content, cBlockOut)
state = fpBlockAfterPlainScalar
else:
lexer.bufpos.inc()
handleMapValueIndicator()
of '@', '`':
raiseError("Reserved characters cannot start a plain scalar",
lexer.bufpos)
else:
handleBlockItemStart()
content = ""
lexer.plainScalar(content, cBlockOut)
state = fpBlockAfterPlainScalar
of fpExpectDocEnd:
# Reached when only the end of the current document may follow. Accepted
# input: a directives end marker, a document end marker, whitespace,
# comments, line breaks and the end of the input. Anything else raises
# "Unexpected content (expected document end)".
case lexer.buf[lexer.bufpos]
of '-':
# Let the lexer classify what starts with '-'; only a directives end marker
# is valid at this point.
var token: LexedPossibleDirectivesEnd
lexer.directivesEnd(token)
case token
of lpdeDirectivesEnd:
# Advance three characters (presumably the marker itself), close the
# current document and immediately open the next one.
lexer.bufpos.inc(3)
yield endDocEvent()
initDocValues()
yield startDocEvent()
state = fpBlockObjectStart
else:
# lpdeSequenceItem or lpdeScalarContent: content where none is allowed.
raiseError("Unexpected content (expected document end)")
of '.':
# Let the lexer decide whether this is a document end marker.
var isDocumentEnd: bool
lexer.documentEnd(isDocumentEnd)
if isDocumentEnd:
# Unlike the directives end case above, no new document is started here;
# the parser goes back to fpInitial.
lexer.bufpos.inc(3)
yield endDocEvent()
initDocValues()
state = fpInitial
else:
raiseError("Unexpected content (expected document end)")
of ' ', '\t', '#':
# Whitespace or a comment: hand the remainder of the line to the lexer, then
# process the line break.
lexer.lineEnding()
handleLineEnd(true)
of '\x0A':
# Bare line breaks are consumed with the line-break handlers available on
# the lexer (FastLexer derives from BaseLexer).
lexer.bufpos = lexer.handleLF(lexer.bufpos)
of '\c':
lexer.bufpos = lexer.handleCR(lexer.bufpos)
of EndOfFile:
# End of input closes the document; `break` leaves the enclosing construct
# (not visible in this part of the file).
yield endDocEvent()
break
else:
raiseError("Unexpected content (expected document end)")
of fpFlow:
# Expecting the start of a node, or a structural indicator, inside a flow
# collection. The indicator branches assert that `level` is a placeholder of
# kind fplUnknown for the node about to be read; the enclosing collection is
# the top entry of `ancestry`. For mapping levels, fplMapKey means a key is
# expected next and fplMapValue means a value is expected next.
debug("state: flow")
lexer.skipWhitespaceAndNewlines()
case lexer.buf[lexer.bufpos]
of '{':
# Start of a nested flow mapping; no explicit key has been seen in it yet.
handleObjectStart(yamlStartMap)
flowdepth.inc()
lexer.bufpos.inc()
explicitFlowKey = false
of '[':
# Start of a nested flow sequence.
handleObjectStart(yamlStartSequence)
flowdepth.inc()
lexer.bufpos.inc()
of '}':
# End of a flow mapping while no node has been read for the current slot:
# return to the enclosing level and emit the empty scalars still owed.
assert(level.kind == fplUnknown)
level = ancestry.pop()
case level.kind
of fplMapValue:
# A value was expected: emit an empty scalar carrying any pending tag and
# anchor, then reset the properties.
yield scalarEvent("", tag, anchor)
tag = yTagQuestionmark
anchor = yAnchorNone
level.kind = fplMapKey
of fplMapKey:
# A key was expected. Pending properties or an explicit '?' stand for an
# empty key. NOTE(review): presumably the second scalarEvent is the empty
# value belonging to that key -- confirm against the original nesting.
if tag != yTagQuestionmark or anchor != yAnchorNone or
explicitFlowKey:
yield scalarEvent("", tag, anchor)
tag = yTagQuestionmark
anchor = yAnchorNone
yield scalarEvent("", tag, anchor)
of fplSequence:
raiseError("Unexpected token (expected ']')", lexer.bufpos)
of fplUnknown, fplScalar:
assert(false)
lexer.bufpos.inc()
leaveFlowLevel()
of ']':
# End of a flow sequence while no node has been read for the current slot.
assert(level.kind == fplUnknown)
level = ancestry.pop()
case level.kind
of fplSequence:
# Only pending properties produce an (empty) entry before the end.
if tag != yTagQuestionmark or anchor != yAnchorNone:
yield scalarEvent("", tag, anchor)
tag = yTagQuestionmark
anchor = yAnchorNone
of fplMapKey, fplMapValue:
raiseError("Unexpected token (expected '}')", lexer.bufpos)
of fplUnknown, fplScalar:
assert(false)
lexer.bufpos.inc()
leaveFlowLevel()
of ',':
# Separator while no node has been read for the current slot: the missing
# node(s) are emitted as empty scalars.
assert(level.kind == fplUnknown)
level = ancestry.pop()
case level.kind
of fplSequence:
yield scalarEvent("", tag, anchor)
tag = yTagQuestionmark
anchor = yAnchorNone
of fplMapValue:
# Missing value; a key is expected after the separator.
yield scalarEvent("", tag, anchor)
tag = yTagQuestionmark
anchor = yAnchorNone
level.kind = fplMapKey
explicitFlowKey = false
of fplMapKey:
# Two empty scalars are emitted; only the first carries the pending
# properties.
yield scalarEvent("", tag, anchor)
tag = yTagQuestionmark
anchor = yAnchorNone
yield scalarEvent("", tag, anchor)
explicitFlowKey = false
of fplUnknown, fplScalar:
assert(false)
# Put the collection back and open a fresh placeholder for the next node.
ancestry.add(level)
level = FastParseLevel(kind: fplUnknown, indentation: -1)
lexer.bufpos.inc()
of ':':
assert(level.kind == fplUnknown)
# A ':' directly followed by a plain-safe character starts a plain scalar;
# otherwise it is the mapping value indicator. This mirrors the '?' branch
# below and the ':' handling of the block states.
if lexer.isPlainSafe(lexer.bufpos + 1, cFlowIn):
handleFlowPlainScalar()
else:
level = ancestry.pop()
case level.kind
of fplSequence, fplMapValue:
raiseError("Unexpected token (expected ',')", lexer.bufpos)
of fplMapKey:
# Value indicator without a preceding key: the key is an empty scalar
# carrying any pending properties; a value is expected next.
yield scalarEvent("", tag, anchor)
tag = yTagQuestionmark
anchor = yAnchorNone
level.kind = fplMapValue
of fplUnknown, fplScalar:
assert(false)
ancestry.add(level)
level = FastParseLevel(kind: fplUnknown, indentation: -1)
lexer.bufpos.inc()
of '\'':
# Single quoted scalar. Without an explicit tag it is emitted with
# yTagExclamationMark instead of yTagQuestionMark.
content = ""
lexer.singleQuotedScalar(content)
if tag == yTagQuestionMark:
tag = yTagExclamationMark
yield scalarEvent(content, tag, anchor)
handleObjectEnd(fpFlowAfterObject)
of '"':
# Double quoted scalar; same tag treatment as for single quoted scalars.
content = ""
lexer.doubleQuotedScalar(content)
if tag == yTagQuestionmark:
tag = yTagExclamationmark
yield scalarEvent(content, tag, anchor)
handleObjectEnd(fpFlowAfterObject)
of '!':
# Tag property; handled by a template defined elsewhere in this file.
handleTagHandle()
of '&':
# Anchor property; handled by a template defined elsewhere in this file.
handleAnchor()
of '*':
# Alias node; afterwards a structural indicator is expected.
handleAlias()
state = fpFlowAfterObject
of '?':
# A '?' directly followed by a plain-safe character starts a plain scalar;
# otherwise it is the explicit key indicator, which may occur only once
# until explicitFlowKey is reset (by '{' and ',' above).
# NOTE(review): this passes cFlowOut while the ':' branch passes cFlowIn --
# confirm whether the difference is intended.
if lexer.isPlainSafe(lexer.bufpos + 1, cFlowOut):
handleFlowPlainScalar()
elif explicitFlowKey:
raiseError("Duplicate '?' in flow mapping", lexer.bufpos)
else:
explicitFlowKey = true
lexer.bufpos.inc()
else:
# Any other character starts a plain scalar.
handleFlowPlainScalar()
of fpFlowAfterObject:
# A node inside a flow collection has been completed. After skipping
# whitespace and line breaks, only a structural indicator, a comment or an
# error may follow. In this state `level` is inspected directly as the
# enclosing collection (nothing is popped from `ancestry` first).
debug("state: flowAfterObject")
lexer.skipWhitespaceAndNewlines()
case lexer.buf[lexer.bufpos]
of ']':
# Closes a flow sequence; inside a mapping it is an error.
case level.kind
of fplSequence:
discard
of fplMapKey, fplMapValue:
raiseError("Unexpected token (expected '}')", lexer.bufpos)
of fplScalar, fplUnknown:
assert(false)
lexer.bufpos.inc()
leaveFlowLevel()
of '}':
# Closes a flow mapping; inside a sequence it is an error.
case level.kind
of [fplMapKey, fplMapValue]:
discard
of fplSequence:
raiseError("Unexpected token (expected ']')", lexer.bufpos)
of fplUnknown, fplScalar:
assert(false)
lexer.bufpos.inc()
leaveFlowLevel()
of ',':
# Separator between entries.
case level.kind
of fplSequence:
discard
of fplMapValue:
# A value was still expected: emit an empty, untagged and unanchored
# scalar for it; a key is expected after the separator.
yield scalarEvent("", yTagQuestionmark, yAnchorNone)
level.kind = fplMapKey
explicitFlowKey = false
of fplMapKey:
explicitFlowKey = false
of fplUnknown, fplScalar:
assert(false)
# Keep the collection on `ancestry` and open a fresh placeholder level for
# the next node, which is parsed in state fpFlow.
ancestry.add(level)
level = FastParseLevel(kind: fplUnknown, indentation: -1)
state = fpFlow
lexer.bufpos.inc()
of ':':
# Mapping value indicator.
case level.kind
of fplSequence, fplMapKey:
raiseError("Unexpected token (expected ',')", lexer.bufpos)
of fplMapValue:
# A value is expected next (the ',' branch above emits an empty scalar in
# this situation), so ':' is legal here; the assignment leaves the kind
# unchanged.
level.kind = fplMapValue
of fplUnknown, fplScalar:
assert(false)
# Same hand-over to fpFlow as for ','.
ancestry.add(level)
level = FastParseLevel(kind: fplUnknown, indentation: -1)
state = fpFlow
lexer.bufpos.inc()
of '#':
# Comment: hand the remainder of the line to the lexer, then process the
# line break.
lexer.lineEnding()
handleLineEnd(true)
of EndOfFile:
raiseError("Unclosed flow content", lexer.bufpos)
else:
raiseError("Unexpected content (expected flow indicator)",
lexer.bufpos)