mirror of
https://github.com/status-im/NimYAML.git
synced 2025-01-12 12:34:23 +00:00
00387d955f
* verbatim tags containing '[', ']' or ',' were not properly parsed in flow style collections * ref #140
1177 lines
36 KiB
Nim
1177 lines
36 KiB
Nim
# NimYAML - YAML implementation in Nim
|
|
# (c) Copyright 2015-2023 Felix Krause
|
|
#
|
|
# See the file "copying.txt", included in this
|
|
# distribution, for details about the copyright.
|
|
|
|
import lexbase, streams, strutils, unicode
|
|
import ../data
|
|
when defined(yamlDebug):
|
|
import terminal
|
|
export terminal
|
|
|
|
type
  Lexer* = object
    ## State of the YAML lexer. The exported fields describe the token most
    ## recently produced by `next`; the remaining fields are internal.
    cur*: Token
      ## kind of the most recently produced token
    curStartPos*: Mark
      ## mark recorded by `startToken`
    curEndPos*: Mark
      ## mark recorded by `endToken`
    flowDepth*: int
      ## number of currently open flow collections
    evaluated*: string
      ## recently read scalar or URI, if any

    # internals
    indentation: int
    source: BaseLexer
    tokenStart: int
      # buffer position recorded by `startToken`
    state: State
      # state procedure executed by the next step of `next`
    lineStartState: State
      # state that `endLine` switches to after consuming a line break
    jsonEnablingState: State
      # state entered after quoted scalars and closing flow indicators
    c: char
      # character most recently read by `advance`
    seenMultiline: bool
    propertyIndentation: int
      # indentation of recently started set of node properties.
      # necessary for implicit keys with properties.

  LexerError* = object of ValueError
    ## Error raised for malformed input; carries the position of the
    ## problem and the content of the offending line.
    line*: Positive
    column*: Positive
    lineContent*: string

  State = proc(lex: var Lexer): bool {.gcSafe, nimcall, raises: [LexerError].}
    # A lexer state. `next` calls the current state repeatedly until one
    # call returns true.

  Token* {.pure.} = enum
    YamlDirective,    # `%YAML`
    TagDirective,     # `%TAG`
    UnknownDirective, # any directive but `%YAML` and `%TAG`
    DirectiveParam,   # parameters of %YAML and unknown directives
    EmptyLine,        # for line folding in multiline plain scalars
    DirectivesEnd,    # explicit `---`
    DocumentEnd,      # explicit `...`
    StreamEnd,        # end of input
    Indentation,      # beginning of non-empty line
    Plain,
    SingleQuoted,
    DoubleQuoted,
    Literal,
    Folded,
    SeqItemInd,       # block sequence item indicator `- `
    MapKeyInd,        # block mapping key indicator `? `
    MapValueInd,      # block mapping value indicator `: `
    MapStart,         # `{`
    MapEnd,           # `}`
    SeqStart,         # `[`
    SeqEnd,           # `]`
    SeqSep,           # `,`
    TagHandle,        # a handle of a tag, e.g. `!!` of `!!str`
    Suffix,           # suffix of a tag shorthand, e.g. `str` of `!!str`.
                      # also used for the URI of the %TAG directive
    VerbatimTag,      # a verbatim tag, e.g. `!<tag:yaml.org,2002:str>`
    Anchor,           # anchor property of a node, e.g. `&anchor`
    Alias             # alias node, e.g. `*alias`

  ChompType* = enum
    ## Chomping behaviour selected by a block scalar header.
    ctKeep,
    ctClip,
    ctStrip

  LineStartType = enum
    # classification of a line start as returned by `startLine`
    lsDirectivesEndMarker,
    lsDocumentEndMarker,
    lsComment,
    lsNewline,
    lsStreamEnd,
    lsContent
|
|
|
|
# consts
|
|
|
|
const
  # character classes used by the lexer states
  space = {' ', '\t'}
  lineEnd = {'\l', '\c', EndOfFile}
  spaceOrLineEnd = space + lineEnd
  commentOrLineEnd = lineEnd + {'#'}
  digits = {'0'..'9'}
  flowIndicators = {'[', ']', '{', '}', ','}
  uriChars = {'a' .. 'z', 'A' .. 'Z', '0' .. '9'} +
             {'#', ';', '/', '?', ':', '@', '&', '-', '=', '+', '$', '_',
              '.', '~', '*', '\'', '(', ')'}
  tagShorthandChars = {'a' .. 'z', 'A' .. 'Z', '0' .. '9'} + {'-'}

  # token kinds that are node properties / scalars
  nodePropertyKind* = {Token.TagHandle, Token.VerbatimTag, Token.Anchor}
  scalarTokenKind* = {Token.Plain, Token.SingleQuoted, Token.DoubleQuoted,
                      Token.Literal, Token.Folded}

  # UTF-8 encodings emitted for the `\N`, `\_`, `\L` and `\P` escapes
  UTF8NextLine = toUTF8(Rune(0x85))
  UTF8NonBreakingSpace = toUTF8(Rune(0xA0))
  UTF8LineSeparator = toUTF8(Rune(0x2028))
  UTF8ParagraphSeparator = toUTF8(Rune(0x2029))

  UnknownIndentation* = int.low
|
|
|
|
proc currentIndentation*(lex: Lexer): int =
  ## Column number of the lexer's buffer position, minus one.
  lex.source.getColNumber(lex.source.bufpos) - 1
|
|
|
|
proc recentIndentation*(lex: Lexer): int =
  ## Returns the indentation value currently stored in the lexer.
  result = lex.indentation
|
|
|
|
# lexer source handling
|
|
|
|
proc advance(lex: var Lexer, step: int = 1) {.inline.} =
  ## Loads the character at the current buffer position into `lex.c`, then
  ## moves the buffer position forward by `step`.
  let pos = lex.source.bufpos
  lex.c = lex.source.buf[pos]
  lex.source.bufpos = pos + step
|
|
|
|
template lexCR(lex: var Lexer) =
  ## Lets the underlying `BaseLexer` handle the carriage return located one
  ## position before `bufpos`, then reads the next character. Any catchable
  ## error raised while doing so is re-raised as a `LexerError` that keeps
  ## the original error as its parent.
  try:
    lex.source.bufpos = handleCR(lex.source, lex.source.bufpos - 1)
  except CatchableError as streamErr:
    var wrapped = generateError(lex,
        "Encountered stream error: " & streamErr.msg)
    wrapped.parent = streamErr
    raise wrapped
  lex.advance()
|
|
|
|
template lexLF(lex: var Lexer) =
  ## Lets the underlying `BaseLexer` handle the line feed located one
  ## position before `bufpos`, then reads the next character. Any catchable
  ## error raised while doing so is re-raised as a `LexerError` that keeps
  ## the original error as its parent.
  try:
    lex.source.bufpos = handleLF(lex.source, lex.source.bufpos - 1)
  except CatchableError as streamErr:
    var wrapped = lex.generateError(
        "Encountered stream error: " & streamErr.msg)
    wrapped.parent = streamErr
    raise wrapped
  lex.advance()
|
|
|
|
template lineNumber(lex: Lexer): Positive =
  ## Line number as tracked by the underlying `BaseLexer`.
  lex.source.lineNumber
|
|
|
|
template columnNumber(lex: Lexer): Positive =
  ## Column number of the current buffer position.
  getColNumber(lex.source, lex.source.bufpos)
|
|
|
|
template currentLine(lex: Lexer): string =
  ## Content of the current line as returned by
  ## `BaseLexer.getCurrentLine(true)`.
  getCurrentLine(lex.source, true)
|
|
|
|
proc isPlainSafe(lex: Lexer): bool {.inline.} =
  ## Inspects the character at the current buffer position (the one after
  ## `lex.c`). Whitespace, line ends and end of input yield false; flow
  ## indicators yield true only outside of flow collections; anything else
  ## yields true.
  let upcoming = lex.source.buf[lex.source.bufpos]
  if upcoming in spaceOrLineEnd:
    result = false
  elif upcoming in flowIndicators:
    result = lex.flowDepth == 0
  else:
    result = true
|
|
|
|
# lexer states
|
|
|
|
{.push gcSafe.}
# Forward declarations of all lexer states, so that states and helpers can
# refer to each other regardless of definition order.
# `raises` cannot be pushed.

# states outside of a document / directive handling
proc outsideDoc(lex: var Lexer): bool {.raises: [].}
proc yamlVersion(lex: var Lexer): bool {.raises: [LexerError].}
proc tagShorthand(lex: var Lexer): bool {.raises: [LexerError].}
proc tagUri(lex: var Lexer): bool {.raises: [LexerError].}
proc unknownDirParams(lex: var Lexer): bool {.raises: [].}
proc expectLineEnd(lex: var Lexer): bool {.raises: [LexerError].}

# states at the beginning of a line
proc lineStart(lex: var Lexer): bool {.raises: [LexerError].}
proc flowLineStart(lex: var Lexer): bool {.raises: [LexerError].}
proc flowLineIndentation(lex: var Lexer): bool {.raises: [LexerError].}

# states within a line
proc insideLine(lex: var Lexer): bool {.raises: [LexerError].}
proc indentationSettingToken(lex: var Lexer): bool {.raises: [LexerError].}
proc afterToken(lex: var Lexer): bool {.raises: [LexerError].}
proc beforeIndentationSettingToken(lex: var Lexer): bool {.raises: [LexerError].}
proc afterJsonEnablingToken(lex: var Lexer): bool {.raises: [LexerError].}

# states that only emit a token
proc lineIndentation(lex: var Lexer): bool {.raises: [].}
proc lineDirEnd(lex: var Lexer): bool {.raises: [].}
proc lineDocEnd(lex: var Lexer): bool {.raises: [].}
proc atSuffix(lex: var Lexer): bool {.raises: [LexerError].}
proc streamEnd(lex: var Lexer): bool {.raises: [].}
{.pop.}
|
|
|
|
# helpers
|
|
|
|
template debug*(message: string) =
  ## Emits `message` as debug output when compiled with `-d:yamlDebug` and
  ## expands to nothing otherwise. In the compile-time VM the message is
  ## echoed with a `yamlDebug: ` prefix; at runtime it is written to stdout
  ## in blue, and `ValueError`/`IOError` raised while writing are ignored.
  when defined(yamlDebug):
    when nimvm:
      echo "yamlDebug: ", message
    else:
      try:
        styledWriteLine(stdout, fgBlue, message)
      except ValueError, IOError:
        discard
|
|
|
|
proc generateError(lex: Lexer, message: string):
    ref LexerError {.raises: [].} =
  ## Builds (but does not raise) a `LexerError` carrying `message` together
  ## with the lexer's current line number, column number and line content.
  let
    atLine = lex.lineNumber()
    atColumn = lex.columnNumber()
  result = (ref LexerError)(msg: message, line: atLine, column: atColumn,
                            lineContent: lex.currentLine())
|
|
|
|
proc startToken(lex: var Lexer) {.inline.} =
  ## Remembers the current buffer position as `tokenStart` and the current
  ## line/column as `curStartPos`.
  lex.tokenStart = lex.source.bufpos
  lex.curStartPos = Mark(line: lex.lineNumber(), column: lex.columnNumber())
|
|
|
|
proc endToken(lex: var Lexer) {.inline.} =
  ## Records the current line/column as `curEndPos`.
  let
    endLineNo = lex.lineNumber()
    endColumn = lex.columnNumber()
  lex.curEndPos = Mark(line: endLineNo, column: endColumn)
|
|
|
|
proc readNumericSubtoken(lex: var Lexer) {.inline.} =
  ## Consumes a run of one or more decimal digits starting at `lex.c`.
  ## Afterwards `lex.c` is the first character that is not a digit.
  ## Raises a `LexerError` if `lex.c` is not a digit to begin with.
  if lex.c notin digits:
    raise lex.generateError("Illegal character in YAML version string: " & escape("" & lex.c))
  lex.advance()
  while lex.c in digits:
    lex.advance()
|
|
|
|
proc isDirectivesEnd(lex: var Lexer): bool =
  ## Checks whether the two characters at the buffer position are `--`
  ## followed by whitespace, a line end or the end of input. On success the
  ## two dashes are skipped, the following character is read into `lex.c`
  ## and true is returned; otherwise the lexer is left untouched.
  let pos = lex.source.bufpos
  if lex.source.buf[pos] != '-' or lex.source.buf[pos + 1] != '-' or
      lex.source.buf[pos + 2] notin spaceOrLineEnd:
    return false
  lex.source.bufpos = pos + 2
  lex.advance()
  return true
|
|
|
|
proc isDocumentEnd(lex: var Lexer): bool =
  ## Checks whether the two characters at the buffer position are `..`
  ## followed by whitespace, a line end or the end of input. On success the
  ## two dots are skipped, the following character is read into `lex.c`
  ## and true is returned; otherwise the lexer is left untouched.
  let pos = lex.source.bufpos
  if lex.source.buf[pos] != '.' or lex.source.buf[pos + 1] != '.' or
      lex.source.buf[pos + 2] notin spaceOrLineEnd:
    return false
  lex.source.bufpos = pos + 2
  lex.advance()
  return true
|
|
|
|
proc readHexSequence(lex: var Lexer, len: int) =
  ## Reads `len` hexadecimal digits following the current character and
  ## appends the value they encode to `lex.evaluated`, UTF-8 encoded.
  ## Afterwards `lex.c` is the last digit that was read.
  ##
  ## Raises a `LexerError` if a line end or the end of input is reached
  ## before all digits were read, if a character is not a hex digit, or if
  ## the digits encode a value outside the Unicode range (above U+10FFFF).
  ## The range check also keeps the `Rune` conversion below from failing
  ## with a range defect for eight-digit escapes such as `\UFFFFFFFF`.
  const maxCodePoint = 0x10FFFF
  var charPos = 0
  for i in countup(0, len-1):
    lex.advance()
    # the first digit read is the most significant one
    let digitPosition = len - i - 1
    case lex.c
    of lineEnd:
      raise lex.generateError("Unfinished unicode escape sequence")
    of '0'..'9':
      charPos = charPos or (int(lex.c) - 0x30) shl (digitPosition * 4)
    of 'A' .. 'F':
      charPos = charPos or (int(lex.c) - 0x37) shl (digitPosition * 4)
    of 'a' .. 'f':
      charPos = charPos or (int(lex.c) - 0x57) shl (digitPosition * 4)
    else:
      raise lex.generateError("Invalid character in hex escape sequence: " &
          escape("" & lex.c))
  # charPos can only become negative where `int` is 32 bits wide and the
  # topmost digit sets the sign bit.
  if charPos < 0 or charPos > maxCodePoint:
    raise lex.generateError(
        "Escape sequence does not denote a valid unicode code point")
  lex.evaluated.add(toUTF8(Rune(charPos)))
|
|
|
|
proc readURI(lex: var Lexer, verbatim: bool) =
  ## Reads a URI into `lex.evaluated`, decoding `%XX` escapes on the way.
  ##
  ## If `lex.c` is `<` the URI is terminated by `>`; reaching whitespace or
  ## a line end first is an error. Otherwise the URI is terminated by
  ## whitespace or a line end, and `>` is an error.
  ##
  ## Inside a flow collection and with `verbatim == false` the URI is
  ## "restricted": `[`, `]` and `,` terminate it and `!` is an error.
  lex.evaluated.setLen(0)
  let
    endWithSpace = lex.c != '<'
    restricted = lex.flowDepth > 0 and not verbatim
  var literalStart: int
  if not endWithSpace:
    # skip the opening `<`
    literalStart = lex.source.bufpos
    lex.advance()
  elif not restricted and lex.c in {'[', ']', ','}:
    raise lex.generateError("Flow indicator cannot start tag prefix")
  else:
    literalStart = lex.source.bufpos - 1

  template flushLiteral() =
    # copy everything from literalStart up to (excluding) lex.c
    lex.evaluated.add(lex.source.buf[literalStart..lex.source.bufpos - 2])

  while true:
    case lex.c
    of uriChars: discard
    of '%':
      flushLiteral()
      lex.readHexSequence(2)
      literalStart = lex.source.bufpos
    of '[', ']', ',':
      if restricted:
        flushLiteral()
        break
    of '!':
      if restricted:
        raise lex.generateError("Illegal '!' in tag suffix")
    of '>':
      if endWithSpace:
        raise lex.generateError("Illegal character in URI: `>`")
      flushLiteral()
      lex.advance()
      break
    of spaceOrLineEnd:
      if not endWithSpace:
        raise lex.generateError("Unclosed verbatim tag")
      flushLiteral()
      break
    else:
      raise lex.generateError("Illegal character in URI: " & escape("" & lex.c))
    lex.advance()
|
|
|
|
proc endLine(lex: var Lexer) =
  ## Finishes the current line. A comment starting at `lex.c` is skipped
  ## first. A line break is then consumed and the state becomes
  ## `lineStartState`; at the end of input the state becomes `streamEnd`.
  ## Callers are expected to invoke this only when `lex.c` is `#`, a line
  ## break or the end of input — any other character makes the loop spin.
  while true:
    if lex.c == '#':
      # skip the comment up to (excluding) the line end
      lex.advance()
      while lex.c notin lineEnd: lex.advance()
    case lex.c
    of '\l', '\c':
      if lex.c == '\l': lex.lexLF()
      else: lex.lexCR()
      lex.state = lex.lineStartState
      return
    of EndOfFile:
      lex.state = streamEnd
      return
    else: discard
|
|
|
|
proc startLine(lex: var Lexer): LineStartType =
  ## Classifies the line that begins at `lex.c`.
  ##
  ## `-` and `.` are checked for being a `---` / `...` marker and count as
  ## content otherwise. For any other character leading spaces are
  ## skipped. If a tab follows, the lexer looks ahead past all spaces and
  ## tabs: when a comment, line break or end of input follows, the lexer
  ## jumps there; otherwise the line is content and `lex.c` stays on the
  ## tab.
  if lex.c == '-':
    return (if lex.isDirectivesEnd(): lsDirectivesEndMarker else: lsContent)
  if lex.c == '.':
    return (if lex.isDocumentEnd(): lsDocumentEndMarker else: lsContent)

  while lex.c == ' ': lex.advance()
  if lex.c == '\t':
    var lookahead = lex.source.bufpos
    while lex.source.buf[lookahead] in space: inc lookahead
    if lex.source.buf[lookahead] notin commentOrLineEnd:
      return lsContent
    lex.c = lex.source.buf[lookahead]
    lex.source.bufpos = lookahead + 1

  case lex.c
  of '#': result = lsComment
  of '\l', '\c': result = lsNewline
  of EndOfFile: result = lsStreamEnd
  else: result = lsContent
|
|
|
|
proc readPlainScalar(lex: var Lexer) =
  ## Reads a (possibly multi-line) plain scalar beginning at `lex.c` into
  ## `lex.evaluated` and sets `lex.cur` to `Token.Plain`.
  ##
  ## Trailing whitespace of each line is dropped. Between two content
  ## lines, a single line break is folded into one space, while n > 1 line
  ## breaks yield n - 1 line feeds. `lex.seenMultiline` is set once the
  ## scalar continues on a further line. `lex.state` is set according to
  ## what terminated the scalar.
  lex.evaluated.setLen(0)
  # state to continue with when the scalar ends at the start of a new line
  let afterNewlineState = if lex.flowDepth == 0: lineIndentation
                          else: flowLineIndentation
  var lineStartPos: int
  lex.seenMultiline = false
  lex.startToken()
  # pending node properties define the indentation of this scalar
  if lex.propertyIndentation != -1:
    lex.indentation = lex.propertyIndentation
    lex.propertyIndentation = -1
  lex.cur = Token.Plain
  block multilineLoop:
    while true:
      # position of lex.c, i.e. of the first content character of this line
      lineStartPos = lex.source.bufpos - 1
      block inlineLoop:
        # scans one line of the scalar
        while true:
          lex.advance()
          case lex.c
          of space:
            # possible end of the scalar; remember where the content ended
            lex.endToken()
            let spaceStart = lex.source.bufpos - 2
            block spaceLoop:
              # look at what follows the run of whitespace
              while true:
                lex.advance()
                case lex.c
                of '\l', '\c':
                  lex.evaluated.add(lex.source.buf[lineStartPos..spaceStart])
                  break inlineLoop
                of EndOfFile:
                  lex.evaluated.add(lex.source.buf[lineStartPos..spaceStart])
                  lex.state = streamEnd
                  break multilineLoop
                of '#':
                  # a comment after whitespace ends the scalar
                  lex.evaluated.add(lex.source.buf[lineStartPos..spaceStart])
                  lex.state = expectLineEnd
                  break multilineLoop
                of ':':
                  if not lex.isPlainSafe():
                    # the colon is an indicator; the scalar ends before the
                    # whitespace
                    lex.evaluated.add(lex.source.buf[lineStartPos..spaceStart])
                    lex.state = insideLine
                    break multilineLoop
                  break spaceLoop
                of flowIndicators:
                  if lex.flowDepth > 0:
                    lex.evaluated.add(lex.source.buf[lineStartPos..spaceStart])
                    lex.state = insideLine
                    break multilineLoop
                  break spaceLoop
                of space: discard
                else: break spaceLoop
          of ':':
            if not lex.isPlainSafe():
              lex.evaluated.add(lex.source.buf[lineStartPos..lex.source.bufpos - 2])
              lex.endToken()
              lex.state = insideLine
              break multilineLoop
          of flowIndicators:
            if lex.flowDepth > 0:
              lex.evaluated.add(lex.source.buf[lineStartPos..lex.source.bufpos - 2])
              lex.endToken()
              lex.state = insideLine
              break multilineLoop
          of '\l', '\c':
            lex.evaluated.add(lex.source.buf[lineStartPos..lex.source.bufpos - 2])
            lex.endToken()
            break inlineLoop
          of EndOfFile:
            lex.evaluated.add(lex.source.buf[lineStartPos..lex.source.bufpos - 2])
            if lex.currentIndentation() > 0:
              lex.endToken()
            lex.state = streamEnd
            break multilineLoop
          else: discard
      # the line ended while inside the scalar: consume the line break and
      # find out whether the scalar continues on a following line
      lex.endLine()
      var newlines = 1
      block newlineLoop:
        while true:
          case lex.startLine()
          of lsContent:
            if lex.currentIndentation() <= lex.indentation:
              # not indented deeper than the scalar's parent: scalar ends
              lex.state = afterNewlineState
              break multilineLoop
            if lex.c == '\t':
              # skip the whitespace in front of the content
              while lex.c in space: lex.advance()
              case lex.c:
              of '#':
                lex.endLine()
                lex.state = lineStart
                break multilineLoop
              of '\l', '\c':
                # whitespace-only line: counts as a further line break
                lex.endLine()
                newlines += 1
                continue
              else: discard
            break newlineLoop
          of lsDirectivesEndMarker:
            lex.state = lineDirEnd
            break multilineLoop
          of lsDocumentEndMarker:
            lex.state = lineDocEnd
            break multilineLoop
          of lsStreamEnd:
            break multilineLoop
          of lsComment:
            lex.endLine()
            lex.state = lineStart
            break multilineLoop
          of lsNewline: lex.endLine()
          # only reached for lsNewline; every other branch leaves or
          # restarts the loop
          newlines += 1
      while lex.c in space: lex.advance()
      # an indicator at the beginning of the new line ends the scalar
      if (lex.c == ':' and not lex.isPlainSafe()) or
         lex.c == '#' or (lex.c in flowIndicators and
         lex.flowDepth > 0):
        lex.state = afterNewlineState
        break multilineLoop
      lex.seenMultiline = true
      # line folding
      if newlines == 1: lex.evaluated.add(' ')
      else:
        for i in countup(2, newlines): lex.evaluated.add('\l')
|
|
|
|
proc streamEndAfterBlock(lex: var Lexer) =
  ## Called when the input ends inside a block scalar. Unless the lexer is
  ## at indentation 0, the token end is set to the current position with
  ## its column moved back by one.
  if lex.currentIndentation() == 0:
    return
  lex.endToken()
  lex.curEndPos.column -= 1
|
|
|
|
proc dirEndFollows(lex: Lexer): bool =
  ## True if `lex.c` and the two characters after it are all `-`.
  ## What follows the third dash is not inspected.
  if lex.c != '-':
    return false
  let pos = lex.source.bufpos
  result = lex.source.buf[pos] == '-' and lex.source.buf[pos + 1] == '-'
|
|
|
|
proc docEndFollows(lex: Lexer): bool =
  ## True if `lex.c` and the two characters after it are all `.`.
  ## What follows the third dot is not inspected.
  if lex.c != '.':
    return false
  let pos = lex.source.bufpos
  result = lex.source.buf[pos] == '.' and lex.source.buf[pos + 1] == '.'
|
|
|
|
proc readBlockScalar(lex: var Lexer) =
  ## Reads a literal (`|`) or folded (`>`) block scalar whose indicator is
  ## `lex.c`. Sets `lex.cur` to `Token.Literal` or `Token.Folded` and puts
  ## the scalar's content into `lex.evaluated`.
  ##
  ## The proc works in three phases: the header (chomping and indentation
  ## indicators), the leading empty lines which also determine the
  ## indentation if the header gave none, and the content lines with line
  ## folding. Finally trailing line breaks are added according to the
  ## chomping indicator.
  ##
  ## Raises a `LexerError` for malformed headers and wrongly indented
  ## lines.
  var
    chomp = ctClip
    indent = 0 # 0 means: not yet known
    separationLines = 0 # line breaks seen since the last content
    contentStart: int
    hasBody = true
  lex.startToken()
  lex.cur = if lex.c == '>': Token.Folded else: Token.Literal
  lex.evaluated.setLen(0)

  # header
  while true:
    lex.advance()
    case lex.c
    of '+':
      if chomp != ctClip:
        raise lex.generateError("Multiple chomping indicators")
      chomp = ctKeep
    of '-':
      if chomp != ctClip:
        raise lex.generateError("Multiple chomping indicators")
      chomp = ctStrip
    of '1' .. '9':
      if indent != 0:
        raise lex.generateError("Multiple indentation indicators")
      # the indicator is relative to the (non-negative) current indentation
      indent = max(0, lex.indentation) + int(lex.c) - int('0')
    of ' ':
      # only a comment or the line end may follow the spaces
      while true:
        lex.advance()
        if lex.c != ' ': break
      if lex.c notin commentOrLineEnd:
        raise lex.generateError("Illegal character after block scalar header: " &
            escape("" & lex.c))
      break
    of EndOfFile:
      hasBody = false
      break
    of '\l', '\c': break
    else:
      raise lex.generateError("Illegal character in block scalar header: " &
          escape("" & lex.c))
  lex.endLine()

  block body:
    # determining indentation and leading empty lines
    var
      maxLeadingSpaces = 0
      moreIndented = false
    while true:
      if indent == 0:
        while lex.c == ' ': lex.advance()
      else:
        maxLeadingSpaces = lex.currentIndentation() + indent
        while lex.c == ' ' and lex.currentIndentation() < maxLeadingSpaces:
          lex.advance()
      case lex.c
      of '\l', '\c':
        # empty line; remember how many spaces it contained
        lex.endToken()
        maxLeadingSpaces = max(maxLeadingSpaces, lex.currentIndentation())
        lex.endLine()
        separationLines += 1
      of EndOfFile:
        lex.state = streamEnd
        lex.streamEndAfterBlock()
        if lex.source.getColNumber(lex.source.bufpos) > 1 and hasBody: separationLines += 1
        break body
      else:
        if indent == 0:
          # the first non-empty line defines the indentation
          indent = lex.currentIndentation()
          if indent <= lex.indentation or
              (indent == 0 and (lex.dirEndFollows() or lex.docEndFollows())):
            # the line does not belong to the scalar, which is thus empty
            lex.state = lineIndentation
            break body
          elif indent < maxLeadingSpaces:
            raise lex.generateError("Leading all-spaces line contains too many spaces")
        elif lex.currentIndentation() < indent: break body
        if lex.cur == Token.Folded and lex.c in space:
          moreIndented = true
        break
    # leading empty lines are kept as line feeds
    for i in countup(0, separationLines - 1):
      lex.evaluated.add('\l')
    separationLines = if moreIndented: 1 else: 0

    block content:
      while true:
        # copy the remainder of the current line; lex.c is its first char
        contentStart = lex.source.bufpos - 1
        while lex.c notin lineEnd: lex.advance()
        lex.evaluated.add(lex.source.buf[contentStart .. lex.source.bufpos - 2])
        if lex.c == EndOfFile:
          lex.state = streamEnd
          lex.streamEndAfterBlock()
          break body
        separationLines += 1
        lex.endToken()
        lex.endLine()

        let oldMoreIndented = moreIndented
        # empty lines and indentation of next line
        moreIndented = false
        while true:
          while lex.c == ' ' and lex.currentIndentation() < indent:
            lex.advance()
          case lex.c
          of '\l', '\c':
            lex.endToken()
            separationLines += 1
            lex.endLine()
          of EndOfFile:
            lex.state = streamEnd
            lex.streamEndAfterBlock()
            break body
          else:
            if lex.currentIndentation() < indent or
                (indent == 0 and (lex.dirEndFollows() or lex.docEndFollows())):
              # less indented line or document marker: content is finished
              break content
            if lex.cur == Token.Folded and lex.c in space:
              moreIndented = true
              if not oldMoreIndented:
                separationLines += 1
            break

        # line folding
        if lex.cur == Token.Literal:
          for i in countup(0, separationLines - 1):
            lex.evaluated.add('\l')
        elif separationLines == 1:
          lex.evaluated.add(' ')
        else:
          for i in countup(0, separationLines - 2):
            lex.evaluated.add('\l')
        separationLines = if moreIndented: 1 else: 0

    # only reached via `break content`: decide how to continue with the
    # line that follows the scalar
    let markerFollows = lex.currentIndentation() == 0 and
        (lex.dirEndFollows() or lex.docEndFollows())
    if lex.c == '#':
      lex.state = expectLineEnd
    elif lex.currentIndentation() > lex.indentation and not markerFollows:
      raise lex.generateError("This line #" & $lex.curStartPos.line & " at " & escape("" & lex.c) & " is less indented than necessary")
    elif lex.currentIndentation() == 0:
      lex.state = lineStart
    else:
      lex.state = lineIndentation

  # NOTE(review): at this nesting level the call runs on every exit path of
  # `body` and therefore replaces the end mark set by streamEndAfterBlock;
  # confirm the intended nesting against the upstream file.
  lex.endToken()

  case chomp
  of ctStrip: discard
  of ctClip:
    if len(lex.evaluated) > 0: lex.evaluated.add('\l')
  of ctKeep:
    for i in countup(0, separationLines - 1):
      lex.evaluated.add('\l')
|
|
|
|
proc processQuotedWhitespace(lex: var Lexer, initial: int) =
  ## Handles a run of whitespace inside a quoted scalar, starting at
  ## `lex.c`.
  ##
  ## If no line break follows the spaces/tabs, they are appended verbatim
  ## to `lex.evaluated`. Otherwise the whitespace and all following empty
  ## lines are folded: with `initial` plus the number of further line
  ## breaks equal to 1 a single space is appended, with a total n > 1,
  ## n - 1 line feeds are appended, and with a total of 0 nothing is.
  ##
  ## Raises a `LexerError` if a document marker or the end of input is
  ## found before the scalar continues.
  var lineBreaks = initial
  let firstSpace = lex.source.bufpos - 1
  while lex.c in {' ', '\t'}:
    lex.advance()
  case lex.c
  of '\l': lex.lexLF()
  of '\c': lex.lexCR()
  else:
    # whitespace within a line is content
    lex.evaluated.add(lex.source.buf[firstSpace..lex.source.bufpos - 2])
    return

  lex.seenMultiline = true
  while true:
    case lex.startLine()
    of lsNewline:
      lex.endLine()
    of lsContent, lsComment:
      # inside a quoted scalar, `#` does not start a comment
      while lex.c in space: lex.advance()
      if lex.c notin {'\l', '\c'}:
        break
      lex.endLine()
    of lsDirectivesEndMarker:
      raise lex.generateError("Illegal `---` within quoted scalar")
    of lsDocumentEndMarker:
      raise lex.generateError("Illegal `...` within quoted scalar")
    of lsStreamEnd:
      raise lex.generateError("Unclosed quoted string")
    inc lineBreaks

  if lineBreaks == 1:
    lex.evaluated.add(' ')
  elif lineBreaks > 1:
    for _ in 2 .. lineBreaks:
      lex.evaluated.add('\l')
|
|
|
|
proc readSingleQuotedScalar(lex: var Lexer) =
  ## Reads a single-quoted scalar whose opening quote is `lex.c` into
  ## `lex.evaluated` and sets `lex.cur` to `Token.SingleQuoted`. A doubled
  ## quote (`''`) yields one quote character; whitespace and line breaks
  ## are handled by `processQuotedWhitespace`.
  ## Raises a `LexerError` if the input ends before the closing quote.
  lex.evaluated.setLen(0)
  lex.seenMultiline = false
  lex.startToken()
  # pending node properties define the indentation of this scalar
  if lex.propertyIndentation != -1:
    lex.indentation = lex.propertyIndentation
    lex.propertyIndentation = -1
  var literalStart = lex.source.bufpos
  lex.advance()

  template flushLiteral() =
    # copy everything from literalStart up to (excluding) lex.c
    lex.evaluated.add(lex.source.buf[literalStart..lex.source.bufpos - 2])

  while true:
    case lex.c
    of '\'':
      flushLiteral()
      lex.advance()
      if lex.c != '\'':
        break
      # escaped quote
      lex.evaluated.add('\'')
      literalStart = lex.source.bufpos
      lex.advance()
    of ' ', '\t', '\l', '\c':
      flushLiteral()
      lex.processQuotedWhitespace(1)
      literalStart = lex.source.bufpos - 1
    of EndOfFile:
      raise lex.generateError("Unclosed quoted string")
    else:
      lex.advance()
  lex.endToken()
  lex.cur = Token.SingleQuoted
|
|
|
|
proc readDoubleQuotedScalar(lex: var Lexer) =
  ## Reads a double-quoted scalar whose opening quote is `lex.c` into
  ## `lex.evaluated` and sets `lex.cur` to `Token.DoubleQuoted`. Backslash
  ## escapes are resolved; whitespace and line breaks are handled by
  ## `processQuotedWhitespace`.
  ## Raises a `LexerError` if the input ends before the closing quote or
  ## an unknown escape sequence is found.
  lex.evaluated.setLen(0)
  lex.seenMultiline = false
  lex.startToken()
  # pending node properties define the indentation of this scalar
  if lex.propertyIndentation != -1:
    lex.indentation = lex.propertyIndentation
    lex.propertyIndentation = -1
  var literalStart = lex.source.bufpos
  lex.advance()

  template flushLiteral() =
    # copy everything from literalStart up to (excluding) lex.c
    lex.evaluated.add(lex.source.buf[literalStart..lex.source.bufpos - 2])

  while true:
    case lex.c
    of EndOfFile:
      raise lex.generateError("Unclosed quoted string")
    of '"':
      flushLiteral()
      break
    of ' ', '\t', '\l', '\c':
      flushLiteral()
      lex.processQuotedWhitespace(1)
      literalStart = lex.source.bufpos - 1
      continue
    of '\\':
      flushLiteral()
      lex.advance()
      literalStart = lex.source.bufpos
      case lex.c
      of ' ', '"', '/', '\\':
        # these characters stand for themselves
        lex.evaluated.add(lex.c)
      of '0': lex.evaluated.add('\0')
      of 'a': lex.evaluated.add('\a')
      of 'b': lex.evaluated.add('\b')
      of 't', '\t': lex.evaluated.add('\t')
      of 'n': lex.evaluated.add('\l')
      of 'v': lex.evaluated.add('\v')
      of 'f': lex.evaluated.add('\f')
      of 'r': lex.evaluated.add('\c')
      of 'e': lex.evaluated.add('\e')
      of 'N': lex.evaluated.add(UTF8NextLine)
      of '_': lex.evaluated.add(UTF8NonBreakingSpace)
      of 'L': lex.evaluated.add(UTF8LineSeparator)
      of 'P': lex.evaluated.add(UTF8ParagraphSeparator)
      of 'x', 'u', 'U':
        let digitCount =
          if lex.c == 'x': 2
          elif lex.c == 'u': 4
          else: 8
        lex.readHexSequence(digitCount)
        literalStart = lex.source.bufpos
      of '\l', '\c':
        # escaped line break
        lex.processQuotedWhitespace(0)
        literalStart = lex.source.bufpos - 1
        continue
      else:
        raise lex.generateError("Illegal character in escape sequence: " & escape("" & lex.c))
    else: discard
    lex.advance()
  # read the character after the closing quote
  lex.advance()
  lex.endToken()
  lex.cur = Token.DoubleQuoted
|
|
|
|
proc basicInit(lex: var Lexer) =
  ## Puts the lexer into its initial state (outside of any document, no
  ## open flow collection, no pending node properties) and reads the first
  ## character of the input.
  lex.evaluated = ""
  lex.flowDepth = 0
  lex.propertyIndentation = -1
  lex.jsonEnablingState = afterToken
  lex.lineStartState = outsideDoc
  lex.state = outsideDoc
  lex.advance()
|
|
|
|
# interface
|
|
|
|
proc lastScalarWasMultiline*(lex: Lexer): bool =
  ## Whether the scalar read most recently spanned more than one line.
  lex.seenMultiline
|
|
|
|
proc shortLexeme*(lex: Lexer): string =
  ## Source text from the position recorded by `startToken` up to, but not
  ## including, the current character.
  let lastPos = lex.source.bufpos - 2
  result = lex.source.buf[lex.tokenStart..lastPos]
|
|
|
|
proc fullLexeme*(lex: Lexer): string =
  ## Like `shortLexeme`, but additionally includes the character directly
  ## before the position recorded by `startToken`.
  let
    firstPos = lex.tokenStart - 1
    lastPos = lex.source.bufpos - 2
  result = lex.source.buf[firstPos..lastPos]
|
|
|
|
proc currentLine*(lex: Lexer): string =
  ## Content of the current line as returned by
  ## `BaseLexer.getCurrentLine(false)`.
  result = getCurrentLine(lex.source, false)
|
|
|
|
proc next*(lex: var Lexer) {.raises: [LexerError].}=
  ## Advances the lexer to the next token by running state procedures
  ## until one of them reports that it produced a token. The token is then
  ## available via `cur`, `curStartPos`, `curEndPos` and `evaluated`.
  while true:
    if lex.state(lex): break
  debug("lexer -> [" & $lex.curStartPos.line & "," & $lex.curStartPos.column &
        "-" & $lex.curEndPos.line & "," & $lex.curEndPos.column & "] " &
        $lex.cur)
|
|
|
|
proc init*(lex: var Lexer, source: Stream) {.raises: [IOError, OSError].} =
  ## Initializes the lexer to read its input from the stream `source`.
  open(lex.source, source)
  basicInit(lex)
|
|
|
|
proc init*(lex: var Lexer, source: string) {.raises: [].} =
  ## Initializes the lexer to read its input from the string `source`.
  try:
    let input = newStringStream(source)
    lex.source.open(input)
  except CatchableError:
    # can never happen with StringStream
    discard
  basicInit(lex)
|
|
|
|
# states
|
|
|
|
proc outsideDoc(lex: var Lexer): bool =
  ## State at the start of a line while no document is open.
  ##
  ## * `%` starts a directive; its name is read and the matching directive
  ##   token is emitted.
  ## * `-` / `.` are checked for being `---` / `...`.
  ## * A line consisting only of whitespace and/or a comment is skipped
  ##   (no token is emitted).
  ## * Anything else implicitly starts a document: an `Indentation` token
  ##   is emitted and subsequent lines are processed by `lineStart`.
  case lex.c
  of '%':
    lex.startToken()
    lex.advance()
    while lex.c notin spaceOrLineEnd: lex.advance()
    lex.endToken()
    let directive = lex.shortLexeme()
    if directive == "YAML":
      lex.cur = Token.YamlDirective
      lex.state = yamlVersion
    elif directive == "TAG":
      lex.cur = Token.TagDirective
      lex.state = tagShorthand
    else:
      lex.cur = Token.UnknownDirective
      lex.state = unknownDirParams
      lex.evaluated.setLen(0)
      lex.evaluated.add(directive)
  of '-':
    lex.startToken()
    if lex.isDirectivesEnd():
      lex.cur = Token.DirectivesEnd
      lex.state = afterToken
    else:
      lex.cur = Token.Indentation
      lex.state = indentationSettingToken
    # in both cases a document has started
    lex.indentation = -1
    lex.lineStartState = lineStart
    lex.endToken()
  of '.':
    lex.startToken()
    if lex.isDocumentEnd():
      lex.cur = Token.DocumentEnd
      lex.state = expectLineEnd
    else:
      lex.cur = Token.Indentation
      lex.indentation = -1
      lex.state = indentationSettingToken
      lex.lineStartState = lineStart
    lex.endToken()
  else:
    lex.startToken()
    while lex.c == ' ': lex.advance()
    if lex.c in commentOrLineEnd:
      lex.state = expectLineEnd
      return false
    if lex.c == '\t':
      # look ahead: tabs are fine if only a comment or line end follows
      var lookahead = lex.source.bufpos
      while lex.source.buf[lookahead] in space: inc lookahead
      if lex.source.buf[lookahead] in commentOrLineEnd:
        lex.source.bufpos = lookahead
        lex.state = expectLineEnd
        return false
    lex.endToken()
    lex.cur = Token.Indentation
    lex.indentation = -1
    lex.lineStartState = lineStart
    lex.state = indentationSettingToken
  return true
|
|
|
|
proc yamlVersion(lex: var Lexer): bool =
  ## Reads the parameter of a `%YAML` directive, which must have the form
  ## `<digits>.<digits>` and be followed by whitespace or a line end.
  ## Emits a `DirectiveParam` token and continues with `expectLineEnd`.
  template illegalChar(): untyped =
    lex.generateError("Illegal character in YAML version string: " & escape("" & lex.c))

  while lex.c in space: lex.advance()
  lex.startToken()
  lex.readNumericSubtoken()
  if lex.c != '.':
    raise illegalChar()
  lex.advance()
  lex.readNumericSubtoken()
  if lex.c notin spaceOrLineEnd:
    raise illegalChar()
  lex.cur = Token.DirectiveParam
  lex.endToken()
  lex.state = expectLineEnd
  result = true
|
|
|
|
proc tagShorthand(lex: var Lexer): bool =
  ## Reads the handle parameter of a `%TAG` directive: either a single `!`
  ## or `!`, a run of shorthand characters and a closing `!`. The handle
  ## must be followed by whitespace or a line end. Emits a `TagHandle`
  ## token and continues with `tagUri`.
  while lex.c in space: lex.advance()
  if lex.c != '!':
    raise lex.generateError("Illegal character, tag shorthand must start with '!': " & escape("" & lex.c))
  lex.startToken()
  lex.advance()

  if lex.c notin spaceOrLineEnd:
    # not the primary handle `!`: expect `name!` or just `!`
    while lex.c in tagShorthandChars: lex.advance()
    if lex.c in spaceOrLineEnd:
      raise lex.generateError("Tag shorthand must end with '!'.")
    if lex.c != '!':
      raise lex.generateError("Illegal character in tag shorthand: " & escape("" & lex.c))
    lex.advance()
    if lex.c notin spaceOrLineEnd:
      raise lex.generateError("Missing space after tag shorthand")
  lex.cur = Token.TagHandle
  lex.endToken()
  lex.state = tagUri
  result = true
|
|
|
|
proc tagUri(lex: var Lexer): bool =
  ## Reads the URI parameter of a `%TAG` directive into `lex.evaluated`.
  ## The URI must not start with `<`. Emits a `Suffix` token and continues
  ## with `expectLineEnd`.
  while lex.c in space:
    lex.advance()
  lex.startToken()
  if lex.c == '<':
    raise lex.generateError("Illegal character in tag URI: " & escape("" & lex.c))
  readURI(lex, false)
  lex.cur = Token.Suffix
  lex.endToken()
  lex.state = expectLineEnd
  result = true
|
|
|
|
proc unknownDirParams(lex: var Lexer): bool =
  ## Handles what follows the name of an unknown directive.
  ##
  ## Leading whitespace is skipped. If a comment, line end or the end of
  ## input follows, no token is produced and the lexer continues with
  ## `expectLineEnd`. Otherwise everything up to the next `#` or line end
  ## is consumed and a `DirectiveParam` token spanning it is emitted; the
  ## state is left unchanged, so the next call ends up in the first case.
  while lex.c in space: lex.advance()
  if lex.c in lineEnd + {'#'}:
    lex.state = expectLineEnd
    return false
  lex.startToken()
  while true:
    lex.advance()
    if lex.c in lineEnd + {'#'}: break
  # record the token's end like all other token-producing states do, so
  # that curEndPos does not keep the end mark of the previous token
  lex.endToken()
  lex.cur = Token.DirectiveParam
  return true
|
|
|
|
proc expectLineEnd(lex: var Lexer): bool =
  ## Skips whitespace and then requires a comment, a line break or the end
  ## of input, which is consumed via `endLine`. Never produces a token.
  ## Raises a `LexerError` if anything else is found.
  while lex.c in space:
    lex.advance()
  if lex.c in commentOrLineEnd:
    lex.endLine()
    return false
  raise lex.generateError("Unexpected character (expected line end): " & escape("" & lex.c))
|
|
|
|
proc lineStart(lex: var Lexer): bool =
  ## State at the start of a line inside a document. Classifies the line
  ## via `startLine` and dispatches: document markers and content lines
  ## are delegated to the respective states, comment and empty lines are
  ## skipped, and the end of input leads to the `streamEnd` state.
  case lex.startLine()
  of lsDirectivesEndMarker:
    result = lex.lineDirEnd()
  of lsDocumentEndMarker:
    result = lex.lineDocEnd()
  of lsComment, lsNewline:
    lex.endLine()
    result = false
  of lsStreamEnd:
    lex.state = streamEnd
    result = false
  of lsContent:
    result =
      if lex.flowDepth == 0: lex.lineIndentation()
      else: lex.flowLineIndentation()
|
|
|
|
proc flowLineStart(lex: var Lexer): bool =
  ## State at the start of a line inside a flow collection. Document
  ## markers are an error here. Comment-only and empty lines are passed on
  ## to `expectLineEnd`. Otherwise the line's leading spaces must exceed
  ## the surrounding indentation, and the lexer continues with
  ## `insideLine`. Never produces a token.
  var leadingSpaces = 0
  case lex.c
  of '-':
    if lex.isDirectivesEnd():
      raise lex.generateError("Directives end marker before end of flow content")
  of '.':
    if lex.isDocumentEnd():
      raise lex.generateError("Document end marker before end of flow content")
  else:
    let lineBegin = lex.source.bufpos
    while lex.c == ' ': lex.advance()
    leadingSpaces = lex.source.bufpos - lineBegin
  # tabs may follow the indentation but do not count towards it
  while lex.c in space: lex.advance()
  if lex.c in commentOrLineEnd:
    lex.state = expectLineEnd
    return false
  if leadingSpaces <= lex.indentation:
    raise lex.generateError("Too few indentation spaces (must surpass surrounding block level)")
  lex.state = insideLine
  return false
|
|
|
|
proc flowLineIndentation(lex: var Lexer): bool =
  ## Checks the indentation of a content line inside a flow collection and
  ## continues with `insideLine`. Never produces a token.
  ## Raises a `LexerError` if the current indentation is smaller than the
  ## stored one.
  let lineIndent = lex.currentIndentation()
  if lineIndent < lex.indentation:
    raise lex.generateError("Too few indentation spaces (must surpass surrounding block level)")
  lex.state = insideLine
  result = false
|
|
|
|
proc checkIndicatorChar(lex: var Lexer, kind: Token) =
  ## Handles a character that is an indicator only if the next character
  ## is not "plain safe" (see `isPlainSafe`). If it is plain safe, the
  ## character starts a plain scalar instead. Otherwise a token of the
  ## given `kind` is emitted and the lexer continues with
  ## `beforeIndentationSettingToken`.
  if lex.isPlainSafe():
    lex.readPlainScalar()
    return
  lex.startToken()
  lex.advance()
  lex.endToken()
  lex.state = beforeIndentationSettingToken
  lex.cur = kind
|
|
|
|
proc enterFlowCollection(lex: var Lexer, kind: Token) =
  ## Emits the token `kind` for an opening flow indicator and increases
  ## the flow depth. When entering the outermost flow collection the lexer
  ## additionally switches to the flow variants of its line-start and
  ## JSON-enabling states and discards pending property indentation.
  lex.startToken()
  let enteringOutermost = lex.flowDepth == 0
  if enteringOutermost:
    lex.propertyIndentation = -1
    lex.lineStartState = flowLineStart
    lex.jsonEnablingState = afterJsonEnablingToken
  inc lex.flowDepth
  lex.state = afterToken
  lex.advance()
  lex.endToken()
  lex.cur = kind
|
|
|
|
proc leaveFlowCollection(lex: var Lexer, kind: Token) =
  ## Emits the token `kind` for a closing flow indicator and decreases the
  ## flow depth. When the outermost flow collection is left, the lexer
  ## switches back to the block variants of its line-start and
  ## JSON-enabling states. The next state is the JSON-enabling state.
  ## Raises a `LexerError` if no flow collection is open.
  lex.startToken()
  if lex.flowDepth == 0:
    raise lex.generateError("No flow collection to leave!")
  dec lex.flowDepth
  if lex.flowDepth == 0:
    lex.lineStartState = lineStart
    lex.jsonEnablingState = afterToken
  lex.state = lex.jsonEnablingState
  lex.advance()
  lex.endToken()
  lex.cur = kind
|
|
|
|
proc readNamespace(lex: var Lexer) =
  ## Reads the tag starting with the `!` in `lex.c`.
  ##
  ## `!<...>` is read as a verbatim tag (`VerbatimTag`, continues with
  ## `afterToken`). Otherwise the tag handle is determined by looking
  ## ahead for a second `!`: if one is found before whitespace, a line end
  ## or a flow indicator, the handle extends up to and including it;
  ## otherwise the handle is just the initial `!`. A `TagHandle` token is
  ## emitted and the lexer continues with `atSuffix`.
  lex.startToken()
  lex.advance()
  if lex.c == '<':
    lex.readURI(true)
    lex.endToken()
    lex.cur = Token.VerbatimTag
    lex.state = afterToken
    return

  const handleTerminators = spaceOrLineEnd + flowIndicators + {'!'}
  var handleEnd = lex.tokenStart
  while lex.source.buf[handleEnd] notin handleTerminators:
    inc handleEnd
  if lex.source.buf[handleEnd] == '!':
    inc handleEnd
  else:
    # no closing `!`: the handle is the primary handle `!`
    handleEnd = lex.tokenStart
    lex.source.bufpos -= 1
  while lex.source.bufpos < handleEnd:
    lex.advance()
    if lex.c notin tagShorthandChars + {'!'}:
      raise lex.generateError("Illegal character in tag handle: " & escape("" & lex.c))
  lex.advance()
  lex.endToken()
  lex.cur = Token.TagHandle
  lex.state = atSuffix
|
|
|
|
proc readAnchorName(lex: var Lexer) =
  ## Consumes the name following the indicator in `lex.c`, up to the next
  ## whitespace, line end, end of input or flow indicator, and continues
  ## with `afterToken`. The caller sets the token kind and end position.
  ## Raises a `LexerError` if the name is empty.
  lex.startToken()
  lex.advance()
  while lex.c notin spaceOrLineEnd + flowIndicators:
    lex.advance()
  if lex.source.bufpos == lex.tokenStart + 1:
    raise lex.generateError("Anchor name must not be empty")
  lex.state = afterToken
|
|
|
|
proc insideLine(lex: var Lexer): bool =
  ## Main state within a line: looks at `lex.c` and reads the token that
  ## starts there. Returns true if a token was produced; whitespace,
  ## comments and line ends are consumed without producing one.
  ## Raises a `LexerError` for the reserved characters `@` and `` ` ``.
  case lex.c
  # whitespace and line ends
  of ' ', '\t':
    lex.advance()
    while lex.c in space: lex.advance()
    return false
  of commentOrLineEnd:
    lex.endLine()
    return false

  # block indicators (which may also start a plain scalar)
  of ':':
    lex.checkIndicatorChar(Token.MapValueInd)
    if lex.cur == Token.MapValueInd and lex.propertyIndentation != -1:
      # properties belonged to an implicit (empty) key
      lex.indentation = lex.propertyIndentation
      lex.propertyIndentation = -1
  of '?':
    lex.checkIndicatorChar(Token.MapKeyInd)
  of '-':
    lex.checkIndicatorChar(Token.SeqItemInd)

  # scalars
  of '"':
    lex.readDoubleQuotedScalar()
    lex.state = lex.jsonEnablingState
  of '\'':
    lex.readSingleQuotedScalar()
    lex.state = lex.jsonEnablingState
  of '>', '|':
    # block scalars do not exist inside flow collections
    if lex.flowDepth > 0: lex.readPlainScalar()
    else: lex.readBlockScalar()

  # flow indicators
  of '{':
    lex.enterFlowCollection(Token.MapStart)
  of '}':
    lex.leaveFlowCollection(Token.MapEnd)
  of '[':
    lex.enterFlowCollection(Token.SeqStart)
  of ']':
    lex.leaveFlowCollection(Token.SeqEnd)
  of ',':
    lex.startToken()
    lex.advance()
    lex.endToken()
    lex.cur = Token.SeqSep
    lex.state = afterToken

  # node properties and aliases
  of '!':
    lex.readNamespace()
  of '&':
    lex.readAnchorName()
    lex.endToken()
    lex.cur = Token.Anchor
  of '*':
    lex.readAnchorName()
    lex.endToken()
    lex.cur = Token.Alias

  of '@', '`':
    raise lex.generateError("Reserved character may not start any token")
  else:
    lex.readPlainScalar()
  return true
|
|
|
|
proc indentationSettingToken(lex: var Lexer): bool =
  ## Like `insideLine`, but if a token is produced outside of flow
  ## collections, the indentation at which it started is remembered: as
  ## `propertyIndentation` for node properties, as `indentation` for
  ## everything else.
  let tokenIndent = lex.currentIndentation()
  result = lex.insideLine()
  if not result or lex.flowDepth != 0:
    return
  if lex.cur in nodePropertyKind:
    lex.propertyIndentation = tokenIndent
  else:
    lex.indentation = tokenIndent
|
|
|
|
proc afterToken(lex: var Lexer): bool =
  ## Skips whitespace after a token. If a comment, line break or the end
  ## of input follows, the line is finished via `endLine`; otherwise the
  ## lexer continues with `insideLine`. Never produces a token.
  while lex.c in space:
    lex.advance()
  if lex.c notin commentOrLineEnd:
    lex.state = insideLine
  else:
    lex.endLine()
  result = false
|
|
|
|
proc beforeIndentationSettingToken(lex: var Lexer): bool =
  ## Behaves like `afterToken`, except that a token following on the same
  ## line is read by `indentationSettingToken` instead of `insideLine`.
  ## Never produces a token.
  let produced = lex.afterToken()
  discard produced
  if lex.state == insideLine:
    lex.state = indentationSettingToken
  result = false
|
|
|
|
proc afterJsonEnablingToken(lex: var Lexer): bool =
  ## State after a quoted scalar or a closing flow indicator inside a flow
  ## collection. A `:` found here is a mapping value indicator even if it
  ## is not followed by whitespace; it may also be located on a following
  ## line. Anything else is handed to `insideLine`.
  while lex.c == ' ':
    lex.advance()
  while true:
    if lex.c == ':':
      lex.startToken()
      lex.advance()
      lex.endToken()
      lex.cur = Token.MapValueInd
      lex.state = afterToken
      return true
    elif lex.c in {'#', '\l', '\c'}:
      # continue the search on the next line
      lex.endLine()
      discard lex.flowLineStart()
    elif lex.c == EndOfFile:
      lex.state = streamEnd
      return false
    else:
      lex.state = insideLine
      return false
|
|
|
|
proc lineIndentation(lex: var Lexer): bool =
  ## Emits an `Indentation` token that starts at column 1 of the current
  ## line and ends at the current position, then continues with
  ## `indentationSettingToken`.
  lex.curStartPos.column = 1
  lex.curStartPos.line = lex.source.lineNumber
  lex.endToken()
  lex.cur = Token.Indentation
  lex.state = indentationSettingToken
  result = true
|
|
|
|
proc lineDirEnd(lex: var Lexer): bool =
  ## Emits a `DirectivesEnd` token that starts at column 1 of the current
  ## line and ends at the current position. Resets both the indentation
  ## and the property indentation to -1 and continues with `afterToken`.
  lex.curStartPos.column = 1
  lex.curStartPos.line = lex.source.lineNumber
  lex.endToken()
  lex.cur = Token.DirectivesEnd
  lex.propertyIndentation = -1
  lex.indentation = -1
  lex.state = afterToken
  result = true
|
|
|
|
proc lineDocEnd(lex: var Lexer): bool =
  ## Emits a `DocumentEnd` token that starts at column 1 of the current
  ## line and ends at the current position. The rest of the line must be
  ## empty (`expectLineEnd`); following lines are processed by
  ## `outsideDoc`.
  lex.curStartPos.column = 1
  lex.curStartPos.line = lex.source.lineNumber
  lex.endToken()
  lex.cur = Token.DocumentEnd
  lex.lineStartState = outsideDoc
  lex.state = expectLineEnd
  result = true
|
|
|
|
proc atSuffix(lex: var Lexer): bool =
  ## Reads the suffix that follows a tag handle into `lex.evaluated`,
  ## decoding `%XX` escapes. The suffix ends at the first character that
  ## is neither a URI character nor `%`. Emits a `Suffix` token and
  ## continues with `afterToken`.
  lex.startToken()
  lex.evaluated.setLen(0)
  var curStart = lex.tokenStart - 1

  template flushPending() =
    # copy the not yet copied characters before lex.c, if there are any
    if curStart <= lex.source.bufpos - 2:
      lex.evaluated.add(lex.source.buf[curStart..lex.source.bufpos - 2])

  while lex.c in uriChars + {'%'}:
    if lex.c == '%':
      flushPending()
      lex.readHexSequence(2)
      curStart = lex.source.bufpos
    lex.advance()
  flushPending()
  lex.endToken()
  lex.cur = Token.Suffix
  lex.state = afterToken
  result = true
|
|
|
|
proc streamEnd(lex: var Lexer): bool =
  ## Emits a zero-width `StreamEnd` token at the current position. The
  ## state is not changed, so every further call yields `StreamEnd` again.
  lex.startToken()
  lex.endToken()
  lex.cur = Token.StreamEnd
  result = true