NimYAML/private/lexer.nim

1095 lines
38 KiB
Nim

# file must be included from yaml.nim and cannot compile on its own
type
Encoding = enum
Unsupported, ## Unsupported encoding
UTF8, ## UTF-8
UTF16LE, ## UTF-16 Little Endian
UTF16BE, ## UTF-16 Big Endian
UTF32LE, ## UTF-32 Little Endian
UTF32BE ## UTF-32 Big Endian
YamlLexerToken = enum
# separating tokens
tDirectivesEnd, tDocumentEnd, tStreamEnd,
# tokens only in directives
tTagDirective, tYamlDirective, tUnknownDirective,
tVersionPart, tTagURI,
tUnknownDirectiveParam,
# tokens in directives and content
tTagHandle, tComment,
# from here on tokens only in content
tLineStart,
# control characters
tColon, tDash, tQuestionmark, tComma, tOpeningBrace,
tOpeningBracket, tClosingBrace, tClosingBracket, tPipe, tGreater,
# block scalar header
tBlockIndentationIndicator, tPlus,
# scalar content
tScalar, tScalarPart,
# tags
tVerbatimTag, tTagSuffix,
# anchoring
tAnchor, tAlias,
# error reporting
tError
YamlLexerState = enum
# initial states (not started reading any token)
ylInitial, ylInitialSpaces, ylInitialUnknown, ylInitialContent,
ylDefineTagHandleInitial, ylDefineTagURIInitial, ylInitialInLine,
ylLineEnd, ylDirectiveLineEnd,
# directive reading states
ylDirective, ylDefineTagHandle, ylDefineTagURI, ylMajorVersion,
ylMinorVersion, ylUnknownDirectiveParam, ylDirectiveComment,
# scalar reading states
ylPlainScalar, ylPlainScalarNone, ylSingleQuotedScalar,
ylDoublyQuotedScalar, ylEscape, ylBlockScalar, ylBlockScalarHeader,
ylSpaceAfterPlainScalar, ylSpaceAfterQuotedScalar,
# indentation
ylIndentation,
# comments
ylComment,
# tags
ylTagHandle, ylTagSuffix, ylVerbatimTag,
# document separation
ylDashes, ylDots,
# anchoring
ylAnchor, ylAlias
YamlTypeHintState = enum
ythInitial,
ythF, ythFA, ythFAL, ythFALS, ythFALSE,
ythN, ythNU, ythNUL, ythNULL,
ythNO,
ythO, ythON,
ythOF, ythOFF,
ythT, ythTR, ythTRU, ythTRUE,
ythY, ythYE, ythYES,
ythPoint, ythPointI, ythPointIN, ythPointINF,
ythPointN, ythPointNA, ythPointNAN,
ythLowerFA, ythLowerFAL, ythLowerFALS,
ythLowerNU, ythLowerNUL,
ythLowerOF,
ythLowerTR, ythLowerTRU,
ythLowerYE,
ythPointLowerIN, ythPointLowerN, ythPointLowerNA,
ythMinus, yth0, ythInt, ythDecimal, ythNumE, ythNumEPlusMinus,
ythExponent, ythNone
YamlLexer = object of BaseLexer
indentations: seq[int]
encoding: Encoding
charlen: int
charoffset: int
content*: string # my.content of the last returned token.
line*, column*: int
typeHint*: YamlTypeHint
const
UTF8NextLine = toUTF8(Rune(0x85))
UTF8NonBreakingSpace = toUTF8(Rune(0xA0))
UTF8LineSeparator = toUTF8(Rune(0x2028))
UTF8ParagraphSeparator = toUTF8(Rune(0x2029))
proc detect_encoding(my: var YamlLexer) =
var numBomChars = 0
my.encoding = Unsupported
if my.bufpos == 3:
# BaseLexer already skipped UTF-8 BOM
my.encoding = UTF8
else:
case my.buf[0]
of '\0':
if my.buf[1] == '\0':
if my.buf[2] == '\0':
my.encoding = UTF32LE
elif my.buf[2] == '\xFE' and my.buf[3] == '\xFF':
my.encoding = UTF32BE
numBomChars = 4
else:
# this is probably not a unicode character stream,
# but we just use the next match in the table
my.encoding = UTF16BE
else:
# this is how a BOM-less UTF16BE input should actually look like
my.encoding = UTF16BE
of '\xFF':
case my.buf[1]
of '\xFE':
if my.buf[2] == '\0' and my.buf[3] == '\0':
my.encoding = UTF32LE
numBomChars = 4
else:
my.encoding = UTF16LE
numBomChars = 2
of '\0':
my.encoding = UTF16LE
else:
my.encoding = UTF8
of '\xFE':
case my.buf[1]
of '\xFF':
my.encoding = UTF16BE
numBomChars = 2
of '\0':
my.encoding = UTF16LE
else:
my.encoding = UTF8
else:
if my.buf[1] == '\0':
my.encoding = UTF16LE
else:
my.encoding = UTF8
inc(my.bufPos, numBomChars)
my.charlen = case my.encoding
of UTF8, Unsupported: 1
of UTF16LE, UTF16BE: 2
of UTF32LE, UTF32BE: 4
my.charoffset = case my.encoding
of UTF8, Unsupported, UTF16LE, UTF32LE: 0
of UTF16BE: 1
of UTF32BE: 3
proc open(my: var YamlLexer, input: Stream) =
lexbase.open(my, input)
my.indentations = newSeq[int]()
my.detect_encoding()
my.content = ""
my.line = 0
my.column = 0
template yieldToken(kind: YamlLexerToken) {.dirty.} =
when defined(yamlDebug):
if kind == tScalar:
echo "Lexer token: tScalar(\"", my.content, "\")"
else:
echo "Lexer token: ", kind
yield kind
my.content = ""
template yieldScalarPart() {.dirty.} =
case typeHintState
of ythNULL:
my.typeHint = yTypeNull
of ythTRUE, ythON, ythYES, ythY:
my.typeHint = yTypeBoolTrue
of ythFALSE, ythOFF, ythNO, ythN:
my.typeHint = yTypeBoolFalse
of ythInt, yth0:
my.typeHint = yTypeInteger
of ythDecimal, ythExponent:
my.typeHint = yTypeFloat
of ythPointINF:
my.typeHint = yTypeFloatInf
of ythPointNAN:
my.typeHint = yTypeFloatNaN
else:
my.typeHint = yTypeUnknown
when defined(yamlDebug):
echo "Lexer token: tScalarPart(\"", my.content, "\".", typeHintState,
")"
yield tScalarPart
my.content = ""
template yieldLexerError(message: string) {.dirty.} =
when defined(yamlDebug):
echo "Lexer error: " & message
my.content = message
yield tError
my.content = ""
template handleCR() {.dirty.} =
my.bufpos = lexbase.handleCR(my, my.bufpos + my.charoffset) + my.charlen -
my.charoffset - 1
my.line.inc()
curPos = 0
template handleLF() {.dirty.} =
my.bufpos = lexbase.handleLF(my, my.bufpos + my.charoffset) +
my.charlen - my.charoffset - 1
my.line.inc()
curPos = 0
template `or`(r: Rune, i: int): Rune =
cast[Rune](cast[int](r) or i)
macro typeHintStateMachine(c: untyped, content: untyped): stmt =
assert content.kind == nnkStmtList
result = newNimNode(nnkCaseStmt, content).add(copyNimNode(c))
for branch in content.children:
assert branch.kind == nnkOfBranch
var
charBranch = newNimNode(nnkOfBranch, branch)
i = 0
stateBranches = newNimNode(nnkCaseStmt, branch).add(
newIdentNode("typeHintState"))
while branch[i].kind != nnkStmtList:
charBranch.add(copyNimTree(branch[i]))
inc(i)
for rule in branch[i].children:
assert rule.kind == nnkInfix
assert ($rule[0].ident == "=>")
var stateBranch = newNimNode(nnkOfBranch, rule)
case rule[1].kind
of nnkBracket:
for item in rule[1].children:
stateBranch.add(item)
of nnkIdent:
stateBranch.add(rule[1])
else:
assert false
if rule[2].kind == nnkNilLit:
stateBranch.add(newStmtList(newNimNode(nnkDiscardStmt).add(
newEmptyNode())))
else:
stateBranch.add(newStmtList(newAssignment(
newIdentNode("typeHintState"), copyNimTree(rule[2]))))
stateBranches.add(stateBranch)
stateBranches.add(newNimNode(nnkElse).add(newStmtList(newAssignment(
newIdentNode("typeHintState"), newIdentNode("ythNone")),
newAssignment(newIdentNode("state"),
newIdentNode("ylPlainScalarNone")))))
charBranch.add(newStmtList(stateBranches))
result.add(charBranch)
result.add(newNimNode(nnkElse).add(newStmtList(newAssignment(
newIdentNode("typeHintState"), newIdentNode("ythNone")),
newAssignment(newIdentNode("state"),
newIdentNode("ylPlainScalarNone")))))
template advanceTypeHint(ch: char) {.dirty.} =
typeHintStateMachine ch:
of '.':
[yth0, ythInt] => ythDecimal
[ythInitial, ythMinus] => ythPoint
of '+': ythNumE => ythNumEPlusMinus
of '-':
ythInitial => ythMinus
ythNumE => ythNumEPlusMinus
of '0':
[ythInitial, ythMinus] => yth0
[ythNumE, ythNumEPlusMinus] => ythExponent
of '1'..'9':
[ythInitial, ythMinus] => ythInt
[ythNumE, ythNumEPlusMinus] => ythExponent
[ythInt, ythDecimal, ythExponent] => nil
of 'a':
ythF => ythLowerFA
ythPointN => ythPointNA
ythPointLowerN => ythPointLowerNA
of 'A':
ythF => ythFA
ythPointN => ythPointNA
of 'e':
[yth0, ythInt, ythDecimal] => ythNumE
ythLowerFALS => ythFALSE
ythLowerTRU => ythTRUE
ythY => ythLowerYE
of 'E':
[yth0, ythInt, ythDecimal] => ythNumE
ythFALS => ythFALSE
ythTRU => ythTRUE
ythY => ythYE
of 'f':
ythInitial => ythF
ythO => ythLowerOF
ythLowerOF => ythOFF
ythPointLowerIN => ythPointINF
of 'F':
ythInitial => ythF
ythO => ythOF
ythOF => ythOFF
ythPointIN => ythPointINF
of 'i', 'I': ythPoint => ythPointI
of 'l':
ythLowerNU => ythLowerNUL
ythLowerNUL => ythNULL
ythLowerFA => ythLowerFAL
of 'L':
ythNU => ythNUL
ythNUL => ythNULL
ythFA => ythFAL
of 'n':
ythInitial => ythN
ythO => ythON
ythPoint => ythPointLowerN
ythPointI => ythPointLowerIN
ythPointLowerNA => ythPointNAN
of 'N':
ythInitial => ythN
ythO => ythON
ythPoint => ythPointN
ythPointI => ythPointIN
ythPointNA => ythPointNAN
of 'o', 'O':
ythInitial => ythO
ythN => ythNO
of 'r': ythT => ythLowerTR
of 'R': ythT => ythTR
of 's':
ythLowerFAL => ythLowerFALS
ythLowerYE => ythYES
of 'S':
ythFAL => ythFALS
ythYE => ythYES
of 't', 'T': ythInitial => ythT
of 'u':
ythN => ythLowerNU
ythLowerTR => ythLowerTRU
of 'U':
ythN => ythNU
ythTR => ythTRU
of 'y', 'Y': ythInitial => ythY
iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
var
# the following three values are used for parsing escaped unicode chars
unicodeChar: Rune = cast[Rune](0)
escapeLength = 0
expectedEscapeLength = 0
trailingSpace = ""
# used to temporarily store whitespace after a plain scalar
lastSpecialChar: char = '\0'
# stores chars that behave differently dependent on the following
# char. handling will be deferred to next loop iteration.
flowDepth = 0
# Lexer must know whether it parses block or flow style. Therefore,
# it counts the number of open flow arrays / maps here
state: YamlLexerState = ylInitial # lexer state
typeHintState: YamlTypeHintState = ythInitial
# for giving type hints of plain scalars
lastIndentationLength = 0
# after parsing the indentation of the line, this will hold the
# indentation length of the current line. Needed for checking where
# a block scalar ends.
blockScalarIndentation = -1
# when parsing a block scalar, this will be set to the indentation
# of the line that starts the flow scalar.
curPos = 0
while true:
let c = my.buf[my.bufpos + my.charoffset]
case state
of ylInitial:
case c
of '%':
state = ylDirective
continue
of ' ', '\t':
state = ylInitialSpaces
continue
of '#':
state = ylDirectiveComment
else:
state = ylInitialContent
continue
of ylInitialSpaces:
case c
of ' ', '\t':
my.content.add(c)
of '#':
my.content = ""
state = ylDirectiveComment
of EndOfFile, '\r', '\x0A':
state = ylDirectiveLineEnd
continue
else:
state = ylIndentation
continue
of ylInitialContent:
case c
of '-':
my.column = 0
state = ylDashes
continue
of '.':
yieldToken(tLineStart)
my.column = 0
state = ylDots
continue
else:
state = ylIndentation
continue
of ylDashes:
case c
of '-':
my.content.add(c)
of ' ', '\t', '\r', '\x0A', EndOfFile:
case my.content.len
of 3:
yieldToken(tDirectivesEnd)
state = ylInitialInLine
of 1:
my.content = ""
yieldToken(tLineStart)
lastSpecialChar = '-'
state = ylInitialInLine
else:
let tmp = my.content
my.content = ""
yieldToken(tLineStart)
my.content = tmp
my.column = curPos
state = ylPlainScalarNone
typeHintState = ythNone
continue
else:
let tmp = my.content
my.content = ""
yieldToken(tLineStart)
my.content = tmp
if my.content.len == 1:
typeHintState = ythMinus
state = ylPlainScalar
else:
typeHintState = ythNone
state = ylPlainScalarNone
continue
of ylDots:
case c
of '.':
my.content.add(c)
of ' ', '\t', '\r', '\x0A', EndOfFile:
case my.content.len
of 3:
yieldToken(tDocumentEnd)
state = ylDirectiveLineEnd
else:
state = ylPlainScalarNone
typeHintState = ythNone
continue
else:
state = ylPlainScalarNone
typeHintState = ythNone
continue
of ylDirectiveLineEnd:
case c
of '\r':
handleCR()
state = ylInitial
continue
of '\x0A':
handleLF()
state = ylInitial
continue
of EndOfFile:
yieldToken(tStreamEnd)
break
of ' ', '\t':
discard
of '#':
state = ylDirectiveComment
else:
yieldLexerError("Unexpected content at end of directive: " & c)
of ylLineEnd:
case c
of '\r':
handleCR()
of '\x0A':
handleLF()
of EndOfFile:
yieldToken(tStreamEnd)
break
else:
yieldLexerError("Internal error: Unexpected char at line end: " & c)
state = ylInitialContent
continue
of ylSingleQuotedScalar:
if lastSpecialChar != '\0':
# ' is the only special char
case c
of '\'':
my.content.add(c)
lastSpecialChar = '\0'
of EndOfFile, '\r', '\x0A':
yieldToken(tScalar)
lastSpecialChar = '\0'
state = ylLineEnd
continue
else:
yieldToken(tScalar)
lastSpecialChar = '\0'
state = ylSpaceAfterQuotedScalar
continue
else:
case c
of '\'':
lastSpecialChar = c
of EndOfFile:
yieldLexerError("Unterminated single quoted string")
yieldToken(tStreamEnd)
break
else:
my.content.add(c)
of ylDoublyQuotedScalar:
case c
of '"':
yieldToken(tScalar)
state = ylSpaceAfterQuotedScalar
of EndOfFile:
yieldLexerError("Unterminated doubly quoted string")
yieldToken(tStreamEnd)
break
of '\\':
state = ylEscape
escapeLength = 0
of '\r':
my.content.add("\x0A")
handleCR()
of '\x0A':
my.content.add(c)
handleLF()
else:
my.content.add(c)
of ylEscape:
if escapeLength == 0:
expectedEscapeLength = 0
case c
of EndOfFile:
yieldLexerError("Unterminated doubly quoted string")
of '0': my.content.add('\0')
of 'a': my.content.add('\x07')
of 'b': my.content.add('\x08')
of '\t', 't': my.content.add('\t')
of 'n': my.content.add('\x0A')
of 'v': my.content.add('\v')
of 'f': my.content.add('\f')
of 'r': my.content.add('\r')
of 'e': my.content.add('\e')
of ' ': my.content.add(' ')
of '"': my.content.add('"')
of '/': my.content.add('/')
of '\\': my.content.add('\\')
of 'N': my.content.add(UTF8NextLine)
of '_': my.content.add(UTF8NonBreakingSpace)
of 'L': my.content.add(UTF8LineSeparator)
of 'P': my.content.add(UTF8ParagraphSeparator)
of 'x': unicodeChar = cast[Rune](0); expectedEscapeLength = 3
of 'u': unicodeChar = cast[Rune](0); expectedEscapeLength = 5
of 'U': unicodeChar = cast[Rune](0); expectedEscapeLength = 9
else:
yieldLexerError("Unsupported escape sequence: \\" & c)
if expectedEscapeLength == 0: state = ylDoublyQuotedScalar
else:
let digitPosition = expectedEscapeLength - escapeLength - 1
case c
of EndOFFile:
yieldLexerError("Unterminated escape sequence")
state = ylLineEnd
continue
of '0' .. '9':
unicodeChar = unicodechar or
(cast[int](c) - 0x30) shl (digitPosition * 4)
of 'A' .. 'F':
unicodeChar = unicodechar or
(cast[int](c) - 0x37) shl (digitPosition * 4)
of 'a' .. 'f':
unicodeChar = unicodechar or
(cast[int](c) - 0x57) shl (digitPosition * 4)
else:
yieldLexerError("unsupported char in unicode escape sequence: " &
c)
escapeLength = 0
state = ylDoublyQuotedScalar
continue
inc(escapeLength)
if escapeLength == expectedEscapeLength and escapeLength > 0:
my.content.add(toUTF8(unicodeChar))
state = ylDoublyQuotedScalar
of ylSpaceAfterQuotedScalar:
case c
of ' ', '\t':
trailingSpace.add(c)
of '#':
if trailingSpace.len > 0:
yieldLexerError("Missing space before comment start")
state = ylComment
trailingSpace = ""
else:
trailingSpace = ""
state = ylInitialInLine
continue
of ylPlainScalar:
case c
of EndOfFile, '\r', '\x0A':
yieldScalarPart()
state = ylLineEnd
continue
of ':':
lastSpecialChar = c
state = ylSpaceAfterPlainScalar
of ' ':
state = ylSpaceAfterPlainScalar
continue
of ',':
if flowDepth > 0:
lastSpecialChar = c
state = ylSpaceAfterPlainScalar
else:
my.content.add(c)
state = ylPlainScalarNone
typeHintState = ythNone
of '[', ']', '{', '}':
yieldScalarPart()
state = ylInitialInLine
continue
else:
advanceTypeHint(c)
my.content.add(c)
of ylPlainScalarNone:
case c
of EndOfFile, '\r', '\x0A':
yieldScalarPart()
state = ylLineEnd
continue
of ':':
lastSpecialChar = c
state = ylSpaceAfterPlainScalar
of ' ':
state = ylSpaceAfterPlainScalar
continue
of ',':
if flowDepth > 0:
lastSpecialChar = c
state = ylSpaceAfterPlainScalar
else:
my.content.add(c)
of '[', ']', '{', '}':
yieldScalarPart()
state = ylInitialInLine
continue
else:
my.content.add(c)
of ylSpaceAfterPlainScalar:
if lastSpecialChar != '\0':
case c
of ' ', '\t', EndOfFile, '\r', '\x0A':
yieldScalarPart()
state = ylInitialInLine
else:
my.content.add(trailingSpace)
my.content.add(lastSpecialChar)
lastSpecialChar = '\0'
trailingSpace = ""
state = ylPlainScalarNone
typeHintState = ythNone
continue
case c
of EndOfFile, '\r', '\x0A':
trailingSpace = ""
yieldScalarPart()
state = ylLineEnd
continue
of ' ', '\t':
trailingSpace.add(c)
of ',':
if flowDepth > 0:
lastSpecialChar = c
else:
my.content.add(trailingSpace)
my.content.add(c)
trailingSpace = ""
state = ylPlainScalarNone
typeHintState = ythNone
of ':', '#':
lastSpecialChar = c
of '[', ']', '{', '}':
yieldScalarPart()
trailingSpace = ""
state = ylInitialInLine
continue
else:
my.content.add(trailingSpace)
my.content.add(c)
trailingSpace = ""
state = ylPlainScalarNone
typeHintState = ythNone
of ylInitialInLine:
if lastSpecialChar != '\0':
my.column = curPos - 1
case c
of ' ', '\t', '\r', '\x0A', EndOfFile:
case lastSpecialChar
of '#':
my.content = "#"
state = ylComment
of ':':
yieldToken(tColon)
of '?':
yieldToken(tQuestionmark)
of '-':
yieldToken(tDash)
of ',':
yieldToken(tComma)
of '!':
my.content = "!"
yieldToken(tTagHandle)
my.content = ""
yieldToken(tTagSuffix)
else:
yieldLexerError("Unexpected special char: \"" &
lastSpecialChar & "\"")
lastSpecialChar = '\0'
elif lastSpecialChar == '!':
case c
of '<':
state = ylVerbatimTag
lastSpecialChar = '\0'
my.bufpos += my.charlen
else:
state = ylTagHandle
my.content = "!"
lastSpecialChar = '\0'
my.column = curPos - 1
else:
my.content.add(lastSpecialChar)
advanceTypeHint(lastSpecialChar)
lastSpecialChar = '\0'
my.column = curPos - 1
state = ylPlainScalar
typeHintState = ythInitial
continue
case c
of '\r', '\x0A', EndOfFile:
state = ylLineEnd
continue
of ',':
if flowDepth > 0:
yieldToken(tComma)
else:
my.content = "" & c
my.column = curPos
state = ylPlainScalar
typeHintState = ythInitial
advanceTypeHint(c)
of '[':
inc(flowDepth)
yieldToken(tOpeningBracket)
of '{':
inc(flowDepth)
yieldToken(tOpeningBrace)
of ']':
yieldToken(tClosingBracket)
if flowDepth > 0:
inc(flowDepth, -1)
of '}':
yieldToken(tClosingBrace)
if flowDepth > 0:
inc(flowDepth, -1)
of '#':
lastSpecialChar = '#'
of '"':
my.column = curPos
state = ylDoublyQuotedScalar
of '\'':
my.column = curPos
state = ylSingleQuotedScalar
of '!':
my.column = curPos
lastSpecialChar = '!'
of '&':
my.column = curPos
state = ylAnchor
of '*':
my.column = curPos
state = ylAlias
of ' ':
discard
of '-':
if flowDepth == 0:
lastSpecialChar = '-'
else:
my.content = "" & c
my.column = curPos
state = ylPlainScalar
typeHintState = ythInitial
advanceTypeHint(c)
of '?', ':':
my.column = curPos
lastSpecialChar = c
of '|':
yieldToken(tPipe)
state = ylBlockScalarHeader
of '>':
yieldToken(tGreater)
state = ylBlockScalarHeader
of '\t':
discard
else:
my.content = "" & c
my.column = curPos
state = ylPlainScalar
typeHintState = ythInitial
advanceTypeHint(c)
of ylComment, ylDirectiveComment:
case c
of EndOfFile, '\r', '\x0A':
yieldToken(tComment)
case state
of ylComment:
state = ylLineEnd
of ylDirectiveComment:
state = ylDirectiveLineEnd
else:
yieldLexerError("Should never happen")
continue
else:
my.content.add(c)
of ylIndentation:
case c
of EndOfFile, '\r', '\x0A':
lastIndentationLength = my.content.len
yieldToken(tLineStart)
state = ylLineEnd
continue
of ' ':
my.content.add(' ')
else:
lastIndentationLength = my.content.len
yieldToken(tLineStart)
if blockScalarIndentation != -1:
if lastIndentationLength <= blockScalarIndentation:
blockScalarIndentation = -1
else:
state = ylBlockScalar
continue
state = ylInitialInLine
continue
of ylTagHandle:
case c
of '!':
my.content.add(c)
yieldToken(tTagHandle)
state = ylTagSuffix
of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '-':
my.content.add(c)
of ' ', '\t', EndOfFile, '\r', '\x0A':
var suffix = my.content[1..^1]
my.content = "!"
yieldToken(tTagHandle)
my.content = suffix
yieldToken(tTagSuffix)
state = ylInitialInLine
continue
else:
yieldLexerError("Invalid character in tag handle: " & c)
my.content = ""
state = ylInitialInLine
of ylTagSuffix:
case c
of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':', '@',
'&', '=', '+', '$', ',', '_', '.', '~', '*', '\'', '(', ')':
my.content.add(c)
of ' ', '\t', EndOfFile, '\r', '\x0A':
yieldToken(tTagSuffix)
state = ylInitialInLine
continue
else:
yieldLexerError("Invalid character in tag suffix: " & c)
state = ylInitialInLine
of ylVerbatimTag:
case c
of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':', '@',
'&', '=', '+', '$', ',', '_', '.', '~', '*', '\'', '(', ')':
my.content.add(c)
of '>':
yieldToken(tVerbatimTag)
state = ylInitialInLine
of EndOfFile, '\r', '\x0A':
yieldLexerError("Unfinished verbatim tag")
state = ylLineEnd
continue
else:
yieldLexerError("Invalid character in tag URI: " & c)
my.content = ""
state = ylInitialInLine
of ylDirective:
case c
of ' ', '\t', '\r', '\x0A', EndOfFile:
if my.content == "%YAML":
yieldToken(tYamlDirective)
state = ylMajorVersion
elif my.content == "%TAG":
yieldToken(tTagDirective)
state = ylDefineTagHandleInitial
else:
yieldToken(tUnknownDirective)
state = ylInitialUnknown
if c == EndOfFile:
continue
else:
my.content.add(c)
of ylInitialUnknown:
case c
of ' ', '\t':
discard
of '\r', '\x0A', EndOfFile:
state = ylDirectiveLineEnd
continue
of '#':
state = ylDirectiveComment
continue
else:
state = ylUnknownDirectiveParam
continue
of ylUnknownDirectiveParam:
case c
of '\r', '\x0A', EndOfFile, ' ', '\t':
yieldToken(tUnknownDirectiveParam)
state = ylInitialUnknown
continue
else:
my.content.add(c)
of ylMajorVersion:
case c
of '0' .. '9':
my.content.add(c)
of '.':
yieldToken(tVersionPart)
state = ylMinorVersion
of EndOfFile, '\r', '\x0A', ' ', '\t':
yieldLexerError("Missing YAML minor version.")
state = ylDirectiveLineEnd
continue
else:
yieldLexerError("Invalid character in YAML version: " & c)
state = ylInitialUnknown
of ylMinorVersion:
case c
of '0' .. '9':
my.content.add(c)
of EndOfFile, '\r', '\x0A', ' ', '\t':
yieldToken(tVersionPart)
state = ylDirectiveLineEnd
continue
else:
yieldLexerError("Invalid character in YAML version: " & c)
state = ylInitialUnknown
of ylDefineTagHandleInitial:
case c
of ' ', '\t':
discard
of EndOfFile, '\r', '\x0A':
yieldLexerError("Unfinished %TAG directive")
state = ylDirectiveLineEnd
continue
of '!':
my.content.add(c)
state = ylDefineTagHandle
else:
yieldLexerError("Unexpected character in %TAG directive: " & c)
state = ylInitialInLine
of ylDefineTagHandle:
case c
of '!':
my.content.add(c)
yieldToken(tTagHandle)
state = ylDefineTagURIInitial
of 'a' .. 'z', 'A' .. 'Z', '-':
my.content.add(c)
of EndOfFile, '\r', '\x0A':
yieldLexerError("Unfinished %TAG directive")
state = ylDirectiveLineEnd
continue
else:
yieldLexerError("Unexpected char in %TAG directive: " & c)
state = ylInitialInLine
of ylDefineTagURIInitial:
case c
of '\t', ' ':
my.content.add(c)
of '\x0A', '\r', EndOfFile:
yieldLexerError("Unfinished %TAG directive")
state = ylDirectiveLineEnd
continue
else:
if my.content.len == 0:
yieldLexerError("Missing whitespace in %TAG directive")
my.content = ""
state = ylDefineTagURI
continue
of ylDefineTagURI:
case c
of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':', '@',
'&', '=', '+', '$', ',', '_', '.', '~', '*', '\'', '(', ')':
my.content.add(c)
of '\x0A', '\r', EndOfFile, ' ', '\t':
yieldToken(tTagURI)
state = ylDirectiveLineEnd
continue
else:
yieldLexerError("Invalid URI character: " & c)
state = ylInitialInLine
continue
of ylBlockScalarHeader:
case c
of '0' .. '9':
my.content = "" & c
yieldToken(tBlockIndentationIndicator)
of '+':
yieldToken(tPlus)
of '-':
yieldToken(tDash)
of '\r', '\x0A', EndOfFile:
blockScalarIndentation = lastIndentationLength
state = ylLineEnd
continue
else:
yieldLexerError("Unexpected character in block scalar header: " & c)
of ylBlockScalar:
case c
of EndOfFile, '\r', '\x0A':
yieldScalarPart()
state = ylLineEnd
continue
else:
my.content.add(c)
of ylAnchor:
case c
of EndOfFile, '\r', '\x0A', ' ', '\t', '{', '}', '[', ']':
yieldToken(tAnchor)
state = ylInitialInLine
continue
else:
my.content.add(c)
of ylAlias:
if lastSpecialChar != '\0':
case c
of EndOfFile, '\r', '\x0A', ' ', '\t', '{', '}', '[', ']':
yieldToken(tAlias)
state = ylInitialInLine
continue
else:
my.content.add(lastSpecialChar)
lastSpecialChar = '\0'
case c
of EndOfFile, '\r', '\x0A', ' ', '\t', '{', '}', '[', ']':
yieldToken(tAlias)
state = ylInitialInLine
continue
of ':':
lastSpecialChar = ':'
of ',':
if flowDepth > 0:
yieldToken(tAlias)
state = ylInitialInLine
continue
my.content.add(c)
else:
my.content.add(c)
my.bufpos += my.charlen
curPos.inc