NimYAML/private/lex.nim

847 lines
25 KiB
Nim
Raw Normal View History

2016-09-10 08:30:40 +00:00
# NimYAML - YAML implementation in Nim
# (c) Copyright 2015 Felix Krause
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
2016-09-10 11:38:42 +00:00
import lexbase, streams, strutils, unicode
when defined(yamlDebug):
import terminal
export terminal
2016-09-10 08:30:40 +00:00
type
  StringSource* = object
    ## Lexer input held completely in memory as a string.
    src: string            # the text being lexed
    pos: int               # index of the current character in `src`
    line, lineStart: int   # current line number / index where that line begins

  SourceProvider* = concept c
    advance(c) is char
    lexCR(c)
    lexLF(c)

  YamlLexerObj* = object
    ## State of a lexer; the exported fields carry the data of the token that
    ## was produced last.
    cur*: LexerToken
    # content for ltScalarPart, ltQuotedScalar, ltYamlVersion, ltTagShorthand,
    # ltTagUri, ltLiteralTag, ltTagHandle, ltAnchor, ltAlias
    buf*: string not nil
    # data for ltIndentation
    indentation*: int
    # data for ltBlockScalarHeader
    moreIndented*, folded*: bool
    chomp*: ChompType
    # data for ltTagHandle
    shorthandEnd*: int
    # internals
    source: pointer        # untyped pointer to a BaseLexer or a StringSource
    inFlow: bool
    literalEndIndent: int
    nextState, lineStartState, inlineState, insideLineImpl, insideDocImpl:
      LexerState
    blockScalarIndent: int
    c: char                # character at the current source position

  YamlLexer* = ref YamlLexerObj

  LexerState = proc(lex: YamlLexer): bool

  LexerToken* = enum
    ltYamlDirective, ltYamlVersion, ltTagDirective, ltTagShorthand,
    ltTagUri, ltUnknownDirective, ltUnknownDirectiveParams, ltEmptyLine,
    ltDirectivesEnd, ltDocumentEnd, ltStreamEnd, ltIndentation, ltQuotedScalar,
    ltScalarPart, ltBlockScalarHeader, ltSeqItemInd, ltMapKeyInd, ltMapValInd,
    ltBraceOpen, ltBraceClose, ltBracketOpen, ltBracketClose, ltComma,
    ltLiteralTag, ltTagHandle, ltAnchor, ltAlias

  YamlLexerError* = object of Exception
    ## Raised by the lexer states; carries the source position and a rendering
    ## of the offending line.
    line*, column*: int
    lineContent*: string

  ChompType* = enum
    ctKeep, ctClip, ctStrip
2016-09-10 08:30:40 +00:00
2016-09-11 09:28:05 +00:00
# consts
2016-09-10 08:30:40 +00:00
const
  # character classes used by the lexer states
  space = {' ', '\t'}
  lineEnd = {'\l', '\c', EndOfFile}
  spaceOrLineEnd = space + lineEnd
  digits = {'0' .. '9'}
  flowIndicators = {'[', ']', '{', '}', ','}
  uriChars = {'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':',
      '@', '&', '-', '=', '+', '$', '_', '.', '~', '*', '\'', '(', ')'}

  # UTF-8 encodings emitted for the \N, \_, \L and \P escapes
  UTF8NextLine = toUTF8(Rune(0x85))
  UTF8NonBreakingSpace = toUTF8(Rune(0xA0))
  UTF8LineSeparator = toUTF8(Rune(0x2028))
  UTF8ParagraphSeparator = toUTF8(Rune(0x2029))

  # marker for "block scalar indentation not determined yet"
  UnknownIndentation = low(int)
2016-09-10 08:30:40 +00:00
2016-09-11 09:28:05 +00:00
# lexer backend implementations
template blSource(lex: YamlLexer): var BaseLexer =
  ## Views the untyped `lex.source` pointer as a `BaseLexer`.
  cast[ptr BaseLexer](lex.source)[]

template sSource(lex: YamlLexer): var StringSource =
  ## Views the untyped `lex.source` pointer as a `StringSource`.
  cast[ptr StringSource](lex.source)[]
2016-09-10 08:30:40 +00:00
2016-09-11 09:28:05 +00:00
proc advance(lex: YamlLexer, t: typedesc[BaseLexer], step: int = 1) {.inline.} =
  ## Moves the `BaseLexer` buffer position forward by `step` and caches the
  ## character found there in `lex.c`.
  inc(lex.blSource.bufpos, step)
  lex.c = lex.blSource.buf[lex.blSource.bufpos]
2016-09-10 08:30:40 +00:00
2016-09-11 09:28:05 +00:00
proc advance(lex: YamlLexer, t: typedesc[StringSource], step: int = 1)
    {.inline.} =
  ## Moves the string position forward by `step` and caches the character
  ## found there in `lex.c`; positions at or behind the end of the string
  ## yield `EndOfFile`.
  inc(lex.sSource.pos, step)
  lex.c =
    if lex.sSource.pos < lex.sSource.src.len: lex.sSource.src[lex.sSource.pos]
    else: EndOfFile
2016-09-10 08:30:40 +00:00
2016-09-11 09:28:05 +00:00
template lexCR(lex: YamlLexer, t: typedesc[BaseLexer]) =
  ## Lets the `BaseLexer` process the carriage return at the current position
  ## and reloads `lex.c` from the position it reports.
  let afterBreak = handleCR(lex.blSource, lex.blSource.bufpos)
  lex.blSource.bufpos = afterBreak
  lex.c = lex.blSource.buf[afterBreak]
2016-09-10 08:30:40 +00:00
2016-09-11 09:28:05 +00:00
template lexCR(lex: YamlLexer, t: typedesc[StringSource]) =
  ## Consumes a carriage return (and a directly following line feed, if any)
  ## from the string source, starts a new line at the resulting position and
  ## loads the character found there into `lex.c`.
  ##
  ## Positions at or behind the end of the string yield `EndOfFile`, the same
  ## way `advance` handles them, instead of indexing the string out of bounds
  ## when the line break is the last thing in the input.
  lex.sSource.pos.inc()
  if lex.sSource.pos < lex.sSource.src.len and
      lex.sSource.src[lex.sSource.pos] == '\l':
    lex.sSource.pos.inc()
  lex.sSource.lineStart = lex.sSource.pos
  lex.sSource.line.inc()
  if lex.sSource.pos >= lex.sSource.src.len: lex.c = EndOfFile
  else: lex.c = lex.sSource.src[lex.sSource.pos]
2016-09-10 10:38:03 +00:00
2016-09-11 09:28:05 +00:00
template lexLF(lex: YamlLexer, t: typedesc[BaseLexer]) =
  ## Lets the `BaseLexer` process the line feed at the current position and
  ## reloads `lex.c` from the position it reports.
  let afterBreak = handleLF(lex.blSource, lex.blSource.bufpos)
  lex.blSource.bufpos = afterBreak
  lex.c = lex.blSource.buf[afterBreak]
2016-09-10 10:38:03 +00:00
2016-09-11 09:28:05 +00:00
template lexLF(lex: YamlLexer, t: typedesc[StringSource]) =
  ## Consumes a line feed from the string source, starts a new line at the
  ## resulting position and loads the character found there into `lex.c`.
  ##
  ## Positions at or behind the end of the string yield `EndOfFile`, the same
  ## way `advance` handles them, instead of indexing the string out of bounds
  ## when the line feed is the last character of the input.
  lex.sSource.pos.inc()
  lex.sSource.lineStart = lex.sSource.pos
  lex.sSource.line.inc()
  if lex.sSource.pos >= lex.sSource.src.len: lex.c = EndOfFile
  else: lex.c = lex.sSource.src[lex.sSource.pos]
2016-09-10 10:38:03 +00:00
2016-09-11 09:28:05 +00:00
template lineNumber(lex: YamlLexer, t: typedesc[BaseLexer]): int =
  ## Line number as tracked by the `BaseLexer`.
  blSource(lex).lineNumber

template lineNumber(lex: YamlLexer, t: typedesc[StringSource]): int =
  ## Line number as tracked by the `StringSource`.
  sSource(lex).line
2016-09-10 10:38:03 +00:00
2016-09-11 09:28:05 +00:00
template columnNumber(lex: YamlLexer, t: typedesc[BaseLexer]): int =
  ## Column of the current buffer position: the value reported by
  ## `getColNumber`, plus one.
  1 + getColNumber(lex.blSource, lex.blSource.bufpos)

template columnNumber(lex: YamlLexer, t: typedesc[StringSource]): int =
  ## Column of the current position: its distance from the start of the
  ## current line, plus one.
  1 + lex.sSource.pos - lex.sSource.lineStart
template currentLine(lex: YamlLexer, t: typedesc[BaseLexer]): string =
  ## Current line as rendered by `BaseLexer.getCurrentLine`, with the position
  ## marker requested.
  getCurrentLine(lex.blSource, true)
template currentLine(lex: YamlLexer, t: typedesc[StringSource]): string =
  ## Renders the current line of the string source followed by a second line
  ## that holds a `^` marker under the current column.
  ##
  ## The scan for the end of the line also stops at the end of the string, so
  ## a last line without a trailing line break does not index out of bounds.
  var text = ""
  var i = lex.sSource.lineStart
  while i < lex.sSource.src.len and lex.sSource.src[i] notin lineEnd:
    text.add(lex.sSource.src[i])
    inc(i)
  text.add("\n" & spaces(lex.columnNumber(t) - 1) & "^\n")
  text
2016-09-11 09:28:05 +00:00
proc nextIsPlainSafe(lex: YamlLexer, t: typedesc[BaseLexer], inFlow: bool):
    bool {.inline.} =
  ## Classifies the character after the current one: `false` for whitespace,
  ## a line end or end of input; `not inFlow` for a flow indicator; `true`
  ## for anything else.
  let following = lex.blSource.buf[lex.blSource.bufpos + 1]
  if following in spaceOrLineEnd: result = false
  elif following in flowIndicators: result = not inFlow
  else: result = true
proc nextIsPlainSafe(lex: YamlLexer, t: typedesc[StringSource],
                     inFlow: bool): bool {.inline.} =
  ## Classifies the character after the current one: `false` for whitespace,
  ## a line end or end of input; `not inFlow` for a flow indicator; `true`
  ## for anything else.
  ##
  ## When the current character is the last one of the string, the following
  ## character is treated as `EndOfFile` instead of indexing out of bounds.
  let nextPos = lex.sSource.pos + 1
  let following =
    if nextPos < lex.sSource.src.len: lex.sSource.src[nextPos]
    else: EndOfFile
  case following
  of spaceOrLineEnd: result = false
  of flowIndicators: result = not inFlow
  else: result = true
2016-09-11 10:52:24 +00:00
proc mark(lex: YamlLexer, t: typedesc[BaseLexer]): int =
  ## Returns the current buffer position for later use with `afterMark`.
  result = lex.blSource.bufpos

proc mark(lex: YamlLexer, t: typedesc[StringSource]): int =
  ## Returns the current string position for later use with `afterMark`.
  result = lex.sSource.pos

proc afterMark(lex: YamlLexer, t: typedesc[BaseLexer], m: int): int =
  ## Difference between the current buffer position and the mark `m`.
  result = lex.blSource.bufpos - m

proc afterMark(lex: YamlLexer, t: typedesc[StringSource], m: int): int =
  ## Difference between the current string position and the mark `m`.
  result = lex.sSource.pos - m
2016-09-11 09:28:05 +00:00
# lexer states
# Forward declarations: the states reference each other, so they are all
# announced before the first implementation. `T` selects the source backend.

# directive handling
proc outsideDoc[T](lex: YamlLexer): bool
proc yamlVersion[T](lex: YamlLexer): bool
proc tagShorthand[T](lex: YamlLexer): bool
proc tagUri[T](lex: YamlLexer): bool
proc unknownDirParams[T](lex: YamlLexer): bool
proc expectLineEnd[T](lex: YamlLexer): bool

# document markers and line starts
proc possibleDirectivesEnd[T](lex: YamlLexer): bool
proc possibleDocumentEnd[T](lex: YamlLexer): bool
proc afterSeqInd[T](lex: YamlLexer): bool
proc insideDoc[T](lex: YamlLexer): bool {.locks: 0.}
proc insideLine[T](lex: YamlLexer): bool

# scalars
proc plainScalarPart[T](lex: YamlLexer): bool
proc blockScalarHeader[T](lex: YamlLexer): bool
proc blockScalar[T](lex: YamlLexer): bool

# node properties and stream end
proc tagHandle[T](lex: YamlLexer): bool
proc anchor[T](lex: YamlLexer): bool
proc alias[T](lex: YamlLexer): bool
proc streamEnd(lex: YamlLexer): bool
2016-09-11 09:28:05 +00:00
# implementation
template debug(message: string) {.dirty.} =
  ## Writes `message` to stdout in blue when compiled with `-d:yamlDebug`;
  ## expands to nothing otherwise. I/O errors while writing are ignored.
  when defined(yamlDebug):
    try:
      styledWriteLine(stdout, fgBlue, message)
    except IOError:
      discard
proc generateError[T](lex: YamlLexer, message: string):
    ref YamlLexerError {.raises: [].} =
  ## Builds (but does not raise) a `YamlLexerError` with `message`, annotated
  ## with the current line, column and line content of the source of kind `T`.
  let err = newException(YamlLexerError, message)
  err.line = lineNumber(lex, T)
  err.column = columnNumber(lex, T)
  err.lineContent = currentLine(lex, T)
  result = err
2016-09-10 08:30:40 +00:00
2016-09-11 09:28:05 +00:00
proc directiveName(lex: YamlLexer, t: typedesc) =
  ## Appends characters to `lex.buf` until whitespace, a line end or the end
  ## of input is reached.
  while true:
    if lex.c in spaceOrLineEnd: break
    lex.buf.add(lex.c)
    lex.advance(t)
2016-09-10 08:30:40 +00:00
proc yamlVersion[T](lex: YamlLexer): bool =
  ## Lexes the `<digits>.<digits>` argument of a %YAML directive into
  ## `lex.buf` and yields `ltYamlVersion`. The next state is `expectLineEnd`.
  ## Raises `YamlLexerError` when the version is malformed or not followed by
  ## whitespace / a line end.
  const errMsg = "Invalid YAML version number"

  template digitRun() =
    # one or more digits, copied to the buffer
    if lex.c notin digits: raise generateError[T](lex, errMsg)
    while lex.c in digits:
      lex.buf.add(lex.c)
      lex.advance(T)

  debug("lex: yamlVersion")
  while lex.c in space: lex.advance(T)
  digitRun()
  if lex.c != '.': raise generateError[T](lex, errMsg)
  lex.buf.add('.')
  lex.advance(T)
  digitRun()
  if lex.c notin spaceOrLineEnd: raise generateError[T](lex, errMsg)
  lex.cur = ltYamlVersion
  lex.nextState = expectLineEnd[T]
  result = true
2016-09-10 08:30:40 +00:00
proc tagShorthand[T](lex: YamlLexer): bool =
  ## Lexes the handle argument of a %TAG directive (`!`, `!!` or `!name!`)
  ## into `lex.buf` and yields `ltTagShorthand`. The next state is `tagUri`.
  debug("lex: tagShorthand")
  while lex.c in space: lex.advance(T)
  if lex.c != '!':
    raise generateError[T](lex, "Tag shorthand must start with a '!'")
  lex.buf.add(lex.c)
  lex.advance(T)
  if lex.c notin spaceOrLineEnd:
    # anything beyond the primary handle must be closed by a second '!'
    while lex.c != '!':
      if lex.c in {'a' .. 'z', 'A' .. 'Z', '0' .. '9', '-'}:
        lex.buf.add(lex.c)
        lex.advance(T)
      else:
        raise generateError[T](lex, "Illegal character in tag shorthand")
    lex.buf.add(lex.c)
    lex.advance(T)
    if lex.c notin spaceOrLineEnd:
      raise generateError[T](lex, "Missing space after tag shorthand")
  lex.cur = ltTagShorthand
  lex.nextState = tagUri[T]
  result = true
2016-09-10 08:30:40 +00:00
proc tagUri[T](lex: YamlLexer): bool =
  ## Lexes the prefix argument of a %TAG directive into `lex.buf` and yields
  ## `ltTagUri`. The next state is `expectLineEnd`.
  debug("lex: tagUri")
  while lex.c in space: lex.advance(T)
  if lex.c == '!':
    # a leading '!' is allowed in addition to the URI characters
    lex.buf.add(lex.c)
    lex.advance(T)
  while lex.c notin spaceOrLineEnd:
    if lex.c notin uriChars + {','}:
      raise generateError[T](lex, "Invalid character in tag uri: " &
          escape("" & lex.c))
    lex.buf.add(lex.c)
    lex.advance(T)
  lex.cur = ltTagUri
  lex.nextState = expectLineEnd[T]
  result = true
2016-09-10 08:30:40 +00:00
proc unknownDirParams[T](lex: YamlLexer): bool =
  ## Copies the remainder of an unknown directive's line, up to a `#` or the
  ## line end, into `lex.buf` and yields `ltUnknownDirectiveParams`.
  const stopChars = lineEnd + {'#'}
  debug("lex: unknownDirParams")
  while lex.c in space: lex.advance(T)
  while true:
    if lex.c in stopChars: break
    lex.buf.add(lex.c)
    lex.advance(T)
  lex.cur = ltUnknownDirectiveParams
  lex.nextState = expectLineEnd[T]
  result = true
2016-09-10 08:30:40 +00:00
proc expectLineEnd[T](lex: YamlLexer): bool =
  ## Skips trailing whitespace and an optional `#` comment, then consumes the
  ## line break. Produces no token (always returns `false`). Continues with
  ## `lineStartState` after a line break and with `streamEnd` at end of input;
  ## any other character raises `YamlLexerError`.
  debug("lex: expectLineEnd")
  result = false
  while lex.c in space: lex.advance(T)
  if lex.c == '#':
    # comment: skip to the line end, which is then handled below
    lex.advance(T)
    while lex.c notin lineEnd: lex.advance(T)
  case lex.c
  of EndOfFile:
    lex.nextState = streamEnd
  of '\l':
    lex.lexLF(T)
    lex.nextState = lex.lineStartState
  of '\c':
    lex.lexCR(T)
    lex.nextState = lex.lineStartState
  else:
    raise generateError[T](lex, "Unexpected character (expected line end): " &
        escape("" & lex.c))
proc possibleDirectivesEnd[T](lex: YamlLexer): bool =
  ## Entered on a `-` at the start of a line. Yields `ltDirectivesEnd` for
  ## `---` followed by whitespace / line end, yields `ltIndentation` (with
  ## `afterSeqInd` as next state) for a single `-` followed by whitespace /
  ## line end, and otherwise hands the dashes read so far to
  ## `plainScalarPart` without producing a token.
  debug("lex: possibleDirectivesEnd")
  lex.lineStartState = insideDoc[T]
  lex.advance(T)
  var dashes = 1 # number of '-' consumed so far
  if lex.c == '-':
    lex.advance(T)
    dashes = 2
    if lex.c == '-':
      lex.advance(T)
      dashes = 3
      if lex.c in spaceOrLineEnd:
        lex.cur = ltDirectivesEnd
        lex.nextState = insideLine[T]
        return true
  elif lex.c in spaceOrLineEnd:
    lex.indentation = 0
    lex.cur = ltIndentation
    lex.nextState = afterSeqInd[T]
    return true
  # not a marker: the dashes are the beginning of a plain scalar
  for _ in 1 .. dashes: lex.buf.add('-')
  lex.nextState = plainScalarPart[T]
  result = false
2016-09-10 08:30:40 +00:00
proc afterSeqInd[T](lex: YamlLexer): bool =
  ## Yields the `ltSeqItemInd` token, skips one character unless the line
  ## ends right here, and continues with `insideLine`.
  lex.cur = ltSeqItemInd
  if lex.c notin lineEnd:
    lex.advance(T)
  lex.nextState = insideLine[T]
  result = true
proc possibleDocumentEnd[T](lex: YamlLexer): bool =
  ## Entered on a `.` at the start of a line. Yields `ltDocumentEnd` for `...`
  ## followed by whitespace / line end; otherwise hands the dots read so far
  ## to `plainScalarPart` without producing a token.
  debug("lex: possibleDocumentEnd")
  lex.advance(T)
  var dots = 1 # number of '.' consumed so far
  if lex.c == '.':
    lex.advance(T)
    dots = 2
    if lex.c == '.':
      lex.advance(T)
      dots = 3
      if lex.c in spaceOrLineEnd:
        lex.cur = ltDocumentEnd
        lex.nextState = expectLineEnd[T]
        lex.lineStartState = outsideDoc[T]
        return true
  # not a marker: the dots are the beginning of a plain scalar
  for _ in 1 .. dots: lex.buf.add('.')
  lex.nextState = plainScalarPart[T]
  result = false
2016-09-10 08:30:40 +00:00
proc outsideDoc[T](lex: YamlLexer): bool =
  ## Line-start state used while no document is open. Recognises directives
  ## (`%`), possible `---` / `...` markers and blank or comment lines; any
  ## other content yields `ltIndentation` and switches the line-start state
  ## to `insideDoc`.
  debug("lex: outsideDoc")
  case lex.c
  of '%':
    lex.advance(T)
    lex.directiveName(T)
    if lex.buf == "YAML":
      lex.cur = ltYamlDirective
      lex.buf.setLen(0)
      lex.nextState = yamlVersion[T]
    elif lex.buf == "TAG":
      lex.buf.setLen(0)
      lex.cur = ltTagDirective
      lex.nextState = tagShorthand[T]
    else:
      # the directive name stays in the buffer for unknown directives
      lex.cur = ltUnknownDirective
      lex.nextState = unknownDirParams[T]
    return true
  of '-':
    lex.nextState = possibleDirectivesEnd[T]
    return false
  of '.':
    lex.indentation = 0
    lex.nextState = possibleDocumentEnd[T]
  of spaceOrLineEnd + {'#'}:
    lex.indentation = 0
    while lex.c == ' ':
      lex.indentation.inc()
      lex.advance(T)
    if lex.c in spaceOrLineEnd + {'#'}:
      # nothing but whitespace / a comment on this line
      lex.nextState = expectLineEnd[T]
      return false
    lex.nextState = insideLine[T]
  else:
    lex.indentation = 0
    lex.nextState = insideLine[T]
  lex.lineStartState = insideDoc[T]
  lex.cur = ltIndentation
  result = true
proc insideDoc[T](lex: YamlLexer): bool =
  ## Line-start state used inside a document. Counts leading spaces into
  ## `lex.indentation` and yields `ltIndentation`, or `ltEmptyLine` when
  ## whitespace, a line end or end of input follows the leading spaces. A
  ## leading `-` / `.` is routed through the document marker checks.
  debug("lex: insideDoc")
  lex.indentation = 0
  if lex.c == '-':
    lex.nextState = possibleDirectivesEnd[T]
    return false
  elif lex.c == '.':
    lex.nextState = possibleDocumentEnd[T]
  elif lex.c in spaceOrLineEnd:
    while lex.c == ' ':
      lex.indentation.inc()
      lex.advance(T)
    if lex.c in spaceOrLineEnd:
      lex.cur = ltEmptyLine
      lex.nextState = expectLineEnd[T]
      return true
    lex.nextState = lex.inlineState
  else:
    lex.nextState = lex.inlineState
  lex.cur = ltIndentation
  result = true
2016-09-11 09:28:05 +00:00
proc possibleIndicatorChar[T](lex: YamlLexer, indicator: LexerToken,
    jsonContext: bool = false): bool =
  ## Decides whether the current character is the indicator `indicator` or
  ## the start of a plain scalar. It is an indicator when `jsonContext` is set
  ## or when the following character is not plain-safe; then the token is
  ## yielded and trailing whitespace is skipped. Otherwise `plainScalarPart`
  ## becomes the next state and no token is produced.
  if jsonContext or not lex.nextIsPlainSafe(T, false):
    lex.cur = indicator
    result = true
    lex.advance(T)
    while lex.c in space: lex.advance(T)
    if lex.c in lineEnd:
      lex.nextState = expectLineEnd[T]
  else:
    lex.nextState = plainScalarPart[T]
    result = false
2016-09-10 08:30:40 +00:00
proc flowIndicator[T](lex: YamlLexer, indicator: LexerToken): bool {.inline.} =
  ## Yields `indicator` for the current character, skips the whitespace after
  ## it and switches to `expectLineEnd` when the line ends there.
  result = true
  lex.cur = indicator
  lex.advance(T)
  while lex.c in space:
    lex.advance(T)
  if lex.c in lineEnd:
    lex.nextState = expectLineEnd[T]
2016-09-10 08:30:40 +00:00
2016-09-10 11:38:42 +00:00
proc addMultiple(s: var string, c: char, num: int) {.raises: [], inline.} =
  ## Appends `num` copies of `c` to `s`; does nothing when `num` is not
  ## positive.
  var remaining = num
  while remaining > 0:
    s.add(c)
    dec(remaining)
2016-09-10 11:38:42 +00:00
2016-09-11 09:28:05 +00:00
proc processQuotedWhitespace[T](lex: YamlLexer, newlines: var int) =
  ## Handles a run of whitespace / line breaks inside a quoted scalar.
  ##
  ## Whitespace that is followed by content on the same line is kept in
  ## `lex.buf`. Once a line break is seen, the whitespace before it is
  ## dropped, every further line break increments `newlines`, and leading
  ## whitespace of the following lines is skipped. At the first other
  ## character the breaks are folded: nothing for 0, one space for 1, and
  ## `newlines - 1` line feeds otherwise.
  let contentLen = lex.buf.len

  # phase 1: whitespace up to the first line break
  while true:
    case lex.c
    of ' ', '\t': lex.buf.add(lex.c)
    of '\l':
      lex.lexLF(T)
      break
    of '\c':
      lex.lexCR(T)
      break
    else:
      # no line break at all: keep the whitespace and stop here
      return
    lex.advance(T)

  # a line break was found, so the whitespace in front of it does not count
  lex.buf.setLen(contentLen)

  # phase 2: further breaks and the indentation of the following lines
  while true:
    case lex.c
    of ' ', '\t': discard
    of '\l':
      lex.lexLF(T)
      newlines.inc()
      continue # lexLF already moved to the next character
    of '\c':
      lex.lexCR(T)
      newlines.inc()
      continue # lexCR already moved to the next character
    else:
      if newlines == 1: lex.buf.add(' ')
      elif newlines != 0: lex.buf.addMultiple('\l', newlines - 1)
      break
    lex.advance(T)
2016-09-10 11:38:42 +00:00
2016-09-11 09:28:05 +00:00
proc singleQuotedScalar[T](lex: YamlLexer) =
  ## Reads a single-quoted scalar into `lex.buf`, starting on the opening
  ## quote and stopping on the character after the closing quote. `''` is
  ## unescaped to a single quote; whitespace and line breaks are delegated to
  ## `processQuotedWhitespace`. Raises `YamlLexerError` at end of input.
  debug("lex: singleQuotedScalar")
  lex.advance(T)
  while true:
    case lex.c
    of EndOfFile:
      raise generateError[T](lex, "Unfinished single quoted string")
    of '\'':
      lex.advance(T)
      if lex.c != '\'': break # it was the closing quote
      lex.buf.add('\'')
    of '\l', '\c', '\t', ' ':
      var breaks = 1
      processQuotedWhitespace[T](lex, breaks)
      continue # already positioned on the next character
    else:
      lex.buf.add(lex.c)
    lex.advance(T)
2016-09-10 11:38:42 +00:00
2016-09-11 09:28:05 +00:00
proc unicodeSequence[T](lex: YamlLexer, length: int) =
  ## Reads `length` hexadecimal digits following the current character and
  ## appends the UTF-8 encoding of the code point they denote to `lex.buf`.
  ## The lexer is left on the last digit.
  ##
  ## Raises `YamlLexerError` when the sequence is cut short by a line end or
  ## end of input, contains a non-hex character, or denotes a value outside
  ## the Unicode range (possible with the 8-digit form). The range check
  ## keeps such input from reaching the `Rune` conversion, where it would
  ## either trip a range check or be encoded as invalid UTF-8.
  debug("lex: unicodeSequence")
  var codePoint = 0
  for i in countup(0, length - 1):
    lex.advance(T)
    # the first digit read is the most significant nibble
    let shift = (length - i - 1) * 4
    case lex.c
    of EndOfFile, '\l', '\c':
      raise generateError[T](lex, "Unfinished unicode escape sequence")
    of '0' .. '9':
      codePoint = codePoint or ((int(lex.c) - 0x30) shl shift)
    of 'A' .. 'F':
      codePoint = codePoint or ((int(lex.c) - 0x37) shl shift)
    of 'a' .. 'f':
      codePoint = codePoint or ((int(lex.c) - 0x57) shl shift)
    else:
      raise generateError[T](lex,
          "Invalid character in unicode escape sequence: " &
          escape("" & lex.c))
  if codePoint < 0 or codePoint > 0x10FFFF:
    raise generateError[T](lex, "Unicode escape sequence out of range")
  lex.buf.add(toUTF8(Rune(codePoint)))
2016-09-11 09:28:05 +00:00
proc doubleQuotedScalar[T](lex: YamlLexer) =
  ## Reads a double-quoted scalar into `lex.buf`, starting on the opening
  ## quote and stopping on the character after the closing quote. Backslash
  ## escapes are resolved; whitespace and line breaks are delegated to
  ## `processQuotedWhitespace`. Raises `YamlLexerError` at end of input or on
  ## an unknown escape.
  debug("lex: doubleQuotedScalar")
  lex.advance(T)
  while true:
    case lex.c
    of EndOfFile:
      raise generateError[T](lex, "Unfinished double quoted string")
    of '"':
      lex.advance(T)
      break
    of '\\':
      lex.advance(T)
      case lex.c
      of EndOfFile:
        raise generateError[T](lex, "Unfinished escape sequence")
      # single-character escapes
      of '0': lex.buf.add('\0')
      of 'a': lex.buf.add('\x07')
      of 'b': lex.buf.add('\x08')
      of '\t', 't': lex.buf.add('\t')
      of 'n': lex.buf.add('\l')
      of 'v': lex.buf.add('\v')
      of 'f': lex.buf.add('\f')
      of 'r': lex.buf.add('\c')
      of 'e': lex.buf.add('\e')
      of ' ': lex.buf.add(' ')
      of '"': lex.buf.add('"')
      of '/': lex.buf.add('/')
      of '\\': lex.buf.add('\\')
      # escapes that expand to multi-byte UTF-8 sequences
      of 'N': lex.buf.add(UTF8NextLine)
      of '_': lex.buf.add(UTF8NonBreakingSpace)
      of 'L': lex.buf.add(UTF8LineSeparator)
      of 'P': lex.buf.add(UTF8ParagraphSeparator)
      # hexadecimal escapes
      of 'x': unicodeSequence[T](lex, 2)
      of 'u': unicodeSequence[T](lex, 4)
      of 'U': unicodeSequence[T](lex, 8)
      of '\l', '\c':
        # escaped line break: starts the count at zero
        var breaks = 0
        processQuotedWhitespace[T](lex, breaks)
        continue # already positioned on the next character
      else:
        raise generateError[T](lex, "Illegal character in escape sequence")
    of '\l', '\c', '\t', ' ':
      var breaks = 1
      processQuotedWhitespace[T](lex, breaks)
      continue # already positioned on the next character
    else:
      lex.buf.add(lex.c)
    lex.advance(T)
2016-09-10 10:38:03 +00:00
proc insideLine[T](lex: YamlLexer): bool =
  ## Dispatches on the current character within a line: yields indicator and
  ## quoted scalar tokens directly and delegates everything else to the
  ## specialised states (returning `false` so that they run next).
  debug("lex: insideLine")
  case lex.c
  of ':':
    # directly after one of these tokens in flow mode, ':' is always an
    # indicator, whatever follows it
    let jsonContext = lex.inFlow and
        lex.cur in [ltBraceClose, ltBracketClose, ltQuotedScalar]
    result = possibleIndicatorChar[T](lex, ltMapValInd, jsonContext)
  of '?':
    result = possibleIndicatorChar[T](lex, ltMapKeyInd)
  of '-':
    result = possibleIndicatorChar[T](lex, ltSeqItemInd)
  of lineEnd + {'#'}:
    lex.nextState = expectLineEnd[T]
    result = false
  of '\"':
    doubleQuotedScalar[T](lex)
    lex.cur = ltQuotedScalar
    result = true
  of '\'':
    singleQuotedScalar[T](lex)
    lex.cur = ltQuotedScalar
    result = true
  of '>', '|':
    lex.nextState =
      if lex.inFlow: plainScalarPart[T]
      else: blockScalarHeader[T]
    result = false
  of '{': result = flowIndicator[T](lex, ltBraceOpen)
  of '}': result = flowIndicator[T](lex, ltBraceClose)
  of '[': result = flowIndicator[T](lex, ltBracketOpen)
  of ']': result = flowIndicator[T](lex, ltBracketClose)
  of ',': result = flowIndicator[T](lex, ltComma)
  of '!':
    lex.nextState = tagHandle[T]
    result = false
  of '&':
    lex.nextState = anchor[T]
    result = false
  of '*':
    lex.nextState = alias[T]
    result = false
  of '@', '`':
    raise generateError[T](lex,
        "Reserved characters cannot start a plain scalar")
  else:
    lex.nextState = plainScalarPart[T]
    result = false
proc plainScalarPart[T](lex: YamlLexer): bool =
  ## Reads the part of a plain scalar that sits on the current line into
  ## `lex.buf` and yields `ltScalarPart`. Whitespace directly in front of the
  ## terminating construct is not part of the content. Sets the next state
  ## according to what ended the scalar.
  debug("lex: plainScalarPart")
  block scan:
    while true:
      lex.buf.add(lex.c)
      lex.advance(T)
      case lex.c
      of space:
        # remember where the content ends so the whitespace can be dropped
        # again if nothing but a terminator follows
        let contentEnd = lex.buf.len()
        while true:
          lex.buf.add(lex.c)
          lex.advance(T)
          case lex.c
          of lineEnd + {'#'}:
            lex.buf.setLen(contentEnd)
            lex.nextState = expectLineEnd[T]
            break scan
          of ':':
            if lex.nextIsPlainSafe(T, lex.inFlow):
              break # ':' is content; back to the outer loop
            else:
              lex.buf.setLen(contentEnd)
              lex.nextState = insideLine[T]
              break scan
          of flowIndicators:
            if lex.inFlow:
              lex.buf.setLen(contentEnd)
              lex.nextState = insideLine[T]
              break scan
            else:
              # NOTE(review): the outer loop adds the character after this
              # one without inspecting it first - confirm that is intended
              # when a line end follows directly.
              lex.buf.add(lex.c)
              lex.advance(T)
              break
          of space: discard
          else: break
      of lineEnd:
        lex.nextState = expectLineEnd[T]
        break
      of flowIndicators:
        if lex.inFlow:
          lex.nextState = insideLine[T]
          break
      of ':':
        if not lex.nextIsPlainSafe(T, lex.inFlow):
          lex.nextState = insideLine[T]
          break scan
      else: discard
  lex.cur = ltScalarPart
  result = true
proc blockScalarHeader[T](lex: YamlLexer): bool =
  ## Lexes a block scalar header: the `|` / `>` the lexer is positioned on,
  ## followed by an optional chomping indicator (`+` / `-`) and an optional
  ## indentation indicator (`1`..`9`). Yields `ltBlockScalarHeader` and makes
  ## `blockScalar` the inline state for the following lines.
  debug("lex: blockScalarHeader")
  lex.chomp = ctClip
  lex.blockScalarIndent = UnknownIndentation
  lex.folded = (lex.c == '>')
  while true:
    lex.advance(T)
    case lex.c
    of '+', '-':
      # ctClip doubles as "no chomping indicator seen yet"
      if lex.chomp != ctClip:
        raise generateError[T](lex, "Only one chomping indicator is allowed")
      lex.chomp = (if lex.c == '+': ctKeep else: ctStrip)
    of '1' .. '9':
      if lex.blockScalarIndent != UnknownIndentation:
        raise generateError[T](lex, "Only one indentation indicator is allowed")
      # the indicator is relative to the indentation of the current line
      lex.blockScalarIndent = lex.indentation + ord(lex.c) - ord('0')
    of spaceOrLineEnd:
      break
    else:
      raise generateError[T](lex,
          "Illegal character in block scalar header: '" & escape("" & lex.c) &
          '\'')
  lex.cur = ltBlockScalarHeader
  lex.nextState = expectLineEnd[T]
  lex.inlineState = blockScalar[T]
  result = true
2016-09-11 09:28:05 +00:00
proc blockScalar[T](lex: YamlLexer): bool =
  ## Lexes one content line of a block scalar into `lex.buf` and yields
  ## `ltScalarPart`. The first line fixes the block indentation when the
  ## header did not. A line starting with `#` after that is routed to
  ## `expectLineEnd` without producing a token; a line with less indentation
  ## than the block raises `YamlLexerError`.
  debug("lex: blockScalarLine")
  if lex.blockScalarIndent == UnknownIndentation:
    # NOTE(review): `moreIndented` keeps its previous value on this path -
    # confirm callers do not read it for the first line.
    lex.blockScalarIndent = lex.indentation
  elif lex.c == '#':
    lex.nextState = expectLineEnd[T]
    return false
  elif lex.indentation < lex.blockScalarIndent:
    raise generateError[T](lex, "Too little indentation in block scalar")
  else:
    let extra = lex.indentation - lex.blockScalarIndent
    lex.moreIndented = extra > 0 or lex.c == '\t'
    # re-create the indentation beyond the block's base indentation
    if lex.moreIndented: lex.buf.addMultiple(' ', extra)
  while lex.c notin lineEnd:
    lex.buf.add(lex.c)
    lex.advance(T)
  lex.cur = ltScalarPart
  lex.nextState = expectLineEnd[T]
  result = true
2016-09-10 08:30:40 +00:00
2016-09-11 10:52:24 +00:00
proc byteSequence[T](lex: YamlLexer) =
  ## Reads the two hexadecimal digits of a `%XX` escape (the lexer is
  ## positioned on the `%`) and appends the denoted byte to `lex.buf`. The
  ## lexer is left on the second digit.
  ##
  ## Raises `YamlLexerError` when the sequence is cut short by a line feed,
  ## carriage return or end of input, or contains a non-hex character.
  debug("lex: byteSequence")
  # accumulate in a plain int: with an int8, any octet >= 0x80 turns negative
  # and cannot be converted to `char`
  var charCode = 0
  for i in 0 .. 1:
    lex.advance(T)
    # the first digit read is the high nibble
    let shift = (1 - i) * 4
    case lex.c
    of EndOfFile, '\l', '\c':
      raise generateError[T](lex, "Unfinished octet escape sequence")
    of '0' .. '9':
      charCode = charCode or ((ord(lex.c) - 0x30) shl shift)
    of 'A' .. 'F':
      charCode = charCode or ((ord(lex.c) - 0x37) shl shift)
    of 'a' .. 'f':
      charCode = charCode or ((ord(lex.c) - 0x57) shl shift)
    else:
      raise generateError[T](lex, "Invalid character in octet escape sequence")
  lex.buf.add(char(charCode))
proc tagHandle[T](lex: YamlLexer): bool =
  ## Lexes a tag, starting on its `!`.
  ##
  ## `!<...>` is a verbatim tag and yields `ltLiteralTag` with the content
  ## between the angle brackets in `lex.buf`. Anything else yields
  ## `ltTagHandle` with the complete tag text in `lex.buf`; `shorthandEnd`
  ## is the buffer index of the `!` that closes the handle, or 0 when there
  ## is no second `!`.
  debug("lex: tagHandle")
  lex.advance(T)
  if lex.c == '<':
    lex.advance(T)
    if lex.c == '!':
      lex.buf.add('!')
      lex.advance(T)
    while lex.c != '>':
      case lex.c
      of spaceOrLineEnd:
        raise generateError[T](lex, "Unclosed verbatim tag")
      of '%': byteSequence[T](lex)
      of uriChars + {','}: lex.buf.add(lex.c)
      else: raise generateError[T](lex, "Illegal character in verbatim tag")
      lex.advance(T)
    lex.advance(T) # step over the closing '>'
    lex.cur = ltLiteralTag
  else:
    lex.shorthandEnd = 0
    let start = lex.mark(T)
    lex.buf.add('!')
    while true:
      case lex.c
      of spaceOrLineEnd: break
      of '!':
        if lex.shorthandEnd != 0:
          raise generateError[T](lex, "Illegal character in tag suffix")
        lex.shorthandEnd = lex.afterMark(T, start) + 1
        lex.buf.add('!')
      of ',':
        if lex.shorthandEnd > 0: break # ',' after shorthand is flow indicator
        lex.buf.add(',')
      of '%':
        # escapes are only accepted after the closing '!' of the handle
        if lex.shorthandEnd == 0:
          raise generateError[T](lex, "Illegal character in tag handle")
        byteSequence[T](lex)
      of uriChars: lex.buf.add(lex.c)
      else: raise generateError[T](lex, "Illegal character in tag handle")
      lex.advance(T)
    lex.cur = ltTagHandle
  while lex.c in space: lex.advance(T)
  lex.nextState =
    if lex.c in lineEnd: expectLineEnd[T]
    else: insideLine[T]
  result = true
2016-09-11 11:04:10 +00:00
proc anchorName[T](lex: YamlLexer) =
  ## Shared by `anchor` and `alias`: skips the current character, copies
  ## everything up to whitespace, a line end or a flow indicator into
  ## `lex.buf`, skips trailing whitespace and selects the next state.
  debug("lex: anchorName")
  while true:
    lex.advance(T)
    if lex.c in spaceOrLineEnd + flowIndicators: break
    lex.buf.add(lex.c)
  while lex.c in space: lex.advance(T)
  lex.nextState =
    if lex.c in lineEnd: expectLineEnd[T]
    else: insideLine[T]
proc anchor[T](lex: YamlLexer): bool =
  ## Lexes an anchor; its name is read into `lex.buf` by `anchorName` and
  ## `ltAnchor` is yielded.
  debug("lex: anchor")
  anchorName[T](lex)
  result = true
  lex.cur = ltAnchor
proc alias[T](lex: YamlLexer): bool =
  ## Lexes an alias; its name is read into `lex.buf` by `anchorName` and
  ## `ltAlias` is yielded.
  debug("lex: alias")
  anchorName[T](lex)
  result = true
  lex.cur = ltAlias
proc streamEnd(lex: YamlLexer): bool =
  ## Yields `ltStreamEnd`. The next state is left unchanged, so each further
  ## call yields the token again.
  debug("lex: streamEnd")
  result = true
  lex.cur = ltStreamEnd
# interface
proc init*[T](lex: YamlLexer) =
  ## Binds the state procs of `lex` to the instantiations for the source
  ## backend `T` and puts the lexer into its initial state (`outsideDoc`).
  # implementations kept for switching states later on
  lex.insideLineImpl = insideLine[T]
  lex.insideDocImpl = insideDoc[T]
  # initial states
  lex.inlineState = insideLine[T]
  lex.lineStartState = outsideDoc[T]
  lex.nextState = outsideDoc[T]
2016-09-11 09:28:05 +00:00
proc newYamlLexer*(source: Stream): YamlLexer =
  ## Creates a lexer that reads from `source` through a `BaseLexer`.
  ##
  ## The `BaseLexer` lives in manually allocated memory that is released by
  ## the finalizer of the returned lexer.
  # alloc0, not alloc: `open` assigns managed fields of the BaseLexer, and
  # those assignments must not find garbage in place of the old values.
  let blSource = cast[ptr BaseLexer](alloc0(sizeof(BaseLexer)))
  blSource[].open(source)
  # NOTE(review): the finalizer only frees the memory block; it does not
  # close the BaseLexer - confirm this is intended.
  new(result, proc(x: ref YamlLexerObj) {.nimcall.} =
      dealloc(x.source)
  )
  result[] = YamlLexerObj(source: blSource, inFlow: false, buf: "",
      c: blSource[].buf[blSource[].bufpos])
  init[BaseLexer](result)
proc newYamlLexer*(source: string, startAt: int = 0): YamlLexer =
  ## Creates a lexer that reads from the string `source`, beginning at index
  ## `startAt`. Line counting starts at 1 with `startAt` as the line start.
  ##
  ## When `startAt` is at or behind the end of `source` (for example with an
  ## empty string), the lexer starts on `EndOfFile` instead of indexing the
  ## string out of bounds.
  # alloc0, not alloc: the assignment below replaces a managed string field,
  # which must not find garbage in place of the old value.
  let sSource = cast[ptr StringSource](alloc0(sizeof(StringSource)))
  sSource[] =
    StringSource(src: source, pos: startAt, lineStart: startAt, line: 1)
  # NOTE(review): the finalizer only frees the memory block; the string held
  # by the StringSource is not released explicitly - confirm this is intended.
  new(result, proc(x: ref YamlLexerObj) {.nimcall.} =
      dealloc(x.source)
  )
  let first =
    if startAt < source.len: source[startAt]
    else: EndOfFile
  result[] = YamlLexerObj(buf: "", source: sSource, inFlow: false, c: first)
  init[StringSource](result)
proc next*(lex: YamlLexer) =
  ## Advances the lexer to the next token by running states until one of
  ## them reports (by returning `true`) that it produced a token.
  while true:
    if lex.nextState(lex): break
2016-09-11 09:28:05 +00:00
proc setFlow*(lex: YamlLexer, value: bool) =
  ## Switches the lexer into (`true`) or out of (`false`) flow mode.
  lex.inFlow = value
  # In flow mode no indentation tokens are generated, because they are not
  # needed there. Generating them would in fact make the lexer misbehave:
  # for adjacent values it must check whether the preceding token was a JSON
  # value, and an indentation token in between would hide that information.
  # Therefore insideDoc is not used as line-start state in flow mode. Another
  # reason is that insideDoc would wrongly check for document markers
  # (---, ...), which are simply scalars in flow mode.
  lex.lineStartState =
    if value: lex.insideLineImpl
    else: lex.insideDocImpl
2016-09-11 09:28:05 +00:00
proc endBlockScalar*(lex: YamlLexer) =
  ## Leaves block scalar mode: both the next state and the inline state are
  ## set back to the regular inside-line implementation.
  lex.nextState = lex.insideLineImpl
  lex.inlineState = lex.insideLineImpl