started patching parse.nim

Felix Krause 2016-09-12 18:04:26 +02:00
parent a1f900ae44
commit 7376af7d6f
2 changed files with 90 additions and 762 deletions


@@ -5,70 +5,37 @@
# distribution, for details about the copyright.
type
ScalarType = enum
stFlow, stLiteral, stFolded
LexedDirective = enum
ldYaml, ldTag, ldUnknown
YamlContext = enum
cBlock, cFlow
ChompType = enum
ctKeep, ctClip, ctStrip
FastParseLevelKind = enum
fplUnknown, fplSequence, fplMapKey, fplMapValue, fplSinglePairKey,
fplSinglePairValue, fplScalar, fplDocument
FastParseLevel = object
kind: FastParseLevelKind
indentation: int
ParserContext = ref object of YamlStream
p: YamlParser
lex: YamlLexer
storedState: proc(s: YamlStream, e: var YamlStreamEvent): bool
scalarType: ScalarType
chomp: ChompType
atSequenceItem: bool
recentWasMoreIndented: bool
flowdepth: int
explicitFlowKey: bool
content, after: string
ancestry: seq[FastParseLevel]
level: FastParseLevel
tagUri: string
tag: TagId
anchor: AnchorId
shorthands: Table[string, string]
nextAnchorId: AnchorId
newlines: int
indentation: int
LevelEndResult = enum
lerNothing, lerOne, lerAdditionalMapEnd
const
space = {' ', '\t'}
lineEnd = {'\l', '\c', EndOfFile}
spaceOrLineEnd = {' ', '\t', '\l', '\c', EndOfFile}
digits = {'0'..'9'}
flowIndicators = {'[', ']', '{', '}', ','}
UTF8NextLine = toUTF8(0x85.Rune)
UTF8NonBreakingSpace = toUTF8(0xA0.Rune)
UTF8LineSeparator = toUTF8(0x2028.Rune)
UTF8ParagraphSeparator = toUTF8(0x2029.Rune)
UnknownIndentation = int.low
proc newYamlParser*(tagLib: TagLibrary = initExtendedTagLibrary(),
callback: WarningCallback = nil): YamlParser =
new(result)
result.tagLib = tagLib
result.callback = callback
proc getLineNumber*(p: YamlParser): int = p.lexer.lineNumber
proc getColNumber*(p: YamlParser): int = p.tokenstart + 1 # column is 1-based
proc getLineContent*(p: YamlParser, marker: bool = true): string =
result = p.lexer.getCurrentLine(false)
if marker: result.add(repeat(' ', p.tokenstart) & "^\n")
proc lexer(c: ParserContext): var BaseLexer {.inline.} = c.p.lexer
template debug(message: string) {.dirty.} =
when defined(yamlDebug):
try: styledWriteLine(stdout, fgBlue, message)
@@ -77,31 +44,8 @@ template debug(message: string) {.dirty.} =
proc generateError(c: ParserContext, message: string):
ref YamlParserError {.raises: [].} =
result = newException(YamlParserError, message)
result.line = c.lexer.lineNumber
result.column = c.p.tokenstart + 1
result.lineContent = c.p.getLineContent(true)
(result.line, result.column) = c.lex.curStartPos
result.lineContent = c.lex.getTokenLine()
proc generateError(lx: BaseLexer, message: string):
ref YamlParserError {.raises: [].} =
result = newException(YamlParserError, message)
result.line = lx.lineNumber
result.column = lx.bufpos + 1
result.lineContent = lx.getCurrentLine(false) &
repeat(' ', lx.getColNumber(lx.bufpos)) & "^\n"
template lexCR(lexer: BaseLexer) {.dirty.} =
try: lexer.bufpos = lexer.handleCR(lexer.bufpos)
except:
var e = generateError(lexer, "I/O Error: " & getCurrentExceptionMsg())
e.parent = getCurrentException()
raise e
template lexLF(lexer: BaseLexer) {.dirty.} =
try: lexer.bufpos = lexer.handleLF(lexer.bufpos)
except:
var e = generateError(lexer, "I/O Error: " & getCurrentExceptionMsg())
e.parent = getCurrentException()
raise e
proc callCallback(c: ParserContext, msg: string) {.raises: [YamlParserError].} =
try:
@@ -114,12 +58,6 @@ proc callCallback(c: ParserContext, msg: string) {.raises: [YamlParserError].} =
e.parent = getCurrentException()
raise e
proc addMultiple(s: var string, c: char, num: int) {.raises: [], inline.} =
for i in 1..num:
s.add(c)
proc reset(buffer: var string) {.raises: [], inline.} = buffer.setLen(0)
proc initLevel(k: FastParseLevelKind): FastParseLevel {.raises: [], inline.} =
FastParseLevel(kind: k, indentation: UnknownIndentation)
@@ -130,18 +68,12 @@ proc emptyScalar(c: ParserContext): YamlStreamEvent {.raises: [], inline.} =
proc currentScalar(c: ParserContext): YamlStreamEvent {.raises: [], inline.} =
result = YamlStreamEvent(kind: yamlScalar, scalarTag: c.tag,
scalarAnchor: c.anchor, scalarContent: c.content)
scalarAnchor: c.anchor)
shallowCopy(result.scalarContent, c.lex.buf)
c.lex.buf = newStringOfCap(256)
c.tag = yTagQuestionMark
c.anchor = yAnchorNone
proc handleLineEnd(c: ParserContext, incNewlines: static[bool]): bool =
case c.lexer.buf[c.lexer.bufpos]
of '\l': c.lexer.lexLF()
of '\c': c.lexer.lexCR()
of EndOfFile: return true
else: discard
when incNewlines: c.newlines.inc()
proc objectStart(c: ParserContext, k: static[YamlStreamEventKind],
single: bool = false): YamlStreamEvent {.raises: [].} =
yAssert(c.level.kind == fplUnknown)
@@ -181,607 +113,58 @@ proc initDocValues(c: ParserContext) {.raises: [].} =
c.anchor = yAnchorNone
c.ancestry.add(FastParseLevel(kind: fplDocument, indentation: -1))
proc startToken(c: ParserContext) {.raises: [], inline.} =
c.p.tokenstart = c.lexer.getColNumber(c.lexer.bufpos)
proc anchorName(c: ParserContext) {.raises: [].} =
debug("lex: anchorName")
while true:
c.lexer.bufpos.inc()
let ch = c.lexer.buf[c.lexer.bufpos]
case ch
of spaceOrLineEnd, '[', ']', '{', '}', ',': break
else: c.content.add(ch)
proc handleAnchor(c: ParserContext) {.raises: [YamlParserError].} =
c.startToken()
if c.level.kind != fplUnknown: raise c.generateError("Unexpected token")
if c.anchor != yAnchorNone:
raise c.generateError("Only one anchor is allowed per node")
c.content.reset()
c.anchorName()
c.anchor = c.nextAnchorId
c.p.anchors[c.content] = c.anchor
c.p.anchors[c.lex.buf] = c.anchor
c.nextAnchorId = AnchorId(int(c.nextAnchorId) + 1)
c.lex.buf.setLen(0)
proc finishLine(lexer: var BaseLexer) {.raises: [], inline.} =
debug("lex: finishLine")
while lexer.buf[lexer.bufpos] notin lineEnd:
lexer.bufpos.inc()
proc skipWhitespace(lexer: var BaseLexer) {.raises: [], inline.} =
debug("lex: skipWhitespace")
while lexer.buf[lexer.bufpos] in space: lexer.bufpos.inc()
# TODO: {.raises: [].}
proc skipWhitespaceCommentsAndNewlines(lexer: var BaseLexer) {.inline.} =
debug("lex: skipWhitespaceCommentsAndNewlines")
if lexer.buf[lexer.bufpos] != '#':
while true:
case lexer.buf[lexer.bufpos]
of space: lexer.bufpos.inc()
of '\l': lexer.lexLF()
of '\c': lexer.lexCR()
of '#': # also skip comments
lexer.bufpos.inc()
while lexer.buf[lexer.bufpos] notin lineEnd:
lexer.bufpos.inc()
else: break
proc skipIndentation(lexer: var BaseLexer) {.raises: [], inline.} =
debug("lex: skipIndentation")
while lexer.buf[lexer.bufpos] == ' ': lexer.bufpos.inc()
proc directiveName(lexer: var BaseLexer, directive: var LexedDirective)
{.raises: [].} =
debug("lex: directiveName")
directive = ldUnknown
lexer.bufpos.inc()
if lexer.buf[lexer.bufpos] == 'Y':
lexer.bufpos.inc()
if lexer.buf[lexer.bufpos] == 'A':
lexer.bufpos.inc()
if lexer.buf[lexer.bufpos] == 'M':
lexer.bufpos.inc()
if lexer.buf[lexer.bufpos] == 'L':
lexer.bufpos.inc()
if lexer.buf[lexer.bufpos] in spaceOrLineEnd:
directive = ldYaml
elif lexer.buf[lexer.bufpos] == 'T':
lexer.bufpos.inc()
if lexer.buf[lexer.bufpos] == 'A':
lexer.bufpos.inc()
if lexer.buf[lexer.bufpos] == 'G':
lexer.bufpos.inc()
if lexer.buf[lexer.bufpos] in spaceOrLineEnd:
directive = ldTag
while lexer.buf[lexer.bufpos] notin spaceOrLineEnd:
lexer.bufpos.inc()
proc yamlVersion(lexer: var BaseLexer, o: var string)
{.raises: [YamlParserError], inline.} =
debug("lex: yamlVersion")
while lexer.buf[lexer.bufpos] in space: lexer.bufpos.inc()
var c = lexer.buf[lexer.bufpos]
if c notin digits: raise lexer.generateError("Invalid YAML version number")
o.add(c)
lexer.bufpos.inc()
c = lexer.buf[lexer.bufpos]
while c in digits:
lexer.bufpos.inc()
o.add(c)
c = lexer.buf[lexer.bufpos]
if lexer.buf[lexer.bufpos] != '.':
raise lexer.generateError("Invalid YAML version number")
o.add('.')
lexer.bufpos.inc()
c = lexer.buf[lexer.bufpos]
if c notin digits: raise lexer.generateError("Invalid YAML version number")
o.add(c)
lexer.bufpos.inc()
c = lexer.buf[lexer.bufpos]
while c in digits:
o.add(c)
lexer.bufpos.inc()
c = lexer.buf[lexer.bufpos]
if lexer.buf[lexer.bufpos] notin spaceOrLineEnd:
raise lexer.generateError("Invalid YAML version number")
proc lineEnding(c: ParserContext) {.raises: [YamlParserError], inline.} =
debug("lex: lineEnding")
if c.lexer.buf[c.lexer.bufpos] notin lineEnd:
while c.lexer.buf[c.lexer.bufpos] in space: c.lexer.bufpos.inc()
if c.lexer.buf[c.lexer.bufpos] in lineEnd: discard
elif c.lexer.buf[c.lexer.bufpos] == '#':
while c.lexer.buf[c.lexer.bufpos] notin lineEnd: c.lexer.bufpos.inc()
else:
c.startToken()
raise c.generateError("Unexpected token (expected comment or line end)")
proc tagShorthand(lexer: var BaseLexer, shorthand: var string) {.inline.} =
debug("lex: tagShorthand")
while lexer.buf[lexer.bufpos] in space: lexer.bufpos.inc()
yAssert lexer.buf[lexer.bufpos] == '!'
shorthand.add('!')
lexer.bufpos.inc()
var ch = lexer.buf[lexer.bufpos]
if ch in spaceOrLineEnd: discard
else:
while ch != '!':
case ch
of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '-':
shorthand.add(ch)
lexer.bufpos.inc()
ch = lexer.buf[lexer.bufpos]
else: raise lexer.generateError("Illegal character in tag shorthand")
shorthand.add(ch)
lexer.bufpos.inc()
if lexer.buf[lexer.bufpos] notin spaceOrLineEnd:
raise lexer.generateError("Missing space after tag shorthand")
proc tagUriMapping(lexer: var BaseLexer, uri: var string)
{.raises: [YamlParserError].} =
debug("lex: tagUriMapping")
while lexer.buf[lexer.bufpos] in space:
lexer.bufpos.inc()
var ch = lexer.buf[lexer.bufpos]
if ch == '!':
uri.add(ch)
lexer.bufpos.inc()
ch = lexer.buf[lexer.bufpos]
while ch notin spaceOrLineEnd:
case ch
of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':', '@', '&',
'-', '=', '+', '$', ',', '_', '.', '~', '*', '\'', '(', ')':
uri.add(ch)
lexer.bufpos.inc()
ch = lexer.buf[lexer.bufpos]
else: raise lexer.generateError("Invalid tag uri")
proc directivesEndMarker(lexer: var BaseLexer, success: var bool)
{.raises: [].} =
debug("lex: directivesEndMarker")
success = true
for i in 0..2:
if lexer.buf[lexer.bufpos + i] != '-':
success = false
break
if success: success = lexer.buf[lexer.bufpos + 3] in spaceOrLineEnd
proc documentEndMarker(lexer: var BaseLexer, success: var bool) {.raises: [].} =
debug("lex: documentEndMarker")
success = true
for i in 0..2:
if lexer.buf[lexer.bufpos + i] != '.':
success = false
break
if success: success = lexer.buf[lexer.bufpos + 3] in spaceOrLineEnd
proc unicodeSequence(lexer: var BaseLexer, length: int):
string {.raises: [YamlParserError].} =
debug("lex: unicodeSequence")
var unicodeChar = 0.int
for i in countup(0, length - 1):
lexer.bufpos.inc()
let
digitPosition = length - i - 1
ch = lexer.buf[lexer.bufpos]
case ch
of EndOFFile, '\l', '\c':
raise lexer.generateError("Unfinished unicode escape sequence")
of '0' .. '9':
unicodeChar = unicodechar or (int(ch) - 0x30) shl (digitPosition * 4)
of 'A' .. 'F':
unicodeChar = unicodechar or (int(ch) - 0x37) shl (digitPosition * 4)
of 'a' .. 'f':
unicodeChar = unicodechar or (int(ch) - 0x57) shl (digitPosition * 4)
else:
raise lexer.generateError(
"Invalid character in unicode escape sequence")
return toUTF8(Rune(unicodeChar))
proc byteSequence(lexer: var BaseLexer): char {.raises: [YamlParserError].} =
debug("lex: byteSequence")
var charCode = 0.int8
for i in 0 .. 1:
lexer.bufpos.inc()
let
digitPosition = int8(1 - i)
ch = lexer.buf[lexer.bufpos]
case ch
of EndOfFile, '\l', 'r':
raise lexer.generateError("Unfinished octet escape sequence")
of '0' .. '9':
charCode = charCode or (int8(ch) - 0x30.int8) shl (digitPosition * 4)
of 'A' .. 'F':
charCode = charCode or (int8(ch) - 0x37.int8) shl (digitPosition * 4)
of 'a' .. 'f':
charCode = charCode or (int8(ch) - 0x57.int8) shl (digitPosition * 4)
else:
raise lexer.generateError("Invalid character in octet escape sequence")
return char(charCode)
# TODO: {.raises: [].}
proc processQuotedWhitespace(c: ParserContext, newlines: var int) =
c.after.reset()
block outer:
while true:
case c.lexer.buf[c.lexer.bufpos]
of ' ', '\t': c.after.add(c.lexer.buf[c.lexer.bufpos])
of '\l':
c.lexer.bufpos = c.lexer.handleLF(c.lexer.bufpos)
break
of '\c':
c.lexer.bufpos = c.lexer.handleLF(c.lexer.bufpos)
break
else:
c.content.add(c.after)
break outer
c.lexer.bufpos.inc()
while true:
case c.lexer.buf[c.lexer.bufpos]
of ' ', '\t': discard
of '\l':
c.lexer.lexLF()
newlines.inc()
continue
of '\c':
c.lexer.lexCR()
newlines.inc()
continue
else:
if newlines == 0: discard
elif newlines == 1: c.content.add(' ')
else: c.content.addMultiple('\l', newlines - 1)
break
c.lexer.bufpos.inc()
# TODO: {.raises: [YamlParserError].}
proc doubleQuotedScalar(c: ParserContext) =
debug("lex: doubleQuotedScalar")
c.lexer.bufpos.inc()
while true:
var ch = c.lexer.buf[c.lexer.bufpos]
case ch
of EndOfFile:
raise c.lexer.generateError("Unfinished double quoted string")
of '\\':
c.lexer.bufpos.inc()
case c.lexer.buf[c.lexer.bufpos]
of EndOfFile:
raise c.lexer.generateError("Unfinished escape sequence")
of '0': c.content.add('\0')
of 'a': c.content.add('\x07')
of 'b': c.content.add('\x08')
of '\t', 't': c.content.add('\t')
of 'n': c.content.add('\l')
of 'v': c.content.add('\v')
of 'f': c.content.add('\f')
of 'r': c.content.add('\c')
of 'e': c.content.add('\e')
of ' ': c.content.add(' ')
of '"': c.content.add('"')
of '/': c.content.add('/')
of '\\': c.content.add('\\')
of 'N': c.content.add(UTF8NextLine)
of '_': c.content.add(UTF8NonBreakingSpace)
of 'L': c.content.add(UTF8LineSeparator)
of 'P': c.content.add(UTF8ParagraphSeparator)
of 'x': c.content.add(c.lexer.unicodeSequence(2))
of 'u': c.content.add(c.lexer.unicodeSequence(4))
of 'U': c.content.add(c.lexer.unicodeSequence(8))
of '\l', '\c':
var newlines = 0
c.processQuotedWhitespace(newlines)
continue
else: raise c.lexer.generateError("Illegal character in escape sequence")
of '"':
c.lexer.bufpos.inc()
break
of '\l', '\c', '\t', ' ':
var newlines = 1
c.processQuotedWhitespace(newlines)
continue
else: c.content.add(ch)
c.lexer.bufpos.inc()
# TODO: {.raises: [].}
proc singleQuotedScalar(c: ParserContext) =
debug("lex: singleQuotedScalar")
c.lexer.bufpos.inc()
while true:
case c.lexer.buf[c.lexer.bufpos]
of '\'':
c.lexer.bufpos.inc()
if c.lexer.buf[c.lexer.bufpos] == '\'': c.content.add('\'')
else: break
of EndOfFile: raise c.lexer.generateError("Unfinished single quoted string")
of '\l', '\c', '\t', ' ':
var newlines = 1
c.processQuotedWhitespace(newlines)
continue
else: c.content.add(c.lexer.buf[c.lexer.bufpos])
c.lexer.bufpos.inc()
proc isPlainSafe(lexer: BaseLexer, index: int, context: YamlContext): bool
{.raises: [].} =
case lexer.buf[lexer.bufpos + 1]
of spaceOrLineEnd: result = false
of flowIndicators: result = context == cBlock
else: result = true
# tried this for performance optimization, but it didn't optimize any
# performance. keeping it around for future reference.
#const
# plainCharOut = {'!', '\"', '$'..'9', ';'..'\xFF'}
# plainCharIn = {'!', '\"', '$'..'+', '-'..'9', ';'..'Z', '\\', '^'..'z',
# '|', '~'..'\xFF'}
#template isPlainChar(c: char, context: YamlContext): bool =
# when context == cBlock: c in plainCharOut
# else: c in plainCharIn
proc plainScalar(c: ParserContext, context: static[YamlContext])
{.raises: [].} =
debug("lex: plainScalar")
c.content.add(c.lexer.buf[c.lexer.bufpos])
block outer:
while true:
c.lexer.bufpos.inc()
let ch = c.lexer.buf[c.lexer.bufpos]
case ch
of ' ', '\t':
c.after.setLen(1)
c.after[0] = ch
while true:
c.lexer.bufpos.inc()
let ch2 = c.lexer.buf[c.lexer.bufpos]
case ch2
of ' ', '\t': c.after.add(ch2)
of lineEnd: break outer
of ':':
if c.lexer.isPlainSafe(c.lexer.bufpos + 1, context):
c.content.add(c.after & ':')
break
else: break outer
of '#': break outer
of flowIndicators:
if context == cBlock:
c.content.add(c.after)
c.content.add(ch2)
break
else: break outer
else:
c.content.add(c.after)
c.content.add(ch2)
break
of flowIndicators:
when context == cFlow: break
else: c.content.add(ch)
of lineEnd: break
of ':':
if c.lexer.isPlainSafe(c.lexer.bufpos + 1, context): c.content.add(':')
else: break outer
else: c.content.add(ch)
debug("lex: \"" & c.content & '\"')
proc continueMultilineScalar(c: ParserContext) {.raises: [].} =
c.content.add(if c.newlines == 1: " " else: repeat('\l', c.newlines - 1))
c.startToken()
c.plainScalar(cBlock)
c.lex.buf.add(if c.newlines == 1: " " else: repeat('\l', c.newlines - 1))
c.newlines = 0
template startScalar(t: ScalarType) {.dirty.} =
c.newlines = 0
c.level.kind = fplScalar
c.scalarType = t
proc blockScalarHeader(c: ParserContext): bool =
debug("lex: blockScalarHeader")
c.chomp = ctClip
c.level.indentation = UnknownIndentation
if c.tag == yTagQuestionMark: c.tag = yTagExclamationMark
let t = if c.lexer.buf[c.lexer.bufpos] == '|': stLiteral else: stFolded
while true:
c.lexer.bufpos.inc()
case c.lexer.buf[c.lexer.bufpos]
of '+':
if c.chomp != ctClip:
raise c.lexer.generateError("Only one chomping indicator is allowed")
c.chomp = ctKeep
of '-':
if c.chomp != ctClip:
raise c.lexer.generateError("Only one chomping indicator is allowed")
c.chomp = ctStrip
of '1'..'9':
if c.level.indentation != UnknownIndentation:
raise c.lexer.generateError("Only one p.indentation indicator is allowed")
c.level.indentation = c.ancestry[c.ancestry.high].indentation +
ord(c.lexer.buf[c.lexer.bufpos]) - ord('\x30')
of spaceOrLineEnd: break
else:
raise c.lexer.generateError(
"Illegal character in block scalar header: '" &
c.lexer.buf[c.lexer.bufpos] & "'")
c.recentWasMoreIndented = false
c.lineEnding()
result = c.handleLineEnd(true)
if not result:
startScalar(t)
c.content.reset()
proc blockScalarLine(c: ParserContext):
bool {.raises: [YamlParserError].} =
debug("lex: blockScalarLine")
result = false
if c.level.indentation == UnknownIndentation:
if c.lexer.buf[c.lexer.bufpos] in lineEnd:
return c.handleLineEnd(true)
else:
c.level.indentation = c.indentation
c.content.addMultiple('\l', c.newlines)
elif c.indentation > c.level.indentation or
c.lexer.buf[c.lexer.bufpos] == '\t':
c.content.addMultiple('\l', c.newlines)
c.recentWasMoreIndented = true
c.content.addMultiple(' ', c.indentation - c.level.indentation)
elif c.scalarType == stFolded:
if c.recentWasMoreIndented:
c.recentWasMoreIndented = false
c.newlines.inc()
if c.newlines == 0: discard
elif c.newlines == 1: c.content.add(' ')
else: c.content.addMultiple('\l', c.newlines - 1)
else: c.content.addMultiple('\l', c.newlines)
c.newlines = 0
while c.lexer.buf[c.lexer.bufpos] notin lineEnd:
c.content.add(c.lexer.buf[c.lexer.bufpos])
c.lexer.bufpos.inc()
result = c.handleLineEnd(true)
proc tagHandle(c: ParserContext, shorthandEnd: var int)
{.raises: [YamlParserError].} =
debug("lex: tagHandle")
shorthandEnd = 0
c.content.add(c.lexer.buf[c.lexer.bufpos])
var i = 0
while true:
c.lexer.bufpos.inc()
i.inc()
let ch = c.lexer.buf[c.lexer.bufpos]
case ch
of spaceOrLineEnd:
if shorthandEnd == -1:
raise c.lexer.generateError("Unclosed verbatim tag")
break
of '!':
if shorthandEnd == -1 and i == 2:
c.content.add(ch)
continue
elif shorthandEnd != 0:
raise c.lexer.generateError("Illegal character in tag suffix")
shorthandEnd = i
c.content.add(ch)
of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':', '@', '&',
'-', '=', '+', '$', '_', '.', '~', '*', '\'', '(', ')':
c.content.add(ch)
of ',':
if shortHandEnd > 0: break # ',' after shorthand is flow indicator
c.content.add(ch)
of '<':
if i == 1:
shorthandEnd = -1
c.content.reset()
else: raise c.lexer.generateError("Illegal character in tag handle")
of '>':
if shorthandEnd == -1:
c.lexer.bufpos.inc()
if c.lexer.buf[c.lexer.bufpos] notin spaceOrLineEnd:
raise c.lexer.generateError("Missing space after verbatim tag handle")
break
else: raise c.lexer.generateError("Illegal character in tag handle")
of '%':
if shorthandEnd != 0: c.content.add(c.lexer.byteSequence())
else: raise c.lexer.generateError("Illegal character in tag handle")
else: raise c.lexer.generateError("Illegal character in tag handle")
proc handleTagHandle(c: ParserContext) {.raises: [YamlParserError].} =
c.startToken()
if c.level.kind != fplUnknown: raise c.generateError("Unexpected tag handle")
if c.tag != yTagQuestionMark:
raise c.generateError("Only one tag handle is allowed per node")
c.content.reset()
var
shorthandEnd: int
c.tagHandle(shorthandEnd)
if shorthandEnd != -1:
if c.lex.cur == ltTagHandle:
var tagUri = ""
try:
c.tagUri.reset()
c.tagUri.add(c.shorthands[c.content[0..shorthandEnd]])
c.tagUri.add(c.content[shorthandEnd + 1 .. ^1])
tagUri.add(c.shorthands[c.lex.buf[0..c.lex.shorthandEnd]])
tagUri.add(c.lex.buf[c.lex.shorthandEnd + 1 .. ^1])
except KeyError:
raise c.generateError(
"Undefined tag shorthand: " & c.content[0..shorthandEnd])
"Undefined tag shorthand: " & c.lex.buf[0..c.lex.shorthandEnd])
try: c.tag = c.p.tagLib.tags[c.tagUri]
except KeyError: c.tag = c.p.tagLib.registerUri(c.tagUri)
try: c.tag = c.p.tagLib.tags[tagUri]
except KeyError: c.tag = c.p.tagLib.registerUri(tagUri)
else:
try: c.tag = c.p.tagLib.tags[c.content]
except KeyError: c.tag = c.p.tagLib.registerUri(c.content)
try: c.tag = c.p.tagLib.tags[c.lex.buf]
except KeyError: c.tag = c.p.tagLib.registerUri(c.lex.buf)
proc consumeLineIfEmpty(c: ParserContext, newlines: var int): bool =
result = true
while true:
c.lexer.bufpos.inc()
case c.lexer.buf[c.lexer.bufpos]
of ' ', '\t': discard
of '\l':
c.lexer.lexLF()
break
of '\c':
c.lexer.lexCR()
break
of '#', EndOfFile:
c.lineEnding()
discard c.handleLineEnd(true)
break
else:
result = false
break
proc handlePossibleMapStart(c: ParserContext, e: var YamlStreamEvent,
flow: bool = false, single: bool = false): bool =
result = false
if c.level.indentation == UnknownIndentation:
if c.lex.isImplicitKeyStart():
e = c.objectStart(yamlStartMap, single)
result = true
var flowDepth = 0
var pos = c.lexer.bufpos
var recentJsonStyle = false
while pos < c.lexer.bufpos + 1024:
case c.lexer.buf[pos]
of ':':
if flowDepth == 0 and (c.lexer.buf[pos + 1] in spaceOrLineEnd or
recentJsonStyle):
e = c.objectStart(yamlStartMap, single)
result = true
break
of lineEnd: break
of '[', '{': flowDepth.inc()
of '}', ']':
flowDepth.inc(-1)
if flowDepth < 0: break
of '?', ',':
if flowDepth == 0: break
of '#':
if c.lexer.buf[pos - 1] in space: break
of '"':
pos.inc()
while c.lexer.buf[pos] notin {'"', EndOfFile, '\l', '\c'}:
if c.lexer.buf[pos] == '\\': pos.inc()
pos.inc()
if c.lexer.buf[pos] != '"': break
of '\'':
pos.inc()
while c.lexer.buf[pos] notin {'\'', '\l', '\c', EndOfFile}:
pos.inc()
of '&', '*', '!':
if pos == c.lexer.bufpos or c.lexer.buf[c.lexer.bufpos] in space:
pos.inc()
while c.lexer.buf[pos] notin spaceOrLineEnd:
pos.inc()
continue
else: discard
if flow and c.lexer.buf[pos] notin space:
recentJsonStyle = c.lexer.buf[pos] in {']', '}', '\'', '"'}
pos.inc()
if c.level.indentation == UnknownIndentation:
c.level.indentation = c.indentation
proc handleMapKeyIndicator(c: ParserContext, e: var YamlStreamEvent): bool =
result = false
c.startToken()
case c.level.kind
of fplUnknown:
e = c.objectStart(yamlStartMap)
result = true
of fplMapValue:
if c.level.indentation != c.indentation:
if c.level.indentation != c.lex.indentation:
raise c.generateError("Invalid p.indentation of map key indicator")
e = scalarEvent("", yTagQuestionMark, yAnchorNone)
result = true
@@ -789,7 +172,7 @@ proc handleMapKeyIndicator(c: ParserContext, e: var YamlStreamEvent): bool =
c.ancestry.add(c.level)
c.level = initLevel(fplUnknown)
of fplMapKey:
if c.level.indentation != c.indentation:
if c.level.indentation != c.lex.indentation:
raise c.generateError("Invalid p.indentation of map key indicator")
c.ancestry.add(c.level)
c.level = initLevel(fplUnknown)
@@ -800,25 +183,26 @@ proc handleMapKeyIndicator(c: ParserContext, e: var YamlStreamEvent): bool =
"Unexpected map key indicator (expected multiline scalar end)")
of fplSinglePairKey, fplSinglePairValue, fplDocument:
internalError("Unexpected level kind: " & $c.level.kind)
c.lexer.skipWhitespace()
c.indentation = c.lexer.getColNumber(c.lexer.bufpos)
# TODO: why was this there?
# c.lexer.skipWhitespace()
# c.indentation = c.lexer.getColNumber(c.lexer.bufpos)
proc handleBlockSequenceIndicator(c: ParserContext, e: var YamlStreamEvent):
bool =
result = false
c.startToken()
case c.level.kind
of fplUnknown:
e = c.objectStart(yamlStartSeq)
result = true
of fplSequence:
if c.level.indentation != c.indentation:
if c.level.indentation != c.lex.indentation:
raise c.generateError("Invalid p.indentation of block sequence indicator")
c.ancestry.add(c.level)
c.level = initLevel(fplUnknown)
else: raise c.generateError("Illegal sequence item in map")
c.lexer.skipWhitespace()
c.indentation = c.lexer.getColNumber(c.lexer.bufpos)
# TODO: why was this there?
# c.lexer.skipWhitespace()
# c.indentation = c.lexer.getColNumber(c.lexer.bufpos)
proc handleBlockItemStart(c: ParserContext, e: var YamlStreamEvent): bool =
result = false
@@ -846,42 +230,10 @@ proc handleFlowItemStart(c: ParserContext, e: var YamlStreamEvent): bool =
result = c.handlePossibleMapStart(e, true, true)
proc handleFlowPlainScalar(c: ParserContext, e: var YamlStreamEvent) =
while c.lex.cur in {ltScalarPart, ltEmptyLine}:
c.lex.newlines.inc()
c.lex.next()
c.lex.newlines = 0
c.content.reset()
c.startToken()
c.plainScalar(cFlow)
if c.lexer.buf[c.lexer.bufpos] in {'{', '}', '[', ']', ',', ':', '#'}:
discard
else:
c.newlines = 0
while true:
case c.lexer.buf[c.lexer.bufpos]
of ':':
if c.lexer.isPlainSafe(c.lexer.bufpos + 1, cFlow):
if c.newlines == 1:
c.content.add(' ')
c.newlines = 0
elif c.newlines > 1:
c.content.addMultiple(' ', c.newlines - 1)
c.newlines = 0
c.plainScalar(cFlow)
break
of '#', EndOfFile: break
of '\l':
c.lexer.bufpos = c.lexer.handleLF(c.lexer.bufpos)
c.newlines.inc()
of '\c':
c.lexer.bufpos = c.lexer.handleCR(c.lexer.bufpos)
c.newlines.inc()
of flowIndicators: break
of ' ', '\t': c.lexer.skipWhitespace()
else:
if c.newlines == 1:
c.content.add(' ')
c.newlines = 0
elif c.newlines > 1:
c.content.addMultiple(' ', c.newlines - 1)
c.newlines = 0
c.plainScalar(cFlow)
e = c.currentScalar()
# --- macros for defining parser states ---
@@ -965,7 +317,7 @@ parserStates(initial, blockObjectStart, blockAfterPlainScalar, blockAfterObject,
leaveFlowSinglePairMap)
proc closeEverything(c: ParserContext) =
c.indentation = -1
c.lex.indentation = -1
c.nextImpl = stateCloseMoreIndentedLevels
c.atSequenceItem = false
@@ -1068,59 +420,42 @@ proc leaveFlowLevel(c: ParserContext, e: var YamlStreamEvent): bool =
c.nextImpl = stateObjectEnd
parserState initial:
case c.lexer.buf[c.lexer.bufpos]
of '%':
var ld: LexedDirective
c.startToken()
c.lexer.directiveName(ld)
case ld
of ldYaml:
var version = ""
c.startToken()
c.lexer.yamlVersion(version)
if version != "1.2":
c.callCallback("Version is not 1.2, but " & version)
c.lineEnding()
discard c.handleLineEnd(true)
of ldTag:
var shorthand = ""
c.tagUri.reset()
c.startToken()
c.lexer.tagShorthand(shorthand)
c.lexer.tagUriMapping(c.tagUri)
c.shorthands[shorthand] = c.tagUri
c.lineEnding()
discard c.handleLineEnd(true)
of ldUnknown:
c.callCallback("Unknown directive")
c.lexer.finishLine()
discard c.handleLineEnd(true)
of ' ', '\t':
if not c.consumeLineIfEmpty(c.newlines):
c.indentation = c.lexer.getColNumber(c.lexer.bufpos)
e = startDocEvent()
result = true
state = blockObjectStart
of '\l': c.lexer.lexLF()
of '\c': c.lexer.lexCR()
of EndOfFile: c.isFinished = true
of '#':
c.lineEnding()
discard c.handleLineEnd(true)
of '-':
var success: bool
c.startToken()
c.lexer.directivesEndMarker(success)
if success: c.lexer.bufpos.inc(3)
c.lex.next()
case c.lex.cur
of ltYamlDirective:
c.lex.next()
assert c.lex.cur == ltYamlVersion
if c.lex.buf != "1.2":
c.callCallback("Version is not 1.2, but " & c.lex.buf)
of ltTagDirective:
c.lex.next()
assert c.lex.cur == ltTagShorthand
var tagShorthand: string
shallowCopy(tagShorthand, c.lex.buf)
c.lex.buf = ""
c.lex.next()
assert c.lex.cur == ltTagUri
c.shorthands[tagShorthand] = c.lex.buf
c.lex.buf.setLen(0)
of ltUnknownDirective:
c.callCallback("Unknown directive: " & c.lex.buf)
c.lex.buf.setLen(0)
c.lex.next()
assert c.lex.cur == ltUnknownDirectiveParams
of ltIndentation:
e = startDocEvent()
result = true
state = blockObjectStart
else:
of ltStreamEnd: c.isFinished = true
of ltDirectivesEnd:
e = startDocEvent()
result = true
state = blockObjectStart
else: internalError("Unexpected lexer token: " & $c.lex.cur)
parserState blockObjectStart:
c.next()
c.lexer.skipIndentation()
c.indentation = c.lexer.getColNumber(c.lexer.bufpos)
if c.indentation == 0:
@@ -1153,8 +488,6 @@ parserState blockObjectStart:
stored = afterDocument
return false
else:
c.atSequenceItem = c.lexer.buf[c.lexer.bufpos] == '-' and
not c.lexer.isPlainSafe(c.lexer.bufpos + 1, cBlock)
state = closeMoreIndentedLevels
stored = blockObjectStart
return false
@@ -1808,15 +1141,19 @@ parserState flowAfterObject:
# --- parser initialization ---
proc parse*(p: YamlParser, s: Stream): YamlStream =
result = new(ParserContext)
let c = ParserContext(result)
c.content = ""
c.after = ""
c.tagUri = ""
c.ancestry = newSeq[FastParseLevel]()
proc init(c: ParserContext, p: YamlParser) =
c.p = p
try: p.lexer.open(s)
c.ancestry = newSeq[FastParseLevel]()
c.initDocValues()
c.flowdepth = 0
c.isFinished = false
c.peeked = false
c.nextImpl = stateInitial
proc parse*(p: YamlParser, s: Stream): YamlStream =
let c = new(ParserContext)
c.init(p)
try: c.lex = newYamlLexer(s)
except:
let e = newException(YamlParserError,
"Error while opening stream: " & getCurrentExceptionMsg())
@@ -1825,9 +1162,10 @@ proc parse*(p: YamlParser, s: Stream): YamlStream =
e.column = 1
e.lineContent = ""
raise e
c.initDocValues()
c.atSequenceItem = false
c.flowdepth = 0
result.isFinished = false
result.peeked = false
result.nextImpl = stateInitial
result = c

proc parse*(p: YamlParser, str: string): YamlStream =
let c = new(ParserContext)
c.init(p)
c.lex = newYamlLexer(str)
result = c
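
As a usage sketch (not part of this commit): the stream-based parse overload keeps its existing signature, while the string-based overload added above can be called directly on a YAML string. The lines below only assume the public API shown in this diff (newYamlParser and the two parse overloads) and are purely illustrative.

# Illustrative only; assumes the API as shown in this commit.
import streams, yaml
let parser = newYamlParser()
let eventsFromStream = parser.parse(newStringStream("[a, b]"))  # Stream-based overload
let eventsFromString = parser.parse("key: value")               # new string-based overload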


@@ -17,7 +17,7 @@
## this enhances interoperability with other languages.
import streams, unicode, lexbase, tables, strutils, json, hashes, queues,
macros, typetraits, parseutils
macros, typetraits, parseutils, private/lex
export streams, tables, json
when defined(yamlDebug): import terminal
@@ -143,14 +143,6 @@ type
## ``1.2``.
## - If there is an unknown directive encountered.
FastParseLevelKind = enum
fplUnknown, fplSequence, fplMapKey, fplMapValue, fplSinglePairKey,
fplSinglePairValue, fplScalar, fplDocument
FastParseLevel = object
kind: FastParseLevelKind
indentation: int
YamlParser* = ref object
## A parser object. Retains its ``TagLibrary`` across calls to
## `parse <#parse,YamlParser,Stream>`_. Can be used
@@ -160,8 +152,6 @@ type
tagLib: TagLibrary
callback: WarningCallback
anchors: Table[string, AnchorId]
lexer: BaseLexer
tokenstart: int
PresentationStyle* = enum
## Different styles for YAML character stream output.