From 7376af7d6f676754d99f3b9188c7ac8dcf7189f7 Mon Sep 17 00:00:00 2001
From: Felix Krause
Date: Mon, 12 Sep 2016 18:04:26 +0200
Subject: [PATCH] started patching parse.nim

---
 private/parse.nim | 840 +++++----------------------------------
 yaml.nim          |  12 +-
 2 files changed, 90 insertions(+), 762 deletions(-)

diff --git a/private/parse.nim b/private/parse.nim
index 9bad0ef..13aba4b 100644
--- a/private/parse.nim
+++ b/private/parse.nim
@@ -5,70 +5,37 @@
 #    distribution, for details about the copyright.
 
 type
-  ScalarType = enum
-    stFlow, stLiteral, stFolded
+  FastParseLevelKind = enum
+    fplUnknown, fplSequence, fplMapKey, fplMapValue, fplSinglePairKey,
+    fplSinglePairValue, fplScalar, fplDocument
 
-  LexedDirective = enum
-    ldYaml, ldTag, ldUnknown
-
-  YamlContext = enum
-    cBlock, cFlow
-
-  ChompType = enum
-    ctKeep, ctClip, ctStrip
+  FastParseLevel = object
+    kind: FastParseLevelKind
+    indentation: int
 
   ParserContext = ref object of YamlStream
     p: YamlParser
+    lex: YamlLexer
     storedState: proc(s: YamlStream, e: var YamlStreamEvent): bool
-    scalarType: ScalarType
-    chomp: ChompType
     atSequenceItem: bool
-    recentWasMoreIndented: bool
     flowdepth: int
-    explicitFlowKey: bool
-    content, after: string
     ancestry: seq[FastParseLevel]
     level: FastParseLevel
-    tagUri: string
     tag: TagId
     anchor: AnchorId
     shorthands: Table[string, string]
     nextAnchorId: AnchorId
     newlines: int
-    indentation: int
 
   LevelEndResult = enum
     lerNothing, lerOne, lerAdditionalMapEnd
 
-const
-  space = {' ', '\t'}
-  lineEnd = {'\l', '\c', EndOfFile}
-  spaceOrLineEnd = {' ', '\t', '\l', '\c', EndOfFile}
-  digits = {'0'..'9'}
-  flowIndicators = {'[', ']', '{', '}', ','}
-
-  UTF8NextLine = toUTF8(0x85.Rune)
-  UTF8NonBreakingSpace = toUTF8(0xA0.Rune)
-  UTF8LineSeparator = toUTF8(0x2028.Rune)
-  UTF8ParagraphSeparator = toUTF8(0x2029.Rune)
-  UnknownIndentation = int.low
-
 proc newYamlParser*(tagLib: TagLibrary = initExtendedTagLibrary(),
                     callback: WarningCallback = nil): YamlParser =
   new(result)
   result.tagLib = tagLib
   result.callback = callback
 
-proc getLineNumber*(p: YamlParser): int = p.lexer.lineNumber
-
-proc getColNumber*(p: YamlParser): int = p.tokenstart + 1 # column is 1-based
-
-proc getLineContent*(p: YamlParser, marker: bool = true): string =
-  result = p.lexer.getCurrentLine(false)
-  if marker: result.add(repeat(' ', p.tokenstart) & "^\n")
-
-proc lexer(c: ParserContext): var BaseLexer {.inline.} = c.p.lexer
-
 template debug(message: string) {.dirty.} =
   when defined(yamlDebug):
     try: styledWriteLine(stdout, fgBlue, message)
@@ -77,31 +44,8 @@ template debug(message: string) {.dirty.} =
 proc generateError(c: ParserContext, message: string):
     ref YamlParserError {.raises: [].} =
   result = newException(YamlParserError, message)
-  result.line = c.lexer.lineNumber
-  result.column = c.p.tokenstart + 1
-  result.lineContent = c.p.getLineContent(true)
-
-proc generateError(lx: BaseLexer, message: string):
-    ref YamlParserError {.raises: [].} =
-  result = newException(YamlParserError, message)
-  result.line = lx.lineNumber
-  result.column = lx.bufpos + 1
-  result.lineContent = lx.getCurrentLine(false) &
-      repeat(' ', lx.getColNumber(lx.bufpos)) & "^\n"
-
-template lexCR(lexer: BaseLexer) {.dirty.} =
-  try: lexer.bufpos = lexer.handleCR(lexer.bufpos)
-  except:
-    var e = generateError(lexer, "I/O Error: " & getCurrentExceptionMsg())
-    e.parent = getCurrentException()
-    raise e
-
-template lexLF(lexer: BaseLexer) {.dirty.} =
-  try: lexer.bufpos = lexer.handleLF(lexer.bufpos)
-  except:
-    var e = generateError(lexer, "I/O Error: " &
getCurrentExceptionMsg()) - e.parent = getCurrentException() - raise e + (result.line, result.column) = c.lex.curStartPos + result.lineContent = c.lex.getTokenLine() proc callCallback(c: ParserContext, msg: string) {.raises: [YamlParserError].} = try: @@ -114,12 +58,6 @@ proc callCallback(c: ParserContext, msg: string) {.raises: [YamlParserError].} = e.parent = getCurrentException() raise e -proc addMultiple(s: var string, c: char, num: int) {.raises: [], inline.} = - for i in 1..num: - s.add(c) - -proc reset(buffer: var string) {.raises: [], inline.} = buffer.setLen(0) - proc initLevel(k: FastParseLevelKind): FastParseLevel {.raises: [], inline.} = FastParseLevel(kind: k, indentation: UnknownIndentation) @@ -130,18 +68,12 @@ proc emptyScalar(c: ParserContext): YamlStreamEvent {.raises: [], inline.} = proc currentScalar(c: ParserContext): YamlStreamEvent {.raises: [], inline.} = result = YamlStreamEvent(kind: yamlScalar, scalarTag: c.tag, - scalarAnchor: c.anchor, scalarContent: c.content) + scalarAnchor: c.anchor) + shallowCopy(result.scalarContent, c.lex.buf) + c.lex.buf = newStringOfCap(256) c.tag = yTagQuestionMark c.anchor = yAnchorNone -proc handleLineEnd(c: ParserContext, incNewlines: static[bool]): bool = - case c.lexer.buf[c.lexer.bufpos] - of '\l': c.lexer.lexLF() - of '\c': c.lexer.lexCR() - of EndOfFile: return true - else: discard - when incNewlines: c.newlines.inc() - proc objectStart(c: ParserContext, k: static[YamlStreamEventKind], single: bool = false): YamlStreamEvent {.raises: [].} = yAssert(c.level.kind == fplUnknown) @@ -181,607 +113,58 @@ proc initDocValues(c: ParserContext) {.raises: [].} = c.anchor = yAnchorNone c.ancestry.add(FastParseLevel(kind: fplDocument, indentation: -1)) -proc startToken(c: ParserContext) {.raises: [], inline.} = - c.p.tokenstart = c.lexer.getColNumber(c.lexer.bufpos) - -proc anchorName(c: ParserContext) {.raises: [].} = - debug("lex: anchorName") - while true: - c.lexer.bufpos.inc() - let ch = c.lexer.buf[c.lexer.bufpos] - case ch - of spaceOrLineEnd, '[', ']', '{', '}', ',': break - else: c.content.add(ch) - proc handleAnchor(c: ParserContext) {.raises: [YamlParserError].} = - c.startToken() if c.level.kind != fplUnknown: raise c.generateError("Unexpected token") if c.anchor != yAnchorNone: raise c.generateError("Only one anchor is allowed per node") - c.content.reset() - c.anchorName() c.anchor = c.nextAnchorId - c.p.anchors[c.content] = c.anchor + c.p.anchors[c.lex.buf] = c.anchor c.nextAnchorId = AnchorId(int(c.nextAnchorId) + 1) - -proc finishLine(lexer: var BaseLexer) {.raises: [], inline.} = - debug("lex: finishLine") - while lexer.buf[lexer.bufpos] notin lineEnd: - lexer.bufpos.inc() - -proc skipWhitespace(lexer: var BaseLexer) {.raises: [], inline.} = - debug("lex: skipWhitespace") - while lexer.buf[lexer.bufpos] in space: lexer.bufpos.inc() - -# TODO: {.raises: [].} -proc skipWhitespaceCommentsAndNewlines(lexer: var BaseLexer) {.inline.} = - debug("lex: skipWhitespaceCommentsAndNewlines") - if lexer.buf[lexer.bufpos] != '#': - while true: - case lexer.buf[lexer.bufpos] - of space: lexer.bufpos.inc() - of '\l': lexer.lexLF() - of '\c': lexer.lexCR() - of '#': # also skip comments - lexer.bufpos.inc() - while lexer.buf[lexer.bufpos] notin lineEnd: - lexer.bufpos.inc() - else: break - -proc skipIndentation(lexer: var BaseLexer) {.raises: [], inline.} = - debug("lex: skipIndentation") - while lexer.buf[lexer.bufpos] == ' ': lexer.bufpos.inc() - -proc directiveName(lexer: var BaseLexer, directive: var LexedDirective) - {.raises: [].} = 
- debug("lex: directiveName") - directive = ldUnknown - lexer.bufpos.inc() - if lexer.buf[lexer.bufpos] == 'Y': - lexer.bufpos.inc() - if lexer.buf[lexer.bufpos] == 'A': - lexer.bufpos.inc() - if lexer.buf[lexer.bufpos] == 'M': - lexer.bufpos.inc() - if lexer.buf[lexer.bufpos] == 'L': - lexer.bufpos.inc() - if lexer.buf[lexer.bufpos] in spaceOrLineEnd: - directive = ldYaml - elif lexer.buf[lexer.bufpos] == 'T': - lexer.bufpos.inc() - if lexer.buf[lexer.bufpos] == 'A': - lexer.bufpos.inc() - if lexer.buf[lexer.bufpos] == 'G': - lexer.bufpos.inc() - if lexer.buf[lexer.bufpos] in spaceOrLineEnd: - directive = ldTag - while lexer.buf[lexer.bufpos] notin spaceOrLineEnd: - lexer.bufpos.inc() - -proc yamlVersion(lexer: var BaseLexer, o: var string) - {.raises: [YamlParserError], inline.} = - debug("lex: yamlVersion") - while lexer.buf[lexer.bufpos] in space: lexer.bufpos.inc() - var c = lexer.buf[lexer.bufpos] - if c notin digits: raise lexer.generateError("Invalid YAML version number") - o.add(c) - lexer.bufpos.inc() - c = lexer.buf[lexer.bufpos] - while c in digits: - lexer.bufpos.inc() - o.add(c) - c = lexer.buf[lexer.bufpos] - if lexer.buf[lexer.bufpos] != '.': - raise lexer.generateError("Invalid YAML version number") - o.add('.') - lexer.bufpos.inc() - c = lexer.buf[lexer.bufpos] - if c notin digits: raise lexer.generateError("Invalid YAML version number") - o.add(c) - lexer.bufpos.inc() - c = lexer.buf[lexer.bufpos] - while c in digits: - o.add(c) - lexer.bufpos.inc() - c = lexer.buf[lexer.bufpos] - if lexer.buf[lexer.bufpos] notin spaceOrLineEnd: - raise lexer.generateError("Invalid YAML version number") - -proc lineEnding(c: ParserContext) {.raises: [YamlParserError], inline.} = - debug("lex: lineEnding") - if c.lexer.buf[c.lexer.bufpos] notin lineEnd: - while c.lexer.buf[c.lexer.bufpos] in space: c.lexer.bufpos.inc() - if c.lexer.buf[c.lexer.bufpos] in lineEnd: discard - elif c.lexer.buf[c.lexer.bufpos] == '#': - while c.lexer.buf[c.lexer.bufpos] notin lineEnd: c.lexer.bufpos.inc() - else: - c.startToken() - raise c.generateError("Unexpected token (expected comment or line end)") - -proc tagShorthand(lexer: var BaseLexer, shorthand: var string) {.inline.} = - debug("lex: tagShorthand") - while lexer.buf[lexer.bufpos] in space: lexer.bufpos.inc() - yAssert lexer.buf[lexer.bufpos] == '!' - shorthand.add('!') - lexer.bufpos.inc() - var ch = lexer.buf[lexer.bufpos] - if ch in spaceOrLineEnd: discard - else: - while ch != '!': - case ch - of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '-': - shorthand.add(ch) - lexer.bufpos.inc() - ch = lexer.buf[lexer.bufpos] - else: raise lexer.generateError("Illegal character in tag shorthand") - shorthand.add(ch) - lexer.bufpos.inc() - if lexer.buf[lexer.bufpos] notin spaceOrLineEnd: - raise lexer.generateError("Missing space after tag shorthand") - -proc tagUriMapping(lexer: var BaseLexer, uri: var string) - {.raises: [YamlParserError].} = - debug("lex: tagUriMapping") - while lexer.buf[lexer.bufpos] in space: - lexer.bufpos.inc() - var ch = lexer.buf[lexer.bufpos] - if ch == '!': - uri.add(ch) - lexer.bufpos.inc() - ch = lexer.buf[lexer.bufpos] - while ch notin spaceOrLineEnd: - case ch - of 'a' .. 'z', 'A' .. 'Z', '0' .. 
'9', '#', ';', '/', '?', ':', '@', '&', - '-', '=', '+', '$', ',', '_', '.', '~', '*', '\'', '(', ')': - uri.add(ch) - lexer.bufpos.inc() - ch = lexer.buf[lexer.bufpos] - else: raise lexer.generateError("Invalid tag uri") - -proc directivesEndMarker(lexer: var BaseLexer, success: var bool) - {.raises: [].} = - debug("lex: directivesEndMarker") - success = true - for i in 0..2: - if lexer.buf[lexer.bufpos + i] != '-': - success = false - break - if success: success = lexer.buf[lexer.bufpos + 3] in spaceOrLineEnd - -proc documentEndMarker(lexer: var BaseLexer, success: var bool) {.raises: [].} = - debug("lex: documentEndMarker") - success = true - for i in 0..2: - if lexer.buf[lexer.bufpos + i] != '.': - success = false - break - if success: success = lexer.buf[lexer.bufpos + 3] in spaceOrLineEnd - -proc unicodeSequence(lexer: var BaseLexer, length: int): - string {.raises: [YamlParserError].} = - debug("lex: unicodeSequence") - var unicodeChar = 0.int - for i in countup(0, length - 1): - lexer.bufpos.inc() - let - digitPosition = length - i - 1 - ch = lexer.buf[lexer.bufpos] - case ch - of EndOFFile, '\l', '\c': - raise lexer.generateError("Unfinished unicode escape sequence") - of '0' .. '9': - unicodeChar = unicodechar or (int(ch) - 0x30) shl (digitPosition * 4) - of 'A' .. 'F': - unicodeChar = unicodechar or (int(ch) - 0x37) shl (digitPosition * 4) - of 'a' .. 'f': - unicodeChar = unicodechar or (int(ch) - 0x57) shl (digitPosition * 4) - else: - raise lexer.generateError( - "Invalid character in unicode escape sequence") - return toUTF8(Rune(unicodeChar)) - -proc byteSequence(lexer: var BaseLexer): char {.raises: [YamlParserError].} = - debug("lex: byteSequence") - var charCode = 0.int8 - for i in 0 .. 1: - lexer.bufpos.inc() - let - digitPosition = int8(1 - i) - ch = lexer.buf[lexer.bufpos] - case ch - of EndOfFile, '\l', 'r': - raise lexer.generateError("Unfinished octet escape sequence") - of '0' .. '9': - charCode = charCode or (int8(ch) - 0x30.int8) shl (digitPosition * 4) - of 'A' .. 'F': - charCode = charCode or (int8(ch) - 0x37.int8) shl (digitPosition * 4) - of 'a' .. 
'f': - charCode = charCode or (int8(ch) - 0x57.int8) shl (digitPosition * 4) - else: - raise lexer.generateError("Invalid character in octet escape sequence") - return char(charCode) - -# TODO: {.raises: [].} -proc processQuotedWhitespace(c: ParserContext, newlines: var int) = - c.after.reset() - block outer: - while true: - case c.lexer.buf[c.lexer.bufpos] - of ' ', '\t': c.after.add(c.lexer.buf[c.lexer.bufpos]) - of '\l': - c.lexer.bufpos = c.lexer.handleLF(c.lexer.bufpos) - break - of '\c': - c.lexer.bufpos = c.lexer.handleLF(c.lexer.bufpos) - break - else: - c.content.add(c.after) - break outer - c.lexer.bufpos.inc() - while true: - case c.lexer.buf[c.lexer.bufpos] - of ' ', '\t': discard - of '\l': - c.lexer.lexLF() - newlines.inc() - continue - of '\c': - c.lexer.lexCR() - newlines.inc() - continue - else: - if newlines == 0: discard - elif newlines == 1: c.content.add(' ') - else: c.content.addMultiple('\l', newlines - 1) - break - c.lexer.bufpos.inc() - -# TODO: {.raises: [YamlParserError].} -proc doubleQuotedScalar(c: ParserContext) = - debug("lex: doubleQuotedScalar") - c.lexer.bufpos.inc() - while true: - var ch = c.lexer.buf[c.lexer.bufpos] - case ch - of EndOfFile: - raise c.lexer.generateError("Unfinished double quoted string") - of '\\': - c.lexer.bufpos.inc() - case c.lexer.buf[c.lexer.bufpos] - of EndOfFile: - raise c.lexer.generateError("Unfinished escape sequence") - of '0': c.content.add('\0') - of 'a': c.content.add('\x07') - of 'b': c.content.add('\x08') - of '\t', 't': c.content.add('\t') - of 'n': c.content.add('\l') - of 'v': c.content.add('\v') - of 'f': c.content.add('\f') - of 'r': c.content.add('\c') - of 'e': c.content.add('\e') - of ' ': c.content.add(' ') - of '"': c.content.add('"') - of '/': c.content.add('/') - of '\\': c.content.add('\\') - of 'N': c.content.add(UTF8NextLine) - of '_': c.content.add(UTF8NonBreakingSpace) - of 'L': c.content.add(UTF8LineSeparator) - of 'P': c.content.add(UTF8ParagraphSeparator) - of 'x': c.content.add(c.lexer.unicodeSequence(2)) - of 'u': c.content.add(c.lexer.unicodeSequence(4)) - of 'U': c.content.add(c.lexer.unicodeSequence(8)) - of '\l', '\c': - var newlines = 0 - c.processQuotedWhitespace(newlines) - continue - else: raise c.lexer.generateError("Illegal character in escape sequence") - of '"': - c.lexer.bufpos.inc() - break - of '\l', '\c', '\t', ' ': - var newlines = 1 - c.processQuotedWhitespace(newlines) - continue - else: c.content.add(ch) - c.lexer.bufpos.inc() - -# TODO: {.raises: [].} -proc singleQuotedScalar(c: ParserContext) = - debug("lex: singleQuotedScalar") - c.lexer.bufpos.inc() - while true: - case c.lexer.buf[c.lexer.bufpos] - of '\'': - c.lexer.bufpos.inc() - if c.lexer.buf[c.lexer.bufpos] == '\'': c.content.add('\'') - else: break - of EndOfFile: raise c.lexer.generateError("Unfinished single quoted string") - of '\l', '\c', '\t', ' ': - var newlines = 1 - c.processQuotedWhitespace(newlines) - continue - else: c.content.add(c.lexer.buf[c.lexer.bufpos]) - c.lexer.bufpos.inc() - -proc isPlainSafe(lexer: BaseLexer, index: int, context: YamlContext): bool - {.raises: [].} = - case lexer.buf[lexer.bufpos + 1] - of spaceOrLineEnd: result = false - of flowIndicators: result = context == cBlock - else: result = true - -# tried this for performance optimization, but it didn't optimize any -# performance. keeping it around for future reference. 
-#const -# plainCharOut = {'!', '\"', '$'..'9', ';'..'\xFF'} -# plainCharIn = {'!', '\"', '$'..'+', '-'..'9', ';'..'Z', '\\', '^'..'z', -# '|', '~'..'\xFF'} -#template isPlainChar(c: char, context: YamlContext): bool = -# when context == cBlock: c in plainCharOut -# else: c in plainCharIn - -proc plainScalar(c: ParserContext, context: static[YamlContext]) - {.raises: [].} = - debug("lex: plainScalar") - c.content.add(c.lexer.buf[c.lexer.bufpos]) - block outer: - while true: - c.lexer.bufpos.inc() - let ch = c.lexer.buf[c.lexer.bufpos] - case ch - of ' ', '\t': - c.after.setLen(1) - c.after[0] = ch - while true: - c.lexer.bufpos.inc() - let ch2 = c.lexer.buf[c.lexer.bufpos] - case ch2 - of ' ', '\t': c.after.add(ch2) - of lineEnd: break outer - of ':': - if c.lexer.isPlainSafe(c.lexer.bufpos + 1, context): - c.content.add(c.after & ':') - break - else: break outer - of '#': break outer - of flowIndicators: - if context == cBlock: - c.content.add(c.after) - c.content.add(ch2) - break - else: break outer - else: - c.content.add(c.after) - c.content.add(ch2) - break - of flowIndicators: - when context == cFlow: break - else: c.content.add(ch) - of lineEnd: break - of ':': - if c.lexer.isPlainSafe(c.lexer.bufpos + 1, context): c.content.add(':') - else: break outer - else: c.content.add(ch) - debug("lex: \"" & c.content & '\"') + c.lex.buf.setLen(0) proc continueMultilineScalar(c: ParserContext) {.raises: [].} = - c.content.add(if c.newlines == 1: " " else: repeat('\l', c.newlines - 1)) - c.startToken() - c.plainScalar(cBlock) + c.lex.buf.add(if c.newlines == 1: " " else: repeat('\l', c.newlines - 1)) + c.newlines = 0 template startScalar(t: ScalarType) {.dirty.} = c.newlines = 0 c.level.kind = fplScalar c.scalarType = t -proc blockScalarHeader(c: ParserContext): bool = - debug("lex: blockScalarHeader") - c.chomp = ctClip - c.level.indentation = UnknownIndentation - if c.tag == yTagQuestionMark: c.tag = yTagExclamationMark - let t = if c.lexer.buf[c.lexer.bufpos] == '|': stLiteral else: stFolded - while true: - c.lexer.bufpos.inc() - case c.lexer.buf[c.lexer.bufpos] - of '+': - if c.chomp != ctClip: - raise c.lexer.generateError("Only one chomping indicator is allowed") - c.chomp = ctKeep - of '-': - if c.chomp != ctClip: - raise c.lexer.generateError("Only one chomping indicator is allowed") - c.chomp = ctStrip - of '1'..'9': - if c.level.indentation != UnknownIndentation: - raise c.lexer.generateError("Only one p.indentation indicator is allowed") - c.level.indentation = c.ancestry[c.ancestry.high].indentation + - ord(c.lexer.buf[c.lexer.bufpos]) - ord('\x30') - of spaceOrLineEnd: break - else: - raise c.lexer.generateError( - "Illegal character in block scalar header: '" & - c.lexer.buf[c.lexer.bufpos] & "'") - c.recentWasMoreIndented = false - c.lineEnding() - result = c.handleLineEnd(true) - if not result: - startScalar(t) - c.content.reset() - -proc blockScalarLine(c: ParserContext): - bool {.raises: [YamlParserError].} = - debug("lex: blockScalarLine") - result = false - if c.level.indentation == UnknownIndentation: - if c.lexer.buf[c.lexer.bufpos] in lineEnd: - return c.handleLineEnd(true) - else: - c.level.indentation = c.indentation - c.content.addMultiple('\l', c.newlines) - elif c.indentation > c.level.indentation or - c.lexer.buf[c.lexer.bufpos] == '\t': - c.content.addMultiple('\l', c.newlines) - c.recentWasMoreIndented = true - c.content.addMultiple(' ', c.indentation - c.level.indentation) - elif c.scalarType == stFolded: - if c.recentWasMoreIndented: - c.recentWasMoreIndented = 
false - c.newlines.inc() - if c.newlines == 0: discard - elif c.newlines == 1: c.content.add(' ') - else: c.content.addMultiple('\l', c.newlines - 1) - else: c.content.addMultiple('\l', c.newlines) - c.newlines = 0 - while c.lexer.buf[c.lexer.bufpos] notin lineEnd: - c.content.add(c.lexer.buf[c.lexer.bufpos]) - c.lexer.bufpos.inc() - result = c.handleLineEnd(true) - -proc tagHandle(c: ParserContext, shorthandEnd: var int) - {.raises: [YamlParserError].} = - debug("lex: tagHandle") - shorthandEnd = 0 - c.content.add(c.lexer.buf[c.lexer.bufpos]) - var i = 0 - while true: - c.lexer.bufpos.inc() - i.inc() - let ch = c.lexer.buf[c.lexer.bufpos] - case ch - of spaceOrLineEnd: - if shorthandEnd == -1: - raise c.lexer.generateError("Unclosed verbatim tag") - break - of '!': - if shorthandEnd == -1 and i == 2: - c.content.add(ch) - continue - elif shorthandEnd != 0: - raise c.lexer.generateError("Illegal character in tag suffix") - shorthandEnd = i - c.content.add(ch) - of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':', '@', '&', - '-', '=', '+', '$', '_', '.', '~', '*', '\'', '(', ')': - c.content.add(ch) - of ',': - if shortHandEnd > 0: break # ',' after shorthand is flow indicator - c.content.add(ch) - of '<': - if i == 1: - shorthandEnd = -1 - c.content.reset() - else: raise c.lexer.generateError("Illegal character in tag handle") - of '>': - if shorthandEnd == -1: - c.lexer.bufpos.inc() - if c.lexer.buf[c.lexer.bufpos] notin spaceOrLineEnd: - raise c.lexer.generateError("Missing space after verbatim tag handle") - break - else: raise c.lexer.generateError("Illegal character in tag handle") - of '%': - if shorthandEnd != 0: c.content.add(c.lexer.byteSequence()) - else: raise c.lexer.generateError("Illegal character in tag handle") - else: raise c.lexer.generateError("Illegal character in tag handle") - proc handleTagHandle(c: ParserContext) {.raises: [YamlParserError].} = - c.startToken() if c.level.kind != fplUnknown: raise c.generateError("Unexpected tag handle") if c.tag != yTagQuestionMark: raise c.generateError("Only one tag handle is allowed per node") - c.content.reset() - var - shorthandEnd: int - c.tagHandle(shorthandEnd) - if shorthandEnd != -1: + if c.lex.cur == ltTagHandle: + var tagUri = "" try: - c.tagUri.reset() - c.tagUri.add(c.shorthands[c.content[0..shorthandEnd]]) - c.tagUri.add(c.content[shorthandEnd + 1 .. ^1]) + tagUri.add(c.shorthands[c.lex.buf[0..c.lex.shorthandEnd]]) + tagUri.add(c.lex.buf[c.lex.shorthandEnd + 1 .. 
^1]) except KeyError: raise c.generateError( - "Undefined tag shorthand: " & c.content[0..shorthandEnd]) - try: c.tag = c.p.tagLib.tags[c.tagUri] - except KeyError: c.tag = c.p.tagLib.registerUri(c.tagUri) + "Undefined tag shorthand: " & c.lex.buf[0..c.lex.shorthandEnd]) + try: c.tag = c.p.tagLib.tags[tagUri] + except KeyError: c.tag = c.p.tagLib.registerUri(tagUri) else: - try: c.tag = c.p.tagLib.tags[c.content] - except KeyError: c.tag = c.p.tagLib.registerUri(c.content) - -proc consumeLineIfEmpty(c: ParserContext, newlines: var int): bool = - result = true - while true: - c.lexer.bufpos.inc() - case c.lexer.buf[c.lexer.bufpos] - of ' ', '\t': discard - of '\l': - c.lexer.lexLF() - break - of '\c': - c.lexer.lexCR() - break - of '#', EndOfFile: - c.lineEnding() - discard c.handleLineEnd(true) - break - else: - result = false - break + try: c.tag = c.p.tagLib.tags[c.lex.buf] + except KeyError: c.tag = c.p.tagLib.registerUri(c.lex.buf) proc handlePossibleMapStart(c: ParserContext, e: var YamlStreamEvent, flow: bool = false, single: bool = false): bool = result = false if c.level.indentation == UnknownIndentation: - var flowDepth = 0 - var pos = c.lexer.bufpos - var recentJsonStyle = false - while pos < c.lexer.bufpos + 1024: - case c.lexer.buf[pos] - of ':': - if flowDepth == 0 and (c.lexer.buf[pos + 1] in spaceOrLineEnd or - recentJsonStyle): - e = c.objectStart(yamlStartMap, single) - result = true - break - of lineEnd: break - of '[', '{': flowDepth.inc() - of '}', ']': - flowDepth.inc(-1) - if flowDepth < 0: break - of '?', ',': - if flowDepth == 0: break - of '#': - if c.lexer.buf[pos - 1] in space: break - of '"': - pos.inc() - while c.lexer.buf[pos] notin {'"', EndOfFile, '\l', '\c'}: - if c.lexer.buf[pos] == '\\': pos.inc() - pos.inc() - if c.lexer.buf[pos] != '"': break - of '\'': - pos.inc() - while c.lexer.buf[pos] notin {'\'', '\l', '\c', EndOfFile}: - pos.inc() - of '&', '*', '!': - if pos == c.lexer.bufpos or c.lexer.buf[c.lexer.bufpos] in space: - pos.inc() - while c.lexer.buf[pos] notin spaceOrLineEnd: - pos.inc() - continue - else: discard - if flow and c.lexer.buf[pos] notin space: - recentJsonStyle = c.lexer.buf[pos] in {']', '}', '\'', '"'} - pos.inc() - if c.level.indentation == UnknownIndentation: - c.level.indentation = c.indentation + if c.lex.isImplicitKeyStart(): + e = c.objectStart(yamlStartMap, single) + result = true proc handleMapKeyIndicator(c: ParserContext, e: var YamlStreamEvent): bool = result = false - c.startToken() case c.level.kind of fplUnknown: e = c.objectStart(yamlStartMap) result = true of fplMapValue: - if c.level.indentation != c.indentation: + if c.level.indentation != c.lex.indentation: raise c.generateError("Invalid p.indentation of map key indicator") e = scalarEvent("", yTagQuestionMark, yAnchorNone) result = true @@ -789,7 +172,7 @@ proc handleMapKeyIndicator(c: ParserContext, e: var YamlStreamEvent): bool = c.ancestry.add(c.level) c.level = initLevel(fplUnknown) of fplMapKey: - if c.level.indentation != c.indentation: + if c.level.indentation != c.lex.indentation: raise c.generateError("Invalid p.indentation of map key indicator") c.ancestry.add(c.level) c.level = initLevel(fplUnknown) @@ -800,25 +183,26 @@ proc handleMapKeyIndicator(c: ParserContext, e: var YamlStreamEvent): bool = "Unexpected map key indicator (expected multiline scalar end)") of fplSinglePairKey, fplSinglePairValue, fplDocument: internalError("Unexpected level kind: " & $c.level.kind) - c.lexer.skipWhitespace() - c.indentation = c.lexer.getColNumber(c.lexer.bufpos) + 
# TODO: why was this there? + # c.lexer.skipWhitespace() + # c.indentation = c.lexer.getColNumber(c.lexer.bufpos) proc handleBlockSequenceIndicator(c: ParserContext, e: var YamlStreamEvent): bool = result = false - c.startToken() case c.level.kind of fplUnknown: e = c.objectStart(yamlStartSeq) result = true of fplSequence: - if c.level.indentation != c.indentation: + if c.level.indentation != c.lex.indentation: raise c.generateError("Invalid p.indentation of block sequence indicator") c.ancestry.add(c.level) c.level = initLevel(fplUnknown) else: raise c.generateError("Illegal sequence item in map") - c.lexer.skipWhitespace() - c.indentation = c.lexer.getColNumber(c.lexer.bufpos) + # TODO: why was this there? + # c.lexer.skipWhitespace() + # c.indentation = c.lexer.getColNumber(c.lexer.bufpos) proc handleBlockItemStart(c: ParserContext, e: var YamlStreamEvent): bool = result = false @@ -846,42 +230,10 @@ proc handleFlowItemStart(c: ParserContext, e: var YamlStreamEvent): bool = result = c.handlePossibleMapStart(e, true, true) proc handleFlowPlainScalar(c: ParserContext, e: var YamlStreamEvent) = - c.content.reset() - c.startToken() - c.plainScalar(cFlow) - if c.lexer.buf[c.lexer.bufpos] in {'{', '}', '[', ']', ',', ':', '#'}: - discard - else: - c.newlines = 0 - while true: - case c.lexer.buf[c.lexer.bufpos] - of ':': - if c.lexer.isPlainSafe(c.lexer.bufpos + 1, cFlow): - if c.newlines == 1: - c.content.add(' ') - c.newlines = 0 - elif c.newlines > 1: - c.content.addMultiple(' ', c.newlines - 1) - c.newlines = 0 - c.plainScalar(cFlow) - break - of '#', EndOfFile: break - of '\l': - c.lexer.bufpos = c.lexer.handleLF(c.lexer.bufpos) - c.newlines.inc() - of '\c': - c.lexer.bufpos = c.lexer.handleCR(c.lexer.bufpos) - c.newlines.inc() - of flowIndicators: break - of ' ', '\t': c.lexer.skipWhitespace() - else: - if c.newlines == 1: - c.content.add(' ') - c.newlines = 0 - elif c.newlines > 1: - c.content.addMultiple(' ', c.newlines - 1) - c.newlines = 0 - c.plainScalar(cFlow) + while c.lex.cur in {ltScalarPart, ltEmptyLine}: + c.lex.newlines.inc() + c.lex.next() + c.lex.newlines = 0 e = c.currentScalar() # --- macros for defining parser states --- @@ -965,7 +317,7 @@ parserStates(initial, blockObjectStart, blockAfterPlainScalar, blockAfterObject, leaveFlowSinglePairMap) proc closeEverything(c: ParserContext) = - c.indentation = -1 + c.lex.indentation = -1 c.nextImpl = stateCloseMoreIndentedLevels c.atSequenceItem = false @@ -1068,59 +420,42 @@ proc leaveFlowLevel(c: ParserContext, e: var YamlStreamEvent): bool = c.nextImpl = stateObjectEnd parserState initial: - case c.lexer.buf[c.lexer.bufpos] - of '%': - var ld: LexedDirective - c.startToken() - c.lexer.directiveName(ld) - case ld - of ldYaml: - var version = "" - c.startToken() - c.lexer.yamlVersion(version) - if version != "1.2": - c.callCallback("Version is not 1.2, but " & version) - c.lineEnding() - discard c.handleLineEnd(true) - of ldTag: - var shorthand = "" - c.tagUri.reset() - c.startToken() - c.lexer.tagShorthand(shorthand) - c.lexer.tagUriMapping(c.tagUri) - c.shorthands[shorthand] = c.tagUri - c.lineEnding() - discard c.handleLineEnd(true) - of ldUnknown: - c.callCallback("Unknown directive") - c.lexer.finishLine() - discard c.handleLineEnd(true) - of ' ', '\t': - if not c.consumeLineIfEmpty(c.newlines): - c.indentation = c.lexer.getColNumber(c.lexer.bufpos) - e = startDocEvent() - result = true - state = blockObjectStart - of '\l': c.lexer.lexLF() - of '\c': c.lexer.lexCR() - of EndOfFile: c.isFinished = true - of '#': - 
c.lineEnding() - discard c.handleLineEnd(true) - of '-': - var success: bool - c.startToken() - c.lexer.directivesEndMarker(success) - if success: c.lexer.bufpos.inc(3) + c.lex.next() + case c.lex.cur + of ltYamlDirective: + c.lex.next() + assert c.lex.cur == ltYamlVersion + if c.lex.buf != "1.2": + c.callCallback("Version is not 1.2, but " & c.lex.buf) + of ltTagDirective: + c.lex.next() + assert c.lex.cur == ltTagShorthand + var tagShorthand: string + shallowCopy(tagShorthand, c.lex.buf) + c.lex.buf = "" + c.lex.next() + assert c.lex.cur == ltTagUri + c.shorthands[tagShorthand] = c.lex.buf + c.lex.buf.setLen(0) + of ltUnknownDirective: + c.callCallback("Unknown directive: " & c.lex.buf) + c.lex.buf.setLen(0) + c.lex.next() + assert c.lex.cur == ltUnknownDirectiveParams + of ltIndentation: e = startDocEvent() result = true state = blockObjectStart - else: + of ltStreamEnd: c.isFinished = true + of ltDirectivesEnd: e = startDocEvent() result = true state = blockObjectStart + else: internalError("Unexpected lexer token: " & $c.lex.cur) parserState blockObjectStart: + c.next() + c.lexer.skipIndentation() c.indentation = c.lexer.getColNumber(c.lexer.bufpos) if c.indentation == 0: @@ -1153,8 +488,6 @@ parserState blockObjectStart: stored = afterDocument return false else: - c.atSequenceItem = c.lexer.buf[c.lexer.bufpos] == '-' and - not c.lexer.isPlainSafe(c.lexer.bufpos + 1, cBlock) state = closeMoreIndentedLevels stored = blockObjectStart return false @@ -1808,15 +1141,19 @@ parserState flowAfterObject: # --- parser initialization --- -proc parse*(p: YamlParser, s: Stream): YamlStream = - result = new(ParserContext) - let c = ParserContext(result) - c.content = "" - c.after = "" - c.tagUri = "" - c.ancestry = newSeq[FastParseLevel]() +proc init(c: ParserContext, p: YamlParser) = c.p = p - try: p.lexer.open(s) + c.ancestry = newSeq[FastParseLevel]() + c.initDocValues() + c.flowdepth = 0 + c.isFinished = false + c.peeked = false + c.nextImpl = stateInitial + +proc parse*(p: YamlParser, s: Stream): YamlStream = + let c = new(ParserContext) + c.init(p) + try: c.lex = newYamlLexer(s) except: let e = newException(YamlParserError, "Error while opening stream: " & getCurrentExceptionMsg()) @@ -1825,9 +1162,10 @@ proc parse*(p: YamlParser, s: Stream): YamlStream = e.column = 1 e.lineContent = "" raise e - c.initDocValues() - c.atSequenceItem = false - c.flowdepth = 0 - result.isFinished = false - result.peeked = false - result.nextImpl = stateInitial + result = c + +proc parse*(p: YamlParser, str: string): YamlStream = + let c = new(ParserContext) + c.init(p) + c.lex = newYamlLexer(str) + result = c \ No newline at end of file diff --git a/yaml.nim b/yaml.nim index b215d33..6f2fdfa 100644 --- a/yaml.nim +++ b/yaml.nim @@ -17,7 +17,7 @@ ## this enhances interoperability with other languages. import streams, unicode, lexbase, tables, strutils, json, hashes, queues, - macros, typetraits, parseutils + macros, typetraits, parseutils, private/lex export streams, tables, json when defined(yamlDebug): import terminal @@ -143,14 +143,6 @@ type ## ``1.2``. ## - If there is an unknown directive encountered. - FastParseLevelKind = enum - fplUnknown, fplSequence, fplMapKey, fplMapValue, fplSinglePairKey, - fplSinglePairValue, fplScalar, fplDocument - - FastParseLevel = object - kind: FastParseLevelKind - indentation: int - YamlParser* = ref object ## A parser object. Retains its ``TagLibrary`` across calls to ## `parse <#parse,YamlParser,Stream>`_. 
Can be used
@@ -160,8 +152,6 @@ type
     tagLib: TagLibrary
     callback: WarningCallback
     anchors: Table[string, AnchorId]
-    lexer: BaseLexer
-    tokenstart: int
 
   PresentationStyle* = enum
     ## Different styles for YAML character stream output.