type
  # States of the `case state` loop inside `fastparse`.
  FastParseState = enum
    fpInitial, fpBlockLineStart, fpBlockAfterScalar, fpBlockAfterPlainScalar,
    fpBlockObjectStart, fpBlockContinueScalar, fpExpectDocEnd

  # What the parser currently knows about a nesting level. A level starts as
  # `fplUnknown`; `handleObjectEnd` toggles between `fplMapKey` and
  # `fplMapValue`.
  FastParseLevelKind = enum
    fplUnknown, fplSequence, fplMapKey, fplMapValue, fplScalar

  # One entry of the `ancestry` stack kept by `fastparse`.
  FastParseLevel = object
    kind: FastParseLevelKind
    indentation: int

  # Result of `directiveName`: `%YAML`, `%TAG`, or any other directive name.
  LexedDirective = enum
    ldYaml, ldTag, ldUnknown

  # Result of `directivesEnd`, which is called on a leading '-': a full
  # `---` marker, a lone '-' followed by whitespace/line end, or neither.
  LexedPossibleDirectivesEnd = enum
    lpdeDirectivesEnd, lpdeSequenceItem, lpdeScalarContent

  # Context passed to `plainScalar` / `isPlainSafe`.
  # NOTE(review): the names look like the context parameter `c` of the YAML
  # specification's productions - confirm before relying on that.
  YamlContext = enum
    cFlowIn, cFlowOut, cFlowKey, cBlockKey, cBlockIn, cBlockOut

  # `BaseLexer` extended with the column at which the current token started
  # (assigned from `getColNumber` by the lexer templates and reported as the
  # error column by `raiseError`).
  FastLexer = object of BaseLexer
    tokenstart: int
const
  # Character classes shared by the lexer templates. They are arrays because
  # the code uses them both with `in`/`notin` and as `of` branch labels.
  space = [' ', '\t']
  lineEnd = ['\x0A', '\c', EndOfFile]
  spaceOrLineEnd = [' ', '\t', '\x0A', '\c', EndOfFile]
  digits = '0'..'9'
  flowIndicators = ['[', ']', '{', '}', ',']

template debug(message: string) {.dirty.} =
  ## Writes `message` to stdout in blue, but only when compiled with
  ## ``-d:yamlDebug``. Output errors are ignored.
  when defined(yamlDebug):
    try: styledWriteLine(stdout, fgBlue, message)
    except IOError: discard

template raiseError(message: string, col: int) {.dirty.} =
  ## Raises a `YamlParserError` carrying the current line number, the given
  ## column and the current line followed by a `^` marker line.
  ## Expects a variable named `lexer` at the instantiation site.
  var e = newException(YamlParserError, message)
  e.line = lexer.lineNumber
  e.column = col
  e.lineContent = lexer.getCurrentLine(false) &
      repeat(' ', lexer.getColNumber(lexer.bufpos)) & "^\n"
  raise e

template raiseError(message: string) {.dirty.} =
  ## Same as the two-argument form, reporting the start column of the current
  ## token. Delegates so the exception is assembled in exactly one place.
  raiseError(message, lexer.tokenstart)

template closeLevel() {.dirty.} =
  ## Emits whatever event(s) finish the current `level`, then makes the
  ## innermost entry of `ancestry` (if any) the current level again.
  case level.kind
  of fplSequence:
    yield endSeqEvent()
  of fplMapKey:
    yield endMapEvent()
  of fplMapValue:
    # A key without a value: emit an empty scalar as the value, carrying any
    # pending node properties.
    yield scalarEvent("", tag, anchor)
    tag = yTagQuestionMark
    anchor = yAnchorNone
    yield endMapEvent()
  of fplScalar:
    applyObjectProperties()
    yield cachedScalar
  of fplUnknown:
    yield scalarEvent("")
  if ancestry.len > 0:
    level = ancestry.pop()

template handleLineEnd(insideDocument: bool) {.dirty.} =
  ## Consumes a line break at the current position. At end of input, when
  ## `insideDocument` is true, all open levels are closed and the enclosing
  ## iterator returns. Any other character is left untouched.
  case lexer.buf[lexer.bufpos]
  of '\x0A':
    lexer.bufpos = lexer.handleLF(lexer.bufpos)
  of '\c':
    lexer.bufpos = lexer.handleCR(lexer.bufpos)
  of EndOfFile:
    when insideDocument:
      closeEverything()
      return
  else:
    discard

template handleObjectEnd() {.dirty.} =
  ## Resets the pending tag/anchor after a node was emitted and flips a
  ## mapping level between "expecting key" and "expecting value".
  tag = yTagQuestionMark
  anchor = yAnchorNone
  case level.kind
  of fplMapKey:
    level.kind = fplMapValue
  of fplMapValue:
    level.kind = fplMapKey
  of fplSequence:
    discard
  of fplUnknown, fplScalar:
    raiseError("Internal error!")

template handleStartObject(k: YamlStreamEventKind) {.dirty.} =
  ## Emits the start event for a mapping (`yamlStartMap`) or a sequence (any
  ## other kind) using the collected object properties, then clears them.
  when k == yamlStartMap:
    yield startMapEvent(objectTag, objectAnchor)
    debug("started map at " & $lexer.tokenstart)
  else:
    yield startSeqEvent(objectTag, objectAnchor)
    debug("started sequence at " & $lexer.tokenstart)
  objectTag = yTagQuestionMark
  objectAnchor = yAnchorNone
template closeMoreIndentedLevels() {.dirty.} =
  ## Closes levels for as long as the innermost ancestor is indented at least
  ## as far as the current `indentation`.
  while ancestry.len > 0:
    let parent = ancestry[ancestry.high]
    if parent.indentation >= indentation:
      debug("Closing because level.indentation =" & $level.indentation &
            ", but indentation = " & $indentation)
      closeLevel()
      handleObjectEnd()
    else:
      break

template closeEverything() {.dirty.} =
  ## Closes every open level and emits the document end event.
  indentation = 0
  closeMoreIndentedLevels()
  closeLevel()
  yield endDocEvent()

template handleStartBlockSequence() {.dirty.} =
  ## Handles a `- ` indicator: starts a sequence if the current level is still
  ## undetermined, validates the indentation if it already is a sequence, and
  ## then opens a fresh undetermined level for the item.
  case level.kind
  of fplUnknown:
    level.kind = fplSequence
    handleStartObject(yamlStartSequence)
  of fplSequence:
    if level.indentation != indentation:
      # Report a column, not the raw buffer offset.
      raiseError("Invalid indentation of block sequence indicator",
                 lexer.getColNumber(lexer.bufpos))
  else:
    raiseError("Illegal sequence item in map")
  ancestry.add(level)
  lexer.skipWhitespace()
  indentation = lexer.getColNumber(lexer.bufpos)
  level = FastParseLevel(kind: fplUnknown, indentation: indentation)

template handleStartBlockScalar() {.dirty.} =
  ## Checks that a scalar may start at the current level.
  case level.kind
  of fplUnknown, fplMapKey:
    discard
  of fplSequence:
    raiseError("Illegal token (expected '- ')")
  of fplMapValue, fplScalar:
    raiseError("Internal error!")

template propsToObjectProps() {.dirty.} =
  ## Moves the pending `tag` / `anchor` into `objectTag` / `objectAnchor`.
  ## Raises if both the pending and the object slot are already occupied.
  if objectTag == yTagQuestionmark:
    objectTag = tag
    tag = yTagQuestionmark
  elif tag != yTagQuestionMark:
    raiseError("Only one tag is allowed per node")
  if objectAnchor == yAnchorNone:
    objectAnchor = anchor
    anchor = yAnchorNone
  elif anchor != yAnchorNone:
    raiseError("Only one anchor is allowed per node")

template initDocValues() {.dirty.} =
  ## Resets the per-document state: the default tag shorthands, the anchor
  ## table and counter, and the root level.
  shorthands = initTable[string, string]()
  anchors = initTable[string, AnchorId]()
  shorthands["!"] = "!"
  shorthands["!!"] = "tag:yaml.org,2002:"
  nextAnchorId = 0.AnchorId
  level = FastParseLevel(kind: fplUnknown, indentation: -1)
template applyObjectProperties() {.dirty.} =
  ## Transfers `objectTag` / `objectAnchor` onto `cachedScalar`. Raises if
  ## the scalar already carries a tag or anchor of its own.
  if objectTag != yTagQuestionmark:
    if cachedScalar.scalarTag != yTagQuestionmark:
      raiseError("Only one tag is allowed per node")
    else:
      cachedScalar.scalarTag = objectTag
      objectTag = yTagQuestionmark
  if objectAnchor != yAnchorNone:
    if cachedScalar.scalarAnchor != yAnchorNone:
      raiseError("Only one anchor is allowed per node")
    else:
      cachedScalar.scalarAnchor = objectAnchor
      objectAnchor = yAnchorNone

template finishLine(lexer: FastLexer) =
  ## Advances to the next line break or end of input without consuming it.
  debug("lex: finishLine")
  while lexer.buf[lexer.bufpos] notin lineEnd:
    lexer.bufpos.inc()

template skipWhitespace(lexer: FastLexer) =
  ## Skips spaces and tabs.
  debug("lex: skipWhitespace")
  while lexer.buf[lexer.bufpos] in space: lexer.bufpos.inc()

template skipIndentation(lexer: FastLexer) =
  ## Skips spaces only (tabs are not skipped here).
  debug("lex: skipIndentation")
  while lexer.buf[lexer.bufpos] == ' ': lexer.bufpos.inc()

template directiveName(lexer: FastLexer, directive: var LexedDirective) =
  ## Lexes the name following a '%'. `directive` becomes `ldYaml` for exactly
  ## `YAML`, `ldTag` for exactly `TAG`, and `ldUnknown` otherwise. Afterwards
  ## the lexer stands on the first whitespace/line end after the name.
  debug("lex: directiveName")
  directive = ldUnknown
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  lexer.bufpos.inc()
  if lexer.buf[lexer.bufpos] == 'Y':
    lexer.bufpos.inc()
    if lexer.buf[lexer.bufpos] == 'A':
      lexer.bufpos.inc()
      if lexer.buf[lexer.bufpos] == 'M':
        lexer.bufpos.inc()
        if lexer.buf[lexer.bufpos] == 'L':
          lexer.bufpos.inc()
          if lexer.buf[lexer.bufpos] in spaceOrLineEnd:
            directive = ldYaml
  elif lexer.buf[lexer.bufpos] == 'T':
    lexer.bufpos.inc()
    if lexer.buf[lexer.bufpos] == 'A':
      lexer.bufpos.inc()
      if lexer.buf[lexer.bufpos] == 'G':
        lexer.bufpos.inc()
        # Same character class as the YAML branch; use the shared constant.
        if lexer.buf[lexer.bufpos] in spaceOrLineEnd:
          directive = ldTag
  # Skip the remainder of a longer / unknown name.
  while lexer.buf[lexer.bufpos] notin spaceOrLineEnd:
    lexer.bufpos.inc()

template yamlVersion(lexer: FastLexer, o: var string) =
  ## Lexes a `major.minor` version number and appends it - including the dot
  ## and the minor part - to `o`, so that callers can compare it with e.g.
  ## "1.2". Raises `YamlParserError` on anything that is not
  ## digits '.' digits followed by whitespace or a line end.
  debug("lex: yamlVersion")
  while lexer.buf[lexer.bufpos] in space:
    lexer.bufpos.inc()
  if lexer.buf[lexer.bufpos] notin digits:
    raiseError("Invalid YAML version number")
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  # major part
  while lexer.buf[lexer.bufpos] in digits:
    o.add(lexer.buf[lexer.bufpos])
    lexer.bufpos.inc()
  if lexer.buf[lexer.bufpos] != '.':
    raiseError("Invalid YAML version number")
  o.add('.')
  lexer.bufpos.inc()
  # minor part (at least one digit)
  if lexer.buf[lexer.bufpos] notin digits:
    raiseError("Invalid YAML version number")
  while lexer.buf[lexer.bufpos] in digits:
    o.add(lexer.buf[lexer.bufpos])
    lexer.bufpos.inc()
  if lexer.buf[lexer.bufpos] notin spaceOrLineEnd:
    raiseError("Invalid YAML version number")
template lineEnding(lexer: FastLexer) =
  ## Skips trailing whitespace and an optional `#` comment. Afterwards the
  ## lexer stands on the line break / end of input, which is not consumed.
  ## Raises if anything else precedes the line end.
  debug("lex: lineEnding")
  if lexer.buf[lexer.bufpos] notin lineEnd:
    while lexer.buf[lexer.bufpos] in space:
      lexer.bufpos.inc()
    if lexer.buf[lexer.bufpos] in lineEnd:
      discard
    elif lexer.buf[lexer.bufpos] == '#':
      while lexer.buf[lexer.bufpos] notin lineEnd:
        lexer.bufpos.inc()
    else:
      # Report a column, not the raw buffer offset.
      raiseError("Unexpected token (expected comment or line end)",
                 lexer.getColNumber(lexer.bufpos))

template tagShorthand(lexer: FastLexer, shorthand: var string) =
  ## Lexes the handle of a `%TAG` directive (`!`, `!!` or `!name!`) and
  ## appends it to `shorthand`. The handle must be followed by whitespace or
  ## a line end.
  debug("lex: tagShorthand")
  while lexer.buf[lexer.bufpos] in space:
    lexer.bufpos.inc()
  if lexer.buf[lexer.bufpos] != '!':
    raiseError("Invalid tag shorthand")
  shorthand.add('!')
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  lexer.bufpos.inc()
  var c = lexer.buf[lexer.bufpos]
  if c in spaceOrLineEnd:
    # primary handle "!"
    discard
  else:
    while c != '!':
      case c
      of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '-':
        shorthand.add(c)
        lexer.bufpos.inc()
        c = lexer.buf[lexer.bufpos]
      else:
        raiseError("Illegal character in tag shorthand",
                   lexer.getColNumber(lexer.bufpos))
    # closing '!'
    shorthand.add(c)
    lexer.bufpos.inc()
    if lexer.buf[lexer.bufpos] notin spaceOrLineEnd:
      raiseError("Missing space after tag shorthand",
                 lexer.getColNumber(lexer.bufpos))
template tagUri(lexer: FastLexer, uri: var string) =
  ## Lexes the URI prefix of a `%TAG` directive into `uri`, stopping at the
  ## first whitespace or line end. Raises on characters outside the accepted
  ## URI character set.
  debug("lex: tagUri")
  while lexer.buf[lexer.bufpos] in space:
    lexer.bufpos.inc()
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  while true:
    let ch = lexer.buf[lexer.bufpos]
    case ch
    of spaceOrLineEnd:
      break
    of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':', '@', '&',
       '-', '=', '+', '$', ',', '_', '.', '~', '*', '\'', '(', ')':
      uri.add(ch)
      lexer.bufpos.inc()
    else:
      raiseError("Invalid tag uri")

template directivesEnd(lexer: FastLexer, content: var string,
                       token: var LexedPossibleDirectivesEnd) =
  ## Called on a '-'. Consumes up to three dashes, appending each to
  ## `content`, and classifies what was seen: `---` followed by whitespace or
  ## a line end, a single '-' followed by whitespace or a line end, or
  ## anything else (the start of scalar content).
  debug("lex: directivesEnd")
  content.add('-')
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  lexer.bufpos.inc()
  token = lpdeScalarContent
  if lexer.buf[lexer.bufpos] in spaceOrLineEnd:
    token = lpdeSequenceItem
  elif lexer.buf[lexer.bufpos] == '-':
    content.add('-')
    lexer.bufpos.inc()
    if lexer.buf[lexer.bufpos] == '-':
      content.add('-')
      lexer.bufpos.inc()
      if lexer.buf[lexer.bufpos] in spaceOrLineEnd:
        token = lpdeDirectivesEnd

template documentEnd(lexer: var FastLexer, content: var string,
                     isDocumentEnd: var bool) =
  ## Called on a '.'. Consumes up to three dots, appending each to `content`;
  ## `isDocumentEnd` is true only for `...` followed by whitespace or a line
  ## end.
  content.add('.')
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  lexer.bufpos.inc()
  isDocumentEnd = false
  if lexer.buf[lexer.bufpos] == '.':
    content.add('.')
    lexer.bufpos.inc()
    if lexer.buf[lexer.bufpos] == '.':
      content.add('.')
      lexer.bufpos.inc()
      isDocumentEnd = lexer.buf[lexer.bufpos] in spaceOrLineEnd

template singleQuotedScalar(lexer: FastLexer, content: var string) =
  ## Lexes a single quoted scalar starting at the opening quote and appends
  ## its value to `content`; a doubled quote yields one literal quote. The
  ## lexer ends up right behind the closing quote.
  debug("lex: singleQuotedScalar")
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  lexer.bufpos.inc()
  while true:
    let ch = lexer.buf[lexer.bufpos]
    if ch == EndOfFile:
      raiseError("Unfinished single quoted string")
    lexer.bufpos.inc()
    if ch == '\'':
      if lexer.buf[lexer.bufpos] != '\'':
        break
      content.add('\'')
      lexer.bufpos.inc()
    else:
      content.add(ch)
proc unicodeSequence(lexer: var FastLexer, length: int):
      string {.raises: [YamlParserError].} =
  ## Reads `length` hexadecimal digits following the current position and
  ## returns the UTF-8 encoding of the code point they denote. The lexer ends
  ## up on the last digit.
  ##
  ## Raises `YamlParserError` if the input ends early, a non-hex character is
  ## found, or the value exceeds U+10FFFF.
  debug("lex: unicodeSequence")
  # Accumulate in a plain int and convert to Rune once at the end.
  var codepoint = 0
  # Column of the character preceding the escape letter, for error reports.
  let startCol = lexer.getColNumber(lexer.bufpos - 1)
  for i in countup(0, length - 1):
    lexer.bufpos.inc()
    let
      digitPosition = length - i - 1
      c = lexer.buf[lexer.bufpos]
    var digit: int
    case c
    of EndOfFile:
      raiseError("Unfinished unicode escape sequence", startCol)
    of '0' .. '9':
      digit = ord(c) - ord('0')
    of 'A' .. 'F':
      digit = ord(c) - ord('A') + 10
    of 'a' .. 'f':
      digit = ord(c) - ord('a') + 10
    else:
      raiseError("Invalid character in unicode escape sequence",
                 lexer.getColNumber(lexer.bufpos))
    codepoint = codepoint or (digit shl (digitPosition * 4))
  if codepoint > 0x10FFFF:
    # Would not fit a valid Rune; report it as a parser error instead of
    # failing inside the conversion.
    raiseError("Unicode escape sequence out of range", startCol)
  return toUTF8(Rune(codepoint))
template doublyQuotedScalar(lexer: FastLexer, content: var string) =
  ## Lexes a double quoted scalar starting at the opening quote, resolving
  ## escape sequences, and appends the value to `content`. The lexer ends up
  ## right behind the closing quote.
  debug("lex: doublyQuotedScalar")
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  while true:
    lexer.bufpos.inc()
    let c = lexer.buf[lexer.bufpos]
    case c
    of EndOfFile:
      raiseError("Unfinished doubly quoted string")
    of '\\':
      lexer.bufpos.inc()
      case lexer.buf[lexer.bufpos]
      of EndOfFile:
        raiseError("Unfinished escape sequence")
      of '0': content.add('\0')
      of 'a': content.add('\x07')
      of 'b': content.add('\x08')
      of '\t', 't': content.add('\t')
      of 'n': content.add('\x0A')
      of 'v': content.add('\v')
      of 'f': content.add('\f')
      of 'r': content.add('\r')
      of 'e': content.add('\e')
      of ' ': content.add(' ')
      of '"': content.add('"')
      of '/': content.add('/')
      of '\\': content.add('\\')
      of 'N': content.add(UTF8NextLine)
      of '_': content.add(UTF8NonBreakingSpace)
      of 'L': content.add(UTF8LineSeparator)
      of 'P': content.add(UTF8ParagraphSeparator)
      of 'x': content.add(lexer.unicodeSequence(2))
      of 'u': content.add(lexer.unicodeSequence(4))
      of 'U': content.add(lexer.unicodeSequence(8))
      else:
        raiseError("Illegal character in escape sequence")
    of '"':
      lexer.bufpos.inc()
      break
    else:
      content.add(c)

proc isPlainSafe(lexer: FastLexer, index: int, context: YamlContext): bool =
  ## Tells whether the character at buffer position `index` may continue a
  ## plain scalar in `context`: never for whitespace / line ends, for flow
  ## indicators only in `cFlowOut` and `cBlockKey`, always otherwise.
  # NOTE(review): block contexts are treated like flow-in here; check whether
  # `cBlockIn`/`cBlockOut` should allow flow indicators as well.
  case lexer.buf[index]
  of spaceOrLineEnd:
    result = false
  of flowIndicators:
    result = context in [cFlowOut, cBlockKey]
  else:
    result = true

template plainScalar(lexer: FastLexer, content: var string,
                     context: YamlContext) =
  ## Lexes one line of a plain scalar. The character at the current position
  ## is taken unconditionally; lexing stops at a line end, at a `#` that
  ## follows whitespace or content, at a `:` that is not followed by a
  ## plain-safe character, or at a flow indicator where `context` forbids it.
  ## Trailing whitespace is not added to `content`.
  debug("lex: plainScalar")
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  content.add(lexer.buf[lexer.bufpos])
  block outer:
    while true:
      lexer.bufpos.inc()
      let c = lexer.buf[lexer.bufpos]
      case c
      of lineEnd:
        break
      of ' ', '\t':
        # Collect the whitespace run; it only becomes part of the scalar if
        # more content follows on this line.
        var after = "" & c
        while true:
          lexer.bufpos.inc()
          let c2 = lexer.buf[lexer.bufpos]
          case c2
          of ' ', '\t':
            after.add(c2)
          of lineEnd:
            break outer
          of ':':
            if lexer.isPlainSafe(lexer.bufpos + 1, context):
              content.add(after & ':')
              # The whitespace run has been flushed; leave whitespace mode so
              # it is not appended a second time before the next character.
              break
            else:
              break outer
          of '#':
            break outer
          else:
            content.add(after)
            content.add(c2)
            break
      of flowIndicators:
        if context in [cFlowOut, cBlockKey]:
          content.add(c)
        else:
          break
      of ':':
        if lexer.isPlainSafe(lexer.bufpos + 1, context):
          content.add(':')
        else:
          break outer
      of '#':
        break outer
      else:
        content.add(c)
template continueMultilineScalar() {.dirty.} =
  ## Appends the next line of a multi-line plain scalar to `cachedScalar`:
  ## a single line break folds into a space, `n` line breaks into `n - 1`
  ## line feeds.
  cachedScalar.scalarContent.add(if newlines == 1: " " else:
      repeat('\x0A', newlines - 1))
  lexer.plainScalar(cachedScalar.scalarContent, cBlockOut)
  state = fpBlockAfterPlainScalar

template tagHandle(lexer: var FastLexer, content: var string,
                   shorthandEnd: var int) =
  ## Lexes a node tag starting at its leading '!'.
  ##
  ## * Shorthand form (`!suffix`, `!!suffix`, `!name!suffix`): `content`
  ##   receives the whole text and `shorthandEnd` the index in `content` of
  ##   the '!' that ends the handle (0 for the primary handle).
  ## * Verbatim form (`!<uri>`): `content` receives only the URI and
  ##   `shorthandEnd` is -1.
  debug("lex: tagHandle")
  shorthandEnd = 0
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  content.add(lexer.buf[lexer.bufpos])
  var i = 0
  while true:
    lexer.bufpos.inc()
    i.inc()
    let c = lexer.buf[lexer.bufpos]
    case c
    of spaceOrLineEnd:
      if shorthandEnd == -1:
        raiseError("Unclosed verbatim tag")
      break
    of '!':
      if shorthandEnd == -1 and i == 2:
        # '!' directly after "!<": part of a verbatim local tag. Must not
        # touch `shorthandEnd`, which marks verbatim mode.
        content.add(c)
      elif shorthandEnd != 0:
        raiseError("Illegal character in tag suffix",
                   lexer.getColNumber(lexer.bufpos))
      else:
        shorthandEnd = i
        content.add(c)
    of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':', '@', '&',
       '-', '=', '+', '$', ',', '_', '.', '~', '*', '\'', '(', ')':
      content.add(c)
    of '<':
      if i == 1:
        shorthandEnd = -1
        content = ""
      else:
        raiseError("Illegal character in tag handle",
                   lexer.getColNumber(lexer.bufpos))
    of '>':
      if shorthandEnd == -1:
        lexer.bufpos.inc()
        if lexer.buf[lexer.bufpos] notin spaceOrLineEnd:
          raiseError("Missing space after verbatim tag handle",
                     lexer.getColNumber(lexer.bufpos))
        break
      else:
        raiseError("Illegal character in tag handle",
                   lexer.getColNumber(lexer.bufpos))
    else:
      raiseError("Illegal character in tag handle",
                 lexer.getColNumber(lexer.bufpos))
template anchorName(lexer: FastLexer, content: var string) =
  ## Lexes the name following '&' or '*' into `content`, stopping at the
  ## first whitespace or line end. Flow indicators are rejected.
  debug("lex: anchorName")
  lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
  while true:
    lexer.bufpos.inc()
    let c = lexer.buf[lexer.bufpos]
    case c
    of spaceOrLineEnd:
      break
    of '[', ']', '{', '}', ',':
      raiseError("Illegal character in anchor",
                 lexer.getColNumber(lexer.bufpos))
    else:
      content.add(c)

proc fastparse*(tagLib: TagLibrary, s: Stream): YamlStream =
  ## Returns an iterator that reads YAML from `s` and yields the
  ## corresponding stream events. Tags that are not yet known to `tagLib`
  ## are registered in it.
  ##
  ## Work in progress: only block style content is handled.
  result = iterator(): YamlStreamEvent =
    var
      lexer: FastLexer
      state = fpInitial
      shorthands: Table[string, string]
      anchors: Table[string, AnchorId]
      nextAnchorId: AnchorId
      content: string
      # `tag`/`anchor`: properties lexed for the node about to start.
      # `objectTag`/`objectAnchor`: properties moved aside by
      # `propsToObjectProps` when a line break follows them.
      tag, objectTag: TagId = yTagQuestionMark
      anchor, objectAnchor: AnchorId = yAnchorNone
      ancestry = newSeq[FastParseLevel]()
      level: FastParseLevel
      cachedScalar: YamlStreamEvent
      indentation: int
      newlines: int

    lexer.open(s)
    initDocValues()

    while true:
      case state
      of fpInitial:
        debug("state: initial")
        case lexer.buf[lexer.bufpos]
        of '%':
          var ld: LexedDirective
          lexer.directiveName(ld)
          case ld
          of ldYaml:
            var version = ""
            lexer.yamlVersion(version)
            if version != "1.2":
              # Library code must not write to stdout unconditionally.
              debug("version is not 1.2!")
              # TODO: warning (unknown version)
              discard
            lexer.lineEnding()
            handleLineEnd(false)
          of ldTag:
            var shorthand, uri = ""
            lexer.tagShorthand(shorthand)
            lexer.tagUri(uri)
            # `[]=` so that a directive overrides the default handles instead
            # of adding a second entry under the same key.
            shorthands[shorthand] = uri
            lexer.lineEnding()
            handleLineEnd(false)
          of ldUnknown:
            # TODO: warning (unknown directive)
            lexer.finishLine()
            handleLineEnd(false)
        of ' ', '\t':
          lexer.bufpos.inc()
        of '\x0A':
          lexer.bufpos = lexer.handleLF(lexer.bufpos)
        of '\c':
          # handleCR already returns the position behind the line break.
          lexer.bufpos = lexer.handleCR(lexer.bufpos)
        of EndOfFile:
          return
        of '#':
          lexer.lineEnding()
          handleLineEnd(false)
        of '-':
          var token: LexedPossibleDirectivesEnd
          content = ""
          lexer.directivesEnd(content, token)
          yield startDocEvent()
          case token
          of lpdeDirectivesEnd:
            state = fpBlockObjectStart
          of lpdeSequenceItem:
            indentation = 0
            handleStartBlockSequence()
            state = fpBlockObjectStart
          of lpdeScalarContent:
            lexer.plainScalar(content, cBlockOut)
            cachedScalar = scalarEvent(content, tag, anchor)
            state = fpBlockAfterPlainScalar
        else:
          yield startDocEvent()
          state = fpBlockLineStart
      of fpBlockLineStart:
        debug("state: blockLineStart")
        case lexer.buf[lexer.bufpos]
        of '-':
          var token: LexedPossibleDirectivesEnd
          content = ""
          lexer.directivesEnd(content, token)
          case token
          of lpdeDirectivesEnd:
            closeEverything()
            initDocValues()
            yield startDocEvent()
            state = fpBlockObjectStart
          of lpdeSequenceItem:
            indentation = 0
            closeMoreIndentedLevels()
            handleStartBlockSequence()
            state = fpBlockObjectStart
          of lpdeScalarContent:
            if level.kind == fplScalar:
              continueMultilineScalar()
            else:
              lexer.plainScalar(content, cBlockOut)
              cachedScalar = scalarEvent(content, tag, anchor)
              state = fpBlockAfterPlainScalar
        of '.':
          var isDocumentEnd: bool
          content = ""
          lexer.documentEnd(content, isDocumentEnd)
          if isDocumentEnd:
            lexer.lineEnding()
            closeEverything()
            initDocValues()
            state = fpInitial
          elif level.kind == fplScalar:
            continueMultilineScalar()
          else:
            lexer.plainScalar(content, cBlockOut)
            cachedScalar = scalarEvent(content, tag, anchor)
            state = fpBlockAfterPlainScalar
        of ' ':
          lexer.skipIndentation()
          indentation = lexer.getColNumber(lexer.bufpos)
          closeMoreIndentedLevels()
          case level.kind
          of fplScalar:
            state = fpBlockContinueScalar
          of fplUnknown:
            state = fpBlockObjectStart
            level.indentation = indentation
          else:
            state = fpBlockObjectStart
        else:
          indentation = 0
          closeMoreIndentedLevels()
          case level.kind
          of fplScalar:
            state = fpBlockContinueScalar
          of fplUnknown:
            state = fpBlockObjectStart
            level.indentation = indentation
          else:
            state = fpBlockObjectStart
      of fpBlockContinueScalar:
        debug("state: blockContinueScalar")
        lexer.skipWhitespace()
        case lexer.buf[lexer.bufpos]
        of '\x0A', '\c':
          # Empty line inside a multi-line scalar. Both kinds of line break
          # must return to the line start state.
          newlines.inc()
          handleLineEnd(false)
          state = fpBlockLineStart
        of EndOfFile:
          # The pending scalar is emitted by closeLevel (level is fplScalar).
          closeEverything()
          return
        of ':':
          if lexer.isPlainSafe(lexer.bufpos + 1, cBlockOut):
            continueMultilineScalar()
          else:
            raiseError("Unexpected token", lexer.getColNumber(lexer.bufpos))
        of '#':
          applyObjectProperties()
          yield cachedScalar
          lexer.lineEnding()
          if ancestry.len == 0:
            state = fpExpectDocEnd
          else:
            # Leave the scalar's level *before* handling the line end:
            # at end of input handleLineEnd closes all open levels and would
            # otherwise emit the scalar a second time.
            level = ancestry.pop()
            handleObjectEnd()
            state = fpBlockLineStart
            handleLineEnd(true)
        else:
          continueMultilineScalar()

      of fpBlockAfterPlainScalar:
        debug("state: blockAfterPlainScalar")
        lexer.skipWhitespace()
        case lexer.buf[lexer.bufpos]
        of '\x0A', '\c':
          # A plain scalar may continue on the next line; remember that.
          if level.kind notin [fplUnknown, fplScalar]:
            raiseError("Unexpected scalar")
          newlines = 1
          level.kind = fplScalar
          handleLineEnd(false)
          state = fpBlockLineStart
        else:
          state = fpBlockAfterScalar
      of fpBlockAfterScalar:
        debug("state: blockAfterScalar")
        lexer.skipWhitespace()
        case lexer.buf[lexer.bufpos]
        of EndOfFile:
          level.kind = fplScalar
          closeEverything()
          break
        of '\x0A', '\c':
          if level.kind != fplUnknown:
            raiseError("Unexpected scalar")
          applyObjectProperties()
          yield cachedScalar
          if ancestry.len == 0:
            state = fpExpectDocEnd
          else:
            level = ancestry.pop()
            handleObjectEnd()
            state = fpBlockLineStart
          handleLineEnd(false)
        of ':':
          # The scalar just lexed is an implicit mapping key.
          case level.kind
          of fplUnknown:
            level.kind = fplMapKey
            handleStartObject(yamlStartMap)
          of fplMapValue:
            yield scalarEvent("", yTagQuestionMark, yAnchorNone)
            level.kind = fplMapKey
          of fplMapKey:
            if level.indentation != indentation:
              raiseError("Invalid indentation for map key")
          of fplSequence:
            raiseError("Illegal token (expected sequence item)")
          of fplScalar:
            raiseError("Multiline scalars may not be implicit map keys")
          handleObjectEnd()
          yield cachedScalar
          ancestry.add(level)
          lexer.bufpos.inc()
          lexer.skipWhitespace()
          indentation = lexer.getColNumber(lexer.bufpos)
          level = FastParseLevel(kind: fplUnknown, indentation: indentation)
          state = fpBlockObjectStart
        of '#':
          # Same as a line break, with a comment in front of it.
          if level.kind != fplUnknown:
            raiseError("Unexpected scalar")
          applyObjectProperties()
          yield cachedScalar
          lexer.lineEnding()
          if ancestry.len == 0:
            state = fpExpectDocEnd
          else:
            level = ancestry.pop()
            handleObjectEnd()
            state = fpBlockLineStart
            handleLineEnd(true)
        else:
          raiseError("Illegal token (expected ':', comment or line end)",
                     lexer.getColNumber(lexer.bufpos))
      of fpBlockObjectStart:
        debug("state: blockObjectStart")
        lexer.skipWhitespace()
        case lexer.buf[lexer.bufpos]
        of '\x0A', '\c':
          propsToObjectProps()
          handleLineEnd(false)
          state = fpBlockLineStart
        of EndOfFile:
          closeEverything()
          return
        of '#':
          # A comment ends the line just like a line break does.
          lexer.lineEnding()
          propsToObjectProps()
          handleLineEnd(true)
          state = fpBlockLineStart
        of '\'':
          handleStartBlockScalar()
          content = ""
          lexer.singleQuotedScalar(content)
          if tag == yTagQuestionMark:
            tag = yTagExclamationMark
          cachedScalar = scalarEvent(content, tag, anchor)
          state = fpBlockAfterScalar
        of '"':
          handleStartBlockScalar()
          content = ""
          lexer.doublyQuotedScalar(content)
          if tag == yTagQuestionMark:
            tag = yTagExclamationMark
          cachedScalar = scalarEvent(content, tag, anchor)
          state = fpBlockAfterScalar
        of '-':
          if lexer.isPlainSafe(lexer.bufpos + 1, cBlockOut):
            handleStartBlockScalar()
            lexer.tokenstart = lexer.getColNumber(lexer.bufpos)
            # plainScalar appends to `content`; drop what is left from the
            # previously lexed token.
            content = ""
            lexer.plainScalar(content, cBlockOut)
            cachedScalar = scalarEvent(content, tag, anchor)
            state = fpBlockAfterPlainScalar
          else:
            lexer.bufpos.inc()
            handleStartBlockSequence()
        of '!':
          if tag != yTagQuestionmark:
            raiseError("Only one tag handle is allowed per node")
          content = ""
          var
            shorthandEnd: int
            tagUri: string
          lexer.tagHandle(content, shorthandEnd)
          if shorthandEnd != -1:
            # Shorthand: replace the handle by its registered prefix.
            try:
              let prefix = shorthands[content[0..shorthandEnd]]
              tagUri = prefix & content[shorthandEnd + 1 .. ^1]
            except KeyError:
              raiseError("Undefined tag shorthand: " & content[0..shorthandEnd])
          else:
            # Verbatim tag: `content` already is the URI.
            shallowCopy(tagUri, content)
          try:
            tag = tagLib.tags[tagUri]
          except KeyError:
            tag = tagLib.registerUri(tagUri)
        of '&':
          if anchor != yAnchorNone:
            raiseError("Only one anchor is allowed per node",
                       lexer.getColNumber(lexer.bufpos))
          content = ""
          lexer.anchorName(content)
          anchor = nextAnchorId
          anchors[content] = anchor
          nextAnchorId = cast[AnchorId](cast[int](nextAnchorId) + 1)
        of '*':
          if anchor != yAnchorNone or tag != yTagQuestionmark:
            raiseError("Alias may not have anchor or tag")
          content = ""
          lexer.anchorName(content)
          try:
            cachedScalar = aliasEvent(anchors[content])
          except KeyError:
            raiseError("Unknown anchor")
          state = fpBlockAfterScalar
        else:
          handleStartBlockScalar()
          content = ""
          lexer.plainScalar(content, cBlockOut)
          cachedScalar = scalarEvent(content, tag, anchor)
          state = fpBlockAfterPlainScalar
      of fpExpectDocEnd:
        # The root node is complete. Only blank lines, comments, a document
        # end marker, a new document start or the end of input may follow.
        debug("state: expectDocEnd")
        case lexer.buf[lexer.bufpos]
        of '-':
          var token: LexedPossibleDirectivesEnd
          content = ""
          lexer.directivesEnd(content, token)
          case token
          of lpdeDirectivesEnd:
            yield endDocEvent()
            initDocValues()
            yield startDocEvent()
            state = fpBlockObjectStart
          else:
            raiseError("Unexpected content (expected document end)")
        of '.':
          var isDocumentEnd: bool
          content = ""
          lexer.documentEnd(content, isDocumentEnd)
          if isDocumentEnd:
            lexer.lineEnding()
            yield endDocEvent()
            initDocValues()
            state = fpInitial
          else:
            raiseError("Unexpected content (expected document end)")
        of ' ', '\t', '#':
          lexer.lineEnding()
          handleLineEnd(false)
        of '\x0A':
          lexer.bufpos = lexer.handleLF(lexer.bufpos)
        of '\c':
          lexer.bufpos = lexer.handleCR(lexer.bufpos)
        of EndOfFile:
          yield endDocEvent()
          return
        else:
          raiseError("Unexpected content (expected document end)")
proc initTagLibrary*(): TagLibrary = + new(result) result.tags = initTable[string, TagId]() result.nextCustomTagId = yFirstCustomTagId result.secondaryPrefix = yamlTagRepositoryPrefix -proc registerUri*(tagLib: var TagLibrary, uri: string): TagId = +proc registerUri*(tagLib: TagLibrary, uri: string): TagId = tagLib.tags[uri] = tagLib.nextCustomTagId result = tagLib.nextCustomTagId tagLib.nextCustomTagId = cast[TagId](cast[int](tagLib.nextCustomTagId) + 1) diff --git a/yaml.nim b/yaml.nim index 6fc9529..c9dfcf9 100644 --- a/yaml.nim +++ b/yaml.nim @@ -20,6 +20,9 @@ import streams, unicode, lexbase, tables, strutils, json, hashes, queues, macros export streams, tables, json +when defined(yamlDebug): + import terminal + type TypeHint* = enum ## A type hint is a friendly message from the YAML lexer, telling you @@ -55,9 +58,9 @@ type TagId* = distinct int ## \ ## A ``TagId`` identifies a tag URI, like for example ## ``"tag:yaml.org,2002:str"``. The URI corresponding to a ``TagId`` can - ## be queried from the `YamlTagLibrary <#YamlTagLibrary>`_ which was + ## be queried from the `TagLibrary <#TagLibrary>`_ which was ## used to create this ``TagId`` with - ## `uri <#uri,YamlTagLibrary,TagId>`_. URI strings are + ## `uri <#uri,TagLibrary,TagId>`_. URI strings are ## mapped to ``TagId`` s for efficiency reasons (you do not need to ## compare strings every time) and to be able to discover unknown tag ## URIs early in the parsing process. @@ -68,7 +71,7 @@ type ## (for example, because the parser yielded a ``yamlEndDocument`` ## event). ``AnchorId`` s exists because of efficiency, much like ## ``TagId`` s. The actual anchor name can be queried with - ## `anchor <#anchor,YamlSequentialParser,AnchorId>`_. + ## `anchor <#anchor,YamlParser,AnchorId>`_. YamlStreamEvent* = object ## An element from a `YamlStream <#YamlStream>`_. Events that start an @@ -80,7 +83,7 @@ type ## the non-specific tags ``?`` or ``!`` according to the YAML ## specification. 
These are by convention mapped to the ``TagId`` s ## ``yTagQuestionMark`` and ``yTagExclamationMark`` respectively. - ## Mapping is done by a `YamlTagLibrary <#YamlTagLibrary>`_. + ## Mapping is done by a `TagLibrary <#TagLibrary>`_. case kind*: YamlStreamEventKind of yamlStartMap: mapAnchor* : AnchorId @@ -110,7 +113,7 @@ type ## always yield a well-formed ``YamlStream`` and expect it to be ## well-formed if it's an input. - TagLibrary* = object + TagLibrary* = ref object ## A ``YamlTagLibrary`` maps tag URIs to ``TagId`` s. YAML tag URIs ## that are defined in the YAML specification or in the ## `YAML tag repository `_ should be mapped to @@ -121,8 +124,8 @@ type ## `coreTagLibrary <#coreTagLibrary>`_, and ## `extendedTagLibrary <#extendedTagLibrary>`_. ## - ## If the ``YamlSequentialParser`` encounters a tag which is not part of - ## the ``YamlTagLibrary``, it will create a new ``TagId`` equal to + ## If the ``YamlParser`` encounters a tag which is not part of + ## the ``TagLibrary``, it will create a new ``TagId`` equal to ## ``nextCustomTagId`` and increase that variable. It will be ## initialized to `yFirstCustomTagId <#yFirstCustomTagId>`_. If you do ## not want to allow unknown tag URIs to be processed, just abort @@ -309,6 +312,7 @@ proc endSeqEvent*(): YamlStreamEvent {.inline, raises: [].} proc scalarEvent*(content: string = "", tag: TagId = yTagQuestionMark, anchor: AnchorId = yAnchorNone): YamlStreamEvent {.inline, raises: [].} +proc aliasEvent*(anchor: AnchorId): YamlStreamEvent {.inline, raises: [].} proc `==`*(left, right: TagId): bool {.borrow.} proc `$`*(id: TagId): string @@ -322,7 +326,7 @@ proc initTagLibrary*(): TagLibrary ## initializes the ``tags`` table and sets ``nextCustomTagId`` to ## ``yFirstCustomTagId``. -proc registerUri*(tagLib: var TagLibrary, uri: string): TagId +proc registerUri*(tagLib: TagLibrary, uri: string): TagId ## registers a custom tag URI with a ``YamlTagLibrary``. 
The URI will get ## the ``TagId`` ``nextCustomTagId``, which will be incremented. @@ -375,6 +379,9 @@ proc parse*(parser: YamlParser, s: Stream): YamlStream {.raises: [IOError, YamlParserError].} ## Parse a YAML character stream. ``s`` must be readable. +proc fastparse*(tagLib: TagLibrary, s: Stream): + YamlStream {.raises: [IOError, YamlParserError].} + proc constructJson*(s: YamlStream): seq[JsonNode] ## Construct an in-memory JSON tree from a YAML event stream. The stream may ## not contain any tags apart from those in ``coreTagLibrary``. Anchors and @@ -414,4 +421,5 @@ include private.events include private.parser include private.json include private.presenter -include private.hints \ No newline at end of file +include private.hints +include private.fastparse \ No newline at end of file