From 2840d4d654b9a36d7b4b25bcdb1f53dc09460807 Mon Sep 17 00:00:00 2001
From: Felix Krause
Date: Wed, 4 Nov 2020 19:32:09 +0100
Subject: [PATCH] made lexer tests green again

---
 test/tlex.nim        |  48 +++++++++-----
 yaml/parser.nim      |  30 ++++-----
 yaml/private/lex.nim | 150 ++++++++++++++++++++++---------------------
 3 files changed, 123 insertions(+), 105 deletions(-)

diff --git a/test/tlex.nim b/test/tlex.nim
index 99f5045..0dda21b 100644
--- a/test/tlex.nim
+++ b/test/tlex.nim
@@ -2,17 +2,25 @@
 import ../yaml/private/lex
 import unittest, strutils
 
-const tokensWithValue =
+const
+  tokensWithValue =
     {Token.Plain, Token.SingleQuoted, Token.DoubleQuoted, Token.Literal,
-     Token.Folded, Token.DirectiveParam,
-     Token.TagHandle, Token.Suffix, Token.VerbatimTag,
-     Token.UnknownDirective, Token.Anchor, Token.Alias}
+     Token.Folded, Token.Suffix, Token.VerbatimTag,
+     Token.UnknownDirective}
+  tokensWithFullLexeme =
+    {Token.DirectiveParam, Token.TagHandle}
+  tokensWithShortLexeme = {Token.Anchor, Token.Alias}
+
 type TokenWithValue = object
   case kind: Token
   of tokensWithValue: value: string
+  of tokensWithFullLexeme:
+    lexeme: string
+  of tokensWithShortLexeme:
+    slexeme: string
   of Indentation: indentation: int
   else: discard
@@ -23,7 +31,7 @@ proc actualRepr(lex: Lexer, t: Token): string =
   of tokensWithValue + {Token.TagHandle}:
     result.add("(" & escape(lex.evaluated) & ")")
   of Indentation:
-    result.add("(" & $lex.indentation & ")")
+    result.add("(" & $lex.currentIndentation() & ")")
   else: discard
 
 proc assertEquals(input: string, expected: varargs[TokenWithValue]) =
@@ -43,14 +51,22 @@
         doAssert lex.evaluated == expectedToken.value, "Wrong token content at #" &
             $i & ": Expected " & escape(expectedToken.value) &
             ", got " & escape(lex.evaluated)
+      of tokensWithFullLexeme:
+        doAssert lex.fullLexeme() == expectedToken.lexeme, "Wrong token lexeme at #" &
+            $i & ": Expected" & escape(expectedToken.lexeme) &
+            ", got " & escape(lex.fullLexeme())
+      of tokensWithShortLexeme:
+        doAssert lex.shortLexeme() == expectedToken.slexeme, "Wrong token slexeme at #" &
+            $i & ": Expected" & escape(expectedToken.slexeme) &
+            ", got " & escape(lex.shortLexeme())
       of Indentation:
-        doAssert lex.indentation == expectedToken.indentation,
+        doAssert lex.currentIndentation() == expectedToken.indentation,
             "Wrong indentation length at #" & $i & ": Expected " &
-            $expectedToken.indentation & ", got " & $lex.indentation
+            $expectedToken.indentation & ", got " & $lex.currentIndentation()
       else: discard
   except LexerError:
     let e = (ref LexerError)(getCurrentException())
-    echo "Error at line " & $e.line & ", column " & $e.column & ":"
+    echo "Error at line", e.line, ", column", e.column, ":", e.msg
     echo e.lineContent
     assert false
@@ -71,9 +87,9 @@ proc dt(): TokenWithValue = TokenWithValue(kind: Token.TagDirective)
 proc du(v: string): TokenWithValue =
   TokenWithValue(kind: Token.UnknownDirective, value: v)
 proc dp(v: string): TokenWithValue =
-  TokenWithValue(kind: Token.DirectiveParam, value: v)
+  TokenWithValue(kind: Token.DirectiveParam, lexeme: v)
 proc th(v: string): TokenWithValue =
-  TokenWithValue(kind: Token.TagHandle, value: v)
+  TokenWithValue(kind: Token.TagHandle, lexeme: v)
 proc ts(v: string): TokenWithValue =
   TokenWithValue(kind: Token.Suffix, value: v)
 proc tv(v: string): TokenWithValue =
@@ -87,8 +103,8 @@ proc se(): TokenWithValue = TokenWithValue(kind: Token.SeqEnd)
 proc ms(): TokenWithValue = TokenWithValue(kind: Token.MapStart)
 proc me(): TokenWithValue = TokenWithValue(kind: Token.MapEnd)
 proc sep(): TokenWithValue = TokenWithValue(kind: Token.SeqSep)
-proc an(v: string): TokenWithValue = TokenWithValue(kind: Token.Anchor, value: v)
-proc al(v: string): TokenWithValue = TokenWithValue(kind: Token.Alias, value: v)
+proc an(v: string): TokenWithValue = TokenWithValue(kind: Token.Anchor, slexeme: v)
+proc al(v: string): TokenWithValue = TokenWithValue(kind: Token.Alias, slexeme: v)
 
 suite "Lexer":
   test "Empty document":
@@ -133,11 +149,11 @@ suite "Lexer":
 
   test "Directives":
     assertEquals("%YAML 1.2\n---\n%TAG\n...\n\n%TAG ! example.html",
-        dy(), dp("1.2"), dirE(), i(0), pl("%TAG"), i(0), docE(), dt(),
+        dy(), dp("1.2"), dirE(), i(0), pl("%TAG"), docE(), dt(),
         th("!"), ts("example.html"), e())
 
   test "Markers and Unknown Directive":
-    assertEquals("---\n---\n...\n%UNKNOWN warbl", dirE(), dirE(), i(0),
+    assertEquals("---\n---\n...\n%UNKNOWN warbl", dirE(), dirE(),
         docE(), du("UNKNOWN"), dp("warbl"), e())
 
   test "Block scalar":
@@ -145,7 +161,7 @@ suite "Lexer":
 
   test "Block Scalars":
     assertEquals("one : >2-\l foo\l bar\ltwo: |+\l bar\l baz", i(0),
-        pl("one"), mv(), fs(" foo\lbar"), i(0), pl("two"), mv(),
+        pl("one"), mv(), fs(" foo bar"), i(0), pl("two"), mv(),
         ls("bar\l baz"), e())
 
   test "Flow indicators":
@@ -153,7 +169,7 @@ suite "Lexer":
     mv(), pl("d"), sep(), ss(), pl("e"), se(), mv(), pl("f"), me(), e())
 
   test "Adjacent map values in flow style":
-    assertEquals("{\"foo\":bar, [1]\l:egg}", i(0), ms(), dq("foo"), mv(),
+    assertEquals("{\"foo\":bar, [1]\l :egg}", i(0), ms(), dq("foo"), mv(),
         pl("bar"), sep(), ss(), pl("1"), se(), mv(), pl("egg"), me(), e())
 
   test "Tag handles":
diff --git a/yaml/parser.nim b/yaml/parser.nim
index 3cede55..b77e7b2 100644
--- a/yaml/parser.nim
+++ b/yaml/parser.nim
@@ -273,7 +273,7 @@ proc beforeImplicitRoot(c: Context, e: var Event): bool =
   if c.lex.cur != Token.Indentation:
     raise c.generateError("Unexpected token (expected line start): " & $c.lex.cur)
   c.inlineStart = c.lex.curEndPos
-  c.levels[^1].indentation = c.lex.indentation
+  c.levels[^1].indentation = c.lex.currentIndentation()
   c.lex.next()
   case c.lex.cur
   of SeqItemInd, MapKeyInd, MapValueInd:
@@ -292,7 +292,7 @@ proc beforeImplicitRoot(c: Context, e: var Event): bool =
     raise c.generateError("Unexpected token (expected collection start): " & $c.lex.cur)
 
 proc requireImplicitMapStart(c: Context, e: var Event): bool =
-  c.levels[^1].indentation = c.lex.indentation
+  c.levels[^1].indentation = c.lex.currentIndentation()
   case c.lex.cur
   of Alias:
     e = aliasEvent(c.lex.shortLexeme().Anchor, c.inlineStart, c.lex.curEndPos)
@@ -346,7 +346,7 @@ proc atBlockIndentation(c: Context, e: var Event): bool =
     discard c.levels.pop()
     return true
   c.inlineStart = c.lex.curStartPos
-  c.levels[^1].indentation = c.lex.indentation
+  c.levels[^1].indentation = c.lex.currentIndentation()
   case c.lex.cur
   of nodePropertyKind:
     if isEmpty(c.headerProps):
@@ -359,9 +359,9 @@ proc atBlockIndentation(c: Context, e: var Event): bool =
     e = startSeqEvent(csBlock, c.headerProps,
                       c.headerStart, c.lex.curEndPos)
     c.headerProps = defaultProperties
-    c.levels[^1] = Level(state: inBlockSeq, indentation: c.lex.indentation)
+    c.levels[^1] = Level(state: inBlockSeq, indentation: c.lex.currentIndentation())
     c.levels.add(Level(state: beforeBlockIndentation, indentation: 0))
-    c.levels.add(Level(state: afterCompactParent, indentation: c.lex.indentation))
+    c.levels.add(Level(state: afterCompactParent, indentation: c.lex.currentIndentation()))
     c.lex.next()
     return true
   of MapKeyInd:
@@ -370,10 +370,10 @@ proc atBlockIndentation(c: Context, e: var Event): bool =
     c.headerProps = defaultProperties
     c.levels[^1] = Level(state: beforeBlockMapValue, indentation: 0)
     c.levels.add(Level(state: beforeBlockIndentation))
-    c.levels.add(Level(state: afterCompactParent, indentation: c.lex.indentation))
+    c.levels.add(Level(state: afterCompactParent, indentation: c.lex.currentIndentation()))
     c.lex.next()
   of Plain, SingleQuoted, DoubleQuoted:
-    c.levels[^1].indentation = c.lex.indentation
+    c.levels[^1].indentation = c.lex.currentIndentation()
     e = scalarEvent(c.lex.evaluated, c.headerProps, toStyle(c.lex.cur),
                     c.inlineStart, c.lex.curEndPos)
     c.headerProps = defaultProperties
@@ -409,7 +409,7 @@ proc atBlockIndentation(c: Context, e: var Event): bool =
     c.levels[^1].state = atBlockIndentationProps
 
 proc atBlockIndentationProps(c: Context, e: var Event): bool =
-  c.levels[^1].indentation = c.lex.indentation
+  c.levels[^1].indentation = c.lex.currentIndentation()
   case c.lex.cur
   of MapValueInd:
     c.peek = scalarEvent("", c.inlineProps, ssPlain, c.inlineStart, c.lex.curEndPos)
@@ -487,7 +487,7 @@ proc afterCompactParent(c: Context, e: var Event): bool =
   of SeqItemInd:
     e = startSeqEvent(csBlock, c.headerProps, c.headerStart, c.lex.curEndPos)
     c.headerProps = defaultProperties
-    c.levels[^1] = Level(state: inBlockSeq, indentation: c.lex.indentation)
+    c.levels[^1] = Level(state: inBlockSeq, indentation: c.lex.currentIndentation())
     c.levels.add(Level(state: beforeBlockIndentation))
     c.levels.add(Level(state: afterCompactParent))
     c.lex.next()
@@ -495,7 +495,7 @@ proc afterCompactParent(c: Context, e: var Event): bool =
   of MapKeyInd:
     e = startMapEvent(csBlock, c.headerProps, c.headerStart, c.lex.curEndPos)
     c.headerProps = defaultProperties
-    c.levels[^1] = Level(state: beforeBlockMapValue, indentation: c.lex.indentation)
+    c.levels[^1] = Level(state: beforeBlockMapValue, indentation: c.lex.currentIndentation())
     c.levels.add(Level(state: beforeBlockIndentation))
     c.levels.add(Level(state: afterCompactParent))
     return true
@@ -504,7 +504,7 @@ proc afterCompactParent(c: Context, e: var Event): bool =
     return false
 
 proc afterCompactParentProps(c: Context, e: var Event): bool =
-  c.levels[^1].indentation = c.lex.indentation
+  c.levels[^1].indentation = c.lex.currentIndentation()
   case c.lex.cur
   of nodePropertyKind:
     c.levels.add(Level(state: beforeNodeProperties))
@@ -541,7 +541,7 @@ proc afterCompactParentProps(c: Context, e: var Event): bool =
                     c.inlineStart, c.lex.curEndPos)
     c.inlineProps = defaultProperties
     let headerEnd = c.lex.curStartPos
-    c.levels[^1].indentation = c.lex.indentation
+    c.levels[^1].indentation = c.lex.currentIndentation()
     c.lex.next()
     if c.lex.cur == Token.MapValueInd:
       if c.lex.lastScalarWasMultiline():
@@ -580,7 +580,7 @@ proc afterBlockParent(c: Context, e: var Event): bool =
     return false
 
 proc afterBlockParentProps(c: Context, e: var Event): bool =
-  c.levels[^1].indentation = c.lex.indentation
+  c.levels[^1].indentation = c.lex.currentIndentation()
   case c.lex.cur
   of nodePropertyKind:
     c.levels.add(Level(state: beforeNodeProperties))
@@ -600,7 +600,7 @@ proc afterBlockParentProps(c: Context, e: var Event): bool =
     return false
 
 proc requireInlineBlockItem(c: Context, e: var Event): bool =
-  c.levels[^1].indentation = c.lex.indentation
+  c.levels[^1].indentation = c.lex.currentIndentation()
   case c.lex.cur
   of Indentation:
     raise c.generateError("Node properties may not stand alone on a line")
@@ -740,7 +740,7 @@ proc beforeBlockIndentation(c: Context, e: var Event): bool =
       discard c.levels.pop()
   case c.lex.cur
   of Indentation:
-    c.blockIndentation = c.lex.indentation
+    c.blockIndentation = c.lex.currentIndentation()
     if c.blockIndentation < c.levels[^1].indentation:
       endBlockNode(e)
       return true
diff --git a/yaml/private/lex.nim b/yaml/private/lex.nim
index 3fea24d..eb1178f 100644
--- a/yaml/private/lex.nim
+++ b/yaml/private/lex.nim
@@ -16,10 +16,8 @@ type
     curStartPos*, curEndPos*: Mark
     # recently read scalar or URI, if any
     evaluated*: string
-    # ltIndentation
-    indentation*: int
-
     # internals
+    indentation: int
     source: BaseLexer
     tokenStart: int
     flowDepth: int
@@ -75,7 +73,6 @@ const
   spaceOrLineEnd = {' ', '\t', '\l', '\c', EndOfFile}
   commentOrLineEnd = {'\l', '\c', EndOfFile, '#'}
   digits = {'0'..'9'}
-  hexDigits = {'0'..'9', 'a'..'f', 'A'..'F'}
   flowIndicators = {'[', ']', '{', '}', ','}
   uriChars = {'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':', '@', '&',
               '-', '=', '+', '$', '_', '.', '~', '*', '\'', '(', ')'}
@@ -93,41 +90,44 @@ const
   UnknownIndentation* = int.low
 
+proc currentIndentation*(lex: Lexer): Natural =
+  return lex.source.getColNumber(lex.source.bufpos) - 1
+
 # lexer source handling
 
 proc advance(lex: var Lexer, step: int = 1) {.inline.} =
-  lex.source.bufpos.inc(step)
   lex.c = lex.source.buf[lex.source.bufpos]
+  lex.source.bufpos.inc(step)
 
 template lexCR(lex: var Lexer) =
-  try: lex.source.bufpos = lex.source.handleCR(lex.source.bufpos)
+  try: lex.source.bufpos = lex.source.handleCR(lex.source.bufpos - 1)
   except:
     var e = lex.generateError("Encountered stream error: " &
         getCurrentExceptionMsg())
     e.parent = getCurrentException()
     raise e
-  lex.c = lex.source.buf[lex.source.bufpos]
+  lex.advance()
 
 template lexLF(lex: var Lexer) =
-  try: lex.source.bufpos = lex.source.handleLF(lex.source.bufpos)
+  try: lex.source.bufpos = lex.source.handleLF(lex.source.bufpos - 1)
   except:
     var e = generateError(lex, "Encountered stream error: " &
         getCurrentExceptionMsg())
     e.parent = getCurrentException()
     raise e
-  lex.c = lex.source.buf[lex.source.bufpos]
+  lex.advance()
 
 template lineNumber(lex: Lexer): Positive =
   lex.source.lineNumber
 
 template columnNumber(lex: Lexer): Positive =
-  lex.source.getColNumber(lex.source.bufpos) + 1
+  lex.source.getColNumber(lex.source.bufpos)
 
 template currentLine(lex: Lexer): string =
   lex.source.getCurrentLine(true)
 
 proc isPlainSafe(lex: Lexer): bool {.inline.} =
-  case lex.source.buf[lex.source.bufpos + 1]
+  case lex.source.buf[lex.source.bufpos]
   of spaceOrLineEnd: result = false
   of flowIndicators: result = lex.flowDepth == 0
   else: result = true
@@ -218,26 +218,22 @@ proc isDocumentEnd(lex: var Lexer): bool =
 
 proc readHexSequence(lex: var Lexer, len: int) =
   var charPos = 0
-  let startPos = lex.source.bufpos
   for i in countup(0, len-1):
-    if lex.source.buf[startPos + 1] notin hexDigits:
-      raise lex.generateError("Invalid character in hex escape sequence: " &
-          escape("" & lex.source.buf[startPos + i]))
-  # no pow() for ints, do it manually
-  var coeff = 1
-  for exponent in countup(0, len-1): coeff *= 16
-  for exponent in countdown(len-1, 0):
     lex.advance()
+    let digitPosition = len - i - 1
     case lex.c
-    of digits:
-      charPos += coeff * (int(lex.c) - int('0'))
-    of 'a' .. 'f':
-      charPos += coeff * (int(lex.c) - int('a') + 10)
+    of lineEnd:
+      raise lex.generateError("Unfinished unicode escape sequence")
+    of '0'..'9':
+      charPos = charPos or (int(lex.c) - 0x30) shl (digitPosition * 4)
     of 'A' .. 'F':
-      charPos += coeff * (int(lex.c) - int('A') + 10)
-    else: discard # cannot happen, we checked
-    coeff = coeff div 16
-  lex.evaluated.add($Rune(charPos))
+      charPos = charPos or (int(lex.c) - 0x37) shl (digitPosition * 4)
+    of 'a' .. 'f':
+      charPos = charPos or (int(lex.c) - 0x57) shl (digitPosition * 4)
+    else:
+      raise lex.generateError("Invalid character in hex escape sequence: " &
+          escape("" & lex.c))
+  lex.evaluated.add(toUTF8(Rune(charPos)))
 
 proc readURI(lex: var Lexer) =
   lex.evaluated.setLen(0)
@@ -383,7 +379,7 @@ proc readPlainScalar(lex: var Lexer) =
           break inlineLoop
         of EndOfFile:
           lex.evaluated.add(lex.source.buf[lineStartPos..lex.source.bufpos - 2])
-          if lex.columnNumber() > 0:
+          if lex.currentIndentation() > 0:
             lex.endToken()
             lex.state = streamEnd
           break multilineLoop
@@ -394,7 +390,7 @@ proc readPlainScalar(lex: var Lexer) =
         while true:
           case lex.startLine()
          of lsContent:
-            if lex.columnNumber() <= lex.indentation:
+            if lex.currentIndentation() <= lex.indentation:
              lex.state = afterNewlineState
              break multilineLoop
            break newlineLoop
@@ -412,6 +408,7 @@ proc readPlainScalar(lex: var Lexer) =
            break multilineLoop
          of lsNewline: lex.endLine()
        newlines += 1
+        while lex.c == ' ': lex.advance()
        if (lex.c == ':' and not lex.isPlainSafe()) or lex.c == '#' or
            (lex.c in flowIndicators and lex.flowDepth > 0):
@@ -423,7 +420,7 @@ proc readPlainScalar(lex: var Lexer) =
      for i in countup(2, newlines): lex.evaluated.add('\l')
 
 proc streamEndAfterBlock(lex: var Lexer) =
-  if lex.columnNumber() != 0:
+  if lex.currentIndentation() != 0:
    lex.endToken()
    lex.curEndPos.column -= 1
@@ -475,13 +472,13 @@ proc readBlockScalar(lex: var Lexer) =
      if indent == 0:
        while lex.c == ' ': lex.advance()
      else:
-        maxLeadingSpaces = lex.columnNumber + indent
-        while lex.c == ' ' and lex.columnNumber < maxLeadingSpaces:
+        maxLeadingSpaces = lex.currentIndentation() + indent
+        while lex.c == ' ' and lex.currentIndentation() < maxLeadingSpaces:
          lex.advance()
      case lex.c
      of '\l', '\c':
        lex.endToken()
-        maxLeadingSpaces = max(maxLeadingSpaces, lex.columnNumber())
+        maxLeadingSpaces = max(maxLeadingSpaces, lex.currentIndentation())
        lex.endLine()
        separationLines += 1
      of EndOfFile:
@@ -490,59 +487,60 @@ proc readBlockScalar(lex: var Lexer) =
        break body
      else:
        if indent == 0:
-          indent = lex.columnNumber()
+          indent = lex.currentIndentation()
          if indent <= max(0, lex.indentation):
            lex.state = lineIndentation
            break body
        elif indent < maxLeadingSpaces:
          raise lex.generateError("Leading all-spaces line contains too many spaces")
-        elif lex.columnNumber < indent: break body
+        elif lex.currentIndentation() < indent: break body
      break
    for i in countup(0, separationLines - 1):
      lex.evaluated.add('\l')
 
   block content:
-    contentStart = lex.source.bufpos - 1
-    while lex.c notin lineEnd: lex.advance()
-    lex.evaluated.add(lex.source.buf[contentStart .. lex.source.bufpos - 2])
-    separationLines = 0
-    if lex.c == EndOfFile:
-      lex.state = streamEnd
-      lex.streamEndAfterBlock()
-      break body
-    separationLines += 1
-    lex.endToken()
-    lex.endLine()
-
-    # empty lines and indentation of next line
     while true:
-      while lex.c == ' ' and lex.columnNumber() < indent:
-        lex.advance()
-      case lex.c
-      of '\l', '\c':
-        lex.endToken()
-        separationLines += 1
-        lex.endLine()
-      of EndOfFile:
+      contentStart = lex.source.bufpos - 1
+      while lex.c notin lineEnd: lex.advance()
+      lex.evaluated.add(lex.source.buf[contentStart .. lex.source.bufpos - 2])
+      separationLines = 0
+      if lex.c == EndOfFile:
        lex.state = streamEnd
        lex.streamEndAfterBlock()
        break body
+      separationLines += 1
+      lex.endToken()
+      lex.endLine()
+
+      # empty lines and indentation of next line
+      while true:
+        while lex.c == ' ' and lex.currentIndentation() < indent:
+          lex.advance()
+        case lex.c
+        of '\l', '\c':
+          lex.endToken()
+          separationLines += 1
+          lex.endLine()
+        of EndOfFile:
+          lex.state = streamEnd
+          lex.streamEndAfterBlock()
+          break body
+        else:
+          if lex.currentIndentation() < indent:
+            break content
+          else: break
+
+      # line folding
+      if lex.cur == Token.Literal:
+        for i in countup(0, separationLines - 1):
+          lex.evaluated.add('\l')
+      elif separationLines == 1:
+        lex.evaluated.add(' ')
      else:
-        if lex.columnNumber() < indent:
-          break content
-        else: break
+        for i in countup(0, separationLines - 2):
+          lex.evaluated.add('\l')
 
-    # line folding
-    if lex.cur == Token.Literal:
-      for i in countup(0, separationLines - 1):
-        lex.evaluated.add('\l')
-    elif separationLines == 1:
-      lex.evaluated.add(' ')
-    else:
-      for i in countup(0, separationLines - 2):
-        lex.evaluated.add('\l')
-
-  if lex.columnNumber() > max(0, lex.indentation):
+  if lex.currentIndentation() > max(0, lex.indentation):
    if lex.c == '#':
      lex.state = expectLineEnd
    else:
@@ -755,7 +753,7 @@ proc outsideDoc(lex: var Lexer): bool =
    lex.startToken()
    if lex.isDirectivesEnd():
      lex.state = expectLineEnd
-      lex.cur = Token.DocumentEnd
+      lex.cur = Token.DirectivesEnd
    else:
      lex.state = indentationSettingToken
      lex.cur = Token.Indentation
@@ -799,6 +797,7 @@ proc yamlVersion(lex: var Lexer): bool =
  lex.cur = Token.DirectiveParam
  lex.endToken()
  lex.state = expectLineEnd
+  return true
 
 proc tagShorthand(lex: var Lexer): bool =
  debug("lex: tagShorthand")
@@ -822,6 +821,7 @@ proc tagShorthand(lex: var Lexer): bool =
  lex.cur = Token.TagHandle
  lex.endToken()
  lex.state = tagUri
+  return true
 
 proc tagUri(lex: var Lexer): bool =
  debug("lex: tagUri")
@@ -886,7 +886,7 @@ proc flowLineStart(lex: var Lexer): bool =
  return false
 
 proc flowLineIndentation(lex: var Lexer): bool =
-  if lex.columnNumber() < lex.indentation:
+  if lex.currentIndentation() < lex.indentation:
    raise lex.generateError("Too few indentation spaces (must surpass surrounding block level)")
  lex.state = insideLine
  return false
@@ -933,6 +933,7 @@ proc readNamespace(lex: var Lexer) =
    lex.readURI()
    lex.endToken()
    lex.cur = Token.VerbatimTag
+    lex.state = afterToken
  else:
    var handleEnd = lex.tokenStart
    while true:
@@ -1022,9 +1023,9 @@ proc insideLine(lex: var Lexer): bool =
  return true
 
 proc indentationSettingToken(lex: var Lexer): bool =
-  let cachedIntentation = lex.columnNumber()
+  let cachedIntentation = lex.currentIndentation()
  result = lex.insideLine()
-  if result and lex.flowDepth > 0:
+  if result and lex.flowDepth == 0:
    if lex.cur in nodePropertyKind:
      lex.propertyIndentation = cachedIntentation
    else:
@@ -1054,6 +1055,7 @@ proc afterJsonEnablingToken(lex: var Lexer): bool =
      lex.endToken()
      lex.cur = Token.MapValueInd
      lex.state = afterToken
+      return true
    of '#', '\l', '\c':
      lex.endLine()
      discard lex.flowLineStart()