made lexer tests green again

Felix Krause 2020-11-04 19:32:09 +01:00
parent 4c604b09df
commit 2840d4d654
3 changed files with 123 additions and 105 deletions
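For reference, the reworked test expectations in the first file below check token payloads through three different accessors: lex.evaluated for tokens with evaluated content, lex.fullLexeme() for directive parameters and tag handles, and lex.shortLexeme() for anchors and aliases. A minimal, self-contained sketch of that variant-object pattern (the Token enum here is a stand-in, not the real one from ../yaml/private/lex):

# Minimal sketch of the test expectation pattern; Token is a stand-in enum.
type
  Token = enum
    Plain, DirectiveParam, Anchor, Indentation, StreamEnd

  TokenWithValue = object
    case kind: Token
    of Plain: value: string            # checked against lex.evaluated
    of DirectiveParam: lexeme: string  # checked against lex.fullLexeme()
    of Anchor: slexeme: string         # checked against lex.shortLexeme()
    of Indentation: indentation: int   # checked against lex.currentIndentation()
    else: discard

proc pl(v: string): TokenWithValue = TokenWithValue(kind: Plain, value: v)
proc an(v: string): TokenWithValue = TokenWithValue(kind: Anchor, slexeme: v)

when isMainModule:
  let expected = [pl("foo"), an("a")]
  doAssert expected[0].value == "foo"
  doAssert expected[1].slexeme == "a"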

View File

@@ -2,17 +2,25 @@ import ../yaml/private/lex
import unittest, strutils
const tokensWithValue =
const
tokensWithValue =
{Token.Plain, Token.SingleQuoted, Token.DoubleQuoted, Token.Literal,
Token.Folded, Token.DirectiveParam,
Token.TagHandle, Token.Suffix, Token.VerbatimTag,
Token.UnknownDirective, Token.Anchor, Token.Alias}
Token.Folded, Token.Suffix, Token.VerbatimTag,
Token.UnknownDirective}
tokensWithFullLexeme =
{Token.DirectiveParam, Token.TagHandle}
tokensWithShortLexeme = {Token.Anchor, Token.Alias}
type
TokenWithValue = object
case kind: Token
of tokensWithValue:
value: string
of tokensWithFullLexeme:
lexeme: string
of tokensWithShortLexeme:
slexeme: string
of Indentation:
indentation: int
else: discard
@@ -23,7 +31,7 @@ proc actualRepr(lex: Lexer, t: Token): string =
of tokensWithValue + {Token.TagHandle}:
result.add("(" & escape(lex.evaluated) & ")")
of Indentation:
result.add("(" & $lex.indentation & ")")
result.add("(" & $lex.currentIndentation() & ")")
else: discard
proc assertEquals(input: string, expected: varargs[TokenWithValue]) =
@@ -43,14 +51,22 @@ proc assertEquals(input: string, expected: varargs[TokenWithValue]) =
doAssert lex.evaluated == expectedToken.value, "Wrong token content at #" &
$i & ": Expected " & escape(expectedToken.value) &
", got " & escape(lex.evaluated)
of tokensWithFullLexeme:
doAssert lex.fullLexeme() == expectedToken.lexeme, "Wrong token lexeme at #" &
$i & ": Expected" & escape(expectedToken.lexeme) &
", got " & escape(lex.fullLexeme())
of tokensWithShortLexeme:
doAssert lex.shortLexeme() == expectedToken.slexeme, "Wrong token slexeme at #" &
$i & ": Expected" & escape(expectedToken.slexeme) &
", got " & escape(lex.shortLexeme())
of Indentation:
doAssert lex.indentation == expectedToken.indentation,
doAssert lex.currentIndentation() == expectedToken.indentation,
"Wrong indentation length at #" & $i & ": Expected " &
$expectedToken.indentation & ", got " & $lex.indentation
$expectedToken.indentation & ", got " & $lex.currentIndentation()
else: discard
except LexerError:
let e = (ref LexerError)(getCurrentException())
echo "Error at line " & $e.line & ", column " & $e.column & ":"
echo "Error at line", e.line, ", column", e.column, ":", e.msg
echo e.lineContent
assert false
@@ -71,9 +87,9 @@ proc dt(): TokenWithValue = TokenWithValue(kind: Token.TagDirective)
proc du(v: string): TokenWithValue =
TokenWithValue(kind: Token.UnknownDirective, value: v)
proc dp(v: string): TokenWithValue =
TokenWithValue(kind: Token.DirectiveParam, value: v)
TokenWithValue(kind: Token.DirectiveParam, lexeme: v)
proc th(v: string): TokenWithValue =
TokenWithValue(kind: Token.TagHandle, value: v)
TokenWithValue(kind: Token.TagHandle, lexeme: v)
proc ts(v: string): TokenWithValue =
TokenWithValue(kind: Token.Suffix, value: v)
proc tv(v: string): TokenWithValue =
@@ -87,8 +103,8 @@ proc se(): TokenWithValue = TokenWithValue(kind: Token.SeqEnd)
proc ms(): TokenWithValue = TokenWithValue(kind: Token.MapStart)
proc me(): TokenWithValue = TokenWithValue(kind: Token.MapEnd)
proc sep(): TokenWithValue = TokenWithValue(kind: Token.SeqSep)
proc an(v: string): TokenWithValue = TokenWithValue(kind: Token.Anchor, value: v)
proc al(v: string): TokenWithValue = TokenWithValue(kind: Token.Alias, value: v)
proc an(v: string): TokenWithValue = TokenWithValue(kind: Token.Anchor, slexeme: v)
proc al(v: string): TokenWithValue = TokenWithValue(kind: Token.Alias, slexeme: v)
suite "Lexer":
test "Empty document":
@@ -133,11 +149,11 @@ suite "Lexer":
test "Directives":
assertEquals("%YAML 1.2\n---\n%TAG\n...\n\n%TAG ! example.html",
dy(), dp("1.2"), dirE(), i(0), pl("%TAG"), i(0), docE(), dt(),
dy(), dp("1.2"), dirE(), i(0), pl("%TAG"), docE(), dt(),
th("!"), ts("example.html"), e())
test "Markers and Unknown Directive":
assertEquals("---\n---\n...\n%UNKNOWN warbl", dirE(), dirE(), i(0),
assertEquals("---\n---\n...\n%UNKNOWN warbl", dirE(), dirE(),
docE(), du("UNKNOWN"), dp("warbl"), e())
test "Block scalar":
@@ -145,7 +161,7 @@ suite "Lexer":
test "Block Scalars":
assertEquals("one : >2-\l foo\l bar\ltwo: |+\l bar\l baz", i(0),
pl("one"), mv(), fs(" foo\lbar"), i(0), pl("two"), mv(),
pl("one"), mv(), fs(" foo bar"), i(0), pl("two"), mv(),
ls("bar\l baz"), e())
test "Flow indicators":
@@ -153,7 +169,7 @@ suite "Lexer":
mv(), pl("d"), sep(), ss(), pl("e"), se(), mv(), pl("f"), me(), e())
test "Adjacent map values in flow style":
assertEquals("{\"foo\":bar, [1]\l:egg}", i(0), ms(), dq("foo"), mv(),
assertEquals("{\"foo\":bar, [1]\l :egg}", i(0), ms(), dq("foo"), mv(),
pl("bar"), sep(), ss(), pl("1"), se(), mv(), pl("egg"), me(), e())
test "Tag handles":

View File

@@ -273,7 +273,7 @@ proc beforeImplicitRoot(c: Context, e: var Event): bool =
if c.lex.cur != Token.Indentation:
raise c.generateError("Unexpected token (expected line start): " & $c.lex.cur)
c.inlineStart = c.lex.curEndPos
c.levels[^1].indentation = c.lex.indentation
c.levels[^1].indentation = c.lex.currentIndentation()
c.lex.next()
case c.lex.cur
of SeqItemInd, MapKeyInd, MapValueInd:
@@ -292,7 +292,7 @@ proc beforeImplicitRoot(c: Context, e: var Event): bool =
raise c.generateError("Unexpected token (expected collection start): " & $c.lex.cur)
proc requireImplicitMapStart(c: Context, e: var Event): bool =
c.levels[^1].indentation = c.lex.indentation
c.levels[^1].indentation = c.lex.currentIndentation()
case c.lex.cur
of Alias:
e = aliasEvent(c.lex.shortLexeme().Anchor, c.inlineStart, c.lex.curEndPos)
@@ -346,7 +346,7 @@ proc atBlockIndentation(c: Context, e: var Event): bool =
discard c.levels.pop()
return true
c.inlineStart = c.lex.curStartPos
c.levels[^1].indentation = c.lex.indentation
c.levels[^1].indentation = c.lex.currentIndentation()
case c.lex.cur
of nodePropertyKind:
if isEmpty(c.headerProps):
@@ -359,9 +359,9 @@ proc atBlockIndentation(c: Context, e: var Event): bool =
e = startSeqEvent(csBlock, c.headerProps,
c.headerStart, c.lex.curEndPos)
c.headerProps = defaultProperties
c.levels[^1] = Level(state: inBlockSeq, indentation: c.lex.indentation)
c.levels[^1] = Level(state: inBlockSeq, indentation: c.lex.currentIndentation())
c.levels.add(Level(state: beforeBlockIndentation, indentation: 0))
c.levels.add(Level(state: afterCompactParent, indentation: c.lex.indentation))
c.levels.add(Level(state: afterCompactParent, indentation: c.lex.currentIndentation()))
c.lex.next()
return true
of MapKeyInd:
@@ -370,10 +370,10 @@ proc atBlockIndentation(c: Context, e: var Event): bool =
c.headerProps = defaultProperties
c.levels[^1] = Level(state: beforeBlockMapValue, indentation: 0)
c.levels.add(Level(state: beforeBlockIndentation))
c.levels.add(Level(state: afterCompactParent, indentation: c.lex.indentation))
c.levels.add(Level(state: afterCompactParent, indentation: c.lex.currentIndentation()))
c.lex.next()
of Plain, SingleQuoted, DoubleQuoted:
c.levels[^1].indentation = c.lex.indentation
c.levels[^1].indentation = c.lex.currentIndentation()
e = scalarEvent(c.lex.evaluated, c.headerProps,
toStyle(c.lex.cur), c.inlineStart, c.lex.curEndPos)
c.headerProps = defaultProperties
@@ -409,7 +409,7 @@ proc atBlockIndentation(c: Context, e: var Event): bool =
c.levels[^1].state = atBlockIndentationProps
proc atBlockIndentationProps(c: Context, e: var Event): bool =
c.levels[^1].indentation = c.lex.indentation
c.levels[^1].indentation = c.lex.currentIndentation()
case c.lex.cur
of MapValueInd:
c.peek = scalarEvent("", c.inlineProps, ssPlain, c.inlineStart, c.lex.curEndPos)
@@ -487,7 +487,7 @@ proc afterCompactParent(c: Context, e: var Event): bool =
of SeqItemInd:
e = startSeqEvent(csBlock, c.headerProps, c.headerStart, c.lex.curEndPos)
c.headerProps = defaultProperties
c.levels[^1] = Level(state: inBlockSeq, indentation: c.lex.indentation)
c.levels[^1] = Level(state: inBlockSeq, indentation: c.lex.currentIndentation())
c.levels.add(Level(state: beforeBlockIndentation))
c.levels.add(Level(state: afterCompactParent))
c.lex.next()
@@ -495,7 +495,7 @@ proc afterCompactParent(c: Context, e: var Event): bool =
of MapKeyInd:
e = startMapEvent(csBlock, c.headerProps, c.headerStart, c.lex.curEndPos)
c.headerProps = defaultProperties
c.levels[^1] = Level(state: beforeBlockMapValue, indentation: c.lex.indentation)
c.levels[^1] = Level(state: beforeBlockMapValue, indentation: c.lex.currentIndentation())
c.levels.add(Level(state: beforeBlockIndentation))
c.levels.add(Level(state: afterCompactParent))
return true
@@ -504,7 +504,7 @@ proc afterCompactParent(c: Context, e: var Event): bool =
return false
proc afterCompactParentProps(c: Context, e: var Event): bool =
c.levels[^1].indentation = c.lex.indentation
c.levels[^1].indentation = c.lex.currentIndentation()
case c.lex.cur
of nodePropertyKind:
c.levels.add(Level(state: beforeNodeProperties))
@@ -541,7 +541,7 @@ proc afterCompactParentProps(c: Context, e: var Event): bool =
c.inlineStart, c.lex.curEndPos)
c.inlineProps = defaultProperties
let headerEnd = c.lex.curStartPos
c.levels[^1].indentation = c.lex.indentation
c.levels[^1].indentation = c.lex.currentIndentation()
c.lex.next()
if c.lex.cur == Token.MapValueInd:
if c.lex.lastScalarWasMultiline():
@@ -580,7 +580,7 @@ proc afterBlockParent(c: Context, e: var Event): bool =
return false
proc afterBlockParentProps(c: Context, e: var Event): bool =
c.levels[^1].indentation = c.lex.indentation
c.levels[^1].indentation = c.lex.currentIndentation()
case c.lex.cur
of nodePropertyKind:
c.levels.add(Level(state: beforeNodeProperties))
@@ -600,7 +600,7 @@ proc afterBlockParentProps(c: Context, e: var Event): bool =
return false
proc requireInlineBlockItem(c: Context, e: var Event): bool =
c.levels[^1].indentation = c.lex.indentation
c.levels[^1].indentation = c.lex.currentIndentation()
case c.lex.cur
of Indentation:
raise c.generateError("Node properties may not stand alone on a line")
@@ -740,7 +740,7 @@ proc beforeBlockIndentation(c: Context, e: var Event): bool =
discard c.levels.pop()
case c.lex.cur
of Indentation:
c.blockIndentation = c.lex.indentation
c.blockIndentation = c.lex.currentIndentation()
if c.blockIndentation < c.levels[^1].indentation:
endBlockNode(e)
return true
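The parser changes above replace reads of the lexer's removed public indentation field with the new currentIndentation() accessor, which (see the lexer changes in the next file) derives the value from the current column of the underlying BaseLexer. A rough, hypothetical illustration of that idea using std/lexbase directly; this is not the real Lexer type:

import std/[lexbase, streams]

# Hypothetical miniature lexer: indentation is computed from the current
# buffer column instead of being cached in a public field.
type MiniLexer = object of BaseLexer

proc currentIndentation(lex: MiniLexer): Natural =
  # unlike the real lexer, no -1 correction is needed here, because bufpos
  # still points at the current character rather than one past it
  lex.getColNumber(lex.bufpos)

when isMainModule:
  var lex: MiniLexer
  lex.open(newStringStream("  foo\n"))
  lex.bufpos += 2              # skip the two leading spaces
  doAssert lex.currentIndentation() == 2
  lex.close()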

View File

@@ -16,10 +16,8 @@ type
curStartPos*, curEndPos*: Mark
# recently read scalar or URI, if any
evaluated*: string
# ltIndentation
indentation*: int
# internals
indentation: int
source: BaseLexer
tokenStart: int
flowDepth: int
@@ -75,7 +73,6 @@ const
spaceOrLineEnd = {' ', '\t', '\l', '\c', EndOfFile}
commentOrLineEnd = {'\l', '\c', EndOfFile, '#'}
digits = {'0'..'9'}
hexDigits = {'0'..'9', 'a'..'f', 'A'..'F'}
flowIndicators = {'[', ']', '{', '}', ','}
uriChars = {'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':',
'@', '&', '-', '=', '+', '$', '_', '.', '~', '*', '\'', '(', ')'}
@@ -93,41 +90,44 @@ const
UnknownIndentation* = int.low
proc currentIndentation*(lex: Lexer): Natural =
return lex.source.getColNumber(lex.source.bufpos) - 1
# lexer source handling
proc advance(lex: var Lexer, step: int = 1) {.inline.} =
lex.source.bufpos.inc(step)
lex.c = lex.source.buf[lex.source.bufpos]
lex.source.bufpos.inc(step)
template lexCR(lex: var Lexer) =
try: lex.source.bufpos = lex.source.handleCR(lex.source.bufpos)
try: lex.source.bufpos = lex.source.handleCR(lex.source.bufpos - 1)
except:
var e = lex.generateError("Encountered stream error: " &
getCurrentExceptionMsg())
e.parent = getCurrentException()
raise e
lex.c = lex.source.buf[lex.source.bufpos]
lex.advance()
template lexLF(lex: var Lexer) =
try: lex.source.bufpos = lex.source.handleLF(lex.source.bufpos)
try: lex.source.bufpos = lex.source.handleLF(lex.source.bufpos - 1)
except:
var e = generateError(lex, "Encountered stream error: " &
getCurrentExceptionMsg())
e.parent = getCurrentException()
raise e
lex.c = lex.source.buf[lex.source.bufpos]
lex.advance()
template lineNumber(lex: Lexer): Positive =
lex.source.lineNumber
template columnNumber(lex: Lexer): Positive =
lex.source.getColNumber(lex.source.bufpos) + 1
lex.source.getColNumber(lex.source.bufpos)
template currentLine(lex: Lexer): string =
lex.source.getCurrentLine(true)
proc isPlainSafe(lex: Lexer): bool {.inline.} =
case lex.source.buf[lex.source.bufpos + 1]
case lex.source.buf[lex.source.bufpos]
of spaceOrLineEnd: result = false
of flowIndicators: result = lex.flowDepth == 0
else: result = true
@@ -218,26 +218,22 @@ proc isDocumentEnd(lex: var Lexer): bool =
proc readHexSequence(lex: var Lexer, len: int) =
var charPos = 0
let startPos = lex.source.bufpos
for i in countup(0, len-1):
if lex.source.buf[startPos + 1] notin hexDigits:
raise lex.generateError("Invalid character in hex escape sequence: " &
escape("" & lex.source.buf[startPos + i]))
# no pow() for ints, do it manually
var coeff = 1
for exponent in countup(0, len-1): coeff *= 16
for exponent in countdown(len-1, 0):
lex.advance()
let digitPosition = len - i - 1
case lex.c
of digits:
charPos += coeff * (int(lex.c) - int('0'))
of 'a' .. 'f':
charPos += coeff * (int(lex.c) - int('a') + 10)
of lineEnd:
raise lex.generateError("Unfinished unicode escape sequence")
of '0'..'9':
charPos = charPos or (int(lex.c) - 0x30) shl (digitPosition * 4)
of 'A' .. 'F':
charPos += coeff * (int(lex.c) - int('A') + 10)
else: discard # cannot happen, we checked
coeff = coeff div 16
lex.evaluated.add($Rune(charPos))
charPos = charPos or (int(lex.c) - 0x37) shl (digitPosition * 4)
of 'a' .. 'f':
charPos = charPos or (int(lex.c) - 0x57) shl (digitPosition * 4)
else:
raise lex.generateError("Invalid character in hex escape sequence: " &
escape("" & lex.c))
lex.evaluated.add(toUTF8(Rune(charPos)))
proc readURI(lex: var Lexer) =
lex.evaluated.setLen(0)
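The rewritten hex escape handling above builds the code point one nibble at a time: each digit is converted to its value (subtracting 0x30 for '0'..'9', 0x37 for 'A'..'F', 0x57 for 'a'..'f') and OR-ed in, shifted left by four bits per remaining digit position. A standalone sketch of the same arithmetic, independent of the lexer's API:

import std/unicode

# Standalone sketch: decode a fixed-length hex escape into UTF-8,
# highest nibble first, as in readHexSequence above.
proc decodeHex(digits: string): string =
  var charPos = 0
  for i, c in digits:
    let digitPosition = digits.len - i - 1
    case c
    of '0'..'9': charPos = charPos or (int(c) - 0x30) shl (digitPosition * 4)
    of 'A'..'F': charPos = charPos or (int(c) - 0x37) shl (digitPosition * 4)
    of 'a'..'f': charPos = charPos or (int(c) - 0x57) shl (digitPosition * 4)
    else: raise newException(ValueError, "not a hex digit: " & c)
  result = toUTF8(Rune(charPos))

when isMainModule:
  doAssert decodeHex("41") == "A"            # \x41
  doAssert decodeHex("00e9") == "\xc3\xa9"   # \u00E9 as UTF-8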
@@ -383,7 +379,7 @@ proc readPlainScalar(lex: var Lexer) =
break inlineLoop
of EndOfFile:
lex.evaluated.add(lex.source.buf[lineStartPos..lex.source.bufpos - 2])
if lex.columnNumber() > 0:
if lex.currentIndentation() > 0:
lex.endToken()
lex.state = streamEnd
break multilineLoop
@@ -394,7 +390,7 @@ proc readPlainScalar(lex: var Lexer) =
while true:
case lex.startLine()
of lsContent:
if lex.columnNumber() <= lex.indentation:
if lex.currentIndentation() <= lex.indentation:
lex.state = afterNewlineState
break multilineLoop
break newlineLoop
@@ -412,6 +408,7 @@ proc readPlainScalar(lex: var Lexer) =
break multilineLoop
of lsNewline: lex.endLine()
newlines += 1
while lex.c == ' ': lex.advance()
if (lex.c == ':' and not lex.isPlainSafe()) or
lex.c == '#' or (lex.c in flowIndicators and
lex.flowDepth > 0):
@@ -423,7 +420,7 @@ proc readPlainScalar(lex: var Lexer) =
for i in countup(2, newlines): lex.evaluated.add('\l')
proc streamEndAfterBlock(lex: var Lexer) =
if lex.columnNumber() != 0:
if lex.currentIndentation() != 0:
lex.endToken()
lex.curEndPos.column -= 1
@@ -475,13 +472,13 @@ proc readBlockScalar(lex: var Lexer) =
if indent == 0:
while lex.c == ' ': lex.advance()
else:
maxLeadingSpaces = lex.columnNumber + indent
while lex.c == ' ' and lex.columnNumber < maxLeadingSpaces:
maxLeadingSpaces = lex.currentIndentation() + indent
while lex.c == ' ' and lex.currentIndentation() < maxLeadingSpaces:
lex.advance()
case lex.c
of '\l', '\c':
lex.endToken()
maxLeadingSpaces = max(maxLeadingSpaces, lex.columnNumber())
maxLeadingSpaces = max(maxLeadingSpaces, lex.currentIndentation())
lex.endLine()
separationLines += 1
of EndOfFile:
@@ -490,59 +487,60 @@ proc readBlockScalar(lex: var Lexer) =
break body
else:
if indent == 0:
indent = lex.columnNumber()
indent = lex.currentIndentation()
if indent <= max(0, lex.indentation):
lex.state = lineIndentation
break body
elif indent < maxLeadingSpaces:
raise lex.generateError("Leading all-spaces line contains too many spaces")
elif lex.columnNumber < indent: break body
elif lex.currentIndentation() < indent: break body
break
for i in countup(0, separationLines - 1):
lex.evaluated.add('\l')
block content:
contentStart = lex.source.bufpos - 1
while lex.c notin lineEnd: lex.advance()
lex.evaluated.add(lex.source.buf[contentStart .. lex.source.bufpos - 2])
separationLines = 0
if lex.c == EndOfFile:
lex.state = streamEnd
lex.streamEndAfterBlock()
break body
separationLines += 1
lex.endToken()
lex.endLine()
# empty lines and indentation of next line
while true:
while lex.c == ' ' and lex.columnNumber() < indent:
lex.advance()
case lex.c
of '\l', '\c':
lex.endToken()
separationLines += 1
lex.endLine()
of EndOfFile:
contentStart = lex.source.bufpos - 1
while lex.c notin lineEnd: lex.advance()
lex.evaluated.add(lex.source.buf[contentStart .. lex.source.bufpos - 2])
separationLines = 0
if lex.c == EndOfFile:
lex.state = streamEnd
lex.streamEndAfterBlock()
break body
separationLines += 1
lex.endToken()
lex.endLine()
# empty lines and indentation of next line
while true:
while lex.c == ' ' and lex.currentIndentation() < indent:
lex.advance()
case lex.c
of '\l', '\c':
lex.endToken()
separationLines += 1
lex.endLine()
of EndOfFile:
lex.state = streamEnd
lex.streamEndAfterBlock()
break body
else:
if lex.currentIndentation() < indent:
break content
else: break
# line folding
if lex.cur == Token.Literal:
for i in countup(0, separationLines - 1):
lex.evaluated.add('\l')
elif separationLines == 1:
lex.evaluated.add(' ')
else:
if lex.columnNumber() < indent:
break content
else: break
for i in countup(0, separationLines - 2):
lex.evaluated.add('\l')
# line folding
if lex.cur == Token.Literal:
for i in countup(0, separationLines - 1):
lex.evaluated.add('\l')
elif separationLines == 1:
lex.evaluated.add(' ')
else:
for i in countup(0, separationLines - 2):
lex.evaluated.add('\l')
if lex.columnNumber() > max(0, lex.indentation):
if lex.currentIndentation() > max(0, lex.indentation):
if lex.c == '#':
lex.state = expectLineEnd
else:
@@ -755,7 +753,7 @@ proc outsideDoc(lex: var Lexer): bool =
lex.startToken()
if lex.isDirectivesEnd():
lex.state = expectLineEnd
lex.cur = Token.DocumentEnd
lex.cur = Token.DirectivesEnd
else:
lex.state = indentationSettingToken
lex.cur = Token.Indentation
@@ -799,6 +797,7 @@ proc yamlVersion(lex: var Lexer): bool =
lex.cur = Token.DirectiveParam
lex.endToken()
lex.state = expectLineEnd
return true
proc tagShorthand(lex: var Lexer): bool =
debug("lex: tagShorthand")
@@ -822,6 +821,7 @@ proc tagShorthand(lex: var Lexer): bool =
lex.cur = Token.TagHandle
lex.endToken()
lex.state = tagUri
return true
proc tagUri(lex: var Lexer): bool =
debug("lex: tagUri")
@@ -886,7 +886,7 @@ proc flowLineStart(lex: var Lexer): bool =
return false
proc flowLineIndentation(lex: var Lexer): bool =
if lex.columnNumber() < lex.indentation:
if lex.currentIndentation() < lex.indentation:
raise lex.generateError("Too few indentation spaces (must surpass surrounding block level)")
lex.state = insideLine
return false
@@ -933,6 +933,7 @@ proc readNamespace(lex: var Lexer) =
lex.readURI()
lex.endToken()
lex.cur = Token.VerbatimTag
lex.state = afterToken
else:
var handleEnd = lex.tokenStart
while true:
@@ -1022,9 +1023,9 @@ proc insideLine(lex: var Lexer): bool =
return true
proc indentationSettingToken(lex: var Lexer): bool =
let cachedIntentation = lex.columnNumber()
let cachedIntentation = lex.currentIndentation()
result = lex.insideLine()
if result and lex.flowDepth > 0:
if result and lex.flowDepth == 0:
if lex.cur in nodePropertyKind:
lex.propertyIndentation = cachedIntentation
else:
@@ -1054,6 +1055,7 @@ proc afterJsonEnablingToken(lex: var Lexer): bool =
lex.endToken()
lex.cur = Token.MapValueInd
lex.state = afterToken
return true
of '#', '\l', '\c':
lex.endLine()
discard lex.flowLineStart()