Implemented tag handles

This commit is contained in:
Felix Krause 2016-09-11 12:52:24 +02:00
parent 279d233c4f
commit 4d1a444f61
2 changed files with 117 additions and 4 deletions

View File

@ -69,6 +69,8 @@ const
spaceOrLineEnd = {' ', '\t', '\l', '\c', EndOfFile}
digits = {'0'..'9'}
flowIndicators = {'[', ']', '{', '}', ','}
uriChars = {'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':',
'@', '&', '-', '=', '+', '$', '_', '.', '~', '*', '\'', '(', ')'}
UTF8NextLine = toUTF8(0x85.Rune)
UTF8NonBreakingSpace = toUTF8(0xA0.Rune)
@ -153,6 +155,15 @@ proc nextIsPlainSafe(lex: YamlLexer, t: typedesc[StringSource],
of flowIndicators: result = not inFlow
else: result = true
proc mark(lex: YamlLexer, t: typedesc[BaseLexer]): int = lex.blSource.bufpos
proc mark(lex: YamlLexer, t: typedesc[StringSource]): int = lex.sSource.pos
proc afterMark(lex: YamlLexer, t: typedesc[BaseLexer], m: int): int =
lex.blSource.bufpos - m
proc afterMark(lex: YamlLexer, t: typedesc[StringSource], m: int): int =
lex.sSource.pos - m
# lexer states
proc outsideDoc[T](lex: YamlLexer): bool
@ -169,6 +180,7 @@ proc insideLine[T](lex: YamlLexer): bool
proc plainScalarPart[T](lex: YamlLexer): bool
proc blockScalarHeader[T](lex: YamlLexer): bool
proc blockScalar[T](lex: YamlLexer): bool
proc tagHandle[T](lex: YamlLexer): bool
proc streamEnd(lex: YamlLexer): bool
# implementation
@ -574,6 +586,9 @@ proc insideLine[T](lex: YamlLexer): bool =
of '[': result = flowIndicator[T](lex, ltBracketOpen)
of ']': result = flowIndicator[T](lex, ltBracketClose)
of ',': result = flowIndicator[T](lex, ltComma)
of '!':
lex.nextState = tagHandle[T]
result = false
else:
lex.nextState = plainScalarPart[T]
result = false
@ -678,6 +693,71 @@ proc blockScalar[T](lex: YamlLexer): bool =
result = true
lex.nextState = expectLineEnd[T]
proc byteSequence[T](lex: YamlLexer) =
debug("lex: byteSequence")
var charCode = 0.int8
for i in 0 .. 1:
lex.advance(T)
let digitPosition = int8(1 - i)
case lex.c
of EndOfFile, '\l', 'r':
raise generateError[T](lex, "Unfinished octet escape sequence")
of '0' .. '9':
charCode = charCode or (int8(lex.c) - 0x30.int8) shl (digitPosition * 4)
of 'A' .. 'F':
charCode = charCode or (int8(lex.c) - 0x37.int8) shl (digitPosition * 4)
of 'a' .. 'f':
charCode = charCode or (int8(lex.c) - 0x57.int8) shl (digitPosition * 4)
else:
raise generateError[T](lex, "Invalid character in octet escape sequence")
lex.buf.add(char(charCode))
proc tagHandle[T](lex: YamlLexer): bool =
debug("lex: tagHandle")
lex.advance(T)
if lex.c == '<':
lex.advance(T)
if lex.c == '!':
lex.buf.add('!')
lex.advance(T)
while true:
case lex.c
of spaceOrLineEnd: raise generateError[T](lex, "Unclosed verbatim tag")
of '%': byteSequence[T](lex)
of uriChars + {','}: lex.buf.add(lex.c)
of '>': break
else: raise generateError[T](lex, "Illegal character in verbatim tag")
lex.advance(T)
lex.advance(T)
lex.cur = ltLiteralTag
else:
lex.shorthandEnd = 0
let m = lex.mark(T)
lex.buf.add('!')
while true:
case lex.c
of spaceOrLineEnd: break
of '!':
if lex.shorthandEnd != 0:
raise generateError[T](lex, "Illegal character in tag suffix")
lex.shorthandEnd = lex.afterMark(T, m) + 1
lex.buf.add('!')
of ',':
if lex.shorthandEnd > 0: break # ',' after shorthand is flow indicator
lex.buf.add(',')
of '%':
if lex.shorthandEnd == 0:
raise generateError[T](lex, "Illegal character in tag handle")
byteSequence[T](lex)
of uriChars: lex.buf.add(lex.c)
else: raise generateError[T](lex, "Illegal character in tag handle")
lex.advance(T)
lex.cur = ltTagHandle
while lex.c in space: lex.advance(T)
if lex.c in lineEnd: lex.nextState = expectLineEnd[T]
else: lex.nextState = insideLine[T]
result = true
proc streamEnd(lex: YamlLexer): bool =
debug("lex: streamEnd")
lex.cur = ltStreamEnd
@ -718,6 +798,13 @@ proc next*(lex: YamlLexer) =
proc setFlow*(lex: YamlLexer, value: bool) =
lex.inFlow = value
# in flow mode, no indentation tokens are generated because they are not
# necessary. actually, the lexer will behave wrongly if we do that, because
# adjacent values need to check if the preceding token was a JSON value, and
# if indentation tokens are generated, that information is not available.
# therefore, we do not use insideDoc in flow mode. another reason is that this
# would erratically check for document markers (---, ...) which are simply
# scalars in flow mode.
if value: lex.lineStartState = lex.insideLineImpl
else: lex.lineStartState = lex.insideDocImpl

View File

@ -3,8 +3,8 @@ import ../private/lex
import unittest, strutils
const tokensWithValue =
[ltScalarPart, ltQuotedScalar, ltYamlVersion, ltTagShorthand, ltTagUri,
ltUnknownDirective, ltUnknownDirectiveParams]
{ltScalarPart, ltQuotedScalar, ltYamlVersion, ltTagShorthand, ltTagUri,
ltUnknownDirective, ltUnknownDirectiveParams, ltLiteralTag}
type
TokenWithValue = object
@ -16,12 +16,14 @@ type
of ltBlockScalarHeader:
folded: bool
chomp: ChompType
of ltTagHandle:
handle, suffix: string
else: discard
proc actualRepr(lex: YamlLexer, t: LexerToken): string =
result = $t
case t
of tokensWithValue:
of tokensWithValue + {ltTagHandle}:
result.add("(" & escape(lex.buf) & ")")
of ltIndentation:
result.add("(" & $lex.indentation & ")")
@ -69,6 +71,17 @@ proc assertEquals(input: string, expected: varargs[TokenWithValue]) =
of ltBraceClose, ltBracketClose:
dec(flowDepth)
if flowDepth == 0: lex.setFlow(false)
of ltTagHandle:
let
handle = lex.buf.substr(0, lex.shorthandEnd)
suffix = lex.buf.substr(lex.shorthandEnd + 1)
doAssert handle == expectedToken.handle,
"Wrong handle at #" & $i & ": Expected " & expectedToken.handle &
", got " & handle
doAssert suffix == expectedToken.suffix,
"Wrong suffix at #" & $i & ": Expected " & expectedToken.suffix &
", got " & suffix
lex.buf = ""
else: discard
except YamlLexerError:
let e = (ref YamlLexerError)(getCurrentException())
@ -108,6 +121,10 @@ proc ac(): TokenWithValue = TokenWithValue(kind: ltBracketClose)
proc oo(): TokenWithValue = TokenWithValue(kind: ltBraceOpen)
proc oc(): TokenWithValue = TokenWithValue(kind: ltBraceClose)
proc c(): TokenWithValue = TokenWithValue(kind: ltComma)
proc th(handle, suffix: string): TokenWithValue =
TokenWithValue(kind: ltTagHandle, handle: handle, suffix: suffix)
proc lt(v: string): TokenWithValue =
TokenWithValue(kind: ltLiteralTag, value: v)
suite "Lexer":
test "Empty document":
@ -176,4 +193,13 @@ suite "Lexer":
test "Adjacent map values in flow style":
assertEquals("{\"foo\":bar, [1]\l:egg}", i(0), oo(), qs("foo"), mv(),
sp("bar"), c(), ao(), sp("1"), ac(), mv(), sp("egg"), oc(), se())
sp("bar"), c(), ao(), sp("1"), ac(), mv(), sp("egg"), oc(), se())
test "Tag handles":
assertEquals("- !!str string\l- !local local\l- !e! e", i(0), si(),
th("!!", "str"), sp("string"), i(0), si(), th("!", "local"),
sp("local"), i(0), si(), th("!e!", ""), sp("e"), se())
test "Literal tag handle":
assertEquals("!<tag:yaml.org,2002:str> string", i(0),
lt("tag:yaml.org,2002:str"), sp("string"), se())