From 4d1a444f6175de011da9ad7d6da6712410dbbf39 Mon Sep 17 00:00:00 2001
From: Felix Krause
Date: Sun, 11 Sep 2016 12:52:24 +0200
Subject: [PATCH] Implemented tag handles

The lexer now dispatches to a new tagHandle state when it sees '!':
 * verbatim tags (!<uri>) are emitted as ltLiteralTag with the URI in
   lex.buf
 * all other tags are emitted as ltTagHandle; lex.buf holds handle and
   suffix, lex.shorthandEnd is the index of the '!' closing the handle
%xx escapes inside tags are decoded by the new byteSequence helper.

---
 private/lex.nim | 95 +++++++++++++++++++++++++++++++++++++++++++++++++
 test/tlex.nim   | 34 ++++++++++++++++---
 2 files changed, 125 insertions(+), 4 deletions(-)

diff --git a/private/lex.nim b/private/lex.nim
index a96b11d..eea2a65 100644
--- a/private/lex.nim
+++ b/private/lex.nim
@@ -69,6 +69,8 @@ const
   spaceOrLineEnd = {' ', '\t', '\l', '\c', EndOfFile}
   digits = {'0'..'9'}
   flowIndicators = {'[', ']', '{', '}', ','}
+  uriChars = {'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':',
+      '@', '&', '-', '=', '+', '$', '_', '.', '~', '*', '\'', '(', ')'}
 
   UTF8NextLine = toUTF8(0x85.Rune)
   UTF8NonBreakingSpace = toUTF8(0xA0.Rune)
@@ -153,6 +155,15 @@ proc nextIsPlainSafe(lex: YamlLexer, t: typedesc[StringSource],
   of flowIndicators: result = not inFlow
   else: result = true
 
+proc mark(lex: YamlLexer, t: typedesc[BaseLexer]): int = lex.blSource.bufpos
+proc mark(lex: YamlLexer, t: typedesc[StringSource]): int = lex.sSource.pos
+
+proc afterMark(lex: YamlLexer, t: typedesc[BaseLexer], m: int): int =
+  lex.blSource.bufpos - m
+
+proc afterMark(lex: YamlLexer, t: typedesc[StringSource], m: int): int =
+  lex.sSource.pos - m
+
 # lexer states
 
 proc outsideDoc[T](lex: YamlLexer): bool
@@ -169,6 +180,7 @@ proc insideLine[T](lex: YamlLexer): bool
 proc plainScalarPart[T](lex: YamlLexer): bool
 proc blockScalarHeader[T](lex: YamlLexer): bool
 proc blockScalar[T](lex: YamlLexer): bool
+proc tagHandle[T](lex: YamlLexer): bool
 proc streamEnd(lex: YamlLexer): bool
 
 # implementation
@@ -574,6 +586,9 @@ proc insideLine[T](lex: YamlLexer): bool =
   of '[': result = flowIndicator[T](lex, ltBracketOpen)
   of ']': result = flowIndicator[T](lex, ltBracketClose)
   of ',': result = flowIndicator[T](lex, ltComma)
+  of '!':
+    lex.nextState = tagHandle[T]
+    result = false
   else:
     lex.nextState = plainScalarPart[T]
     result = false
@@ -678,6 +693,79 @@ proc blockScalar[T](lex: YamlLexer): bool =
     result = true
   lex.nextState = expectLineEnd[T]
 
+proc byteSequence[T](lex: YamlLexer) =
+  ## Reads the two hex digits that follow a '%' and appends the byte they
+  ## encode to lex.buf. On return, lex.c is the second hex digit. Raises the
+  ## error built by generateError if a digit is missing or not hexadecimal.
+  debug("lex: byteSequence")
+  # accumulate in an int; an int8 cannot represent bytes >= 0x80
+  var charCode = 0
+  for i in 0 .. 1:
+    lex.advance(T)
+    let shift = (1 - i) * 4
+    case lex.c
+    of EndOfFile, '\l', '\c':
+      raise generateError[T](lex, "Unfinished octet escape sequence")
+    of '0' .. '9':
+      charCode = charCode or ((ord(lex.c) - 0x30) shl shift)
+    of 'A' .. 'F':
+      charCode = charCode or ((ord(lex.c) - 0x37) shl shift)
+    of 'a' .. 'f':
+      charCode = charCode or ((ord(lex.c) - 0x57) shl shift)
+    else:
+      raise generateError[T](lex, "Invalid character in octet escape sequence")
+  lex.buf.add(char(charCode))
+
+proc tagHandle[T](lex: YamlLexer): bool =
+  ## Lexes a tag starting at '!'. A verbatim tag (`!<...>`) yields
+  ## ltLiteralTag with the URI in lex.buf; anything else yields ltTagHandle
+  ## with handle and suffix in lex.buf, where lex.shorthandEnd is the index
+  ## of the '!' that ends the handle (0 if there is only the leading '!').
+  debug("lex: tagHandle")
+  lex.advance(T)
+  if lex.c == '<':
+    lex.advance(T)
+    if lex.c == '!':
+      lex.buf.add('!')
+      lex.advance(T)
+    while true:
+      case lex.c
+      of spaceOrLineEnd: raise generateError[T](lex, "Unclosed verbatim tag")
+      of '%': byteSequence[T](lex)
+      of uriChars + {','}: lex.buf.add(lex.c)
+      of '>': break
+      else: raise generateError[T](lex, "Illegal character in verbatim tag")
+      lex.advance(T)
+    lex.advance(T)
+    lex.cur = ltLiteralTag
+  else:
+    lex.shorthandEnd = 0
+    let m = lex.mark(T)
+    lex.buf.add('!')
+    while true:
+      case lex.c
+      of spaceOrLineEnd: break
+      of '!':
+        if lex.shorthandEnd != 0:
+          raise generateError[T](lex, "Illegal character in tag suffix")
+        lex.shorthandEnd = lex.afterMark(T, m) + 1
+        lex.buf.add('!')
+      of ',':
+        if lex.shorthandEnd > 0: break # ',' after shorthand is flow indicator
+        lex.buf.add(',')
+      of '%':
+        if lex.shorthandEnd == 0:
+          raise generateError[T](lex, "Illegal character in tag handle")
+        byteSequence[T](lex)
+      of uriChars: lex.buf.add(lex.c)
+      else: raise generateError[T](lex, "Illegal character in tag handle")
+      lex.advance(T)
+    lex.cur = ltTagHandle
+  while lex.c in space: lex.advance(T)
+  if lex.c in lineEnd: lex.nextState = expectLineEnd[T]
+  else: lex.nextState = insideLine[T]
+  result = true
+
 proc streamEnd(lex: YamlLexer): bool =
   debug("lex: streamEnd")
   lex.cur = ltStreamEnd
@@ -718,6 +806,13 @@ proc next*(lex: YamlLexer) =
 
 proc setFlow*(lex: YamlLexer, value: bool) =
   lex.inFlow = value
+  # in flow mode, no indentation tokens are generated because they are not
+  # necessary. actually, the lexer will behave wrongly if we do that, because
+  # adjacent values need to check if the preceding token was a JSON value, and
+  # if indentation tokens are generated, that information is not available.
+  # therefore, we do not use insideDoc in flow mode. another reason is that this
+  # would erratically check for document markers (---, ...) which are simply
+  # scalars in flow mode.
   if value: lex.lineStartState = lex.insideLineImpl
   else: lex.lineStartState = lex.insideDocImpl
 
diff --git a/test/tlex.nim b/test/tlex.nim
index 51eac98..cc5492b 100644
--- a/test/tlex.nim
+++ b/test/tlex.nim
@@ -3,8 +3,8 @@ import ../private/lex
 import unittest, strutils
 
 const tokensWithValue =
-    [ltScalarPart, ltQuotedScalar, ltYamlVersion, ltTagShorthand, ltTagUri,
-     ltUnknownDirective, ltUnknownDirectiveParams]
+    {ltScalarPart, ltQuotedScalar, ltYamlVersion, ltTagShorthand, ltTagUri,
+     ltUnknownDirective, ltUnknownDirectiveParams, ltLiteralTag}
 
 type
   TokenWithValue = object
@@ -16,12 +16,14 @@ type
     of ltBlockScalarHeader:
       folded: bool
       chomp: ChompType
+    of ltTagHandle:
+      handle, suffix: string
     else: discard
 
 proc actualRepr(lex: YamlLexer, t: LexerToken): string =
   result = $t
   case t
-  of tokensWithValue:
+  of tokensWithValue + {ltTagHandle}:
     result.add("(" & escape(lex.buf) & ")")
   of ltIndentation:
     result.add("(" & $lex.indentation & ")")
@@ -69,6 +71,17 @@ proc assertEquals(input: string, expected: varargs[TokenWithValue]) =
       of ltBraceClose, ltBracketClose:
         dec(flowDepth)
         if flowDepth == 0: lex.setFlow(false)
+      of ltTagHandle:
+        let
+          handle = lex.buf.substr(0, lex.shorthandEnd)
+          suffix = lex.buf.substr(lex.shorthandEnd + 1)
+        doAssert handle == expectedToken.handle,
+            "Wrong handle at #" & $i & ": Expected " & expectedToken.handle &
+            ", got " & handle
+        doAssert suffix == expectedToken.suffix,
+            "Wrong suffix at #" & $i & ": Expected " & expectedToken.suffix &
+            ", got " & suffix
+        lex.buf = ""
       else: discard
     except YamlLexerError:
       let e = (ref YamlLexerError)(getCurrentException())
@@ -108,6 +121,10 @@ proc ac(): TokenWithValue = TokenWithValue(kind: ltBracketClose)
 proc oo(): TokenWithValue = TokenWithValue(kind: ltBraceOpen)
 proc oc(): TokenWithValue = TokenWithValue(kind: ltBraceClose)
 proc c(): TokenWithValue = TokenWithValue(kind: ltComma)
+proc th(handle, suffix: string): TokenWithValue =
+  TokenWithValue(kind: ltTagHandle, handle: handle, suffix: suffix)
+proc lt(v: string): TokenWithValue =
+  TokenWithValue(kind: ltLiteralTag, value: v)
 
 suite "Lexer":
   test "Empty document":
@@ -176,4 +193,13 @@ suite "Lexer":
 
   test "Adjacent map values in flow style":
     assertEquals("{\"foo\":bar, [1]\l:egg}", i(0), oo(), qs("foo"), mv(),
-        sp("bar"), c(), ao(), sp("1"), ac(), mv(), sp("egg"), oc(), se())
\ No newline at end of file
+        sp("bar"), c(), ao(), sp("1"), ac(), mv(), sp("egg"), oc(), se())
+
+  test "Tag handles":
+    assertEquals("- !!str string\l- !local local\l- !e! e", i(0), si(),
+        th("!!", "str"), sp("string"), i(0), si(), th("!", "local"),
+        sp("local"), i(0), si(), th("!e!", ""), sp("e"), se())
+
+  test "Literal tag handle":
+    assertEquals("!<tag:yaml.org,2002:str> string", i(0),
+        lt("tag:yaml.org,2002:str"), sp("string"), se())
\ No newline at end of file