lexer: implemented quoted scalars

2025-02-04 07:24:37 +00:00 · 2016-09-10 13:38:42 +02:00 · 2016-09-10 13:38:42 +02:00 · 79f432a27d
commit 79f432a27d
parent 68a157e173
2 changed files with 195 additions and 11 deletions
--- a/private/lex.nim
+++ b/private/lex.nim
@ -4,7 +4,7 @@
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.

-import lexbase, streams, strutils
+import lexbase, streams, strutils, unicode

 type
  StringSource* = object
@ -43,13 +43,14 @@ type

 # templates

-template advance(lex: YamlLexer[BaseLexer], step: int = 1) =
+proc advance(lex: YamlLexer[BaseLexer], step: int = 1) {.inline.} =
  lex.source.bufpos.inc(step)
  lex.c = lex.source.buf[lex.source.bufpos]

-template advance(lex: YamlLexer[StringSource], step: int = 1) =
+proc advance(lex: YamlLexer[StringSource], step: int = 1) {.inline.} =
  lex.source.pos.inc(step)
-  lex.c = lex.source.src[lex.source.pos]
+  if lex.source.pos >= lex.source.src.len: lex.c = EndOfFile
+  else: lex.c = lex.source.src[lex.source.pos]

 # lexer states

@ -62,7 +63,7 @@ proc expectLineEnd[T](lex: YamlLexer[T], t: var LexerToken): bool
 proc blockStyle[T](lex: YamlLexer[T], t: var LexerToken): bool {.locks:0.}
 proc blockStyleInline[T](lex: YamlLexer[T], t: var LexerToken): bool
 proc plainScalarPart[T](lex: YamlLexer[T], t: var LexerToken): bool
-proc flowStyle[T](lex: YamlLexer[T], t: var LexerToken): bool {.locks:0.}
+proc flowStyle[T](lex: YamlLexer[T], t: var LexerToken): bool
 proc streamEnd[T](lex: YamlLexer[T], t: var LexerToken): bool

 # interface
@ -92,6 +93,11 @@ const
  spaceOrLineEnd = {' ', '\t', '\l', '\c', EndOfFile}
  digits         = {'0'..'9'}
  flowIndicators = {'[', ']', '{', '}', ','}
+  
+  UTF8NextLine           = toUTF8(0x85.Rune)
+  UTF8NonBreakingSpace   = toUTF8(0xA0.Rune)
+  UTF8LineSeparator      = toUTF8(0x2028.Rune)
+  UTF8ParagraphSeparator = toUTF8(0x2029.Rune)

 template debug(message: string) {.dirty.} =
  when defined(yamlDebug):
@ -100,20 +106,24 @@ template debug(message: string) {.dirty.} =

 template lexCR(lex: YamlLexer[BaseLexer]) =
  lex.source.bufpos = lex.source.handleCR(lex.source.bufpos)
+  lex.c = lex.source.buf[lex.source.bufpos]

 template lexCR(lex: YamlLexer[StringSource]) =
  lex.source.pos.inc()
  if lex.source.src[lex.source.pos] == '\l': lex.source.pos.inc()
  lex.source.lineStart = lex.source.pos
  lex.source.line.inc()
+  lex.c = lex.source.src[lex.source.pos]

 template lexLF(lex: YamlLexer[BaseLexer]) =
  lex.source.bufpos = lex.source.handleLF(lex.source.bufpos)
+  lex.c = lex.source.buf[lex.source.bufpos]

 template lexLF(lex: YamlLexer[StringSource]) =
  lex.source.pos.inc()
  lex.source.lineStart = lex.source.pos
  lex.source.line.inc()
+  lex.c = lex.source.src[lex.source.pos]

 template lineNumber(lex: YamlLexer[BaseLexer]): int =
  lex.source.lineNumber
@ -382,11 +392,127 @@ proc flowIndicator[T](lex: YamlLexer[T], indicator: LexerToken,
    when inFlow: lex.stored = flowStyle[T]
    else: lex.stored = blockStyle[T]

+proc addMultiple(s: var string, c: char, num: int) {.raises: [], inline.} =
+  for i in 1..num:
+    s.add(c)
+
+proc processQuotedWhitespace(lex: YamlLexer, newlines: var int) =
+  block outer:
+    let beforeSpace = lex.buf.len
+    while true:
+      case lex.c
+      of ' ', '\t': lex.buf.add(lex.c)
+      of '\l':
+        lex.lexLF()
+        break
+      of '\c':
+        lex.lexCR()
+        break
+      else: break outer
+      lex.advance()
+    lex.buf.setLen(beforeSpace)
+    while true:
+      case lex.c
+      of ' ', '\t': discard
+      of '\l':
+        lex.lexLF()
+        newlines.inc()
+        continue
+      of '\c':
+        lex.lexCR()
+        newlines.inc()
+        continue
+      else:
+        if newlines == 0: discard
+        elif newlines == 1: lex.buf.add(' ')
+        else: lex.buf.addMultiple('\l', newlines - 1)
+        break
+      lex.advance()
+
 proc singleQuotedScalar[T](lex: YamlLexer[T]) =
-  discard
-  
+  debug("lex: singleQuotedScalar")
+  lex.advance()
+  while true:
+    case lex.c
+    of '\'':
+      lex.advance()
+      if lex.c == '\'': lex.buf.add('\'')
+      else: break
+    of EndOfFile: raise lex.generateError("Unfinished single quoted string")
+    of '\l', '\c', '\t', ' ':
+      var newlines = 1
+      lex.processQuotedWhitespace(newlines)
+      continue
+    else: lex.buf.add(lex.c)
+    lex.advance()
+
+proc unicodeSequence(lex: YamlLexer, length: int) =
+  debug("lex: unicodeSequence")
+  var unicodeChar = 0.int
+  for i in countup(0, length - 1):
+    lex.advance()
+    let digitPosition = length - i - 1
+    case lex.c
+    of EndOFFile, '\l', '\c':
+      raise lex.generateError("Unfinished unicode escape sequence")
+    of '0' .. '9':
+      unicodeChar = unicodechar or (int(lex.c) - 0x30) shl (digitPosition * 4)
+    of 'A' .. 'F':
+      unicodeChar = unicodechar or (int(lex.c) - 0x37) shl (digitPosition * 4)
+    of 'a' .. 'f':
+      unicodeChar = unicodechar or (int(lex.c) - 0x57) shl (digitPosition * 4)
+    else:
+      raise lex.generateError(
+          "Invalid character in unicode escape sequence: " &
+          escape("" & lex.c))
+  lex.buf.add(toUTF8(Rune(unicodeChar)))
+
 proc doubleQuotedScalar[T](lex: YamlLexer[T]) =
-  discard
+  debug("lex: doubleQuotedScalar")
+  lex.advance()
+  while true:
+    case lex.c
+    of EndOfFile:
+      raise lex.generateError("Unfinished double quoted string")
+    of '\\':
+      lex.advance()
+      case lex.c
+      of EndOfFile:
+        raise lex.generateError("Unfinished escape sequence")
+      of '0':       lex.buf.add('\0')
+      of 'a':       lex.buf.add('\x07')
+      of 'b':       lex.buf.add('\x08')
+      of '\t', 't': lex.buf.add('\t')
+      of 'n':       lex.buf.add('\l')
+      of 'v':       lex.buf.add('\v')
+      of 'f':       lex.buf.add('\f')
+      of 'r':       lex.buf.add('\c')
+      of 'e':       lex.buf.add('\e')
+      of ' ':       lex.buf.add(' ')
+      of '"':       lex.buf.add('"')
+      of '/':       lex.buf.add('/')
+      of '\\':      lex.buf.add('\\')
+      of 'N':       lex.buf.add(UTF8NextLine)
+      of '_':       lex.buf.add(UTF8NonBreakingSpace)
+      of 'L':       lex.buf.add(UTF8LineSeparator)
+      of 'P':       lex.buf.add(UTF8ParagraphSeparator)
+      of 'x':       lex.unicodeSequence(2)
+      of 'u':       lex.unicodeSequence(4)
+      of 'U':       lex.unicodeSequence(8)
+      of '\l', '\c':
+        var newlines = 0
+        lex.processQuotedWhitespace(newlines)
+        continue
+      else: raise lex.generateError("Illegal character in escape sequence")
+    of '"':
+      lex.advance()
+      break
+    of '\l', '\c', '\t', ' ':
+      var newlines = 1
+      lex.processQuotedWhitespace(newlines)
+      continue
+    else: lex.buf.add(lex.c)
+    lex.advance()

 proc blockStyleInline[T](lex: YamlLexer[T], t: var LexerToken): bool =
  case lex.c
@ -452,12 +578,18 @@ proc plainScalarPart[T](lex: YamlLexer[T], t: var LexerToken): bool =
              break
          of space: discard
          else: break
+      of lineEnd:
+        lex.nextImpl = expectLineEnd[T]
+        lex.stored = if lex.inFlow: flowStyle[T] else: blockStyle[T]
+        break
      of flowIndicators:
        if lex.inFlow:
          lex.nextImpl = lex.stored
          break
      of ':':
-        if not lex.nextIsPlainSafe(lex.inFlow): break outer
+        if not lex.nextIsPlainSafe(lex.inFlow):
+          lex.nextImpl = blockStyleInline[T]
+          break outer
      else: discard
  t = ltScalarPart
  result = true
--- a/test/tlex.nim
+++ b/test/tlex.nim
@ -9,6 +9,8 @@ type
    case kind: LexerToken
    of tokensWithValue:
      value: string
+    of ltIndentation:
+      indentation: int
    else: discard

 proc assertEquals(input: string, expected: varargs[TokenWithValue]) =
@ -18,13 +20,63 @@ proc assertEquals(input: string, expected: varargs[TokenWithValue]) =
    let t = lex.next()
    doAssert t == expectedToken.kind, "Wrong token kind: Expected " &
        $expectedToken.kind & ", got " & $t
-    if expectedToken.kind in tokensWithValue:
+    case expectedToken.kind 
+    of tokensWithValue:
      doAssert lex.buf == expectedToken.value,
          "Wrong token content: Expected " & escape(expectedToken.value) &
          ", got " & escape(lex.buf)
+      lex.buf = ""
+    of ltIndentation:
+      doAssert lex.indentation == expectedToken.indentation,
+          "Wrong indentation length: Expected " & $expectedToken.indentation &
+          ", got " & $lex.indentation
+    else: discard

+proc i(indent: int): TokenWithValue =
+  TokenWithValue(kind: ltIndentation, indentation: indent)
+proc sp(v: string): TokenWithValue =
+  TokenWithValue(kind: ltScalarPart, value: v)
+proc qs(v: string): TokenWithValue =
+  TokenWithValue(kind: ltQuotedScalar, value: v)
 proc se(): TokenWithValue = TokenWithValue(kind: ltStreamEnd)
+proc mk(): TokenWithValue = TokenWithValue(kind: ltMapKeyInd)
+proc mv(): TokenWithValue = TokenWithValue(kind: ltMapValInd)

 suite "Lexer":
  test "Empty document":
-    assertEquals("", se())
+    assertEquals("", se())
+  
+  test "Single-line scalar":
+    assertEquals("scalar", i(0), sp("scalar"), se())
+  
+  test "Multiline scalar":
+    assertEquals("scalar\l  line two", i(0), sp("scalar"), i(2),
+        sp("line two"), se())
+  
+  test "Single-line mapping":
+    assertEquals("key: value", i(0), sp("key"), mv(), sp("value"), se())
+  
+  test "Multiline mapping":
+    assertEquals("key:\n  value", i(0), sp("key"), mv(), i(2), sp("value"),
+        se())
+  
+  test "Explicit mapping":
+    assertEquals("? key\n: value", i(0), mk(), sp("key"), i(0), mv(),
+        sp("value"), se())
+  
+  test "Single-line single-quoted scalar":
+    assertEquals("'quoted  scalar'", i(0), qs("quoted  scalar"), se())
+  
+  test "Multiline single-quoted scalar":
+    assertEquals("'quoted\l  multi line  \l\lscalar'", i(0),
+    qs("quoted multi line\lscalar"), se())
+  
+  test "Single-line double-quoted scalar":
+    assertEquals("\"quoted  scalar\"", i(0), qs("quoted  scalar"), se())
+  
+  test "Multiline double-quoted scalar":
+    assertEquals("\"quoted\l  multi line  \l\lscalar\"", i(0),
+    qs("quoted multi line\lscalar"), se())
+  
+  test "Escape sequences":
+    assertEquals(""""\n\x31\u0032\U00000033"""", i(0), qs("\l123"), se())