NimYAML/yaml/private/lex.nim

#            NimYAML - YAML implementation in Nim
#        (c) Copyright 2015-2023 Felix Krause
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.

import lexbase, streams, strutils, unicode
import ../data
when defined(yamlDebug):
  import terminal
  export terminal

type
  Lexer* = object
    cur*: Token
    curStartPos*, curEndPos*: Mark
    flowDepth*: int
    # recently read scalar or URI, if any
    evaluated*: string
    # internals
    indentation: int
    source: BaseLexer
    tokenStart: int
    state, lineStartState, jsonEnablingState: State
    c: char
    seenMultiline: bool
    # indentation of recently started set of node properties.
    # necessary for implicit keys with properties.
    propertyIndentation: int

  LexerError* = object of ValueError
    line*, column*: Positive
    lineContent*: string

  State = proc(lex: var Lexer): bool {.gcSafe, nimcall, raises: [LexerError].}

  Token* {.pure.} = enum
    YamlDirective,    # `%YAML`
    TagDirective,     # `%TAG`
    UnknownDirective, # any directive but `%YAML` and `%TAG`
    DirectiveParam,   # parameters of %YAML and unknown directives
    EmptyLine,        # for line folding in multiline plain scalars
    DirectivesEnd,    # explicit `---`
    DocumentEnd,      # explicit `...`
    StreamEnd,        # end of input
    Indentation,      # beginning of non-empty line
    Plain, SingleQuoted, DoubleQuoted, Literal, Folded,
    SeqItemInd,       # block sequence item indicator `- `
    MapKeyInd,        # block mapping key indicator `? `
    MapValueInd       # block mapping value indicator `: `
    MapStart, MapEnd, SeqStart, SeqEnd, SeqSep # {}[],
    TagHandle,        # a handle of a tag, e.g. `!!` of `!!str`
    Suffix,           # suffix of a tag shorthand, e.g. `str` of `!!str`.
                        # also used for the URI of the %TAG directive
    VerbatimTag,      # a verbatim tag, e.g. `!<tag:yaml.org,2002:str>`
    Anchor,           # anchor property of a node, e.g. `&anchor`
    Alias             # alias node, e.g. `*alias`

  ChompType* = enum
    ctKeep, ctClip, ctStrip

  LineStartType = enum
    lsDirectivesEndMarker, lsDocumentEndMarker, lsComment,
    lsNewline, lsStreamEnd, lsContent

# consts

const
  space          = {' ', '\t'}
  lineEnd        = {'\l', '\c', EndOfFile}
  spaceOrLineEnd = {' ', '\t', '\l', '\c', EndOfFile}
  commentOrLineEnd = {'\l', '\c', EndOfFile, '#'}
  digits         = {'0'..'9'}
  flowIndicators = {'[', ']', '{', '}', ','}
  uriChars       = {'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':',
      '@', '&', '-', '=', '+', '$', '_', '.', '~', '*', '\'', '(', ')'}
  tagShorthandChars = {'a' .. 'z', 'A' .. 'Z', '0' .. '9', '-'}
  nodePropertyKind* = {Token.TagHandle, Token.VerbatimTag, Token.Anchor}
  scalarTokenKind* = {Token.Plain, Token.SingleQuoted, Token.DoubleQuoted,
                     Token.Literal, Token.Folded}

  UTF8NextLine           = toUTF8(0x85.Rune)
  UTF8NonBreakingSpace   = toUTF8(0xA0.Rune)
  UTF8LineSeparator      = toUTF8(0x2028.Rune)
  UTF8ParagraphSeparator = toUTF8(0x2029.Rune)

  UnknownIndentation* = int.low

proc currentIndentation*(lex: Lexer): int =
  return lex.source.getColNumber(lex.source.bufpos) - 1

proc recentIndentation*(lex: Lexer): int =
  return lex.indentation

# lexer source handling

proc advance(lex: var Lexer, step: int = 1) {.inline.} =
  lex.c = lex.source.buf[lex.source.bufpos]
  lex.source.bufpos.inc(step)

template lexCR(lex: var Lexer) =
  try: lex.source.bufpos = lex.source.handleCR(lex.source.bufpos - 1)
  except CatchableError as ce:
    var e = lex.generateError("Encountered stream error: " & ce.msg)
    e.parent = ce
    raise e
  lex.advance()

template lexLF(lex: var Lexer) =
  try: lex.source.bufpos = lex.source.handleLF(lex.source.bufpos - 1)
  except CatchableError as ce:
    var e = generateError(lex, "Encountered stream error: " & ce.msg)
    e.parent = ce
    raise e
  lex.advance()

template lineNumber(lex: Lexer): Positive =
  lex.source.lineNumber

template columnNumber(lex: Lexer): Positive =
  lex.source.getColNumber(lex.source.bufpos)

template currentLine(lex: Lexer): string =
  lex.source.getCurrentLine(true)

proc isPlainSafe(lex: Lexer): bool {.inline.} =
  case lex.source.buf[lex.source.bufpos]
  of spaceOrLineEnd: result = false
  of flowIndicators: result = lex.flowDepth == 0
  else: result = true

# lexer states

{.push gcSafe.}
# `raises` cannot be pushed.
proc outsideDoc(lex: var Lexer): bool {.raises: [].}
proc yamlVersion(lex: var Lexer): bool {.raises: LexerError.}
proc tagShorthand(lex: var Lexer): bool {.raises: LexerError.}
proc tagUri(lex: var Lexer): bool {.raises: LexerError.}
proc unknownDirParams(lex: var Lexer): bool {.raises: [].}
proc expectLineEnd(lex: var Lexer): bool {.raises: LexerError.}
proc lineStart(lex: var Lexer): bool {.raises: LexerError.}
proc flowLineStart(lex: var Lexer): bool {.raises: LexerError.}
proc flowLineIndentation(lex: var Lexer): bool {.raises: LexerError.}
proc insideLine(lex: var Lexer): bool {.raises: LexerError.}
proc indentationSettingToken(lex: var Lexer): bool {.raises: LexerError.}
proc afterToken(lex: var Lexer): bool {.raises: LexerError.}
proc beforeIndentationSettingToken(lex: var Lexer): bool {.raises: LexerError.}
proc afterJsonEnablingToken(lex: var Lexer): bool {.raises: LexerError.}
proc lineIndentation(lex: var Lexer): bool {.raises: [].}
proc lineDirEnd(lex: var Lexer): bool {.raises: [].}
proc lineDocEnd(lex: var Lexer): bool {.raises: [].}
proc atSuffix(lex: var Lexer): bool {.raises: [LexerError].}
proc streamEnd(lex: var Lexer): bool {.raises: [].}
{.pop.}

# helpers

template debug*(message: string) =
  when defined(yamlDebug):
    when nimvm:
      echo "yamlDebug: ", message
    else:
      try: styledWriteLine(stdout, fgBlue, message)
      except ValueError, IOError: discard

proc generateError(lex: Lexer, message: string):
    ref LexerError {.raises: [].} =
  result = (ref LexerError)(
    msg: message, line: lex.lineNumber(), column: lex.columnNumber(),
    lineContent: lex.currentLine())

proc startToken(lex: var Lexer) {.inline.} =
  lex.curStartPos = Mark(line: lex.lineNumber(), column: lex.columnNumber())
  lex.tokenStart = lex.source.bufpos

proc endToken(lex: var Lexer) {.inline.} =
  lex.curEndPos = Mark(line: lex.lineNumber(), column: lex.columnNumber())

proc readNumericSubtoken(lex: var Lexer) {.inline.} =
  if lex.c notin digits:
    raise lex.generateError("Illegal character in YAML version string: " & escape("" & lex.c))
  while true:
    lex.advance()
    if lex.c notin digits: break

proc isDirectivesEnd(lex: var Lexer): bool =
  var peek = lex.source.bufpos
  if lex.source.buf[peek] == '-':
    peek += 1
    if lex.source.buf[peek] == '-':
      peek += 1
      if lex.source.buf[peek] in spaceOrLineEnd:
        lex.source.bufpos = peek
        lex.advance()
        return true
  return false

proc isDocumentEnd(lex: var Lexer): bool =
  var peek = lex.source.bufpos
  if lex.source.buf[peek] == '.':
    peek += 1
    if lex.source.buf[peek] == '.':
      peek += 1
      if lex.source.buf[peek] in spaceOrLineEnd:
        lex.source.bufpos = peek
        lex.advance()
        return true
  return false

proc readHexSequence(lex: var Lexer, len: int) =
  var charPos = 0
  for i in countup(0, len-1):
    lex.advance()
    let digitPosition = len - i - 1
    case lex.c
    of lineEnd:
      raise lex.generateError("Unfinished unicode escape sequence")
    of '0'..'9':
      charPos = charPos or (int(lex.c) - 0x30) shl (digitPosition * 4)
    of 'A' .. 'F':
      charPos = charPos or (int(lex.c) - 0x37) shl (digitPosition * 4)
    of 'a' .. 'f':
      charPos = charPos or (int(lex.c) - 0x57) shl (digitPosition * 4)
    else:
      raise lex.generateError("Invalid character in hex escape sequence: " &
          escape("" & lex.c))
  lex.evaluated.add(toUTF8(Rune(charPos)))

proc readURI(lex: var Lexer, verbatim: bool) =
  lex.evaluated.setLen(0)
  let endWithSpace = lex.c != '<'
  let restricted = lex.flowDepth > 0 and not verbatim
  var literalStart: int
  if endWithSpace:
    if not restricted and lex.c in {'[', ']', ','}:
      raise lex.generateError("Flow indicator cannot start tag prefix")
    literalStart = lex.source.bufpos - 1
  else:
    literalStart = lex.source.bufpos
    lex.advance()
  while true:
    case lex.c
    of spaceOrLineEnd:
      if endWithSpace:
        lex.evaluated.add(lex.source.buf[literalStart..lex.source.bufpos - 2])
        break
      raise lex.generateError("Unclosed verbatim tag")
    of '%':
      lex.evaluated.add(lex.source.buf[literalStart..lex.source.bufpos - 2])
      lex.readHexSequence(2)
      literalStart = lex.source.bufpos
    of uriChars: discard
    of '[', ']', ',':
      if restricted:
        lex.evaluated.add(lex.source.buf[literalStart..lex.source.bufpos - 2])
        break
    of '!':
      if restricted:
        raise lex.generateError("Illegal '!' in tag suffix")
    of '>':
      if endWithSpace:
        raise lex.generateError("Illegal character in URI: `>`")
      lex.evaluated.add(lex.source.buf[literalStart..lex.source.bufpos - 2])
      lex.advance()
      break
    else:
      raise lex.generateError("Illegal character in URI: " & escape("" & lex.c))
    lex.advance()

proc endLine(lex: var Lexer) =
  while true:
    case lex.c
    of '\l':
      lex.lexLF()
      lex.state = lex.lineStartState
      break
    of '\c':
      lex.lexCR()
      lex.state = lex.lineStartState
      break
    of EndOfFile:
      lex.state = streamEnd
      break
    of '#':
      while true:
        lex.advance()
        if lex.c in lineEnd: break
    else: discard

proc startLine(lex: var Lexer): LineStartType =
  case lex.c
  of '-':
    return if lex.isDirectivesEnd(): lsDirectivesEndMarker
           else: lsContent
  of '.':
    return if lex.isDocumentEnd(): lsDocumentEndMarker
           else: lsContent
  else:
    while lex.c == ' ': lex.advance()
    if lex.c == '\t':
      var peek = lex.source.bufpos
      while lex.source.buf[peek] in space:
        peek += 1
      if lex.source.buf[peek] in commentOrLineEnd:
        lex.source.bufpos = peek + 1
        lex.c = lex.source.buf[peek]
      else:
        return lsContent
    return case lex.c
    of '#': lsComment
    of '\l', '\c': lsNewline
    of EndOfFile: lsStreamEnd
    else: lsContent

proc readPlainScalar(lex: var Lexer) =
  lex.evaluated.setLen(0)
  let afterNewlineState = if lex.flowDepth == 0: lineIndentation
                          else: flowLineIndentation
  var lineStartPos: int
  lex.seenMultiline = false
  lex.startToken()
  if lex.propertyIndentation != -1:
    lex.indentation = lex.propertyIndentation
    lex.propertyIndentation = -1
  lex.cur = Token.Plain
  block multilineLoop:
    while true:
      lineStartPos = lex.source.bufpos - 1
      block inlineLoop:
        while true:
          lex.advance()
          case lex.c
          of space:
            lex.endToken()
            let spaceStart = lex.source.bufpos - 2
            block spaceLoop:
              while true:
                lex.advance()
                case lex.c
                of '\l', '\c':
                  lex.evaluated.add(lex.source.buf[lineStartPos..spaceStart])
                  break inlineLoop
                of EndOfFile:
                  lex.evaluated.add(lex.source.buf[lineStartPos..spaceStart])
                  lex.state = streamEnd
                  break multilineLoop
                of '#':
                  lex.evaluated.add(lex.source.buf[lineStartPos..spaceStart])
                  lex.state = expectLineEnd
                  break multilineLoop
                of ':':
                  if not lex.isPlainSafe():
                    lex.evaluated.add(lex.source.buf[lineStartPos..spaceStart])
                    lex.state = insideLine
                    break multilineLoop
                  break spaceLoop
                of flowIndicators:
                  if lex.flowDepth > 0:
                    lex.evaluated.add(lex.source.buf[lineStartPos..spaceStart])
                    lex.state = insideLine
                    break multilineLoop
                  break spaceLoop
                of space: discard
                else: break spaceLoop
          of ':':
            if not lex.isPlainSafe():
              lex.evaluated.add(lex.source.buf[lineStartPos..lex.source.bufpos - 2])
              lex.endToken()
              lex.state = insideLine
              break multilineLoop
          of flowIndicators:
            if lex.flowDepth > 0:
              lex.evaluated.add(lex.source.buf[lineStartPos..lex.source.bufpos - 2])
              lex.endToken()
              lex.state = insideLine
              break multilineLoop
          of '\l', '\c':
            lex.evaluated.add(lex.source.buf[lineStartPos..lex.source.bufpos - 2])
            lex.endToken()
            break inlineLoop
          of EndOfFile:
            lex.evaluated.add(lex.source.buf[lineStartPos..lex.source.bufpos - 2])
            if lex.currentIndentation() > 0:
              lex.endToken()
            lex.state = streamEnd
            break multilineLoop
          else: discard
      lex.endLine()
      var newlines = 1
      block newlineLoop:
        while true:
          case lex.startLine()
          of lsContent:
            if lex.currentIndentation() <= lex.indentation:
              lex.state = afterNewlineState
              break multilineLoop
            if lex.c == '\t':
              while lex.c in space: lex.advance()
              case lex.c:
              of '#':
                lex.endLine()
                lex.state = lineStart
                break multilineLoop
              of '\l', '\c':
                lex.endLine()
                newlines += 1
                continue
              else: discard
            break newlineLoop
          of lsDirectivesEndMarker:
            lex.state = lineDirEnd
            break multilineLoop
          of lsDocumentEndMarker:
            lex.state = lineDocEnd
            break multilineLoop
          of lsStreamEnd:
            break multilineLoop
          of lsComment:
            lex.endLine()
            lex.state = lineStart
            break multilineLoop
          of lsNewline: lex.endLine()
          newlines += 1
      while lex.c in space: lex.advance()
      if (lex.c == ':' and not lex.isPlainSafe()) or
         lex.c == '#' or (lex.c in flowIndicators and
         lex.flowDepth > 0):
        lex.state = afterNewlineState
        break multilineLoop
      lex.seenMultiline = true
      if newlines == 1: lex.evaluated.add(' ')
      else:
        for i in countup(2, newlines): lex.evaluated.add('\l')

proc streamEndAfterBlock(lex: var Lexer) =
  if lex.currentIndentation() != 0:
    lex.endToken()
    lex.curEndPos.column -= 1

proc dirEndFollows(lex: Lexer): bool =
  return lex.c == '-' and lex.source.buf[lex.source.bufpos] == '-' and
      lex.source.buf[lex.source.bufpos+1] == '-'

proc docEndFollows(lex: Lexer): bool =
  return lex.c == '.' and lex.source.buf[lex.source.bufpos] == '.' and
      lex.source.buf[lex.source.bufpos+1] == '.'

proc readBlockScalar(lex: var Lexer) =
  var
    chomp = ctClip
    indent = 0
    separationLines = 0
    contentStart: int
    hasBody = true
  lex.startToken()
  lex.cur = if lex.c == '>': Token.Folded else: Token.Literal
  lex.evaluated.setLen(0)

  # header
  while true:
    lex.advance()
    case lex.c
    of '+':
      if chomp != ctClip:
        raise lex.generateError("Multiple chomping indicators")
      chomp = ctKeep
    of '-':
      if chomp != ctClip:
        raise lex.generateError("Multiple chomping indicators")
      chomp = ctStrip
    of '1' .. '9':
      if indent != 0:
        raise lex.generateError("Multiple indentation indicators")
      indent = max(0, lex.indentation) + int(lex.c) - int('0')
    of ' ':
      while true:
        lex.advance()
        if lex.c != ' ': break
      if lex.c notin commentOrLineEnd:
        raise lex.generateError("Illegal character after block scalar header: " &
            escape("" & lex.c))
      break
    of EndOfFile:
      hasBody = false
      break
    of '\l', '\c': break
    else:
      raise lex.generateError("Illegal character in block scalar header: " &
          escape("" & lex.c))
  lex.endLine()

  block body:
    # determining indentation and leading empty lines
    var
      maxLeadingSpaces = 0
      moreIndented = false
    while true:
      if indent == 0:
        while lex.c == ' ': lex.advance()
      else:
        maxLeadingSpaces = lex.currentIndentation() + indent
        while lex.c == ' ' and lex.currentIndentation() < maxLeadingSpaces:
          lex.advance()
      case lex.c
      of '\l', '\c':
        lex.endToken()
        maxLeadingSpaces = max(maxLeadingSpaces, lex.currentIndentation())
        lex.endLine()
        separationLines += 1
      of EndOfFile:
        lex.state = streamEnd
        lex.streamEndAfterBlock()
        if lex.source.getColNumber(lex.source.bufpos) > 1 and hasBody: separationLines += 1
        break body
      else:
        if indent == 0:
          indent = lex.currentIndentation()
          if indent <= lex.indentation or
              (indent == 0 and (lex.dirEndFollows() or lex.docEndFollows())):
            lex.state = lineIndentation
            break body
          elif indent < maxLeadingSpaces:
            raise lex.generateError("Leading all-spaces line contains too many spaces")
        elif lex.currentIndentation() < indent: break body
        if lex.cur == Token.Folded and lex.c in space:
          moreIndented = true
        break
    for i in countup(0, separationLines - 1):
      lex.evaluated.add('\l')
    separationLines = if moreIndented: 1 else: 0

    block content:
      while true:
        contentStart = lex.source.bufpos - 1
        while lex.c notin lineEnd: lex.advance()
        lex.evaluated.add(lex.source.buf[contentStart .. lex.source.bufpos - 2])
        if lex.c == EndOfFile:
          lex.state = streamEnd
          lex.streamEndAfterBlock()
          break body
        separationLines += 1
        lex.endToken()
        lex.endLine()

        let oldMoreIndented = moreIndented
        # empty lines and indentation of next line
        moreIndented = false
        while true:
          while lex.c == ' ' and lex.currentIndentation() < indent:
            lex.advance()
          case lex.c
          of '\l', '\c':
            lex.endToken()
            separationLines += 1
            lex.endLine()
          of EndOfFile:
            lex.state = streamEnd
            lex.streamEndAfterBlock()
            break body
          else:
            if lex.currentIndentation() < indent or
                (indent == 0 and (lex.dirEndFollows() or lex.docEndFollows())):
              break content
            if lex.cur == Token.Folded and lex.c in space:
              moreIndented = true
              if not oldMoreIndented:
                separationLines += 1
            break

        # line folding
        if lex.cur == Token.Literal:
          for i in countup(0, separationLines - 1):
            lex.evaluated.add('\l')
        elif separationLines == 1:
          lex.evaluated.add(' ')
        else:
          for i in countup(0, separationLines - 2):
            lex.evaluated.add('\l')
        separationLines = if moreIndented: 1 else: 0

    let markerFollows = lex.currentIndentation() == 0 and
        (lex.dirEndFollows() or lex.docEndFollows())
    if lex.c == '#':
      lex.state = expectLineEnd
    elif lex.currentIndentation() > lex.indentation and not markerFollows:
      raise lex.generateError("This line #" & $lex.curStartPos.line & " at " & escape("" & lex.c) & " is less indented than necessary")
    elif lex.currentIndentation() == 0:
      lex.state = lineStart
    else:
      lex.state = lineIndentation

  lex.endToken()

  case chomp
  of ctStrip: discard
  of ctClip:
    if len(lex.evaluated) > 0: lex.evaluated.add('\l')
  of ctKeep:
    for i in countup(0, separationLines - 1):
      lex.evaluated.add('\l')

proc processQuotedWhitespace(lex: var Lexer, initial: int) =
  var newlines = initial
  let firstSpace = lex.source.bufpos - 1
  while true:
    case lex.c
    of ' ', '\t': discard
    of '\l':
      lex.lexLF()
      break
    of '\c':
      lex.lexCR()
      break
    else:
      lex.evaluated.add(lex.source.buf[firstSpace..lex.source.bufpos - 2])
      return
    lex.advance()
  lex.seenMultiline = true
  while true:
    case lex.startLine()
    of lsContent, lsComment:
      while lex.c in space: lex.advance()
      if lex.c in {'\l', '\c'}:
        lex.endLine()
      else: break
    of lsDirectivesEndMarker:
      raise lex.generateError("Illegal `---` within quoted scalar")
    of lsDocumentEndMarker:
      raise lex.generateError("Illegal `...` within quoted scalar")
    of lsNewline: lex.endLine()
    of lsStreamEnd:
      raise lex.generateError("Unclosed quoted string")
    newlines += 1
  if newlines == 0: discard
  elif newlines == 1: lex.evaluated.add(' ')
  else:
    for i in countup(2, newlines): lex.evaluated.add('\l')

proc readSingleQuotedScalar(lex: var Lexer) =
  lex.seenMultiline = false
  lex.startToken()
  lex.evaluated.setLen(0)
  if lex.propertyIndentation != -1:
    lex.indentation = lex.propertyIndentation
    lex.propertyIndentation = -1
  var literalStart = lex.source.bufpos
  lex.advance()
  while true:
    case lex.c
    of EndOfFile:
      raise lex.generateError("Unclosed quoted string")
    of '\'':
      lex.evaluated.add(lex.source.buf[literalStart..lex.source.bufpos - 2])
      lex.advance()
      if lex.c == '\'':
        lex.evaluated.add('\'')
        literalStart = lex.source.bufpos
        lex.advance()
      else: break
    of ' ', '\t', '\l', '\c':
      lex.evaluated.add(lex.source.buf[literalStart..lex.source.bufpos - 2])
      lex.processQuotedWhitespace(1)
      literalStart = lex.source.bufpos - 1
    else:
      lex.advance()
  lex.endToken()
  lex.cur = Token.SingleQuoted

proc readDoubleQuotedScalar(lex: var Lexer) =
  lex.seenMultiline = false
  lex.startToken()
  lex.evaluated.setLen(0)
  if lex.propertyIndentation != -1:
    lex.indentation = lex.propertyIndentation
    lex.propertyIndentation = -1
  var literalStart = lex.source.bufpos
  lex.advance()
  while true:
    case lex.c
    of EndOfFile:
      raise lex.generateError("Unclosed quoted string")
    of '\\':
      lex.evaluated.add(lex.source.buf[literalStart..lex.source.bufpos - 2])
      lex.advance()
      literalStart = lex.source.bufpos
      case lex.c
      of '0': lex.evaluated.add('\0')
      of 'a': lex.evaluated.add('\a')
      of 'b': lex.evaluated.add('\b')
      of 't', '\t': lex.evaluated.add('\t')
      of 'n': lex.evaluated.add('\l')
      of 'v': lex.evaluated.add('\v')
      of 'f': lex.evaluated.add('\f')
      of 'r': lex.evaluated.add('\c')
      of 'e': lex.evaluated.add('\e')
      of ' ': lex.evaluated.add(' ')
      of '"': lex.evaluated.add('"')
      of '/': lex.evaluated.add('/')
      of '\\':lex.evaluated.add('\\')
      of 'N': lex.evaluated.add(UTF8NextLine)
      of '_': lex.evaluated.add(UTF8NonBreakingSpace)
      of 'L': lex.evaluated.add(UTF8LineSeparator)
      of 'P': lex.evaluated.add(UTF8ParagraphSeparator)
      of 'x':
        lex.readHexSequence(2)
        literalStart = lex.source.bufpos
      of 'u':
        lex.readHexSequence(4)
        literalStart = lex.source.bufpos
      of 'U':
        lex.readHexSequence(8)
        literalStart = lex.source.bufpos
      of '\l', '\c':
        lex.processQuotedWhitespace(0)
        literalStart = lex.source.bufpos - 1
        continue
      else:
        raise lex.generateError("Illegal character in escape sequence: " & escape("" & lex.c))
    of '"':
      lex.evaluated.add(lex.source.buf[literalStart..lex.source.bufpos - 2])
      break
    of ' ', '\t', '\l', '\c':
      lex.evaluated.add(lex.source.buf[literalStart..lex.source.bufpos - 2])
      lex.processQuotedWhitespace(1)
      literalStart = lex.source.bufpos - 1
      continue
    else: discard
    lex.advance()
  lex.advance()
  lex.endToken()
  lex.cur = Token.DoubleQuoted

proc basicInit(lex: var Lexer) =
  lex.state = outsideDoc
  lex.flowDepth = 0
  lex.lineStartState = outsideDoc
  lex.jsonEnablingState = afterToken
  lex.propertyIndentation = -1
  lex.evaluated = ""
  lex.advance()

# interface

proc lastScalarWasMultiline*(lex: Lexer): bool =
  result = lex.seenMultiline

proc shortLexeme*(lex: Lexer): string =
  return lex.source.buf[lex.tokenStart..lex.source.bufpos-2]

proc fullLexeme*(lex: Lexer): string =
  return lex.source.buf[lex.tokenStart - 1..lex.source.bufpos-2]

proc currentLine*(lex: Lexer): string =
  return lex.source.getCurrentLine(false)

proc next*(lex: var Lexer) {.raises: [LexerError].}=
  while not lex.state(lex): discard
  debug("lexer -> [" & $lex.curStartPos.line & "," & $lex.curStartPos.column &
      "-" & $lex.curEndPos.line & "," & $lex.curEndPos.column & "] " & $lex.cur)

proc init*(lex: var Lexer, source: Stream) {.raises: [IOError, OSError].} =
  lex.source.open(source)
  lex.basicInit()

proc init*(lex: var Lexer, source: string) {.raises: [].} =
  try:
    lex.source.open(newStringStream(source))
  except CatchableError:
    discard # can never happen with StringStream
  lex.basicInit()

# states

proc outsideDoc(lex: var Lexer): bool =
  case lex.c
  of '%':
    lex.startToken()
    while true:
      lex.advance()
      if lex.c in spaceOrLineEnd: break
    lex.endToken()
    let name = lex.shortLexeme()
    case name
    of "YAML":
      lex.state = yamlVersion
      lex.cur = Token.YamlDirective
    of "TAG":
      lex.state = tagShorthand
      lex.cur = Token.TagDirective
    else:
      lex.state = unknownDirParams
      lex.cur = Token.UnknownDirective
      lex.evaluated.setLen(0)
      lex.evaluated.add(name)
  of '-':
    lex.startToken()
    if lex.isDirectivesEnd():
      lex.state = afterToken
      lex.cur = Token.DirectivesEnd
    else:
      lex.state = indentationSettingToken
      lex.cur = Token.Indentation
    lex.lineStartState = lineStart
    lex.indentation = -1
    lex.endToken()
  of '.':
    lex.startToken()
    if lex.isDocumentEnd():
      lex.state = expectLineEnd
      lex.cur = Token.DocumentEnd
    else:
      lex.state = indentationSettingToken
      lex.lineStartState = lineStart
      lex.indentation = -1
      lex.cur = Token.Indentation
    lex.endToken()
  else:
    lex.startToken()
    while lex.c == ' ': lex.advance()
    if lex.c in commentOrLineEnd:
      lex.state = expectLineEnd
      return false
    if lex.c == '\t':
      var peek = lex.source.bufpos
      while lex.source.buf[peek] in space:
        peek += 1
      if lex.source.buf[peek] in commentOrLineEnd:
        lex.state = expectLineEnd
        lex.source.bufpos = peek
        return false
    lex.endToken()
    lex.cur = Token.Indentation
    lex.indentation = -1
    lex.state = indentationSettingToken
    lex.lineStartState = lineStart
  return true

proc yamlVersion(lex: var Lexer): bool =
  while lex.c in space: lex.advance()
  lex.startToken()
  lex.readNumericSubtoken()
  if lex.c != '.':
    raise lex.generateError("Illegal character in YAML version string: " & escape("" & lex.c))
  lex.advance()
  lex.readNumericSubtoken()
  if lex.c notin spaceOrLineEnd:
    raise lex.generateError("Illegal character in YAML version string: " & escape("" & lex.c))
  lex.cur = Token.DirectiveParam
  lex.endToken()
  lex.state = expectLineEnd
  return true

proc tagShorthand(lex: var Lexer): bool =
  while lex.c in space: lex.advance()
  if lex.c != '!':
    raise lex.generateError("Illegal character, tag shorthand must start with '!': " & escape("" & lex.c))
  lex.startToken()
  lex.advance()

  if lex.c in spaceOrLineEnd: discard
  else:
    while lex.c in tagShorthandChars: lex.advance()
    if lex.c != '!':
      if lex.c in spaceOrLineEnd:
        raise lex.generateError("Tag shorthand must end with '!'.")
      else:
        raise lex.generateError("Illegal character in tag shorthand: " & escape("" & lex.c))
    lex.advance()
    if lex.c notin spaceOrLineEnd:
      raise lex.generateError("Missing space after tag shorthand")
  lex.cur = Token.TagHandle
  lex.endToken()
  lex.state = tagUri
  return true

proc tagUri(lex: var Lexer): bool =
  while lex.c in space: lex.advance()
  lex.startToken()
  if lex.c == '<':
    raise lex.generateError("Illegal character in tag URI: " & escape("" & lex.c))
  lex.readUri(false)
  lex.cur = Token.Suffix
  lex.endToken()
  lex.state = expectLineEnd
  return true

proc unknownDirParams(lex: var Lexer): bool =
  while lex.c in space: lex.advance()
  if lex.c in lineEnd + {'#'}:
    lex.state = expectLineEnd
    return false
  lex.startToken()
  while true:
    lex.advance()
    if lex.c in lineEnd + {'#'}: break
  lex.cur = Token.DirectiveParam
  return true

proc expectLineEnd(lex: var Lexer): bool =
  while lex.c in space: lex.advance()
  if lex.c notin commentOrLineEnd:
    raise lex.generateError("Unexpected character (expected line end): " & escape("" & lex.c))
  lex.endLine()
  return false

proc lineStart(lex: var Lexer): bool =
  return case lex.startLine()
  of lsDirectivesEndMarker: lex.lineDirEnd()
  of lsDocumentEndMarker: lex.lineDocEnd()
  of lsComment, lsNewline: lex.endLine(); false
  of lsStreamEnd: lex.state = streamEnd; false
  of lsContent:
    if lex.flowDepth == 0: lex.lineIndentation()
    else: lex.flowLineIndentation()

proc flowLineStart(lex: var Lexer): bool =
  var indent: int
  case lex.c
  of '-':
    if lex.isDirectivesEnd():
      raise lex.generateError("Directives end marker before end of flow content")
    indent = 0
  of '.':
    if lex.isDocumentEnd():
      raise lex.generateError("Document end marker before end of flow content")
    indent = 0
  else:
    let lineStart = lex.source.bufpos
    while lex.c == ' ': lex.advance()
    indent = lex.source.bufpos - lineStart
    while lex.c in space: lex.advance()
  if lex.c in commentOrLineEnd:
    lex.state = expectLineEnd
    return false
  if indent <= lex.indentation:
    raise lex.generateError("Too few indentation spaces (must surpass surrounding block level)")
  lex.state = insideLine
  return false

proc flowLineIndentation(lex: var Lexer): bool =
  if lex.currentIndentation() < lex.indentation:
    raise lex.generateError("Too few indentation spaces (must surpass surrounding block level)")
  lex.state = insideLine
  return false

proc checkIndicatorChar(lex: var Lexer, kind: Token) =
  if lex.isPlainSafe():
    lex.readPlainScalar()
  else:
    lex.startToken()
    lex.advance()
    lex.endToken()
    lex.cur = kind
    lex.state = beforeIndentationSettingToken

proc enterFlowCollection(lex: var Lexer, kind: Token) =
  lex.startToken()
  if lex.flowDepth == 0:
    lex.jsonEnablingState = afterJsonEnablingToken
    lex.lineStartState = flowLineStart
    lex.propertyIndentation = -1
  lex.flowDepth += 1
  lex.state = afterToken
  lex.advance()
  lex.endToken()
  lex.cur = kind

proc leaveFlowCollection(lex: var Lexer, kind: Token) =
  lex.startToken()
  if lex.flowDepth == 0:
    raise lex.generateError("No flow collection to leave!")
  lex.flowDepth -= 1
  if lex.flowDepth == 0:
    lex.jsonEnablingState = afterToken
    lex.lineStartState = lineStart
  lex.state = lex.jsonEnablingState
  lex.advance()
  lex.endToken()
  lex.cur = kind

proc readNamespace(lex: var Lexer) =
  lex.startToken()
  lex.advance()
  if lex.c == '<':
    lex.readURI(true)
    lex.endToken()
    lex.cur = Token.VerbatimTag
    lex.state = afterToken
  else:
    var handleEnd = lex.tokenStart
    while true:
      case lex.source.buf[handleEnd]
      of spaceOrLineEnd + flowIndicators:
        handleEnd = lex.tokenStart
        lex.source.bufpos -= 1
        break
      of '!':
        handleEnd += 1
        break
      else:
        handleEnd += 1
    while lex.source.bufpos < handleEnd:
      lex.advance()
      if lex.c notin tagShorthandChars + {'!'}:
        raise lex.generateError("Illegal character in tag handle: " & escape("" & lex.c))
    lex.advance()
    lex.endToken()
    lex.cur = Token.TagHandle
    lex.state = atSuffix

proc readAnchorName(lex: var Lexer) =
  lex.startToken()
  while true:
    lex.advance()
    if lex.c in spaceOrLineEnd + flowIndicators: break
  if lex.source.bufpos == lex.tokenStart + 1:
    raise lex.generateError("Anchor name must not be empty")
  lex.state = afterToken

proc insideLine(lex: var Lexer): bool =
  case lex.c
  of ':':
    lex.checkIndicatorChar(Token.MapValueInd)
    if lex.cur == Token.MapValueInd and lex.propertyIndentation != -1:
      lex.indentation = lex.propertyIndentation
      lex.propertyIndentation = -1
  of '?':
    lex.checkIndicatorChar(Token.MapKeyInd)
  of '-':
    lex.checkIndicatorChar(Token.SeqItemInd)
  of commentOrLineEnd:
    lex.endLine()
    return false
  of '"':
    lex.readDoubleQuotedScalar()
    lex.state = lex.jsonEnablingState
  of '\'':
    lex.readSingleQuotedScalar()
    lex.state = lex.jsonEnablingState
  of '>', '|':
    if lex.flowDepth > 0:
      lex.readPlainScalar()
    else:
      lex.readBlockScalar()
  of '{':
    lex.enterFlowCollection(Token.MapStart)
  of '}':
    lex.leaveFlowCollection(Token.MapEnd)
  of '[':
    lex.enterFlowCollection(Token.SeqStart)
  of ']':
    lex.leaveFlowCollection(Token.SeqEnd)
  of ',':
    lex.startToken()
    lex.advance()
    lex.endToken()
    lex.cur = Token.SeqSep
    lex.state = afterToken
  of '!':
    lex.readNamespace()
  of '&':
    lex.readAnchorName()
    lex.endToken()
    lex.cur = Token.Anchor
  of '*':
    lex.readAnchorName()
    lex.endToken()
    lex.cur = Token.Alias
  of ' ', '\t':
    while true:
      lex.advance()
      if lex.c notin space: break
    return false
  of '@', '`':
    raise lex.generateError("Reserved character may not start any token")
  else:
    lex.readPlainScalar()
  return true

proc indentationSettingToken(lex: var Lexer): bool =
  let cachedIntentation = lex.currentIndentation()
  result = lex.insideLine()
  if result and lex.flowDepth == 0:
    if lex.cur in nodePropertyKind:
      lex.propertyIndentation = cachedIntentation
    else:
      lex.indentation = cachedIntentation

proc afterToken(lex: var Lexer): bool =
  while lex.c in space: lex.advance()
  if lex.c in commentOrLineEnd:
    lex.endLine()
  else:
    lex.state = insideLine
  return false

proc beforeIndentationSettingToken(lex: var Lexer): bool =
  discard lex.afterToken()
  if lex.state == insideLine:
    lex.state = indentationSettingToken
  return false

proc afterJsonEnablingToken(lex: var Lexer): bool =
  while lex.c == ' ': lex.advance()
  while true:
    case lex.c
    of ':':
      lex.startToken()
      lex.advance()
      lex.endToken()
      lex.cur = Token.MapValueInd
      lex.state = afterToken
      return true
    of '#', '\l', '\c':
      lex.endLine()
      discard lex.flowLineStart()
    of EndOfFile:
      lex.state = streamEnd
      return false
    else:
      lex.state = insideLine
      return false

proc lineIndentation(lex: var Lexer): bool =
  lex.curStartPos.line = lex.source.lineNumber
  lex.curStartPos.column = 1
  lex.endToken()
  lex.cur = Token.Indentation
  lex.state = indentationSettingToken
  return true

proc lineDirEnd(lex: var Lexer): bool =
  lex.curStartPos.line = lex.source.lineNumber
  lex.curStartPos.column = 1
  lex.endToken()
  lex.cur = Token.DirectivesEnd
  lex.state = afterToken
  lex.indentation = -1
  lex.propertyIndentation = -1
  return true

proc lineDocEnd(lex: var Lexer): bool =
  lex.curStartPos.line = lex.source.lineNumber
  lex.curStartPos.column = 1
  lex.endToken()
  lex.cur = Token.DocumentEnd
  lex.state = expectLineEnd
  lex.lineStartState = outsideDoc
  return true

proc atSuffix(lex: var Lexer): bool =
  lex.startToken()
  lex.evaluated.setLen(0)
  var curStart = lex.tokenStart - 1
  while true:
    case lex.c
    of uriChars: lex.advance()
    of '%':
      if curStart <= lex.source.bufpos - 2:
        lex.evaluated.add(lex.source.buf[curStart..lex.source.bufpos - 2])
      lex.readHexSequence(2)
      curStart = lex.source.bufpos
      lex.advance()
    else: break
  if curStart <= lex.source.bufpos - 2:
    lex.evaluated.add(lex.source.buf[curStart..lex.source.bufpos - 2])
  lex.endToken()
  lex.cur = Token.Suffix
  lex.state = afterToken
  return true

proc streamEnd(lex: var Lexer): bool =
  lex.startToken()
  lex.endToken()
  lex.cur = Token.StreamEnd
  return true