# NimYAML/src/private/lexer.nim

# file must be included from yaml.nim and cannot compile on its own

type
    # Character encodings the lexer tells apart by looking at the first
    # bytes of the input (see detect_encoding).
    Encoding = enum
        Unsupported, ## Unsupported encoding
        UTF8,        ## UTF-8
        UTF16LE,     ## UTF-16 Little Endian
        UTF16BE,     ## UTF-16 Big Endian
        UTF32LE,     ## UTF-32 Little Endian
        UTF32BE      ## UTF-32 Big Endian

    # Kinds of tokens produced by the `tokens` iterator. The text belonging
    # to a token is available in `YamlLexer.content` while the token is
    # being handed out.
    YamlLexerToken = enum
        # --- separating tokens ---
        tDirectivesEnd,         # "---"
        tDocumentEnd,           # "..."
        tStreamEnd,             # end of input reached
        # --- tokens only in directives ---
        tTagDirective,          # "%TAG"
        tYamlDirective,         # "%YAML"
        tUnknownDirective,      # any other "%..." directive name
        tVersionPart,           # digits of the major or minor YAML version
        tTagURI,
        tUnknownDirectiveParam, # parameter of an unknown directive
        # --- tokens in directives and content ---
        tTagHandle,
        tComment,
        # --- from here on tokens only in content ---
        tLineStart,             # content holds the indentation spaces
        # control characters
        tColon,
        tDash,
        tQuestionmark,
        tComma,
        tOpeningBrace,
        tOpeningBracket,
        tClosingBrace,
        tClosingBracket,
        tPipe,
        tGreater,
        # block scalar header
        tBlockIndentationIndicator,
        tPlus,
        # scalar content
        tScalar,
        tScalarPart,            # accompanied by YamlLexer.typeHint
        # tags
        tVerbatimTag,
        tTagSuffix,
        # anchoring
        tAnchor,
        tAlias,
        # error reporting
        tError                  # content holds the error message

    # States of the lexer's state machine.
    YamlLexerState = enum
        # --- initial states (not started reading any token) ---
        ylInitial,
        ylInitialSpaces,
        ylInitialUnknown,
        ylInitialContent,
        ylDefineTagHandleInitial,
        ylDefineTagURIInitial,
        ylInitialInLine,
        ylLineEnd,
        ylDirectiveLineEnd,
        # --- directive reading states ---
        ylDirective,
        ylDefineTagHandle,
        ylDefineTagURI,
        ylMajorVersion,
        ylMinorVersion,
        ylUnknownDirectiveParam,
        ylDirectiveComment,
        # --- scalar reading states ---
        ylPlainScalar,          # plain scalar, type hint still being tracked
        ylPlainScalarNone,      # plain scalar, type hint given up
        ylSingleQuotedScalar,
        ylDoublyQuotedScalar,
        ylEscape,
        ylBlockScalar,
        ylBlockScalarHeader,
        ylSpaceAfterPlainScalar,
        ylSpaceAfterQuotedScalar,
        # --- indentation ---
        ylIndentation,
        # --- comments ---
        ylComment,
        # --- tags ---
        ylTagHandle,
        ylTagSuffix,
        ylVerbatimTag,
        # --- document separation ---
        ylDashes,
        ylDots,
        # --- anchoring ---
        ylAnchor,
        ylAlias

    # States of the small automaton (see advanceTypeHint) that recognises
    # "null", "true", "false" and numbers while a plain scalar is read.
    YamlTypeHintState = enum
        ythInitial,       # nothing read yet
        ythN,             # "n"
        ythNU,            # "nu"
        ythNUL,           # "nul"
        ythNULL,          # "null"
        ythF,             # "f"
        ythFA,            # "fa"
        ythFAL,           # "fal"
        ythFALS,          # "fals"
        ythFALSE,         # "false"
        ythT,             # "t"
        ythTR,            # "tr"
        ythTRU,           # "tru"
        ythTRUE,          # "true"
        ythMinus,         # leading "-"
        yth0,             # a single leading "0"
        ythInt,           # integer digits
        ythDecimal,       # digits followed by "." and optional digits
        ythNumE,          # number followed by "e" / "E"
        ythNumEPlusMinus, # ... followed by a sign
        ythExponent,      # ... followed by exponent digits
        ythNone           # no special type recognised

    YamlLexer = object of BaseLexer
        # Initialised to an empty seq by `open`.
        indentations: seq[int]
        # Encoding determined by detect_encoding.
        encoding: Encoding
        # charlen: number of bytes per character unit (1, 2 or 4).
        # charoffset: index of the byte inside a unit that the lexer looks at.
        charlen, charoffset: int
        # Content of the last returned token.
        content*: string
        # Position bookkeeping, maintained by the `tokens` iterator.
        line*: int
        column*: int
        # Type hint for the last tScalarPart token.
        typeHint*: YamlTypeHint

const
    # UTF-8 encoded forms of the code points produced by the escape
    # sequences \N, \_, \L and \P in doubly quoted scalars.
    UTF8NextLine           = Rune(0x85).toUTF8     # U+0085
    UTF8NonBreakingSpace   = Rune(0xA0).toUTF8     # U+00A0
    UTF8LineSeparator      = Rune(0x2028).toUTF8   # U+2028
    UTF8ParagraphSeparator = Rune(0x2029).toUTF8   # U+2029

proc detect_encoding(my: var YamlLexer) =
    ## Determines the character encoding of the input from the first bytes
    ## in the lexer's buffer, skips a byte order mark (BOM) if one is found
    ## and sets `my.encoding`, `my.charlen` and `my.charoffset` accordingly.
    ##
    ## Recognised byte patterns (`xx` stands for "anything else"), following
    ## the character encoding table of the YAML specification:
    ##
    ## * `00 00 FE FF` - UTF-32BE with BOM
    ## * `00 00 00 xx` - UTF-32BE without BOM
    ## * `FF FE 00 00` - UTF-32LE with BOM
    ## * `FE FF`       - UTF-16BE with BOM
    ## * `00 xx`       - UTF-16BE without BOM
    ## * `FF FE`       - UTF-16LE with BOM
    ## * `xx 00`       - UTF-16LE without BOM
    ## * otherwise     - UTF-8
    ##
    ## BOM-less UTF-32LE (`xx 00 00 00`) is not told apart from UTF-16LE.
    var numBomChars = 0
    my.encoding = Unsupported
    if my.bufpos == 3:
        # BaseLexer already skipped UTF-8 BOM
        my.encoding = UTF8
    else:
        case my.buf[0]
        of '\0':
            if my.buf[1] == '\0':
                if my.buf[2] == '\0':
                    # 00 00 00 xx: three leading zero bytes mean the
                    # significant byte comes last, i.e. big endian. This
                    # used to be classified as UTF32LE, which made the
                    # lexer look at buf[0] (a NUL byte) as first character.
                    my.encoding = UTF32BE
                elif my.buf[2] == '\xFE' and my.buf[3] == '\xFF':
                    my.encoding = UTF32BE
                    numBomChars = 4
                else:
                    # this is probably not a unicode character stream,
                    # but we just use the next match in the table
                    my.encoding = UTF16BE
            else:
                # this is how a BOM-less UTF16BE input should actually look
                # like
                my.encoding = UTF16BE
        of '\xFF':
            case my.buf[1]
            of '\xFE':
                if my.buf[2] == '\0' and my.buf[3] == '\0':
                    my.encoding = UTF32LE
                    numBomChars = 4
                else:
                    my.encoding = UTF16LE
                    numBomChars = 2
            of '\0':
                my.encoding = UTF16LE
            else:
                my.encoding = UTF8
        of '\xFE':
            case my.buf[1]
            of '\xFF':
                my.encoding = UTF16BE
                numBomChars = 2
            of '\0':
                my.encoding = UTF16LE
            else:
                my.encoding = UTF8
        else:
            if my.buf[1] == '\0':
                my.encoding = UTF16LE
            else:
                my.encoding = UTF8
    # continue reading behind the BOM (if any)
    inc(my.bufPos, numBomChars)
    # number of bytes that make up one character unit
    my.charlen = case my.encoding
        of UTF8, Unsupported: 1
        of UTF16LE, UTF16BE:  2
        of UTF32LE, UTF32BE:  4
    # index of the least significant byte inside a unit; this is the byte
    # the lexer inspects
    my.charoffset = case my.encoding
        of UTF8, Unsupported, UTF16LE, UTF32LE: 0
        of UTF16BE: 1
        of UTF32BE: 3
        
proc open(my: var YamlLexer, input: Stream) =
    ## Prepares `my` for lexing `input`: opens the underlying BaseLexer,
    ## detects the input encoding and resets the lexer's own bookkeeping
    ## (indentation list, token content, line and column).
    lexbase.open(my, input)
    # needs the filled buffer, so it has to come after lexbase.open
    detect_encoding(my)
    my.indentations = @[]
    my.content = ""
    my.line = 0
    my.column = 0

template yieldToken(kind: YamlLexerToken) {.dirty.} =
    ## Yields `kind` from the surrounding iterator and clears `my.content`
    ## once the iterator is resumed. When compiled with `-d:yamlDebug`, the
    ## token is printed first (including the content for `tScalar`).
    when defined(yamlDebug):
        case kind
        of tScalar:
            echo "Lexer token: tScalar(\"", my.content, "\")"
        else:
            echo "Lexer token: ", kind
    yield kind
    my.content = ""

template yieldScalarPart() {.dirty.} =
    ## Translates the current `typeHintState` into `my.typeHint`, yields a
    ## `tScalarPart` token from the surrounding iterator and clears
    ## `my.content` once the iterator is resumed. Only completely recognised
    ## literals yield a special hint; everything else is a string.
    my.typeHint =
        case typeHintState
        of ythNULL:                 yTypeNull
        of ythTRUE, ythFALSE:       yTypeBoolean
        of yth0, ythInt:            yTypeInteger
        of ythDecimal, ythExponent: yTypeFloat
        else:                       yTypeString

    when defined(yamlDebug):
        echo "Lexer token: tScalarPart(\"", my.content, "\".",
             my.typeHint, ")"
    yield tScalarPart
    my.content = ""

template yieldLexerError(message: string) {.dirty.} =
    ## Reports a lexer error: stores `message` in `my.content`, yields
    ## `tError` from the surrounding iterator and clears `my.content` once
    ## the iterator is resumed. With `-d:yamlDebug` the message is printed
    ## as well.
    when defined(yamlDebug):
        echo "Lexer error: ", message
    my.content = message
    yield tError
    my.content = ""

template handleCR() {.dirty.} =
    ## Consumes a carriage return. lexbase works on single bytes, so it is
    ## pointed at the byte the lexer inspects (`bufpos + charoffset`); the
    ## position it returns is then moved by `charlen - charoffset - 1`
    ## to account for the remaining bytes of the character unit.
    ## Also counts the new line and resets `curPos`.
    my.bufpos = lexbase.handleCR(my, my.bufpos + my.charoffset)
    my.bufpos += my.charlen - my.charoffset - 1
    inc(my.line)
    curPos = 0

template handleLF() {.dirty.} =
    ## Consumes a line feed. lexbase works on single bytes, so it is
    ## pointed at the byte the lexer inspects (`bufpos + charoffset`); the
    ## position it returns is then moved by `charlen - charoffset - 1`
    ## to account for the remaining bytes of the character unit.
    ## Also counts the new line and resets `curPos`.
    my.bufpos = lexbase.handleLF(my, my.bufpos + my.charoffset)
    my.bufpos += my.charlen - my.charoffset - 1
    inc(my.line)
    curPos = 0

template `or`(r: Rune, i: int): Rune =
    ## Bitwise OR of the code point `r` with `i`, giving a Rune again.
    cast[Rune](i or cast[int](r))

template advanceTypeHint(ch: char) {.dirty.} =
    ## Feeds `ch` into the type hint automaton of the plain scalar that is
    ## currently being read and stores the follow-up state in
    ## `typeHintState`. The automaton recognises the literals "null",
    ## "true" and "false" as well as numbers consisting of an optional
    ## "-", an integer part ("0" or digits not starting with "0"), an
    ## optional "." with digits, and an optional exponent ("e" / "E",
    ## optional sign, digits).
    ##
    ## As soon as `ch` cannot continue any of these, the automaton ends up
    ## in `ythNone` and the lexer `state` is switched to
    ## `ylPlainScalarNone`.
    typeHintState =
        case ch
        of '.':
            if typeHintState in {yth0, ythInt}: ythDecimal else: ythNone
        of '+':
            if typeHintState == ythNumE: ythNumEPlusMinus else: ythNone
        of '-':
            case typeHintState
            of ythInitial: ythMinus
            of ythNumE:    ythNumEPlusMinus
            else:          ythNone
        of '0':
            case typeHintState
            of ythInitial, ythMinus:            yth0
            of ythNumE, ythNumEPlusMinus:       ythExponent
            # further digits do not change these states
            of ythInt, ythDecimal, ythExponent: typeHintState
            else:                               ythNone
        of '1'..'9':
            case typeHintState
            of ythInitial, ythMinus:            ythInt
            of ythNumE, ythNumEPlusMinus:       ythExponent
            # further digits do not change these states
            of ythInt, ythDecimal, ythExponent: typeHintState
            else:                               ythNone
        of 'a':
            if typeHintState == ythF: ythFA else: ythNone
        of 'e':
            case typeHintState
            of yth0, ythInt, ythDecimal: ythNumE
            of ythTRU:                   ythTRUE
            of ythFALS:                  ythFALSE
            else:                        ythNone
        of 'E':
            if typeHintState in {yth0, ythInt, ythDecimal}: ythNumE
            else: ythNone
        of 'f':
            if typeHintState == ythInitial: ythF else: ythNone
        of 'l':
            case typeHintState
            of ythNU:  ythNUL
            of ythNUL: ythNULL
            of ythFA:  ythFAL
            else:      ythNone
        of 'n':
            if typeHintState == ythInitial: ythN else: ythNone
        of 'r':
            if typeHintState == ythT: ythTR else: ythNone
        of 's':
            if typeHintState == ythFAL: ythFALS else: ythNone
        of 't':
            if typeHintState == ythInitial: ythT else: ythNone
        of 'u':
            case typeHintState
            of ythN:  ythNU
            of ythTR: ythTRU
            else:     ythNone
        else:
            ythNone
    # ythNone is a dead end, so the lexer can stop tracking the hint
    if typeHintState == ythNone:
        state = ylPlainScalarNone
    

iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
    var
        # the following three values are used for parsing escaped unicode chars
        
        unicodeChar: Rune = cast[Rune](0)
        escapeLength = 0
        expectedEscapeLength = 0
        
        trailingSpace = ""
            # used to temporarily store whitespace after a plain scalar
        lastSpecialChar: char = '\0'
            # stores chars that behave differently dependent on the following
            # char. handling will be deferred to next loop iteration.
        flowDepth = 0
            # Lexer must know whether it parses block or flow style. Therefore,
            # it counts the number of open flow arrays / maps here
        state = ylInitial # lexer state
        typeHintState = ythInitial # for giving type hints of plain scalars
        lastIndentationLength = 0
            # after parsing the indentation of the line, this will hold the
            # indentation length of the current line. Needed for checking where
            # a block scalar ends.
        blockScalarIndentation = -1
            # when parsing a block scalar, this will be set to the indentation
            # of the line that starts the flow scalar.
        curPos = 0
    
    while true:
        let c = my.buf[my.bufpos + my.charoffset]
        case state
        of ylInitial:
            case c
            of '%':
                state = ylDirective
                continue
            of ' ', '\t':
                state = ylInitialSpaces
                continue
            of '#':
                state = ylDirectiveComment
            else:
                state = ylInitialContent
                continue
        of ylInitialSpaces:
            case c
            of ' ', '\t':
                my.content.add(c)
            of '#':
                my.content = ""
                state = ylDirectiveComment
            of EndOfFile, '\r', '\x0A':
                state = ylDirectiveLineEnd
                continue
            else:
                state = ylIndentation
                continue
        of ylInitialContent:
            case c
            of '-':
                my.column = 0
                state = ylDashes
                continue
            of '.':
                yieldToken(tLineStart)
                my.column = 0
                state = ylDots
                continue
            else:
                state = ylIndentation
                continue
        of ylDashes:
            case c
            of '-':
                my.content.add(c)
            of ' ', '\t', '\r', '\x0A', EndOfFile:
                case my.content.len
                of 3:
                    yieldToken(tDirectivesEnd)
                    state = ylInitialInLine
                of 1:
                    my.content = ""
                    yieldToken(tLineStart)
                    lastSpecialChar = '-'
                    state = ylInitialInLine
                else:
                    let tmp = my.content
                    my.content = ""
                    yieldToken(tLineStart)
                    my.content = tmp
                    my.column = curPos
                    state = ylPlainScalarNone
                    typeHintState = ythNone
                continue
            else:
                let tmp = my.content
                my.content = ""
                yieldToken(tLineStart)
                my.content = tmp
                if my.content.len == 1:
                    typeHintState = ythMinus
                    state = ylPlainScalar
                else:
                    typeHintState = ythNone
                    state = ylPlainScalarNone
                continue
        of ylDots:
            case c
            of '.':
                my.content.add(c)
            of ' ', '\t', '\r', '\x0A', EndOfFile:
                case my.content.len
                of 3:
                    yieldToken(tDocumentEnd)
                    state = ylDirectiveLineEnd
                else:
                    state = ylPlainScalarNone
                    typeHintState = ythNone
                continue
            else:
                state = ylPlainScalarNone
                typeHintState = ythNone
                continue
        of ylDirectiveLineEnd:
            case c
            of '\r':
                handleCR()
                state = ylInitial
                continue
            of '\x0A':
                handleLF()
                state = ylInitial
                continue
            of EndOfFile:
                yieldToken(tStreamEnd)
                break
            of ' ', '\t':
                discard
            of '#':
                state = ylDirectiveComment
            else:
                yieldLexerError("Unexpected content at end of directive: " & c)
        of ylLineEnd:
            case c
            of '\r':
                handleCR()
            of '\x0A':
                handleLF()
            of EndOfFile:
                yieldToken(tStreamEnd)
                break
            else:
                yieldLexerError("Internal error: Unexpected char at line end: " & c)
            state = ylInitialContent
            continue
        of ylSingleQuotedScalar:
            if lastSpecialChar != '\0':
                # ' is the only special char
                case c
                of '\'':
                    my.content.add(c)
                    lastSpecialChar = '\0'
                of EndOfFile, '\r', '\x0A':
                    yieldToken(tScalar)
                    lastSpecialChar = '\0'
                    state = ylLineEnd
                    continue
                else:
                    yieldToken(tScalar)
                    lastSpecialChar = '\0'
                    state = ylSpaceAfterQuotedScalar
                    continue
            else:
                case c
                of '\'':
                    lastSpecialChar = c
                of EndOfFile:
                    yieldLexerError("Unterminated single quoted string")
                    yieldToken(tStreamEnd)
                    break
                else:
                    my.content.add(c)
        of ylDoublyQuotedScalar:
            case c
            of '"':
                yieldToken(tScalar)
                state = ylSpaceAfterQuotedScalar
            of EndOfFile:
                yieldLexerError("Unterminated doubly quoted string")
                yieldToken(tStreamEnd)
                break
            of '\\':
                state = ylEscape
                escapeLength = 0
            of '\r':
                my.content.add("\x0A")
                handleCR()
            of '\x0A':
                my.content.add(c)
                handleLF()
            else:
                my.content.add(c)
        of ylEscape:
            if escapeLength == 0:
                expectedEscapeLength = 0
                case c
                of EndOfFile:
                    yieldLexerError("Unterminated doubly quoted string")
                of '0':       my.content.add('\0')
                of 'a':       my.content.add('\x07')
                of 'b':       my.content.add('\x08')
                of '\t', 't': my.content.add('\t')
                of 'n':       my.content.add('\x0A')
                of 'v':       my.content.add('\v')
                of 'f':       my.content.add('\f')
                of 'r':       my.content.add('\r')
                of 'e':       my.content.add('\e')
                of ' ':       my.content.add(' ')
                of '"':       my.content.add('"')
                of '/':       my.content.add('/')
                of '\\':      my.content.add('\\')
                of 'N':       my.content.add(UTF8NextLine)
                of '_':       my.content.add(UTF8NonBreakingSpace)
                of 'L':       my.content.add(UTF8LineSeparator)
                of 'P':       my.content.add(UTF8ParagraphSeparator)
                of 'x': unicodeChar = cast[Rune](0); expectedEscapeLength = 3
                of 'u': unicodeChar = cast[Rune](0); expectedEscapeLength = 5
                of 'U': unicodeChar = cast[Rune](0); expectedEscapeLength = 9
                else:
                    yieldLexerError("Unsupported escape sequence: \\" & c)
                if expectedEscapeLength == 0: state = ylDoublyQuotedScalar
            else:
                let digitPosition = expectedEscapeLength - escapeLength - 1
                case c
                of EndOFFile:
                    yieldLexerError("Unterminated escape sequence")
                    state = ylLineEnd
                    continue
                of '0' .. '9':
                    unicodeChar = unicodechar or
                            (cast[int](c) - 0x30) shl (digitPosition * 4)
                of 'A' .. 'F':
                    unicodeChar = unicodechar or
                            (cast[int](c) - 0x37) shl (digitPosition * 4)
                of 'a' .. 'f':
                    unicodeChar = unicodechar or
                            (cast[int](c) - 0x57) shl (digitPosition * 4)
                else:
                    yieldLexerError("unsupported char in unicode escape sequence: " &
                               c)
                    escapeLength = 0
                    state = ylDoublyQuotedScalar
                    continue
            inc(escapeLength)
            if escapeLength == expectedEscapeLength and escapeLength > 0:
                my.content.add(toUTF8(unicodeChar))
                state = ylDoublyQuotedScalar
        
        of ylSpaceAfterQuotedScalar:
            case c
            of ' ', '\t':
                trailingSpace.add(c)
            of '#':
                if trailingSpace.len > 0:
                    yieldLexerError("Missing space before comment start")
                state = ylComment
                trailingSpace = ""
            else:
                trailingSpace = ""
                state = ylInitialInLine
                continue
        
        of ylPlainScalar:
            case c
            of EndOfFile, '\r', '\x0A':
                yieldScalarPart()
                state = ylLineEnd
                continue
            of ':':
                lastSpecialChar = c
                state = ylSpaceAfterPlainScalar
            of ' ':
                state = ylSpaceAfterPlainScalar
                continue
            of ',':
                if flowDepth > 0:
                    lastSpecialChar = c
                    state = ylSpaceAfterPlainScalar
                else:
                    my.content.add(c)
                    state = ylPlainScalarNone
                    typeHintState = ythNone
            of '[', ']', '{', '}':
                yieldScalarPart()
                state = ylInitialInLine
                continue
            else:
                advanceTypeHint(c)
                my.content.add(c)
        
        of ylPlainScalarNone:
            case c
            of EndOfFile, '\r', '\x0A':
                yieldScalarPart()
                state = ylLineEnd
                continue
            of ':':
                lastSpecialChar = c
                state = ylSpaceAfterPlainScalar
            of ' ':
                state = ylSpaceAfterPlainScalar
                continue
            of ',':
                if flowDepth > 0:
                    lastSpecialChar = c
                    state = ylSpaceAfterPlainScalar
                else:
                    my.content.add(c)
            of '[', ']', '{', '}':
                yieldScalarPart()
                state = ylInitialInLine
                continue
            else:
                my.content.add(c)
        
        of ylSpaceAfterPlainScalar:
            if lastSpecialChar != '\0':
                case c
                of ' ', '\t', EndOfFile, '\r', '\x0A':
                    yieldScalarPart()
                    state = ylInitialInLine
                else:
                    my.content.add(trailingSpace)
                    my.content.add(lastSpecialChar)
                    lastSpecialChar = '\0'
                    trailingSpace = ""
                    state = ylPlainScalarNone
                    typeHintState = ythNone
                continue
            
            case c
            of EndOfFile, '\r', '\x0A':
                trailingSpace = ""
                yieldScalarPart()
                state = ylLineEnd
                continue
            of ' ', '\t':
                trailingSpace.add(c)
            of ',':
                if flowDepth > 0:
                    lastSpecialChar = c
                else:
                    my.content.add(trailingSpace)
                    my.content.add(c)
                    trailingSpace = ""
                    state = ylPlainScalarNone
                    typeHintState = ythNone
            of ':', '#':
                lastSpecialChar = c
            of '[', ']', '{', '}':
                yieldToken(tScalar)
                trailingSpace = ""
                state = ylInitialInLine
                continue
            else:
                my.content.add(trailingSpace)
                my.content.add(c)
                trailingSpace = ""
                state = ylPlainScalarNone
                typeHintState = ythNone
                
        of ylInitialInLine:
            if lastSpecialChar != '\0':
                my.column = curPos - 1
                case c
                of ' ', '\t', '\r', '\x0A', EndOfFile:
                    case lastSpecialChar
                    of '#':
                        my.content = "#"
                        state = ylComment
                    of ':':
                        yieldToken(tColon)
                    of '?':
                        yieldToken(tQuestionmark)
                    of '-':
                        yieldToken(tDash)
                    of ',':
                        yieldToken(tComma)
                    of '!':
                        my.content = "!"
                        yieldToken(tTagHandle)
                        my.content = ""
                        yieldToken(tTagSuffix)
                    else:
                        yieldLexerError("Unexpected special char: \"" &
                                   lastSpecialChar & "\"")
                    lastSpecialChar = '\0'
                elif lastSpecialChar == '!':
                    case c
                    of '<':
                        state = ylVerbatimTag
                        lastSpecialChar = '\0'
                        my.bufpos += my.charlen
                    else:
                        state = ylTagHandle
                        my.content = "!"
                        lastSpecialChar = '\0'
                else:
                    my.content.add(lastSpecialChar)
                    lastSpecialChar = '\0'
                    my.column = curPos - 1
                    state = ylPlainScalar
                    typeHintState = ythInitial
                    advanceTypeHint(lastSpecialChar)
                continue
            case c
            of '\r', '\x0A', EndOfFile:
                state = ylLineEnd
                continue
            of ',':
                if flowDepth > 0:
                    yieldToken(tComma)
                else:
                    my.content = "" & c
                    my.column = curPos
                    state = ylPlainScalar
                    typeHintState = ythInitial
                    advanceTypeHint(c)
            of '[':
                inc(flowDepth)
                yieldToken(tOpeningBracket)
            of '{':
                inc(flowDepth)
                yieldToken(tOpeningBrace)
            of ']':
                yieldToken(tClosingBracket)
                if flowDepth > 0:
                    inc(flowDepth, -1)
            of '}':
                yieldToken(tClosingBrace)
                if flowDepth > 0:
                    inc(flowDepth, -1)
            of '#':
                lastSpecialChar = '#'
            of '"':
                my.column = curPos
                state = ylDoublyQuotedScalar
            of '\'':
                my.column = curPos
                state = ylSingleQuotedScalar
            of '!':
                my.column = curPos
                lastSpecialChar = '!'
            of '&':
                my.column = curPos
                state = ylAnchor
            of '*':
                my.column = curPos
                state = ylAlias
            of ' ':
                discard
            of '-':
                if flowDepth == 0:
                    lastSpecialChar = '-'
                else:
                    my.content = "" & c
                    my.column = curPos
                    state = ylPlainScalar
                    typeHintState = ythInitial
                    advanceTypeHint(c)
            of '?', ':':
                my.column = curPos
                lastSpecialChar = c
            of '|':
                yieldToken(tPipe)
                state = ylBlockScalarHeader
            of '>':
                yieldToken(tGreater)
                state = ylBlockScalarHeader
            of '\t':
                discard
            else:
                my.content = "" & c
                my.column = curPos
                state = ylPlainScalar
                typeHintState = ythInitial
                advanceTypeHint(c)
        of ylComment, ylDirectiveComment:
            case c
            of EndOfFile, '\r', '\x0A':
                yieldToken(tComment)
                case state
                of ylComment:
                    state = ylLineEnd
                of ylDirectiveComment:
                    state = ylDirectiveLineEnd
                else:
                    yieldLexerError("Should never happen")
                continue
            else:
                my.content.add(c)
        of ylIndentation:
            case c
            of EndOfFile, '\r', '\x0A':
                lastIndentationLength = my.content.len
                yieldToken(tLineStart)
                state = ylLineEnd
                continue
            of ' ':
                my.content.add(' ')
            else:
                lastIndentationLength =  my.content.len
                yieldToken(tLineStart)
                if blockScalarIndentation != -1:
                    if lastIndentationLength <= blockScalarIndentation:
                        blockScalarIndentation = -1
                    else:
                        state = ylBlockScalar
                        continue
                state = ylInitialInLine
                continue
        of ylTagHandle:
            case c
            of '!':
                my.content.add(c)
                yieldToken(tTagHandle)
                state = ylTagSuffix
            of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '-':
                my.content.add(c)
            of ' ', '\t', EndOfFile, '\r', '\x0A':
                var suffix = my.content[1..^1]
                my.content = "!"
                yieldToken(tTagHandle)
                my.content = suffix
                yieldToken(tTagSuffix)
                state = ylInitialInLine
                continue
            else:
                yieldLexerError("Invalid character in tag handle: " & c)
                my.content = ""
                state = ylInitialInLine
        of ylTagSuffix:
            case c
            of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':', '@',
               '&', '=', '+', '$', ',', '_', '.', '~', '*', '\'', '(', ')':
                my.content.add(c)
            of ' ', '\t', EndOfFile, '\r', '\x0A':
                yieldToken(tTagSuffix)
                state = ylInitialInLine
                continue
            else:
                yieldLexerError("Invalid character in tag suffix: " & c)
                state = ylInitialInLine
        of ylVerbatimTag:
            case c
            of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':', '@',
               '&', '=', '+', '$', ',', '_', '.', '~', '*', '\'', '(', ')':
                my.content.add(c)
            of '>':
                yieldToken(tVerbatimTag)
                state = ylInitialInLine
            of EndOfFile, '\r', '\x0A':
                yieldLexerError("Unfinished verbatim tag")
                state = ylLineEnd
                continue
            else:
                yieldLexerError("Invalid character in tag URI: " & c)
                my.content = ""
                state = ylInitialInLine
        of ylDirective:
            case c
            of ' ', '\t', '\r', '\x0A', EndOfFile:
                if my.content == "%YAML":
                    yieldToken(tYamlDirective)
                    state = ylMajorVersion
                elif my.content == "%TAG":
                    yieldToken(tTagDirective)
                    state = ylDefineTagHandleInitial
                else:
                    yieldToken(tUnknownDirective)
                    state = ylInitialUnknown
                if c == EndOfFile:
                    continue
            else:
                my.content.add(c)
        of ylInitialUnknown:
            case c
            of ' ', '\t':
                discard
            of '\r', '\x0A', EndOfFile:
                state = ylDirectiveLineEnd
                continue
            of '#':
                state = ylDirectiveComment
                continue
            else:
                state = ylUnknownDirectiveParam
                continue
        of ylUnknownDirectiveParam:
            case c
            of '\r', '\x0A', EndOfFile, ' ', '\t':
                yieldToken(tUnknownDirectiveParam)
                state = ylInitialUnknown
                continue
            else:
                my.content.add(c)
        of ylMajorVersion:
            # Digits of the version number up to the separating '.'.
            case c
            of '.':
                yieldToken(tVersionPart)
                state = ylMinorVersion
            of '0' .. '9':
                my.content.add(c)
            of ' ', '\t', '\r', '\x0A', EndOfFile:
                # the version ended before any '.' was seen
                yieldLexerError("Missing YAML minor version.")
                state = ylDirectiveLineEnd
                continue
            else:
                yieldLexerError("Invalid character in YAML version: " & c)
                state = ylInitialUnknown
        of ylMinorVersion:
            # Digits of the version number after the '.'; a blank, a line
            # break or the end of input completes the version part.
            case c
            of ' ', '\t', '\r', '\x0A', EndOfFile:
                yieldToken(tVersionPart)
                state = ylDirectiveLineEnd
                continue
            of '0' .. '9':
                my.content.add(c)
            else:
                yieldLexerError("Invalid character in YAML version: " & c)
                state = ylInitialUnknown
        of ylDefineTagHandleInitial:
            # Skips blanks in front of the tag handle, which has to start
            # with '!'.
            case c
            of '!':
                my.content.add(c)
                state = ylDefineTagHandle
            of '\t', ' ':
                discard
            of '\r', '\x0A', EndOfFile:
                yieldLexerError("Unfinished %TAG directive")
                state = ylDirectiveLineEnd
                continue
            else:
                yieldLexerError("Unexpected character in %TAG directive: " & c)
                state = ylInitialInLine
        of ylDefineTagHandle:
            # Remainder of the tag handle: letters and '-' are collected, and
            # a second '!' closes the handle.
            case c
            of '-', 'A' .. 'Z', 'a' .. 'z':
                my.content.add(c)
            of '!':
                # the closing '!' belongs to the handle token
                my.content.add(c)
                yieldToken(tTagHandle)
                state = ylDefineTagURIInitial
            of '\r', '\x0A', EndOfFile:
                yieldLexerError("Unfinished %TAG directive")
                state = ylDirectiveLineEnd
                continue
            else:
                yieldLexerError("Unexpected char in %TAG directive: " & c)
                state = ylInitialInLine
        of ylDefineTagURIInitial:
            # Gap between tag handle and tag URI. The blanks are gathered in
            # my.content only so that their absence can be detected; they are
            # thrown away before the URI starts.
            # NOTE(review): this relies on my.content being empty on entry,
            # presumably guaranteed by the preceding yieldToken - confirm.
            case c
            of EndOfFile, '\r', '\x0A':
                yieldLexerError("Unfinished %TAG directive")
                state = ylDirectiveLineEnd
                continue
            of ' ', '\t':
                my.content.add(c)
            else:
                if my.content.len == 0:
                    yieldLexerError("Missing whitespace in %TAG directive")
                my.content = ""
                state = ylDefineTagURI
                # the first URI character is handled by ylDefineTagURI
                continue
        of ylDefineTagURI:
            # Tag URI of a %TAG directive; it ends at a blank, a line break or
            # the end of input.
            case c
            of ' ', '\t', '\r', '\x0A', EndOfFile:
                yieldToken(tTagURI)
                state = ylDirectiveLineEnd
                continue
            of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '#', ';', '/', '?', ':', '@',
               '&', '=', '+', '$', ',', '_', '.', '~', '*', '\'', '(', ')':
                # characters accepted inside the URI
                my.content.add(c)
            else:
                yieldLexerError("Invalid URI character: " & c)
                state = ylInitialInLine
                continue
        of ylBlockScalarHeader:
            # Block scalar header: every indicator character becomes a token
            # of its own, and the end of the line finishes the header.
            case c
            of '-':
                yieldToken(tDash)
            of '+':
                yieldToken(tPlus)
            of '0' .. '9':
                # a single digit is the entire indentation indicator
                my.content = $c
                yieldToken(tBlockIndentationIndicator)
            of EndOfFile, '\r', '\x0A':
                blockScalarIndentation = lastIndentationLength
                state = ylLineEnd
                continue
            else:
                yieldLexerError("Unexpected character in block scalar header: " & c)
        of ylBlockScalar:
            # Collects block scalar text up to the end of the line (or input),
            # where it is emitted as a scalar part.
            if c in {'\r', '\x0A', EndOfFile}:
                yieldScalarPart()
                state = ylLineEnd
                continue
            my.content.add(c)
        of ylAnchor:
            # Anchor name: ends at a blank, a line break, the end of input or
            # a brace/bracket. The terminating character is not consumed.
            if c in {' ', '\t', '\r', '\x0A', EndOfFile, '{', '}', '[', ']'}:
                yieldToken(tAnchor)
                state = ylInitialInLine
                continue
            my.content.add(c)
        of ylAlias:
            # Alias name. A ':' is not appended right away but parked in
            # lastSpecialChar; it only joins the name once the character after
            # it turns out not to terminate the alias. If a terminator follows,
            # lastSpecialChar stays set when the alias token is emitted.
            if lastSpecialChar != '\0' and
               c notin {EndOfFile, '\r', '\x0A', ' ', '\t', '{', '}', '[', ']'}:
                my.content.add(lastSpecialChar)
                lastSpecialChar = '\0'
            case c
            of ':':
                lastSpecialChar = ':'
            of ',':
                # a comma ends the alias only inside a flow collection
                if flowDepth > 0:
                    yieldToken(tAlias)
                    state = ylInitialInLine
                    continue
                my.content.add(c)
            of EndOfFile, '\r', '\x0A', ' ', '\t', '{', '}', '[', ']':
                yieldToken(tAlias)
                state = ylInitialInLine
                continue
            else:
                my.content.add(c)
        
        my.bufpos += my.charlen
        curPos.inc