Refactoring lexer, part 1

2016-01-15 00:06:57 +01:00 · 2016-01-15 00:06:57 +01:00 · 9c731cb6d1
parent 56d3537920
commit 9c731cb6d1
2 changed files with 210 additions and 184 deletions
--- a/private/lexer.nim
+++ b/private/lexer.nim
@ -40,16 +40,15 @@ type
    YamlLexerState = enum
        # initial states (not started reading any token)
-        ylInitial, ylInitialSpaces, ylInitialUnknown, ylInitialContent,
+        ylInitial, ylInitialUnknown, ylInitialContent,
        ylDefineTagHandleInitial, ylDefineTagURIInitial, ylInitialInLine,
        ylLineEnd, ylDirectiveLineEnd,
        # directive reading states
        ylDirective, ylDefineTagHandle, ylDefineTagURI, ylMajorVersion,
        ylMinorVersion, ylUnknownDirectiveParam, ylDirectiveComment,
        # scalar reading states
-        ylPlainScalar, ylSingleQuotedScalar, ylDoublyQuotedScalar, ylEscape,
+        ylPlainScalar, ylBlockScalar, ylBlockScalarHeader,
-        ylBlockScalar, ylBlockScalarHeader, ylSpaceAfterPlainScalar,
+        ylSpaceAfterPlainScalar, ylSpaceAfterQuotedScalar,
        ylSpaceAfterQuotedScalar,
        # indentation
        ylIndentation,
        # comments
@ -57,7 +56,7 @@ type
        # tags
        ylTagHandle, ylTagSuffix, ylVerbatimTag,
        # document separation
-        ylDashes, ylDots,
+        ylDots,
        # anchoring
        ylAnchor, ylAlias
@ -68,6 +67,7 @@ type
        charoffset: int
        content*: string # my.content of the last returned token.
        line*, column*: int
        curPos: int
 const
    UTF8NextLine           = toUTF8(Rune(0x85))
@ -162,7 +162,7 @@ template yieldLexerError(message: string) {.dirty.} =
    when defined(yamlDebug):
        echo "Lexer error: " & message
    my.content = message
-    my.column = curPos
+    my.column = my.curPos
    yield tError
    my.content = ""
@ -170,25 +170,148 @@ template handleCR() {.dirty.} =
    my.bufpos = lexbase.handleCR(my, my.bufpos + my.charoffset) + my.charlen -
            my.charoffset - 1
    my.line.inc()
-    curPos = 1
+    my.curPos = 1
    c = my.buf[my.bufpos + my.charoffset]
 template handleLF() {.dirty.} =
    my.bufpos = lexbase.handleLF(my, my.bufpos + my.charoffset) +
            my.charlen - my.charoffset - 1
    my.line.inc()
-    curPos = 1
+    my.curPos = 1
    c = my.buf[my.bufpos + my.charoffset]
 template `or`(r: Rune, i: int): Rune =
    ## Bitwise-ORs the integer `i` into the value of `r` and yields the
    ## combined value as a `Rune`. Both conversions are plain `cast`s, so no
    ## range checking is performed.
    cast[Rune](cast[int](r) or i)
 template advance() {.dirty.} =
    ## Moves the lexer one character forward. Being a dirty template, it
    ## operates on the `my` and `c` symbols of the scope it is expanded in:
    ## the buffer position grows by `my.charlen`, the column counter
    ## `my.curPos` grows by one, and `c` is reloaded from the buffer.
    inc(my.bufpos, my.charlen)
    inc(my.curPos)
    c = my.buf[my.bufpos + my.charoffset]
 proc lexComment(my: var YamlLexer, c: var char) =
    ## Appends characters to `my.content`, starting with the one currently in
    ## `c`, until `c` is a carriage return, a line feed or `EndOfFile`. The
    ## terminating character is left in `c` and is not added to the content.
    while true:
        case c
        of '\r', '\x0A', EndOfFile:
            break
        else:
            my.content.add(c)
            advance()
 proc lexInitialSpaces(my: var YamlLexer, c: var char): YamlLexerState =
    ## Collects a run of spaces and tabs into `my.content` and reports which
    ## lexer state should handle the first character after that run. That
    ## character is left in `c` and is not consumed.
    ##
    ## * `'#'`: the collected whitespace is discarded, result is `ylInitial`.
    ## * carriage return, line feed or `EndOfFile`: `ylDirectiveLineEnd`.
    ## * anything else: `ylIndentation`.
    while c in {' ', '\t'}:
        my.content.add(c)
        advance()
    # `c` is now the first character that is neither a space nor a tab.
    case c
    of '#':
        my.content = ""
        result = ylInitial
    of '\r', '\x0A', EndOfFile:
        result = ylDirectiveLineEnd
    else:
        result = ylIndentation
 proc lexDashes(my: var YamlLexer, c: var char) =
    ## Appends every consecutive `'-'` to `my.content`, starting with the
    ## character currently in `c`. On return `c` holds the first character
    ## that is not a dash.
    while true:
        if c != '-':
            break
        my.content.add(c)
        advance()
 proc lexSingleQuotedScalar(my: var YamlLexer, c: var char): bool =
    ## Reads the body of a single-quoted scalar into `my.content`.
    ##
    ## The loop advances before inspecting a character, so the character held
    ## in `c` on entry is skipped without being examined.
    ##
    ## A doubled quote (`''`) contributes one literal `'` to the content. A
    ## lone `'` ends the scalar: the proc returns `true` and `c` already holds
    ## the character that follows the closing quote. Reaching `EndOfFile`
    ## first returns `false`.
    ##
    ## Line breaks inside the scalar are stored as a single line feed and are
    ## passed through `handleCR` / `handleLF`, the same way
    ## `lexDoublyQuotedScalar` treats them, so that the line and column
    ## bookkeeping stays correct and the underlying `lexbase` buffer gets the
    ## chance to refill at the line end.
    while true:
        advance()
        case c
        of '\'':
            # Look one character ahead to tell an escaped quote from the end
            # of the scalar.
            advance()
            if c == '\'':
                my.content.add(c)
            else:
                result = true
                break
        of EndOfFile:
            result = false
            break
        of '\r':
            my.content.add("\x0A")
            handleCR()
        of '\x0A':
            my.content.add(c)
            handleLF()
        else:
            my.content.add(c)
 proc lexDoublyQuotedScalar(my: var YamlLexer, c: var char): bool =
    ## Reads the body of a double-quoted scalar into `my.content`, decoding
    ## backslash escapes on the way.
    ##
    ## The loop advances before inspecting a character, so the character held
    ## in `c` on entry is skipped without being examined.
    ##
    ## Returns `true` when the closing `"` is found; `c` then still holds that
    ## quote. Returns `false` on `EndOfFile`, on an unknown escape character
    ## and on a non-hex character inside a `\x`, `\u` or `\U` escape. The
    ## return value does not tell these failure causes apart.
    while true:
        advance()
        case c
        of '"':
            result = true
            break
        of EndOfFile:
            result = false
            break
        of '\\':
            advance()
            # Stays 0 for single-character escapes. For numeric escapes it is
            # the number of characters after the backslash, i.e. the
            # introducing letter plus 2, 4 or 8 hex digits.
            var expectedEscapeLength = 0
            case c
            of EndOfFile:
                result = false
                break
            of '0':       my.content.add('\0')
            of 'a':       my.content.add('\x07')
            of 'b':       my.content.add('\x08')
            of '\t', 't': my.content.add('\t')
            of 'n':       my.content.add('\x0A')
            of 'v':       my.content.add('\v')
            of 'f':       my.content.add('\f')
            of 'r':       my.content.add('\r')
            of 'e':       my.content.add('\e')
            of ' ':       my.content.add(' ')
            of '"':       my.content.add('"')
            of '/':       my.content.add('/')
            of '\\':      my.content.add('\\')
            of 'N':       my.content.add(UTF8NextLine)
            of '_':       my.content.add(UTF8NonBreakingSpace)
            of 'L':       my.content.add(UTF8LineSeparator)
            of 'P':       my.content.add(UTF8ParagraphSeparator)
            of 'x': expectedEscapeLength = 3
            of 'u': expectedEscapeLength = 5
            of 'U': expectedEscapeLength = 9
            else:
                # TODO: how to transport this error?
                # yieldLexerError("Unsupported escape sequence: \\" & c)
                result = false
                break
            # Single-character escape already stored; go on with the scalar.
            if expectedEscapeLength == 0: continue
            var
                # Starts at 1 because the introducing letter has been read.
                escapeLength = 1
                unicodeChar: Rune = Rune(0)
            while escapeLength < expectedEscapeLength:
                advance()
                # Index of the hex digit counted from the least significant
                # one; each digit is shifted left by four bits per position.
                let digitPosition = expectedEscapeLength - escapeLength - 1
                case c
                of EndOfFile:
                    return false
                of '0' .. '9':
                    unicodeChar = unicodeChar or
                            (cast[int](c) - 0x30) shl (digitPosition * 4)
                of 'A' .. 'F':
                    unicodeChar = unicodeChar or
                            (cast[int](c) - 0x37) shl (digitPosition * 4)
                of 'a' .. 'f':
                    unicodeChar = unicodeChar or
                            (cast[int](c) - 0x57) shl (digitPosition * 4)
                else:
                    # TODO: how to transport this error?
                    #yieldLexerError("unsupported char in unicode escape sequence: " & c)
                    return false
                inc(escapeLength)
            my.content.add(toUTF8(unicodeChar))
        of '\r':
            # A carriage return is stored as a line feed; handleCR updates
            # the position and line bookkeeping.
            my.content.add("\x0A")
            handleCR()
        of '\x0A':
            my.content.add(c)
            handleLF()
        else:
            my.content.add(c)
 iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
-    var
+    var        
        # the following three values are used for parsing escaped unicode chars
        unicodeChar: Rune = cast[Rune](0)
        escapeLength = 0
        expectedEscapeLength = 0
        trailingSpace = ""
            # used to temporarily store whitespace after a plain scalar
        lastSpecialChar: char = '\0'
@ -206,10 +329,11 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
        blockScalarIndentation = -1
            # when parsing a block scalar, this will be set to the indentation
            # of the line that starts the flow scalar.
        curPos = 1
    my.curPos = 1
    var c = my.buf[my.bufpos + my.charoffset]
    while true:
        let c = my.buf[my.bufpos + my.charoffset]
        case state
        of ylInitial:
            case c
@ -217,68 +341,62 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
                state = ylDirective
                continue
            of ' ', '\t':
-                state = ylInitialSpaces
+                state = my.lexInitialSpaces(c)
                continue
            of '#':
-                state = ylDirectiveComment
+                my.lexComment(c)
-            else:
+                yieldToken(tComment)
                state = ylInitialContent
                continue
        of ylInitialSpaces:
            case c
            of ' ', '\t':
                my.content.add(c)
            of '#':
                my.content = ""
                state = ylDirectiveComment
            of EndOfFile, '\r', '\x0A':
                state = ylDirectiveLineEnd
                continue
            of '\r':
                handleCR()
                continue
            of '\x0A':
                handleLF()
                continue
            of EndOfFile:
                yieldToken(tStreamEnd)
                break
            else:
-                state = ylIndentation
+                state = ylInitialContent
                continue
        of ylInitialContent:
            case c
            of '-':
-                my.column = curPos
+                my.column = my.curPos
-                state = ylDashes
+                my.lexDashes(c)
-                continue
+                case c
-            of '.':
+                of ' ', '\t', '\r', '\x0A', EndOfFile:
-                yieldToken(tLineStart)
+                    case my.content.len
-                my.column = curPos
+                    of 3:
-                state = ylDots
+                        yieldToken(tDirectivesEnd)
-                continue
+                        state = ylInitialInLine
-            else:
+                    of 1:
-                state = ylIndentation
+                        my.content = ""
-                continue
+                        yieldToken(tLineStart)
-        of ylDashes:
+                        lastSpecialChar = '-'
-            case c
+                        state = ylInitialInLine
-            of '-':
+                    else:
-                my.content.add(c)
+                        let tmp = my.content
-            of ' ', '\t', '\r', '\x0A', EndOfFile:
+                        my.content = ""
-                case my.content.len
+                        yieldToken(tLineStart)
-                of 3:
+                        my.content = tmp
-                    yieldToken(tDirectivesEnd)
+                        my.column = my.curPos
-                    state = ylInitialInLine
+                        state = ylPlainScalar
                of 1:
                    my.content = ""
                    yieldToken(tLineStart)
                    lastSpecialChar = '-'
                    state = ylInitialInLine
                else:
                    let tmp = my.content
                    my.content = ""
                    yieldToken(tLineStart)
                    my.content = tmp
                    my.column = curPos
                    state = ylPlainScalar
                continue
-            else:
+            of '.':
                let tmp = my.content
                my.content = ""
                yieldToken(tLineStart)
-                my.content = tmp
+                my.column = my.curPos
-                state = ylPlainScalar
+                state = ylDots
                continue
            else:
                state = ylIndentation
                continue
        of ylDots:
            case c
@ -308,6 +426,7 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
            of EndOfFile:
                yieldToken(tStreamEnd)
                break
                {.linearScanEnd.}
            of ' ', '\t':
                discard
            of '#':
@ -327,109 +446,6 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
                yieldLexerError("Internal error: Unexpected char at line end: " & c)
            state = ylInitialContent
            continue
        of ylSingleQuotedScalar:
            if lastSpecialChar != '\0':
                # ' is the only special char
                case c
                of '\'':
                    my.content.add(c)
                    lastSpecialChar = '\0'
                of EndOfFile, '\r', '\x0A':
                    yieldToken(tScalar)
                    lastSpecialChar = '\0'
                    state = ylLineEnd
                    continue
                else:
                    yieldToken(tScalar)
                    lastSpecialChar = '\0'
                    state = ylSpaceAfterQuotedScalar
                    continue
            else:
                case c
                of '\'':
                    lastSpecialChar = c
                of EndOfFile:
                    yieldLexerError("Unterminated single quoted string")
                    yieldToken(tStreamEnd)
                    break
                else:
                    my.content.add(c)
        of ylDoublyQuotedScalar:
            case c
            of '"':
                yieldToken(tScalar)
                state = ylSpaceAfterQuotedScalar
            of EndOfFile:
                yieldLexerError("Unterminated doubly quoted string")
                yieldToken(tStreamEnd)
                break
            of '\\':
                state = ylEscape
                escapeLength = 0
            of '\r':
                my.content.add("\x0A")
                handleCR()
            of '\x0A':
                my.content.add(c)
                handleLF()
            else:
                my.content.add(c)
        of ylEscape:
            if escapeLength == 0:
                expectedEscapeLength = 0
                case c
                of EndOfFile:
                    yieldLexerError("Unterminated doubly quoted string")
                of '0':       my.content.add('\0')
                of 'a':       my.content.add('\x07')
                of 'b':       my.content.add('\x08')
                of '\t', 't': my.content.add('\t')
                of 'n':       my.content.add('\x0A')
                of 'v':       my.content.add('\v')
                of 'f':       my.content.add('\f')
                of 'r':       my.content.add('\r')
                of 'e':       my.content.add('\e')
                of ' ':       my.content.add(' ')
                of '"':       my.content.add('"')
                of '/':       my.content.add('/')
                of '\\':      my.content.add('\\')
                of 'N':       my.content.add(UTF8NextLine)
                of '_':       my.content.add(UTF8NonBreakingSpace)
                of 'L':       my.content.add(UTF8LineSeparator)
                of 'P':       my.content.add(UTF8ParagraphSeparator)
                of 'x': unicodeChar = cast[Rune](0); expectedEscapeLength = 3
                of 'u': unicodeChar = cast[Rune](0); expectedEscapeLength = 5
                of 'U': unicodeChar = cast[Rune](0); expectedEscapeLength = 9
                else:
                    yieldLexerError("Unsupported escape sequence: \\" & c)
                if expectedEscapeLength == 0: state = ylDoublyQuotedScalar
            else:
                let digitPosition = expectedEscapeLength - escapeLength - 1
                case c
                of EndOFFile:
                    yieldLexerError("Unterminated escape sequence")
                    state = ylLineEnd
                    continue
                of '0' .. '9':
                    unicodeChar = unicodechar or
                            (cast[int](c) - 0x30) shl (digitPosition * 4)
                of 'A' .. 'F':
                    unicodeChar = unicodechar or
                            (cast[int](c) - 0x37) shl (digitPosition * 4)
                of 'a' .. 'f':
                    unicodeChar = unicodechar or
                            (cast[int](c) - 0x57) shl (digitPosition * 4)
                else:
                    yieldLexerError("unsupported char in unicode escape sequence: " &
                               c)
                    escapeLength = 0
                    state = ylDoublyQuotedScalar
                    continue
            inc(escapeLength)
            if escapeLength == expectedEscapeLength and escapeLength > 0:
                my.content.add(toUTF8(unicodeChar))
                state = ylDoublyQuotedScalar
        of ylSpaceAfterQuotedScalar:
            case c
            of ' ', '\t':
@ -515,7 +531,7 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
        of ylInitialInLine:
            if lastSpecialChar != '\0':
-                my.column = curPos - 1
+                my.column = my.curPos - 1
                case c
                of ' ', '\t', '\r', '\x0A', EndOfFile:
                    case lastSpecialChar
@ -544,16 +560,16 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
                    of '<':
                        state = ylVerbatimTag
                        lastSpecialChar = '\0'
-                        my.bufpos += my.charlen
+                        advance()
                    else:
                        state = ylTagHandle
                        my.content = "!"
                        lastSpecialChar = '\0'
-                    my.column = curPos - 1
+                    my.column = my.curPos - 1
                else:
                    my.content.add(lastSpecialChar)
                    lastSpecialChar = '\0'
-                    my.column = curPos - 1
+                    my.column = my.curPos - 1
                    state = ylPlainScalar
                continue
            case c
@ -565,7 +581,7 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
                    yieldToken(tComma)
                else:
                    my.content = "" & c
-                    my.column = curPos
+                    my.column = my.curPos
                    state = ylPlainScalar
            of '[':
                inc(flowDepth)
@ -584,19 +600,30 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
            of '#':
                lastSpecialChar = '#'
            of '"':
-                my.column = curPos
+                my.column = my.curPos
-                state = ylDoublyQuotedScalar
+                if not my.lexDoublyQuotedScalar(c):
                    yieldLexerError("Unterminated doubly quoted string")
                else:
                    advance()
                yieldToken(tScalar)
                state = ylSpaceAfterQuotedScalar
                continue
            of '\'':
-                my.column = curPos
+                my.column = my.curPos
-                state = ylSingleQuotedScalar
+                if not my.lexSingleQuotedScalar(c):
                    yieldLexerError("Unterminated single quoted string")
                yieldToken(tScalar)
                lastSpecialChar = '\0'
                state = ylSpaceAfterQuotedScalar
                continue
            of '!':
-                my.column = curPos
+                my.column = my.curPos
                lastSpecialChar = '!'
            of '&':
-                my.column = curPos
+                my.column = my.curPos
                state = ylAnchor
            of '*':
-                my.column = curPos
+                my.column = my.curPos
                state = ylAlias
            of ' ':
                discard
@ -605,10 +632,10 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
                    lastSpecialChar = '-'
                else:
                    my.content = "" & c
-                    my.column = curPos
+                    my.column = my.curPos
                    state = ylPlainScalar
            of '?', ':':
-                my.column = curPos
+                my.column = my.curPos
                lastSpecialChar = c
            of '|':
                yieldToken(tPipe)
@ -620,7 +647,7 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
                discard
            else:
                my.content = "" & c
-                my.column = curPos
+                my.column = my.curPos
                state = ylPlainScalar
        of ylComment, ylDirectiveComment:
            case c
@ -887,5 +914,4 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
            else:
                my.content.add(c)
-        my.bufpos += my.charlen
+        advance()
        curPos.inc
--- a/test/lexing.nim
+++ b/test/lexing.nim
@ -77,8 +77,8 @@ suite "Lexing":
                 t(tVersionPart, "1"),
                 t(tVersionPart, "2"),
                 t(tComment, " version"),
-                 t(tComment, " at line start"),
+                 t(tComment, "# at line start"),
-                 t(tComment, " indented"),
+                 t(tComment, "# indented"),
                 t(tUnknownDirective, "%FOO"),
                 t(tStreamEnd, nil)])