Refactoring lexer, part 1

This commit is contained in:
Felix Krause 2016-01-15 00:06:57 +01:00
parent 56d3537920
commit 9c731cb6d1
2 changed files with 210 additions and 184 deletions

View File

@ -40,16 +40,15 @@ type
YamlLexerState = enum YamlLexerState = enum
# initial states (not started reading any token) # initial states (not started reading any token)
ylInitial, ylInitialSpaces, ylInitialUnknown, ylInitialContent, ylInitial, ylInitialUnknown, ylInitialContent,
ylDefineTagHandleInitial, ylDefineTagURIInitial, ylInitialInLine, ylDefineTagHandleInitial, ylDefineTagURIInitial, ylInitialInLine,
ylLineEnd, ylDirectiveLineEnd, ylLineEnd, ylDirectiveLineEnd,
# directive reading states # directive reading states
ylDirective, ylDefineTagHandle, ylDefineTagURI, ylMajorVersion, ylDirective, ylDefineTagHandle, ylDefineTagURI, ylMajorVersion,
ylMinorVersion, ylUnknownDirectiveParam, ylDirectiveComment, ylMinorVersion, ylUnknownDirectiveParam, ylDirectiveComment,
# scalar reading states # scalar reading states
ylPlainScalar, ylSingleQuotedScalar, ylDoublyQuotedScalar, ylEscape, ylPlainScalar, ylBlockScalar, ylBlockScalarHeader,
ylBlockScalar, ylBlockScalarHeader, ylSpaceAfterPlainScalar, ylSpaceAfterPlainScalar, ylSpaceAfterQuotedScalar,
ylSpaceAfterQuotedScalar,
# indentation # indentation
ylIndentation, ylIndentation,
# comments # comments
@ -57,7 +56,7 @@ type
# tags # tags
ylTagHandle, ylTagSuffix, ylVerbatimTag, ylTagHandle, ylTagSuffix, ylVerbatimTag,
# document separation # document separation
ylDashes, ylDots, ylDots,
# anchoring # anchoring
ylAnchor, ylAlias ylAnchor, ylAlias
@ -68,6 +67,7 @@ type
charoffset: int charoffset: int
content*: string # my.content of the last returned token. content*: string # my.content of the last returned token.
line*, column*: int line*, column*: int
curPos: int
const const
UTF8NextLine = toUTF8(Rune(0x85)) UTF8NextLine = toUTF8(Rune(0x85))
@ -162,7 +162,7 @@ template yieldLexerError(message: string) {.dirty.} =
when defined(yamlDebug): when defined(yamlDebug):
echo "Lexer error: " & message echo "Lexer error: " & message
my.content = message my.content = message
my.column = curPos my.column = my.curPos
yield tError yield tError
my.content = "" my.content = ""
@ -170,25 +170,148 @@ template handleCR() {.dirty.} =
my.bufpos = lexbase.handleCR(my, my.bufpos + my.charoffset) + my.charlen - my.bufpos = lexbase.handleCR(my, my.bufpos + my.charoffset) + my.charlen -
my.charoffset - 1 my.charoffset - 1
my.line.inc() my.line.inc()
curPos = 1 my.curPos = 1
c = my.buf[my.bufpos + my.charoffset]
template handleLF() {.dirty.} = template handleLF() {.dirty.} =
my.bufpos = lexbase.handleLF(my, my.bufpos + my.charoffset) + my.bufpos = lexbase.handleLF(my, my.bufpos + my.charoffset) +
my.charlen - my.charoffset - 1 my.charlen - my.charoffset - 1
my.line.inc() my.line.inc()
curPos = 1 my.curPos = 1
c = my.buf[my.bufpos + my.charoffset]
template `or`(r: Rune, i: int): Rune = template `or`(r: Rune, i: int): Rune =
cast[Rune](cast[int](r) or i) cast[Rune](cast[int](r) or i)
template advance() {.dirty.} =
  ## Steps the lexer forward by one character. Because the template is
  ## `dirty`, it operates on the `my` and `c` symbols of the call site.
  inc(my.bufpos, my.charlen)  # move the buffer position by `my.charlen`
  inc(my.curPos)              # keep the in-line position counter in step
  c = my.buf[my.bufpos + my.charoffset]  # reload the current character
proc lexComment(my: var YamlLexer, c: var char) =
  ## Appends characters to `my.content`, starting with the current one,
  ## until `c` is '\r', '\x0A' or `EndOfFile`. The terminating character
  ## stays in `c` and is not appended.
  while true:
    if c in {'\r', '\x0A', EndOfFile}: break
    my.content.add(c)
    advance()
proc lexInitialSpaces(my: var YamlLexer, c: var char): YamlLexerState =
  ## Collects a run of spaces and tabs into `my.content` and returns the
  ## state to continue in, chosen from the first other character (which is
  ## left unconsumed in `c`):
  ##
  ## * '#'                         -> `ylInitial` (collected spaces dropped)
  ## * '\r', '\x0A', `EndOfFile`   -> `ylDirectiveLineEnd`
  ## * anything else               -> `ylIndentation`
  while c in {' ', '\t'}:
    my.content.add(c)
    advance()
  case c
  of '#':
    # whitespace in front of a comment is not kept
    my.content = ""
    result = ylInitial
  of '\r', '\x0A', EndOfFile:
    result = ylDirectiveLineEnd
  else:
    result = ylIndentation
proc lexDashes(my: var YamlLexer, c: var char) =
  ## Appends every consecutive '-' (starting at the current character) to
  ## `my.content`; on return `c` holds the first character that is not '-'.
  while true:
    if c != '-': return
    my.content.add('-')
    advance()
proc lexSingleQuotedScalar(my: var YamlLexer, c: var char): bool =
  ## Reads the body of a single-quoted scalar into `my.content`. On entry
  ## `c` is the opening quote; it is skipped by the first `advance()`.
  ##
  ## Returns `true` once a closing quote has been read; `c` then holds the
  ## character following that quote. Returns `false` if `EndOfFile` is
  ## reached first. A doubled quote (`''`) adds a single quote to the
  ## content.
  while true:
    advance()
    if c == EndOfFile:
      return false
    if c != '\'':
      # ordinary character, taken verbatim
      my.content.add(c)
      continue
    # a quote: look at the next character to tell `''` from the terminator
    advance()
    if c != '\'':
      return true
    my.content.add(c)
proc lexDoublyQuotedScalar(my: var YamlLexer, c: var char): bool =
  ## Reads the body of a double-quoted scalar into `my.content`, resolving
  ## backslash escape sequences. On entry `c` is the opening quote; it is
  ## skipped by the first `advance()`.
  ##
  ## Returns `true` when the closing '"' is found; `c` is then still that
  ## quote (it is not consumed here). Returns `false` when `EndOfFile` is
  ## reached, when an unsupported escape character follows a backslash, or
  ## when a non-hex character appears inside a `\x`, `\u` or `\U` escape.
  ##
  ## NOTE(review): all failure causes collapse into the same `false`; the
  ## visible caller reports every one of them as
  ## "Unterminated doubly quoted string" (see the TODOs below).
  while true:
    advance()
    case c
    of '"':
      result = true
      break
    of EndOfFile:
      result = false
      break
    of '\\':
      advance()
      # stays 0 for single-character escapes; otherwise the number of hex
      # digits to read plus one
      var expectedEscapeLength = 0
      case c
      of EndOfFile:
        result = false
        break
      of '0': my.content.add('\0')
      of 'a': my.content.add('\x07')
      of 'b': my.content.add('\x08')
      of '\t', 't': my.content.add('\t')
      of 'n': my.content.add('\x0A')
      of 'v': my.content.add('\v')
      of 'f': my.content.add('\f')
      of 'r': my.content.add('\r')
      of 'e': my.content.add('\e')
      of ' ': my.content.add(' ')
      of '"': my.content.add('"')
      of '/': my.content.add('/')
      of '\\': my.content.add('\\')
      of 'N': my.content.add(UTF8NextLine)
      of '_': my.content.add(UTF8NonBreakingSpace)
      of 'L': my.content.add(UTF8LineSeparator)
      of 'P': my.content.add(UTF8ParagraphSeparator)
      of 'x': expectedEscapeLength = 3
      of 'u': expectedEscapeLength = 5
      of 'U': expectedEscapeLength = 9
      else:
        # TODO: how to transport this error?
        # yieldLexerError("Unsupported escape sequence: \\" & c)
        result = false
        break
      # single-character escape already appended above
      if expectedEscapeLength == 0: continue
      var
        escapeLength = 1
        unicodeChar: Rune = cast[Rune](0)
      # read expectedEscapeLength - 1 hex digits, most significant first
      while escapeLength < expectedEscapeLength:
        advance()
        # index of this digit counted from the least significant one;
        # each digit is shifted left by 4 bits per position
        let digitPosition = expectedEscapeLength - escapeLength - 1
        case c
        of EndOFFile:
          return false
        of '0' .. '9':
          unicodeChar = unicodechar or
              (cast[int](c) - 0x30) shl (digitPosition * 4)
        of 'A' .. 'F':
          unicodeChar = unicodechar or
              (cast[int](c) - 0x37) shl (digitPosition * 4)
        of 'a' .. 'f':
          unicodeChar = unicodechar or
              (cast[int](c) - 0x57) shl (digitPosition * 4)
        else:
          # TODO: how to transport this error?
          #yieldLexerError("unsupported char in unicode escape sequence: " & c)
          return false
        inc(escapeLength)
      my.content.add(toUTF8(unicodeChar))
    of '\r':
      # a carriage return is stored as a line feed in the content
      my.content.add("\x0A")
      handleCR()
    of '\x0A':
      my.content.add(c)
      handleLF()
    else:
      my.content.add(c)
iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} = iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
var var
# the following three values are used for parsing escaped unicode chars
unicodeChar: Rune = cast[Rune](0)
escapeLength = 0
expectedEscapeLength = 0
trailingSpace = "" trailingSpace = ""
# used to temporarily store whitespace after a plain scalar # used to temporarily store whitespace after a plain scalar
lastSpecialChar: char = '\0' lastSpecialChar: char = '\0'
@ -206,10 +329,11 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
blockScalarIndentation = -1 blockScalarIndentation = -1
# when parsing a block scalar, this will be set to the indentation # when parsing a block scalar, this will be set to the indentation
# of the line that starts the flow scalar. # of the line that starts the flow scalar.
curPos = 1
my.curPos = 1
var c = my.buf[my.bufpos + my.charoffset]
while true: while true:
let c = my.buf[my.bufpos + my.charoffset]
case state case state
of ylInitial: of ylInitial:
case c case c
@ -217,68 +341,62 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
state = ylDirective state = ylDirective
continue continue
of ' ', '\t': of ' ', '\t':
state = ylInitialSpaces state = my.lexInitialSpaces(c)
continue continue
of '#': of '#':
state = ylDirectiveComment my.lexComment(c)
else: yieldToken(tComment)
state = ylInitialContent
continue
of ylInitialSpaces:
case c
of ' ', '\t':
my.content.add(c)
of '#':
my.content = ""
state = ylDirectiveComment
of EndOfFile, '\r', '\x0A':
state = ylDirectiveLineEnd state = ylDirectiveLineEnd
continue continue
of '\r':
handleCR()
continue
of '\x0A':
handleLF()
continue
of EndOfFile:
yieldToken(tStreamEnd)
break
else: else:
state = ylIndentation state = ylInitialContent
continue continue
of ylInitialContent: of ylInitialContent:
case c case c
of '-': of '-':
my.column = curPos my.column = my.curPos
state = ylDashes my.lexDashes(c)
continue case c
of '.': of ' ', '\t', '\r', '\x0A', EndOfFile:
yieldToken(tLineStart) case my.content.len
my.column = curPos of 3:
state = ylDots yieldToken(tDirectivesEnd)
continue state = ylInitialInLine
else: of 1:
state = ylIndentation my.content = ""
continue yieldToken(tLineStart)
of ylDashes: lastSpecialChar = '-'
case c state = ylInitialInLine
of '-': else:
my.content.add(c) let tmp = my.content
of ' ', '\t', '\r', '\x0A', EndOfFile: my.content = ""
case my.content.len yieldToken(tLineStart)
of 3: my.content = tmp
yieldToken(tDirectivesEnd) my.column = my.curPos
state = ylInitialInLine state = ylPlainScalar
of 1:
my.content = ""
yieldToken(tLineStart)
lastSpecialChar = '-'
state = ylInitialInLine
else: else:
let tmp = my.content let tmp = my.content
my.content = "" my.content = ""
yieldToken(tLineStart) yieldToken(tLineStart)
my.content = tmp my.content = tmp
my.column = curPos
state = ylPlainScalar state = ylPlainScalar
continue continue
else: of '.':
let tmp = my.content
my.content = ""
yieldToken(tLineStart) yieldToken(tLineStart)
my.content = tmp my.column = my.curPos
state = ylPlainScalar state = ylDots
continue
else:
state = ylIndentation
continue continue
of ylDots: of ylDots:
case c case c
@ -308,6 +426,7 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
of EndOfFile: of EndOfFile:
yieldToken(tStreamEnd) yieldToken(tStreamEnd)
break break
{.linearScanEnd.}
of ' ', '\t': of ' ', '\t':
discard discard
of '#': of '#':
@ -327,109 +446,6 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
yieldLexerError("Internal error: Unexpected char at line end: " & c) yieldLexerError("Internal error: Unexpected char at line end: " & c)
state = ylInitialContent state = ylInitialContent
continue continue
of ylSingleQuotedScalar:
if lastSpecialChar != '\0':
# ' is the only special char
case c
of '\'':
my.content.add(c)
lastSpecialChar = '\0'
of EndOfFile, '\r', '\x0A':
yieldToken(tScalar)
lastSpecialChar = '\0'
state = ylLineEnd
continue
else:
yieldToken(tScalar)
lastSpecialChar = '\0'
state = ylSpaceAfterQuotedScalar
continue
else:
case c
of '\'':
lastSpecialChar = c
of EndOfFile:
yieldLexerError("Unterminated single quoted string")
yieldToken(tStreamEnd)
break
else:
my.content.add(c)
of ylDoublyQuotedScalar:
case c
of '"':
yieldToken(tScalar)
state = ylSpaceAfterQuotedScalar
of EndOfFile:
yieldLexerError("Unterminated doubly quoted string")
yieldToken(tStreamEnd)
break
of '\\':
state = ylEscape
escapeLength = 0
of '\r':
my.content.add("\x0A")
handleCR()
of '\x0A':
my.content.add(c)
handleLF()
else:
my.content.add(c)
of ylEscape:
if escapeLength == 0:
expectedEscapeLength = 0
case c
of EndOfFile:
yieldLexerError("Unterminated doubly quoted string")
of '0': my.content.add('\0')
of 'a': my.content.add('\x07')
of 'b': my.content.add('\x08')
of '\t', 't': my.content.add('\t')
of 'n': my.content.add('\x0A')
of 'v': my.content.add('\v')
of 'f': my.content.add('\f')
of 'r': my.content.add('\r')
of 'e': my.content.add('\e')
of ' ': my.content.add(' ')
of '"': my.content.add('"')
of '/': my.content.add('/')
of '\\': my.content.add('\\')
of 'N': my.content.add(UTF8NextLine)
of '_': my.content.add(UTF8NonBreakingSpace)
of 'L': my.content.add(UTF8LineSeparator)
of 'P': my.content.add(UTF8ParagraphSeparator)
of 'x': unicodeChar = cast[Rune](0); expectedEscapeLength = 3
of 'u': unicodeChar = cast[Rune](0); expectedEscapeLength = 5
of 'U': unicodeChar = cast[Rune](0); expectedEscapeLength = 9
else:
yieldLexerError("Unsupported escape sequence: \\" & c)
if expectedEscapeLength == 0: state = ylDoublyQuotedScalar
else:
let digitPosition = expectedEscapeLength - escapeLength - 1
case c
of EndOFFile:
yieldLexerError("Unterminated escape sequence")
state = ylLineEnd
continue
of '0' .. '9':
unicodeChar = unicodechar or
(cast[int](c) - 0x30) shl (digitPosition * 4)
of 'A' .. 'F':
unicodeChar = unicodechar or
(cast[int](c) - 0x37) shl (digitPosition * 4)
of 'a' .. 'f':
unicodeChar = unicodechar or
(cast[int](c) - 0x57) shl (digitPosition * 4)
else:
yieldLexerError("unsupported char in unicode escape sequence: " &
c)
escapeLength = 0
state = ylDoublyQuotedScalar
continue
inc(escapeLength)
if escapeLength == expectedEscapeLength and escapeLength > 0:
my.content.add(toUTF8(unicodeChar))
state = ylDoublyQuotedScalar
of ylSpaceAfterQuotedScalar: of ylSpaceAfterQuotedScalar:
case c case c
of ' ', '\t': of ' ', '\t':
@ -515,7 +531,7 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
of ylInitialInLine: of ylInitialInLine:
if lastSpecialChar != '\0': if lastSpecialChar != '\0':
my.column = curPos - 1 my.column = my.curPos - 1
case c case c
of ' ', '\t', '\r', '\x0A', EndOfFile: of ' ', '\t', '\r', '\x0A', EndOfFile:
case lastSpecialChar case lastSpecialChar
@ -544,16 +560,16 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
of '<': of '<':
state = ylVerbatimTag state = ylVerbatimTag
lastSpecialChar = '\0' lastSpecialChar = '\0'
my.bufpos += my.charlen advance()
else: else:
state = ylTagHandle state = ylTagHandle
my.content = "!" my.content = "!"
lastSpecialChar = '\0' lastSpecialChar = '\0'
my.column = curPos - 1 my.column = my.curPos - 1
else: else:
my.content.add(lastSpecialChar) my.content.add(lastSpecialChar)
lastSpecialChar = '\0' lastSpecialChar = '\0'
my.column = curPos - 1 my.column = my.curPos - 1
state = ylPlainScalar state = ylPlainScalar
continue continue
case c case c
@ -565,7 +581,7 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
yieldToken(tComma) yieldToken(tComma)
else: else:
my.content = "" & c my.content = "" & c
my.column = curPos my.column = my.curPos
state = ylPlainScalar state = ylPlainScalar
of '[': of '[':
inc(flowDepth) inc(flowDepth)
@ -584,19 +600,30 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
of '#': of '#':
lastSpecialChar = '#' lastSpecialChar = '#'
of '"': of '"':
my.column = curPos my.column = my.curPos
state = ylDoublyQuotedScalar if not my.lexDoublyQuotedScalar(c):
yieldLexerError("Unterminated doubly quoted string")
else:
advance()
yieldToken(tScalar)
state = ylSpaceAfterQuotedScalar
continue
of '\'': of '\'':
my.column = curPos my.column = my.curPos
state = ylSingleQuotedScalar if not my.lexSingleQuotedScalar(c):
yieldLexerError("Unterminated single quoted string")
yieldToken(tScalar)
lastSpecialChar = '\0'
state = ylSpaceAfterQuotedScalar
continue
of '!': of '!':
my.column = curPos my.column = my.curPos
lastSpecialChar = '!' lastSpecialChar = '!'
of '&': of '&':
my.column = curPos my.column = my.curPos
state = ylAnchor state = ylAnchor
of '*': of '*':
my.column = curPos my.column = my.curPos
state = ylAlias state = ylAlias
of ' ': of ' ':
discard discard
@ -605,10 +632,10 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
lastSpecialChar = '-' lastSpecialChar = '-'
else: else:
my.content = "" & c my.content = "" & c
my.column = curPos my.column = my.curPos
state = ylPlainScalar state = ylPlainScalar
of '?', ':': of '?', ':':
my.column = curPos my.column = my.curPos
lastSpecialChar = c lastSpecialChar = c
of '|': of '|':
yieldToken(tPipe) yieldToken(tPipe)
@ -620,7 +647,7 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
discard discard
else: else:
my.content = "" & c my.content = "" & c
my.column = curPos my.column = my.curPos
state = ylPlainScalar state = ylPlainScalar
of ylComment, ylDirectiveComment: of ylComment, ylDirectiveComment:
case c case c
@ -887,5 +914,4 @@ iterator tokens(my: var YamlLexer): YamlLexerToken {.closure.} =
else: else:
my.content.add(c) my.content.add(c)
my.bufpos += my.charlen advance()
curPos.inc

View File

@ -77,8 +77,8 @@ suite "Lexing":
t(tVersionPart, "1"), t(tVersionPart, "1"),
t(tVersionPart, "2"), t(tVersionPart, "2"),
t(tComment, " version"), t(tComment, " version"),
t(tComment, " at line start"), t(tComment, "# at line start"),
t(tComment, " indented"), t(tComment, "# indented"),
t(tUnknownDirective, "%FOO"), t(tUnknownDirective, "%FOO"),
t(tStreamEnd, nil)]) t(tStreamEnd, nil)])