2022-03-02 17:46:16 -04:00

604 lines
13 KiB
Go

// Copyright 2015 Jean Niklas L'orange. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package edn
import (
"strconv"
u "unicode"
)
// lexState is the verdict a lexer state function returns for every rune it
// is handed.
type lexState int

const (
	// lexCont means the current token is not finished: keep reading.
	lexCont lexState = iota
	// lexIgnore marks input that can be skipped (whitespace and comments
	// at the moment).
	lexIgnore
	// lexEnd means the value ended with the rune that was just given.
	lexEnd
	// lexEndPrev means the value ended with the previous rune.
	lexEndPrev
	// lexError signals erroneous input.
	lexError
)
// tokenType identifies the kind of value the lexer has recognised.
type tokenType int

// Value types reported by the lexer.
const (
	tokenSymbol tokenType = iota
	tokenKeyword
	tokenString
	tokenInt
	tokenFloat
	tokenTag
	tokenChar
	tokenListStart
	tokenListEnd
	tokenVectorStart
	tokenVectorEnd
	tokenMapStart
	tokenMapEnd
	tokenSetStart
	tokenDiscard
	tokenError
)

// String returns a short human-readable name for t. Values outside the
// declared range of token types yield "[unknown]".
func (t tokenType) String() string {
	names := [...]string{
		tokenSymbol:      "symbol",
		tokenKeyword:     "keyword",
		tokenString:      "string",
		tokenInt:         "integer",
		tokenFloat:       "float",
		tokenTag:         "tag",
		tokenChar:        "character",
		tokenListStart:   "list start",
		tokenListEnd:     "list end",
		tokenVectorStart: "vector start",
		tokenVectorEnd:   "vector end",
		tokenMapStart:    "map start",
		tokenMapEnd:      "map/set end",
		tokenSetStart:    "set start",
		tokenDiscard:     "discard token",
		tokenError:       "error",
	}
	if t < 0 || int(t) >= len(names) {
		return "[unknown]"
	}
	return names[t]
}

// tokenSetEnd is the same token as tokenMapEnd, because sets end the same
// way maps do.
const tokenSetEnd = tokenMapEnd
// SyntaxError describes a syntax error encountered in EDN input.
type SyntaxError struct {
	msg    string // human-readable description, returned by Error
	Offset int64  // the error occurred after reading Offset bytes
}

// Error implements the error interface by returning the stored description.
func (e *SyntaxError) Error() string { return e.msg }
// okSymbolFirst reports whether r is one of the punctuation runes accepted
// as the first rune of a symbol. Letters are not covered here; callers
// check for them separately.
func okSymbolFirst(r rune) bool {
	const leading = ".*+!-_?$%&=<>"
	for _, c := range leading {
		if c == r {
			return true
		}
	}
	return false
}
// okSymbol reports whether r is one of the punctuation runes accepted inside
// a symbol after its first rune. Compared to okSymbolFirst it additionally
// accepts ':', '#' and '\”. Letters and digits are not covered here; callers
// check for them separately.
func okSymbol(r rune) bool {
	const constituents = ".*+!-_?$%&=<>:#'"
	for _, c := range constituents {
		if c == r {
			return true
		}
	}
	return false
}
// isWhitespace reports whether r is treated as whitespace by the lexer:
// any rune unicode.IsSpace accepts, plus the comma.
func isWhitespace(r rune) bool {
	if r == ',' {
		return true
	}
	return u.IsSpace(r)
}
// lexer is a rune-at-a-time state machine that recognises EDN tokens. The
// current state is a method value stored in state; each call returns a
// lexState telling the caller how to proceed.
type lexer struct {
// state is the function that handles the next rune; reset points it at
// stateBegin.
state func(rune) lexState
// err holds the error recorded by a state function, or nil.
err error
// position is copied into SyntaxError.Offset when an error is recorded.
// NOTE(review): nothing in this file updates it, so it is presumably
// maintained by the code that feeds runes to the lexer.
position int64
// token is the type of the token being recognised; reset sets it to
// tokenType(-1).
token tokenType
count int // scratch counter used by the special-character and \u states
expecting []rune // name being matched by stateSpecialChar, e.g. "newline" for \newline
}
// reset prepares the lexer for a new token: it clears any recorded error,
// sets the token type to the sentinel tokenType(-1), and returns the state
// machine to stateBegin.
func (l *lexer) reset() {
	l.err = nil
	l.token = tokenType(-1)
	l.state = l.stateBegin
}
// eof tells the lexer that the input has ended and returns the final state.
//
// If an error is already recorded, lexError is returned. Otherwise a single
// space is fed to the current state, so a pending literal can terminate the
// same way it would before whitespace. A state that still wants more input
// yields an "unexpected end of EDN input" error, and lexEndPrev is reported
// as lexEnd.
func (l *lexer) eof() lexState {
	if l.err != nil {
		return lexError
	}

	result := l.state(' ')
	switch {
	case result == lexCont:
		// the token was cut short by the end of input
		l.err = &SyntaxError{"unexpected end of EDN input", l.position}
		return lexError
	case l.err != nil:
		return lexError
	case result == lexEndPrev:
		return lexEnd
	}
	return result
}
// stateBegin is the state at the start of a token. Whitespace is ignored,
// single-rune delimiters are emitted immediately, and every other accepted
// rune selects the state that lexes the rest of the token. Any other rune is
// an error.
func (l *lexer) stateBegin(r rune) lexState {
	if isWhitespace(r) {
		return lexIgnore
	}

	switch r {
	// delimiters are complete tokens on their own
	case '{':
		l.token = tokenMapStart
		return lexEnd
	case '}':
		l.token = tokenMapEnd
		return lexEnd
	case '[':
		l.token = tokenVectorStart
		return lexEnd
	case ']':
		l.token = tokenVectorEnd
		return lexEnd
	case '(':
		l.token = tokenListStart
		return lexEnd
	case ')':
		l.token = tokenListEnd
		return lexEnd

	// runes that start a multi-rune token
	case '#':
		l.state = l.statePound
		return lexCont
	case ':':
		l.state = l.stateKeyword
		return lexCont
	case '/':
		// a lone slash is a symbol and must be followed by a delimiter
		l.token = tokenSymbol
		l.state = l.stateEndLit
		return lexCont
	case '+':
		l.state = l.statePos
		return lexCont
	case '-':
		l.state = l.stateNeg
		return lexCont
	case '.':
		l.token = tokenSymbol
		l.state = l.stateDotPre
		return lexCont
	case '"':
		l.state = l.stateInString
		return lexCont
	case '\\':
		l.state = l.stateChar
		return lexCont
	case '0':
		l.state = l.state0
		return lexCont
	case ';':
		l.state = l.stateComment
		return lexIgnore
	}

	switch {
	case '1' <= r && r <= '9':
		l.state = l.state1
		return lexCont
	case okSymbolFirst(r) || u.IsLetter(r):
		l.token = tokenSymbol
		l.state = l.stateSym
		return lexCont
	}
	return l.error(r, "- unexpected rune")
}
// stateComment is the state inside a ';' comment. Every rune is ignored; a
// newline additionally sends the lexer back to stateBegin.
func (l *lexer) stateComment(r rune) lexState {
	if r != '\n' {
		return lexIgnore
	}
	l.state = l.stateBegin
	return lexIgnore
}
// stateEndLit is the state after a complete literal. The literal is only
// valid if the next rune is whitespace or one of the runes listed below, in
// which case lexEndPrev is returned; any other rune is an error.
func (l *lexer) stateEndLit(r rune) lexState {
	switch r {
	case '"', '{', '[', '(', ')', ']', '}', '\\', ';':
		return lexEndPrev
	}
	if isWhitespace(r) {
		return lexEndPrev
	}
	return l.error(r, "- unexpected rune after legal "+l.token.String())
}
// stateKeyword is the state after reading the ':' that starts a keyword.
// A second ':' or a '/' is rejected with a dedicated error message; any
// symbol constituent, letter or ASCII digit turns the token into a keyword
// and hands over to stateSym.
func (l *lexer) stateKeyword(r rune) lexState {
	var problem string
	switch r {
	case ':':
		problem = "EDN does not support namespace-qualified keywords"
	case '/':
		problem = "keywords cannot begin with /"
	}
	if problem != "" {
		l.state = l.stateError
		l.err = &SyntaxError{problem, l.position}
		return lexError
	}

	if okSymbol(r) || u.IsLetter(r) || ('0' <= r && r <= '9') {
		l.token = tokenKeyword
		l.state = l.stateSym
		return lexCont
	}
	return l.error(r, "after keyword start")
}
// stateSym is the state inside a name, for example after reading 'foo' or
// 'bar'. It is shared by symbols, keywords and tags, so it does not set the
// token type itself: the state that switches to stateSym sets it first.
func (l *lexer) stateSym(r rune) lexState {
	if r == '/' {
		l.state = l.stateSlash
		return lexCont
	}
	if okSymbol(r) || u.IsLetter(r) || ('0' <= r && r <= '9') {
		l.state = l.stateSym
		return lexCont
	}
	return l.stateEndLit(r)
}
// stateSlash is the state directly after the '/' of a namespaced name, for
// example after reading 'foo/'. At least one name rune must follow.
func (l *lexer) stateSlash(r rune) lexState {
	nameRune := okSymbol(r) || u.IsLetter(r) || ('0' <= r && r <= '9')
	if !nameRune {
		return l.error(r, "directly after '/' in namespaced symbol")
	}
	l.state = l.statePostSlash
	return lexCont
}
// statePostSlash is the state inside the name part of a namespaced name,
// for example after reading 'foo/bar'. Further name runes are consumed;
// anything else must terminate the literal.
func (l *lexer) statePostSlash(r rune) lexState {
	nameRune := okSymbol(r) || u.IsLetter(r) || ('0' <= r && r <= '9')
	if !nameRune {
		return l.stateEndLit(r)
	}
	l.state = l.statePostSlash
	return lexCont
}
// stateNeg is the state after reading a leading '-'. A digit continues as a
// number; everything else makes the token a symbol, which either continues
// as a name or ends right here.
func (l *lexer) stateNeg(r rune) lexState {
	if r == '0' {
		l.state = l.state0
		return lexCont
	}
	if '1' <= r && r <= '9' {
		l.state = l.state1
		return lexCont
	}

	// not a number, so the '-' starts (or is) a symbol
	l.token = tokenSymbol
	switch {
	case okSymbol(r) || u.IsLetter(r):
		l.state = l.stateSym
		return lexCont
	case r == '/':
		l.state = l.stateSlash
		return lexCont
	}
	return l.stateEndLit(r)
}
// statePos is the state after reading a leading '+'. A digit continues as a
// number; everything else makes the token a symbol, which either continues
// as a name or ends right here.
func (l *lexer) statePos(r rune) lexState {
	if r == '0' {
		l.state = l.state0
		return lexCont
	}
	if '1' <= r && r <= '9' {
		l.state = l.state1
		return lexCont
	}

	// not a number, so the '+' starts (or is) a symbol
	l.token = tokenSymbol
	switch {
	case okSymbol(r) || u.IsLetter(r):
		l.state = l.stateSym
		return lexCont
	case r == '/':
		l.state = l.stateSlash
		return lexCont
	}
	return l.stateEndLit(r)
}
// state0 is the state after reading an integer whose value is '0'. state1
// also delegates here once it sees a non-digit. The rune may start a
// fraction or exponent, or be an 'M' (bigdecimal) or 'N' (bigint) suffix;
// otherwise the integer is complete and the rune must end the literal.
func (l *lexer) state0(r rune) lexState {
	switch r {
	case '.':
		l.state = l.stateDot
		return lexCont
	case 'e', 'E':
		l.state = l.stateE
		return lexCont
	case 'M':
		// bigdecimal: only whitespace or a delimiter may follow
		l.token = tokenFloat
		l.state = l.stateEndLit
		return lexCont
	case 'N':
		// bigint: only whitespace or a delimiter may follow
		l.token = tokenInt
		l.state = l.stateEndLit
		return lexCont
	}
	l.token = tokenInt
	return l.stateEndLit(r)
}
// state1 is the state inside an integer that does not start with 0, for
// example '10' or '34'. Further digits are consumed; any other rune is
// handled exactly as after a lone '0'.
func (l *lexer) state1(r rune) lexState {
	if r < '0' || r > '9' {
		return l.state0(r)
	}
	return lexCont
}
// stateDotPre is the state after reading a leading '.'. Only non-numeric
// runes are accepted here: a name rune or '/' continues the symbol, and
// anything else (digits included) must be a valid end of the literal.
func (l *lexer) stateDotPre(r rune) lexState {
	var next func(rune) lexState
	switch {
	case okSymbol(r) || u.IsLetter(r):
		next = l.stateSym
	case r == '/':
		next = l.stateSlash
	default:
		return l.stateEndLit(r)
	}
	l.token = tokenSymbol
	l.state = next
	return lexCont
}
// stateDot is the state after reading the integer part of a number and the
// decimal point, for example '12.'. At least one digit must follow.
func (l *lexer) stateDot(r rune) lexState {
	if r < '0' || r > '9' {
		// TODO (?): The spec says that there must be numbers after the dot, yet
		// (clojure.edn/read-string "1.e1") returns 10.0
		return l.error(r, "after decimal point in numeric literal")
	}
	l.state = l.stateDot0
	return lexCont
}
// stateDot0 is the state after reading the decimal point and at least one
// fractional digit, for example '12.34'. More digits, an exponent or an 'M'
// suffix may follow; anything else must end the float literal.
func (l *lexer) stateDot0(r rune) lexState {
	switch r {
	case 'e', 'E':
		l.state = l.stateE
		return lexCont
	case 'M':
		l.token = tokenFloat
		l.state = l.stateEndLit
		return lexCont
	}
	if '0' <= r && r <= '9' {
		return lexCont
	}
	l.token = tokenFloat
	return l.stateEndLit(r)
}
// stateE is the state after reading the mantissa and the exponent marker of
// a number, such as `314e` or `0.314e`. An optional sign is consumed here;
// any other rune is treated as if the sign had been skipped.
func (l *lexer) stateE(r rune) lexState {
	switch r {
	case '+', '-':
		l.state = l.stateESign
		return lexCont
	}
	return l.stateESign(r)
}
// stateESign is the state after reading the mantissa, the exponent marker
// and its optional sign, such as `314e-` or `0.314e+`. The exponent needs
// at least one digit, so anything but a digit is an error.
func (l *lexer) stateESign(r rune) lexState {
	if r < '0' || r > '9' {
		return l.error(r, "in exponent of numeric literal")
	}
	l.state = l.stateE0
	return lexCont
}
// stateE0 is the state after reading the mantissa, the exponent marker, the
// optional sign and at least one exponent digit, such as `314e-2`,
// `0.314e+1` or `3.14e0`. More digits or an 'M' suffix may follow; anything
// else must end the float literal.
func (l *lexer) stateE0(r rune) lexState {
	switch {
	case '0' <= r && r <= '9':
		return lexCont
	case r == 'M':
		l.token = tokenFloat
		l.state = l.stateEndLit
		return lexCont
	}
	l.token = tokenFloat
	return l.stateEndLit(r)
}
// Spelled-out names of the special characters that may follow a backslash.
// stateChar stores one of these in l.expecting after seeing its first rune,
// and stateSpecialChar matches the following runes against it.
var (
newlineRunes = []rune("newline")
returnRunes = []rune("return")
spaceRunes = []rune("space")
tabRunes = []rune("tab")
formfeedRunes = []rune("formfeed")
)
// stateChar is the state after reading a backslash ('\'), which starts a
// character literal. The first rune may begin one of the named characters
// (newline, return, space, tab, formfeed) or a \u escape; whitespace is an
// error; any other rune is taken as a single-rune character.
func (l *lexer) stateChar(r rune) lexState {
	// None of the special names share a first rune, so that rune alone
	// picks the name to match against.
	var name []rune
	switch r {
	case 'n':
		name = newlineRunes
	case 'r':
		name = returnRunes
	case 's':
		name = spaceRunes
	case 't':
		name = tabRunes
	case 'f':
		name = formfeedRunes
	case 'u':
		l.count = 0
		l.state = l.stateUnicodeChar
		return lexCont
	}
	if name != nil {
		l.count = 1 // the first rune of the name is already matched
		l.expecting = name
		l.state = l.stateSpecialChar
		return lexCont
	}

	if isWhitespace(r) {
		l.state = l.stateError
		l.err = &SyntaxError{"backslash cannot be followed by whitespace", l.position}
		return lexError
	}

	// default is a single-rune character
	l.token = tokenChar
	l.state = l.stateEndLit
	return lexCont
}
// stateSpecialChar is the state while matching a named character such as
// \newline. l.expecting holds the full name and l.count the number of runes
// matched so far. A mismatch right after the first rune means the literal
// was just that single rune (like \n or \t); a later mismatch is an error.
func (l *lexer) stateSpecialChar(r rune) lexState {
	if r != l.expecting[l.count] {
		if l.count != 1 {
			return l.error(r, "after start of special character")
		}
		// it is likely just a normal character, like 'n' or 't'
		l.token = tokenChar
		return l.stateEndLit(r)
	}

	l.count++
	if l.count == len(l.expecting) {
		// whole name matched; only a terminator may follow
		l.token = tokenChar
		l.state = l.stateEndLit
	}
	return lexCont
}
// stateUnicodeChar is the state after reading `\u` in a character literal.
// l.count tracks how many hexadecimal digits have been read. Four digits
// complete the character; a non-hex rune is an error once at least one digit
// has been read, and otherwise means the literal was the plain character 'u'.
func (l *lexer) stateUnicodeChar(r rune) lexState {
	isHex := ('0' <= r && r <= '9') || ('a' <= r && r <= 'f') || ('A' <= r && r <= 'F')
	switch {
	case isHex:
		l.count++
		if l.count == 4 {
			l.token = tokenChar
			l.state = l.stateEndLit
		}
		return lexCont
	case l.count != 0:
		return l.error(r, "after start of unicode character")
	}
	// likely just '\u'
	l.token = tokenChar
	return l.stateEndLit(r)
}
// stateInString is the state after reading the opening `"` of a string.
// The closing quote ends the token, a backslash starts an escape sequence,
// and every other rune is part of the string.
func (l *lexer) stateInString(r rune) lexState {
	switch r {
	case '"':
		l.token = tokenString
		return lexEnd
	case '\\':
		l.state = l.stateInStringEsc
	}
	return lexCont
}
// stateInStringEsc is the state after reading `"\` inside a quoted string.
// A 'u' starts a four-digit hexadecimal escape, the single-rune escapes
// return to stateInString, and any other rune is an error.
func (l *lexer) stateInStringEsc(r rune) lexState {
	switch r {
	case 'u':
		l.count = 0
		l.state = l.stateInStringEscU
		return lexCont
	case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
		l.state = l.stateInString
		return lexCont
	default:
		return l.error(r, "in string escape code")
	}
}
// stateInStringEscU is the state after reading `"\u` and l.count
// hexadecimal digits inside a quoted string. Exactly four digits are
// required; after the fourth the lexer returns to stateInString.
func (l *lexer) stateInStringEscU(r rune) lexState {
	isHex := ('0' <= r && r <= '9') || ('a' <= r && r <= 'f') || ('A' <= r && r <= 'F')
	if !isHex {
		return l.error(r, "in \\u hexadecimal character escape")
	}
	l.count++
	if l.count == 4 {
		l.state = l.stateInString
	}
	return lexCont
}
// statePound is the state after reading the character '#'. It recognises
// the discard token `#_`, the set opener `#{`, and a tag, which must start
// with a letter and is then lexed like a symbol by stateSym.
func (l *lexer) statePound(r rune) lexState {
	switch r {
	case '_':
		l.token = tokenDiscard
		return lexEnd
	case '{':
		l.token = tokenSetStart
		return lexEnd
	}
	if u.IsLetter(r) {
		l.token = tokenTag
		l.state = l.stateSym
		return lexCont
	}
	return l.error(r, `after token starting with "#"`)
}
// stateError is the state the lexer is put in once an error has been
// recorded. It reports lexError for every rune.
func (l *lexer) stateError(r rune) lexState { return lexError }
// error records a SyntaxError about the invalid rune r, switches the lexer
// to the error state, and returns lexError. context is appended to the
// message and should say where the rune was encountered.
func (l *lexer) error(r rune, context string) lexState {
	msg := "invalid character " + quoteRune(r) + " " + context
	l.err = &SyntaxError{msg, l.position}
	l.state = l.stateError
	return lexError
}
// quoteRune formats r as a single-quoted Go character literal for use in
// error messages, e.g. 'a', '\n', '"' or '\”.
//
// strconv.QuoteRune already escapes the single quote and leaves the double
// quote bare, so the string-quoting workaround and its two special cases are
// not needed.
func quoteRune(r rune) string {
	return strconv.QuoteRune(r)
}