604 lines
13 KiB
Go
604 lines
13 KiB
Go
|
// Copyright 2015 Jean Niklas L'orange. All rights reserved.
|
||
|
// Use of this source code is governed by a BSD-style
|
||
|
// license that can be found in the LICENSE file.
|
||
|
|
||
|
package edn
|
||
|
|
||
|
import (
|
||
|
"strconv"
|
||
|
u "unicode"
|
||
|
)
|
||
|
|
||
|
// lexState is the verdict a lexer state function returns for each rune it is
// given.
type lexState int

const (
	// lexCont means the token is not finished yet: continue reading.
	lexCont lexState = iota
	// lexIgnore marks input that can be ignored; at the moment that is just
	// whitespace and comments.
	lexIgnore
	// lexEnd means the value ended with the rune that was just given.
	lexEnd
	// lexEndPrev means the value ended with the previous rune.
	lexEndPrev
	// lexError means the input was erroneous.
	lexError
)
|
||
|
|
||
|
// tokenType identifies the kind of token the lexer has recognised.
type tokenType int

// Value types produced by the lexer.
const (
	tokenSymbol tokenType = iota
	tokenKeyword
	tokenString
	tokenInt
	tokenFloat
	tokenTag
	tokenChar
	tokenListStart
	tokenListEnd
	tokenVectorStart
	tokenVectorEnd
	tokenMapStart
	tokenMapEnd
	tokenSetStart
	tokenDiscard

	tokenError
)

// tokenTypeNames maps every declared tokenType to its human-readable name.
var tokenTypeNames = [...]string{
	tokenSymbol:      "symbol",
	tokenKeyword:     "keyword",
	tokenString:      "string",
	tokenInt:         "integer",
	tokenFloat:       "float",
	tokenTag:         "tag",
	tokenChar:        "character",
	tokenListStart:   "list start",
	tokenListEnd:     "list end",
	tokenVectorStart: "vector start",
	tokenVectorEnd:   "vector end",
	tokenMapStart:    "map start",
	tokenMapEnd:      "map/set end",
	tokenSetStart:    "set start",
	tokenDiscard:     "discard token",
	tokenError:       "error",
}

// String returns a human-readable name for t, or "[unknown]" when t is not
// one of the declared token types.
func (t tokenType) String() string {
	if t < 0 || int(t) >= len(tokenTypeNames) {
		return "[unknown]"
	}
	return tokenTypeNames[t]
}
|
||
|
|
||
|
const tokenSetEnd = tokenMapEnd // sets end the same way as maps do
|
||
|
|
||
|
// SyntaxError describes a syntax error found in EDN input.
type SyntaxError struct {
	// msg is the description of the error.
	msg string
	// Offset is the number of bytes that had been read when the error
	// occurred.
	Offset int64
}

// Error returns the description of the syntax error, which makes
// *SyntaxError satisfy the error interface.
func (e *SyntaxError) Error() string {
	return e.msg
}
|
||
|
|
||
|
// okSymbolFirst reports whether r is one of the punctuation runes accepted by
// this check: . * + ! - _ ? $ % & = < >
// Letters are not covered here; callers test for them separately.
func okSymbolFirst(r rune) bool {
	for _, allowed := range ".*+!-_?$%&=<>" {
		if r == allowed {
			return true
		}
	}
	return false
}
|
||
|
|
||
|
// okSymbol reports whether r is one of the punctuation runes accepted by this
// check: . * + ! - _ ? $ % & = < > : # '
// Letters and digits are not covered here; callers test for them separately.
func okSymbol(r rune) bool {
	for _, allowed := range ".*+!-_?$%&=<>:#'" {
		if r == allowed {
			return true
		}
	}
	return false
}
|
||
|
|
||
|
// isWhitespace reports whether r is treated as whitespace by the lexer: a
// comma, or any rune for which unicode.IsSpace is true.
func isWhitespace(r rune) bool {
	if r == ',' {
		return true
	}
	return u.IsSpace(r)
}
|
||
|
|
||
|
type lexer struct {
|
||
|
state func(rune) lexState
|
||
|
err error
|
||
|
position int64
|
||
|
token tokenType
|
||
|
|
||
|
count int // counter is used in some functions within the lexer
|
||
|
expecting []rune // expecting is used to avoid duplication when we expect e.g. \newline
|
||
|
}
|
||
|
|
||
|
func (l *lexer) reset() {
|
||
|
l.state = l.stateBegin
|
||
|
l.token = tokenType(-1)
|
||
|
l.err = nil
|
||
|
}
|
||
|
|
||
|
func (l *lexer) eof() lexState {
|
||
|
if l.err != nil {
|
||
|
return lexError
|
||
|
}
|
||
|
lt := l.state(' ')
|
||
|
if lt == lexCont {
|
||
|
l.err = &SyntaxError{"unexpected end of EDN input", l.position}
|
||
|
lt = lexError
|
||
|
}
|
||
|
if l.err != nil {
|
||
|
return lexError
|
||
|
}
|
||
|
if lt == lexEndPrev {
|
||
|
return lexEnd
|
||
|
}
|
||
|
return lt
|
||
|
}
|
||
|
|
||
|
func (l *lexer) stateBegin(r rune) lexState {
|
||
|
switch {
|
||
|
case isWhitespace(r):
|
||
|
return lexIgnore
|
||
|
case r == '{':
|
||
|
l.token = tokenMapStart
|
||
|
return lexEnd
|
||
|
case r == '}':
|
||
|
l.token = tokenMapEnd
|
||
|
return lexEnd
|
||
|
case r == '[':
|
||
|
l.token = tokenVectorStart
|
||
|
return lexEnd
|
||
|
case r == ']':
|
||
|
l.token = tokenVectorEnd
|
||
|
return lexEnd
|
||
|
case r == '(':
|
||
|
l.token = tokenListStart
|
||
|
return lexEnd
|
||
|
case r == ')':
|
||
|
l.token = tokenListEnd
|
||
|
return lexEnd
|
||
|
case r == '#':
|
||
|
l.state = l.statePound
|
||
|
return lexCont
|
||
|
case r == ':':
|
||
|
l.state = l.stateKeyword
|
||
|
return lexCont
|
||
|
case r == '/': // ohh, the lovely slash edge case
|
||
|
l.token = tokenSymbol
|
||
|
l.state = l.stateEndLit
|
||
|
return lexCont
|
||
|
case r == '+':
|
||
|
l.state = l.statePos
|
||
|
return lexCont
|
||
|
case r == '-':
|
||
|
l.state = l.stateNeg
|
||
|
return lexCont
|
||
|
case r == '.':
|
||
|
l.token = tokenSymbol
|
||
|
l.state = l.stateDotPre
|
||
|
return lexCont
|
||
|
case r == '"':
|
||
|
l.state = l.stateInString
|
||
|
return lexCont
|
||
|
case r == '\\':
|
||
|
l.state = l.stateChar
|
||
|
return lexCont
|
||
|
case okSymbolFirst(r) || u.IsLetter(r):
|
||
|
l.token = tokenSymbol
|
||
|
l.state = l.stateSym
|
||
|
return lexCont
|
||
|
case '0' < r && r <= '9':
|
||
|
l.state = l.state1
|
||
|
return lexCont
|
||
|
case r == '0':
|
||
|
l.state = l.state0
|
||
|
return lexCont
|
||
|
case r == ';':
|
||
|
l.state = l.stateComment
|
||
|
return lexIgnore
|
||
|
}
|
||
|
return l.error(r, "- unexpected rune")
|
||
|
}
|
||
|
|
||
|
func (l *lexer) stateComment(r rune) lexState {
|
||
|
if r == '\n' {
|
||
|
l.state = l.stateBegin
|
||
|
}
|
||
|
return lexIgnore
|
||
|
}
|
||
|
|
||
|
func (l *lexer) stateEndLit(r rune) lexState {
|
||
|
if isWhitespace(r) || r == '"' || r == '{' || r == '[' || r == '(' || r == ')' || r == ']' || r == '}' || r == '\\' || r == ';' {
|
||
|
return lexEndPrev
|
||
|
}
|
||
|
return l.error(r, "- unexpected rune after legal "+l.token.String())
|
||
|
}
|
||
|
|
||
|
func (l *lexer) stateKeyword(r rune) lexState {
|
||
|
switch {
|
||
|
case r == ':':
|
||
|
l.state = l.stateError
|
||
|
l.err = &SyntaxError{"EDN does not support namespace-qualified keywords", l.position}
|
||
|
return lexError
|
||
|
case r == '/':
|
||
|
l.state = l.stateError
|
||
|
l.err = &SyntaxError{"keywords cannot begin with /", l.position}
|
||
|
return lexError
|
||
|
case okSymbol(r) || u.IsLetter(r) || ('0' <= r && r <= '9'):
|
||
|
l.token = tokenKeyword
|
||
|
l.state = l.stateSym
|
||
|
return lexCont
|
||
|
}
|
||
|
return l.error(r, "after keyword start")
|
||
|
}
|
||
|
|
||
|
// examples: 'foo' 'bar'
|
||
|
// we reuse this from the keyword states, so we don't set token at the end,
|
||
|
// but before we call this
|
||
|
func (l *lexer) stateSym(r rune) lexState {
|
||
|
switch {
|
||
|
case okSymbol(r) || u.IsLetter(r) || ('0' <= r && r <= '9'):
|
||
|
l.state = l.stateSym
|
||
|
return lexCont
|
||
|
case r == '/':
|
||
|
l.state = l.stateSlash
|
||
|
return lexCont
|
||
|
}
|
||
|
return l.stateEndLit(r)
|
||
|
}
|
||
|
|
||
|
// example: 'foo/'
|
||
|
func (l *lexer) stateSlash(r rune) lexState {
|
||
|
switch {
|
||
|
case okSymbol(r) || u.IsLetter(r) || ('0' <= r && r <= '9'):
|
||
|
l.state = l.statePostSlash
|
||
|
return lexCont
|
||
|
}
|
||
|
return l.error(r, "directly after '/' in namespaced symbol")
|
||
|
}
|
||
|
|
||
|
// example : 'foo/bar'
|
||
|
func (l *lexer) statePostSlash(r rune) lexState {
|
||
|
switch {
|
||
|
case okSymbol(r) || u.IsLetter(r) || ('0' <= r && r <= '9'):
|
||
|
l.state = l.statePostSlash
|
||
|
return lexCont
|
||
|
}
|
||
|
return l.stateEndLit(r)
|
||
|
}
|
||
|
|
||
|
// example: '-'
|
||
|
func (l *lexer) stateNeg(r rune) lexState {
|
||
|
switch {
|
||
|
case r == '0':
|
||
|
l.state = l.state0
|
||
|
return lexCont
|
||
|
case '1' <= r && r <= '9':
|
||
|
l.state = l.state1
|
||
|
return lexCont
|
||
|
case okSymbol(r) || u.IsLetter(r):
|
||
|
l.token = tokenSymbol
|
||
|
l.state = l.stateSym
|
||
|
return lexCont
|
||
|
case r == '/':
|
||
|
l.token = tokenSymbol
|
||
|
l.state = l.stateSlash
|
||
|
return lexCont
|
||
|
}
|
||
|
l.token = tokenSymbol
|
||
|
return l.stateEndLit(r)
|
||
|
}
|
||
|
|
||
|
// example: '+'
|
||
|
func (l *lexer) statePos(r rune) lexState {
|
||
|
switch {
|
||
|
case r == '0':
|
||
|
l.state = l.state0
|
||
|
return lexCont
|
||
|
case '1' <= r && r <= '9':
|
||
|
l.state = l.state1
|
||
|
return lexCont
|
||
|
case okSymbol(r) || u.IsLetter(r):
|
||
|
l.token = tokenSymbol
|
||
|
l.state = l.stateSym
|
||
|
return lexCont
|
||
|
case r == '/':
|
||
|
l.token = tokenSymbol
|
||
|
l.state = l.stateSlash
|
||
|
return lexCont
|
||
|
}
|
||
|
l.token = tokenSymbol
|
||
|
return l.stateEndLit(r)
|
||
|
}
|
||
|
|
||
|
// value is '0'
|
||
|
func (l *lexer) state0(r rune) lexState {
|
||
|
switch {
|
||
|
case r == '.':
|
||
|
l.state = l.stateDot
|
||
|
return lexCont
|
||
|
case r == 'e' || r == 'E':
|
||
|
l.state = l.stateE
|
||
|
return lexCont
|
||
|
case r == 'M': // bigdecimal
|
||
|
l.token = tokenFloat
|
||
|
l.state = l.stateEndLit
|
||
|
return lexCont // must be ws or delimiter afterwards
|
||
|
case r == 'N': // bigint
|
||
|
l.token = tokenInt
|
||
|
l.state = l.stateEndLit
|
||
|
return lexCont // must be ws or delimiter afterwards
|
||
|
}
|
||
|
l.token = tokenInt
|
||
|
return l.stateEndLit(r)
|
||
|
}
|
||
|
|
||
|
// anything but a result starting with 0. example '10', '34'
|
||
|
func (l *lexer) state1(r rune) lexState {
|
||
|
if '0' <= r && r <= '9' {
|
||
|
return lexCont
|
||
|
}
|
||
|
return l.state0(r)
|
||
|
}
|
||
|
|
||
|
// example: '.', can only receive non-numerics here
|
||
|
func (l *lexer) stateDotPre(r rune) lexState {
|
||
|
switch {
|
||
|
case okSymbol(r) || u.IsLetter(r):
|
||
|
l.token = tokenSymbol
|
||
|
l.state = l.stateSym
|
||
|
return lexCont
|
||
|
case r == '/':
|
||
|
l.token = tokenSymbol
|
||
|
l.state = l.stateSlash
|
||
|
return lexCont
|
||
|
}
|
||
|
return l.stateEndLit(r)
|
||
|
}
|
||
|
|
||
|
// after reading numeric values plus '.', example: '12.'
|
||
|
func (l *lexer) stateDot(r rune) lexState {
|
||
|
if '0' <= r && r <= '9' {
|
||
|
l.state = l.stateDot0
|
||
|
return lexCont
|
||
|
}
|
||
|
// TODO (?): The spec says that there must be numbers after the dot, yet
|
||
|
// (clojure.edn/read-string "1.e1") returns 10.0
|
||
|
return l.error(r, "after decimal point in numeric literal")
|
||
|
}
|
||
|
|
||
|
// after reading numeric values plus '.', example: '12.34'
|
||
|
func (l *lexer) stateDot0(r rune) lexState {
|
||
|
switch {
|
||
|
case '0' <= r && r <= '9':
|
||
|
return lexCont
|
||
|
case r == 'e' || r == 'E':
|
||
|
l.state = l.stateE
|
||
|
return lexCont
|
||
|
case r == 'M':
|
||
|
l.token = tokenFloat
|
||
|
l.state = l.stateEndLit
|
||
|
return lexCont
|
||
|
}
|
||
|
l.token = tokenFloat
|
||
|
return l.stateEndLit(r)
|
||
|
}
|
||
|
|
||
|
// stateE is the state after reading the mantissa and e in a number,
|
||
|
// such as after reading `314e` or `0.314e`.
|
||
|
func (l *lexer) stateE(r rune) lexState {
|
||
|
if r == '+' || r == '-' {
|
||
|
l.state = l.stateESign
|
||
|
return lexCont
|
||
|
}
|
||
|
return l.stateESign(r)
|
||
|
}
|
||
|
|
||
|
// stateESign is the state after reading the mantissa, e, and sign in a number,
|
||
|
// such as after reading `314e-` or `0.314e+`.
|
||
|
func (l *lexer) stateESign(r rune) lexState {
|
||
|
if '0' <= r && r <= '9' {
|
||
|
l.state = l.stateE0
|
||
|
return lexCont
|
||
|
}
|
||
|
return l.error(r, "in exponent of numeric literal")
|
||
|
}
|
||
|
|
||
|
// stateE0 is the state after reading the mantissa, e, optional sign,
|
||
|
// and at least one digit of the exponent in a number,
|
||
|
// such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
|
||
|
func (l *lexer) stateE0(r rune) lexState {
|
||
|
if '0' <= r && r <= '9' {
|
||
|
return lexCont
|
||
|
}
|
||
|
if r == 'M' {
|
||
|
l.token = tokenFloat
|
||
|
l.state = l.stateEndLit
|
||
|
return lexCont
|
||
|
}
|
||
|
l.token = tokenFloat
|
||
|
return l.stateEndLit(r)
|
||
|
}
|
||
|
|
||
|
// Names of the multi-rune character literals the lexer recognises after a
// backslash, stored as rune slices so they can be matched one rune at a time.
var (
	formfeedRunes = []rune("formfeed")
	newlineRunes  = []rune("newline")
	returnRunes   = []rune("return")
	spaceRunes    = []rune("space")
	tabRunes      = []rune("tab")
)
|
||
|
|
||
|
// stateChar after a backslash ('\')
|
||
|
func (l *lexer) stateChar(r rune) lexState {
|
||
|
switch {
|
||
|
// oh my, I'm so happy that none of these share the same prefix.
|
||
|
case r == 'n':
|
||
|
l.count = 1
|
||
|
l.expecting = newlineRunes
|
||
|
l.state = l.stateSpecialChar
|
||
|
return lexCont
|
||
|
case r == 'r':
|
||
|
l.count = 1
|
||
|
l.expecting = returnRunes
|
||
|
l.state = l.stateSpecialChar
|
||
|
return lexCont
|
||
|
case r == 's':
|
||
|
l.count = 1
|
||
|
l.expecting = spaceRunes
|
||
|
l.state = l.stateSpecialChar
|
||
|
return lexCont
|
||
|
case r == 't':
|
||
|
l.count = 1
|
||
|
l.expecting = tabRunes
|
||
|
l.state = l.stateSpecialChar
|
||
|
return lexCont
|
||
|
case r == 'f':
|
||
|
l.count = 1
|
||
|
l.expecting = formfeedRunes
|
||
|
l.state = l.stateSpecialChar
|
||
|
return lexCont
|
||
|
case r == 'u':
|
||
|
l.count = 0
|
||
|
l.state = l.stateUnicodeChar
|
||
|
return lexCont
|
||
|
case isWhitespace(r):
|
||
|
l.state = l.stateError
|
||
|
l.err = &SyntaxError{"backslash cannot be followed by whitespace", l.position}
|
||
|
return lexError
|
||
|
}
|
||
|
// default is single name character
|
||
|
l.token = tokenChar
|
||
|
l.state = l.stateEndLit
|
||
|
return lexCont
|
||
|
}
|
||
|
|
||
|
func (l *lexer) stateSpecialChar(r rune) lexState {
|
||
|
if r == l.expecting[l.count] {
|
||
|
l.count++
|
||
|
if l.count == len(l.expecting) {
|
||
|
l.token = tokenChar
|
||
|
l.state = l.stateEndLit
|
||
|
return lexCont
|
||
|
}
|
||
|
return lexCont
|
||
|
}
|
||
|
if l.count != 1 {
|
||
|
return l.error(r, "after start of special character")
|
||
|
}
|
||
|
// it is likely just a normal character, like 'n' or 't'
|
||
|
l.token = tokenChar
|
||
|
return l.stateEndLit(r)
|
||
|
}
|
||
|
|
||
|
func (l *lexer) stateUnicodeChar(r rune) lexState {
|
||
|
if '0' <= r && r <= '9' || 'a' <= r && r <= 'f' || 'A' <= r && r <= 'F' {
|
||
|
l.count++
|
||
|
if l.count == 4 {
|
||
|
l.token = tokenChar
|
||
|
l.state = l.stateEndLit
|
||
|
}
|
||
|
return lexCont
|
||
|
}
|
||
|
if l.count != 0 {
|
||
|
return l.error(r, "after start of unicode character")
|
||
|
}
|
||
|
// likely just '\u'
|
||
|
l.token = tokenChar
|
||
|
return l.stateEndLit(r)
|
||
|
}
|
||
|
|
||
|
// stateInString is the state after reading `"`.
|
||
|
func (l *lexer) stateInString(r rune) lexState {
|
||
|
if r == '"' {
|
||
|
l.token = tokenString
|
||
|
return lexEnd
|
||
|
}
|
||
|
if r == '\\' {
|
||
|
l.state = l.stateInStringEsc
|
||
|
return lexCont
|
||
|
}
|
||
|
return lexCont
|
||
|
}
|
||
|
|
||
|
// stateInStringEsc is the state after reading `"\` during a quoted string.
|
||
|
func (l *lexer) stateInStringEsc(r rune) lexState {
|
||
|
switch r {
|
||
|
case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
|
||
|
l.state = l.stateInString
|
||
|
return lexCont
|
||
|
case 'u':
|
||
|
l.state = l.stateInStringEscU
|
||
|
l.count = 0
|
||
|
return lexCont
|
||
|
}
|
||
|
return l.error(r, "in string escape code")
|
||
|
}
|
||
|
|
||
|
// stateInStringEscU is the state after reading `"\u` and l.count elements in a
|
||
|
// quoted string.
|
||
|
func (l *lexer) stateInStringEscU(r rune) lexState {
|
||
|
if '0' <= r && r <= '9' || 'a' <= r && r <= 'f' || 'A' <= r && r <= 'F' {
|
||
|
l.count++
|
||
|
if l.count == 4 {
|
||
|
l.state = l.stateInString
|
||
|
}
|
||
|
return lexCont
|
||
|
}
|
||
|
// numbers
|
||
|
return l.error(r, "in \\u hexadecimal character escape")
|
||
|
}
|
||
|
|
||
|
// after reading the character '#'
|
||
|
func (l *lexer) statePound(r rune) lexState {
|
||
|
switch {
|
||
|
case r == '_':
|
||
|
l.token = tokenDiscard
|
||
|
return lexEnd
|
||
|
case r == '{':
|
||
|
l.token = tokenSetStart
|
||
|
return lexEnd
|
||
|
case u.IsLetter(r):
|
||
|
l.token = tokenTag
|
||
|
l.state = l.stateSym
|
||
|
return lexCont
|
||
|
}
|
||
|
return l.error(r, `after token starting with "#"`)
|
||
|
}
|
||
|
|
||
|
func (l *lexer) stateError(r rune) lexState {
|
||
|
return lexError
|
||
|
}
|
||
|
|
||
|
// error records an error and switches to the error state.
|
||
|
func (l *lexer) error(r rune, context string) lexState {
|
||
|
l.state = l.stateError
|
||
|
l.err = &SyntaxError{"invalid character " + quoteRune(r) + " " + context, l.position}
|
||
|
return lexError
|
||
|
}
|
||
|
|
||
|
// quoteRune formats r as a single-quoted rune literal, for use in error
// messages. A single quote and a backslash are backslash-escaped, a double
// quote is left as it is, and non-printable runes are written as Go escape
// sequences.
func quoteRune(r rune) string {
	// strconv.QuoteRune yields the same text as quoting the one-rune string
	// and swapping the surrounding quotation marks.
	return strconv.QuoteRune(r)
}
|