import std/[unicode, json], faststreams/inputs, types export inputs, types {.push raises: [Defect].} type CustomIntHandler* = ##\ ## Custom decimal integer parser, result values need to be captured proc(dgt: int) {.gcsafe, raises: [Defect].} CustomByteAction* = enum Continue ##\ ## Default initialisation when provided to a `CustomBlobHandler` parser\ ## function type via call-by-reference StopBeforeByte ##\ ## Stop feeding and do not consume the current `byte` argument StopSwallowByte ##\ ## Stop and discard current `byte` argument (e.g. the last double quote\ ## '"' for a genuine string parser.) CustomBlobHandler* = ##\ ## Custom text or binary parser, result values need to be captured. The\ ## second argument `what` controlls the next action. proc(b: byte; what: var CustomByteAction) {.gcsafe, raises: [Defect].} TokKind* = enum tkError, tkEof, tkString, tkInt, tkNegativeInt, tkFloat, tkTrue, tkFalse, tkNull, tkCurlyLe, tkCurlyRi, tkBracketLe, tkBracketRi, tkColon, tkComma, tkQuoted, ##\ ## unfinished/lazy type, eventally becomes `tkString` tkExBlob, ##\ ## externally held string value after successful custom parsing tkNumeric, ##\ ## unfinished/lazy type, any of `tkInt`, `tkNegativeInt`, `tkFloat` tkExInt, ##\ ## externally held non-negative integer value after successful custom\ ## parsing tkExNegInt ## externally held negative integer value after successful custom parsing JsonErrorKind* = enum errNone = "no error", errHexCharExpected = "hex char expected (part of escape sequence)", errStringExpected = "string expected", errColonExpected = "':' expected", errCommaExpected = "',' expected", errBracketRiExpected = "']' expected", errCurlyRiExpected = "'}' expected", errQuoteExpected = "'\"' or \"'\" expected", errNumberExpected = "number expected", errExponentTooLarge = "exponent too large", errUnexpectedEof = "unexpected end of file", errCommentExpected = "comment expected" errOrphanSurrogate = "unicode surrogates must be followed by another unicode character" errNonPortableInt = "number is outside the range of portable values" errCustomIntExpexted = "not a customised integer" errCustomBlobExpexted = "not a customised quoted blob" JsonLexer* = object stream*: InputStream mode*: JsonMode line*: int lineStartPos: int tokenStart: int tokKind: TokKind # formerly `tok`, now accessible by getter err*: JsonErrorKind absIntVal*: uint64 # BEWARE: negative integers will have tok == tkNegativeInt floatVal*: float strVal*: string const powersOfTen = [1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22] # TODO: this table should be much larger # The largest JSON number value is 1E308 # needed in renderTok() proc scanNumber(lexer: var JsonLexer) {.gcsafe, raises: [Defect,IOError].} proc scanString(lexer: var JsonLexer) {.gcsafe, raises: [Defect,IOError].} proc renderTok*(lexer: var JsonLexer, output: var string) {.gcsafe, raises: [Defect,IOError].} = # The lazy part case lexer.tokKind of tkNumeric: lexer.scanNumber of tkQuoted: lexer.scanString else: discard # The real stuff case lexer.tokKind of tkError, tkEof, tkNumeric, tkExInt, tkExNegInt, tkQuoted, tkExBlob: discard of tkString: output.add '"' lexer.strVal.escapeJsonUnquoted output output.add '"' of tkInt: output.add $lexer.absIntVal of tkNegativeInt: output.add '-' output.add $lexer.absIntVal of tkFloat: output.add $lexer.floatVal of tkTrue: output.add "true" of tkFalse: output.add "false" of tkNull: output.add "null" of tkCurlyLe: output.add '{' of tkCurlyRi: output.add '}' of tkBracketLe: output.add '[' of tkBracketRi: output.add ']' of tkColon: output.add ':' of tkComma: output.add ',' template peek(s: InputStream): char = char inputs.peek(s) template read(s: InputStream): char = char inputs.read(s) proc hexCharValue(c: char): int = case c of '0'..'9': ord(c) - ord('0') of 'a'..'f': ord(c) - ord('a') + 10 of 'A'..'F': ord(c) - ord('A') + 10 else: -1 proc isDigit(c: char): bool = return (c >= '0' and c <= '9') proc col*(lexer: JsonLexer): int = lexer.stream.pos - lexer.lineStartPos proc tokenStartCol*(lexer: JsonLexer): int = 1 + lexer.tokenStart - lexer.lineStartPos proc init*(T: type JsonLexer, stream: InputStream, mode = defaultJsonMode): T = T(stream: stream, mode: mode, line: 1, lineStartPos: 0, tokenStart: -1, tokKind: tkError, err: errNone, absIntVal: uint64 0, floatVal: 0'f, strVal: "") template error(error: JsonErrorKind) {.dirty.} = lexer.err = error lexer.tokKind = tkError return template checkForUnexpectedEof {.dirty.} = if not lexer.stream.readable: error errUnexpectedEof template requireNextChar(): char = checkForUnexpectedEof() lexer.stream.read() template checkForNonPortableInt(val: uint64; overflow: bool) = if overflow or (lexer.mode == Portable and val > uint64(maxPortableInt)): error errNonPortableInt proc scanHexRune(lexer: var JsonLexer): int {.gcsafe, raises: [Defect,IOError].} = for i in 0..3: let hexValue = hexCharValue requireNextChar() if hexValue == -1: error errHexCharExpected result = (result shl 4) or hexValue proc scanString(lexer: var JsonLexer) = lexer.tokKind = tkString lexer.strVal.setLen 0 lexer.tokenStart = lexer.stream.pos advance lexer.stream while true: var c = requireNextChar() case c of '"': break of '\\': c = requireNextChar() case c of '\\', '"', '\'', '/': lexer.strVal.add c of 'b': lexer.strVal.add '\b' of 'f': lexer.strVal.add '\f' of 'n': lexer.strVal.add '\n' of 'r': lexer.strVal.add '\r' of 't': lexer.strVal.add '\t' of 'v': lexer.strVal.add '\x0B' of '0': lexer.strVal.add '\x00' of 'u': var rune = lexer.scanHexRune() if lexer.tokKind == tkError: return # Deal with surrogates if (rune and 0xfc00) == 0xd800: if requireNextChar() != '\\': error errOrphanSurrogate if requireNextChar() != 'u': error errOrphanSurrogate let nextRune = lexer.scanHexRune() if lexer.tokKind == tkError: return if (nextRune and 0xfc00) == 0xdc00: rune = 0x10000 + (((rune - 0xd800) shl 10) or (nextRune - 0xdc00)) lexer.strVal.add toUTF8(Rune(rune)) else: # don't bother with the error lexer.strVal.add c of '\r', '\n': error errQuoteExpected else: lexer.strVal.add c proc handleLF(lexer: var JsonLexer) = advance lexer.stream lexer.line += 1 lexer.lineStartPos = lexer.stream.pos proc skipWhitespace(lexer: var JsonLexer) {.gcsafe, raises: [Defect,IOError].} = template handleCR = # Beware: this is a template, because the return # statement has to exit `skipWhitespace`. advance lexer.stream if not lexer.stream.readable: return if lexer.stream.peek() == '\n': advance lexer.stream lexer.line += 1 lexer.lineStartPos = lexer.stream.pos while lexer.stream.readable: case lexer.stream.peek() of '/': advance lexer.stream checkForUnexpectedEof() case lexer.stream.peek() of '/': advance lexer.stream while true: if not lexer.stream.readable: return case lexer.stream.peek() of '\r': handleCR() break of '\n': lexer.handleLF() break else: advance lexer.stream of '*': advance lexer.stream while true: if not lexer.stream.readable: return case lexer.stream.peek() of '\r': handleCR() of '\n': lexer.handleLF() of '*': advance lexer.stream checkForUnexpectedEof() if lexer.stream.peek() == '/': advance lexer.stream break else: advance lexer.stream else: error errCommentExpected of ' ', '\t': advance lexer.stream of '\r': handleCR() of '\n': lexer.handleLF() else: break template requireMoreNumberChars(elseClause) = if not lexer.stream.readable: elseClause error errNumberExpected template eatDigitAndPeek: char = advance lexer.stream if not lexer.stream.readable: return lexer.stream.peek() proc scanSign(lexer: var JsonLexer): int {.gcsafe, raises: [Defect,IOError].} = # Returns +1 or -1 # If a sign character is present, it must be followed # by more characters representing the number. If this # is not the case, the return value will be 0. let c = lexer.stream.peek() if c == '-': requireMoreNumberChars: result = 0 advance lexer.stream return -1 elif c == '+': requireMoreNumberChars: result = 0 advance lexer.stream return 1 proc scanInt(lexer: var JsonLexer): (uint64,bool) {.gcsafe, raises: [Defect,IOError].} = ## Scan unsigned integer into uint64 if possible. ## If all goes ok, the tuple `(parsed-value,false)` is returned. ## On overflow, the tuple `(uint64.high,true)` is returned. var c = lexer.stream.peek() # Always possible to append `9` is result[0] is not larger const canAppendDigit9 = (uint64.high - 9) div 10 result[0] = uint64(ord(c) - ord('0')) c = eatDigitAndPeek() # implicit auto-return while c.isDigit: # Process next digit unless overflow if not result[1]: let lsDgt = uint64(ord(c) - ord('0')) if canAppendDigit9 < result[0] and (uint64.high - lsDgt) div 10 < result[0]: result[1] = true result[0] = uint64.high else: result[0] = result[0] * 10 + lsDgt # Fetch next digit c = eatDigitAndPeek() # implicit auto-return proc scanNumber(lexer: var JsonLexer) {.gcsafe, raises: [Defect,IOError].} = var sign = lexer.scanSign() if sign == 0: return var c = lexer.stream.peek() if c == '.': advance lexer.stream requireMoreNumberChars: discard lexer.tokKind = tkFloat c = lexer.stream.peek() elif c.isDigit: lexer.tokKind = if sign > 0: tkInt else: tkNegativeInt let (scannedValue,overflow) = lexer.scanInt() checkForNonPortableInt scannedValue, overflow lexer.absIntVal = scannedValue if not lexer.stream.readable: return c = lexer.stream.peek() if c == '.': lexer.tokKind = tkFloat lexer.floatVal = float(lexer.absIntVal) * float(sign) c = eatDigitAndPeek() else: error errNumberExpected var fraction = 0.1'f while c.isDigit: lexer.floatVal += fraction * float(ord(c) - ord('0')) fraction *= 0.1'f c = eatDigitAndPeek() if c in {'E', 'e'}: advance lexer.stream requireMoreNumberChars: discard let sign = lexer.scanSign() if sign == 0: return if not isDigit lexer.stream.peek(): error errNumberExpected let (exponent,_) = lexer.scanInt() if exponent >= uint64(len(powersOfTen)): error errExponentTooLarge if sign > 0: lexer.floatVal = lexer.floatVal * powersOfTen[exponent] else: lexer.floatVal = lexer.floatVal / powersOfTen[exponent] proc scanIdentifier(lexer: var JsonLexer, expectedIdent: string, expectedTok: TokKind) = for c in expectedIdent: if c != lexer.stream.read(): lexer.tokKind = tkError return lexer.tokKind = expectedTok proc accept*(lexer: var JsonLexer) {.gcsafe, raises: [Defect,IOError].} = ## Finalise token by parsing the value. Note that this might change ## the token type case lexer.tokKind of tkNumeric: lexer.scanNumber of tkQuoted: lexer.scanString else: discard proc next*(lexer: var JsonLexer) {.gcsafe, raises: [Defect,IOError].} = lexer.skipWhitespace() if not lexer.stream.readable: lexer.tokKind = tkEof return # in case the value parsing was missing lexer.accept() lexer.strVal.setLen 0 # release memory (if any) let c = lexer.stream.peek() case c of '+', '-', '.', '0'..'9': lexer.tokKind = tkNumeric of '"': lexer.tokKind = tkQuoted of '[': advance lexer.stream lexer.tokKind = tkBracketLe of '{': advance lexer.stream lexer.tokKind = tkCurlyLe of ']': advance lexer.stream lexer.tokKind = tkBracketRi of '}': advance lexer.stream lexer.tokKind = tkCurlyRi of ',': advance lexer.stream lexer.tokKind = tkComma of ':': advance lexer.stream lexer.tokKind = tkColon of '\0': lexer.tokKind = tkEof of 'n': lexer.scanIdentifier("null", tkNull) of 't': lexer.scanIdentifier("true", tkTrue) of 'f': lexer.scanIdentifier("false", tkFalse) else: advance lexer.stream lexer.tokKind = tkError proc tok*(lexer: var JsonLexer): TokKind {.gcsafe, raises: [Defect,IOError].} = ## Getter, implies full token parsing lexer.accept lexer.tokKind proc lazyTok*(lexer: JsonLexer): TokKind = ## Preliminary token state unless accepted, already lexer.tokKind proc customIntHandler*(lexer: var JsonLexer; handler: CustomIntHandler) {.gcsafe, raises: [Defect,IOError].} = ## Apply the `handler` argument function for parsing a `tkNumeric` type ## value. This function sets the token state to `tkExInt`, `tkExNegInt`, ## or `tkError`. proc customScan(lexer: var JsonLexer) {.gcsafe, raises: [Defect,IOError].} = var c = lexer.stream.peek() handler(ord(c) - ord('0')) c = eatDigitAndPeek() # implicit auto-return while c.isDigit: handler(ord(c) - ord('0')) c = eatDigitAndPeek() # implicit auto-return if lexer.tokKind == tkNumeric: var sign = lexer.scanSign() if sign != 0: if lexer.stream.peek.isDigit: lexer.tokKind = if 0 < sign: tkExInt else: tkExNegInt lexer.customScan if not lexer.stream.readable or lexer.stream.peek != '.': return error errCustomIntExpexted proc customBlobHandler*(lexer: var JsonLexer; handler: CustomBlobHandler) {.gcsafe, raises: [Defect,IOError].} = ## Apply the `handler` argument function for parsing a `tkQuoted` type ## value. This function sets the token state to `tkExBlob`, or `tkError`. proc customScan(lexer: var JsonLexer) {.gcsafe, raises: [Defect,IOError].} = var what = Continue while lexer.stream.readable: var c = lexer.stream.peek handler(c.byte, what) case what of StopBeforeByte: break of StopSwallowByte: advance lexer.stream break of Continue: advance lexer.stream if lexer.tokKind == tkQuoted: advance lexer.stream lexer.tokKind = tkExBlob lexer.customScan return error errCustomBlobExpexted template customIntValueIt*(lexer: var JsonLexer; body: untyped): untyped = ## Convenience wrapper around `customIntHandler()` for parsing integers. ## ## The `body` argument represents a virtual function body. So the current ## digit processing can be exited with `return`. var handler: CustomIntHandler = proc(digit: int) = let it {.inject.} = digit body lexer.customIntHandler(handler) template customBlobValueIt*(lexer: var JsonLexer; body: untyped): untyped = ## Convenience wrapper around `customBlobHandler()` for parsing any byte ## object. The body function needs to terminate explicitely with the typical ## phrase `doNext = StopSwallowByte` or with the more unusual phrase ## `doNext = StopBeforeByte`. ## ## The `body` argument represents a virtual function body. So the current ## byte processing can be exited with `return`. var handler: CustomBlobHandler = proc(c: byte; what: var CustomByteAction) = let it {.inject.} = c var doNext {.inject.} = what body what = doNext lexer.customBlobHandler(handler) template customTextValueIt*(lexer: var JsonLexer; body: untyped): untyped = ## Convenience wrapper around `customBlobHandler()` for parsing a text ## terminating with a double quote character '"' (no inner double quote ## allowed.) ## ## The `body` argument represents a virtual function body. So the current ## character processing can be exited with `return`. var handler: CustomBlobHandler = proc(c: byte; what: var CustomByteAction) = let it {.inject.} = c.chr if it == '"': what = StopSwallowByte else: body lexer.customBlobHandler(handler)