Performance optimizations in parser

This commit is contained in:
Felix Krause 2016-03-20 12:09:04 +01:00
parent 167c25af72
commit b6d363107a
1 changed file with 71 additions and 50 deletions

View File

@ -28,11 +28,11 @@ type
cBlock, cFlow cBlock, cFlow
const const
space = [' ', '\t'] space = {' ', '\t'}
lineEnd = ['\l', '\c', EndOfFile] lineEnd = {'\l', '\c', EndOfFile}
spaceOrLineEnd = [' ', '\t', '\l', '\c', EndOfFile] spaceOrLineEnd = {' ', '\t', '\l', '\c', EndOfFile}
digits = '0'..'9' digits = {'0'..'9'}
flowIndicators = ['[', ']', '{', '}', ','] flowIndicators = {'[', ']', '{', '}', ','}
UTF8NextLine = toUTF8(0x85.Rune) UTF8NextLine = toUTF8(0x85.Rune)
UTF8NonBreakingSpace = toUTF8(0xA0.Rune) UTF8NonBreakingSpace = toUTF8(0xA0.Rune)
@ -88,6 +88,13 @@ template yieldEmptyScalar() {.dirty.} =
tag = yTagQuestionMark tag = yTagQuestionMark
anchor = yAnchorNone anchor = yAnchorNone
template yieldShallowScalar(content: string) {.dirty.} =
## Yields a `yamlScalar` event built from `content` and from the `tag` and
## `anchor` variables visible at the expansion site (the template is
## `dirty`, so those names resolve in the caller's scope).
##
## The string is handed to the event with `shallowCopy` instead of a
## regular assignment.
##
## NOTE(review): presumably the yielded event's `scalarContent` then shares
## its buffer with `content`, which the parser reuses through
## `content.setLen(0)` -- confirm that consumers do not hold on to the
## event after the next scalar is lexed.
var e = YamlStreamEvent(kind: yamlScalar, scalarTag: tag,
scalarAnchor: anchor)
# Use a shallow copy rather than `=` for the scalar's string.
shallowCopy(e.scalarContent, content)
# Mark the event's string as shallow for subsequent assignments as well.
shallow(e.scalarContent)
yield e
template yieldLevelEnd() {.dirty.} = template yieldLevelEnd() {.dirty.} =
case level.kind case level.kind
of fplSequence: yield endSeqEvent() of fplSequence: yield endSeqEvent()
@ -96,7 +103,7 @@ template yieldLevelEnd() {.dirty.} =
yieldEmptyScalar() yieldEmptyScalar()
yield endMapEvent() yield endMapEvent()
of fplScalar: of fplScalar:
yield scalarEvent(content, tag, anchor) yieldShallowScalar(content)
tag = yTagQuestionMark tag = yTagQuestionMark
anchor = yAnchorNone anchor = yAnchorNone
of fplUnknown: of fplUnknown:
@ -265,27 +272,29 @@ template handleTagHandle() {.dirty.} =
if level.kind != fplUnknown: parserError("Unexpected tag handle") if level.kind != fplUnknown: parserError("Unexpected tag handle")
if tag != yTagQuestionMark: if tag != yTagQuestionMark:
parserError("Only one tag handle is allowed per node") parserError("Only one tag handle is allowed per node")
content = "" content.setLen(0)
var var
shorthandEnd: int shorthandEnd: int
tagUri: string
p.lexer.tagHandle(content, shorthandEnd) p.lexer.tagHandle(content, shorthandEnd)
if shorthandEnd != -1: if shorthandEnd != -1:
try: try:
let prefix = shorthands[content[0..shorthandEnd]] tagUri.setLen(0)
tagUri = prefix & content[shorthandEnd + 1 .. ^1] tagUri.add(shorthands[content[0..shorthandEnd]])
tagUri.add(content[shorthandEnd + 1 .. ^1])
except KeyError: except KeyError:
parserError("Undefined tag shorthand: " & content[0..shorthandEnd]) parserError("Undefined tag shorthand: " & content[0..shorthandEnd])
else: shallowCopy(tagUri, content) try: tag = p.tagLib.tags[tagUri]
try: tag = p.tagLib.tags[tagUri] except KeyError: tag = p.tagLib.registerUri(tagUri)
except KeyError: tag = p.tagLib.registerUri(tagUri) else:
try: tag = p.tagLib.tags[content]
except KeyError: tag = p.tagLib.registerUri(content)
template handleAnchor() {.dirty.} = template handleAnchor() {.dirty.} =
startToken() startToken()
if level.kind != fplUnknown: parserError("Unexpected token") if level.kind != fplUnknown: parserError("Unexpected token")
if anchor != yAnchorNone: if anchor != yAnchorNone:
parserError("Only one anchor is allowed per node") parserError("Only one anchor is allowed per node")
content = "" content.setLen(0)
p.lexer.anchorName(content) p.lexer.anchorName(content)
anchor = nextAnchorId anchor = nextAnchorId
anchors[content] = anchor anchors[content] = anchor
@ -296,7 +305,7 @@ template handleAlias() {.dirty.} =
if level.kind != fplUnknown: parserError("Unexpected token") if level.kind != fplUnknown: parserError("Unexpected token")
if anchor != yAnchorNone or tag != yTagQuestionMark: if anchor != yAnchorNone or tag != yTagQuestionMark:
parserError("Alias may not have anchor or tag") parserError("Alias may not have anchor or tag")
content = "" content.setLen(0)
p.lexer.anchorName(content) p.lexer.anchorName(content)
var id: AnchorId var id: AnchorId
try: id = anchors[content] try: id = anchors[content]
@ -491,8 +500,8 @@ template tagShorthand(lexer: BaseLexer, shorthand: var string) =
if lexer.buf[lexer.bufpos] notin spaceOrLineEnd: if lexer.buf[lexer.bufpos] notin spaceOrLineEnd:
lexerError(lexer, "Missing space after tag shorthand") lexerError(lexer, "Missing space after tag shorthand")
template tagUri(lexer: BaseLexer, uri: var string) = template tagUriMapping(lexer: BaseLexer, uri: var string) =
debug("lex: tagUri") debug("lex: tagUriMapping")
while lexer.buf[lexer.bufpos] in space: while lexer.buf[lexer.bufpos] in space:
lexer.bufpos.inc() lexer.bufpos.inc()
var c = lexer.buf[lexer.bufpos] var c = lexer.buf[lexer.bufpos]
@ -580,7 +589,7 @@ proc byteSequence(lexer: var BaseLexer): char {.raises: [YamlParserError].} =
return char(charCode) return char(charCode)
template processQuotedWhitespace(newlines: var int) {.dirty.} = template processQuotedWhitespace(newlines: var int) {.dirty.} =
var after = "" after.setLen(0)
block outer: block outer:
while true: while true:
case p.lexer.buf[p.lexer.bufpos] case p.lexer.buf[p.lexer.bufpos]
@ -685,6 +694,17 @@ proc isPlainSafe(lexer: BaseLexer, index: int, context: YamlContext): bool =
of flowIndicators: result = context == cBlock of flowIndicators: result = context == cBlock
else: result = true else: result = true
# Tried this as a performance optimization, but it did not improve
# performance. Keeping it around for future reference.
#const
# plainCharOut = {'!', '\"', '$'..'9', ';'..'\xFF'}
# plainCharIn = {'!', '\"', '$'..'+', '-'..'9', ';'..'Z', '\\', '^'..'z',
# '|', '~'..'\xFF'}
#template isPlainChar(c: char, context: YamlContext): bool =
# when context == cBlock: c in plainCharOut
# else: c in plainCharIn
template plainScalar(lexer: BaseLexer, content: var string, template plainScalar(lexer: BaseLexer, content: var string,
context: YamlContext) = context: YamlContext) =
debug("lex: plainScalar") debug("lex: plainScalar")
@ -694,9 +714,9 @@ template plainScalar(lexer: BaseLexer, content: var string,
lexer.bufpos.inc() lexer.bufpos.inc()
let c = lexer.buf[lexer.bufpos] let c = lexer.buf[lexer.bufpos]
case c case c
of lineEnd: break
of ' ', '\t': of ' ', '\t':
var after = "" & c after.setLen(1)
after[0] = c
while true: while true:
lexer.bufpos.inc() lexer.bufpos.inc()
let c2 = lexer.buf[lexer.bufpos] let c2 = lexer.buf[lexer.bufpos]
@ -719,9 +739,7 @@ template plainScalar(lexer: BaseLexer, content: var string,
content.add(after) content.add(after)
content.add(c2) content.add(c2)
break break
of flowIndicators: of lineEnd, flowIndicators: break
if context == cBlock: content.add(c)
else: break
of ':': of ':':
if lexer.isPlainSafe(lexer.bufpos + 1, context): content.add(':') if lexer.isPlainSafe(lexer.bufpos + 1, context): content.add(':')
else: break outer else: break outer
@ -735,7 +753,7 @@ template continueMultilineScalar() {.dirty.} =
state = fpBlockAfterPlainScalar state = fpBlockAfterPlainScalar
template handleFlowPlainScalar() {.dirty.} = template handleFlowPlainScalar() {.dirty.} =
content = "" content.setLen(0)
startToken() startToken()
p.lexer.plainScalar(content, cFlow) p.lexer.plainScalar(content, cFlow)
if p.lexer.buf[p.lexer.bufpos] in {'{', '}', '[', ']', ',', ':', '#'}: if p.lexer.buf[p.lexer.bufpos] in {'{', '}', '[', ']', ',', ':', '#'}:
@ -771,7 +789,7 @@ template handleFlowPlainScalar() {.dirty.} =
content.add(repeat(' ', newlines - 1)) content.add(repeat(' ', newlines - 1))
newlines = 0 newlines = 0
p.lexer.plainScalar(content, cFlow) p.lexer.plainScalar(content, cFlow)
yield scalarEvent(content, tag, anchor) yieldShallowScalar(content)
handleObjectEnd(fpFlowAfterObject) handleObjectEnd(fpFlowAfterObject)
template ensureCorrectIndentation() {.dirty.} = template ensureCorrectIndentation() {.dirty.} =
@ -811,7 +829,7 @@ template tagHandle(lexer: var BaseLexer, content: var string,
of '<': of '<':
if i == 1: if i == 1:
shorthandEnd = -1 shorthandEnd = -1
content = "" content.setLen(0)
else: lexerError(lexer, "Illegal character in tag handle") else: lexerError(lexer, "Illegal character in tag handle")
of '>': of '>':
if shorthandEnd == -1: if shorthandEnd == -1:
@ -880,7 +898,7 @@ template blockScalar(lexer: BaseLexer, content: var string,
# TODO: is this correct? # TODO: is this correct?
else: debugFail() else: debugFail()
var newlines = 0 var newlines = 0
content = "" content.setLen(0)
block outer: block outer:
while true: while true:
block inner: block inner:
@ -1042,7 +1060,9 @@ proc parse*(p: YamlParser, s: Stream): YamlStream =
shorthands: Table[string, string] shorthands: Table[string, string]
anchors: Table[string, AnchorId] anchors: Table[string, AnchorId]
nextAnchorId: AnchorId nextAnchorId: AnchorId
content: string content: string = ""
after: string = ""
tagUri: string = ""
tag: TagId tag: TagId
anchor: AnchorId anchor: AnchorId
ancestry = newSeq[FastParseLevel]() ancestry = newSeq[FastParseLevel]()
@ -1078,11 +1098,12 @@ proc parse*(p: YamlParser, s: Stream): YamlStream =
p.lexer.lineEnding() p.lexer.lineEnding()
handleLineEnd(false) handleLineEnd(false)
of ldTag: of ldTag:
var shorthand, uri = "" var shorthand = ""
tagUri.setLen(0)
startToken() startToken()
p.lexer.tagShorthand(shorthand) p.lexer.tagShorthand(shorthand)
p.lexer.tagUri(uri) p.lexer.tagUriMapping(tagUri)
shorthands[shorthand] = uri shorthands[shorthand] = tagUri
p.lexer.lineEnding() p.lexer.lineEnding()
handleLineEnd(false) handleLineEnd(false)
of ldUnknown: of ldUnknown:
@ -1117,7 +1138,7 @@ proc parse*(p: YamlParser, s: Stream): YamlStream =
handleBlockSequenceIndicator() handleBlockSequenceIndicator()
state = fpBlockObjectStart state = fpBlockObjectStart
of lpdeScalarContent: of lpdeScalarContent:
content = "" content.setLen(0)
p.lexer.plainScalar(content, cBlock) p.lexer.plainScalar(content, cBlock)
state = fpBlockAfterPlainScalar state = fpBlockAfterPlainScalar
else: else:
@ -1151,7 +1172,7 @@ proc parse*(p: YamlParser, s: Stream): YamlStream =
ensureCorrectIndentation() ensureCorrectIndentation()
ancestry.add(level) ancestry.add(level)
level = initLevel(fplUnknown) level = initLevel(fplUnknown)
content = "" content.setLen(0)
p.lexer.plainScalar(content, cBlock) p.lexer.plainScalar(content, cBlock)
state = fpBlockAfterPlainScalar state = fpBlockAfterPlainScalar
of '.': of '.':
@ -1174,7 +1195,7 @@ proc parse*(p: YamlParser, s: Stream): YamlStream =
ensureCorrectIndentation() ensureCorrectIndentation()
ancestry.add(level) ancestry.add(level)
level = initLevel(fplUnknown) level = initLevel(fplUnknown)
content = "" content.setLen(0)
p.lexer.plainScalar(content, cBlock) p.lexer.plainScalar(content, cBlock)
state = fpBlockAfterPlainScalar state = fpBlockAfterPlainScalar
of ' ': of ' ':
@ -1252,7 +1273,7 @@ proc parse*(p: YamlParser, s: Stream): YamlStream =
startToken() startToken()
parserError("Unexpected token") parserError("Unexpected token")
of '#': of '#':
yield scalarEvent(content, tag, anchor) yieldShallowScalar(content)
p.lexer.lineEnding() p.lexer.lineEnding()
handleLineEnd(true) handleLineEnd(true)
handleObjectEnd(fpBlockLineStart) handleObjectEnd(fpBlockLineStart)
@ -1279,7 +1300,7 @@ proc parse*(p: YamlParser, s: Stream): YamlStream =
p.lexer.bufpos = p.lexer.handleCR(p.lexer.bufpos) p.lexer.bufpos = p.lexer.handleCR(p.lexer.bufpos)
state = fpBlockLineStart state = fpBlockLineStart
else: else:
yield scalarEvent(content, tag, anchor) yieldShallowScalar(content)
handleObjectEnd(fpBlockAfterObject) handleObjectEnd(fpBlockAfterObject)
of fpBlockAfterObject: of fpBlockAfterObject:
debug("state: blockAfterObject") debug("state: blockAfterObject")
@ -1348,29 +1369,29 @@ proc parse*(p: YamlParser, s: Stream): YamlStream =
level.indentation = UnknownIndentation level.indentation = UnknownIndentation
of '\'': of '\'':
handleBlockItemStart() handleBlockItemStart()
content = "" content.setLen(0)
startToken() startToken()
p.lexer.singleQuotedScalar(content) p.lexer.singleQuotedScalar(content)
if tag == yTagQuestionMark: tag = yTagExclamationMark if tag == yTagQuestionMark: tag = yTagExclamationMark
yield scalarEvent(content, tag, anchor) yieldShallowScalar(content)
handleObjectEnd(fpBlockAfterObject) handleObjectEnd(fpBlockAfterObject)
of '"': of '"':
handleBlockItemStart() handleBlockItemStart()
content = "" content.setLen(0)
startToken() startToken()
p.lexer.doubleQuotedScalar(content) p.lexer.doubleQuotedScalar(content)
if tag == yTagQuestionMark: tag = yTagExclamationMark if tag == yTagQuestionMark: tag = yTagExclamationMark
yield scalarEvent(content, tag, anchor) yieldShallowScalar(content)
handleObjectEnd(fpBlockAfterObject) handleObjectEnd(fpBlockAfterObject)
of '|', '>': of '|', '>':
# TODO: this will scan for possible map start, which is not # TODO: this will scan for possible map start, which is not
# necessary in this case # necessary in this case
handleBlockItemStart() handleBlockItemStart()
var stateAfter: FastParseState var stateAfter: FastParseState
content = "" content.setLen(0)
p.lexer.blockScalar(content, stateAfter) p.lexer.blockScalar(content, stateAfter)
if tag == yTagQuestionMark: tag = yTagExclamationMark if tag == yTagQuestionMark: tag = yTagExclamationMark
yield scalarEvent(content, tag, anchor) yieldShallowScalar(content)
handleObjectEnd(stateAfter) handleObjectEnd(stateAfter)
if stateAfter == fpBlockObjectStart and if stateAfter == fpBlockObjectStart and
p.lexer.buf[p.lexer.bufpos] != '#': p.lexer.buf[p.lexer.bufpos] != '#':
@ -1379,7 +1400,7 @@ proc parse*(p: YamlParser, s: Stream): YamlStream =
of '-': of '-':
if p.lexer.isPlainSafe(p.lexer.bufpos + 1, cBlock): if p.lexer.isPlainSafe(p.lexer.bufpos + 1, cBlock):
handleBlockItemStart() handleBlockItemStart()
content = "" content.setLen(0)
startToken() startToken()
p.lexer.plainScalar(content, cBlock) p.lexer.plainScalar(content, cBlock)
state = fpBlockAfterPlainScalar state = fpBlockAfterPlainScalar
@ -1401,7 +1422,7 @@ proc parse*(p: YamlParser, s: Stream): YamlStream =
of '?': of '?':
if p.lexer.isPlainSafe(p.lexer.bufpos + 1, cBlock): if p.lexer.isPlainSafe(p.lexer.bufpos + 1, cBlock):
handleBlockItemStart() handleBlockItemStart()
content = "" content.setLen(0)
startToken() startToken()
p.lexer.plainScalar(content, cBlock) p.lexer.plainScalar(content, cBlock)
state = fpBlockAfterPlainScalar state = fpBlockAfterPlainScalar
@ -1411,7 +1432,7 @@ proc parse*(p: YamlParser, s: Stream): YamlStream =
of ':': of ':':
if p.lexer.isPlainSafe(p.lexer.bufpos + 1, cBlock): if p.lexer.isPlainSafe(p.lexer.bufpos + 1, cBlock):
handleBlockItemStart() handleBlockItemStart()
content = "" content.setLen(0)
startToken() startToken()
p.lexer.plainScalar(content, cBlock) p.lexer.plainScalar(content, cBlock)
state = fpBlockAfterPlainScalar state = fpBlockAfterPlainScalar
@ -1422,7 +1443,7 @@ proc parse*(p: YamlParser, s: Stream): YamlStream =
lexerError(p.lexer, "Reserved characters cannot start a plain scalar") lexerError(p.lexer, "Reserved characters cannot start a plain scalar")
else: else:
handleBlockItemStart() handleBlockItemStart()
content = "" content.setLen(0)
startToken() startToken()
p.lexer.plainScalar(content, cBlock) p.lexer.plainScalar(content, cBlock)
state = fpBlockAfterPlainScalar state = fpBlockAfterPlainScalar
@ -1602,19 +1623,19 @@ proc parse*(p: YamlParser, s: Stream): YamlStream =
p.lexer.bufpos.inc() p.lexer.bufpos.inc()
of '\'': of '\'':
handleFlowItemStart() handleFlowItemStart()
content = "" content.setLen(0)
startToken() startToken()
p.lexer.singleQuotedScalar(content) p.lexer.singleQuotedScalar(content)
if tag == yTagQuestionMark: tag = yTagExclamationMark if tag == yTagQuestionMark: tag = yTagExclamationMark
yield scalarEvent(content, tag, anchor) yieldShallowScalar(content)
handleObjectEnd(fpFlowAfterObject) handleObjectEnd(fpFlowAfterObject)
of '"': of '"':
handleFlowItemStart() handleFlowItemStart()
content = "" content.setLen(0)
startToken() startToken()
p.lexer.doubleQuotedScalar(content) p.lexer.doubleQuotedScalar(content)
if tag == yTagQuestionMark: tag = yTagExclamationMark if tag == yTagQuestionMark: tag = yTagExclamationMark
yield scalarEvent(content, tag, anchor) yieldShallowScalar(content)
handleObjectEnd(fpFlowAfterObject) handleObjectEnd(fpFlowAfterObject)
of '!': of '!':
handleFlowItemStart() handleFlowItemStart()