## A simple lexer that tokenizes an input string the way a shell would.
|
|
import lexbase
|
|
import options
|
|
import streams
|
|
import os
|
|
import strutils
|
|
|
|
type
  ShellLexer = object of BaseLexer
    ## Tokenizer state layered on top of `lexbase.BaseLexer`.
    # preserveTrailingWs: when set, whitespace skipped right before the end of
    #   the input is reported as one final empty token.
    # mergeWordBreaks: set after a token stopped at a word-break character, so
    #   that the next call collects the run of break characters as one token.
    preserveTrailingWs, mergeWordBreaks: bool
    # Characters that terminate an unquoted word.
    wordBreakChars: string
|
|
|
|
const
  # Default set of characters that end an unquoted word.
  WORDBREAKS = "\"'@><=;|&(:"
  # Characters that can appear in a word without any quoting or escaping.
  SAFE_CHARS = Letters + Digits + {'@', '%', '+', '=', ':', ',', '.', '/', '-'}
|
|
|
|
proc open(l: var ShellLexer, input: Stream,
          wordBreakChars: string = WORDBREAKS,
          preserveTrailingWs = true) =
  ## Attaches the lexer to `input` and resets its tokenizer settings.
  ##
  ## `wordBreakChars` lists the characters that end an unquoted word and
  ## `preserveTrailingWs` controls whether trailing whitespace is reported
  ## as a final empty token.
  lexbase.open(l, input)
  l.wordBreakChars = wordBreakChars
  l.preserveTrailingWs = preserveTrailingWs
  # No token has been read yet, so there is no pending word-break run.
  l.mergeWordBreaks = false
|
|
|
|
proc parseQuoted(l: var ShellLexer, pos: int, isSingle: bool, output: var string): int =
  ## Scans the body of a quoted string and appends its unquoted text to
  ## `output`.
  ##
  ## `pos` is the buffer position just past the opening quote. `isSingle`
  ## selects the quoting rules: inside single quotes a backslash only escapes
  ## a single quote; inside double quotes it escapes `$`, a backquote, another
  ## backslash or a double quote. Any other backslash sequence is copied
  ## verbatim, backslash included. Unescaped line breaks are consumed but not
  ## added to `output`.
  ##
  ## Returns the position just past the closing quote, or the position of the
  ## end-of-file sentinel when the quote is never closed.
  var pos = pos
  while true:
    case l.buf[pos]
    of '\c': pos = lexbase.handleCR(l, pos)
    of '\L': pos = lexbase.handleLF(l, pos)
    of lexbase.EndOfFile: break
    of '\\':
      # Consume the backslash and look at the following character
      inc(pos)
      let escaped = l.buf[pos]
      if escaped == lexbase.EndOfFile:
        # Dangling backslash at the very end of the input: keep it literally
        # and stay on the sentinel instead of copying it and running past it.
        output.add('\\')
        break
      let isEscape =
        if isSingle: escaped == '\''
        else: escaped in {'$', '`', '\\', '"'}
      if isEscape:
        # Escape the character
        output.add(escaped)
      else:
        # Rewrite the escape sequence as-is
        output.add('\\')
        output.add(escaped)
      # Line breaks must be stepped over through lexbase so that the buffer
      # is refilled correctly; everything else is a plain advance.
      case escaped
      of '\c': pos = lexbase.handleCR(l, pos)
      of '\L': pos = lexbase.handleLF(l, pos)
      else: inc(pos)
    of '"':
      inc(pos)
      # A double quote is ordinary text inside single quotes
      if isSingle: output.add('"')
      else: break
    of '\'':
      inc(pos)
      # A single quote is ordinary text inside double quotes
      if isSingle: break
      else: output.add('\'')
    else:
      output.add(l.buf[pos])
      inc(pos)
  return pos
|
|
|
|
proc getTok(l: var ShellLexer): Option[string] =
  ## Reads the next token from the input.
  ##
  ## Returns `some(token)` for every token found and `none(string)` once the
  ## input is exhausted. When `preserveTrailingWs` is set and whitespace (or a
  ## comment) was skipped right before the end of the input, one last
  ## `some("")` is returned first so callers can tell the input ended in
  ## whitespace.
  var cur = l.bufpos

  # Phase 1: step over blanks, line breaks and `#` comments
  while true:
    let c = l.buf[cur]
    if c == '\c':
      cur = lexbase.handleCR(l, cur)
    elif c == '\L':
      cur = lexbase.handleLF(l, cur)
    elif c == '#':
      # A comment runs up to, but not including, the end of the line
      while l.buf[cur] notin {'\c', '\L', lexbase.EndOfFile}:
        inc(cur)
    elif c == lexbase.EndOfFile:
      # Something was skipped before hitting the end: report it as an empty
      # token (once, since `bufpos` then catches up with `cur`).
      if l.preserveTrailingWs and l.bufpos != cur:
        l.bufpos = cur
        return some("")
      return none(string)
    elif c in {' ', '\t'}:
      inc(cur)
    else:
      break

  # Phase 2: accumulate the token text
  var word = ""
  while true:
    let c = l.buf[cur]
    case c
    of '\c': cur = lexbase.handleCR(l, cur)
    of '\L': cur = lexbase.handleLF(l, cur)
    of '\'', '"':
      # Quoted section: skip the opening quote and let parseQuoted append
      # the content using the rules matching the quote kind
      inc(cur)
      cur = parseQuoted(l, cur, c == '\'', word)
    of '\\':
      # Escape sequence: the next character is taken literally, unless the
      # backslash is the last thing in the input
      inc(cur)
      if l.buf[cur] != lexbase.EndOfFile:
        word.add(l.buf[cur])
        inc(cur)
    of '#', ' ', '\t', lexbase.EndOfFile:
      break
    else:
      if c notin l.wordBreakChars:
        word.add(c)
        inc(cur)
      elif l.mergeWordBreaks:
        # The previous token stopped here: gather the whole run of adjacent
        # word-breaking characters into this token
        while l.buf[cur] in l.wordBreakChars:
          word.add(l.buf[cur])
          inc(cur)
        l.mergeWordBreaks = false
        break
      else:
        # End the current token in front of the break character and let the
        # next call pick up the run
        l.mergeWordBreaks = true
        break

  l.bufpos = cur
  result = some(word)
|
|
|
|
proc splitCompletionLine*(): seq[string] =
  ## Tokenizes the part of the command line that precedes the cursor, as
  ## described by the `COMP_LINE` and `COMP_POINT` environment variables.
  ##
  ## Returns the tokens found; the last one is empty when the text before the
  ## cursor ends in whitespace. An empty seq is returned when `COMP_POINT` is
  ## not an integer, lies outside the line, or reading the input fails.
  let compLine = os.getEnv("COMP_LINE")
  let compPoint =
    try:
      parseInt(os.getEnv("COMP_POINT", "0"))
    except ValueError:
      return @[]

  if compPoint < 0 or compPoint > len(compLine):
    return @[]

  # COMP_POINT is the index of the cursor, so only the characters strictly
  # before it belong to the text being completed. The half-open slice also
  # covers the cursor-at-end case without any special handling.
  var strm = newStringStream(compLine[0 ..< compPoint])

  # Split the resulting string
  var l: ShellLexer
  try:
    l.open(strm)
    # Closing the lexer also closes the stream it reads from
    defer: l.close()
    while true:
      let token = l.getTok()
      if token.isNone():
        break
      result.add(token.get())
  except IOError, OSError:
    return @[]
|
|
|
|
proc shellQuote*(word: string): string =
  ## Quotes `word` so that it reads back as a single token.
  ##
  ## The empty string becomes `''`. A word made only of `SAFE_CHARS` is
  ## returned unchanged. Anything else is wrapped in single quotes, with a
  ## backslash inserted in front of every embedded single quote.
  if word.len == 0:
    result = "''"
  elif word.allCharsInSet(SAFE_CHARS):
    result = word
  else:
    result = "'"
    for ch in word:
      if ch == '\'':
        result.add("\\'")
      else:
        result.add(ch)
    result.add('\'')
|
|
|
|
proc shellPathEscape*(path: string): string =
  ## Returns `path` with a backslash inserted in front of every character
  ## that is not in `SAFE_CHARS`. A path made only of safe characters (the
  ## empty path included) comes back unchanged.
  result = newStringOfCap(path.len)
  for ch in path:
    if ch notin SAFE_CHARS:
      result.add('\\')
    result.add(ch)
|
|
|
|
when isMainModule:
  # Test data lifted from python's shlex unit-tests. Each line reads
  # `input|token|token|...|`: the text to tokenize followed by the expected
  # tokens, every field terminated by a `|`. The closing quotes stay at
  # column 0 so that the corpus does not end with a whitespace-only line.
  const data = """
foo bar|foo|bar|
foo bar|foo|bar|
foo bar |foo|bar|
foo bar bla fasel|foo|bar|bla|fasel|
x y z xxxx|x|y|z|xxxx|
\x bar|x|bar|
\ x bar| x|bar|
\ bar| bar|
foo \x bar|foo|x|bar|
foo \ x bar|foo| x|bar|
foo \ bar|foo| bar|
foo "bar" bla|foo|bar|bla|
"foo" "bar" "bla"|foo|bar|bla|
"foo" bar "bla"|foo|bar|bla|
"foo" bar bla|foo|bar|bla|
foo 'bar' bla|foo|bar|bla|
'foo' 'bar' 'bla'|foo|bar|bla|
'foo' bar 'bla'|foo|bar|bla|
'foo' bar bla|foo|bar|bla|
blurb foo"bar"bar"fasel" baz|blurb|foobarbarfasel|baz|
blurb foo'bar'bar'fasel' baz|blurb|foobarbarfasel|baz|
""||
''||
foo "" bar|foo||bar|
foo '' bar|foo||bar|
foo "" "" "" bar|foo||||bar|
foo '' '' '' bar|foo||||bar|
\"|"|
"\""|"|
"foo\ bar"|foo\ bar|
"foo\\ bar"|foo\ bar|
"foo\\ bar\""|foo\ bar"|
"foo\\" bar\"|foo\|bar"|
"foo\\ bar\" dfadf"|foo\ bar" dfadf|
"foo\\\ bar\" dfadf"|foo\\ bar" dfadf|
"foo\\\x bar\" dfadf"|foo\\x bar" dfadf|
"foo\x bar\" dfadf"|foo\x bar" dfadf|
\'|'|
'foo\ bar'|foo\ bar|
'foo\\ bar'|foo\\ bar|
"foo\\\x bar\" df'a\ 'df"|foo\\x bar" df'a\ 'df|
\"foo|"foo|
\"foo\x|"foox|
"foo\x"|foo\x|
"foo\ "|foo\ |
foo\ xx|foo xx|
foo\ x\x|foo xx|
foo\ x\x\"|foo xx"|
"foo\ x\x"|foo\ x\x|
"foo\ x\x\\"|foo\ x\x\|
"foo\ x\x\\""foobar"|foo\ x\x\foobar|
"foo\ x\x\\"\'"foobar"|foo\ x\x\'foobar|
"foo\ x\x\\"\'"fo'obar"|foo\ x\x\'fo'obar|
"foo\ x\x\\"\'"fo'obar" 'don'\''t'|foo\ x\x\'fo'obar|don't|
"foo\ x\x\\"\'"fo'obar" 'don'\''t' \\|foo\ x\x\'fo'obar|don't|\|
'foo\ bar'|foo\ bar|
'foo\\ bar'|foo\\ bar|
foo\ bar|foo bar|
:-) ;-)|:-)|;-)|
áéíóú|áéíóú|
"""
  var corpus = newStringStream(data)
  var line = ""
  while corpus.readLine(line):
    let chunks = line.split('|')
    # A well-formed line yields at least the input plus the empty field after
    # the final `|`; anything shorter would make the slice below invalid.
    if chunks.len < 2:
      continue
    let expr = chunks[0]
    let expected = chunks[1..^2]

    # Tokenize the input with word breaks and the trailing empty token
    # disabled, which is the behavior the corpus was written for.
    var l: ShellLexer
    var strm = newStringStream(expr)
    var got: seq[string]
    l.open(strm, wordBreakChars="", preserveTrailingWs=false)
    while true:
      let x = l.getTok()
      if x.isNone():
        break
      got.add(x.get())

    if got != expected:
      echo "got ", got
      echo "expected ", expected
      doAssert(false)

  # Quoting checks; the proc under test is `shellQuote`.
  doAssert(shellQuote("") == "''")
  doAssert(shellQuote("\\\"") == "'\\\"'")
  doAssert(shellQuote("foobar") == "foobar")
  doAssert(shellQuote("foo$bar") == "'foo$bar'")
  doAssert(shellQuote("foo bar") == "'foo bar'")
  doAssert(shellQuote("foo'bar") == "'foo\\'bar'")
|