module shell

import strings.textscanner

// safe_chars are characters that never need shell escaping; a string made
// only of these characters is returned by `quote` unchanged.
pub const safe_chars = '%+,-./0123456789:=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz'

// word_chars are the ASCII characters the lexer treats as part of a word.
// (Fixed: the alphabet previously contained the transposed run `abcdfe`.)
pub const word_chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'

// unicode_word_chars are additional (Latin-1 letter) word characters,
// added to the word set in POSIX mode.
pub const unicode_word_chars = 'ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ'

// extra_word_chars are additional word characters, added to the word set
// when punctuation parsing is enabled.
pub const extra_word_chars = '~-./*?='

// punct_chars are shell operator/punctuation characters.
pub const punct_chars = '();<>|&'

// quote returns a shell-escaped version of string `s`.
// Example:
// ```
// assert shell.quote("d'arc") == 'd\'"\'"\'arc\''
// ```
pub fn quote(s string) string {
	if s == '' {
		// An empty argument must still survive the shell's word splitting.
		return "''"
	}
	if s.contains_only(safe_chars) {
		// Nothing needs escaping.
		return s
	}
	// Wrap in single quotes; an embedded single quote is emitted as
	// '"'"' (close quote, double-quoted quote character, reopen quote).
	return "'" + s.replace("'", '\'"\'"\'') + "'"
}

// join joins `s` array members into a shell-escaped string.
// Example:
// ```
// assert shell.join(['sh', '-c', 'hostname -f']) == "sh -c 'hostname -f'"
// ```
pub fn join(s []string) string {
	mut quoted_args := []string{}
	for arg in s {
		quoted_args << quote(arg)
	}
	return quoted_args.join(' ')
}

@[params]
pub struct SplitParams {
pub:
	// posix enables POSIX-compliant parsing (escapes, unicode word chars).
	posix bool = true
	// comments makes the lexer skip `#`-comments instead of tokenizing them.
	comments bool
}

// split splits the string `s` into array using shell-like syntax.
// Example:
// ```
// assert shell.split("sh -c 'hostname -f'") == ['sh', '-c', 'hostname -f']
// ```
pub fn split(s string, params SplitParams) []string {
	mut parts := []string{}
	mut lexer := new(s,
		posix: params.posix
		comments: params.comments
	)
	for token in lexer {
		parts << token
	}
	return parts
}

@[params]
pub struct LexerParams {
pub:
	// posix enables POSIX-compliant parsing.
	posix bool
	// comments makes the lexer skip `#`-comments.
	comments bool = true
	// punctuation makes the lexer emit shell operators (see `punct_chars`)
	// as separate tokens.
	punctuation bool
}

// new creates a new Lexer instance. See the LexerParams docstrings for info.
// Instantiate Lexer directly if you need a more customized lexer setup.
pub fn new(input string, params LexerParams) Lexer {
	// Start from the ASCII word characters and widen the set according to
	// the requested options.
	mut words := word_chars
	if params.posix {
		words += unicode_word_chars
	}
	if params.punctuation {
		words += extra_word_chars
	}
	return Lexer{
		scanner: textscanner.new(input)
		word_chars: words
		punct_chars: if params.punctuation { punct_chars } else { '' }
		comment_chars: if params.comments { '#' } else { '' }
		// `whitespace_split` must be true if `punctuation` is false,
		// otherwise the lexer gets stuck in an infinite loop!
		whitespace_split: if params.punctuation { false } else { true }
		posix_mode: params.posix
	}
}

// Lexer is a shell-syntax tokenizer over a TextScanner. The character-set
// fields below configure which bytes are treated as words, operators,
// comments, quotes and escapes; the mutable fields hold the parser state.
pub struct Lexer {
pub mut:
	scanner textscanner.TextScanner
pub:
	// Characters treated as part of a word.
	word_chars string = word_chars
	// Characters emitted as punctuation/operator tokens ('' disables them).
	punct_chars string = punct_chars
	// Characters starting a comment that runs to end of line ('' disables).
	comment_chars string = '#'
	// Recognized quote characters.
	quotes string = '\'"'
	// Recognized escape characters (POSIX mode only).
	escape string = '\\'
	// Quote characters inside which escapes are honored.
	escaped_quotes string = '"'
	posix_mode bool
	// whitespace_split must be true if punct_chars is empty, to prevent the
	// lexer from getting stuck in an infinite loop. The parser is fragile;
	// use the `new` function to create a Lexer instance so the correct
	// whitespace_split value is chosen for you.
	whitespace_split bool
mut:
	// These fields are used internally to store the current parser state.
	lineno int = 1
	// '' = stop, ' ' = between tokens, 'a' = in word, 'c' = in punctuation
	// run, a quote char = inside that quote, an escape char = after escape.
	state string = ' '
	token string
	push_back []string
	push_back_chars []string
}

// next returns parsed tokens until end of input string.
// It follows V's iterator protocol, so a Lexer works in a `for tok in lexer`
// loop (it returns `none` when the input is exhausted).
pub fn (mut x Lexer) next() ?string {
	if x.push_back.len != 0 {
		// Serve a previously pushed-back token first.
		token := x.push_back.first()
		x.push_back.drop(1)
		return token
	}
	if x.scanner.pos != x.scanner.ilen {
		// There is still unread input to tokenize.
		return x.token()
	}
	return none
}

// token parses and returns one token from the input string according to the
// current scanner state.
// token is the core state machine, a hand-port of Python shlex's
// `read_token`. States (kept in `x.state`): '' stop, ' ' whitespace,
// 'a' word, 'c' punctuation run, a quote character while inside that
// quote, an escape character immediately after an escape.
fn (mut x Lexer) token() ?string {
	// TODO: this function must be fixed and completely rewritten
	mut quoted := false
	mut escaped_state := ' '
	for {
		mut nextchar := x.scanner.peek_u8()
		if x.punct_chars != '' && x.push_back_chars.len != 0 {
			// Re-examine the most recently pushed-back character instead of
			// the scanner's next byte.
			// NOTE(review): push_back_chars is appended to but never popped
			// anywhere in this file — confirm this is intentional.
			nextchar = x.push_back_chars[x.push_back_chars.len - 1..][0].u8()
		}
		print_dbg('state=<${x.state}> I see character <${nextchar.ascii_str()}>')
		if nextchar == `\n` {
			x.lineno++
		}
		match true {
			x.state == '' {
				// Stop state: emit nothing further.
				x.token = ''
				break
			}
			x.state == ' ' {
				// Between tokens.
				match true {
					nextchar == 0 {
						// peek_u8() returned 0: EOF reached (ported from
						// Python's `if not nextchar` check).
						x.state = ''
						break
					}
					nextchar.is_space() {
						print_dbg('I see whitespace in whitespace state')
						if x.token != '' || (x.posix_mode && quoted) {
							// A token was accumulated; whitespace ends it.
							break
						}
						x.scanner.skip()
						continue
					}
					x.comment_chars.contains_u8(nextchar) {
						// We need to skip all commented characters until \n found.
						x.scanner.goto_end()
						x.lineno++
					}
					x.posix_mode && x.escape.contains_u8(nextchar) {
						// Escape outside quotes: resume word state afterwards.
						escaped_state = 'a'
						x.state = nextchar.ascii_str()
					}
					x.word_chars.contains_u8(nextchar) {
						// Start of a word token.
						x.token = nextchar.ascii_str()
						x.state = 'a'
					}
					x.punct_chars.contains_u8(nextchar) {
						// Start of a punctuation-run token.
						x.token = nextchar.ascii_str()
						x.state = 'c'
					}
					x.quotes.contains_u8(nextchar) {
						// Opening quote; in non-POSIX mode the quote char
						// itself is part of the token.
						if !x.posix_mode {
							x.token = nextchar.ascii_str()
						}
						x.state = nextchar.ascii_str()
					}
					x.whitespace_split == true {
						// Whitespace-split mode: any other byte starts a word.
						x.token = nextchar.ascii_str()
						x.state = 'a'
					}
					else {
						// Lone unknown character becomes a one-byte token.
						x.token = nextchar.ascii_str()
						if x.token != '' || (x.posix_mode && quoted) {
							break
						} else {
							continue
						}
					}
				}
			}
			x.quotes.contains(x.state) {
				// Inside a quoted section; state holds the opening quote char.
				quoted = true
				if nextchar == 0 {
					// EOF inside quotes (ported from Python's `if not
					// nextchar` check).
					panic('found non-terminated quote')
				}
				match true {
					nextchar.ascii_str() == x.state {
						// Closing quote found.
						if !x.posix_mode {
							x.token += nextchar.ascii_str()
							x.state = ' '
						} else {
							x.state = 'a'
						}
					}
					x.posix_mode && x.escape.contains_u8(nextchar) && x.escaped_quotes.contains(x.state) {
						// Escape inside double quotes: remember which quote to
						// return to after the escaped character.
						escaped_state = x.state
						x.state = nextchar.ascii_str()
					}
					else {
						x.token += nextchar.ascii_str()
					}
				}
			}
			x.escape.contains(x.state) {
				// Immediately after an escape character.
				if nextchar == 0 {
					// EOF right after an escape (ported from Python's `if not
					// nextchar` check).
					panic('no escaped character found')
				}
				// Inside quotes, only the quote and escape chars are special;
				// anything else keeps the literal backslash.
				if x.quotes.contains(escaped_state) && nextchar.ascii_str() != x.state && nextchar.ascii_str() != escaped_state {
					x.token += x.state
				}
				x.token += nextchar.ascii_str()
				x.state = escaped_state
			}
			x.state in ['a', 'c'] {
				// Inside a word ('a') or punctuation run ('c').
				match true {
					nextchar == 0 {
						x.state = '' // self.state = None
						break
					}
					nextchar.is_space() {
						print_dbg('I see whitespace in word state')
						x.state = ' '
						if x.token != '' || (x.posix_mode && quoted) {
							break
						} else {
							continue
						}
					}
					x.comment_chars.contains_u8(nextchar) {
						// The original comment-skipping loop from the Python
						// port is disabled here; in POSIX mode a comment char
						// simply terminates the current token.
						if x.posix_mode {
							x.state = ' '
							if x.token != '' || (x.posix_mode && quoted) {
								break
							} else {
								continue
							}
						}
					}
					x.state == 'c' {
						// Punctuation runs only absorb more punctuation.
						if x.punct_chars.contains_u8(nextchar) {
							x.token += nextchar.ascii_str()
						} else {
							if !nextchar.is_space() {
								x.push_back_chars << nextchar.ascii_str()
							}
							x.state = ' '
							break
						}
					}
					x.posix_mode && x.quotes.contains_u8(nextchar) {
						// Quote adjacent to a word continues the same token.
						x.state = nextchar.ascii_str()
					}
					x.posix_mode && x.escape.contains_u8(nextchar) {
						escaped_state = 'a'
						x.state = nextchar.ascii_str()
					}
					(x.word_chars.contains_u8(nextchar) || x.quotes.contains_u8(nextchar)) || (x.whitespace_split && x.punct_chars.contains_u8(nextchar)) {
						x.token += nextchar.ascii_str()
					}
					else {
						// Non-word character ends the word; remember it for
						// the next call.
						if x.punct_chars != '' {
							x.push_back_chars << nextchar.ascii_str()
						} else {
							x.push_back.prepend(nextchar.ascii_str())
						}
						print_dbg('I see punctuation char in word state')
						x.state = ' '
						if x.token != '' || (x.posix_mode && quoted) {
							break
						} else {
							continue
						}
					}
				}
			}
			else {}
		}
		// Consume the byte we just examined.
		x.scanner.next()
	}
	result := x.token
	x.token = ''
	if x.posix_mode && !quoted && result == '' {
		// POSIX mode suppresses empty unquoted tokens.
		return none
	}
	print_dbg('I got token <${result}>')
	return result
}

// print_dbg prints a lexer trace message to stderr, but only when the
// program is compiled with `-d trace_shell_lexer`.
fn print_dbg(s string) {
	$if trace_shell_lexer ? {
		eprintln('shell lexer: ' + s)
	}
}