commit a5f7006065d2dfda92dc99c62209422604a97284
Author: ge
Date:   Sat Jul 12 03:07:50 2025 +0300

    init

diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..01072ca
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,8 @@
+[*]
+charset = utf-8
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+
+[*.v]
+indent_style = tab
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..9a98968
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,8 @@
+* text=auto eol=lf
+*.bat eol=crlf
+
+*.v linguist-language=V
+*.vv linguist-language=V
+*.vsh linguist-language=V
+v.mod linguist-language=V
+.vdocignore linguist-language=ignore
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..88c0fdd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,24 @@
+# Binaries for programs and plugins
+main
+code
+*.exe
+*.exe~
+*.so
+*.dylib
+*.dll
+
+# Ignore binary output folders
+bin/
+
+# Ignore common editor/system specific metadata
+.DS_Store
+.idea/
+.vscode/
+*.iml
+
+# ENV
+.env
+
+# vweb and database
+*.db
+*.js
diff --git a/shell.v b/shell.v
new file mode 100644
index 0000000..6f9c881
--- /dev/null
+++ b/shell.v
@@ -0,0 +1,329 @@
+module shell
+
+import strings.textscanner
+
+pub const safe_chars = '%+,-./0123456789:=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz'
+pub const word_chars = 'abcdfeghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
+pub const unicode_word_chars = 'ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ'
+pub const extra_word_chars = '~-./*?='
+pub const punct_chars = '();<>|&'
+
+// quote returns a shell-escaped version of string `s`.
+// Example:
+// ```
+// assert shell.quote("d'arc") == '\'d\'"\'"\'arc\''
+// ```
+pub fn quote(s string) string {
+	if s == '' {
+		return "''"
+	}
+	if s.contains_only(safe_chars) {
+		return s
+	}
+	return "'" + s.replace("'", '\'"\'"\'') + "'"
+}
+
+// join joins the members of array `s` into a single shell-escaped string.
+// Example:
+// ```
+// assert shell.join(['sh', '-c', 'hostname -f']) == "sh -c 'hostname -f'"
+// ```
+pub fn join(s []string) string {
+	mut quoted_args := []string{}
+	for arg in s {
+		quoted_args << quote(arg)
+	}
+	return quoted_args.join(' ')
+}
+
+@[params]
+pub struct SplitParams {
+pub:
+	posix    bool = true
+	comments bool
+}
+
+// split splits the string `s` into an array using shell-like syntax.
+// Example:
+// ```
+// assert shell.split("sh -c 'hostname -f'") == ['sh', '-c', 'hostname -f']
+// ```
+pub fn split(s string, params SplitParams) []string {
+	mut parts := []string{}
+	mut lexer := new(s,
+		posix: params.posix
+		comments: params.comments
+	)
+	for token in lexer {
+		parts << token
+	}
+	return parts
+}
+
+@[params]
+pub struct LexerParams {
+pub:
+	posix       bool
+	comments    bool = true
+	punctuation bool
+}
+
+// new creates a new Lexer instance. See the LexerParams docstrings for details.
+// Instantiate Lexer directly if you need a more customized lexer setup.
+pub fn new(input string, params LexerParams) Lexer {
+	mut words := word_chars
+	if params.posix {
+		words += unicode_word_chars
+	}
+	if params.punctuation {
+		words += extra_word_chars
+	}
+	return Lexer{
+		scanner: textscanner.new(input)
+		word_chars: words
+		punct_chars: if params.punctuation { punct_chars } else { '' }
+		comment_chars: if params.comments { '#' } else { '' }
+		// `whitespace_split` must be true if `punctuation` is false, otherwise
+		// the lexer gets stuck in an infinite loop!
+		whitespace_split: if params.punctuation { false } else { true }
+		posix_mode: params.posix
+	}
+}
+
+pub struct Lexer {
+pub mut:
+	scanner textscanner.TextScanner
+pub:
+	word_chars     string = word_chars
+	punct_chars    string = punct_chars
+	comment_chars  string = '#'
+	quotes         string = '\'"'
+	escape         string = '\\'
+	escaped_quotes string = '"'
+	posix_mode     bool
+	// whitespace_split must be true if punct_chars is empty to prevent
+	// the lexer from getting stuck in an infinite loop! The parser sucks, I know.
+	// Use the `new` function to create a Lexer instance to ensure the
+	// correct whitespace_split value.
+	whitespace_split bool
+mut:
+	// These fields are used internally to store the current parser state.
+	lineno          int    = 1
+	state           string = ' '
+	token           string
+	push_back       []string
+	push_back_chars []string
+}
+
+// next returns parsed tokens until the end of the input string is reached.
+pub fn (mut x Lexer) next() ?string {
+	if x.push_back.len != 0 {
+		token := x.push_back.first()
+		x.push_back.drop(1)
+		return token
+	}
+	if x.scanner.pos != x.scanner.ilen {
+		return x.token()
+	}
+	return none
+}
+
+// token parses and returns one token from the input string according to the current scanner state.
+fn (mut x Lexer) token() ?string {
+	// TODO: this function must be fixed and completely rewritten
+	mut quoted := false
+	mut escaped_state := ' '
+	for {
+		mut nextchar := x.scanner.peek_u8()
+		if x.punct_chars != '' && x.push_back_chars.len != 0 {
+			nextchar = x.push_back_chars[x.push_back_chars.len - 1..][0].u8()
+		}
+		print_dbg('state=<${x.state}> I see character <${nextchar.ascii_str()}>')
+		if nextchar == `\n` {
+			x.lineno++
+		}
+		match true {
+			x.state == '' {
+				x.token = ''
+				break
+			}
+			x.state == ' ' {
+				match true {
+					nextchar == 0 {
+						// nextchar == 0 means EOF was reached (the `if not nextchar` check in Python's shlex)
+						x.state = ''
+						break
+					}
+					nextchar.is_space() {
+						print_dbg('I see whitespace in whitespace state')
+						if x.token != '' || (x.posix_mode && quoted) {
+							break
+						}
+						x.scanner.skip()
+						continue
+					}
+					x.comment_chars.contains_u8(nextchar) {
+						// Skip the commented characters. Note: goto_end() skips to the end of the input, not just to `\n`.
+						x.scanner.goto_end()
+						x.lineno++
+					}
+					x.posix_mode && x.escape.contains_u8(nextchar) {
+						escaped_state = 'a'
+						x.state = nextchar.ascii_str()
+					}
+					x.word_chars.contains_u8(nextchar) {
+						x.token = nextchar.ascii_str()
+						x.state = 'a'
+					}
+					x.punct_chars.contains_u8(nextchar) {
+						x.token = nextchar.ascii_str()
+						x.state = 'c'
+					}
+					x.quotes.contains_u8(nextchar) {
+						if !x.posix_mode {
+							x.token = nextchar.ascii_str()
+						}
+						x.state = nextchar.ascii_str()
+					}
+					x.whitespace_split == true {
+						x.token = nextchar.ascii_str()
+						x.state = 'a'
+					}
+					else {
+						x.token = nextchar.ascii_str()
+						if x.token != '' || (x.posix_mode && quoted) {
+							break
+						} else {
+							continue
+						}
+					}
+				}
+			}
+			x.quotes.contains(x.state) {
+				quoted = true
+				if nextchar == 0 {
+					// nextchar == 0 means EOF was reached (the `if not nextchar` check in Python's shlex)
+					panic('found non-terminated quote')
+				}
+				match true {
+					nextchar.ascii_str() == x.state {
+						if !x.posix_mode {
+							x.token += nextchar.ascii_str()
+							x.state = ' '
+						} else {
+							x.state = 'a'
+						}
+					}
+					x.posix_mode && x.escape.contains_u8(nextchar)
+						&& x.escaped_quotes.contains(x.state) {
+						escaped_state = x.state
+						x.state = nextchar.ascii_str()
+					}
+					else {
+						x.token += nextchar.ascii_str()
+					}
+				}
+			}
+			x.escape.contains(x.state) {
+				if nextchar == 0 {
+					// nextchar == 0 means EOF was reached (the `if not nextchar` check in Python's shlex)
+					panic('no escaped character found')
+				}
+				if x.quotes.contains(escaped_state) && nextchar.ascii_str() != x.state
+					&& nextchar.ascii_str() != escaped_state {
+					x.token += x.state
+				}
+				x.token += nextchar.ascii_str()
+				x.state = escaped_state
+			}
+			x.state in ['a', 'c'] {
+				match true {
+					nextchar == 0 {
+						x.state = '' // self.state = None
+						break
+					}
+					nextchar.is_space() {
+						print_dbg('I see whitespace in word state')
+						x.state = ' '
+						if x.token != '' || (x.posix_mode && quoted) {
+							break
+						} else {
+							continue
+						}
+					}
+					x.comment_chars.contains_u8(nextchar) {
+						// We need to skip all commented characters until `\n` is found.
+						// for {
+						// 	if x.scanner.peek_u8() == `\n` {
+						// 		x.scanner.skip()
+						// 		break
+						// 	}
+						// 	x.scanner.skip()
+						// }
+						// x.lineno++
+						if x.posix_mode {
+							x.state = ' '
+							if x.token != '' || (x.posix_mode && quoted) {
+								break
+							} else {
+								continue
+							}
+						}
+					}
+					x.state == 'c' {
+						if x.punct_chars.contains_u8(nextchar) {
+							x.token += nextchar.ascii_str()
+						} else {
+							if !nextchar.is_space() {
+								x.push_back_chars << nextchar.ascii_str()
+							}
+							x.state = ' '
+							break
+						}
+					}
+					x.posix_mode && x.quotes.contains_u8(nextchar) {
+						x.state = nextchar.ascii_str()
+					}
+					x.posix_mode && x.escape.contains_u8(nextchar) {
+						escaped_state = 'a'
+						x.state = nextchar.ascii_str()
+					}
+					(x.word_chars.contains_u8(nextchar)
+						|| x.quotes.contains_u8(nextchar))
+						|| (x.whitespace_split && x.punct_chars.contains_u8(nextchar)) {
+						x.token += nextchar.ascii_str()
+					}
+					else {
+						if x.punct_chars != '' {
+							x.push_back_chars << nextchar.ascii_str()
+						} else {
+							x.push_back.prepend(nextchar.ascii_str())
+						}
+						print_dbg('I see punctuation char in word state')
+						x.state = ' '
+						if x.token != '' || (x.posix_mode && quoted) {
+							break
+						} else {
+							continue
+						}
+					}
+				}
+			}
+			else {}
+		}
+		x.scanner.next()
+	}
+	result := x.token
+	x.token = ''
+	if x.posix_mode && !quoted && result == '' {
+		return none
+	}
+	print_dbg('I got token <${result}>')
+	return result
+}
+
+fn print_dbg(s string) {
+	$if trace_shell_lexer ? {
+		eprintln('shell lexer: ' + s)
+	}
+}
diff --git a/shell_test.v b/shell_test.v
new file mode 100644
index 0000000..f84b9c8
--- /dev/null
+++ b/shell_test.v
@@ -0,0 +1,27 @@
+import shell
+
+// TODO: pass all tests from https://github.com/python/cpython/blob/main/Lib/test/test_shlex.py
+
+fn test_quote() {
+	assert shell.quote("janna d'arc") == '\'janna d\'"\'"\'arc\''
+}
+
+fn test_join() {
+	assert shell.join(['sh', '-c', 'hostname -f']) == "sh -c 'hostname -f'"
+}
+
+fn test_split() {
+	assert shell.split("sh -c 'hostname -f'") == ['sh', '-c', 'hostname -f']
+	assert shell.split('sh -c hostname') == ['sh', '-c', 'hostname']
+	assert shell.split('hostname -f # some comment') == ['hostname', '-f', '#', 'some', 'comment']
+	assert shell.split('hostname -f # some comment', comments: true) == ['hostname', '-f']
+	assert shell.split('grep -rn "#"') == ['grep', '-rn', '#']
+	assert shell.split('grep -rn "#"', comments: true) == ['grep', '-rn', '#']
+
+	// FIXME: both assertions fail
+	// s := 'grep -rn hello # search hello
+	// awk --help
+	// '.trim_indent()
+	// assert shell.split(s) == ['grep', '-rn', 'hello', '#', 'search', 'hello', 'awk', '--help']
+	// assert shell.split(s, comments: true) == ['grep', '-rn', 'hello', 'awk', '--help']
+}
diff --git a/v.mod b/v.mod
new file mode 100644
index 0000000..eb6fc2e
--- /dev/null
+++ b/v.mod
@@ -0,0 +1,7 @@
+Module {
+	name: 'shell'
+	description: 'Shell lexer'
+	version: '0.0.0'
+	license: 'MIT'
+	dependencies: []
+}
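
Usage sketch (not part of the commit): a minimal example assuming the module is importable as `shell`, exercising only the public functions introduced above; the expected values mirror the assertions in shell_test.v.

	import shell

	fn main() {
		// split breaks a command line into argv-style parts using shell-like rules
		args := shell.split("sh -c 'hostname -f'")
		assert args == ['sh', '-c', 'hostname -f']
		// quote escapes a single argument so the shell treats it as one word
		println(shell.quote("janna d'arc")) // 'janna d'"'"'arc'
		// join quotes each element and concatenates them with spaces
		println(shell.join(['sh', '-c', 'hostname -f'])) // sh -c 'hostname -f'
	}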