This commit is contained in:
ge
2025-07-12 03:07:50 +03:00
commit a5f7006065
6 changed files with 403 additions and 0 deletions

8
.editorconfig Normal file
View File

@@ -0,0 +1,8 @@
[*]
charset = utf-8
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
[*.v]
indent_style = tab

8
.gitattributes vendored Normal file
View File

@@ -0,0 +1,8 @@
* text=auto eol=lf
*.bat eol=crlf
*.v linguist-language=V
*.vv linguist-language=V
*.vsh linguist-language=V
v.mod linguist-language=V
.vdocignore linguist-language=ignore

24
.gitignore vendored Normal file
View File

@@ -0,0 +1,24 @@
# Binaries for programs and plugins
main
code
*.exe
*.exe~
*.so
*.dylib
*.dll
# Ignore binary output folders
bin/
# Ignore common editor/system specific metadata
.DS_Store
.idea/
.vscode/
*.iml
# ENV
.env
# vweb and database
*.db
*.js

329
shell.v Normal file
View File

@@ -0,0 +1,329 @@
module shell
import strings.textscanner
// safe_chars are characters that never need shell quoting (see `quote`).
pub const safe_chars = '%+,-./0123456789:=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz'
// word_chars are characters that may appear inside an unquoted word token.
// (The original literal had a typo'd alphabet, `abcdfe...`; order is
// irrelevant since the constant is only used via `contains_u8`, but the
// alphabet is now written in order for readability.)
pub const word_chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
// unicode_word_chars extend word_chars in POSIX mode (see `new`).
pub const unicode_word_chars = 'ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ'
// extra_word_chars extend word_chars when punctuation parsing is enabled.
pub const extra_word_chars = '~-./*?='
// punct_chars are shell punctuation/operator characters.
pub const punct_chars = '();<>|&'
// quote returns a shell-escaped version of string `s`, safe to embed
// into a shell command line. Empty input becomes `''`; strings made
// only of safe characters are returned unchanged; anything else is
// wrapped in single quotes with embedded quotes escaped.
// Example:
// ```
// assert shell.quote("d'arc") == '\'d\'"\'"\'arc\''
// ```
pub fn quote(s string) string {
	return match true {
		s == '' {
			"''"
		}
		s.contains_only(safe_chars) {
			s
		}
		else {
			"'" + s.replace("'", '\'"\'"\'') + "'"
		}
	}
}
// join joins `s` array members into a single shell-escaped string,
// quoting each element with `quote` and separating them with spaces.
// Example:
// ```
// assert shell.join(['sh', '-c', 'hostname -f']) == "sh -c 'hostname -f'"
// ```
pub fn join(s []string) string {
	return s.map(quote(it)).join(' ')
}
// SplitParams configures the behaviour of `split`.
@[params]
pub struct SplitParams {
pub:
	// posix enables POSIX-mode parsing; it is forwarded to `new`,
	// where it extends the word character set with unicode letters.
	posix bool = true
	// comments enables skipping of `#`-style comments (forwarded to `new`).
	comments bool
}
// split splits the string `s` into an array of tokens using
// shell-like syntax (quoting and optional comment skipping).
// Example:
// ```
// assert shell.split("sh -c 'hostname -f'") == ['sh', '-c', 'hostname -f']
// ```
pub fn split(s string, params SplitParams) []string {
	mut tokens := []string{}
	mut lex := new(s,
		posix: params.posix
		comments: params.comments
	)
	for tok in lex {
		tokens << tok
	}
	return tokens
}
// LexerParams configures a `Lexer` created via `new`.
@[params]
pub struct LexerParams {
pub:
	// posix extends the word character set with unicode letters and
	// enables POSIX-mode escaping/quoting behaviour in the lexer.
	posix bool
	// comments enables `#`-comment skipping (on by default here,
	// unlike SplitParams where it defaults to false).
	comments bool = true
	// punctuation enables punctuation tokens (`();<>|&`) and the
	// extra word characters; it also disables whitespace_split.
	punctuation bool
}
// new creates a new Lexer instance. See LexerParams docstrings for info.
// Instantiate Lexer directly if you need a more custom lexer setup.
pub fn new(input string, params LexerParams) Lexer {
	// Build the set of characters that may appear inside a word.
	mut words := word_chars
	if params.posix {
		words += unicode_word_chars
	}
	if params.punctuation {
		words += extra_word_chars
	}
	return Lexer{
		scanner: textscanner.new(input)
		word_chars: words
		punct_chars: if params.punctuation { punct_chars } else { '' }
		comment_chars: if params.comments { '#' } else { '' }
		// `whitespace_split` must be true if `punctuation` is false,
		// otherwise the lexer gets stuck in an infinite loop!
		whitespace_split: !params.punctuation
		posix_mode: params.posix
	}
}
// Lexer is a shell-like lexical analyzer over a string input.
// Prefer creating instances via `new`, which guarantees a consistent
// combination of the fields below.
pub struct Lexer {
pub mut:
	// scanner provides character-by-character access to the input.
	scanner textscanner.TextScanner
pub:
	// word_chars are characters allowed inside a word token.
	word_chars string = word_chars
	// punct_chars are punctuation characters emitted as separate tokens;
	// empty string disables punctuation handling.
	punct_chars string = punct_chars
	// comment_chars start a comment that runs to end of line;
	// empty string disables comment skipping.
	comment_chars string = '#'
	// quotes are the recognized quote characters.
	quotes string = '\'"'
	// escape is the escape character (POSIX mode only).
	escape string = '\\'
	// escaped_quotes are quote styles inside which escapes are honored.
	escaped_quotes string = '"'
	// posix_mode enables POSIX-compatible quoting/escaping behaviour.
	posix_mode bool
	// whitespace_split must be true if punct_chars is empty to prevent
	// the lexer from getting stuck in an infinite loop! The parser is
	// fragile here — use the `new` function to create a Lexer instance
	// to ensure the correct whitespace_split value.
	whitespace_split bool
mut:
	// These fields are used internally to store the current parser state.
	lineno int = 1
	state string = ' '
	token string
	push_back []string
	push_back_chars []string
}
// next returns parsed tokens one at a time until end of input,
// draining any previously pushed-back tokens first.
pub fn (mut x Lexer) next() ?string {
	// Pushed-back tokens take priority over fresh input.
	if x.push_back.len > 0 {
		head := x.push_back.first()
		x.push_back.drop(1)
		return head
	}
	// No more input left to scan.
	if x.scanner.pos == x.scanner.ilen {
		return none
	}
	return x.token()
}
// token parses and returns one token from the input string regarding the
// current scanner state.
//
// This is a character-driven state machine (the comments below reference
// the original Python `shlex` checks it appears to be ported from —
// TODO confirm). `x.state` holds the current state:
//   ''   - end of input reached
//   ' '  - whitespace: looking for the start of the next token
//   'a'  - accumulating a word token
//   'c'  - accumulating a run of punctuation characters
//   a quote character  - inside a quoted string
//   the escape character - the next character is escaped
fn (mut x Lexer) token() ?string {
	// TODO: this function must be fixed and completely rewritten
	mut quoted := false
	// State to restore once an escape sequence has been consumed.
	mut escaped_state := ' '
	for {
		// Peek the next input character; 0 signals end of input.
		mut nextchar := x.scanner.peek_u8()
		if x.punct_chars != '' && x.push_back_chars.len != 0 {
			// Re-process a character pushed back by the punctuation logic.
			// NOTE(review): the character is read here but never removed
			// from push_back_chars — looks suspicious; confirm intent.
			nextchar = x.push_back_chars[x.push_back_chars.len - 1..][0].u8()
		}
		print_dbg('state=<${x.state}> I see character <${nextchar.ascii_str()}>')
		if nextchar == `\n` {
			x.lineno++
		}
		match true {
			// End-of-input state: emit whatever is accumulated.
			x.state == '' {
				x.token = ''
				break
			}
			// Whitespace state: decide what kind of token starts here.
			x.state == ' ' {
				match true {
					nextchar == 0 {
						// Corresponds to shlex's `if not nextchar`: EOF reached.
						x.state = ''
						break
					}
					nextchar.is_space() {
						print_dbg('I see whitespace in whitespace state')
						if x.token != '' || (x.posix_mode && quoted) {
							break
						}
						x.scanner.skip()
						continue
					}
					x.comment_chars.contains_u8(nextchar) {
						// We need to skip all commented characters until \n found.
						x.scanner.goto_end()
						x.lineno++
					}
					x.posix_mode && x.escape.contains_u8(nextchar) {
						// Escape outside quotes: remember we return to word state.
						escaped_state = 'a'
						x.state = nextchar.ascii_str()
					}
					x.word_chars.contains_u8(nextchar) {
						// Start of a word token.
						x.token = nextchar.ascii_str()
						x.state = 'a'
					}
					x.punct_chars.contains_u8(nextchar) {
						// Start of a punctuation token.
						x.token = nextchar.ascii_str()
						x.state = 'c'
					}
					x.quotes.contains_u8(nextchar) {
						// Start of a quoted string; in non-POSIX mode the
						// quote character itself is part of the token.
						if !x.posix_mode {
							x.token = nextchar.ascii_str()
						}
						x.state = nextchar.ascii_str()
					}
					x.whitespace_split == true {
						// whitespace_split: any other character begins a word.
						x.token = nextchar.ascii_str()
						x.state = 'a'
					}
					else {
						x.token = nextchar.ascii_str()
						if x.token != '' || (x.posix_mode && quoted) {
							break
						} else {
							continue
						}
					}
				}
			}
			// Inside a quoted string (state is the quote character itself).
			x.quotes.contains(x.state) {
				quoted = true
				if nextchar == 0 {
					// Corresponds to shlex's `if not nextchar`: EOF reached
					// before the closing quote.
					panic('found non-terminated quote')
				}
				match true {
					nextchar.ascii_str() == x.state {
						// Closing quote found.
						if !x.posix_mode {
							x.token += nextchar.ascii_str()
							x.state = ' '
						} else {
							x.state = 'a'
						}
					}
					x.posix_mode && x.escape.contains_u8(nextchar)
						&& x.escaped_quotes.contains(x.state) {
						// Escape inside an escape-honoring quote style.
						escaped_state = x.state
						x.state = nextchar.ascii_str()
					}
					else {
						x.token += nextchar.ascii_str()
					}
				}
			}
			// Escape state (state is the escape character).
			x.escape.contains(x.state) {
				if nextchar == 0 {
					// Corresponds to shlex's `if not nextchar`: EOF right
					// after an escape character.
					panic('no escaped character found')
				}
				// In POSIX shells, only the quote itself or the escape char
				// may be escaped inside quotes; otherwise keep the backslash.
				if x.quotes.contains(escaped_state) && nextchar.ascii_str() != x.state
					&& nextchar.ascii_str() != escaped_state {
					x.token += x.state
				}
				x.token += nextchar.ascii_str()
				x.state = escaped_state
			}
			// Word ('a') or punctuation-run ('c') state.
			x.state in ['a', 'c'] {
				match true {
					nextchar == 0 {
						x.state = '' // self.state = None
						break
					}
					nextchar.is_space() {
						print_dbg('I see whitespace in word state')
						x.state = ' '
						if x.token != '' || (x.posix_mode && quoted) {
							break
						} else {
							continue
						}
					}
					x.comment_chars.contains_u8(nextchar) {
						// We need to skip all commented characters until \n found.
						// for {
						//	if x.scanner.peek_u8() == `\n` {
						//		x.scanner.skip()
						//		break
						//	}
						//	x.scanner.skip()
						// }
						// x.lineno++
						if x.posix_mode {
							x.state = ' '
							if x.token != '' || (x.posix_mode && quoted) {
								break
							} else {
								continue
							}
						}
					}
					x.state == 'c' {
						// Extend the punctuation run, or push the character
						// back and emit the run.
						if x.punct_chars.contains_u8(nextchar) {
							x.token += nextchar.ascii_str()
						} else {
							if !nextchar.is_space() {
								x.push_back_chars << nextchar.ascii_str()
							}
							x.state = ' '
							break
						}
					}
					x.posix_mode && x.quotes.contains_u8(nextchar) {
						// Quote directly adjacent to a word (e.g. foo"bar").
						x.state = nextchar.ascii_str()
					}
					x.posix_mode && x.escape.contains_u8(nextchar) {
						escaped_state = 'a'
						x.state = nextchar.ascii_str()
					}
					(x.word_chars.contains_u8(nextchar)
						|| x.quotes.contains_u8(nextchar))
						|| (x.whitespace_split && x.punct_chars.contains_u8(nextchar)) {
						x.token += nextchar.ascii_str()
					}
					else {
						// Word ended at a punctuation character.
						if x.punct_chars != '' {
							x.push_back_chars << nextchar.ascii_str()
						} else {
							x.push_back.prepend(nextchar.ascii_str())
						}
						print_dbg('I see punctuation char in word state')
						x.state = ' '
						if x.token != '' || (x.posix_mode && quoted) {
							break
						} else {
							continue
						}
					}
				}
			}
			else {}
		}
		// Consume the character we just processed.
		x.scanner.next()
	}
	result := x.token
	x.token = ''
	// In POSIX mode an empty unquoted token means "no token" (but an
	// empty *quoted* token, e.g. '', is a real token).
	if x.posix_mode && !quoted && result == '' {
		return none
	}
	print_dbg('I got token <${result}>')
	return result
}
// print_dbg writes a lexer trace line to stderr, but only when the
// program is compiled with `-d trace_shell_lexer`; otherwise it
// compiles to nothing.
fn print_dbg(s string) {
	$if trace_shell_lexer ? {
		eprintln('shell lexer: ${s}')
	}
}

27
shell_test.v Normal file
View File

@@ -0,0 +1,27 @@
import shell
// TODO: pass all tests from https://github.com/python/cpython/blob/main/Lib/test/test_shlex.py
// test_quote checks shell.quote for empty, already-safe and
// quote-containing inputs.
fn test_quote() {
	// Empty string must be represented as an explicit empty quoted string.
	assert shell.quote('') == "''"
	// Strings of safe characters are returned unchanged.
	assert shell.quote('hostname') == 'hostname'
	assert shell.quote("janna d'arc") == '\'janna d\'"\'"\'arc\''
}
// test_join checks shell.join for empty and typical argument arrays.
fn test_join() {
	// Joining no arguments yields an empty command line.
	assert shell.join([]string{}) == ''
	assert shell.join(['sh', '-c', 'hostname -f']) == "sh -c 'hostname -f'"
}
// test_split checks shell.split tokenization with and without
// comment skipping enabled.
fn test_split() {
	assert shell.split("sh -c 'hostname -f'") == ['sh', '-c', 'hostname -f']
	assert shell.split('sh -c hostname') == ['sh', '-c', 'hostname']
	// Without comments enabled, '#' is an ordinary token.
	assert shell.split('hostname -f # some comment') == ['hostname', '-f', '#', 'some', 'comment']
	assert shell.split('hostname -f # some comment', comments: true) == ['hostname', '-f']
	// A quoted '#' must never start a comment.
	assert shell.split('grep -rn "#"') == ['grep', '-rn', '#']
	assert shell.split('grep -rn "#"', comments: true) == ['grep', '-rn', '#']
	// FIXME: both assertions fails
	// s := 'grep -rn hello # search hello
	// awk --help
	// '.trim_indent()
	// assert shell.split(s) == ['grep', '-rn', 'hello', '#', 'search', 'hello', 'awk', '--help']
	// assert shell.split(s, comments: true) == ['grep', '-rn', 'hello', 'awk', '--help']
}

7
v.mod Normal file
View File

@@ -0,0 +1,7 @@
Module {
name: 'shell'
description: 'Shell lexer'
version: '0.0.0'
license: 'MIT'
dependencies: []
}