module shell

import strings.textscanner

// safe_chars are characters that never need shell escaping; a string made
// only of these characters is returned by `quote` unchanged.
pub const safe_chars = '%+,-./0123456789:=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz'

// word_chars are the ASCII characters the lexer treats as part of a word.
// (Fixed: the alphabet previously contained the transposed run `abcdfe`.)
pub const word_chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'

// unicode_word_chars are additional (Latin-1 letter) word characters,
// added to the word set in POSIX mode.
pub const unicode_word_chars = 'ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ'

// extra_word_chars are additional word characters, added to the word set
// when punctuation parsing is enabled.
pub const extra_word_chars = '~-./*?='

// punct_chars are shell operator/punctuation characters.
pub const punct_chars = '();<>|&'

// quote returns a shell-escaped version of string `s`.
// Example:
// ```
// assert shell.quote("d'arc") == 'd\'"\'"\'arc\''
// ```
pub fn quote(s string) string {
	if s == '' {
		// An empty argument must still survive the shell's word splitting.
		return "''"
	}
	if s.contains_only(safe_chars) {
		// Nothing needs escaping.
		return s
	}
	// Wrap in single quotes; an embedded single quote is emitted as
	// '"'"' (close quote, double-quoted quote character, reopen quote).
	return "'" + s.replace("'", '\'"\'"\'') + "'"
}

// join joins `s` array members into a shell-escaped string.
// Example:
// ```
// assert shell.join(['sh', '-c', 'hostname -f']) == "sh -c 'hostname -f'"
// ```
pub fn join(s []string) string {
	mut quoted_args := []string{}
	for arg in s {
		quoted_args << quote(arg)
	}
	return quoted_args.join(' ')
}

@[params]
pub struct SplitParams {
pub:
	// posix enables POSIX-compliant parsing (escapes, unicode word chars).
	posix bool = true
	// comments makes the lexer skip `#`-comments instead of tokenizing them.
	comments bool
}

// split splits the string `s` into array using shell-like syntax.
// Example:
// ```
// assert shell.split("sh -c 'hostname -f'") == ['sh', '-c', 'hostname -f']
// ```
pub fn split(s string, params SplitParams) []string {
	mut parts := []string{}
	mut lexer := new(s,
		posix: params.posix
		comments: params.comments
	)
	for token in lexer {
		parts << token
	}
	return parts
}

@[params]
pub struct LexerParams {
pub:
	// posix enables POSIX-compliant parsing.
	posix bool
	// comments makes the lexer skip `#`-comments.
	comments bool = true
	// punctuation makes the lexer emit shell operators (see `punct_chars`)
	// as separate tokens.
	punctuation bool
}

// new creates a new Lexer instance. See the LexerParams docstrings for info.
// Instantiate Lexer directly if you need a more customized lexer setup.
pub fn new(input string, params LexerParams) Lexer {
	// Start from the ASCII word characters and widen the set according to
	// the requested options.
	mut words := word_chars
	if params.posix {
		words += unicode_word_chars
	}
	if params.punctuation {
		words += extra_word_chars
	}
	return Lexer{
		scanner: textscanner.new(input)
		word_chars: words
		punct_chars: if params.punctuation { punct_chars } else { '' }
		comment_chars: if params.comments { '#' } else { '' }
		// `whitespace_split` must be true if `punctuation` is false,
		// otherwise the lexer gets stuck in an infinite loop!
		whitespace_split: if params.punctuation { false } else { true }
		posix_mode: params.posix
	}
}

// Lexer is a shell-syntax tokenizer over a TextScanner. The character-set
// fields below configure which bytes are treated as words, operators,
// comments, quotes and escapes; the mutable fields hold the parser state.
pub struct Lexer {
pub mut:
	scanner textscanner.TextScanner
pub:
	// Characters treated as part of a word.
	word_chars string = word_chars
	// Characters emitted as punctuation/operator tokens ('' disables them).
	punct_chars string = punct_chars
	// Characters starting a comment that runs to end of line ('' disables).
	comment_chars string = '#'
	// Recognized quote characters.
	quotes string = '\'"'
	// Recognized escape characters (POSIX mode only).
	escape string = '\\'
	// Quote characters inside which escapes are honored.
	escaped_quotes string = '"'
	posix_mode bool
	// whitespace_split must be true if punct_chars is empty, to prevent the
	// lexer from getting stuck in an infinite loop. The parser is fragile;
	// use the `new` function to create a Lexer instance so the correct
	// whitespace_split value is chosen for you.
	whitespace_split bool
mut:
	// These fields are used internally to store the current parser state.
	lineno int = 1
	// '' = stop, ' ' = between tokens, 'a' = in word, 'c' = in punctuation
	// run, a quote char = inside that quote, an escape char = after escape.
	state string = ' '
	token string
	push_back []string
	push_back_chars []string
}

// next returns parsed tokens until end of input string.
// It follows V's iterator protocol, so a Lexer works in a `for tok in lexer`
// loop (it returns `none` when the input is exhausted).
pub fn (mut x Lexer) next() ?string {
	if x.push_back.len != 0 {
		// Serve a previously pushed-back token first.
		token := x.push_back.first()
		x.push_back.drop(1)
		return token
	}
	if x.scanner.pos != x.scanner.ilen {
		// There is still unread input to tokenize.
		return x.token()
	}
	return none
}

// token parses and returns one token from the input string according to the
// current scanner state.
// token is the core state machine, a hand-port of Python shlex's
// `read_token`. States (kept in `x.state`): '' stop, ' ' whitespace,
// 'a' word, 'c' punctuation run, a quote character while inside that
// quote, an escape character immediately after an escape.
fn (mut x Lexer) token() ?string {
	// TODO: this function must be fixed and completely rewritten
	mut quoted := false
	mut escaped_state := ' '
	for {
		mut nextchar := x.scanner.peek_u8()
		if x.punct_chars != '' && x.push_back_chars.len != 0 {
			// Re-examine the most recently pushed-back character instead of
			// the scanner's next byte.
			// NOTE(review): push_back_chars is appended to but never popped
			// anywhere in this file — confirm this is intentional.
			nextchar = x.push_back_chars[x.push_back_chars.len - 1..][0].u8()
		}
		print_dbg('state=<${x.state}> I see character <${nextchar.ascii_str()}>')
		if nextchar == `\n` {
			x.lineno++
		}
		match true {
			x.state == '' {
				// Stop state: emit nothing further.
				x.token = ''
				break
			}
			x.state == ' ' {
				// Between tokens.
				match true {
					nextchar == 0 {
						// peek_u8() returned 0: EOF reached (ported from
						// Python's `if not nextchar` check).
						x.state = ''
						break
					}
					nextchar.is_space() {
						print_dbg('I see whitespace in whitespace state')
						if x.token != '' || (x.posix_mode && quoted) {
							// A token was accumulated; whitespace ends it.
							break
						}
						x.scanner.skip()
						continue
					}
					x.comment_chars.contains_u8(nextchar) {
						// We need to skip all commented characters until \n found.
						x.scanner.goto_end()
						x.lineno++
					}
					x.posix_mode && x.escape.contains_u8(nextchar) {
						// Escape outside quotes: resume word state afterwards.
						escaped_state = 'a'
						x.state = nextchar.ascii_str()
					}
					x.word_chars.contains_u8(nextchar) {
						// Start of a word token.
						x.token = nextchar.ascii_str()
						x.state = 'a'
					}
					x.punct_chars.contains_u8(nextchar) {
						// Start of a punctuation-run token.
						x.token = nextchar.ascii_str()
						x.state = 'c'
					}
					x.quotes.contains_u8(nextchar) {
						// Opening quote; in non-POSIX mode the quote char
						// itself is part of the token.
						if !x.posix_mode {
							x.token = nextchar.ascii_str()
						}
						x.state = nextchar.ascii_str()
					}
					x.whitespace_split == true {
						// Whitespace-split mode: any other byte starts a word.
						x.token = nextchar.ascii_str()
						x.state = 'a'
					}
					else {
						// Lone unknown character becomes a one-byte token.
						x.token = nextchar.ascii_str()
						if x.token != '' || (x.posix_mode && quoted) {
							break
						} else {
							continue
						}
					}
				}
			}
			x.quotes.contains(x.state) {
				// Inside a quoted section; state holds the opening quote char.
				quoted = true
				if nextchar == 0 {
					// EOF inside quotes (ported from Python's `if not
					// nextchar` check).
					panic('found non-terminated quote')
				}
				match true {
					nextchar.ascii_str() == x.state {
						// Closing quote found.
						if !x.posix_mode {
							x.token += nextchar.ascii_str()
							x.state = ' '
						} else {
							x.state = 'a'
						}
					}
					x.posix_mode && x.escape.contains_u8(nextchar) && x.escaped_quotes.contains(x.state) {
						// Escape inside double quotes: remember which quote to
						// return to after the escaped character.
						escaped_state = x.state
						x.state = nextchar.ascii_str()
					}
					else {
						x.token += nextchar.ascii_str()
					}
				}
			}
			x.escape.contains(x.state) {
				// Immediately after an escape character.
				if nextchar == 0 {
					// EOF right after an escape (ported from Python's `if not
					// nextchar` check).
					panic('no escaped character found')
				}
				// Inside quotes, only the quote and escape chars are special;
				// anything else keeps the literal backslash.
				if x.quotes.contains(escaped_state) && nextchar.ascii_str() != x.state && nextchar.ascii_str() != escaped_state {
					x.token += x.state
				}
				x.token += nextchar.ascii_str()
				x.state = escaped_state
			}
			x.state in ['a', 'c'] {
				// Inside a word ('a') or punctuation run ('c').
				match true {
					nextchar == 0 {
						x.state = '' // self.state = None
						break
					}
					nextchar.is_space() {
						print_dbg('I see whitespace in word state')
						x.state = ' '
						if x.token != '' || (x.posix_mode && quoted) {
							break
						} else {
							continue
						}
					}
					x.comment_chars.contains_u8(nextchar) {
						// The original comment-skipping loop from the Python
						// port is disabled here; in POSIX mode a comment char
						// simply terminates the current token.
						if x.posix_mode {
							x.state = ' '
							if x.token != '' || (x.posix_mode && quoted) {
								break
							} else {
								continue
							}
						}
					}
					x.state == 'c' {
						// Punctuation runs only absorb more punctuation.
						if x.punct_chars.contains_u8(nextchar) {
							x.token += nextchar.ascii_str()
						} else {
							if !nextchar.is_space() {
								x.push_back_chars << nextchar.ascii_str()
							}
							x.state = ' '
							break
						}
					}
					x.posix_mode && x.quotes.contains_u8(nextchar) {
						// Quote adjacent to a word continues the same token.
						x.state = nextchar.ascii_str()
					}
					x.posix_mode && x.escape.contains_u8(nextchar) {
						escaped_state = 'a'
						x.state = nextchar.ascii_str()
					}
					(x.word_chars.contains_u8(nextchar) || x.quotes.contains_u8(nextchar)) || (x.whitespace_split && x.punct_chars.contains_u8(nextchar)) {
						x.token += nextchar.ascii_str()
					}
					else {
						// Non-word character ends the word; remember it for
						// the next call.
						if x.punct_chars != '' {
							x.push_back_chars << nextchar.ascii_str()
						} else {
							x.push_back.prepend(nextchar.ascii_str())
						}
						print_dbg('I see punctuation char in word state')
						x.state = ' '
						if x.token != '' || (x.posix_mode && quoted) {
							break
						} else {
							continue
						}
					}
				}
			}
			else {}
		}
		// Consume the byte we just examined.
		x.scanner.next()
	}
	result := x.token
	x.token = ''
	if x.posix_mode && !quoted && result == '' {
		// POSIX mode suppresses empty unquoted tokens.
		return none
	}
	print_dbg('I got token <${result}>')
	return result
}

// print_dbg prints a lexer trace message to stderr, but only when the
// program is compiled with `-d trace_shell_lexer`.
fn print_dbg(s string) {
	$if trace_shell_lexer ? {
		eprintln('shell lexer: ' + s)
	}
}