From e607b5c6bcc4f8cee4a5531dc2310278fdc84a17 Mon Sep 17 00:00:00 2001 From: ge Date: Sat, 15 Feb 2025 00:22:34 +0300 Subject: [PATCH] all: various improvements --- README.md | 14 +++-- fdup.v | 182 +++++++++++++++++++++++++++++++++++++++++++++++------- v.mod | 2 +- 3 files changed, 172 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 3959f83..133bdf9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # fdup -The dumb tool to find duplicate files by it's hash sums. +The dumb tool for finding duplicate files by their hash sums. Compile it with `-prod` for better performance: @@ -13,14 +13,20 @@ Look at releases page for prebuilt executables. # Synonsis ``` -Usage: fdup [flags] [commands] [path...] +Usage: fdup [flags] [commands] [DIR...] File duplicates finder Flags: - -hash Set hashing algorythm: blake3, sha1, sha256, md5 [default: md5] + -hash Hashing algorithm: blake3, crc32, fnv1a, sha1, sha256, md5 [default: fnv1a] -threads Number of threads used for calculating hash sums [default: number of CPU cores] - -brief Brief output, print plain easy to parse hashes and filenames only + -brief Brief output, print plain easy to parse hashes and filenames only. + -json Print output in JSON format. + -exclude Glob pattern to exclude files and directories [can be passed multiple times] + -skip-empty Skip empty files. + -max-size Maximum file size in bytes. Files larger than this will be skipped. + -remove Remove duplicates. + -prompt Prompt before every removal. -help Prints help information. -version Prints version information. 
diff --git a/fdup.v b/fdup.v index 6195baa..0d1ec28 100644 --- a/fdup.v +++ b/fdup.v @@ -20,6 +20,8 @@ import os import cli import arrays import maps +import hash.crc32 +import hash.fnv1a import crypto.blake3 import crypto.sha1 import crypto.sha256 @@ -27,13 +29,14 @@ import crypto.md5 import runtime import term import time +import x.json2 as json fn main() { mut app := cli.Command{ name: 'fdup' description: 'File duplicates finder' - version: '0.1.0' - usage: '[path...]' + version: '0.2.0' + usage: '[DIR...]' execute: find defaults: struct { man: false @@ -42,8 +45,8 @@ fn main() { cli.Flag{ flag: .string name: 'hash' - description: 'Set hashing algorythm: blake3, sha1, sha256, md5 [default: md5]' - default_value: ['md5'] + description: 'Hashing algorithm: blake3, crc32, fnv1a, sha1, sha256, md5 [default: fnv1a]' + default_value: ['fnv1a'] }, cli.Flag{ flag: .int @@ -54,7 +57,37 @@ fn main() { cli.Flag{ flag: .bool name: 'brief' - description: 'Brief output, print plain easy to parse hashes and filenames only' + description: 'Brief output, print plain easy to parse hashes and filenames only.' + }, + cli.Flag{ + flag: .bool + name: 'json' + description: 'Print output in JSON format.' + }, + cli.Flag{ + flag: .string_array + name: 'exclude' + description: 'Glob pattern to exclude files and directories [can be passed multiple times]' + }, + cli.Flag{ + flag: .bool + name: 'skip-empty' + description: 'Skip empty files.' + }, + cli.Flag{ + flag: .string + name: 'max-size' + description: 'Maximum file size in bytes. Files larger than this will be skipped.' + }, + cli.Flag{ + flag: .bool + name: 'remove' + description: 'Remove duplicates.' + }, + cli.Flag{ + flag: .bool + name: 'prompt' + description: 'Prompt before every removal.' }, ] } @@ -63,26 +96,70 @@ fn main() { } fn find(cmd cli.Command) ! { - hash_algo := HashAlgo.from_string(cmd.flags.get_string('hash')!) or { HashAlgo.md5 } + hash_fn := HashFn.from_string(cmd.flags.get_string('hash')!) 
or { HashFn.fnv1a } nr_threads := cmd.flags.get_int('threads')! - brief := cmd.flags.get_bool('brief')! + brief_output := cmd.flags.get_bool('brief')! + json_output := cmd.flags.get_bool('json')! + exclude_globs := cmd.flags.get_strings('exclude')! + skip_empty := cmd.flags.get_bool('skip-empty')! + max_size := cmd.flags.get_string('max-size')!.u64() + remove := cmd.flags.get_bool('remove')! + prompt := cmd.flags.get_bool('prompt')! + if nr_threads <= 0 { + eprintln('threads number cannot be zero or negative') + exit(1) + } mut search_paths := ['.'] if cmd.args.len > 0 { search_paths = cmd.args.clone() } // collect full list of files absolute paths mut file_paths := &[]string{} - for search_path in search_paths { + outer: for search_path in search_paths { + if search_path != '.' { + for glob in exclude_globs { + if search_path.match_glob(glob) { + continue outer + } + } + } + if !os.is_dir(search_path) { + eprintln('${search_path} is not a directory, skip') + continue + } norm_path := os.norm_path(os.abs_path(os.expand_tilde_to_home(search_path))) - os.walk(norm_path, fn [mut file_paths] (file string) { + os.walk(norm_path, fn [mut file_paths, exclude_globs, skip_empty, max_size] (file string) { + for glob in exclude_globs { + if file.match_glob(glob) || os.file_name(file).match_glob(glob) { + return + } + } + mut file_size := u64(0) + if skip_empty || max_size > 0 { + file_size = os.file_size(file) + } + if skip_empty && file_size == 0 { + return + } + if max_size > 0 && file_size > max_size { + return + } file_paths << file }) } + if file_paths.len == 0 { + eprintln('nothing to do, exiting') + exit(1) + } eprintln('found ${file_paths.len} files, processing...') // split the files list into approximately equal parts by the number of threads mut parts := [][]string{} if nr_threads == 1 { parts = [*file_paths] + } else if nr_threads >= file_paths.len { + for path in file_paths { + parts << [path] + } } else { parts = arrays.chunk(*file_paths, file_paths.len / 
nr_threads) mut idx := 0 @@ -97,8 +174,8 @@ fn find(cmd cli.Command) ! { } // calculate hashsums in parallel mut threads := []thread map[string]string{} - for i := 0; i < nr_threads; i++ { - threads << spawn calculate_hashsums(parts[i], hash_algo) + for i := 0; i < parts.len; i++ { + threads << spawn calculate_hashsums(i, parts[i], hash_fn) } calculated := threads.wait() mut sums := map[string]string{} @@ -111,20 +188,74 @@ fn find(cmd cli.Command) ! { eprintln(term.bold('no duplicates found')) exit(0) } - for hash, files in dups { - if brief { + if brief_output { + for hash, files in dups { for file in files { println(hash + ':' + file) } - } else { + } + } else if json_output { + mut output := OutputSchema{ + hash_fn: hash_fn.str() + } + for hash, files in dups { + mut entries := []FileEntry{} + for file in files { + stat := os.stat(file)! + entries << FileEntry{ + path: file + size: stat.size + mtime: time.unix(stat.mtime) + } + } + output.data << Duplicate{ + hash: hash + total: entries.len + files: entries + } + } + println(json.encode[OutputSchema](output)) + } else { + for hash, files in dups { println(term.bold(hash)) for file in files { stat := os.stat(file)! - println('\t${time.unix(stat.mtime)}\t${file}') + println('\t${time.unix(stat.mtime)} ${stat.size:-10} ${file}') } } } - exit(2) + if remove { + for _, files in dups { + for file in files[1..] { + if prompt { + answer := os.input("delete file '${file}'? (y/n): ") + if answer != 'y' { + eprintln('skipped ${file}') + continue + } + } + os.rm(file)! 
+ } + } + } +} + +struct OutputSchema { + hash_fn string +mut: + data []Duplicate +} + +struct Duplicate { + hash string + total int + files []FileEntry +} + +struct FileEntry { + path string + size u64 + mtime time.Time } fn find_duplicates(files map[string]string) map[string][]string { @@ -146,22 +277,30 @@ fn find_duplicates(files map[string]string) map[string][]string { return dups } -enum HashAlgo { +enum HashFn { blake3 + crc32 + fnv1a sha1 sha256 md5 } -fn hashsum(file string, algo HashAlgo) string { +fn hashsum(file string, hash_fn HashFn) string { file_bytes := os.read_bytes(file) or { []u8{len: 1} } defer { unsafe { file_bytes.free() } } - match algo { + match hash_fn { .blake3 { return blake3.sum256(file_bytes).hex() } + .crc32 { + return crc32.sum(file_bytes).hex() + } + .fnv1a { + return fnv1a.sum64(file_bytes).hex() + } .sha1 { return sha1.sum(file_bytes).hex() } @@ -174,10 +313,11 @@ fn hashsum(file string, algo HashAlgo) string { } } -fn calculate_hashsums(files []string, hash HashAlgo) map[string]string { +fn calculate_hashsums(tid int, files []string, hash_fn HashFn) map[string]string { + eprintln('thread ${tid} started with queue of ${files.len} files') mut sums := map[string]string{} for file in files { - sums[file] = hashsum(file, hash) + sums[file] = hashsum(file, hash_fn) } return sums } diff --git a/v.mod b/v.mod index f2452a0..01f7f7d 100644 --- a/v.mod +++ b/v.mod @@ -1,7 +1,7 @@ Module { name: 'fdup' description: 'File duplicates finder' - version: '0.1.0' + version: '0.2.0' license: 'GPL-3.0-or-later' dependencies: [] }