all: various improvements
This commit is contained in:
		
							
								
								
									
										14
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								README.md
									
									
									
									
									
								
							@@ -1,6 +1,6 @@
 | 
				
			|||||||
# fdup
 | 
					# fdup
 | 
				
			||||||
 | 
					
 | 
				
			||||||
The dumb tool to find duplicate files by it's hash sums.
 | 
					The dumb tool for finding duplicate files by their hash sums.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Compile it with `-prod` for better performance:
 | 
					Compile it with `-prod` for better performance:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -13,14 +13,20 @@ Look at releases page for prebuilt executables.
 | 
				
			|||||||
# Synopsis
 | 
					# Synopsis
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
Usage: fdup [flags] [commands] [path...]
 | 
					Usage: fdup [flags] [commands] [DIR...]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
File duplicates finder
 | 
					File duplicates finder
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Flags:
 | 
					Flags:
 | 
				
			||||||
  -hash               Set hashing algorythm: blake3, sha1, sha256, md5 [default: md5]
 | 
					  -hash               Hashing algorythm: blake3, crc32, fnv1a, sha1, sha256, md5 [default: fnv1a]
 | 
				
			||||||
  -threads            Number of threads used for calculating hash sums [default: number of CPU cores]
 | 
					  -threads            Number of threads used for calculating hash sums [default: number of CPU cores]
 | 
				
			||||||
  -brief              Brief output, print plain easy to parse hashes and filenames only
 | 
					  -brief              Brief output, print plain easy to parse hashes and filenames only.
 | 
				
			||||||
 | 
					  -json               Print output in JSON format.
 | 
				
			||||||
 | 
					  -exclude            Glob pattern to exclude files and directories [can be passed multiple times]
 | 
				
			||||||
 | 
					  -skip-empty         Skip empty files.
 | 
				
			||||||
 | 
					  -max-size           Maximum file size in bytes. Files larger than this will be skipped.
 | 
				
			||||||
 | 
					  -remove             Remove duplicates.
 | 
				
			||||||
 | 
					  -prompt             Prompt before every removal.
 | 
				
			||||||
  -help               Prints help information.
 | 
					  -help               Prints help information.
 | 
				
			||||||
  -version            Prints version information.
 | 
					  -version            Prints version information.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										182
									
								
								fdup.v
									
									
									
									
									
								
							
							
						
						
									
										182
									
								
								fdup.v
									
									
									
									
									
								
							@@ -20,6 +20,8 @@ import os
 | 
				
			|||||||
import cli
 | 
					import cli
 | 
				
			||||||
import arrays
 | 
					import arrays
 | 
				
			||||||
import maps
 | 
					import maps
 | 
				
			||||||
 | 
					import hash.crc32
 | 
				
			||||||
 | 
					import hash.fnv1a
 | 
				
			||||||
import crypto.blake3
 | 
					import crypto.blake3
 | 
				
			||||||
import crypto.sha1
 | 
					import crypto.sha1
 | 
				
			||||||
import crypto.sha256
 | 
					import crypto.sha256
 | 
				
			||||||
@@ -27,13 +29,14 @@ import crypto.md5
 | 
				
			|||||||
import runtime
 | 
					import runtime
 | 
				
			||||||
import term
 | 
					import term
 | 
				
			||||||
import time
 | 
					import time
 | 
				
			||||||
 | 
					import x.json2 as json
 | 
				
			||||||
 | 
					
 | 
				
			||||||
fn main() {
 | 
					fn main() {
 | 
				
			||||||
	mut app := cli.Command{
 | 
						mut app := cli.Command{
 | 
				
			||||||
		name:        'fdup'
 | 
							name:        'fdup'
 | 
				
			||||||
		description: 'File duplicates finder'
 | 
							description: 'File duplicates finder'
 | 
				
			||||||
		version:     '0.1.0'
 | 
							version:     '0.2.0'
 | 
				
			||||||
		usage:       '[path...]'
 | 
							usage:       '[DIR...]'
 | 
				
			||||||
		execute:     find
 | 
							execute:     find
 | 
				
			||||||
		defaults:    struct {
 | 
							defaults:    struct {
 | 
				
			||||||
			man: false
 | 
								man: false
 | 
				
			||||||
@@ -42,8 +45,8 @@ fn main() {
 | 
				
			|||||||
			cli.Flag{
 | 
								cli.Flag{
 | 
				
			||||||
				flag:          .string
 | 
									flag:          .string
 | 
				
			||||||
				name:          'hash'
 | 
									name:          'hash'
 | 
				
			||||||
				description:   'Set hashing algorythm: blake3, sha1, sha256, md5 [default: md5]'
 | 
									description:   'Hashing algorythm: blake3, crc32, fnv1a, sha1, sha256, md5 [default: fnv1a]'
 | 
				
			||||||
				default_value: ['md5']
 | 
									default_value: ['fnv1a']
 | 
				
			||||||
			},
 | 
								},
 | 
				
			||||||
			cli.Flag{
 | 
								cli.Flag{
 | 
				
			||||||
				flag:          .int
 | 
									flag:          .int
 | 
				
			||||||
@@ -54,7 +57,37 @@ fn main() {
 | 
				
			|||||||
			cli.Flag{
 | 
								cli.Flag{
 | 
				
			||||||
				flag:        .bool
 | 
									flag:        .bool
 | 
				
			||||||
				name:        'brief'
 | 
									name:        'brief'
 | 
				
			||||||
				description: 'Brief output, print plain easy to parse hashes and filenames only'
 | 
									description: 'Brief output, print plain easy to parse hashes and filenames only.'
 | 
				
			||||||
 | 
								},
 | 
				
			||||||
 | 
								cli.Flag{
 | 
				
			||||||
 | 
									flag:        .bool
 | 
				
			||||||
 | 
									name:        'json'
 | 
				
			||||||
 | 
									description: 'Print output in JSON format.'
 | 
				
			||||||
 | 
								},
 | 
				
			||||||
 | 
								cli.Flag{
 | 
				
			||||||
 | 
									flag:        .string_array
 | 
				
			||||||
 | 
									name:        'exclude'
 | 
				
			||||||
 | 
									description: 'Glob pattern to exclude files and directories [can be passed multiple times]'
 | 
				
			||||||
 | 
								},
 | 
				
			||||||
 | 
								cli.Flag{
 | 
				
			||||||
 | 
									flag:        .bool
 | 
				
			||||||
 | 
									name:        'skip-empty'
 | 
				
			||||||
 | 
									description: 'Skip empty files.'
 | 
				
			||||||
 | 
								},
 | 
				
			||||||
 | 
								cli.Flag{
 | 
				
			||||||
 | 
									flag:        .string
 | 
				
			||||||
 | 
									name:        'max-size'
 | 
				
			||||||
 | 
									description: 'Maximum file size in bytes. Files larger than this will be skipped.'
 | 
				
			||||||
 | 
								},
 | 
				
			||||||
 | 
								cli.Flag{
 | 
				
			||||||
 | 
									flag:        .bool
 | 
				
			||||||
 | 
									name:        'remove'
 | 
				
			||||||
 | 
									description: 'Remove duplicates.'
 | 
				
			||||||
 | 
								},
 | 
				
			||||||
 | 
								cli.Flag{
 | 
				
			||||||
 | 
									flag:        .bool
 | 
				
			||||||
 | 
									name:        'prompt'
 | 
				
			||||||
 | 
									description: 'Prompt before every removal.'
 | 
				
			||||||
			},
 | 
								},
 | 
				
			||||||
		]
 | 
							]
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
@@ -63,26 +96,70 @@ fn main() {
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
fn find(cmd cli.Command) ! {
 | 
					fn find(cmd cli.Command) ! {
 | 
				
			||||||
	hash_algo := HashAlgo.from_string(cmd.flags.get_string('hash')!) or { HashAlgo.md5 }
 | 
						hash_fn := HashFn.from_string(cmd.flags.get_string('hash')!) or { HashFn.fnv1a }
 | 
				
			||||||
	nr_threads := cmd.flags.get_int('threads')!
 | 
						nr_threads := cmd.flags.get_int('threads')!
 | 
				
			||||||
	brief := cmd.flags.get_bool('brief')!
 | 
						brief_output := cmd.flags.get_bool('brief')!
 | 
				
			||||||
 | 
						json_output := cmd.flags.get_bool('json')!
 | 
				
			||||||
 | 
						exclude_globs := cmd.flags.get_strings('exclude')!
 | 
				
			||||||
 | 
						skip_empty := cmd.flags.get_bool('skip-empty')!
 | 
				
			||||||
 | 
						max_size := cmd.flags.get_string('max-size')!.u64()
 | 
				
			||||||
 | 
						remove := cmd.flags.get_bool('remove')!
 | 
				
			||||||
 | 
						prompt := cmd.flags.get_bool('prompt')!
 | 
				
			||||||
 | 
						if nr_threads <= 0 {
 | 
				
			||||||
 | 
							eprintln('threads number cannot be zero or negative')
 | 
				
			||||||
 | 
							exit(1)
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
	mut search_paths := ['.']
 | 
						mut search_paths := ['.']
 | 
				
			||||||
	if cmd.args.len > 0 {
 | 
						if cmd.args.len > 0 {
 | 
				
			||||||
		search_paths = cmd.args.clone()
 | 
							search_paths = cmd.args.clone()
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	// collect full list of files absolute paths
 | 
						// collect full list of files absolute paths
 | 
				
			||||||
	mut file_paths := &[]string{}
 | 
						mut file_paths := &[]string{}
 | 
				
			||||||
	for search_path in search_paths {
 | 
						outer: for search_path in search_paths {
 | 
				
			||||||
 | 
							if search_path != '.' {
 | 
				
			||||||
 | 
								for glob in exclude_globs {
 | 
				
			||||||
 | 
									if search_path.match_glob(glob) {
 | 
				
			||||||
 | 
										continue outer
 | 
				
			||||||
 | 
									}
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							if !os.is_dir(search_path) {
 | 
				
			||||||
 | 
								eprintln('${search_path} is not a directory, skip')
 | 
				
			||||||
 | 
								continue
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
		norm_path := os.norm_path(os.abs_path(os.expand_tilde_to_home(search_path)))
 | 
							norm_path := os.norm_path(os.abs_path(os.expand_tilde_to_home(search_path)))
 | 
				
			||||||
		os.walk(norm_path, fn [mut file_paths] (file string) {
 | 
							os.walk(norm_path, fn [mut file_paths, exclude_globs, skip_empty, max_size] (file string) {
 | 
				
			||||||
 | 
								for glob in exclude_globs {
 | 
				
			||||||
 | 
									if file.match_glob(glob) || os.file_name(file).match_glob(glob) {
 | 
				
			||||||
 | 
										return
 | 
				
			||||||
 | 
									}
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
								mut file_size := u64(0)
 | 
				
			||||||
 | 
								if skip_empty || max_size > 0 {
 | 
				
			||||||
 | 
									file_size = os.file_size(file)
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
								if skip_empty && file_size == 0 {
 | 
				
			||||||
 | 
									return
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
								if max_size > 0 && file_size > max_size {
 | 
				
			||||||
 | 
									return
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
			file_paths << file
 | 
								file_paths << file
 | 
				
			||||||
		})
 | 
							})
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
						if file_paths.len == 0 {
 | 
				
			||||||
 | 
							eprintln('nothing to do, exiting')
 | 
				
			||||||
 | 
							exit(1)
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
	eprintln('found ${file_paths.len} files, processing...')
 | 
						eprintln('found ${file_paths.len} files, processing...')
 | 
				
			||||||
	// split the files list into approximately equal parts by the number of threads
 | 
						// split the files list into approximately equal parts by the number of threads
 | 
				
			||||||
	mut parts := [][]string{}
 | 
						mut parts := [][]string{}
 | 
				
			||||||
	if nr_threads == 1 {
 | 
						if nr_threads == 1 {
 | 
				
			||||||
		parts = [*file_paths]
 | 
							parts = [*file_paths]
 | 
				
			||||||
 | 
						} else if nr_threads >= file_paths.len {
 | 
				
			||||||
 | 
							for path in file_paths {
 | 
				
			||||||
 | 
								parts << [path]
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
	} else {
 | 
						} else {
 | 
				
			||||||
		parts = arrays.chunk(*file_paths, file_paths.len / nr_threads)
 | 
							parts = arrays.chunk(*file_paths, file_paths.len / nr_threads)
 | 
				
			||||||
		mut idx := 0
 | 
							mut idx := 0
 | 
				
			||||||
@@ -97,8 +174,8 @@ fn find(cmd cli.Command) ! {
 | 
				
			|||||||
	}
 | 
						}
 | 
				
			||||||
	// calculate hashsums in parallel
 | 
						// calculate hashsums in parallel
 | 
				
			||||||
	mut threads := []thread map[string]string{}
 | 
						mut threads := []thread map[string]string{}
 | 
				
			||||||
	for i := 0; i < nr_threads; i++ {
 | 
						for i := 0; i < parts.len; i++ {
 | 
				
			||||||
		threads << spawn calculate_hashsums(parts[i], hash_algo)
 | 
							threads << spawn calculate_hashsums(i, parts[i], hash_fn)
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	calculated := threads.wait()
 | 
						calculated := threads.wait()
 | 
				
			||||||
	mut sums := map[string]string{}
 | 
						mut sums := map[string]string{}
 | 
				
			||||||
@@ -111,20 +188,74 @@ fn find(cmd cli.Command) ! {
 | 
				
			|||||||
		eprintln(term.bold('no duplicates found'))
 | 
							eprintln(term.bold('no duplicates found'))
 | 
				
			||||||
		exit(0)
 | 
							exit(0)
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	for hash, files in dups {
 | 
						if brief_output {
 | 
				
			||||||
		if brief {
 | 
							for hash, files in dups {
 | 
				
			||||||
			for file in files {
 | 
								for file in files {
 | 
				
			||||||
				println(hash + ':' + file)
 | 
									println(hash + ':' + file)
 | 
				
			||||||
			}
 | 
								}
 | 
				
			||||||
		} else {
 | 
							}
 | 
				
			||||||
 | 
						} else if json_output {
 | 
				
			||||||
 | 
							mut output := OutputSchema{
 | 
				
			||||||
 | 
								hash_fn: hash_fn.str()
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							for hash, files in dups {
 | 
				
			||||||
 | 
								mut entries := []FileEntry{}
 | 
				
			||||||
 | 
								for file in files {
 | 
				
			||||||
 | 
									stat := os.stat(file)!
 | 
				
			||||||
 | 
									entries << FileEntry{
 | 
				
			||||||
 | 
										path:  file
 | 
				
			||||||
 | 
										size:  stat.size
 | 
				
			||||||
 | 
										mtime: time.unix(stat.mtime)
 | 
				
			||||||
 | 
									}
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
								output.data << Duplicate{
 | 
				
			||||||
 | 
									hash:  hash
 | 
				
			||||||
 | 
									total: entries.len
 | 
				
			||||||
 | 
									files: entries
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							println(json.encode[OutputSchema](output))
 | 
				
			||||||
 | 
						} else {
 | 
				
			||||||
 | 
							for hash, files in dups {
 | 
				
			||||||
			println(term.bold(hash))
 | 
								println(term.bold(hash))
 | 
				
			||||||
			for file in files {
 | 
								for file in files {
 | 
				
			||||||
				stat := os.stat(file)!
 | 
									stat := os.stat(file)!
 | 
				
			||||||
				println('\t${time.unix(stat.mtime)}\t${file}')
 | 
									println('\t${time.unix(stat.mtime)} ${stat.size:-10} ${file}')
 | 
				
			||||||
			}
 | 
								}
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	exit(2)
 | 
						if remove {
 | 
				
			||||||
 | 
							for _, files in dups {
 | 
				
			||||||
 | 
								for file in files[1..] {
 | 
				
			||||||
 | 
									if prompt {
 | 
				
			||||||
 | 
										answer := os.input("delete file '${file}'? (y/n): ")
 | 
				
			||||||
 | 
										if answer != 'y' {
 | 
				
			||||||
 | 
											eprintln('skipped ${file}')
 | 
				
			||||||
 | 
											continue
 | 
				
			||||||
 | 
										}
 | 
				
			||||||
 | 
									}
 | 
				
			||||||
 | 
									os.rm(file)!
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct OutputSchema {
 | 
				
			||||||
 | 
						hash_fn string
 | 
				
			||||||
 | 
					mut:
 | 
				
			||||||
 | 
						data []Duplicate
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct Duplicate {
 | 
				
			||||||
 | 
						hash  string
 | 
				
			||||||
 | 
						total int
 | 
				
			||||||
 | 
						files []FileEntry
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct FileEntry {
 | 
				
			||||||
 | 
						path  string
 | 
				
			||||||
 | 
						size  u64
 | 
				
			||||||
 | 
						mtime time.Time
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
fn find_duplicates(files map[string]string) map[string][]string {
 | 
					fn find_duplicates(files map[string]string) map[string][]string {
 | 
				
			||||||
@@ -146,22 +277,30 @@ fn find_duplicates(files map[string]string) map[string][]string {
 | 
				
			|||||||
	return dups
 | 
						return dups
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
enum HashAlgo {
 | 
					enum HashFn {
 | 
				
			||||||
	blake3
 | 
						blake3
 | 
				
			||||||
 | 
						crc32
 | 
				
			||||||
 | 
						fnv1a
 | 
				
			||||||
	sha1
 | 
						sha1
 | 
				
			||||||
	sha256
 | 
						sha256
 | 
				
			||||||
	md5
 | 
						md5
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
fn hashsum(file string, algo HashAlgo) string {
 | 
					fn hashsum(file string, hash_fn HashFn) string {
 | 
				
			||||||
	file_bytes := os.read_bytes(file) or { []u8{len: 1} }
 | 
						file_bytes := os.read_bytes(file) or { []u8{len: 1} }
 | 
				
			||||||
	defer {
 | 
						defer {
 | 
				
			||||||
		unsafe { file_bytes.free() }
 | 
							unsafe { file_bytes.free() }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	match algo {
 | 
						match hash_fn {
 | 
				
			||||||
		.blake3 {
 | 
							.blake3 {
 | 
				
			||||||
			return blake3.sum256(file_bytes).hex()
 | 
								return blake3.sum256(file_bytes).hex()
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
 | 
							.crc32 {
 | 
				
			||||||
 | 
								return crc32.sum(file_bytes).hex()
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							.fnv1a {
 | 
				
			||||||
 | 
								return fnv1a.sum64(file_bytes).hex()
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
		.sha1 {
 | 
							.sha1 {
 | 
				
			||||||
			return sha1.sum(file_bytes).hex()
 | 
								return sha1.sum(file_bytes).hex()
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
@@ -174,10 +313,11 @@ fn hashsum(file string, algo HashAlgo) string {
 | 
				
			|||||||
	}
 | 
						}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
fn calculate_hashsums(files []string, hash HashAlgo) map[string]string {
 | 
					fn calculate_hashsums(tid int, files []string, hash_fn HashFn) map[string]string {
 | 
				
			||||||
 | 
						eprintln('thread ${tid} started with queue of ${files.len} files')
 | 
				
			||||||
	mut sums := map[string]string{}
 | 
						mut sums := map[string]string{}
 | 
				
			||||||
	for file in files {
 | 
						for file in files {
 | 
				
			||||||
		sums[file] = hashsum(file, hash)
 | 
							sums[file] = hashsum(file, hash_fn)
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	return sums
 | 
						return sums
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user