From e607b5c6bcc4f8cee4a5531dc2310278fdc84a17 Mon Sep 17 00:00:00 2001 From: ge Date: Sat, 15 Feb 2025 00:22:34 +0300 Subject: [PATCH] all: various improvements --- README.md | 14 +++-- fdup.v | 182 +++++++++++++++++++++++++++++++++++++++++++++++------- v.mod | 2 +- 3 files changed, 172 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 3959f83..133bdf9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # fdup -The dumb tool to find duplicate files by it's hash sums. +The dumb tool for finding duplicate files by their hash sums. Compile it with `-prod` for better performance: @@ -13,14 +13,20 @@ Look at releases page for prebuilt executables. # Synonsis ``` -Usage: fdup [flags] [commands] [path...] +Usage: fdup [flags] [commands] [DIR...] File duplicates finder Flags: - -hash Set hashing algorythm: blake3, sha1, sha256, md5 [default: md5] + -hash Hashing algorithm: blake3, crc32, fnv1a, sha1, sha256, md5 [default: fnv1a] -threads Number of threads used for calculating hash sums [default: number of CPU cores] - -brief Brief output, print plain easy to parse hashes and filenames only + -brief Brief output, print plain easy to parse hashes and filenames only. + -json Print output in JSON format. + -exclude Glob pattern to exclude files and directories [can be passed multiple times] + -skip-empty Skip empty files. + -max-size Maximum file size in bytes. Files larger than this will be skipped. + -remove Remove duplicates. + -prompt Prompt before every removal. -help Prints help information. -version Prints version information. 
diff --git a/fdup.v b/fdup.v index 6195baa..0d1ec28 100644 --- a/fdup.v +++ b/fdup.v @@ -20,6 +20,8 @@ import os import cli import arrays import maps +import hash.crc32 +import hash.fnv1a import crypto.blake3 import crypto.sha1 import crypto.sha256 @@ -27,13 +29,14 @@ import crypto.md5 import runtime import term import time +import x.json2 as json fn main() { mut app := cli.Command{ name: 'fdup' description: 'File duplicates finder' - version: '0.1.0' - usage: '[path...]' + version: '0.2.0' + usage: '[DIR...]' execute: find defaults: struct { man: false @@ -42,8 +45,8 @@ fn main() { cli.Flag{ flag: .string name: 'hash' - description: 'Set hashing algorythm: blake3, sha1, sha256, md5 [default: md5]' - default_value: ['md5'] + description: 'Hashing algorithm: blake3, crc32, fnv1a, sha1, sha256, md5 [default: fnv1a]' + default_value: ['fnv1a'] }, cli.Flag{ flag: .int @@ -54,7 +57,37 @@ fn main() { cli.Flag{ flag: .bool name: 'brief' - description: 'Brief output, print plain easy to parse hashes and filenames only' + description: 'Brief output, print plain easy to parse hashes and filenames only.' + }, + cli.Flag{ + flag: .bool + name: 'json' + description: 'Print output in JSON format.' + }, + cli.Flag{ + flag: .string_array + name: 'exclude' + description: 'Glob pattern to exclude files and directories [can be passed multiple times]' + }, + cli.Flag{ + flag: .bool + name: 'skip-empty' + description: 'Skip empty files.' + }, + cli.Flag{ + flag: .string + name: 'max-size' + description: 'Maximum file size in bytes. Files larger than this will be skipped.' + }, + cli.Flag{ + flag: .bool + name: 'remove' + description: 'Remove duplicates.' + }, + cli.Flag{ + flag: .bool + name: 'prompt' + description: 'Prompt before every removal.' }, ] } @@ -63,26 +96,70 @@ fn main() { } fn find(cmd cli.Command) ! { - hash_algo := HashAlgo.from_string(cmd.flags.get_string('hash')!) or { HashAlgo.md5 } + hash_fn := HashFn.from_string(cmd.flags.get_string('hash')!) 
or { HashFn.fnv1a } nr_threads := cmd.flags.get_int('threads')! - brief := cmd.flags.get_bool('brief')! + brief_output := cmd.flags.get_bool('brief')! + json_output := cmd.flags.get_bool('json')! + exclude_globs := cmd.flags.get_strings('exclude')! + skip_empty := cmd.flags.get_bool('skip-empty')! + max_size := cmd.flags.get_string('max-size')!.u64() + remove := cmd.flags.get_bool('remove')! + prompt := cmd.flags.get_bool('prompt')! + if nr_threads <= 0 { + eprintln('threads number cannot be zero or negative') + exit(1) + } mut search_paths := ['.'] if cmd.args.len > 0 { search_paths = cmd.args.clone() } // collect full list of files absolute paths mut file_paths := &[]string{} - for search_path in search_paths { + outer: for search_path in search_paths { + if search_path != '.' { + for glob in exclude_globs { + if search_path.match_glob(glob) { + continue outer + } + } + } + if !os.is_dir(search_path) { + eprintln('${search_path} is not a directory, skip') + continue + } norm_path := os.norm_path(os.abs_path(os.expand_tilde_to_home(search_path))) - os.walk(norm_path, fn [mut file_paths] (file string) { + os.walk(norm_path, fn [mut file_paths, exclude_globs, skip_empty, max_size] (file string) { + for glob in exclude_globs { + if file.match_glob(glob) || os.file_name(file).match_glob(glob) { + return + } + } + mut file_size := u64(0) + if skip_empty || max_size > 0 { + file_size = os.file_size(file) + } + if skip_empty && file_size == 0 { + return + } + if max_size > 0 && file_size > max_size { + return + } file_paths << file }) } + if file_paths.len == 0 { + eprintln('nothing to do, exiting') + exit(1) + } eprintln('found ${file_paths.len} files, processing...') // split the files list into approximately equal parts by the number of threads mut parts := [][]string{} if nr_threads == 1 { parts = [*file_paths] + } else if nr_threads >= file_paths.len { + for path in file_paths { + parts << [path] + } } else { parts = arrays.chunk(*file_paths, file_paths.len / 
nr_threads) mut idx := 0 @@ -97,8 +174,8 @@ fn find(cmd cli.Command) ! { } // calculate hashsums in parallel mut threads := []thread map[string]string{} - for i := 0; i < nr_threads; i++ { - threads << spawn calculate_hashsums(parts[i], hash_algo) + for i := 0; i < parts.len; i++ { + threads << spawn calculate_hashsums(i, parts[i], hash_fn) } calculated := threads.wait() mut sums := map[string]string{} @@ -111,20 +188,74 @@ fn find(cmd cli.Command) ! { eprintln(term.bold('no duplicates found')) exit(0) } - for hash, files in dups { - if brief { + if brief_output { + for hash, files in dups { for file in files { println(hash + ':' + file) } - } else { + } + } else if json_output { + mut output := OutputSchema{ + hash_fn: hash_fn.str() + } + for hash, files in dups { + mut entries := []FileEntry{} + for file in files { + stat := os.stat(file)! + entries << FileEntry{ + path: file + size: stat.size + mtime: time.unix(stat.mtime) + } + } + output.data << Duplicate{ + hash: hash + total: entries.len + files: entries + } + } + println(json.encode[OutputSchema](output)) + } else { + for hash, files in dups { println(term.bold(hash)) for file in files { stat := os.stat(file)! - println('\t${time.unix(stat.mtime)}\t${file}') + println('\t${time.unix(stat.mtime)} ${stat.size:-10} ${file}') } } } - exit(2) + if remove { + for _, files in dups { + for file in files[1..] { + if prompt { + answer := os.input("delete file '${file}'? (y/n): ") + if answer != 'y' { + eprintln('skipped ${file}') + continue + } + } + os.rm(file)! 
+ } + } + } +} + +struct OutputSchema { + hash_fn string +mut: + data []Duplicate +} + +struct Duplicate { + hash string + total int + files []FileEntry +} + +struct FileEntry { + path string + size u64 + mtime time.Time } fn find_duplicates(files map[string]string) map[string][]string { @@ -146,22 +277,30 @@ fn find_duplicates(files map[string]string) map[string][]string { return dups } -enum HashAlgo { +enum HashFn { blake3 + crc32 + fnv1a sha1 sha256 md5 } -fn hashsum(file string, algo HashAlgo) string { +fn hashsum(file string, hash_fn HashFn) string { file_bytes := os.read_bytes(file) or { []u8{len: 1} } defer { unsafe { file_bytes.free() } } - match algo { + match hash_fn { .blake3 { return blake3.sum256(file_bytes).hex() } + .crc32 { + return crc32.sum(file_bytes).hex() + } + .fnv1a { + return fnv1a.sum64(file_bytes).hex() + } .sha1 { return sha1.sum(file_bytes).hex() } @@ -174,10 +313,11 @@ fn hashsum(file string, algo HashAlgo) string { } } -fn calculate_hashsums(files []string, hash HashAlgo) map[string]string { +fn calculate_hashsums(tid int, files []string, hash_fn HashFn) map[string]string { + eprintln('thread ${tid} started with queue of ${files.len} files') mut sums := map[string]string{} for file in files { - sums[file] = hashsum(file, hash) + sums[file] = hashsum(file, hash_fn) } return sums } diff --git a/v.mod b/v.mod index f2452a0..01f7f7d 100644 --- a/v.mod +++ b/v.mod @@ -1,7 +1,7 @@ Module { name: 'fdup' description: 'File duplicates finder' - version: '0.1.0' + version: '0.2.0' license: 'GPL-3.0-or-later' dependencies: [] }