all: various improvements

This commit is contained in:
ge 2025-02-15 00:22:34 +03:00
parent d1ede3f528
commit e607b5c6bc
3 changed files with 172 additions and 26 deletions

View File

@ -1,6 +1,6 @@
# fdup # fdup
The dumb tool to find duplicate files by it's hash sums. The dumb tool for finding duplicate files by their hash sums.
Compile it with `-prod` for better performance: Compile it with `-prod` for better performance:
@ -13,14 +13,20 @@ Look at releases page for prebuilt executables.
# Synopsis # Synopsis
``` ```
Usage: fdup [flags] [commands] [path...] Usage: fdup [flags] [commands] [DIR...]
File duplicates finder File duplicates finder
Flags: Flags:
-hash Set hashing algorythm: blake3, sha1, sha256, md5 [default: md5] -hash Hashing algorythm: blake3, crc32, fnv1a, sha1, sha256, md5 [default: fnv1a]
-threads Number of threads used for calculating hash sums [default: number of CPU cores] -threads Number of threads used for calculating hash sums [default: number of CPU cores]
-brief Brief output, print plain easy to parse hashes and filenames only -brief Brief output, print plain easy to parse hashes and filenames only.
-json Print output in JSON format.
-exclude Glob pattern to exclude files and directories [can be passed multiple times]
-skip-empty Skip empty files.
-max-size Maximum file size in bytes. Files larger than this will be skipped.
-remove Remove duplicates.
-prompt Prompt before every removal.
-help Prints help information. -help Prints help information.
-version Prints version information. -version Prints version information.

178
fdup.v
View File

@ -20,6 +20,8 @@ import os
import cli import cli
import arrays import arrays
import maps import maps
import hash.crc32
import hash.fnv1a
import crypto.blake3 import crypto.blake3
import crypto.sha1 import crypto.sha1
import crypto.sha256 import crypto.sha256
@ -27,13 +29,14 @@ import crypto.md5
import runtime import runtime
import term import term
import time import time
import x.json2 as json
fn main() { fn main() {
mut app := cli.Command{ mut app := cli.Command{
name: 'fdup' name: 'fdup'
description: 'File duplicates finder' description: 'File duplicates finder'
version: '0.1.0' version: '0.2.0'
usage: '[path...]' usage: '[DIR...]'
execute: find execute: find
defaults: struct { defaults: struct {
man: false man: false
@ -42,8 +45,8 @@ fn main() {
cli.Flag{ cli.Flag{
flag: .string flag: .string
name: 'hash' name: 'hash'
description: 'Set hashing algorythm: blake3, sha1, sha256, md5 [default: md5]' description: 'Hashing algorythm: blake3, crc32, fnv1a, sha1, sha256, md5 [default: fnv1a]'
default_value: ['md5'] default_value: ['fnv1a']
}, },
cli.Flag{ cli.Flag{
flag: .int flag: .int
@ -54,7 +57,37 @@ fn main() {
cli.Flag{ cli.Flag{
flag: .bool flag: .bool
name: 'brief' name: 'brief'
description: 'Brief output, print plain easy to parse hashes and filenames only' description: 'Brief output, print plain easy to parse hashes and filenames only.'
},
cli.Flag{
flag: .bool
name: 'json'
description: 'Print output in JSON format.'
},
cli.Flag{
flag: .string_array
name: 'exclude'
description: 'Glob pattern to exclude files and directories [can be passed multiple times]'
},
cli.Flag{
flag: .bool
name: 'skip-empty'
description: 'Skip empty files.'
},
cli.Flag{
flag: .string
name: 'max-size'
description: 'Maximum file size in bytes. Files larger than this will be skipped.'
},
cli.Flag{
flag: .bool
name: 'remove'
description: 'Remove duplicates.'
},
cli.Flag{
flag: .bool
name: 'prompt'
description: 'Prompt before every removal.'
}, },
] ]
} }
@ -63,26 +96,70 @@ fn main() {
} }
fn find(cmd cli.Command) ! { fn find(cmd cli.Command) ! {
hash_algo := HashAlgo.from_string(cmd.flags.get_string('hash')!) or { HashAlgo.md5 } hash_fn := HashFn.from_string(cmd.flags.get_string('hash')!) or { HashFn.fnv1a }
nr_threads := cmd.flags.get_int('threads')! nr_threads := cmd.flags.get_int('threads')!
brief := cmd.flags.get_bool('brief')! brief_output := cmd.flags.get_bool('brief')!
json_output := cmd.flags.get_bool('json')!
exclude_globs := cmd.flags.get_strings('exclude')!
skip_empty := cmd.flags.get_bool('skip-empty')!
max_size := cmd.flags.get_string('max-size')!.u64()
remove := cmd.flags.get_bool('remove')!
prompt := cmd.flags.get_bool('prompt')!
if nr_threads <= 0 {
eprintln('threads number cannot be zero or negative')
exit(1)
}
mut search_paths := ['.'] mut search_paths := ['.']
if cmd.args.len > 0 { if cmd.args.len > 0 {
search_paths = cmd.args.clone() search_paths = cmd.args.clone()
} }
// collect full list of files absolute paths // collect full list of files absolute paths
mut file_paths := &[]string{} mut file_paths := &[]string{}
for search_path in search_paths { outer: for search_path in search_paths {
if search_path != '.' {
for glob in exclude_globs {
if search_path.match_glob(glob) {
continue outer
}
}
}
if !os.is_dir(search_path) {
eprintln('${search_path} is not a directory, skip')
continue
}
norm_path := os.norm_path(os.abs_path(os.expand_tilde_to_home(search_path))) norm_path := os.norm_path(os.abs_path(os.expand_tilde_to_home(search_path)))
os.walk(norm_path, fn [mut file_paths] (file string) { os.walk(norm_path, fn [mut file_paths, exclude_globs, skip_empty, max_size] (file string) {
for glob in exclude_globs {
if file.match_glob(glob) || os.file_name(file).match_glob(glob) {
return
}
}
mut file_size := u64(0)
if skip_empty || max_size > 0 {
file_size = os.file_size(file)
}
if skip_empty && file_size == 0 {
return
}
if max_size > 0 && file_size > max_size {
return
}
file_paths << file file_paths << file
}) })
} }
if file_paths.len == 0 {
eprintln('nothing to do, exiting')
exit(1)
}
eprintln('found ${file_paths.len} files, processing...') eprintln('found ${file_paths.len} files, processing...')
// split the files list into approximately equal parts by the number of threads // split the files list into approximately equal parts by the number of threads
mut parts := [][]string{} mut parts := [][]string{}
if nr_threads == 1 { if nr_threads == 1 {
parts = [*file_paths] parts = [*file_paths]
} else if nr_threads >= file_paths.len {
for path in file_paths {
parts << [path]
}
} else { } else {
parts = arrays.chunk(*file_paths, file_paths.len / nr_threads) parts = arrays.chunk(*file_paths, file_paths.len / nr_threads)
mut idx := 0 mut idx := 0
@ -97,8 +174,8 @@ fn find(cmd cli.Command) ! {
} }
// calculate hashsums in parallel // calculate hashsums in parallel
mut threads := []thread map[string]string{} mut threads := []thread map[string]string{}
for i := 0; i < nr_threads; i++ { for i := 0; i < parts.len; i++ {
threads << spawn calculate_hashsums(parts[i], hash_algo) threads << spawn calculate_hashsums(i, parts[i], hash_fn)
} }
calculated := threads.wait() calculated := threads.wait()
mut sums := map[string]string{} mut sums := map[string]string{}
@ -111,20 +188,74 @@ fn find(cmd cli.Command) ! {
eprintln(term.bold('no duplicates found')) eprintln(term.bold('no duplicates found'))
exit(0) exit(0)
} }
if brief_output {
for hash, files in dups { for hash, files in dups {
if brief {
for file in files { for file in files {
println(hash + ':' + file) println(hash + ':' + file)
} }
}
} else if json_output {
mut output := OutputSchema{
hash_fn: hash_fn.str()
}
for hash, files in dups {
mut entries := []FileEntry{}
for file in files {
stat := os.stat(file)!
entries << FileEntry{
path: file
size: stat.size
mtime: time.unix(stat.mtime)
}
}
output.data << Duplicate{
hash: hash
total: entries.len
files: entries
}
}
println(json.encode[OutputSchema](output))
} else { } else {
for hash, files in dups {
println(term.bold(hash)) println(term.bold(hash))
for file in files { for file in files {
stat := os.stat(file)! stat := os.stat(file)!
println('\t${time.unix(stat.mtime)}\t${file}') println('\t${time.unix(stat.mtime)} ${stat.size:-10} ${file}')
} }
} }
} }
exit(2) if remove {
for _, files in dups {
for file in files[1..] {
if prompt {
answer := os.input("delete file '${file}'? (y/n): ")
if answer != 'y' {
eprintln('skipped ${file}')
continue
}
}
os.rm(file)!
}
}
}
}
// OutputSchema is the top-level document that `find` prints when the
// `json` flag is set. It is serialized with json.encode.
struct OutputSchema {
// name of the selected hash function, filled from hash_fn.str()
hash_fn string
mut:
// one Duplicate per hash in the duplicates map; appended to while
// `find` iterates over the duplicates, hence declared mutable
data []Duplicate
}
// Duplicate describes one group of files that share the same hash sum
// in the JSON output.
struct Duplicate {
// hash sum shared by every file in the group
hash string
// number of files in the group; `find` sets it to the length of `files`
total int
// per-file details (path, size, mtime) for the group
files []FileEntry
}
struct FileEntry {
path string
size u64
mtime time.Time
} }
fn find_duplicates(files map[string]string) map[string][]string { fn find_duplicates(files map[string]string) map[string][]string {
@ -146,22 +277,30 @@ fn find_duplicates(files map[string]string) map[string][]string {
return dups return dups
} }
enum HashAlgo { enum HashFn {
blake3 blake3
crc32
fnv1a
sha1 sha1
sha256 sha256
md5 md5
} }
fn hashsum(file string, algo HashAlgo) string { fn hashsum(file string, hash_fn HashFn) string {
file_bytes := os.read_bytes(file) or { []u8{len: 1} } file_bytes := os.read_bytes(file) or { []u8{len: 1} }
defer { defer {
unsafe { file_bytes.free() } unsafe { file_bytes.free() }
} }
match algo { match hash_fn {
.blake3 { .blake3 {
return blake3.sum256(file_bytes).hex() return blake3.sum256(file_bytes).hex()
} }
.crc32 {
return crc32.sum(file_bytes).hex()
}
.fnv1a {
return fnv1a.sum64(file_bytes).hex()
}
.sha1 { .sha1 {
return sha1.sum(file_bytes).hex() return sha1.sum(file_bytes).hex()
} }
@ -174,10 +313,11 @@ fn hashsum(file string, algo HashAlgo) string {
} }
} }
fn calculate_hashsums(files []string, hash HashAlgo) map[string]string { fn calculate_hashsums(tid int, files []string, hash_fn HashFn) map[string]string {
eprintln('thread ${tid} started with queue of ${files.len} files')
mut sums := map[string]string{} mut sums := map[string]string{}
for file in files { for file in files {
sums[file] = hashsum(file, hash) sums[file] = hashsum(file, hash_fn)
} }
return sums return sums
} }

2
v.mod
View File

@ -1,7 +1,7 @@
Module { Module {
name: 'fdup' name: 'fdup'
description: 'File duplicates finder' description: 'File duplicates finder'
version: '0.1.0' version: '0.2.0'
license: 'GPL-3.0-or-later' license: 'GPL-3.0-or-later'
dependencies: [] dependencies: []
} }