diff --git a/.gitignore b/.gitignore
index 231a22e..9959f8f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 # Binaries for programs and plugins
 main
-fdup*
+fdup
+fdup-*
 *.exe
 *.exe~
 *.so
diff --git a/fdup.v b/fdup.v
new file mode 100644
index 0000000..6195baa
--- /dev/null
+++ b/fdup.v
@@ -0,0 +1,211 @@
+// fdup - file duplicates finder
+// Copyright (C) 2025 Ge
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the
+// Free Software Foundation, either version 3 of the License, or (at your
+// option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program. If not, see <https://www.gnu.org/licenses/>.
+
+module main
+
+import os
+import cli
+import arrays
+import maps
+import crypto.blake3
+import crypto.sha1
+import crypto.sha256
+import crypto.md5
+import runtime
+import term
+import time
+
+// main declares the command line interface (name, flags, handler) and runs
+// it against the process arguments.
+fn main() {
+	mut app := cli.Command{
+		name: 'fdup'
+		description: 'File duplicates finder'
+		version: '0.1.0'
+		usage: '[path...]'
+		execute: find
+		defaults: struct {
+			man: false
+		}
+		flags: [
+			cli.Flag{
+				flag: .string
+				name: 'hash'
+				description: 'Set hashing algorithm: blake3, sha1, sha256, md5 [default: md5]'
+				default_value: ['md5']
+			},
+			cli.Flag{
+				flag: .int
+				name: 'threads'
+				description: 'Number of threads used for calculating hash sums [default: number of CPU cores]'
+				default_value: [runtime.nr_cpus().str()]
+			},
+			cli.Flag{
+				flag: .bool
+				name: 'brief'
+				description: 'Brief output, print plain easy to parse hashes and filenames only'
+			},
+		]
+	}
+	app.setup()
+	app.parse(os.args)
+}
+
+// find is the command handler. It collects every file below the given search
+// paths (the current directory when none is given), calculates the hash sums
+// in parallel and prints each group of files sharing the same hash sum.
+//
+// The process exits with status 0 when no duplicates are found and with
+// status 2 after duplicates have been printed. An error is returned when a
+// flag cannot be read, when --threads is less than 1 or when a duplicate
+// cannot be stat'ed while printing.
+fn find(cmd cli.Command) ! {
+	hash_name := cmd.flags.get_string('hash')!
+	hash_algo := HashAlgo.from_string(hash_name) or {
+		// keep the md5 fallback, but do not silently hide a mistyped name
+		eprintln('unknown hashing algorithm "${hash_name}", using md5')
+		HashAlgo.md5
+	}
+	nr_threads := cmd.flags.get_int('threads')!
+	if nr_threads < 1 {
+		return error('number of threads must be at least 1, got ${nr_threads}')
+	}
+	brief := cmd.flags.get_bool('brief')!
+	mut search_paths := ['.']
+	if cmd.args.len > 0 {
+		search_paths = cmd.args.clone()
+	}
+	// collect full list of files absolute paths; file_paths is a reference so
+	// that the appends made inside the closure are visible here
+	mut file_paths := &[]string{}
+	for search_path in search_paths {
+		norm_path := os.norm_path(os.abs_path(os.expand_tilde_to_home(search_path)))
+		os.walk(norm_path, fn [mut file_paths] (file string) {
+			file_paths << file
+		})
+	}
+	eprintln('found ${file_paths.len} files, processing...')
+	// calculate hashsums in parallel, one thread per part of the files list
+	parts := split_into_parts(*file_paths, nr_threads)
+	mut threads := []thread map[string]string{}
+	for part in parts {
+		threads << spawn calculate_hashsums(part, hash_algo)
+	}
+	calculated := threads.wait()
+	mut sums := map[string]string{}
+	for s in calculated {
+		maps.merge_in_place(mut sums, s)
+	}
+	// find and pretty-print duplicates
+	dups := find_duplicates(sums)
+	if dups.len == 0 {
+		eprintln(term.bold('no duplicates found'))
+		exit(0)
+	}
+	for hash, files in dups {
+		if brief {
+			for file in files {
+				println(hash + ':' + file)
+			}
+		} else {
+			println(term.bold(hash))
+			for file in files {
+				stat := os.stat(file)!
+				println('\t${time.unix(stat.mtime)}\t${file}')
+			}
+		}
+	}
+	exit(2)
+}
+
+// split_into_parts splits files into at most nr_parts consecutive chunks of
+// approximately equal size. No chunk is empty; an empty list is returned when
+// there are no files or nr_parts is less than 1.
+fn split_into_parts(files []string, nr_parts int) [][]string {
+	if files.len == 0 || nr_parts < 1 {
+		return [][]string{}
+	}
+	// ceiling division keeps the chunk size >= 1 and the number of chunks
+	// <= nr_parts, so every file lands in exactly one chunk
+	chunk_size := (files.len + nr_parts - 1) / nr_parts
+	return arrays.chunk(files, chunk_size)
+}
+
+// find_duplicates groups the file paths by hash sum. files maps a file path
+// to its hash sum; the result maps a hash sum to the paths sharing it and
+// contains only the hash sums shared by two or more files.
+fn find_duplicates(files map[string]string) map[string][]string {
+	// group all paths by hash sum in a single pass
+	mut by_hash := map[string][]string{}
+	for file, hash in files {
+		by_hash[hash] << file
+	}
+	// keep only the groups that actually contain duplicates
+	mut dups := map[string][]string{}
+	for hash, paths in by_hash {
+		if paths.len > 1 {
+			dups[hash] = paths.clone()
+		}
+	}
+	return dups
+}
+
+// HashAlgo lists the hashing algorithms selectable with the --hash flag.
+enum HashAlgo {
+	blake3
+	sha1
+	sha256
+	md5
+}
+
+// hashsum reads the whole file into memory and returns the hex encoded hash
+// sum of its content calculated with algo. An error is returned when the file
+// cannot be read.
+fn hashsum(file string, algo HashAlgo) !string {
+	file_bytes := os.read_bytes(file)!
+	defer {
+		unsafe { file_bytes.free() }
+	}
+	match algo {
+		.blake3 {
+			return blake3.sum256(file_bytes).hex()
+		}
+		.sha1 {
+			return sha1.sum(file_bytes).hex()
+		}
+		.sha256 {
+			return sha256.sum(file_bytes).hex()
+		}
+		.md5 {
+			return md5.sum(file_bytes).hex()
+		}
+	}
+}
+
+// calculate_hashsums returns a map from file path to hash sum for files.
+// Files that cannot be read are reported on stderr and left out, so they are
+// never reported as duplicates of each other.
+fn calculate_hashsums(files []string, hash HashAlgo) map[string]string {
+	mut sums := map[string]string{}
+	for file in files {
+		sum := hashsum(file, hash) or {
+			eprintln('skipping ${file}: ${err}')
+			continue
+		}
+		sums[file] = sum
+	}
+	return sums
+}