all: various improvements
This commit is contained in:
parent
d1ede3f528
commit
e607b5c6bc
14
README.md
14
README.md
@ -1,6 +1,6 @@
|
|||||||
# fdup
|
# fdup
|
||||||
|
|
||||||
The dumb tool to find duplicate files by it's hash sums.
|
The dumb tool for finding duplicate files by their hash sums.
|
||||||
|
|
||||||
Compile it with `-prod` for better performance:
|
Compile it with `-prod` for better performance:
|
||||||
|
|
||||||
@ -13,14 +13,20 @@ Look at releases page for prebuilt executables.
|
|||||||
# Synopsis
|
# Synopsis
|
||||||
|
|
||||||
```
|
```
|
||||||
Usage: fdup [flags] [commands] [path...]
|
Usage: fdup [flags] [commands] [DIR...]
|
||||||
|
|
||||||
File duplicates finder
|
File duplicates finder
|
||||||
|
|
||||||
Flags:
|
Flags:
|
||||||
-hash Set hashing algorythm: blake3, sha1, sha256, md5 [default: md5]
|
-hash Hashing algorithm: blake3, crc32, fnv1a, sha1, sha256, md5 [default: fnv1a]
|
||||||
-threads Number of threads used for calculating hash sums [default: number of CPU cores]
|
-threads Number of threads used for calculating hash sums [default: number of CPU cores]
|
||||||
-brief Brief output, print plain easy to parse hashes and filenames only
|
-brief Brief output, print plain easy to parse hashes and filenames only.
|
||||||
|
-json Print output in JSON format.
|
||||||
|
-exclude Glob pattern to exclude files and directories [can be passed multiple times]
|
||||||
|
-skip-empty Skip empty files.
|
||||||
|
-max-size Maximum file size in bytes. Files larger than this will be skipped.
|
||||||
|
-remove Remove duplicates.
|
||||||
|
-prompt Prompt before every removal.
|
||||||
-help Prints help information.
|
-help Prints help information.
|
||||||
-version Prints version information.
|
-version Prints version information.
|
||||||
|
|
||||||
|
178
fdup.v
178
fdup.v
@ -20,6 +20,8 @@ import os
|
|||||||
import cli
|
import cli
|
||||||
import arrays
|
import arrays
|
||||||
import maps
|
import maps
|
||||||
|
import hash.crc32
|
||||||
|
import hash.fnv1a
|
||||||
import crypto.blake3
|
import crypto.blake3
|
||||||
import crypto.sha1
|
import crypto.sha1
|
||||||
import crypto.sha256
|
import crypto.sha256
|
||||||
@ -27,13 +29,14 @@ import crypto.md5
|
|||||||
import runtime
|
import runtime
|
||||||
import term
|
import term
|
||||||
import time
|
import time
|
||||||
|
import x.json2 as json
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
mut app := cli.Command{
|
mut app := cli.Command{
|
||||||
name: 'fdup'
|
name: 'fdup'
|
||||||
description: 'File duplicates finder'
|
description: 'File duplicates finder'
|
||||||
version: '0.1.0'
|
version: '0.2.0'
|
||||||
usage: '[path...]'
|
usage: '[DIR...]'
|
||||||
execute: find
|
execute: find
|
||||||
defaults: struct {
|
defaults: struct {
|
||||||
man: false
|
man: false
|
||||||
@ -42,8 +45,8 @@ fn main() {
|
|||||||
cli.Flag{
|
cli.Flag{
|
||||||
flag: .string
|
flag: .string
|
||||||
name: 'hash'
|
name: 'hash'
|
||||||
description: 'Set hashing algorythm: blake3, sha1, sha256, md5 [default: md5]'
|
description: 'Hashing algorythm: blake3, crc32, fnv1a, sha1, sha256, md5 [default: fnv1a]'
|
||||||
default_value: ['md5']
|
default_value: ['fnv1a']
|
||||||
},
|
},
|
||||||
cli.Flag{
|
cli.Flag{
|
||||||
flag: .int
|
flag: .int
|
||||||
@ -54,7 +57,37 @@ fn main() {
|
|||||||
cli.Flag{
|
cli.Flag{
|
||||||
flag: .bool
|
flag: .bool
|
||||||
name: 'brief'
|
name: 'brief'
|
||||||
description: 'Brief output, print plain easy to parse hashes and filenames only'
|
description: 'Brief output, print plain easy to parse hashes and filenames only.'
|
||||||
|
},
|
||||||
|
cli.Flag{
|
||||||
|
flag: .bool
|
||||||
|
name: 'json'
|
||||||
|
description: 'Print output in JSON format.'
|
||||||
|
},
|
||||||
|
cli.Flag{
|
||||||
|
flag: .string_array
|
||||||
|
name: 'exclude'
|
||||||
|
description: 'Glob pattern to exclude files and directories [can be passed multiple times]'
|
||||||
|
},
|
||||||
|
cli.Flag{
|
||||||
|
flag: .bool
|
||||||
|
name: 'skip-empty'
|
||||||
|
description: 'Skip empty files.'
|
||||||
|
},
|
||||||
|
cli.Flag{
|
||||||
|
flag: .string
|
||||||
|
name: 'max-size'
|
||||||
|
description: 'Maximum file size in bytes. Files larger than this will be skipped.'
|
||||||
|
},
|
||||||
|
cli.Flag{
|
||||||
|
flag: .bool
|
||||||
|
name: 'remove'
|
||||||
|
description: 'Remove duplicates.'
|
||||||
|
},
|
||||||
|
cli.Flag{
|
||||||
|
flag: .bool
|
||||||
|
name: 'prompt'
|
||||||
|
description: 'Prompt before every removal.'
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@ -63,26 +96,70 @@ fn main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn find(cmd cli.Command) ! {
|
fn find(cmd cli.Command) ! {
|
||||||
hash_algo := HashAlgo.from_string(cmd.flags.get_string('hash')!) or { HashAlgo.md5 }
|
hash_fn := HashFn.from_string(cmd.flags.get_string('hash')!) or { HashFn.fnv1a }
|
||||||
nr_threads := cmd.flags.get_int('threads')!
|
nr_threads := cmd.flags.get_int('threads')!
|
||||||
brief := cmd.flags.get_bool('brief')!
|
brief_output := cmd.flags.get_bool('brief')!
|
||||||
|
json_output := cmd.flags.get_bool('json')!
|
||||||
|
exclude_globs := cmd.flags.get_strings('exclude')!
|
||||||
|
skip_empty := cmd.flags.get_bool('skip-empty')!
|
||||||
|
max_size := cmd.flags.get_string('max-size')!.u64()
|
||||||
|
remove := cmd.flags.get_bool('remove')!
|
||||||
|
prompt := cmd.flags.get_bool('prompt')!
|
||||||
|
if nr_threads <= 0 {
|
||||||
|
eprintln('threads number cannot be zero or negative')
|
||||||
|
exit(1)
|
||||||
|
}
|
||||||
mut search_paths := ['.']
|
mut search_paths := ['.']
|
||||||
if cmd.args.len > 0 {
|
if cmd.args.len > 0 {
|
||||||
search_paths = cmd.args.clone()
|
search_paths = cmd.args.clone()
|
||||||
}
|
}
|
||||||
// collect full list of files absolute paths
|
// collect full list of files absolute paths
|
||||||
mut file_paths := &[]string{}
|
mut file_paths := &[]string{}
|
||||||
for search_path in search_paths {
|
outer: for search_path in search_paths {
|
||||||
|
if search_path != '.' {
|
||||||
|
for glob in exclude_globs {
|
||||||
|
if search_path.match_glob(glob) {
|
||||||
|
continue outer
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !os.is_dir(search_path) {
|
||||||
|
eprintln('${search_path} is not a directory, skip')
|
||||||
|
continue
|
||||||
|
}
|
||||||
norm_path := os.norm_path(os.abs_path(os.expand_tilde_to_home(search_path)))
|
norm_path := os.norm_path(os.abs_path(os.expand_tilde_to_home(search_path)))
|
||||||
os.walk(norm_path, fn [mut file_paths] (file string) {
|
os.walk(norm_path, fn [mut file_paths, exclude_globs, skip_empty, max_size] (file string) {
|
||||||
|
for glob in exclude_globs {
|
||||||
|
if file.match_glob(glob) || os.file_name(file).match_glob(glob) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mut file_size := u64(0)
|
||||||
|
if skip_empty || max_size > 0 {
|
||||||
|
file_size = os.file_size(file)
|
||||||
|
}
|
||||||
|
if skip_empty && file_size == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if max_size > 0 && file_size > max_size {
|
||||||
|
return
|
||||||
|
}
|
||||||
file_paths << file
|
file_paths << file
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
if file_paths.len == 0 {
|
||||||
|
eprintln('nothing to do, exiting')
|
||||||
|
exit(1)
|
||||||
|
}
|
||||||
eprintln('found ${file_paths.len} files, processing...')
|
eprintln('found ${file_paths.len} files, processing...')
|
||||||
// split the files list into approximately equal parts by the number of threads
|
// split the files list into approximately equal parts by the number of threads
|
||||||
mut parts := [][]string{}
|
mut parts := [][]string{}
|
||||||
if nr_threads == 1 {
|
if nr_threads == 1 {
|
||||||
parts = [*file_paths]
|
parts = [*file_paths]
|
||||||
|
} else if nr_threads >= file_paths.len {
|
||||||
|
for path in file_paths {
|
||||||
|
parts << [path]
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
parts = arrays.chunk(*file_paths, file_paths.len / nr_threads)
|
parts = arrays.chunk(*file_paths, file_paths.len / nr_threads)
|
||||||
mut idx := 0
|
mut idx := 0
|
||||||
@ -97,8 +174,8 @@ fn find(cmd cli.Command) ! {
|
|||||||
}
|
}
|
||||||
// calculate hashsums in parallel
|
// calculate hashsums in parallel
|
||||||
mut threads := []thread map[string]string{}
|
mut threads := []thread map[string]string{}
|
||||||
for i := 0; i < nr_threads; i++ {
|
for i := 0; i < parts.len; i++ {
|
||||||
threads << spawn calculate_hashsums(parts[i], hash_algo)
|
threads << spawn calculate_hashsums(i, parts[i], hash_fn)
|
||||||
}
|
}
|
||||||
calculated := threads.wait()
|
calculated := threads.wait()
|
||||||
mut sums := map[string]string{}
|
mut sums := map[string]string{}
|
||||||
@ -111,20 +188,74 @@ fn find(cmd cli.Command) ! {
|
|||||||
eprintln(term.bold('no duplicates found'))
|
eprintln(term.bold('no duplicates found'))
|
||||||
exit(0)
|
exit(0)
|
||||||
}
|
}
|
||||||
|
if brief_output {
|
||||||
for hash, files in dups {
|
for hash, files in dups {
|
||||||
if brief {
|
|
||||||
for file in files {
|
for file in files {
|
||||||
println(hash + ':' + file)
|
println(hash + ':' + file)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
} else if json_output {
|
||||||
|
mut output := OutputSchema{
|
||||||
|
hash_fn: hash_fn.str()
|
||||||
|
}
|
||||||
|
for hash, files in dups {
|
||||||
|
mut entries := []FileEntry{}
|
||||||
|
for file in files {
|
||||||
|
stat := os.stat(file)!
|
||||||
|
entries << FileEntry{
|
||||||
|
path: file
|
||||||
|
size: stat.size
|
||||||
|
mtime: time.unix(stat.mtime)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
output.data << Duplicate{
|
||||||
|
hash: hash
|
||||||
|
total: entries.len
|
||||||
|
files: entries
|
||||||
|
}
|
||||||
|
}
|
||||||
|
println(json.encode[OutputSchema](output))
|
||||||
} else {
|
} else {
|
||||||
|
for hash, files in dups {
|
||||||
println(term.bold(hash))
|
println(term.bold(hash))
|
||||||
for file in files {
|
for file in files {
|
||||||
stat := os.stat(file)!
|
stat := os.stat(file)!
|
||||||
println('\t${time.unix(stat.mtime)}\t${file}')
|
println('\t${time.unix(stat.mtime)} ${stat.size:-10} ${file}')
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
exit(2)
|
if remove {
|
||||||
|
for _, files in dups {
|
||||||
|
for file in files[1..] {
|
||||||
|
if prompt {
|
||||||
|
answer := os.input("delete file '${file}'? (y/n): ")
|
||||||
|
if answer != 'y' {
|
||||||
|
eprintln('skipped ${file}')
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
os.rm(file)!
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct OutputSchema {
|
||||||
|
hash_fn string
|
||||||
|
mut:
|
||||||
|
data []Duplicate
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Duplicate {
|
||||||
|
hash string
|
||||||
|
total int
|
||||||
|
files []FileEntry
|
||||||
|
}
|
||||||
|
|
||||||
|
struct FileEntry {
|
||||||
|
path string
|
||||||
|
size u64
|
||||||
|
mtime time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
fn find_duplicates(files map[string]string) map[string][]string {
|
fn find_duplicates(files map[string]string) map[string][]string {
|
||||||
@ -146,22 +277,30 @@ fn find_duplicates(files map[string]string) map[string][]string {
|
|||||||
return dups
|
return dups
|
||||||
}
|
}
|
||||||
|
|
||||||
enum HashAlgo {
|
enum HashFn {
|
||||||
blake3
|
blake3
|
||||||
|
crc32
|
||||||
|
fnv1a
|
||||||
sha1
|
sha1
|
||||||
sha256
|
sha256
|
||||||
md5
|
md5
|
||||||
}
|
}
|
||||||
|
|
||||||
fn hashsum(file string, algo HashAlgo) string {
|
fn hashsum(file string, hash_fn HashFn) string {
|
||||||
file_bytes := os.read_bytes(file) or { []u8{len: 1} }
|
file_bytes := os.read_bytes(file) or { []u8{len: 1} }
|
||||||
defer {
|
defer {
|
||||||
unsafe { file_bytes.free() }
|
unsafe { file_bytes.free() }
|
||||||
}
|
}
|
||||||
match algo {
|
match hash_fn {
|
||||||
.blake3 {
|
.blake3 {
|
||||||
return blake3.sum256(file_bytes).hex()
|
return blake3.sum256(file_bytes).hex()
|
||||||
}
|
}
|
||||||
|
.crc32 {
|
||||||
|
return crc32.sum(file_bytes).hex()
|
||||||
|
}
|
||||||
|
.fnv1a {
|
||||||
|
return fnv1a.sum64(file_bytes).hex()
|
||||||
|
}
|
||||||
.sha1 {
|
.sha1 {
|
||||||
return sha1.sum(file_bytes).hex()
|
return sha1.sum(file_bytes).hex()
|
||||||
}
|
}
|
||||||
@ -174,10 +313,11 @@ fn hashsum(file string, algo HashAlgo) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn calculate_hashsums(files []string, hash HashAlgo) map[string]string {
|
fn calculate_hashsums(tid int, files []string, hash_fn HashFn) map[string]string {
|
||||||
|
eprintln('thread ${tid} started with queue of ${files.len} files')
|
||||||
mut sums := map[string]string{}
|
mut sums := map[string]string{}
|
||||||
for file in files {
|
for file in files {
|
||||||
sums[file] = hashsum(file, hash)
|
sums[file] = hashsum(file, hash_fn)
|
||||||
}
|
}
|
||||||
return sums
|
return sums
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user