From e31ee231fa9666645b42c5bae204a81d2ea5da4f Mon Sep 17 00:00:00 2001 From: Kevin Veen-Birkenbach Date: Tue, 14 Nov 2023 15:26:47 +0100 Subject: [PATCH] Improved velocity with parallel processing --- main.py | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/main.py b/main.py index 7d19a07..cbd44fb 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,8 @@ import os import argparse import hashlib -from collections import defaultdict +from collections import defaultdict, Counter +from concurrent.futures import ProcessPoolExecutor from tqdm import tqdm def md5sum(filename): @@ -11,33 +12,33 @@ def md5sum(filename): hash_md5.update(chunk) return hash_md5.hexdigest() +def file_hashing_job(path): + file_hash = md5sum(path) + if file_hash: + return file_hash, path + def find_duplicates(directories, file_type): - hashes = defaultdict(list) - for directory in directories: - for root, dirs, files in tqdm(os.walk(directory, followlinks=False), desc=f"Indexing files of {directory}", unit="Directories"): - for filename in files: - if file_type and not filename.endswith(file_type): - continue - path = os.path.join(root, filename) - if not os.path.islink(path): - file_hash = md5sum(path) - hashes[file_hash].append(path) + with ProcessPoolExecutor() as executor: + futures = [] + for directory in directories: + for root, dirs, files in tqdm(os.walk(directory, followlinks=False), desc=f"Indexing files of {directory}", unit="directory"): + for filename in files: + if file_type and not filename.endswith(file_type): + continue + path = os.path.join(root, filename) + if not os.path.islink(path): + futures.append(executor.submit(file_hashing_job, path)) + + hashes = defaultdict(list) + for future in tqdm(futures, desc="Processing files", unit="file"): + result = future.result() + if result: + file_hash, path = result + hashes[file_hash].append(path) + return {file_hash: paths for file_hash, paths in hashes.items() if len(paths) > 1} -def handle_file_modification(original_file, duplicate_file, modification): - if modification == 'delete': - print(f"Deleting {duplicate_file}") - os.remove(duplicate_file) - elif modification == 'hardlink': - os.remove(duplicate_file) - os.link(original_file, duplicate_file) - print(f"Replaced {duplicate_file} with a hardlink to {original_file}") - elif modification == 'symlink': - os.remove(duplicate_file) - os.symlink(original_file, duplicate_file) - print(f"Replaced {duplicate_file} with a symlink to {original_file}") - def handle_modification(files, modification, mode, apply_to): original_file = next((f for f in files if not f.startswith(tuple(apply_to))), files[0]) for duplicate_file in files: