Improved velocity with parallel processing

This commit is contained in:
Kevin Veen-Birkenbach 2023-11-14 15:26:47 +01:00
parent 5c4afe2655
commit e31ee231fa

49
main.py
View File

@ -1,7 +1,8 @@
import os
import argparse
import hashlib
from collections import defaultdict
from collections import defaultdict, Counter
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
def md5sum(filename):
@ -11,33 +12,33 @@ def md5sum(filename):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def file_hashing_job(path):
file_hash = md5sum(path)
if file_hash:
return file_hash, path
def find_duplicates(directories, file_type):
hashes = defaultdict(list)
for directory in directories:
for root, dirs, files in tqdm(os.walk(directory, followlinks=False), desc=f"Indexing files of {directory}", unit="Directories"):
for filename in files:
if file_type and not filename.endswith(file_type):
continue
path = os.path.join(root, filename)
if not os.path.islink(path):
file_hash = md5sum(path)
hashes[file_hash].append(path)
with ProcessPoolExecutor() as executor:
futures = []
for directory in directories:
for root, dirs, files in tqdm(os.walk(directory, followlinks=False), desc=f"Indexing files of {directory}", unit="directory"):
for filename in files:
if file_type and not filename.endswith(file_type):
continue
path = os.path.join(root, filename)
if not os.path.islink(path):
futures.append(executor.submit(file_hashing_job, path))
hashes = defaultdict(list)
for future in tqdm(futures, desc="Processing files", unit="file"):
result = future.result()
if result:
file_hash, path = result
hashes[file_hash].append(path)
return {file_hash: paths for file_hash, paths in hashes.items() if len(paths) > 1}
def handle_file_modification(original_file, duplicate_file, modification):
if modification == 'delete':
print(f"Deleting {duplicate_file}")
os.remove(duplicate_file)
elif modification == 'hardlink':
os.remove(duplicate_file)
os.link(original_file, duplicate_file)
print(f"Replaced {duplicate_file} with a hardlink to {original_file}")
elif modification == 'symlink':
os.remove(duplicate_file)
os.symlink(original_file, duplicate_file)
print(f"Replaced {duplicate_file} with a symlink to {original_file}")
def handle_modification(files, modification, mode, apply_to):
original_file = next((f for f in files if not f.startswith(tuple(apply_to))), files[0])
for duplicate_file in files: