Improved speed with parallel processing

This commit is contained in:
Kevin Veen-Birkenbach 2023-11-14 15:26:47 +01:00
parent 5c4afe2655
commit e31ee231fa

35
main.py
View File

@ -1,7 +1,8 @@
import os import os
import argparse import argparse
import hashlib import hashlib
from collections import defaultdict from collections import defaultdict, Counter
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm from tqdm import tqdm
def md5sum(filename): def md5sum(filename):
@ -11,33 +12,33 @@ def md5sum(filename):
hash_md5.update(chunk) hash_md5.update(chunk)
return hash_md5.hexdigest() return hash_md5.hexdigest()
def file_hashing_job(path):
    """Hash a single file for the worker pool.

    Returns a ``(hash, path)`` tuple, or ``None`` when ``md5sum``
    yields a falsy value (presumably its failure signal — confirm
    against ``md5sum``'s definition).
    """
    digest = md5sum(path)
    return (digest, path) if digest else None
def find_duplicates(directories, file_type):
    """Find files with identical content across *directories*.

    Walks each directory (without following symlinks), submits every
    matching regular file to a process pool for MD5 hashing, then groups
    paths by hash.

    Args:
        directories: iterable of directory paths to scan.
        file_type: filename suffix filter; falsy means "all files".

    Returns:
        dict mapping file hash -> list of paths, containing only hashes
        that occur more than once (i.e. actual duplicates).
    """
    with ProcessPoolExecutor() as executor:
        futures = []
        for directory in directories:
            for root, dirs, files in tqdm(os.walk(directory, followlinks=False), desc=f"Indexing files of {directory}", unit="directory"):
                for filename in files:
                    if file_type and not filename.endswith(file_type):
                        continue
                    path = os.path.join(root, filename)
                    # Skip symlinks so a link and its target are not counted as duplicates.
                    if not os.path.islink(path):
                        futures.append(executor.submit(file_hashing_job, path))
        hashes = defaultdict(list)
        for future in tqdm(futures, desc="Processing files", unit="file"):
            result = future.result()
            # file_hashing_job returns None when hashing produced nothing usable.
            if result:
                file_hash, path = result
                hashes[file_hash].append(path)
    return {file_hash: paths for file_hash, paths in hashes.items() if len(paths) > 1}
def handle_file_modification(original_file, duplicate_file, modification):
    """Apply the requested action to one duplicate file.

    'delete' removes the duplicate; 'hardlink' and 'symlink' replace it
    with a link pointing at *original_file*. Any other value is a no-op.
    """
    if modification == 'delete':
        print(f"Deleting {duplicate_file}")
        os.remove(duplicate_file)
        return
    if modification in ('hardlink', 'symlink'):
        # Remove first: os.link / os.symlink refuse to overwrite an existing path.
        os.remove(duplicate_file)
        if modification == 'hardlink':
            os.link(original_file, duplicate_file)
            print(f"Replaced {duplicate_file} with a hardlink to {original_file}")
        else:
            os.symlink(original_file, duplicate_file)
            print(f"Replaced {duplicate_file} with a symlink to {original_file}")
def handle_modification(files, modification, mode, apply_to): def handle_modification(files, modification, mode, apply_to):
original_file = next((f for f in files if not f.startswith(tuple(apply_to))), files[0]) original_file = next((f for f in files if not f.startswith(tuple(apply_to))), files[0])
for duplicate_file in files: for duplicate_file in files: