Mirror of https://github.com/kevinveenbirkenbach/duplicate-file-handler.git, synced 2024-11-15 02:21:03 +01:00
Improved velocity with parallel processing
This commit is contained in:
parent 5c4afe2655
commit e31ee231fa
main.py (49 changed lines)
@@ -1,7 +1,8 @@
 import os
 import argparse
 import hashlib
-from collections import defaultdict
+from collections import defaultdict, Counter
+from concurrent.futures import ProcessPoolExecutor
 from tqdm import tqdm
 
 def md5sum(filename):
@@ -11,33 +12,33 @@ def md5sum(filename):
             hash_md5.update(chunk)
     return hash_md5.hexdigest()
 
+def file_hashing_job(path):
+    file_hash = md5sum(path)
+    if file_hash:
+        return file_hash, path
+
 def find_duplicates(directories, file_type):
-    hashes = defaultdict(list)
-    for directory in directories:
-        for root, dirs, files in tqdm(os.walk(directory, followlinks=False), desc=f"Indexing files of {directory}", unit="Directories"):
-            for filename in files:
-                if file_type and not filename.endswith(file_type):
-                    continue
-                path = os.path.join(root, filename)
-                if not os.path.islink(path):
-                    file_hash = md5sum(path)
-                    hashes[file_hash].append(path)
+    with ProcessPoolExecutor() as executor:
+        futures = []
+        for directory in directories:
+            for root, dirs, files in tqdm(os.walk(directory, followlinks=False), desc=f"Indexing files of {directory}", unit="directory"):
+                for filename in files:
+                    if file_type and not filename.endswith(file_type):
+                        continue
+                    path = os.path.join(root, filename)
+                    if not os.path.islink(path):
+                        futures.append(executor.submit(file_hashing_job, path))
+
+        hashes = defaultdict(list)
+        for future in tqdm(futures, desc="Processing files", unit="file"):
+            result = future.result()
+            if result:
+                file_hash, path = result
+                hashes[file_hash].append(path)
 
     return {file_hash: paths for file_hash, paths in hashes.items() if len(paths) > 1}
 
 
-def handle_file_modification(original_file, duplicate_file, modification):
-    if modification == 'delete':
-        print(f"Deleting {duplicate_file}")
-        os.remove(duplicate_file)
-    elif modification == 'hardlink':
-        os.remove(duplicate_file)
-        os.link(original_file, duplicate_file)
-        print(f"Replaced {duplicate_file} with a hardlink to {original_file}")
-    elif modification == 'symlink':
-        os.remove(duplicate_file)
-        os.symlink(original_file, duplicate_file)
-        print(f"Replaced {duplicate_file} with a symlink to {original_file}")
-
 def handle_modification(files, modification, mode, apply_to):
     original_file = next((f for f in files if not f.startswith(tuple(apply_to))), files[0])
     for duplicate_file in files:
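
The change above applies the standard fan-out/collect idiom from concurrent.futures: one hashing job is submitted to a process pool per file while the directory tree is walked, and the (hash, path) results are grouped afterwards. Below is a minimal, self-contained sketch of that idiom, not the project's code: the md5sum body, the 4096-byte chunk size, and the single-directory command-line interface are assumptions for illustration.

# Minimal sketch of the fan-out/collect pattern (illustrative; the md5sum body,
# the chunk size and the CLI are assumptions, not taken from the commit).
import hashlib
import os
import sys
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor


def md5sum(filename):
    # Hash in fixed-size chunks so large files never have to fit in memory.
    hash_md5 = hashlib.md5()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def hash_job(path):
    # Runs in a worker process; returns the pair the parent needs for grouping.
    return md5sum(path), path


def find_duplicates(directory):
    hashes = defaultdict(list)
    with ProcessPoolExecutor() as executor:
        # Fan out: submit one job per regular (non-symlink) file.
        futures = [
            executor.submit(hash_job, os.path.join(root, name))
            for root, _dirs, files in os.walk(directory, followlinks=False)
            for name in files
            if not os.path.islink(os.path.join(root, name))
        ]
        # Collect: group paths by hash as the workers finish.
        for future in futures:
            file_hash, path = future.result()
            hashes[file_hash].append(path)
    return {h: paths for h, paths in hashes.items() if len(paths) > 1}


if __name__ == "__main__":
    # The __main__ guard is required for ProcessPoolExecutor on spawn-based platforms.
    for paths in find_duplicates(sys.argv[1]).values():
        print(paths)

executor.submit() returns immediately, so the walk keeps queuing work while worker processes hash files on other cores; future.result() then blocks only during collection, which is where the speedup over the previous sequential md5sum call comes from.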