Improved duplicate-detection speed by hashing files in parallel

2025-07-17 06:14:25 +02:00 · 2023-11-14 15:26:47 +01:00 · 2023-11-14 15:26:47 +01:00 · e31ee231fa
commit e31ee231fa
parent 5c4afe2655
1 changed file with 25 additions and 24 deletions
--- a/main.py
+++ b/main.py
@@ -1,7 +1,8 @@
 import os
 import argparse
 import hashlib
-from collections import defaultdict
+from collections import defaultdict, Counter
+from concurrent.futures import ProcessPoolExecutor
 from tqdm import tqdm

 def md5sum(filename):
@@ -11,33 +12,33 @@ def md5sum(filename):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

+def file_hashing_job(path):
+    file_hash = md5sum(path)
+    if file_hash:
+        return file_hash, path
+
 def find_duplicates(directories, file_type):
-    hashes = defaultdict(list)
-    for directory in directories:
-        for root, dirs, files in tqdm(os.walk(directory, followlinks=False), desc=f"Indexing files of {directory}", unit="Directories"):
-            for filename in files:
-                if file_type and not filename.endswith(file_type):
-                    continue
-                path = os.path.join(root, filename)
-                if not os.path.islink(path):
-                    file_hash = md5sum(path)
-                    hashes[file_hash].append(path)
+    with ProcessPoolExecutor() as executor:
+        futures = []
+        for directory in directories:
+            for root, dirs, files in tqdm(os.walk(directory, followlinks=False), desc=f"Indexing files of {directory}", unit="directory"):
+                for filename in files:
+                    if file_type and not filename.endswith(file_type):
+                        continue
+                    path = os.path.join(root, filename)
+                    if not os.path.islink(path):
+                        futures.append(executor.submit(file_hashing_job, path))
+
+        hashes = defaultdict(list)
+        for future in tqdm(futures, desc="Processing files", unit="file"):
+            result = future.result()
+            if result:
+                file_hash, path = result
+                hashes[file_hash].append(path)
+
    return {file_hash: paths for file_hash, paths in hashes.items() if len(paths) > 1}


-def handle_file_modification(original_file, duplicate_file, modification):
-    if modification == 'delete':
-        print(f"Deleting {duplicate_file}")
-        os.remove(duplicate_file)
-    elif modification == 'hardlink':
-        os.remove(duplicate_file)
-        os.link(original_file, duplicate_file)
-        print(f"Replaced {duplicate_file} with a hardlink to {original_file}")
-    elif modification == 'symlink':
-        os.remove(duplicate_file)
-        os.symlink(original_file, duplicate_file)
-        print(f"Replaced {duplicate_file} with a symlink to {original_file}")
-
 def handle_modification(files, modification, mode, apply_to):
    original_file = next((f for f in files if not f.startswith(tuple(apply_to))), files[0])
    for duplicate_file in files: