mirror of
https://github.com/kevinveenbirkenbach/duplicate-file-handler.git
synced 2024-11-14 18:11:03 +01:00
Improved velocity with parallel processing
This commit is contained in:
parent
5c4afe2655
commit
e31ee231fa
49
main.py
49
main.py
@ -1,7 +1,8 @@
|
||||
import os
|
||||
import argparse
|
||||
import hashlib
|
||||
from collections import defaultdict
|
||||
from collections import defaultdict, Counter
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from tqdm import tqdm
|
||||
|
||||
def md5sum(filename):
|
||||
@ -11,33 +12,33 @@ def md5sum(filename):
|
||||
hash_md5.update(chunk)
|
||||
return hash_md5.hexdigest()
|
||||
|
||||
def file_hashing_job(path):
    """Hash a single file and pair the digest with its path.

    This is the unit of work that find_duplicates submits to its process
    pool, so the path is returned next to the digest to let the caller
    match each result to its file.

    Args:
        path: Path of the file to hash.

    Returns:
        A ``(digest, path)`` tuple, or ``None`` when ``md5sum`` returns a
        falsy value for the file.
    """
    digest = md5sum(path)
    return (digest, path) if digest else None
|
||||
|
||||
def find_duplicates(directories, file_type):
    """Group files with identical hashes found below the given directories.

    Every directory is walked without following directory symlinks. Regular
    files (symlinked files are skipped) whose name matches ``file_type`` are
    hashed in worker processes via ``file_hashing_job``; the results are then
    grouped by hash in the parent process.

    Args:
        directories: Iterable of directory paths to scan.
        file_type: Filename suffix (or tuple of suffixes) a file must end
            with to be considered. A falsy value disables the filter.

    Returns:
        A dict mapping each hash to the list of paths sharing it. Only
        hashes with more than one path are included.

    Raises:
        Exception: Whatever a hashing job raised in its worker process is
            re-raised here by ``future.result()``.
    """
    with ProcessPoolExecutor() as executor:
        # Phase 1: index the trees and queue one hashing job per candidate
        # file. Files are hashed exactly once, in the worker processes only.
        futures = []
        for directory in directories:
            walker = os.walk(directory, followlinks=False)
            for root, _dirs, files in tqdm(walker, desc=f"Indexing files of {directory}", unit="directory"):
                for filename in files:
                    if file_type and not filename.endswith(file_type):
                        continue
                    path = os.path.join(root, filename)
                    if not os.path.islink(path):
                        futures.append(executor.submit(file_hashing_job, path))

        # Phase 2: collect results in submission order. This stays inside the
        # ``with`` block so the progress bar advances while workers are still
        # running instead of after the pool has shut down.
        hashes = defaultdict(list)
        for future in tqdm(futures, desc="Processing files", unit="file"):
            result = future.result()
            if result:
                file_hash, path = result
                hashes[file_hash].append(path)

    return {file_hash: paths for file_hash, paths in hashes.items() if len(paths) > 1}
|
||||
|
||||
|
||||
def handle_file_modification(original_file, duplicate_file, modification):
    """Delete a duplicate file or replace it with a link to the original.

    Args:
        original_file: Path of the file that is kept.
        duplicate_file: Path of the duplicate to delete or replace.
        modification: ``'delete'``, ``'hardlink'`` or ``'symlink'``. Any
            other value leaves the file system untouched.

    Raises:
        OSError: If removing the duplicate, creating the link, or moving the
            link into place fails. For ``'hardlink'`` and ``'symlink'`` the
            duplicate is still present when that happens.
    """
    if modification == 'delete':
        print(f"Deleting {duplicate_file}")
        os.remove(duplicate_file)
        return

    if modification == 'hardlink':
        create_link = os.link
        link_target = original_file
    elif modification == 'symlink':
        create_link = os.symlink
        # A relative symlink target is resolved against the directory that
        # contains the link, not the current working directory, so a relative
        # original path could yield a dangling link. Store it absolute.
        link_target = os.path.abspath(original_file)
    else:
        return

    # Create the link under a temporary sibling name first and only then move
    # it over the duplicate. Removing the duplicate up front would lose it
    # whenever link creation fails (cross-device hardlink, permissions, ...).
    staging_path = f"{duplicate_file}.link-tmp-{os.getpid()}"
    create_link(link_target, staging_path)
    try:
        os.replace(staging_path, duplicate_file)
    finally:
        # The staging name survives if the replace failed, and also when the
        # duplicate already is a hardlink of the original: renaming between
        # two names of the same inode is a successful no-op on POSIX.
        if os.path.lexists(staging_path):
            os.remove(staging_path)
    print(f"Replaced {duplicate_file} with a {modification} to {original_file}")
|
||||
|
||||
def handle_modification(files, modification, mode, apply_to):
|
||||
original_file = next((f for f in files if not f.startswith(tuple(apply_to))), files[0])
|
||||
for duplicate_file in files:
|
||||
|
Loading…
Reference in New Issue
Block a user