Mirror of https://github.com/kevinveenbirkenbach/duplicate-file-handler.git, synced 2024-11-15 02:21:03 +01:00
Improved velocity with parallel processing
This commit is contained in:
parent 5c4afe2655
commit e31ee231fa
main.py (49 changed lines)
@@ -1,7 +1,8 @@
 import os
 import argparse
 import hashlib
-from collections import defaultdict
+from collections import defaultdict, Counter
+from concurrent.futures import ProcessPoolExecutor
 from tqdm import tqdm
 
 def md5sum(filename):
@@ -11,33 +12,33 @@ def md5sum(filename):
             hash_md5.update(chunk)
     return hash_md5.hexdigest()
 
+def file_hashing_job(path):
+    file_hash = md5sum(path)
+    if file_hash:
+        return file_hash, path
+
 def find_duplicates(directories, file_type):
-    hashes = defaultdict(list)
-    for directory in directories:
-        for root, dirs, files in tqdm(os.walk(directory, followlinks=False), desc=f"Indexing files of {directory}", unit="Directories"):
-            for filename in files:
-                if file_type and not filename.endswith(file_type):
-                    continue
-                path = os.path.join(root, filename)
-                if not os.path.islink(path):
-                    file_hash = md5sum(path)
-                    hashes[file_hash].append(path)
+    with ProcessPoolExecutor() as executor:
+        futures = []
+        for directory in directories:
+            for root, dirs, files in tqdm(os.walk(directory, followlinks=False), desc=f"Indexing files of {directory}", unit="directory"):
+                for filename in files:
+                    if file_type and not filename.endswith(file_type):
+                        continue
+                    path = os.path.join(root, filename)
+                    if not os.path.islink(path):
+                        futures.append(executor.submit(file_hashing_job, path))
+
+        hashes = defaultdict(list)
+        for future in tqdm(futures, desc="Processing files", unit="file"):
+            result = future.result()
+            if result:
+                file_hash, path = result
+                hashes[file_hash].append(path)
 
     return {file_hash: paths for file_hash, paths in hashes.items() if len(paths) > 1}
 
 
-def handle_file_modification(original_file, duplicate_file, modification):
-    if modification == 'delete':
-        print(f"Deleting {duplicate_file}")
-        os.remove(duplicate_file)
-    elif modification == 'hardlink':
-        os.remove(duplicate_file)
-        os.link(original_file, duplicate_file)
-        print(f"Replaced {duplicate_file} with a hardlink to {original_file}")
-    elif modification == 'symlink':
-        os.remove(duplicate_file)
-        os.symlink(original_file, duplicate_file)
-        print(f"Replaced {duplicate_file} with a symlink to {original_file}")
-
 def handle_modification(files, modification, mode, apply_to):
     original_file = next((f for f in files if not f.startswith(tuple(apply_to))), files[0])
     for duplicate_file in files:
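
The change above applies the standard fan-out/collect idiom from concurrent.futures: one hashing job is submitted to a process pool per file while the directory tree is walked, and the (hash, path) results are grouped afterwards. Below is a minimal, self-contained sketch of that idiom, not the project's code: the md5sum body, the 4096-byte chunk size, and the single-directory command-line interface are assumptions for illustration.

# Minimal sketch of the fan-out/collect pattern (illustrative; the md5sum body,
# the chunk size and the CLI are assumptions, not taken from the commit).
import hashlib
import os
import sys
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor


def md5sum(filename):
    # Hash in fixed-size chunks so large files never have to fit in memory.
    hash_md5 = hashlib.md5()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def hash_job(path):
    # Runs in a worker process; returns the pair the parent needs for grouping.
    return md5sum(path), path


def find_duplicates(directory):
    hashes = defaultdict(list)
    with ProcessPoolExecutor() as executor:
        # Fan out: submit one job per regular (non-symlink) file.
        futures = [
            executor.submit(hash_job, os.path.join(root, name))
            for root, _dirs, files in os.walk(directory, followlinks=False)
            for name in files
            if not os.path.islink(os.path.join(root, name))
        ]
        # Collect: group paths by hash as the workers finish.
        for future in futures:
            file_hash, path = future.result()
            hashes[file_hash].append(path)
    return {h: paths for h, paths in hashes.items() if len(paths) > 1}


if __name__ == "__main__":
    # The __main__ guard is required for ProcessPoolExecutor on spawn-based platforms.
    for paths in find_duplicates(sys.argv[1]).values():
        print(paths)

executor.submit() returns immediately, so the walk keeps queuing work while worker processes hash files on other cores; future.result() then blocks only during collection, which is where the speedup over the previous sequential md5sum call comes from.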