2023-11-14 10:28:18 +01:00
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import hashlib
|
|
|
|
from collections import defaultdict
|
|
|
|
from filecmp import dircmp
|
|
|
|
|
|
|
|
def md5sum(filename):
|
|
|
|
hash_md5 = hashlib.md5()
|
|
|
|
with open(filename, "rb") as f:
|
|
|
|
for chunk in iter(lambda: f.read(4096), b""):
|
|
|
|
hash_md5.update(chunk)
|
|
|
|
return hash_md5.hexdigest()
|
|
|
|
|
|
|
|
def find_duplicates(directory):
|
|
|
|
hashes = defaultdict(list)
|
|
|
|
for root, dirs, files in os.walk(directory):
|
|
|
|
for filename in files:
|
|
|
|
path = os.path.join(root, filename)
|
|
|
|
file_hash = md5sum(path)
|
|
|
|
hashes[file_hash].append(path)
|
|
|
|
return {file_hash: paths for file_hash, paths in hashes.items() if len(paths) > 1}
|
|
|
|
|
|
|
|
def print_diff(files):
|
|
|
|
print("Text file differences:")
|
|
|
|
for i in range(len(files) - 1):
|
|
|
|
os.system(f"diff {files[i]} {files[i+1]}")
|
|
|
|
|
|
|
|
def delete_file(file_path):
|
|
|
|
confirm = input(f"Do you want to delete this file? {file_path} [y/N] ")
|
|
|
|
if confirm.lower() in ["y", "yes"]:
|
|
|
|
os.remove(file_path)
|
|
|
|
print(f"Deleted {file_path}")
|
|
|
|
|
|
|
|
def handle_duplicates(duplicates):
|
|
|
|
for file_hash, files in duplicates.items():
|
|
|
|
print(f"Duplicate files for hash {file_hash}:")
|
|
|
|
for file in files:
|
|
|
|
print(file)
|
|
|
|
if "text" in os.popen(f"file -b --mime-type '{files[0]}'").read():
|
|
|
|
print_diff(files)
|
|
|
|
else:
|
|
|
|
for file in files:
|
|
|
|
print(f"File: {file}")
|
|
|
|
print("Duplicate(s) of this file:")
|
|
|
|
[print(duplicate) for duplicate in files if duplicate != file]
|
|
|
|
delete_file(file)
|
|
|
|
|
2023-11-14 10:33:49 +01:00
|
|
|
def main(directories):
|
|
|
|
all_duplicates = defaultdict(list)
|
|
|
|
for directory in directories:
|
|
|
|
if not os.path.isdir(directory):
|
|
|
|
print(f"Directory not found: {directory}")
|
|
|
|
continue
|
|
|
|
duplicates = find_duplicates(directory)
|
|
|
|
for hash, files in duplicates.items():
|
|
|
|
all_duplicates[hash].extend(files)
|
|
|
|
|
|
|
|
if not all_duplicates:
|
2023-11-14 10:28:18 +01:00
|
|
|
print("No duplicates found.")
|
|
|
|
return
|
|
|
|
|
2023-11-14 10:33:49 +01:00
|
|
|
handle_duplicates(all_duplicates)
|
2023-11-14 10:28:18 +01:00
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
if len(sys.argv) > 1:
|
2023-11-14 10:33:49 +01:00
|
|
|
main(sys.argv[1:])
|
2023-11-14 10:28:18 +01:00
|
|
|
else:
|
2023-11-14 10:33:49 +01:00
|
|
|
print("Usage: python3 script.py <directory1> <directory2> ...")
|