From a54274b052b234402bc99213050da4829bed08ad Mon Sep 17 00:00:00 2001
From: Kevin Veen-Birkenbach
Date: Tue, 14 Nov 2023 11:06:43 +0100
Subject: [PATCH] optimized structure and created test script

---
Review notes (this section sits between "---" and the diffstat, so it is
not part of the commit message or of the change itself):

* Layout: line breaks and indentation were rebuilt from a copy whose
  newlines had been collapsed. The hunk line counts and the diffstat agree
  with the headers, but whitespace on blank lines and the alignment of the
  wrapped shutil.copyfile() call could not be recovered -- confirm against
  the commit before applying.
* main.py, mode "act" with modification "delete": every path of a duplicate
  group that starts with an --apply-to prefix is removed; when all copies
  live under --apply-to, no copy is kept.
* main.py: "hardlink", "symlink" and every answer in "interactive" mode
  still end in a `pass` placeholder.
* main.py: file.startswith(tuple(apply_to)) is a plain string-prefix test,
  so "test_dir1" also matches "test_dir10/...".
* main.py: the os.path.isdir() check on the input directories is dropped.
* create_test_structure.py: hashlib is imported but not used.

 .gitignore               |   2 +
 create_test_structure.py |  46 +++++++++++++++++
 main.py                  | 106 ++++++++++++++++++++++-----------------
 3 files changed, 107 insertions(+), 47 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 create_test_structure.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..84c0a09
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+test_dir1
+test_dir2
\ No newline at end of file
diff --git a/create_test_structure.py b/create_test_structure.py
new file mode 100644
index 0000000..ec7da40
--- /dev/null
+++ b/create_test_structure.py
@@ -0,0 +1,46 @@
+import os
+import shutil
+import hashlib
+import random
+import string
+
+def create_test_directory(base_dir, num_files=5, duplicate_files=2):
+    if not os.path.exists(base_dir):
+        os.makedirs(base_dir)
+
+    # Erstelle eine Liste von eindeutigen Dateinamen
+    file_names = [f"file_{i}.txt" for i in range(num_files)]
+
+    # Erstelle einige Dateien mit zufälligem Inhalt
+    for file_name in file_names:
+        with open(os.path.join(base_dir, file_name), 'w') as f:
+            content = ''.join(random.choices(string.ascii_lowercase, k=20))
+            f.write(content)
+
+    # Erstelle Duplikate
+    for i in range(duplicate_files):
+        original = os.path.join(base_dir, file_names[i])
+        duplicate = os.path.join(base_dir, f"dup_{file_names[i]}")
+        shutil.copyfile(original, duplicate)
+
+def create_file_structure():
+    # Basisverzeichnisse erstellen
+    base_dirs = ['test_dir1', 'test_dir2']
+    for base_dir in base_dirs:
+        create_test_directory(base_dir)
+
+    # Erstelle eine Datei im ersten Verzeichnis und dupliziere sie im zweiten
+    with open(os.path.join('test_dir1', 'unique_file.txt'), 'w') as f:
+        f.write("This is a unique file.")
+
+    shutil.copyfile(os.path.join('test_dir1', 'unique_file.txt'),
+                    os.path.join('test_dir2', 'unique_file.txt'))
+
+    # Erstelle eine zusätzliche einzigartige Datei im zweiten Verzeichnis
+    with open(os.path.join('test_dir2', 'another_unique_file.txt'), 'w') as f:
+        f.write("This is another unique file.")
+
+    print("Test file structure created.")
+
+if __name__ == "__main__":
+    create_file_structure()
diff --git a/main.py b/main.py
index 1ba8b27..fad7c90 100644
--- a/main.py
+++ b/main.py
@@ -1,8 +1,7 @@
 import os
-import sys
+import argparse
 import hashlib
 from collections import defaultdict
-from filecmp import dircmp
 
 def md5sum(filename):
     hash_md5 = hashlib.md5()
@@ -11,58 +10,71 @@ def md5sum(filename):
             hash_md5.update(chunk)
     return hash_md5.hexdigest()
 
-def find_duplicates(directory):
+def find_duplicates(directories):
     hashes = defaultdict(list)
-    for root, dirs, files in os.walk(directory):
-        for filename in files:
-            path = os.path.join(root, filename)
-            file_hash = md5sum(path)
-            hashes[file_hash].append(path)
+    for directory in directories:
+        for root, dirs, files in os.walk(directory):
+            for filename in files:
+                path = os.path.join(root, filename)
+                file_hash = md5sum(path)
+                hashes[file_hash].append(path)
     return {file_hash: paths for file_hash, paths in hashes.items() if len(paths) > 1}
 
-def print_diff(files):
-    print("Text file differences:")
-    for i in range(len(files) - 1):
-        os.system(f"diff {files[i]} {files[i+1]}")
-
-def delete_file(file_path):
-    confirm = input(f"Do you want to delete this file? {file_path} [y/N] ")
-    if confirm.lower() in ["y", "yes"]:
-        os.remove(file_path)
-        print(f"Deleted {file_path}")
-
-def handle_duplicates(duplicates):
-    for file_hash, files in duplicates.items():
-        print(f"Duplicate files for hash {file_hash}:")
-        for file in files:
-            print(file)
-        if "text" in os.popen(f"file -b --mime-type '{files[0]}'").read():
-            print_diff(files)
-        else:
+def handle_modification(files, modification, mode, apply_to):
+    if mode == 'preview':
+        if modification == 'show':
+            print("Would show the following duplicate files:")
             for file in files:
-                print(f"File: {file}")
-                print("Duplicate(s) of this file:")
-                [print(duplicate) for duplicate in files if duplicate != file]
-                delete_file(file)
+                if file.startswith(tuple(apply_to)):
+                    print(file)
+    elif mode == 'act':
+        if modification == 'delete':
+            for file in files:
+                if file.startswith(tuple(apply_to)):
+                    print(f"Deleting {file}")
+                    os.remove(file)
+        elif modification == 'hardlink':
+            # Implement hardlink logic here
+            pass
+        elif modification == 'symlink':
+            # Implement symlink logic here
+            pass
+    elif mode == 'interactive':
+        for file in files:
+            if file.startswith(tuple(apply_to)):
+                answer = input(f"Do you want to {modification} this file? {file} [y/N] ")
+                if answer.lower() in ['y', 'yes']:
+                    # Implement deletion, hardlink or symlink logic here
+                    pass
 
-def main(directories):
-    all_duplicates = defaultdict(list)
-    for directory in directories:
-        if not os.path.isdir(directory):
-            print(f"Directory not found: {directory}")
-            continue
-        duplicates = find_duplicates(directory)
-        for hash, files in duplicates.items():
-            all_duplicates[hash].extend(files)
+def main(args):
+    directories = args.directories
+    apply_to = args.apply_to or directories
+    duplicates = find_duplicates(directories)
 
-    if not all_duplicates:
+    if not duplicates:
         print("No duplicates found.")
         return
-
-    handle_duplicates(all_duplicates)
+
+    for file_hash, files in duplicates.items():
+        if args.mode == 'preview' or (args.mode == 'interactive' and args.modification == 'show'):
+            print(f"Duplicate files for hash {file_hash}:")
+            [print(file) for file in files if file.startswith(tuple(apply_to))]
+        else:
+            handle_modification(files, args.modification, args.mode, apply_to)
 
 if __name__ == "__main__":
-    if len(sys.argv) > 1:
-        main(sys.argv[1:])
-    else:
-        print("Usage: python3 script.py ...")
+    parser = argparse.ArgumentParser(description="Find and handle duplicate files.")
+    parser.add_argument('directories', nargs='*', default=['./'], help="Directories to scan for duplicates.")
+    parser.add_argument('--apply-to', nargs='*', help="Directories to apply modifications to.")
+    parser.add_argument('--modification', choices=['delete', 'hardlink', 'symlink', 'show'], default='show', help="Modification to perform on duplicates.")
+    parser.add_argument('--mode', choices=['act', 'preview', 'interactive'], default='preview', help="How to apply the modifications.")
+
+    args = parser.parse_args()
+
+    if args.apply_to and args.modification not in ['delete', 'hardlink', 'symlink']:
+        parser.error("--apply-to requires --modification to be 'delete', 'hardlink', or 'symlink'.")
+    if not args.apply_to and args.modification != 'show':
+        parser.error("Without --apply-to only 'show' modification is allowed.")
+
+    main(args)