Mirror of https://github.com/kevinveenbirkenbach/duplicate-file-handler.git (synced 2024-11-25 23:31:04 +01:00)
Compare commits
No commits in common. "89e15dd023aee82190bacaadc337c282b91f5357" and "c2566a355d75faa4d57e3d850a352504a9f0be77" have entirely different histories.
89e15dd023 ... c2566a355d
.gitignore (vendored, 3 changes)
@@ -1 +1,2 @@
-test_dir*
+test_dir1
+test_dir2
README.md (45 changes)
@@ -1,55 +1,38 @@
 # Duplicate File Handler
 
-This repository contains a Python script for identifying and handling duplicate files in a directory and its subdirectories based on their MD5 hash. It allows for filtering by file type and provides options for handling duplicates such as deletion, hard linking, or symlinking.
+This repository contains two bash scripts for handling duplicate files in a directory and its subdirectories.
+
+The scripts may need to be modified depending on the specific requirements of your system or the specific use case. They currently operate by comparing the MD5 hash of files to find duplicates, which is a common but not foolproof method.
 
 ## Author
-- Kevin Veen-Birkenbach
+**Kevin Veen-Birkenbach**
 - Email: kevin@veen.world
 - Website: [https://www.veen.world](https://www.veen.world)
 
-This repository was enhanced with the help of [OpenAI's ChatGPT](https://chat.openai.com/share/825931d6-1e33-40b0-8dfc-914b3f852eeb).
+This repository was created with the help of [OpenAI's ChatGPT](https://chat.openai.com/share/013e4367-8eca-4066-8b18-55457202ba57).
 
 ## Setup
-To use the script, ensure you have Python installed on your system. No additional libraries are required as the script uses standard Python libraries.
+These scripts will help you manage duplicate files in your directories. Please make sure to adjust permissions on the scripts to be executable with `chmod +x list_duplicates.sh delete_duplicates.sh` before running.
 
 ## Usage
 
-### Identifying and Handling Duplicates
+### 1. List Duplicate Files
 
-`main.py` is a Python script to identify all duplicate files in the specified directories. It can also filter by file type and handle duplicates by deleting them or replacing them with hard or symbolic links.
+`list_duplicates.sh` is a script to list all duplicate files in a specified directory and its subdirectories. For text files, it will also display the diffs.
 
 ```bash
-python main.py [options] directories
+./list_duplicates.sh /path/to/directory
 ```
 
-#### Options
-- `--apply-to`: Directories to apply modifications to.
-- `--modification`: Action to perform on duplicates - `delete`, `hardlink`, `symlink`, or `show` (default).
-- `--mode`: How to apply the modifications - `act`, `preview`, `interactive` (default: `preview`).
-- `-f`, `--file-type`: Filter by file type (e.g., `.txt` for text files).
+### 2. Delete Duplicate Files
 
-### Creating Test File Structure
-
-`create_file_structure.py` is a utility script to create a test file structure with duplicate files for testing purposes.
+`delete_duplicates.sh` is a script to find and delete duplicate files in a specified directory and its subdirectories. It will ask for confirmation before deleting each file and display the paths of its duplicates.
 
 ```bash
-python create_file_structure.py
-```
-
-## Example
-
-To preview duplicate `.txt` files in `test_dir1` and `test_dir2`:
-
-```bash
-python main.py --file-type .txt --mode preview test_dir1 test_dir2
-```
-
-To interactively delete duplicates in `test_dir2`:
-
-```bash
-python main.py --apply-to test_dir2 --modification delete --mode interactive test_dir1 test_dir2
+./delete_duplicates.sh /path/to/directory
 ```
 
 ## License
 
-This project is licensed under the terms of the [MIT License](LICENSE).
+This project is licensed under the terms of the [GNU Affero General Public License v3.0](https://www.gnu.org/licenses/agpl-3.0.de.html).
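The "not foolproof" caveat in the incoming README refers to MD5 collisions: two different files can, in principle, share a hash. Where that matters, a suspected duplicate pair can be confirmed byte for byte; a minimal sketch (both paths are placeholders):

```bash
# cmp -s is silent and exits 0 only if the files are byte-for-byte identical.
if cmp -s path/to/file_a path/to/file_b; then
    echo "identical content"
else
    echo "contents differ (hash collision or changed file)"
fi
```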
create_file_structure.py (deleted file, 41 lines)
@@ -1,41 +0,0 @@
import os
import shutil
import hashlib
import random
import string

def create_test_directory(base_dir, num_files=5, duplicate_files=2, depth=1):
    os.makedirs(base_dir, exist_ok=True)
    subdirs = [os.path.join(base_dir, f"subdir_{i}") for i in range(depth)]
    for subdir in subdirs:
        os.makedirs(subdir, exist_ok=True)

    for dir in [base_dir] + subdirs:
        file_names = [f"file_{i}.txt" for i in range(num_files)]
        for file_name in file_names:
            with open(os.path.join(dir, file_name), 'w') as f:
                content = ''.join(random.choices(string.ascii_lowercase, k=20))
                f.write(content)

        for i in range(min(duplicate_files, num_files)):
            original = os.path.join(dir, file_names[i])
            for dup_num in range(1, duplicate_files+1):
                duplicate = os.path.join(dir, f"dup_{dup_num}_{file_names[i]}")
                shutil.copyfile(original, duplicate)

def copy_directory_contents(src, dst):
    if os.path.exists(dst):
        shutil.rmtree(dst)
    shutil.copytree(src, dst)

def create_file_structure(depth, num_files, duplicate_files):
    base_dirs = ['test_dir1', 'test_dir2']
    for base_dir in base_dirs:
        create_test_directory(base_dir, num_files, duplicate_files, depth)

    copy_directory_contents('test_dir1', 'test_dir3')

    print("Test file structure created.")

if __name__ == "__main__":
    create_file_structure(depth=2, num_files=5, duplicate_files=3)
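The deleted helper took its parameters in code rather than on the command line; with the defaults in its `__main__` block it produced `test_dir1` and `test_dir2` (each containing `subdir_0` and `subdir_1`) and copied `test_dir1` to `test_dir3`. A quick way to inspect the output, assuming `python` resolves to Python 3:

```bash
# Build the fixture tree, then list every generated file.
python create_file_structure.py
find test_dir1 test_dir2 test_dir3 -type f | sort
```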
create_test_structure.py (new file, 46 lines)
@@ -0,0 +1,46 @@
import os
import shutil
import hashlib
import random
import string

def create_test_directory(base_dir, num_files=5, duplicate_files=2):
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

    # Create a list of unique file names
    file_names = [f"file_{i}.txt" for i in range(num_files)]

    # Create some files with random content
    for file_name in file_names:
        with open(os.path.join(base_dir, file_name), 'w') as f:
            content = ''.join(random.choices(string.ascii_lowercase, k=20))
            f.write(content)

    # Create duplicates
    for i in range(duplicate_files):
        original = os.path.join(base_dir, file_names[i])
        duplicate = os.path.join(base_dir, f"dup_{file_names[i]}")
        shutil.copyfile(original, duplicate)

def create_file_structure():
    # Create the base directories
    base_dirs = ['test_dir1', 'test_dir2']
    for base_dir in base_dirs:
        create_test_directory(base_dir)

    # Create a file in the first directory and duplicate it in the second
    with open(os.path.join('test_dir1', 'unique_file.txt'), 'w') as f:
        f.write("This is a unique file.")

    shutil.copyfile(os.path.join('test_dir1', 'unique_file.txt'),
                    os.path.join('test_dir2', 'unique_file.txt'))

    # Create an additional unique file in the second directory
    with open(os.path.join('test_dir2', 'another_unique_file.txt'), 'w') as f:
        f.write("This is another unique file.")

    print("Test file structure created.")

if __name__ == "__main__":
    create_file_structure()
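A quick smoke test for this generator, again assuming `python` resolves to Python 3: every `dup_*` file should hash identically to its original, and `unique_file.txt` should appear with the same hash in both directories.

```bash
# Generate the fixtures, then group all files by MD5 hash.
python create_test_structure.py
md5sum test_dir1/* test_dir2/* | sort
```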
delete_duplicates.sh (new executable file, 35 lines)
@@ -0,0 +1,35 @@
#!/bin/bash

if [ -z "$1" ]
then
    echo "Directory path not provided"
    exit 1
fi

dir="$1"

# Hash every file once; a hash that appears more than once marks a duplicate group.
hashes=$(find "$dir" -type f -exec md5sum {} +)
dup_hashes=$(awk '{print $1}' <<< "$hashes" | sort | uniq -d)

echo "Duplicates found:"

for hash in $dup_hashes
do
    # md5sum output is "<hash>  <path>"; strip the hash column to keep the paths.
    files=$(grep "^$hash" <<< "$hashes" | sed 's/^[0-9a-f]*  //')
    while IFS= read -r file
    do
        echo "File: $file"
        echo "Duplicate(s) of this file:"
        grep -vxF "$file" <<< "$files"
        # Read the answer from the terminal, not from the loop's input stream.
        read -r -p "Do you want to delete this file? [y/N] " answer < /dev/tty
        if [[ $answer == [yY] || $answer == [yY][eE][sS] ]]
        then
            rm "$file"
        fi
    done <<< "$files"
done
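Where installing a dedicated tool is acceptable, `fdupes` provides the same interactive workflow as this script and is more robust with unusual file names:

```bash
fdupes -r /path/to/directory    # list duplicate groups recursively
fdupes -rd /path/to/directory   # prompt for which copy to keep, delete the rest
```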
list_duplicates.sh (new executable file, 30 lines)
@@ -0,0 +1,30 @@
#!/bin/bash

if [ -z "$1" ]
then
    echo "Directory path not provided"
    exit 1
fi

dir="$1"

# Hash every file once; a hash that appears more than once marks a duplicate group.
hashes=$(find "$dir" -type f -exec md5sum {} +)
dup_hashes=$(awk '{print $1}' <<< "$hashes" | sort | uniq -d)

if [ -z "$dup_hashes" ]
then
    echo "No duplicates found."
    exit 0
fi

echo "Duplicates found:"

for hash in $dup_hashes
do
    # md5sum output is "<hash>  <path>"; strip the hash column to keep the paths.
    mapfile -t files < <(grep "^$hash" <<< "$hashes" | sed 's/^[0-9a-f]*  //')
    file_type=$(file -b --mime-type "${files[0]}")
    if [[ $file_type == text/* ]]
    then
        # Identical MD5 hashes imply identical content, so these diffs should be
        # empty; they serve as a sanity check against hash collisions.
        for other in "${files[@]:1}"
        do
            diff "${files[0]}" "$other"
        done
    else
        printf '%s\n' "${files[@]}"
    fi
done
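The detection pipeline both scripts share can also be run on its own. GNU `uniq` can print every member of each duplicate group instead of one representative line per group; a sketch:

```bash
# Hash all files, sort so equal hashes are adjacent, then keep repeated hashes;
# -w32 compares only the 32-character MD5 column, and --all-repeated=separate
# prints complete groups separated by blank lines.
find /path/to/directory -type f -exec md5sum {} + \
    | sort \
    | uniq -w32 --all-repeated=separate
```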
main.py (91 changes)
@@ -1,9 +1,7 @@
 import os
 import argparse
 import hashlib
-from collections import defaultdict, Counter
-from concurrent.futures import ProcessPoolExecutor
-from tqdm import tqdm
+from collections import defaultdict
 
 def md5sum(filename):
     hash_md5 = hashlib.md5()
@@ -12,86 +10,65 @@ def md5sum(filename):
             hash_md5.update(chunk)
     return hash_md5.hexdigest()
 
-def file_hashing_job(path):
-    file_hash = md5sum(path)
-    if file_hash:
-        return file_hash, path
-
-def find_duplicates(directories, file_type):
-    with ProcessPoolExecutor() as executor:
-        futures = []
-        for directory in directories:
-            for root, dirs, files in tqdm(os.walk(directory, followlinks=False), desc=f"Indexing files of {directory}", unit="directory"):
-                for filename in files:
-                    if file_type and not filename.endswith(file_type):
-                        continue
-                    path = os.path.join(root, filename)
-                    if not os.path.islink(path):
-                        futures.append(executor.submit(file_hashing_job, path))
-
-        hashes = defaultdict(list)
-        for future in tqdm(futures, desc="Processing files", unit="file"):
-            result = future.result()
-            if result:
-                file_hash, path = result
-                hashes[file_hash].append(path)
+def find_duplicates(directories):
+    hashes = defaultdict(list)
+    for directory in directories:
+        for root, dirs, files in os.walk(directory):
+            for filename in files:
+                path = os.path.join(root, filename)
+                file_hash = md5sum(path)
+                hashes[file_hash].append(path)
     return {file_hash: paths for file_hash, paths in hashes.items() if len(paths) > 1}
 
-def handle_file_modification(original_file, duplicate_file, modification):
-    if modification == 'delete':
-        print(f"Deleting {duplicate_file}")
-        os.remove(duplicate_file)
-    elif modification == 'hardlink':
-        os.remove(duplicate_file)
-        os.link(original_file, duplicate_file)
-        print(f"Replaced {duplicate_file} with a hardlink to {original_file}")
-    elif modification == 'symlink':
-        os.remove(duplicate_file)
-        os.symlink(original_file, duplicate_file)
-        print(f"Replaced {duplicate_file} with a symlink to {original_file}")
-
 def handle_modification(files, modification, mode, apply_to):
-    original_file = next((f for f in files if not f.startswith(tuple(apply_to))), files[0])
-    for duplicate_file in files:
-        if duplicate_file != original_file:
-            if duplicate_file.startswith(tuple(apply_to)):
-                if mode == 'preview' and modification != 'show':
-                    print(f"Would perform {modification} on {duplicate_file}")
-                elif mode == 'act':
-                    handle_file_modification(original_file, duplicate_file, modification)
-                elif mode == 'interactive':
-                    answer = input(f"Do you want to {modification} this file? {duplicate_file} [y/N] ")
-                    if answer.lower() in ['y', 'yes']:
-                        handle_file_modification(original_file, duplicate_file, modification)
-                    else:
-                        print(f"Duplicate file (unmodified): {duplicate_file}")
-            elif modification != 'show':
-                print(f"Original file kept: {original_file}")
-    print()
+    if mode == 'preview':
+        if modification == 'show':
+            print("Would show the following duplicate files:")
+            for file in files:
+                if file.startswith(tuple(apply_to)):
+                    print(file)
+    elif mode == 'act':
+        if modification == 'delete':
+            for file in files:
+                if file.startswith(tuple(apply_to)):
+                    print(f"Deleting {file}")
+                    os.remove(file)
+        elif modification == 'hardlink':
+            # Implement hardlink logic here
+            pass
+        elif modification == 'symlink':
+            # Implement symlink logic here
+            pass
+    elif mode == 'interactive':
+        for file in files:
+            if file.startswith(tuple(apply_to)):
+                answer = input(f"Do you want to {modification} this file? {file} [y/N] ")
+                if answer.lower() in ['y', 'yes']:
+                    # Implement deletion, hardlink or symlink logic here
+                    pass
 
 def main(args):
     directories = args.directories
     apply_to = args.apply_to or directories
-    duplicates = find_duplicates(directories, args.file_type)
+    duplicates = find_duplicates(directories)
 
     if not duplicates:
         print("No duplicates found.")
         return
 
     for file_hash, files in duplicates.items():
         if args.mode == 'preview' or (args.mode == 'interactive' and args.modification == 'show'):
             print(f"Duplicate files for hash {file_hash}:")
-            [print(file) for file in files if file.startswith(tuple(directories))]
+            [print(file) for file in files if file.startswith(tuple(apply_to))]
         else:
             handle_modification(files, args.modification, args.mode, apply_to)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Find and handle duplicate files.")
     parser.add_argument('directories', nargs='*', help="Directories to scan for duplicates.")
-    parser.add_argument('--apply-to', nargs='*', help="Filter directories to apply modifications to.")
+    parser.add_argument('--apply-to', nargs='*', help="Directories to apply modifications to.")
     parser.add_argument('--modification', choices=['delete', 'hardlink', 'symlink', 'show'], default='show', help="Modification to perform on duplicates.")
     parser.add_argument('--mode', choices=['act', 'preview', 'interactive'], default='preview', help="How to apply the modifications.")
-    parser.add_argument('-f', '--file-type', help="Filter by file type (e.g., '.txt' for text files).", default=None)
 
     args = parser.parse_args()
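For completeness, an invocation that combines the options defined by the argparse block on the 89e15dd023 side of this diff:

```bash
# Scan test_dir1 and test_dir2 for duplicate .txt files, keep the copies outside
# test_dir2, and replace duplicates inside test_dir2 with hardlinks; --mode act
# applies the changes without prompting.
python main.py --apply-to test_dir2 --modification hardlink --mode act -f .txt test_dir1 test_dir2
```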