mirror of
https://github.com/kevinveenbirkenbach/directory-content-scanner.git
synced 2025-04-22 08:02:24 +02:00
Added binary ignoration
This commit is contained in:
parent
69477fa29e
commit
847b40e9e6
7
cli.py
7
cli.py
@ -5,14 +5,15 @@ def parse_arguments():
|
|||||||
description="Scan directories and print/compile file contents."
|
description="Scan directories and print/compile file contents."
|
||||||
)
|
)
|
||||||
parser.add_argument("paths", nargs='+', help="List of files or directories to scan.")
|
parser.add_argument("paths", nargs='+', help="List of files or directories to scan.")
|
||||||
parser.add_argument("--file-types", nargs='+', default=[], help="Filter by file types (e.g., .txt .log).")
|
parser.add_argument("--file-types", nargs='+', default=[], help="Filter by file types (e.g., .txt, .log).")
|
||||||
parser.add_argument("--ignore-file-strings", nargs='+', default=[], help="Ignore files and folders containing these strings.")
|
parser.add_argument("--ignore-file-strings", nargs='+', default=[], help="Ignore files and folders containing these strings.")
|
||||||
parser.add_argument("--ignore-hidden", action='store_true', help="Ignore hidden directories and files.")
|
parser.add_argument("--ignore-hidden", action='store_true', help="Ignore hidden directories and files.")
|
||||||
parser.add_argument("-v", "--verbose", action='store_true', help="Enable verbose mode.")
|
parser.add_argument("-v", "--verbose", action='store_true', help="Enable verbose mode.")
|
||||||
parser.add_argument("--no-comments", action='store_true', help="Remove comments from the displayed content based on file type.")
|
parser.add_argument("--no-comments", action='store_true', help="Remove comments from the displayed content based on file type.")
|
||||||
parser.add_argument("--compress", action='store_true', help="Compress code (for Python files).")
|
parser.add_argument("--compress", action='store_true', help="Compress code (for supported file types).")
|
||||||
parser.add_argument("--path-contains", nargs='+', default=[], help="Display files whose paths contain one of these strings.")
|
parser.add_argument("--path-contains", nargs='+', default=[], help="Display files whose paths contain one of these strings.")
|
||||||
parser.add_argument("--content-contains", nargs='+', default=[], help="Display files containing one of these strings in their content.")
|
parser.add_argument("--content-contains", nargs='+', default=[], help="Display files containing one of these strings in their content.")
|
||||||
parser.add_argument("--no-gitignore", action='store_true', help="Do not respect .gitignore files during scan.")
|
parser.add_argument("--no-gitignore", action='store_true', help="Do not respect .gitignore files during scan.")
|
||||||
|
parser.add_argument("--scan-binary-files", action='store_true', help="Scan binary files as well (by default these are ignored).")
|
||||||
|
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
@ -20,11 +20,36 @@ class DirectoryHandler:
|
|||||||
lines = f.readlines()
|
lines = f.readlines()
|
||||||
# Filter out empty lines and comments.
|
# Filter out empty lines and comments.
|
||||||
patterns = [line.strip() for line in lines if line.strip() and not line.strip().startswith('#')]
|
patterns = [line.strip() for line in lines if line.strip() and not line.strip().startswith('#')]
|
||||||
# Save the base directory and its patterns
|
# Save the base directory and its patterns.
|
||||||
gitignore_data.append((dirpath, patterns))
|
gitignore_data.append((dirpath, patterns))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error reading {gitignore_path}: {e}")
|
print(f"Error reading {gitignore_path}: {e}")
|
||||||
return gitignore_data
|
return gitignore_data
|
||||||
|
|
||||||
|
@staticmethod
def is_binary_file(file_path):
    """
    Heuristically decide whether file_path looks like a binary file.

    Only the first 1024 bytes are inspected. The file is reported as binary
    when the sample contains a null byte, or when more than 30% of the
    sampled bytes are non-text. Bytes >= 0x80 count as text only if the
    sample is valid UTF-8, so UTF-8 text in non-Latin scripts is not
    misclassified as binary.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        True if the file appears to be binary; False if it appears to be
        text, is empty, or cannot be opened/read.
    """
    import codecs

    try:
        with open(file_path, 'rb') as f:
            chunk = f.read(1024)
    except (OSError, ValueError):
        # If the file cannot be read in binary mode, assume it's not binary.
        return False

    if not chunk:
        # An empty file has nothing that could make it binary.
        return False

    # If there's a null byte, it's almost certainly binary.
    if b'\x00' in chunk:
        return True

    # ASCII printable characters plus common control characters.
    text_bytes = {7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x7F))

    # An incremental decoder with final=False tolerates a multi-byte
    # character that was cut in half by the 1024-byte sample boundary.
    try:
        codecs.getincrementaldecoder('utf-8')().decode(chunk, final=False)
    except UnicodeDecodeError:
        # Not UTF-8: keep treating high bytes as non-text.
        pass
    else:
        # Valid UTF-8: high bytes are parts of encoded characters.
        text_bytes |= set(range(0x80, 0x100))

    # Count non-text bytes in the sample and compare against the threshold.
    non_text = sum(byte not in text_bytes for byte in chunk)
    return (non_text / len(chunk)) > 0.30
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_gitignored(file_path, gitignore_data):
|
def is_gitignored(file_path, gitignore_data):
|
||||||
@ -37,12 +62,12 @@ class DirectoryHandler:
|
|||||||
try:
|
try:
|
||||||
rel_path = os.path.relpath(file_path, base_dir)
|
rel_path = os.path.relpath(file_path, base_dir)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
# file_path and base_dir are on different drives
|
# file_path and base_dir are on different drives.
|
||||||
continue
|
continue
|
||||||
# If the file is not under the current .gitignore base_dir, skip it.
|
# If the file is not under the current .gitignore base_dir, skip it.
|
||||||
if rel_path.startswith('..'):
|
if rel_path.startswith('..'):
|
||||||
continue
|
continue
|
||||||
# Check all patterns
|
# Check all patterns.
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
if pattern.endswith('/'):
|
if pattern.endswith('/'):
|
||||||
# Directory pattern: check if any folder in the relative path matches.
|
# Directory pattern: check if any folder in the relative path matches.
|
||||||
@ -84,10 +109,15 @@ class DirectoryHandler:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def should_print_file(file_path, file_types, ignore_file_strings, ignore_hidden, path_contains, content_contains):
|
def should_print_file(file_path, file_types, ignore_file_strings, ignore_hidden, path_contains, content_contains, scan_binary_files=False):
|
||||||
"""
|
"""
|
||||||
Determines if a file should be printed based on various criteria.
|
Determines if a file should be printed based on various criteria.
|
||||||
|
By default, binary files are skipped unless scan_binary_files is True.
|
||||||
"""
|
"""
|
||||||
|
# Check binary file status using our heuristic.
|
||||||
|
if not scan_binary_files and DirectoryHandler.is_binary_file(file_path):
|
||||||
|
return False
|
||||||
|
|
||||||
if ignore_hidden and os.path.basename(file_path).startswith('.'):
|
if ignore_hidden and os.path.basename(file_path).startswith('.'):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -148,7 +178,8 @@ class DirectoryHandler:
|
|||||||
kwargs['ignore_file_strings'],
|
kwargs['ignore_file_strings'],
|
||||||
kwargs['ignore_hidden'],
|
kwargs['ignore_hidden'],
|
||||||
kwargs['path_contains'],
|
kwargs['path_contains'],
|
||||||
kwargs['content_contains']
|
kwargs['content_contains'],
|
||||||
|
scan_binary_files=kwargs.get('scan_binary_files', False)
|
||||||
):
|
):
|
||||||
DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress'])
|
DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress'])
|
||||||
elif kwargs.get('verbose'):
|
elif kwargs.get('verbose'):
|
||||||
|
11
main.py
11
main.py
@ -19,7 +19,8 @@ def main():
|
|||||||
compress=args.compress,
|
compress=args.compress,
|
||||||
path_contains=args.path_contains,
|
path_contains=args.path_contains,
|
||||||
content_contains=args.content_contains,
|
content_contains=args.content_contains,
|
||||||
no_gitignore=args.no_gitignore
|
no_gitignore=args.no_gitignore,
|
||||||
|
scan_binary_files=args.scan_binary_files
|
||||||
)
|
)
|
||||||
elif os.path.isfile(path):
|
elif os.path.isfile(path):
|
||||||
if DirectoryHandler.should_print_file(
|
if DirectoryHandler.should_print_file(
|
||||||
@ -28,7 +29,8 @@ def main():
|
|||||||
ignore_file_strings=args.ignore_file_strings,
|
ignore_file_strings=args.ignore_file_strings,
|
||||||
ignore_hidden=args.ignore_hidden,
|
ignore_hidden=args.ignore_hidden,
|
||||||
path_contains=args.path_contains,
|
path_contains=args.path_contains,
|
||||||
content_contains=args.content_contains
|
content_contains=args.content_contains,
|
||||||
|
scan_binary_files=args.scan_binary_files
|
||||||
):
|
):
|
||||||
DirectoryHandler.handle_file(
|
DirectoryHandler.handle_file(
|
||||||
path,
|
path,
|
||||||
@ -36,11 +38,12 @@ def main():
|
|||||||
ignore_file_strings=args.ignore_file_strings,
|
ignore_file_strings=args.ignore_file_strings,
|
||||||
ignore_hidden=args.ignore_hidden,
|
ignore_hidden=args.ignore_hidden,
|
||||||
no_comments=args.no_comments,
|
no_comments=args.no_comments,
|
||||||
compress=args.compress
|
compress=args.compress,
|
||||||
|
scan_binary_files=args.scan_binary_files
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
print(f"Error: {path} is neither a valid file nor a directory.")
|
print(f"Error: {path} is neither a valid file nor a directory.")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user