diff --git a/cli.py b/cli.py index 219bf3c..3d6df39 100644 --- a/cli.py +++ b/cli.py @@ -5,14 +5,15 @@ def parse_arguments(): description="Scan directories and print/compile file contents." ) parser.add_argument("paths", nargs='+', help="List of files or directories to scan.") - parser.add_argument("--file-types", nargs='+', default=[], help="Filter by file types (e.g., .txt .log).") + parser.add_argument("--file-types", nargs='+', default=[], help="Filter by file types (e.g., .txt, .log).") parser.add_argument("--ignore-file-strings", nargs='+', default=[], help="Ignore files and folders containing these strings.") parser.add_argument("--ignore-hidden", action='store_true', help="Ignore hidden directories and files.") parser.add_argument("-v", "--verbose", action='store_true', help="Enable verbose mode.") parser.add_argument("--no-comments", action='store_true', help="Remove comments from the displayed content based on file type.") - parser.add_argument("--compress", action='store_true', help="Compress code (for Python files).") + parser.add_argument("--compress", action='store_true', help="Compress code (for supported file types).") parser.add_argument("--path-contains", nargs='+', default=[], help="Display files whose paths contain one of these strings.") parser.add_argument("--content-contains", nargs='+', default=[], help="Display files containing one of these strings in their content.") parser.add_argument("--no-gitignore", action='store_true', help="Do not respect .gitignore files during scan.") + parser.add_argument("--scan-binary-files", action='store_true', help="Scan binary files as well (by default these are ignored).") - return parser.parse_args() \ No newline at end of file + return parser.parse_args() diff --git a/directory_handler.py b/directory_handler.py index d070a53..a661622 100644 --- a/directory_handler.py +++ b/directory_handler.py @@ -20,11 +20,36 @@ class DirectoryHandler: lines = f.readlines() # Filter out empty lines and comments. 
@staticmethod
def is_binary_file(file_path, sample_size=1024, threshold=0.30):
    """
    Heuristically decide whether file_path looks like a binary file.

    The first sample_size bytes are read in binary mode and classified:

    * an empty sample (empty file) is treated as text;
    * any null byte marks the file as binary;
    * if the sample is valid UTF-8, only control characters outside the
      usual text set (BEL, BS, TAB, LF, FF, CR, ESC) count as non-text,
      so text written in non-Latin scripts is not mistaken for binary;
    * otherwise every byte outside printable ASCII and that control set
      counts as non-text.

    The file is binary when the non-text share is strictly greater than
    threshold.

    Args:
        file_path: Path of the file to inspect.
        sample_size: Number of leading bytes to examine (default 1024).
        threshold: Non-text fraction above which the file is considered
            binary (default 0.30).

    Returns:
        True if the sample looks binary, False otherwise. False is also
        returned when the file cannot be opened or read, so unreadable
        files are left for the caller's normal handling.
    """
    # Function-scope import keeps this block self-contained.
    import codecs

    try:
        with open(file_path, 'rb') as f:
            chunk = f.read(sample_size)
    except (OSError, TypeError, ValueError):
        # Best effort: a file that cannot be sampled is not flagged as binary.
        return False

    if not chunk:
        return False

    # A null byte is almost never present in text files.
    if b'\x00' in chunk:
        return True

    # Byte values regarded as text: common control characters + printable ASCII.
    text_codes = {7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x7F))

    try:
        decoder = codecs.getincrementaldecoder('utf-8')(errors='strict')
        # When the sample filled the whole buffer, a multi-byte character may
        # have been cut at the boundary; final=False tolerates exactly that.
        text = decoder.decode(chunk, final=len(chunk) < sample_size)
    except UnicodeDecodeError:
        text = None

    if text:
        # Valid UTF-8: non-ASCII characters are text, only stray ASCII
        # control characters count against the file.
        non_text = sum(
            1 for ch in text
            if ord(ch) < 0x80 and ord(ch) not in text_codes
        )
        return non_text / len(text) > threshold

    # Not UTF-8: fall back to the plain byte-level heuristic. Deleting all
    # text bytes leaves exactly the non-text ones.
    non_text = len(chunk.translate(None, bytes(text_codes)))
    return non_text / len(chunk) > threshold
@@ -84,10 +109,15 @@ class DirectoryHandler: return False @staticmethod - def should_print_file(file_path, file_types, ignore_file_strings, ignore_hidden, path_contains, content_contains): + def should_print_file(file_path, file_types, ignore_file_strings, ignore_hidden, path_contains, content_contains, scan_binary_files=False): """ Determines if a file should be printed based on various criteria. + By default, binary files are skipped unless scan_binary_files is True. """ + # Check binary file status using our heuristic. + if not scan_binary_files and DirectoryHandler.is_binary_file(file_path): + return False + if ignore_hidden and os.path.basename(file_path).startswith('.'): return False @@ -148,7 +178,8 @@ class DirectoryHandler: kwargs['ignore_file_strings'], kwargs['ignore_hidden'], kwargs['path_contains'], - kwargs['content_contains'] + kwargs['content_contains'], + scan_binary_files=kwargs.get('scan_binary_files', False) ): DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress']) elif kwargs.get('verbose'): diff --git a/main.py b/main.py index 641338b..5852dd6 100755 --- a/main.py +++ b/main.py @@ -19,7 +19,8 @@ def main(): compress=args.compress, path_contains=args.path_contains, content_contains=args.content_contains, - no_gitignore=args.no_gitignore + no_gitignore=args.no_gitignore, + scan_binary_files=args.scan_binary_files ) elif os.path.isfile(path): if DirectoryHandler.should_print_file( @@ -28,7 +29,8 @@ def main(): ignore_file_strings=args.ignore_file_strings, ignore_hidden=args.ignore_hidden, path_contains=args.path_contains, - content_contains=args.content_contains + content_contains=args.content_contains, + scan_binary_files=args.scan_binary_files ): DirectoryHandler.handle_file( path, @@ -36,11 +38,12 @@ def main(): ignore_file_strings=args.ignore_file_strings, ignore_hidden=args.ignore_hidden, no_comments=args.no_comments, - compress=args.compress + compress=args.compress, + 
scan_binary_files=args.scan_binary_files ) else: print(f"Error: {path} is neither a valid file nor a directory.") sys.exit(1) if __name__ == "__main__": - main() \ No newline at end of file + main()