From 847b40e9e638c4f950a418c3c5caa42e38a3a65b Mon Sep 17 00:00:00 2001
From: Kevin Veen-Birkenbach <kevin@veen.world>
Date: Tue, 15 Apr 2025 22:11:01 +0200
Subject: [PATCH] Ignore binary files by default

---
 cli.py               |  7 ++++---
 directory_handler.py | 41 ++++++++++++++++++++++++++++++++++++-----
 main.py              | 11 +++++++----
 3 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/cli.py b/cli.py
index 219bf3c..3d6df39 100644
--- a/cli.py
+++ b/cli.py
@@ -5,14 +5,15 @@ def parse_arguments():
         description="Scan directories and print/compile file contents."
     )
     parser.add_argument("paths", nargs='+', help="List of files or directories to scan.")
-    parser.add_argument("--file-types", nargs='+', default=[], help="Filter by file types (e.g., .txt .log).")
+    parser.add_argument("--file-types", nargs='+', default=[], help="Filter by file types (e.g., .txt, .log).")
     parser.add_argument("--ignore-file-strings", nargs='+', default=[], help="Ignore files and folders containing these strings.")
     parser.add_argument("--ignore-hidden", action='store_true', help="Ignore hidden directories and files.")
     parser.add_argument("-v", "--verbose", action='store_true', help="Enable verbose mode.")
     parser.add_argument("--no-comments", action='store_true', help="Remove comments from the displayed content based on file type.")
-    parser.add_argument("--compress", action='store_true', help="Compress code (for Python files).")
+    parser.add_argument("--compress", action='store_true', help="Compress code (for supported file types).")
     parser.add_argument("--path-contains", nargs='+', default=[], help="Display files whose paths contain one of these strings.")
     parser.add_argument("--content-contains", nargs='+', default=[], help="Display files containing one of these strings in their content.")
     parser.add_argument("--no-gitignore", action='store_true', help="Do not respect .gitignore files during scan.")
+    parser.add_argument("--scan-binary-files", action='store_true', help="Scan binary files as well (by default these are ignored).")
     
-    return parser.parse_args()
\ No newline at end of file
+    return parser.parse_args()
diff --git a/directory_handler.py b/directory_handler.py
index d070a53..a661622 100644
--- a/directory_handler.py
+++ b/directory_handler.py
@@ -20,11 +20,36 @@ class DirectoryHandler:
                         lines = f.readlines()
                     # Filter out empty lines and comments.
                     patterns = [line.strip() for line in lines if line.strip() and not line.strip().startswith('#')]
-                    # Save the base directory and its patterns
+                    # Save the base directory and its patterns.
                     gitignore_data.append((dirpath, patterns))
                 except Exception as e:
                     print(f"Error reading {gitignore_path}: {e}")
         return gitignore_data
+    
+    @staticmethod
+    def is_binary_file(file_path):
+        """
+        Heuristically decide from the first 1024 bytes whether file_path is binary.
+        Returns True on a null byte or when over 30% of the sample is non-text; valid
+        UTF-8 is judged per character so non-ASCII text is not mistaken for binary.
+        """
+        import codecs  # Imported locally: the module's import block is not part of this patch.
+        try:
+            with open(file_path, 'rb') as f:
+                chunk = f.read(1024)
+        except Exception:
+            return False  # Unreadable files are reported as not binary (best effort).
+        if b'\x00' in chunk:
+            return True  # A null byte is almost certainly binary.
+        # ASCII printable characters plus common control characters count as text.
+        text_codes = {7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x7F))
+        try:
+            # The incremental decoder tolerates a character cut off at the 1024-byte limit.
+            sample = [ord(ch) for ch in codecs.getincrementaldecoder('utf-8')().decode(chunk)]
+            non_text = sum(cp < 0xA0 and cp not in text_codes for cp in sample)
+        except UnicodeDecodeError:
+            sample, non_text = chunk, sum(byte not in text_codes for byte in chunk)
+        return len(sample) > 0 and (non_text / len(sample)) > 0.30
 
     @staticmethod
     def is_gitignored(file_path, gitignore_data):
@@ -37,12 +62,12 @@ class DirectoryHandler:
             try:
                 rel_path = os.path.relpath(file_path, base_dir)
             except ValueError:
-                # file_path and base_dir are on different drives
+                # file_path and base_dir are on different drives.
                 continue
             # If the file is not under the current .gitignore base_dir, skip it.
             if rel_path.startswith('..'):
                 continue
-            # Check all patterns
+            # Check all patterns.
             for pattern in patterns:
                 if pattern.endswith('/'):
                     # Directory pattern: check if any folder in the relative path matches.
@@ -84,10 +109,15 @@ class DirectoryHandler:
         return False
 
     @staticmethod
-    def should_print_file(file_path, file_types, ignore_file_strings, ignore_hidden, path_contains, content_contains):
+    def should_print_file(file_path, file_types, ignore_file_strings, ignore_hidden, path_contains, content_contains, scan_binary_files=False):
         """
         Determines if a file should be printed based on various criteria.
+        By default, binary files are skipped unless scan_binary_files is True.
         """
+        # Check binary file status using our heuristic.
+        if not scan_binary_files and DirectoryHandler.is_binary_file(file_path):
+            return False
+
         if ignore_hidden and os.path.basename(file_path).startswith('.'):
             return False
 
@@ -148,7 +178,8 @@ class DirectoryHandler:
                     kwargs['ignore_file_strings'],
                     kwargs['ignore_hidden'],
                     kwargs['path_contains'],
-                    kwargs['content_contains']
+                    kwargs['content_contains'],
+                    scan_binary_files=kwargs.get('scan_binary_files', False)
                 ):
                     DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress'])
                 elif kwargs.get('verbose'):
diff --git a/main.py b/main.py
index 641338b..5852dd6 100755
--- a/main.py
+++ b/main.py
@@ -19,7 +19,8 @@ def main():
                 compress=args.compress,
                 path_contains=args.path_contains,
                 content_contains=args.content_contains,
-                no_gitignore=args.no_gitignore
+                no_gitignore=args.no_gitignore,
+                scan_binary_files=args.scan_binary_files
             )
         elif os.path.isfile(path):
             if DirectoryHandler.should_print_file(
@@ -28,7 +29,8 @@ def main():
                 ignore_file_strings=args.ignore_file_strings,
                 ignore_hidden=args.ignore_hidden,
                 path_contains=args.path_contains,
-                content_contains=args.content_contains
+                content_contains=args.content_contains,
+                scan_binary_files=args.scan_binary_files
             ):
                 DirectoryHandler.handle_file(
                     path,
@@ -36,11 +38,12 @@ def main():
                     ignore_file_strings=args.ignore_file_strings,
                     ignore_hidden=args.ignore_hidden,
                     no_comments=args.no_comments,
-                    compress=args.compress
+                    compress=args.compress,
+                    scan_binary_files=args.scan_binary_files
                 )
         else:
             print(f"Error: {path} is neither a valid file nor a directory.")
             sys.exit(1)
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()