Optimized parameter

Added binary ignoration
Optimized gitignore function
2025-09-10 20:37:11 +02:00 · 2025-07-09 16:59:18 +02:00 · 2025-04-15 22:11:01 +02:00 · 2025-04-15 22:02:59 +02:00 · 2025-04-15 21:54:22 +02:00 · 2025-04-15 21:53:48 +02:00
8 changed files with 378 additions and 186 deletions
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -0,0 +1,7 @@
+github: kevinveenbirkenbach
+
+patreon: kevinveenbirkenbach
+
+buy_me_a_coffee: kevinveenbirkenbach
+
+custom: https://s.veen.world/paypaldonate
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+*__pycache__*
--- a/README.md
+++ b/README.md
@@ -1,4 +1,6 @@
-# Analysis-Ready Code (ARC)
+# 🤖👩‍🔬 Analysis-Ready Code (ARC)
+[![GitHub Sponsors](https://img.shields.io/badge/Sponsor-GitHub%20Sponsors-blue?logo=github)](https://github.com/sponsors/kevinveenbirkenbach) [![Patreon](https://img.shields.io/badge/Support-Patreon-orange?logo=patreon)](https://www.patreon.com/c/kevinveenbirkenbach) [![Buy Me a Coffee](https://img.shields.io/badge/Buy%20me%20a%20Coffee-Funding-yellow?logo=buymeacoffee)](https://buymeacoffee.com/kevinveenbirkenbach) [![PayPal](https://img.shields.io/badge/Donate-PayPal-blue?logo=paypal)](https://s.veen.world/paypaldonate)
+

 Analysis-Ready Code (ARC) is a Python-based utility designed to recursively scan directories and transform source code into a format optimized for AI and computer analysis. By stripping comments, filtering specific file types, and optionally compressing content, ARC ensures that your code is clean and ready for automated processing.

--- a/cli.py
+++ b/cli.py
@@ -0,0 +1,71 @@
+import argparse
+
+def parse_arguments(argv=None):
+    """
+    Build the command-line parser and parse the given arguments.
+
+    Args:
+        argv: Optional list of argument strings. When None (the default),
+            argparse falls back to sys.argv[1:], so existing callers that
+            pass nothing keep their behavior.
+
+    Returns:
+        argparse.Namespace holding one attribute per option, plus
+        ignore_hidden, which is the negation of show_hidden.
+    """
+    parser = argparse.ArgumentParser(
+        description="Scan directories and print/compile file contents."
+    )
+    parser.add_argument(
+        "paths",
+        nargs='+',
+        help="List of files or directories to scan."
+    )
+    parser.add_argument(
+        "-t", "--file-types",
+        nargs='+',
+        default=[],
+        help="Filter by file types (e.g., .txt, .log)."
+    )
+    parser.add_argument(
+        "-x", "--ignore-file-strings",
+        nargs='+',
+        default=[],
+        help="Ignore files and folders containing these strings."
+    )
+    parser.add_argument(
+        "-S", "--show-hidden",
+        action='store_true',
+        dest='show_hidden',
+        default=False,
+        help="Include hidden directories and files in the scan."
+    )
+    parser.add_argument(
+        "-v", "--verbose",
+        action='store_true',
+        help="Enable verbose mode."
+    )
+    parser.add_argument(
+        "-N", "--no-comments",
+        action='store_true',
+        help="Remove comments from the displayed content based on file type."
+    )
+    parser.add_argument(
+        "-z", "--compress",
+        action='store_true',
+        help="Compress code (for supported file types)."
+    )
+    parser.add_argument(
+        "-p", "--path-contains",
+        nargs='+',
+        default=[],
+        help="Display files whose paths contain one of these strings."
+    )
+    parser.add_argument(
+        "-C", "--content-contains",
+        nargs='+',
+        default=[],
+        help="Display files containing one of these strings in their content."
+    )
+    parser.add_argument(
+        "-G", "--no-gitignore",
+        action='store_true',
+        help="Do not respect .gitignore files during scan."
+    )
+    parser.add_argument(
+        "-b", "--scan-binary-files",
+        action='store_true',
+        help="Scan binary files as well (by default these are ignored)."
+    )
+    args = parser.parse_args(argv)
+    # Downstream code works with ignore_hidden, so derive it from show_hidden.
+    args.ignore_hidden = not args.show_hidden
+    return args
--- a/code_processor.py
+++ b/code_processor.py
@@ -0,0 +1,54 @@
+import re
+import zlib
+
+class CodeProcessor:
+    """
+    Helpers that strip comments from source text and compress it.
+
+    Comment removal is regex based and keyed by file extension; extensions
+    without an entry are returned unchanged apart from surrounding whitespace.
+    """
+    PYTHON = ".py"
+    JS = ".js"
+    C = ".c"
+    CPP = ".cpp"
+    H = ".h"
+    BASH = ".sh"
+    SHELL = ".bash"
+
+    # Each entry is (pattern, replacement, flags) as accepted by re.sub.
+    _HASH_STYLE = [
+        (r'\s*#.*', '', 0)
+    ]
+    # One left-to-right pass for C-like languages: quoted literals are matched
+    # first and kept (so "http://x" survives), then // and /* */ comments are
+    # dropped. DOTALL lets block comments span several lines.
+    _C_STYLE = [
+        (
+            r'("(?:\\.|[^"\\\n])*"' r"|'(?:\\.|[^'\\\n])*')"
+            r'|\s*//[^\n]*|/\*.*?\*/',
+            lambda match: match.group(1) or '',
+            re.DOTALL
+        )
+    ]
+    _COMMENT_PATTERNS = {
+        PYTHON: [
+            (r'\s*#.*', '', 0),
+            (r'\"\"\"(.*?)\"\"\"', '', re.DOTALL),
+            (r"\'\'\'(.*?)\'\'\'", '', re.DOTALL)
+        ],
+        JS: _C_STYLE,
+        C: _C_STYLE,
+        CPP: _C_STYLE,
+        H: _C_STYLE,
+        BASH: _HASH_STYLE,
+        SHELL: _HASH_STYLE
+    }
+
+    @staticmethod
+    def remove_comments(content, file_type):
+        """
+        Remove comments based on file type.
+
+        Args:
+            content: Source text to clean.
+            file_type: File extension including the dot (e.g. ".py").
+
+        Returns:
+            The text with comments removed and leading/trailing whitespace
+            stripped. Unknown extensions are only stripped.
+        """
+        patterns = CodeProcessor._COMMENT_PATTERNS.get(file_type, ())
+        for pattern, repl, flags in patterns:
+            content = re.sub(pattern, repl, content, flags=flags)
+        return content.strip()
+
+    @staticmethod
+    def compress(content):
+        """Compress code using zlib and return the compressed bytes."""
+        return zlib.compress(content.encode())
--- a/directory_handler.py
+++ b/directory_handler.py
@@ -0,0 +1,193 @@
+import os
+import fnmatch
+from code_processor import CodeProcessor
+
+class DirectoryHandler:
+    """
+    Directory scanning helpers: .gitignore handling, binary detection,
+    file filtering and printing of file contents.
+    """
+    # Bytes regarded as text: printable ASCII plus common control characters.
+    _ASCII_TEXT_BYTES = bytes({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x7F)))
+    # Bytes that only count as text when the sample is valid UTF-8.
+    _HIGH_BYTES = bytes(range(0x80, 0x100))
+
+    @staticmethod
+    def load_gitignore_patterns(root_path):
+        """
+        Recursively scans for .gitignore files in the given root_path.
+        Returns a list of tuples (base_dir, patterns) where:
+          - base_dir: the directory in which the .gitignore was found.
+          - patterns: a list of pattern strings from that .gitignore.
+        Unreadable .gitignore files are reported and skipped.
+        """
+        gitignore_data = []
+        for dirpath, _, filenames in os.walk(root_path):
+            if '.gitignore' not in filenames:
+                continue
+            gitignore_path = os.path.join(dirpath, '.gitignore')
+            try:
+                # errors='replace' keeps an oddly encoded file from aborting the scan.
+                with open(gitignore_path, 'r', encoding='utf-8', errors='replace') as f:
+                    stripped = [line.strip() for line in f]
+            except OSError as e:
+                print(f"Error reading {gitignore_path}: {e}")
+                continue
+            # Filter out empty lines and comments.
+            patterns = [line for line in stripped if line and not line.startswith('#')]
+            gitignore_data.append((dirpath, patterns))
+        return gitignore_data
+
+    @staticmethod
+    def is_binary_file(file_path):
+        """
+        Reads the first 1024 bytes of file_path and heuristically determines
+        if the file appears to be binary. Returns True if a null byte is found
+        or if more than 30% of the sampled bytes are non-text. Bytes >= 0x80
+        count as text only when the sample is valid UTF-8, so UTF-8 files made
+        mostly of non-ASCII characters are not mistaken for binary data.
+        Returns False for empty or unreadable files.
+        """
+        import codecs
+
+        try:
+            with open(file_path, 'rb') as f:
+                chunk = f.read(1024)
+        except OSError:
+            # If the file cannot be read in binary mode, assume it's not binary.
+            return False
+        if not chunk:
+            return False
+        # If there's a null byte, it's almost certainly binary.
+        if b'\x00' in chunk:
+            return True
+
+        text_bytes = DirectoryHandler._ASCII_TEXT_BYTES
+        try:
+            # final=False tolerates a multi-byte character cut off at the sample end.
+            codecs.getincrementaldecoder('utf-8')().decode(chunk, final=False)
+        except UnicodeDecodeError:
+            pass
+        else:
+            text_bytes += DirectoryHandler._HIGH_BYTES
+        # translate() deletes every text byte; what remains is non-text.
+        non_text = len(chunk.translate(None, text_bytes))
+        return (non_text / len(chunk)) > 0.30
+
+    @staticmethod
+    def _matches_gitignore_pattern(pattern, parts):
+        """
+        Checks one .gitignore pattern against a path given as a list of
+        components relative to the .gitignore's directory. Wildcards use
+        fnmatch semantics; negated patterns ('!...') are not supported and
+        never match.
+        """
+        if pattern.startswith('!'):
+            return False
+        dir_only = pattern.endswith('/')
+        trimmed = pattern.rstrip('/')
+        # A slash anywhere but the end ties the pattern to the base directory.
+        anchored = '/' in trimmed
+        body = trimmed.lstrip('/')
+        if not body:
+            return False
+        if not anchored:
+            # May match at any depth; directory patterns never match the file name.
+            candidates = parts[:-1] if dir_only else parts
+            return any(fnmatch.fnmatch(part, body) for part in candidates)
+        if not dir_only and fnmatch.fnmatch('/'.join(parts), body):
+            return True
+        # A matching parent directory ignores everything beneath it.
+        return any(
+            fnmatch.fnmatch('/'.join(parts[:depth]), body)
+            for depth in range(1, len(parts))
+        )
+
+    @staticmethod
+    def is_gitignored(file_path, gitignore_data):
+        """
+        Checks if file_path should be ignored according to the .gitignore entries.
+        For each tuple (base_dir, patterns), if file_path is under base_dir,
+        computes the relative path and matches it against the patterns.
+        Returns True as soon as any pattern matches, otherwise False.
+        """
+        for base_dir, patterns in gitignore_data:
+            try:
+                rel_path = os.path.relpath(file_path, base_dir)
+            except ValueError:
+                # file_path and base_dir are on different drives.
+                continue
+            parts = rel_path.split(os.sep)
+            # Compare the first component so names such as '..foo' still count as inside.
+            if parts[0] == os.pardir:
+                continue
+            if any(DirectoryHandler._matches_gitignore_pattern(p, parts) for p in patterns):
+                return True
+        return False
+
+    @staticmethod
+    def filter_directories(dirs, ignore_file_strings, ignore_hidden):
+        """
+        Filter out directories based on ignore_file_strings and hidden status.
+        The list is modified in place so that os.walk skips the removed entries.
+        """
+        if ignore_hidden:
+            dirs[:] = [d for d in dirs if not d.startswith('.')]
+        dirs[:] = [d for d in dirs if not any(ig in d for ig in ignore_file_strings)]
+
+    @staticmethod
+    def path_or_content_contains(file_path, path_contains, content_contains):
+        """
+        Check if the file path contains specific strings or if the file content does.
+        Files that cannot be read or decoded are treated as not matching.
+        """
+        if path_contains and any(whitelist_str in file_path for whitelist_str in path_contains):
+            return True
+
+        if content_contains:
+            try:
+                with open(file_path, 'r') as f:
+                    content = f.read()
+            except (UnicodeDecodeError, OSError):
+                return False
+            return any(whitelist_str in content for whitelist_str in content_contains)
+        return False
+
+    @staticmethod
+    def should_print_file(file_path, file_types, ignore_file_strings, ignore_hidden, path_contains, content_contains, scan_binary_files=False):
+        """
+        Determines if a file should be printed based on various criteria.
+        By default, binary files are skipped unless scan_binary_files is True.
+        """
+        # Path-only checks come first so rejected files are never opened.
+        if ignore_hidden and os.path.basename(file_path).startswith('.'):
+            return False
+
+        if file_types and not file_path.endswith(tuple(file_types)):
+            return False
+
+        if any(ignore_str in file_path for ignore_str in ignore_file_strings):
+            return False
+
+        # Check binary file status using our heuristic.
+        if not scan_binary_files and DirectoryHandler.is_binary_file(file_path):
+            return False
+
+        if path_contains or content_contains:
+            return DirectoryHandler.path_or_content_contains(file_path, path_contains, content_contains)
+        return True
+
+    @staticmethod
+    def print_file_content(file_path, no_comments, compress):
+        """
+        Prints the content of a file, optionally removing comments or compressing the output.
+        Raises SystemExit(1) after printing a warning if the file cannot be decoded.
+        """
+        try:
+            with open(file_path, 'r') as f:
+                content = f.read()
+        except UnicodeDecodeError:
+            print(f"Warning: Could not read file due to encoding issues: {file_path}")
+            # Same effect as exit(1), without relying on the site-provided helper.
+            raise SystemExit(1)
+        if no_comments:
+            file_type = os.path.splitext(file_path)[1]
+            content = CodeProcessor.remove_comments(content, file_type)
+        print(f"<< START: {file_path} >>")
+        if compress:
+            compressed_content = CodeProcessor.compress(content)
+            print("COMPRESSED CODE:")
+            print(compressed_content)
+        else:
+            print(content)
+        print("<< END >>\n")
+
+    @staticmethod
+    def handle_directory(directory, **kwargs):
+        """
+        Scans the directory and processes each file while respecting .gitignore rules.
+        Required keyword arguments: file_types, ignore_file_strings, ignore_hidden,
+        path_contains, content_contains, no_comments, compress.
+        Optional: no_gitignore, verbose, scan_binary_files.
+        """
+        gitignore_data = []
+        if not kwargs.get('no_gitignore'):
+            gitignore_data = DirectoryHandler.load_gitignore_patterns(directory)
+
+        for root, dirs, files in os.walk(directory):
+            DirectoryHandler.filter_directories(dirs, kwargs['ignore_file_strings'], kwargs['ignore_hidden'])
+            for file in files:
+                file_path = os.path.join(root, file)
+                if gitignore_data and DirectoryHandler.is_gitignored(file_path, gitignore_data):
+                    if kwargs.get('verbose'):
+                        print(f"Skipped (gitignored): {file_path}")
+                    continue
+
+                if DirectoryHandler.should_print_file(
+                    file_path,
+                    kwargs['file_types'],
+                    kwargs['ignore_file_strings'],
+                    kwargs['ignore_hidden'],
+                    kwargs['path_contains'],
+                    kwargs['content_contains'],
+                    scan_binary_files=kwargs.get('scan_binary_files', False)
+                ):
+                    DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress'])
+                elif kwargs.get('verbose'):
+                    print(f"Skipped file: {file_path}")
+
+    @staticmethod
+    def handle_file(file_path, **kwargs):
+        """
+        Processes an individual file.
+        Only the no_comments and compress keyword arguments are used.
+        """
+        DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress'])
--- a/main.py
+++ b/main.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+import os
+import sys
+from cli import parse_arguments
+from directory_handler import DirectoryHandler
+
+def main():
+    """
+    Entry point: parse the command line and process every given path.
+
+    Directories are scanned recursively, single files are printed when they
+    pass the filters, and any other path aborts the run with exit code 1.
+    """
+    args = parse_arguments()
+
+    # Option groups shared between the directory and single-file branches.
+    selection = {
+        'file_types': args.file_types,
+        'ignore_file_strings': args.ignore_file_strings,
+        'ignore_hidden': args.ignore_hidden,
+    }
+    matching = {
+        'path_contains': args.path_contains,
+        'content_contains': args.content_contains,
+    }
+    rendering = {
+        'no_comments': args.no_comments,
+        'compress': args.compress,
+    }
+    scan_binary = args.scan_binary_files
+
+    for path in args.paths:
+        if os.path.isdir(path):
+            DirectoryHandler.handle_directory(
+                path,
+                verbose=args.verbose,
+                no_gitignore=args.no_gitignore,
+                scan_binary_files=scan_binary,
+                **selection,
+                **matching,
+                **rendering
+            )
+            continue
+
+        if not os.path.isfile(path):
+            print(f"Error: {path} is neither a valid file nor a directory.")
+            sys.exit(1)
+
+        wanted = DirectoryHandler.should_print_file(
+            path,
+            scan_binary_files=scan_binary,
+            **selection,
+            **matching
+        )
+        if wanted:
+            DirectoryHandler.handle_file(
+                path,
+                scan_binary_files=scan_binary,
+                **selection,
+                **rendering
+            )
+
+# Run the command-line interface only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/scan.py
+++ b/scan.py
@@ -1,185 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-import argparse
-import re
-import zlib
-
-class CodeProcessor:
-    PYTHON = ".py"
-    JS = ".js"
-    C = ".c"
-    CPP = ".cpp"
-    H = ".h"
-    BASH = ".sh"
-    SHELL = ".bash"
-
-    @staticmethod
-    def remove_comments(content, file_type):
-        """Remove comments based on file type."""
-        comment_patterns = {
-            CodeProcessor.PYTHON: [
-                (r'\s*#.*', '',0),
-                (r'\"\"\"(.*?)\"\"\"', '', re.DOTALL),
-                (r"\'\'\'(.*?)\'\'\'", '', re.DOTALL)
-            ],
-            CodeProcessor.JS: [
-                (r'\s*//.*', '',0),
-                (r'/\*.*?\*/', '',0)
-            ],
-            CodeProcessor.C: [
-                (r'\s*//.*', '',0),
-                (r'/\*.*?\*/', '',0)
-            ],
-            CodeProcessor.CPP: [
-                (r'\s*//.*', '',0),
-                (r'/\*.*?\*/', '',0)
-            ],
-            CodeProcessor.H: [
-                (r'\s*//.*', '',0),
-                (r'/\*.*?\*/', '',0)
-            ],
-            CodeProcessor.BASH: [
-                (r'\s*#.*', '', 0)
-            ],
-            CodeProcessor.SHELL: [
-                (r'\s*#.*', '', 0)
-            ]
-        }
-
-        patterns = comment_patterns.get(file_type, [])
-        for pattern, repl, flags in patterns:
-            content = re.sub(pattern, repl, content, flags=flags)
-        return content.strip()
-
-    @staticmethod
-    def compress(content):
-        """Compress code using zlib."""
-        return zlib.compress(content.encode())
-
-
-class DirectoryHandler:
-    
-    @staticmethod
-    def filter_directories(dirs, ignore_file_strings, ignore_hidden):
-        """Filter out directories based on ignore criteria."""
-        if ignore_hidden:
-            dirs[:] = [d for d in dirs if not d.startswith('.')]
-        dirs[:] = [d for d in dirs if not any(ig in d for ig in ignore_file_strings)]
-
-    @staticmethod
-    def path_or_content_contains(file_path, path_contains, content_contains):
-        # Check if the file name contains specific strings (whitelist)
-        if path_contains and any(whitelist_str in file_path for whitelist_str in path_contains):
-            return True
-
-        # Check file content for specific strings (if specified)
-        if content_contains:
-            try:
-                with open(file_path, 'r') as f:
-                    content = f.read()
-                # Return True if any of the content_contains strings are found in the content
-                if any(whitelist_str in content for whitelist_str in content_contains):
-                    return True
-            except UnicodeDecodeError:
-                # Return False if there's a Unicode decode error (file can't be read)
-                return False
-        return False
-
-    @staticmethod
-    def should_print_file(file_path, file_types, ignore_file_strings, ignore_hidden, path_contains, content_contains):
-        """
-        Determine if a file should be printed based on various criteria.
-
-        Args:
-        file_path (str): The path of the file to be checked.
-        file_types (list): List of allowed file extensions.
-        ignore_file_strings (list): List of strings; if any are found in the file path, the file is ignored.
-        ignore_hidden (bool): If True, hidden files (starting with '.') are ignored.
-        path_contains (list): List of strings; the file is processed only if its path contains one of these strings.
-        content_contains (list): List of strings; the file is processed only if its content contains one of these strings.
-
-        Returns:
-        bool: True if the file should be printed, False otherwise.
-        """
-
-        # Check for hidden files if ignore_hidden is enabled
-        if ignore_hidden and os.path.basename(file_path).startswith('.'):
-            return False
-
-        # Check if the file type is in the allowed list (if specified)
-        if file_types and not any(file_path.endswith(file_type) for file_type in file_types):
-            return False
-
-        # Check if the file should be ignored based on the presence of specific strings in its path
-        if any(ignore_str in file_path for ignore_str in ignore_file_strings):
-            return False
-
-        if path_contains or content_contains:
-            return DirectoryHandler.path_or_content_contains(file_path, path_contains, content_contains)
-        return True
-    
-    @staticmethod
-    def print_file_content(file_path, no_comments, compress):
-        """Print the content of a file."""
-        try:
-            with open(file_path, 'r') as f:
-                content = f.read()
-            if no_comments:
-                file_type = os.path.splitext(file_path)[1]
-                content = CodeProcessor.remove_comments(content, file_type)
-            print(f"<< START: {file_path} >>")
-            if compress:
-                compressed_content = CodeProcessor.compress(content)
-                print(f"COMPRESSED CODE: ")
-                print(compressed_content)
-            else:
-                print(content)
-            print("<< END >>\n")
-        except UnicodeDecodeError:
-            print(f"Warning: Could not read file due to encoding issues: {file_path}")
-            exit(1)
-
-    @staticmethod
-    def handle_directory(directory, **kwargs):
-        """Handle scanning and printing for directories."""
-        for root, dirs, files in os.walk(directory):
-            DirectoryHandler.filter_directories(dirs, kwargs['ignore_file_strings'], kwargs['ignore_hidden'])
-            for file in files:
-                if DirectoryHandler.should_print_file(os.path.join(root, file), kwargs['file_types'], kwargs['ignore_file_strings'], kwargs['ignore_hidden'], kwargs['path_contains'], kwargs['content_contains']):
-                    DirectoryHandler.print_file_content(os.path.join(root, file), kwargs['no_comments'], kwargs['compress'])
-                elif kwargs['verbose']:
-                    print(f"Skipped file: {file}")
-
-    @staticmethod
-    def handle_file(file_path, **kwargs):
-        """Handle scanning and printing for individual files."""
-        DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress'])
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Scan directories and print/compile file contents.")
-    parser.add_argument("paths", nargs='+', help="List of files or directories to scan.")
-    parser.add_argument("--file-types", nargs='+', default=[], help="Filter by file types (e.g., .txt .log).")
-    parser.add_argument("--ignore-file-strings", nargs='+', default=[], help="Ignore files and folders containing these strings.")
-    parser.add_argument("--ignore-hidden", action='store_true', help="Ignore hidden directories and files.")
-    parser.add_argument("-v", "--verbose", action='store_true', help="Enable verbose mode.")
-    parser.add_argument("--no-comments", action='store_true', help="Remove comments from the displayed content based on file type.")
-    parser.add_argument("--compress", action='store_true', help="Compress code (for Python files).")
-    parser.add_argument("--path-contains", nargs='+', default=[], help="Display files whose paths contain one of these strings.")
-    parser.add_argument("--content-contains", nargs='+', default=[], help="Display files containing one of these strings in their content.")
-    
-    args = parser.parse_args()
-    
-    for path in args.paths:
-        if os.path.isdir(path):
-            DirectoryHandler.handle_directory(path, file_types=args.file_types, ignore_file_strings=args.ignore_file_strings, ignore_hidden=args.ignore_hidden, verbose=args.verbose, no_comments=args.no_comments, compress=args.compress, path_contains=args.path_contains, content_contains=args.content_contains)
-        elif os.path.isfile(path):
-            if DirectoryHandler.should_print_file(path, file_types=args.file_types, ignore_file_strings=args.ignore_file_strings, ignore_hidden=args.ignore_hidden, path_contains=args.path_contains, content_contains=args.content_contains):
-                DirectoryHandler.handle_file(path, file_types=args.file_types, ignore_file_strings=args.ignore_file_strings, ignore_hidden=args.ignore_hidden, no_comments=args.no_comments, compress=args.compress)
-        else:
-            print(f"Error: {path} is neither a valid file nor a directory.")
-            exit(1)
-
-if __name__ == "__main__":
-    main()
Author	SHA1	Message	Date
Kevin Veen-Birkenbach	c5938cf482	Optimized parameter	2025-07-09 16:59:18 +02:00
Kevin Veen-Birkenbach	847b40e9e6	Added binary ignoration	2025-04-15 22:11:01 +02:00
Kevin Veen-Birkenbach	69477fa29e	Optimized gitignore function	2025-04-15 22:02:59 +02:00
Kevin Veen-Birkenbach	ab62b4d1b9	Renamed .gitignore	2025-04-15 21:54:22 +02:00
Kevin Veen-Birkenbach	485f068fa5	Ignored __pycache__	2025-04-15 21:53:48 +02:00
Kevin Veen-Birkenbach	bf2f548a1f	Refactored	2025-04-15 21:52:42 +02:00
Kevin Veen-Birkenbach	11b325ee25	Added automatic ignore option for .gitignore	2025-04-15 21:47:41 +02:00
Kevin Veen-Birkenbach	4953993321	Added Funding	2025-03-12 20:47:42 +01:00
Kevin Veen-Birkenbach	49601176e0	Merge branch 'main' of github.com:kevinveenbirkenbach/analysis-ready-code	2025-03-12 11:14:39 +01:00
Kevin Veen-Birkenbach	b822435762	renamed main.py	2025-03-12 10:49:36 +01:00
Kevin Veen-Birkenbach	843b16563e	renamed main.py	2025-03-12 10:38:59 +01:00
Kevin Veen-Birkenbach	9de33b67db	renamed to main.py to make it installable	2025-03-06 10:10:09 +01:00