diff --git a/README.md b/README.md index 9140a47..3db3f4a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Analysis-Ready Code (ARC) +# 🤖👩‍🔬 Analysis-Ready Code (ARC) Analysis-Ready Code (ARC) is a Python-based utility designed to recursively scan directories and transform source code into a format optimized for AI and computer analysis. By stripping comments, filtering specific file types, and optionally compressing content, ARC ensures that your code is clean and ready for automated processing. diff --git a/scan.py b/scan.py deleted file mode 100755 index 381443c..0000000 --- a/scan.py +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python3 - -import os -import argparse -import re -import zlib - -class CodeProcessor: - PYTHON = ".py" - JS = ".js" - C = ".c" - CPP = ".cpp" - H = ".h" - BASH = ".sh" - SHELL = ".bash" - - @staticmethod - def remove_comments(content, file_type): - """Remove comments based on file type.""" - comment_patterns = { - CodeProcessor.PYTHON: [ - (r'\s*#.*', '',0), - (r'\"\"\"(.*?)\"\"\"', '', re.DOTALL), - (r"\'\'\'(.*?)\'\'\'", '', re.DOTALL) - ], - CodeProcessor.JS: [ - (r'\s*//.*', '',0), - (r'/\*.*?\*/', '',0) - ], - CodeProcessor.C: [ - (r'\s*//.*', '',0), - (r'/\*.*?\*/', '',0) - ], - CodeProcessor.CPP: [ - (r'\s*//.*', '',0), - (r'/\*.*?\*/', '',0) - ], - CodeProcessor.H: [ - (r'\s*//.*', '',0), - (r'/\*.*?\*/', '',0) - ], - CodeProcessor.BASH: [ - (r'\s*#.*', '', 0) - ], - CodeProcessor.SHELL: [ - (r'\s*#.*', '', 0) - ] - } - - patterns = comment_patterns.get(file_type, []) - for pattern, repl, flags in patterns: - content = re.sub(pattern, repl, content, flags=flags) - return content.strip() - - @staticmethod - def compress(content): - """Compress code using zlib.""" - return zlib.compress(content.encode()) - - -class DirectoryHandler: - - @staticmethod - def filter_directories(dirs, ignore_file_strings, ignore_hidden): - """Filter out directories based on ignore criteria.""" - if ignore_hidden: - dirs[:] = [d for d in dirs if not d.startswith('.')] - dirs[:] = [d for d in dirs if not any(ig in d for ig in ignore_file_strings)] - - @staticmethod - def path_or_content_contains(file_path, path_contains, content_contains): - # Check if the file name contains specific strings (whitelist) - if path_contains and any(whitelist_str in file_path for whitelist_str in path_contains): - return True - - # Check file content for specific strings (if specified) - if content_contains: - try: - with open(file_path, 'r') as f: - content = f.read() - # Return True if any of the content_contains strings are found in the content - if any(whitelist_str in content for whitelist_str in content_contains): - return True - except UnicodeDecodeError: - # Return False if there's a Unicode decode error (file can't be read) - return False - return False - - @staticmethod - def should_print_file(file_path, file_types, ignore_file_strings, ignore_hidden, path_contains, content_contains): - """ - Determine if a file should be printed based on various criteria. - - Args: - file_path (str): The path of the file to be checked. - file_types (list): List of allowed file extensions. - ignore_file_strings (list): List of strings; if any are found in the file path, the file is ignored. - ignore_hidden (bool): If True, hidden files (starting with '.') are ignored. - path_contains (list): List of strings; the file is processed only if its path contains one of these strings. - content_contains (list): List of strings; the file is processed only if its content contains one of these strings. - - Returns: - bool: True if the file should be printed, False otherwise. - """ - - # Check for hidden files if ignore_hidden is enabled - if ignore_hidden and os.path.basename(file_path).startswith('.'): - return False - - # Check if the file type is in the allowed list (if specified) - if file_types and not any(file_path.endswith(file_type) for file_type in file_types): - return False - - # Check if the file should be ignored based on the presence of specific strings in its path - if any(ignore_str in file_path for ignore_str in ignore_file_strings): - return False - - if path_contains or content_contains: - return DirectoryHandler.path_or_content_contains(file_path, path_contains, content_contains) - return True - - @staticmethod - def print_file_content(file_path, no_comments, compress): - """Print the content of a file.""" - try: - with open(file_path, 'r') as f: - content = f.read() - if no_comments: - file_type = os.path.splitext(file_path)[1] - content = CodeProcessor.remove_comments(content, file_type) - print(f"<< START: {file_path} >>") - if compress: - compressed_content = CodeProcessor.compress(content) - print(f"COMPRESSED CODE: ") - print(compressed_content) - else: - print(content) - print("<< END >>\n") - except UnicodeDecodeError: - print(f"Warning: Could not read file due to encoding issues: {file_path}") - exit(1) - - @staticmethod - def handle_directory(directory, **kwargs): - """Handle scanning and printing for directories.""" - for root, dirs, files in os.walk(directory): - DirectoryHandler.filter_directories(dirs, kwargs['ignore_file_strings'], kwargs['ignore_hidden']) - for file in files: - if DirectoryHandler.should_print_file(os.path.join(root, file), kwargs['file_types'], kwargs['ignore_file_strings'], kwargs['ignore_hidden'], kwargs['path_contains'], kwargs['content_contains']): - DirectoryHandler.print_file_content(os.path.join(root, file), kwargs['no_comments'], kwargs['compress']) - elif kwargs['verbose']: - print(f"Skipped file: {file}") - - @staticmethod - def handle_file(file_path, **kwargs): - """Handle scanning and printing for individual files.""" - DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress']) - - -def main(): - parser = argparse.ArgumentParser(description="Scan directories and print/compile file contents.") - parser.add_argument("paths", nargs='+', help="List of files or directories to scan.") - parser.add_argument("--file-types", nargs='+', default=[], help="Filter by file types (e.g., .txt .log).") - parser.add_argument("--ignore-file-strings", nargs='+', default=[], help="Ignore files and folders containing these strings.") - parser.add_argument("--ignore-hidden", action='store_true', help="Ignore hidden directories and files.") - parser.add_argument("-v", "--verbose", action='store_true', help="Enable verbose mode.") - parser.add_argument("--no-comments", action='store_true', help="Remove comments from the displayed content based on file type.") - parser.add_argument("--compress", action='store_true', help="Compress code (for Python files).") - parser.add_argument("--path-contains", nargs='+', default=[], help="Display files whose paths contain one of these strings.") - parser.add_argument("--content-contains", nargs='+', default=[], help="Display files containing one of these strings in their content.") - - args = parser.parse_args() - - for path in args.paths: - if os.path.isdir(path): - DirectoryHandler.handle_directory(path, file_types=args.file_types, ignore_file_strings=args.ignore_file_strings, ignore_hidden=args.ignore_hidden, verbose=args.verbose, no_comments=args.no_comments, compress=args.compress, path_contains=args.path_contains, content_contains=args.content_contains) - elif os.path.isfile(path): - if DirectoryHandler.should_print_file(path, file_types=args.file_types, ignore_file_strings=args.ignore_file_strings, ignore_hidden=args.ignore_hidden, path_contains=args.path_contains, content_contains=args.content_contains): - DirectoryHandler.handle_file(path, file_types=args.file_types, ignore_file_strings=args.ignore_file_strings, ignore_hidden=args.ignore_hidden, no_comments=args.no_comments, compress=args.compress) - else: - print(f"Error: {path} is neither a valid file nor a directory.") - exit(1) - -if __name__ == "__main__": - main() \ No newline at end of file