Compare commits

..

12 Commits

8 changed files with 378 additions and 186 deletions

7
.github/FUNDING.yml vendored Normal file
View File

@@ -0,0 +1,7 @@
github: kevinveenbirkenbach
patreon: kevinveenbirkenbach
buy_me_a_coffee: kevinveenbirkenbach
custom: https://s.veen.world/paypaldonate

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
*__pycache__*

View File

@@ -1,4 +1,6 @@
# Analysis-Ready Code (ARC)
# 🤖👩‍🔬 Analysis-Ready Code (ARC)
[![GitHub Sponsors](https://img.shields.io/badge/Sponsor-GitHub%20Sponsors-blue?logo=github)](https://github.com/sponsors/kevinveenbirkenbach) [![Patreon](https://img.shields.io/badge/Support-Patreon-orange?logo=patreon)](https://www.patreon.com/c/kevinveenbirkenbach) [![Buy Me a Coffee](https://img.shields.io/badge/Buy%20me%20a%20Coffee-Funding-yellow?logo=buymeacoffee)](https://buymeacoffee.com/kevinveenbirkenbach) [![PayPal](https://img.shields.io/badge/Donate-PayPal-blue?logo=paypal)](https://s.veen.world/paypaldonate)
Analysis-Ready Code (ARC) is a Python-based utility designed to recursively scan directories and transform source code into a format optimized for AI and computer analysis. By stripping comments, filtering specific file types, and optionally compressing content, ARC ensures that your code is clean and ready for automated processing.

71
cli.py Normal file
View File

@@ -0,0 +1,71 @@
import argparse
def parse_arguments(argv=None):
    """
    Build the command-line parser and parse the given arguments.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse falls back to sys.argv[1:], so existing callers that
            pass nothing behave exactly as before.

    Returns:
        argparse.Namespace holding every option defined below plus a derived
        ``ignore_hidden`` attribute, which is the inverse of ``show_hidden``.
    """
    parser = argparse.ArgumentParser(
        description="Scan directories and print/compile file contents."
    )
    parser.add_argument(
        "paths",
        nargs='+',
        help="List of files or directories to scan."
    )
    parser.add_argument(
        "-t", "--file-types",
        nargs='+',
        default=[],
        help="Filter by file types (e.g., .txt, .log)."
    )
    parser.add_argument(
        "-x", "--ignore-file-strings",
        nargs='+',
        default=[],
        help="Ignore files and folders containing these strings."
    )
    parser.add_argument(
        "-S", "--show-hidden",
        action='store_true',
        dest='show_hidden',
        default=False,
        help="Include hidden directories and files in the scan."
    )
    parser.add_argument(
        "-v", "--verbose",
        action='store_true',
        help="Enable verbose mode."
    )
    parser.add_argument(
        "-N", "--no-comments",
        action='store_true',
        help="Remove comments from the displayed content based on file type."
    )
    parser.add_argument(
        "-z", "--compress",
        action='store_true',
        help="Compress code (for supported file types)."
    )
    parser.add_argument(
        "-p", "--path-contains",
        nargs='+',
        default=[],
        help="Display files whose paths contain one of these strings."
    )
    parser.add_argument(
        "-C", "--content-contains",
        nargs='+',
        default=[],
        help="Display files containing one of these strings in their content."
    )
    parser.add_argument(
        "-G", "--no-gitignore",
        action='store_true',
        help="Do not respect .gitignore files during scan."
    )
    parser.add_argument(
        "-b", "--scan-binary-files",
        action='store_true',
        help="Scan binary files as well (by default these are ignored)."
    )

    args = parser.parse_args(argv)
    # Downstream code works with "ignore hidden" semantics, while the CLI
    # exposes the opposite switch (--show-hidden); derive one from the other.
    args.ignore_hidden = not args.show_hidden
    return args

54
code_processor.py Normal file
View File

@@ -0,0 +1,54 @@
import re
import zlib
class CodeProcessor:
    """Regex-based comment stripping and zlib compression for source text."""

    # File extensions (including the leading dot) that have stripping rules.
    PYTHON = ".py"
    JS = ".js"
    C = ".c"
    CPP = ".cpp"
    H = ".h"
    BASH = ".sh"
    SHELL = ".bash"

    @staticmethod
    def remove_comments(content, file_type):
        """
        Remove comments from ``content`` based on its file extension.

        The rules are plain regular expressions, not a parser: comment
        markers that appear inside string literals are stripped as well.

        Args:
            content: Source text to clean.
            file_type: File extension including the dot (e.g. ".py").
                Extensions without rules leave the text untouched apart
                from the final strip.

        Returns:
            The text with comments removed and surrounding whitespace
            stripped.
        """
        # "#" to end of line, together with the whitespace in front of it.
        hash_line = (r'\s*#.*', '', 0)
        # "//" line comments and "/* ... */" block comments. DOTALL lets the
        # block pattern span line breaks; without it multi-line block
        # comments are never matched.
        c_style = [
            (r'\s*//.*', '', 0),
            (r'/\*.*?\*/', '', re.DOTALL),
        ]
        comment_patterns = {
            CodeProcessor.PYTHON: [
                hash_line,
                (r'\"\"\"(.*?)\"\"\"', '', re.DOTALL),
                (r"\'\'\'(.*?)\'\'\'", '', re.DOTALL),
            ],
            CodeProcessor.JS: c_style,
            CodeProcessor.C: c_style,
            CodeProcessor.CPP: c_style,
            CodeProcessor.H: c_style,
            CodeProcessor.BASH: [hash_line],
            CodeProcessor.SHELL: [hash_line],
        }
        for pattern, repl, flags in comment_patterns.get(file_type, []):
            content = re.sub(pattern, repl, content, flags=flags)
        return content.strip()

    @staticmethod
    def compress(content):
        """
        Compress text with zlib.

        Args:
            content: Text to compress; it is encoded with str.encode()'s
                default encoding first.

        Returns:
            The zlib-compressed bytes.
        """
        return zlib.compress(content.encode())

193
directory_handler.py Normal file
View File

@@ -0,0 +1,193 @@
import os
import fnmatch
from code_processor import CodeProcessor
class DirectoryHandler:
    """Static helpers that walk paths, decide which files to show and print them."""

    # Byte values treated as text by is_binary_file: a few common control
    # characters plus printable ASCII. A frozenset gives constant-time
    # membership tests for every sampled byte.
    _TEXT_BYTES = frozenset({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x7F)))

    @staticmethod
    def load_gitignore_patterns(root_path):
        """
        Recursively scans for .gitignore files in the given root_path.

        Returns a list of tuples (base_dir, patterns) where:
        - base_dir: the directory in which the .gitignore was found.
        - patterns: a list of pattern strings from that .gitignore, with
          empty lines and comment lines removed.

        A .gitignore that cannot be read is reported on stdout and skipped.
        """
        gitignore_data = []
        for dirpath, _, filenames in os.walk(root_path):
            if '.gitignore' not in filenames:
                continue
            gitignore_path = os.path.join(dirpath, '.gitignore')
            try:
                with open(gitignore_path, 'r') as f:
                    stripped_lines = [line.strip() for line in f]
            except (OSError, ValueError) as e:
                # ValueError covers decoding errors (UnicodeDecodeError).
                print(f"Error reading {gitignore_path}: {e}")
                continue
            patterns = [
                line for line in stripped_lines
                if line and not line.startswith('#')
            ]
            gitignore_data.append((dirpath, patterns))
        return gitignore_data

    @staticmethod
    def is_binary_file(file_path):
        """
        Reads the first 1024 bytes of file_path and heuristically determines
        if the file appears to be binary.

        Returns True if a null byte is found or if more than 30% of the
        sampled bytes are outside the text set (selected control characters
        and printable ASCII). Empty and unreadable files are reported as
        not binary.
        """
        try:
            with open(file_path, 'rb') as f:
                chunk = f.read(1024)
        except (OSError, ValueError):
            # If the file cannot be read, assume it's not binary.
            return False
        if not chunk:
            return False
        # A null byte almost certainly means binary data.
        if b'\x00' in chunk:
            return True
        # NOTE(review): bytes >= 0x80 count as non-text, so text made up
        # mostly of multi-byte UTF-8 characters can exceed the threshold.
        text_bytes = DirectoryHandler._TEXT_BYTES
        non_text = sum(byte not in text_bytes for byte in chunk)
        return (non_text / len(chunk)) > 0.30

    @staticmethod
    def _matches_gitignore_pattern(rel_path, parts, pattern):
        """Return True if one .gitignore pattern matches the relative path."""
        if pattern.endswith('/'):
            # Directory pattern: only folder components (not the file name)
            # are candidates.
            return any(fnmatch.fnmatch(part + '/', pattern) for part in parts[:-1])
        if fnmatch.fnmatch(rel_path, pattern):
            return True
        if '/' not in pattern:
            # A pattern without a slash applies at any depth, and a matching
            # directory ignores everything below it.
            return any(fnmatch.fnmatch(part, pattern) for part in parts)
        return False

    @staticmethod
    def is_gitignored(file_path, gitignore_data):
        """
        Checks if file_path should be ignored according to the .gitignore entries.

        For each tuple (base_dir, patterns), if file_path is under base_dir,
        the path relative to base_dir is matched against the patterns with
        fnmatch. Patterns without a slash are also matched against every
        single path component, so e.g. "__pycache__" or "*.log" apply at any
        depth. Negated ("!...") and root-anchored ("/...") patterns get no
        special handling.

        Returns True as soon as any pattern matches, otherwise False.
        """
        for base_dir, patterns in gitignore_data:
            try:
                rel_path = os.path.relpath(file_path, base_dir)
            except ValueError:
                # file_path and base_dir are on different drives.
                continue
            parts = rel_path.split(os.sep)
            # Skip files outside base_dir. Comparing the first component
            # (rather than a string prefix) keeps files whose own name
            # starts with ".." eligible.
            if parts[0] == os.pardir:
                continue
            for pattern in patterns:
                if DirectoryHandler._matches_gitignore_pattern(rel_path, parts, pattern):
                    return True
        return False

    @staticmethod
    def filter_directories(dirs, ignore_file_strings, ignore_hidden):
        """
        Filter out directories based on ignore_file_strings and hidden status.

        The list is modified in place (slice assignment) so that os.walk,
        which hands out this list, does not descend into removed entries.
        """
        if ignore_hidden:
            dirs[:] = [d for d in dirs if not d.startswith('.')]
        dirs[:] = [d for d in dirs if not any(ig in d for ig in ignore_file_strings)]

    @staticmethod
    def path_or_content_contains(file_path, path_contains, content_contains):
        """
        Check if the file path contains specific strings or if the file content does.

        Returns True when any string of path_contains occurs in file_path or
        any string of content_contains occurs in the file's text. A file that
        cannot be decoded as text counts as not matching.
        """
        if path_contains and any(whitelist_str in file_path for whitelist_str in path_contains):
            return True
        if content_contains:
            try:
                with open(file_path, 'r') as f:
                    content = f.read()
            except UnicodeDecodeError:
                return False
            if any(whitelist_str in content for whitelist_str in content_contains):
                return True
        return False

    @staticmethod
    def should_print_file(file_path, file_types, ignore_file_strings, ignore_hidden, path_contains, content_contains, scan_binary_files=False):
        """
        Determines if a file should be printed based on various criteria.

        Checks are applied in this order: binary heuristic (skipped when
        scan_binary_files is True), hidden file name, allowed extensions,
        ignore strings in the path, then the path/content whitelist.
        """
        if not scan_binary_files and DirectoryHandler.is_binary_file(file_path):
            return False
        if ignore_hidden and os.path.basename(file_path).startswith('.'):
            return False
        if file_types and not any(file_path.endswith(ft) for ft in file_types):
            return False
        if any(ignore_str in file_path for ignore_str in ignore_file_strings):
            return False
        if path_contains or content_contains:
            return DirectoryHandler.path_or_content_contains(file_path, path_contains, content_contains)
        return True

    @staticmethod
    def print_file_content(file_path, no_comments, compress):
        """
        Prints the content of a file, optionally removing comments or compressing the output.

        The content is wrapped in "<< START: path >>" / "<< END >>" markers.

        Raises:
            SystemExit: with code 1 when the file cannot be decoded as text.
        """
        try:
            with open(file_path, 'r') as f:
                content = f.read()
            if no_comments:
                file_type = os.path.splitext(file_path)[1]
                content = CodeProcessor.remove_comments(content, file_type)
            print(f"<< START: {file_path} >>")
            if compress:
                compressed_content = CodeProcessor.compress(content)
                print("COMPRESSED CODE:")
                print(compressed_content)
            else:
                print(content)
            print("<< END >>\n")
        except UnicodeDecodeError:
            print(f"Warning: Could not read file due to encoding issues: {file_path}")
            # The bare exit() helper is injected by the site module and is
            # not guaranteed to exist; raising SystemExit always works.
            raise SystemExit(1)

    @staticmethod
    def handle_directory(directory, **kwargs):
        """
        Scans the directory and processes each file while respecting .gitignore rules.

        Required kwargs: ignore_file_strings, ignore_hidden, file_types,
        path_contains, content_contains, no_comments, compress.
        Optional kwargs: no_gitignore, verbose, scan_binary_files.
        """
        gitignore_data = []
        if not kwargs.get('no_gitignore'):
            gitignore_data = DirectoryHandler.load_gitignore_patterns(directory)

        for root, dirs, files in os.walk(directory):
            # Prune in place so os.walk skips filtered directories.
            DirectoryHandler.filter_directories(dirs, kwargs['ignore_file_strings'], kwargs['ignore_hidden'])
            for file in files:
                file_path = os.path.join(root, file)
                if gitignore_data and DirectoryHandler.is_gitignored(file_path, gitignore_data):
                    if kwargs.get('verbose'):
                        print(f"Skipped (gitignored): {file_path}")
                    continue

                if DirectoryHandler.should_print_file(
                    file_path,
                    kwargs['file_types'],
                    kwargs['ignore_file_strings'],
                    kwargs['ignore_hidden'],
                    kwargs['path_contains'],
                    kwargs['content_contains'],
                    scan_binary_files=kwargs.get('scan_binary_files', False)
                ):
                    DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress'])
                elif kwargs.get('verbose'):
                    print(f"Skipped file: {file_path}")

    @staticmethod
    def handle_file(file_path, **kwargs):
        """
        Processes an individual file.

        Only the no_comments and compress kwargs are used; any other keyword
        arguments are accepted and ignored.
        """
        DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress'])

49
main.py Executable file
View File

@@ -0,0 +1,49 @@
#!/usr/bin/env python3
import os
import sys
from cli import parse_arguments
from directory_handler import DirectoryHandler
def main():
    """Parse the command line and process every requested path in order."""
    args = parse_arguments()

    # Options that control how a selected file is rendered.
    render_options = dict(
        no_comments=args.no_comments,
        compress=args.compress,
    )

    for path in args.paths:
        if os.path.isdir(path):
            DirectoryHandler.handle_directory(
                path,
                file_types=args.file_types,
                ignore_file_strings=args.ignore_file_strings,
                ignore_hidden=args.ignore_hidden,
                verbose=args.verbose,
                path_contains=args.path_contains,
                content_contains=args.content_contains,
                no_gitignore=args.no_gitignore,
                scan_binary_files=args.scan_binary_files,
                **render_options
            )
            continue

        if not os.path.isfile(path):
            # Neither a directory nor a regular file: abort the whole run.
            print(f"Error: {path} is neither a valid file nor a directory.")
            sys.exit(1)

        # A single file passes through the same filters as files found
        # during a directory walk.
        wanted = DirectoryHandler.should_print_file(
            path,
            file_types=args.file_types,
            ignore_file_strings=args.ignore_file_strings,
            ignore_hidden=args.ignore_hidden,
            path_contains=args.path_contains,
            content_contains=args.content_contains,
            scan_binary_files=args.scan_binary_files
        )
        if wanted:
            DirectoryHandler.handle_file(
                path,
                file_types=args.file_types,
                ignore_file_strings=args.ignore_file_strings,
                ignore_hidden=args.ignore_hidden,
                scan_binary_files=args.scan_binary_files,
                **render_options
            )


if __name__ == "__main__":
    main()

185
scan.py
View File

@@ -1,185 +0,0 @@
#!/usr/bin/env python3
import os
import argparse
import re
import zlib
class CodeProcessor:
    """Regex-based comment stripping and zlib compression for source text."""

    # File extensions (including the leading dot) that have stripping rules.
    PYTHON = ".py"
    JS = ".js"
    C = ".c"
    CPP = ".cpp"
    H = ".h"
    BASH = ".sh"
    SHELL = ".bash"

    @staticmethod
    def remove_comments(content, file_type):
        """
        Remove comments from ``content`` based on its file extension.

        The rules are plain regular expressions, not a parser: comment
        markers that appear inside string literals are stripped as well.

        Args:
            content: Source text to clean.
            file_type: File extension including the dot (e.g. ".py").
                Extensions without rules leave the text untouched apart
                from the final strip.

        Returns:
            The text with comments removed and surrounding whitespace
            stripped.
        """
        # "#" to end of line, together with the whitespace in front of it.
        hash_line = (r'\s*#.*', '', 0)
        # "//" line comments and "/* ... */" block comments. DOTALL lets the
        # block pattern span line breaks; without it multi-line block
        # comments are never matched.
        c_style = [
            (r'\s*//.*', '', 0),
            (r'/\*.*?\*/', '', re.DOTALL),
        ]
        comment_patterns = {
            CodeProcessor.PYTHON: [
                hash_line,
                (r'\"\"\"(.*?)\"\"\"', '', re.DOTALL),
                (r"\'\'\'(.*?)\'\'\'", '', re.DOTALL),
            ],
            CodeProcessor.JS: c_style,
            CodeProcessor.C: c_style,
            CodeProcessor.CPP: c_style,
            CodeProcessor.H: c_style,
            CodeProcessor.BASH: [hash_line],
            CodeProcessor.SHELL: [hash_line],
        }
        for pattern, repl, flags in comment_patterns.get(file_type, []):
            content = re.sub(pattern, repl, content, flags=flags)
        return content.strip()

    @staticmethod
    def compress(content):
        """
        Compress text with zlib.

        Args:
            content: Text to compress; it is encoded with str.encode()'s
                default encoding first.

        Returns:
            The zlib-compressed bytes.
        """
        return zlib.compress(content.encode())
class DirectoryHandler:
    """Static helpers that walk paths, decide which files to show and print them."""

    @staticmethod
    def filter_directories(dirs, ignore_file_strings, ignore_hidden):
        """
        Filter out directories based on ignore criteria.

        The list is modified in place (slice assignment) so that os.walk,
        which hands out this list, does not descend into removed entries.
        """
        if ignore_hidden:
            dirs[:] = [d for d in dirs if not d.startswith('.')]
        dirs[:] = [d for d in dirs if not any(ig in d for ig in ignore_file_strings)]

    @staticmethod
    def path_or_content_contains(file_path, path_contains, content_contains):
        """
        Check whether the path or the file content contains a whitelisted string.

        Returns True when any string of path_contains occurs in file_path or
        any string of content_contains occurs in the file's text. A file that
        cannot be decoded as text counts as not matching.
        """
        if path_contains and any(whitelist_str in file_path for whitelist_str in path_contains):
            return True
        if content_contains:
            try:
                with open(file_path, 'r') as f:
                    content = f.read()
            except UnicodeDecodeError:
                # The file can't be read as text, so it cannot match.
                return False
            if any(whitelist_str in content for whitelist_str in content_contains):
                return True
        return False

    @staticmethod
    def should_print_file(file_path, file_types, ignore_file_strings, ignore_hidden, path_contains, content_contains):
        """
        Determine if a file should be printed based on various criteria.

        Args:
            file_path (str): The path of the file to be checked.
            file_types (list): List of allowed file extensions.
            ignore_file_strings (list): List of strings; if any are found in the file path, the file is ignored.
            ignore_hidden (bool): If True, hidden files (starting with '.') are ignored.
            path_contains (list): List of strings; the file is processed only if its path contains one of these strings.
            content_contains (list): List of strings; the file is processed only if its content contains one of these strings.

        Returns:
            bool: True if the file should be printed, False otherwise.
        """
        if ignore_hidden and os.path.basename(file_path).startswith('.'):
            return False
        if file_types and not any(file_path.endswith(file_type) for file_type in file_types):
            return False
        if any(ignore_str in file_path for ignore_str in ignore_file_strings):
            return False
        if path_contains or content_contains:
            return DirectoryHandler.path_or_content_contains(file_path, path_contains, content_contains)
        return True

    @staticmethod
    def print_file_content(file_path, no_comments, compress):
        """
        Print the content of a file, optionally without comments or compressed.

        The content is wrapped in "<< START: path >>" / "<< END >>" markers.

        Raises:
            SystemExit: with code 1 when the file cannot be decoded as text.
        """
        try:
            with open(file_path, 'r') as f:
                content = f.read()
            if no_comments:
                file_type = os.path.splitext(file_path)[1]
                content = CodeProcessor.remove_comments(content, file_type)
            print(f"<< START: {file_path} >>")
            if compress:
                compressed_content = CodeProcessor.compress(content)
                print("COMPRESSED CODE: ")
                print(compressed_content)
            else:
                print(content)
            print("<< END >>\n")
        except UnicodeDecodeError:
            print(f"Warning: Could not read file due to encoding issues: {file_path}")
            # The bare exit() helper is injected by the site module and is
            # not guaranteed to exist; raising SystemExit always works.
            raise SystemExit(1)

    @staticmethod
    def handle_directory(directory, **kwargs):
        """
        Handle scanning and printing for directories.

        Required kwargs: ignore_file_strings, ignore_hidden, file_types,
        path_contains, content_contains, no_comments, compress, verbose.
        """
        for root, dirs, files in os.walk(directory):
            # Prune in place so os.walk skips filtered directories.
            DirectoryHandler.filter_directories(dirs, kwargs['ignore_file_strings'], kwargs['ignore_hidden'])
            for file in files:
                file_path = os.path.join(root, file)
                if DirectoryHandler.should_print_file(
                    file_path,
                    kwargs['file_types'],
                    kwargs['ignore_file_strings'],
                    kwargs['ignore_hidden'],
                    kwargs['path_contains'],
                    kwargs['content_contains']
                ):
                    DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress'])
                elif kwargs['verbose']:
                    print(f"Skipped file: {file}")

    @staticmethod
    def handle_file(file_path, **kwargs):
        """
        Handle scanning and printing for individual files.

        Only the no_comments and compress kwargs are used; any other keyword
        arguments are accepted and ignored.
        """
        DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress'])
def main(argv=None):
    """
    Parse the command line and print the content of every requested path.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse falls back to sys.argv[1:].

    Raises:
        SystemExit: with code 1 when a path is neither a file nor a directory.
    """
    parser = argparse.ArgumentParser(description="Scan directories and print/compile file contents.")
    parser.add_argument("paths", nargs='+', help="List of files or directories to scan.")
    parser.add_argument("--file-types", nargs='+', default=[], help="Filter by file types (e.g., .txt .log).")
    parser.add_argument("--ignore-file-strings", nargs='+', default=[], help="Ignore files and folders containing these strings.")
    parser.add_argument("--ignore-hidden", action='store_true', help="Ignore hidden directories and files.")
    parser.add_argument("-v", "--verbose", action='store_true', help="Enable verbose mode.")
    parser.add_argument("--no-comments", action='store_true', help="Remove comments from the displayed content based on file type.")
    parser.add_argument("--compress", action='store_true', help="Compress code (for Python files).")
    parser.add_argument("--path-contains", nargs='+', default=[], help="Display files whose paths contain one of these strings.")
    parser.add_argument("--content-contains", nargs='+', default=[], help="Display files containing one of these strings in their content.")
    args = parser.parse_args(argv)

    for path in args.paths:
        if os.path.isdir(path):
            DirectoryHandler.handle_directory(
                path,
                file_types=args.file_types,
                ignore_file_strings=args.ignore_file_strings,
                ignore_hidden=args.ignore_hidden,
                verbose=args.verbose,
                no_comments=args.no_comments,
                compress=args.compress,
                path_contains=args.path_contains,
                content_contains=args.content_contains
            )
        elif os.path.isfile(path):
            # A single file passes through the same filters as files found
            # during a directory walk.
            if DirectoryHandler.should_print_file(
                path,
                file_types=args.file_types,
                ignore_file_strings=args.ignore_file_strings,
                ignore_hidden=args.ignore_hidden,
                path_contains=args.path_contains,
                content_contains=args.content_contains
            ):
                DirectoryHandler.handle_file(
                    path,
                    file_types=args.file_types,
                    ignore_file_strings=args.ignore_file_strings,
                    ignore_hidden=args.ignore_hidden,
                    no_comments=args.no_comments,
                    compress=args.compress
                )
        else:
            print(f"Error: {path} is neither a valid file nor a directory.")
            # The bare exit() helper is injected by the site module and is
            # not guaranteed to exist; raising SystemExit always works.
            raise SystemExit(1)


if __name__ == "__main__":
    main()