Compare commits

..

1 Commits

Author SHA1 Message Date
74651bb880 renamed main.py 2025-03-06 10:13:19 +01:00
12 changed files with 173 additions and 803 deletions

7
.github/FUNDING.yml vendored
View File

@@ -1,7 +0,0 @@
github: kevinveenbirkenbach
patreon: kevinveenbirkenbach
buy_me_a_coffee: kevinveenbirkenbach
custom: https://s.veen.world/paypaldonate

1
.gitignore vendored
View File

@@ -1 +0,0 @@
*__pycache__*

View File

@@ -1,17 +0,0 @@
# Makefile for ARC
.PHONY: test install help

# Default target: list the available make targets.
help:
	@echo "Targets:"
	@echo " make test - Run unit tests"
	@echo " make install - Show how to install via Kevin's Package Manager"

# Run the unit test suite via stdlib unittest discovery (tests/ directory).
test:
	@python -m unittest discover -s tests -p "test_*.py" -t .

# Installation is delegated to the external package manager; this target
# only prints the instructions and performs no actions itself.
install:
	@echo "ARC is distributed via Kevin's Package Manager."
	@echo "Install it with:"
	@echo " package-manager install arc"
	@echo ""
	@echo "(This 'make install' does not perform any other actions.)"

View File

@@ -1,6 +1,4 @@
# 🤖👩‍🔬 Analysis-Ready Code (ARC)
[![GitHub Sponsors](https://img.shields.io/badge/Sponsor-GitHub%20Sponsors-blue?logo=github)](https://github.com/sponsors/kevinveenbirkenbach) [![Patreon](https://img.shields.io/badge/Support-Patreon-orange?logo=patreon)](https://www.patreon.com/c/kevinveenbirkenbach) [![Buy Me a Coffee](https://img.shields.io/badge/Buy%20me%20a%20Coffee-Funding-yellow?logo=buymeacoffee)](https://buymeacoffee.com/kevinveenbirkenbach) [![PayPal](https://img.shields.io/badge/Donate-PayPal-blue?logo=paypal)](https://s.veen.world/paypaldonate)
Analysis-Ready Code (ARC) is a Python-based utility designed to recursively scan directories and transform source code into a format optimized for AI and computer analysis. By stripping comments, filtering specific file types, and optionally compressing content, ARC ensures that your code is clean and ready for automated processing.

View File

71
cli.py
View File

@@ -1,71 +0,0 @@
import argparse
def parse_arguments():
    """Build the ARC command-line parser and return the parsed namespace.

    The returned namespace additionally carries ``ignore_hidden`` — the
    inverse of ``show_hidden`` — because downstream code filters on it.
    """
    arg_parser = argparse.ArgumentParser(
        description="Scan directories and print/compile file contents."
    )
    arg_parser.add_argument("paths", nargs='+',
                            help="List of files or directories to scan.")
    arg_parser.add_argument("-t", "--file-types", nargs='+', default=[],
                            help="Filter by file types (e.g., .txt, .log).")
    arg_parser.add_argument("-x", "--ignore-file-strings", nargs='+', default=[],
                            help="Ignore files and folders containing these strings.")
    arg_parser.add_argument("-S", "--show-hidden", action='store_true',
                            dest='show_hidden', default=False,
                            help="Include hidden directories and files in the scan.")
    arg_parser.add_argument("-v", "--verbose", action='store_true',
                            help="Enable verbose mode.")
    arg_parser.add_argument("-N", "--no-comments", action='store_true',
                            help="Remove comments from the displayed content based on file type.")
    arg_parser.add_argument("-z", "--compress", action='store_true',
                            help="Compress code (for supported file types).")
    arg_parser.add_argument("-p", "--path-contains", nargs='+', default=[],
                            help="Display files whose paths contain one of these strings.")
    arg_parser.add_argument("-C", "--content-contains", nargs='+', default=[],
                            help="Display files containing one of these strings in their content.")
    arg_parser.add_argument("-G", "--no-gitignore", action='store_true',
                            help="Do not respect .gitignore files during scan.")
    arg_parser.add_argument("-b", "--scan-binary-files", action='store_true',
                            help="Scan binary files as well (by default these are ignored).")
    parsed = arg_parser.parse_args()
    # Convert show_hidden to ignore_hidden for downstream use.
    parsed.ignore_hidden = not parsed.show_hidden
    return parsed

View File

@@ -1,285 +0,0 @@
import re
import zlib
from dataclasses import dataclass
from typing import Dict, Tuple, Pattern, Optional
import io
import tokenize
@dataclass(frozen=True)
class LanguageSpec:
    """Holds compiled comment patterns for a language."""
    # Pre-compiled regexes; every match is deleted from the source text.
    patterns: Tuple[Pattern, ...]
class CodeProcessor:
    """
    Utilities to strip comments and (de)compress code.
    - Python: tokenize-based (safe) with precise docstring removal.
    - C/CPP/JS: state-machine comment stripper that respects string/char literals.
    - Shell/YAML: remove full-line hash comments only.
    - Jinja: remove {# ... #} blocks.
    """
    # File extensions (normalized to lowercase)
    EXT_TO_LANG: Dict[str, str] = {
        ".py": "python",
        ".js": "cstyle",
        ".c": "cstyle",
        ".cpp": "cstyle",
        ".h": "cstyle",
        ".sh": "hash",
        ".bash": "hash",
        ".yml": "hash",
        ".yaml": "hash",
        ".j2": "jinja",
        ".jinja": "jinja",
        ".jinja2": "jinja",
        ".tpl": "jinja",
    }

    # Regex-based specs for hash and jinja
    _HASH = LanguageSpec(patterns=(
        re.compile(r"^\s*#.*$", flags=re.MULTILINE),  # only full-line comments
    ))
    _JINJA = LanguageSpec(patterns=(
        re.compile(r"\{#.*?#\}", flags=re.DOTALL),  # {# ... #} across lines
    ))

    LANG_SPECS: Dict[str, LanguageSpec] = {
        "hash": _HASH,
        "jinja": _JINJA,
        # "cstyle" handled by a state machine, not regex
        # "python" handled by tokenize, not regex
    }

    @classmethod
    def _lang_from_ext(cls, file_type: str) -> Optional[str]:
        """Map an extension like '.py' to an internal language key."""
        ext = file_type.lower().strip()
        return cls.EXT_TO_LANG.get(ext)

    # -----------------------------
    # Python stripping via tokenize
    # -----------------------------
    @staticmethod
    def _strip_python_comments_tokenize(content: str) -> str:
        """
        Remove comments and docstrings safely using tokenize.
        Rules:
        - Drop COMMENT tokens.
        - Drop module docstring only if it's the very first statement at col 0.
        - Drop the first STRING statement in a suite immediately after 'def'/'class'
          header (':' NEWLINE INDENT).
        """
        tokens = tokenize.generate_tokens(io.StringIO(content).readline)
        out_tokens = []
        indent_level = 0
        module_docstring_candidate = True  # until we see first real stmt at module level
        expect_suite_docstring = False  # just entered a suite after def/class
        last_was_colon = False
        seen_nontrivial_in_line = False  # guards module docstring (start of logical line)
        for tok_type, tok_str, start, end, line in tokens:
            # Track indentation
            if tok_type == tokenize.INDENT:
                indent_level += 1
            elif tok_type == tokenize.DEDENT:
                indent_level = max(0, indent_level - 1)
            # New logical line: reset guard
            if tok_type in (tokenize.NEWLINE, tokenize.NL):
                seen_nontrivial_in_line = False
                out_tokens.append((tok_type, tok_str))
                continue
            # Comments are dropped
            if tok_type == tokenize.COMMENT:
                continue
            # Detect ':' ending a def/class header
            if tok_type == tokenize.OP and tok_str == ":":
                last_was_colon = True
                out_tokens.append((tok_type, tok_str))
                continue
            # After ':' + NEWLINE + INDENT comes a suite start -> allow docstring removal
            if tok_type == tokenize.INDENT and last_was_colon:
                expect_suite_docstring = True
                last_was_colon = False
                out_tokens.append((tok_type, tok_str))
                continue
            # Any non-INDENT token clears the last_was_colon flag
            if tok_type != tokenize.NL:
                last_was_colon = False
            # STRING handling
            if tok_type == tokenize.STRING:
                at_line_start = (start[1] == 0) and not seen_nontrivial_in_line
                if indent_level == 0:
                    # Potential module docstring only if first statement at col 0
                    if module_docstring_candidate and at_line_start:
                        module_docstring_candidate = False
                        # drop it
                        continue
                    # Any other top-level string is normal
                    module_docstring_candidate = False
                    out_tokens.append((tok_type, tok_str))
                    seen_nontrivial_in_line = True
                    continue
                else:
                    # In a suite: if it's the first statement after def/class, drop regardless of column
                    if expect_suite_docstring:
                        expect_suite_docstring = False
                        # drop it
                        continue
                    expect_suite_docstring = False
                    out_tokens.append((tok_type, tok_str))
                    seen_nontrivial_in_line = True
                    continue
            # Any other significant token disables module-docstring candidacy
            if tok_type not in (tokenize.INDENT, tokenize.DEDENT):
                if indent_level == 0:
                    module_docstring_candidate = False
                # Mark we've seen something on this line
                if tok_type not in (tokenize.NL, tokenize.NEWLINE):
                    seen_nontrivial_in_line = True
            out_tokens.append((tok_type, tok_str))
        return tokenize.untokenize(out_tokens)

    # ---------------------------------
    # C-style stripping via state machine
    # ---------------------------------
    @staticmethod
    def _strip_cstyle_comments(content: str) -> str:
        """
        Remove // line comments and /* ... */ block comments while preserving
        string ("...") and char ('...') literals and their escape sequences.
        """
        i = 0
        n = len(content)
        out = []
        # Mutually exclusive scanner states; escape is only meaningful
        # inside a string/char literal.
        in_line_comment = False
        in_block_comment = False
        in_string = False
        in_char = False
        escape = False
        while i < n:
            c = content[i]
            nxt = content[i + 1] if i + 1 < n else ""
            # If inside line comment: consume until newline
            if in_line_comment:
                if c == "\n":
                    in_line_comment = False
                    out.append(c)
                i += 1
                continue
            # If inside block comment: consume until '*/'
            if in_block_comment:
                if c == "*" and nxt == "/":
                    in_block_comment = False
                    i += 2
                else:
                    i += 1
                continue
            # If inside string literal
            if in_string:
                out.append(c)
                if escape:
                    escape = False
                else:
                    if c == "\\":
                        escape = True
                    elif c == '"':
                        in_string = False
                i += 1
                continue
            # If inside char literal
            if in_char:
                out.append(c)
                if escape:
                    escape = False
                else:
                    if c == "\\":
                        escape = True
                    elif c == "'":
                        in_char = False
                i += 1
                continue
            # Not in any special state:
            # Check for start of comments
            if c == "/" and nxt == "/":
                in_line_comment = True
                i += 2
                continue
            if c == "/" and nxt == "*":
                in_block_comment = True
                i += 2
                continue
            # Check for start of string/char literals
            if c == '"':
                in_string = True
                out.append(c)
                i += 1
                continue
            if c == "'":
                in_char = True
                out.append(c)
                i += 1
                continue
            # Normal character
            out.append(c)
            i += 1
        return "".join(out)

    # -------------------
    # Public API
    # -------------------
    @classmethod
    def remove_comments(cls, content: str, file_type: str) -> str:
        """
        Remove comments based on file type/extension.
        - Python: tokenize-based
        - C/CPP/JS: state-machine
        - Hash (sh/yaml): regex full-line
        - Jinja: regex {# ... #}
        """
        lang = cls._lang_from_ext(file_type)
        if lang is None:
            # Unknown extension: return the content unchanged (whitespace-stripped).
            return content.strip()
        if lang == "python":
            return cls._strip_python_comments_tokenize(content).strip()
        if lang == "cstyle":
            return cls._strip_cstyle_comments(content).strip()
        spec = cls.LANG_SPECS.get(lang)
        if not spec:
            return content.strip()
        cleaned = content
        for pat in spec.patterns:
            cleaned = pat.sub("", cleaned)
        return cleaned.strip()

    @staticmethod
    def compress(content: str, level: int = 9) -> bytes:
        """Compress code using zlib. Returns bytes."""
        return zlib.compress(content.encode("utf-8"), level)

    @staticmethod
    def decompress(blob: bytes) -> str:
        """Decompress zlib-compressed code back to text."""
        return zlib.decompress(blob).decode("utf-8")

View File

@@ -1,193 +0,0 @@
import os
import fnmatch
from code_processor import CodeProcessor
class DirectoryHandler:
    """Directory/file scanning helpers: .gitignore handling, binary detection,
    filtering, and printing of file contents."""

    @staticmethod
    def load_gitignore_patterns(root_path):
        """
        Recursively scans for .gitignore files in the given root_path.
        Returns a list of tuples (base_dir, patterns) where:
        - base_dir: the directory in which the .gitignore was found.
        - patterns: a list of pattern strings from that .gitignore.
        """
        gitignore_data = []
        for dirpath, _, filenames in os.walk(root_path):
            if '.gitignore' in filenames:
                gitignore_path = os.path.join(dirpath, '.gitignore')
                try:
                    with open(gitignore_path, 'r') as f:
                        lines = f.readlines()
                    # Filter out empty lines and comments.
                    patterns = [line.strip() for line in lines if line.strip() and not line.strip().startswith('#')]
                    # Save the base directory and its patterns.
                    gitignore_data.append((dirpath, patterns))
                except Exception as e:
                    print(f"Error reading {gitignore_path}: {e}")
        return gitignore_data

    @staticmethod
    def is_binary_file(file_path):
        """
        Reads the first 1024 bytes of file_path and heuristically determines
        if the file appears to be binary. This method returns True if a null byte
        is found or if more than 30% of the bytes in the sample are non-text.
        """
        try:
            with open(file_path, 'rb') as f:
                chunk = f.read(1024)
            # If there's a null byte, it's almost certainly binary.
            if b'\x00' in chunk:
                return True
            # Define a set of text characters (ASCII printable + common control characters)
            text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x7F)))
            # Count non-text characters in the chunk.
            non_text = sum(byte not in text_chars for byte in chunk)
            if len(chunk) > 0 and (non_text / len(chunk)) > 0.30:
                return True
        except Exception:
            # If the file cannot be read in binary mode, assume it's not binary.
            return False
        return False

    @staticmethod
    def is_gitignored(file_path, gitignore_data):
        """
        Checks if file_path should be ignored according to the .gitignore entries.
        For each tuple (base_dir, patterns), if file_path is under base_dir,
        computes the relative path and matches it against the patterns.
        """
        for base_dir, patterns in gitignore_data:
            try:
                rel_path = os.path.relpath(file_path, base_dir)
            except ValueError:
                # file_path and base_dir are on different drives.
                continue
            # If the file is not under the current .gitignore base_dir, skip it.
            if rel_path.startswith('..'):
                continue
            # Check all patterns.
            for pattern in patterns:
                if pattern.endswith('/'):
                    # Directory pattern: check if any folder in the relative path matches.
                    parts = rel_path.split(os.sep)
                    for part in parts[:-1]:
                        if fnmatch.fnmatch(part + '/', pattern):
                            return True
                else:
                    # Check if the relative path matches the pattern.
                    if fnmatch.fnmatch(rel_path, pattern):
                        return True
        return False

    @staticmethod
    def filter_directories(dirs, ignore_file_strings, ignore_hidden):
        """
        Filter out directories based on ignore_file_strings and hidden status.
        Mutates *dirs* in place so os.walk prunes the traversal.
        """
        if ignore_hidden:
            dirs[:] = [d for d in dirs if not d.startswith('.')]
        dirs[:] = [d for d in dirs if not any(ig in d for ig in ignore_file_strings)]

    @staticmethod
    def path_or_content_contains(file_path, path_contains, content_contains):
        """
        Check if the file path contains specific strings or if the file content does.
        """
        if path_contains and any(whitelist_str in file_path for whitelist_str in path_contains):
            return True
        if content_contains:
            try:
                with open(file_path, 'r') as f:
                    content = f.read()
                if any(whitelist_str in content for whitelist_str in content_contains):
                    return True
            except UnicodeDecodeError:
                # Not readable as text -> cannot match on content.
                return False
        return False

    @staticmethod
    def should_print_file(file_path, file_types, ignore_file_strings, ignore_hidden, path_contains, content_contains, scan_binary_files=False):
        """
        Determines if a file should be printed based on various criteria.
        By default, binary files are skipped unless scan_binary_files is True.
        """
        # Check binary file status using our heuristic.
        if not scan_binary_files and DirectoryHandler.is_binary_file(file_path):
            return False
        if ignore_hidden and os.path.basename(file_path).startswith('.'):
            return False
        if file_types and not any(file_path.endswith(ft) for ft in file_types):
            return False
        if any(ignore_str in file_path for ignore_str in ignore_file_strings):
            return False
        if path_contains or content_contains:
            return DirectoryHandler.path_or_content_contains(file_path, path_contains, content_contains)
        return True

    @staticmethod
    def print_file_content(file_path, no_comments, compress):
        """
        Prints the content of a file, optionally removing comments or compressing the output.
        """
        try:
            with open(file_path, 'r') as f:
                content = f.read()
            if no_comments:
                file_type = os.path.splitext(file_path)[1]
                content = CodeProcessor.remove_comments(content, file_type)
            print(f"<< START: {file_path} >>")
            if compress:
                compressed_content = CodeProcessor.compress(content)
                print("COMPRESSED CODE:")
                print(compressed_content)
            else:
                print(content)
            print("<< END >>\n")
        except UnicodeDecodeError:
            print(f"Warning: Could not read file due to encoding issues: {file_path}")
            # NOTE(review): this aborts the entire scan on one undecodable file,
            # despite the message being phrased as a warning — confirm intended.
            exit(1)

    @staticmethod
    def handle_directory(directory, **kwargs):
        """
        Scans the directory and processes each file while respecting .gitignore rules.
        """
        gitignore_data = []
        if not kwargs.get('no_gitignore'):
            gitignore_data = DirectoryHandler.load_gitignore_patterns(directory)
        for root, dirs, files in os.walk(directory):
            DirectoryHandler.filter_directories(dirs, kwargs['ignore_file_strings'], kwargs['ignore_hidden'])
            for file in files:
                file_path = os.path.join(root, file)
                if gitignore_data and DirectoryHandler.is_gitignored(file_path, gitignore_data):
                    if kwargs.get('verbose'):
                        print(f"Skipped (gitignored): {file_path}")
                    continue
                if DirectoryHandler.should_print_file(
                    file_path,
                    kwargs['file_types'],
                    kwargs['ignore_file_strings'],
                    kwargs['ignore_hidden'],
                    kwargs['path_contains'],
                    kwargs['content_contains'],
                    scan_binary_files=kwargs.get('scan_binary_files', False)
                ):
                    DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress'])
                elif kwargs.get('verbose'):
                    print(f"Skipped file: {file_path}")

    @staticmethod
    def handle_file(file_path, **kwargs):
        """
        Processes an individual file.
        """
        DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress'])

210
main.py
View File

@@ -1,49 +1,185 @@
#!/usr/bin/env python3
import os
import sys
from cli import parse_arguments
from directory_handler import DirectoryHandler
import argparse
import re
import zlib
class CodeProcessor:
    """Strip comments from source text and compress it with zlib.

    Comment removal is regex-based and keyed on the file extension;
    unknown extensions are returned unchanged (outer whitespace stripped).
    """

    # Supported file extensions.
    PYTHON = ".py"
    JS = ".js"
    C = ".c"
    CPP = ".cpp"
    H = ".h"
    BASH = ".sh"
    SHELL = ".bash"

    @staticmethod
    def remove_comments(content, file_type):
        """Remove comments from *content* based on the *file_type* extension.

        Returns the cleaned text with leading/trailing whitespace stripped.
        """
        # C-family languages share the same // and /* ... */ syntax.
        # BUG FIX: the block-comment pattern needs re.DOTALL — with flags=0
        # the non-greedy '.*?' cannot cross newlines, so multi-line
        # /* ... */ comments were never removed.
        cstyle_patterns = [
            (r'\s*//.*', '', 0),
            (r'/\*.*?\*/', '', re.DOTALL),
        ]
        hash_patterns = [
            (r'\s*#.*', '', 0),
        ]
        comment_patterns = {
            CodeProcessor.PYTHON: [
                (r'\s*#.*', '', 0),
                (r'\"\"\"(.*?)\"\"\"', '', re.DOTALL),
                (r"\'\'\'(.*?)\'\'\'", '', re.DOTALL),
            ],
            CodeProcessor.JS: cstyle_patterns,
            CodeProcessor.C: cstyle_patterns,
            CodeProcessor.CPP: cstyle_patterns,
            CodeProcessor.H: cstyle_patterns,
            CodeProcessor.BASH: hash_patterns,
            CodeProcessor.SHELL: hash_patterns,
        }
        patterns = comment_patterns.get(file_type, [])
        for pattern, repl, flags in patterns:
            content = re.sub(pattern, repl, content, flags=flags)
        return content.strip()

    @staticmethod
    def compress(content):
        """Compress *content* using zlib; returns bytes."""
        return zlib.compress(content.encode())
class DirectoryHandler:
    """Walks directories and prints the contents of files that pass the filters."""

    @staticmethod
    def filter_directories(dirs, ignore_file_strings, ignore_hidden):
        """Prune *dirs* in place so os.walk skips hidden/ignored directories."""
        if ignore_hidden:
            dirs[:] = [entry for entry in dirs if not entry.startswith('.')]
        dirs[:] = [
            entry for entry in dirs
            if all(token not in entry for token in ignore_file_strings)
        ]

    @staticmethod
    def path_or_content_contains(file_path, path_contains, content_contains):
        """Return True when the path, or the file's text content, matches a whitelist entry."""
        # Path whitelist wins without touching the file.
        if path_contains and any(needle in file_path for needle in path_contains):
            return True
        if content_contains:
            try:
                with open(file_path, 'r') as handle:
                    text = handle.read()
            except UnicodeDecodeError:
                # Not readable as text -> cannot match on content.
                return False
            return any(needle in text for needle in content_contains)
        return False

    @staticmethod
    def should_print_file(file_path, file_types, ignore_file_strings, ignore_hidden, path_contains, content_contains):
        """Decide whether *file_path* passes every filter.

        Args:
            file_path: Path of the candidate file.
            file_types: Allowed extensions (empty = all).
            ignore_file_strings: Substrings that blacklist a path.
            ignore_hidden: Skip dot-files when True.
            path_contains / content_contains: Whitelist substrings; when either
                is non-empty the file must match one of them.

        Returns:
            True if the file should be printed.
        """
        if ignore_hidden and os.path.basename(file_path).startswith('.'):
            return False
        if file_types and not any(file_path.endswith(ext) for ext in file_types):
            return False
        for token in ignore_file_strings:
            if token in file_path:
                return False
        if path_contains or content_contains:
            return DirectoryHandler.path_or_content_contains(file_path, path_contains, content_contains)
        return True

    @staticmethod
    def print_file_content(file_path, no_comments, compress):
        """Print a file's content, optionally comment-stripped or compressed."""
        try:
            with open(file_path, 'r') as handle:
                text = handle.read()
        except UnicodeDecodeError:
            print(f"Warning: Could not read file due to encoding issues: {file_path}")
            exit(1)
        if no_comments:
            extension = os.path.splitext(file_path)[1]
            text = CodeProcessor.remove_comments(text, extension)
        print(f"<< START: {file_path} >>")
        if compress:
            print(f"COMPRESSED CODE: ")
            print(CodeProcessor.compress(text))
        else:
            print(text)
        print("<< END >>\n")

    @staticmethod
    def handle_directory(directory, **kwargs):
        """Walk *directory* and print every file that passes the filters."""
        for root, dirs, files in os.walk(directory):
            DirectoryHandler.filter_directories(dirs, kwargs['ignore_file_strings'], kwargs['ignore_hidden'])
            for name in files:
                candidate = os.path.join(root, name)
                if DirectoryHandler.should_print_file(candidate, kwargs['file_types'], kwargs['ignore_file_strings'], kwargs['ignore_hidden'], kwargs['path_contains'], kwargs['content_contains']):
                    DirectoryHandler.print_file_content(candidate, kwargs['no_comments'], kwargs['compress'])
                elif kwargs['verbose']:
                    print(f"Skipped file: {name}")

    @staticmethod
    def handle_file(file_path, **kwargs):
        """Print a single file's content with the configured options."""
        DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress'])
def main():
    """Entry point: parse CLI arguments and process each requested path.

    Defect fixed: a botched merge left this function with two argument
    parsers (parse_arguments() from cli plus an inline argparse setup),
    duplicated handle_directory/should_print_file/handle_file calls, and a
    doubled main() invocation. This is the single coherent version using
    the cli module's parser, which supplies no_gitignore/scan_binary_files.
    """
    args = parse_arguments()
    for path in args.paths:
        if os.path.isdir(path):
            DirectoryHandler.handle_directory(
                path,
                file_types=args.file_types,
                ignore_file_strings=args.ignore_file_strings,
                ignore_hidden=args.ignore_hidden,
                verbose=args.verbose,
                no_comments=args.no_comments,
                compress=args.compress,
                path_contains=args.path_contains,
                content_contains=args.content_contains,
                no_gitignore=args.no_gitignore,
                scan_binary_files=args.scan_binary_files,
            )
        elif os.path.isfile(path):
            # Explicitly listed files still pass through the filters.
            if DirectoryHandler.should_print_file(
                path,
                file_types=args.file_types,
                ignore_file_strings=args.ignore_file_strings,
                ignore_hidden=args.ignore_hidden,
                path_contains=args.path_contains,
                content_contains=args.content_contains,
                scan_binary_files=args.scan_binary_files,
            ):
                DirectoryHandler.handle_file(
                    path,
                    file_types=args.file_types,
                    ignore_file_strings=args.ignore_file_strings,
                    ignore_hidden=args.ignore_hidden,
                    no_comments=args.no_comments,
                    compress=args.compress,
                    scan_binary_files=args.scan_binary_files,
                )
        else:
            print(f"Error: {path} is neither a valid file nor a directory.")
            sys.exit(1)


if __name__ == "__main__":
    main()

View File

View File

View File

@@ -1,190 +0,0 @@
# tests/unit/test_arc.py
import io
import os
import sys
import tempfile
import unittest
from contextlib import redirect_stdout
# Ensure project root is on sys.path when running via discover
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
if PROJECT_ROOT not in sys.path:
sys.path.insert(0, PROJECT_ROOT)
from code_processor import CodeProcessor
from directory_handler import DirectoryHandler
class TestCodeProcessor(unittest.TestCase):
    """Unit tests for CodeProcessor comment stripping and (de)compression."""

    def test_python_comment_and_docstring_stripping(self):
        # Module docstring, comments, and function docstrings must be removed;
        # ordinary strings (even triple-quoted assignments) must survive.
        src = '''\
"""module docstring should go away"""
# a comment
x = 1 # inline comment
y = "string with # not a comment"
def f():
    """function docstring should go away"""
    s = """triple quoted but not a docstring"""
    return x
'''
        out = CodeProcessor.remove_comments(src, ".py")
        self.assertNotIn("module docstring", out)
        self.assertNotIn("function docstring", out)
        self.assertNotIn("# a comment", out)
        # tolerate whitespace normalization from tokenize.untokenize
        self.assertRegex(out, r'y\s*=\s*"string with # not a comment"')
        self.assertIn('triple quoted but not a docstring', out)

    def test_cstyle_comment_stripping(self):
        # // and /* ... */ comments go; comment-like text inside strings stays.
        src = '''\
// line comment
int main() {
    /* block
 comment */
    int x = 42; // end comment
    const char* s = "/* not a comment here */";
    return x;
}
'''
        out = CodeProcessor.remove_comments(src, ".c")
        # line comment and block comment gone
        self.assertNotIn("// line comment", out)
        self.assertNotIn("block\n comment", out)
        # string content with /* */ inside should remain
        self.assertIn('const char* s = "/* not a comment here */";', out)

    def test_hash_comment_stripping(self):
        # Only full-line hash comments are removed for shell/YAML-style files.
        src = """\
# top comment
KEY=value # trailing comment should be kept by default
plain: value
"""
        out = CodeProcessor.remove_comments(src, ".yml")
        # Our regex removes full lines starting with optional spaces then '#'
        self.assertNotIn("top comment", out)
        # It does not remove trailing fragments after content for hash style
        self.assertIn("KEY=value", out)
        self.assertIn("plain: value", out)

    def test_jinja_comment_stripping(self):
        # {# ... #} comments are removed, including multi-line ones.
        src = """\
{# top jinja comment #}
Hello {{ name }}!
{#
multi-line
jinja comment
#}
Body text and {{ value }}.
"""
        out = CodeProcessor.remove_comments(src, ".j2")
        self.assertNotIn("top jinja comment", out)
        self.assertNotIn("multi-line", out)
        # Regular content and expressions remain
        self.assertIn("Hello {{ name }}!", out)
        self.assertIn("Body text and {{ value }}.", out)

    def test_unknown_extension_returns_stripped(self):
        # Unknown extensions pass through untouched except for outer whitespace.
        src = "   x = 1  # not removed for unknown   "
        out = CodeProcessor.remove_comments(src, ".unknown")
        self.assertEqual(out, "x = 1  # not removed for unknown")

    def test_compress_decompress_roundtrip(self):
        # compress() -> decompress() must be the identity on text.
        src = "def x():\n    return 42\n"
        blob = CodeProcessor.compress(src)
        self.assertIsInstance(blob, (bytes, bytearray))
        back = CodeProcessor.decompress(blob)
        self.assertEqual(src, back)
class TestDirectoryHandler(unittest.TestCase):
    """Unit tests for DirectoryHandler filtering, gitignore, and printing."""

    def test_is_binary_file(self):
        # A file starting with null bytes must be detected as binary.
        with tempfile.NamedTemporaryFile(delete=False) as tf:
            tf.write(b"\x00\x01\x02BINARY")
            path = tf.name
        try:
            self.assertTrue(DirectoryHandler.is_binary_file(path))
        finally:
            os.remove(path)

    def test_gitignore_matching(self):
        with tempfile.TemporaryDirectory() as root:
            # Create .gitignore ignoring build/ and *.log
            gi_dir = os.path.join(root, "a")
            os.makedirs(gi_dir, exist_ok=True)
            with open(os.path.join(gi_dir, ".gitignore"), "w") as f:
                f.write("build/\n*.log\n")
            # Files
            os.makedirs(os.path.join(gi_dir, "build"), exist_ok=True)
            ignored_dir_file = os.path.join(gi_dir, "build", "x.txt")
            with open(ignored_dir_file, "w") as f:
                f.write("ignored")
            ignored_log = os.path.join(gi_dir, "debug.log")
            with open(ignored_log, "w") as f:
                f.write("ignored log")
            kept_file = os.path.join(gi_dir, "src.txt")
            with open(kept_file, "w") as f:
                f.write("keep me")
            gi_data = DirectoryHandler.load_gitignore_patterns(root)
            self.assertTrue(DirectoryHandler.is_gitignored(ignored_dir_file, gi_data))
            self.assertTrue(DirectoryHandler.is_gitignored(ignored_log, gi_data))
            self.assertFalse(DirectoryHandler.is_gitignored(kept_file, gi_data))

    def test_should_print_file_filters_hidden_and_types(self):
        # Hidden files are excluded; extension whitelist is honored.
        with tempfile.TemporaryDirectory() as root:
            hidden = os.path.join(root, ".hidden.txt")
            plain = os.path.join(root, "keep.py")
            with open(hidden, "w") as f:
                f.write("data")
            with open(plain, "w") as f:
                f.write("print('hi')")
            self.assertFalse(
                DirectoryHandler.should_print_file(
                    hidden,
                    file_types=[".py"],
                    ignore_file_strings=[],
                    ignore_hidden=True,
                    path_contains=[],
                    content_contains=[],
                )
            )
            self.assertTrue(
                DirectoryHandler.should_print_file(
                    plain,
                    file_types=[".py"],
                    ignore_file_strings=[],
                    ignore_hidden=True,
                    path_contains=[],
                    content_contains=[],
                )
            )

    def test_print_file_content_no_comments_and_compress(self):
        # Comment stripping and compression banners appear in stdout.
        with tempfile.TemporaryDirectory() as root:
            p = os.path.join(root, "t.py")
            with open(p, "w") as f:
                f.write("# comment only\nx=1\n")
            buf = io.StringIO()
            with redirect_stdout(buf):
                DirectoryHandler.print_file_content(p, no_comments=True, compress=False)
            out = buf.getvalue()
            self.assertIn("<< START:", out)
            # be whitespace-tolerant (tokenize may insert spaces)
            self.assertRegex(out, r"x\s*=\s*1")
            self.assertNotIn("# comment only", out)
            buf = io.StringIO()
            with redirect_stdout(buf):
                DirectoryHandler.print_file_content(p, no_comments=True, compress=True)
            out = buf.getvalue()
            self.assertIn("COMPRESSED CODE:", out)
            self.assertIn("<< END >>", out)


if __name__ == "__main__":
    unittest.main()