Refactor CodeProcessor to use safe state-machine and tokenize-based stripping, add Jinja {# #} support, and introduce unit tests with Makefile targets

- Added LanguageSpec dataclass and mapping for extensions
- Implemented state-machine for C/CPP/JS comment stripping (handles strings correctly)
- Improved Python comment/docstring removal using tokenize
- Added regex-based stripping for hash (#) and Jinja {# #} comments
- Added Makefile with test and install targets
- Added unit test suite under tests/unit covering Python, C-style, hash, and Jinja cases
- Added compress/decompress roundtrip test
- Added directory handler tests

See: https://chatgpt.com/share/68e0250f-40d4-800f-911d-2b4700246574
This commit is contained in:
2025-10-03 21:34:02 +02:00
parent c5938cf482
commit b55576beb2
6 changed files with 484 additions and 46 deletions

17
Makefile Normal file
View File

@@ -0,0 +1,17 @@
# Makefile for ARC
# `test` runs the unittest suite; `install` only prints instructions.
.PHONY: test install help

# Default target: list the available targets.
help:
	@echo "Targets:"
	@echo " make test - Run unit tests"
	@echo " make install - Show how to install via Kevin's Package Manager"

# Discover and run all tests matching tests/**/test_*.py from the repo root.
test:
	@python -m unittest discover -s tests -p "test_*.py" -t .

# Installation is delegated to the external package manager; this target
# performs no filesystem changes, it only echoes the instructions.
install:
	@echo "ARC is distributed via Kevin's Package Manager."
	@echo "Install it with:"
	@echo " package-manager install arc"
	@echo ""
	@echo "(This 'make install' does not perform any other actions.)"

0
__init__.py Normal file
View File

View File

@@ -1,54 +1,285 @@
import re import re
import zlib import zlib
from dataclasses import dataclass
from typing import Dict, Tuple, Pattern, Optional
import io
import tokenize
@dataclass(frozen=True)
class LanguageSpec:
    """Holds compiled comment patterns for a language.

    Each pattern is applied in order with ``pattern.sub("", content)`` by
    ``CodeProcessor.remove_comments`` for regex-based languages (hash, jinja).
    Frozen so instances can be shared as class-level constants.
    """
    # Ordered tuple of compiled regexes; each match is deleted from the text.
    patterns: Tuple[Pattern, ...]
class CodeProcessor: class CodeProcessor:
PYTHON = ".py" """
JS = ".js" Utilities to strip comments and (de)compress code.
C = ".c" - Python: tokenize-based (safe) with precise docstring removal.
CPP = ".cpp" - C/CPP/JS: state-machine comment stripper that respects string/char literals.
H = ".h" - Shell/YAML: remove full-line hash comments only.
BASH = ".sh" - Jinja: remove {# ... #} blocks.
SHELL = ".bash" """
# File extensions (normalized to lowercase)
EXT_TO_LANG: Dict[str, str] = {
".py": "python",
".js": "cstyle",
".c": "cstyle",
".cpp": "cstyle",
".h": "cstyle",
".sh": "hash",
".bash": "hash",
".yml": "hash",
".yaml": "hash",
".j2": "jinja",
".jinja": "jinja",
".jinja2": "jinja",
".tpl": "jinja",
}
# Regex-based specs for hash and jinja
_HASH = LanguageSpec(patterns=(
re.compile(r"^\s*#.*$", flags=re.MULTILINE), # only full-line comments
))
_JINJA = LanguageSpec(patterns=(
re.compile(r"\{#.*?#\}", flags=re.DOTALL), # {# ... #} across lines
))
LANG_SPECS: Dict[str, LanguageSpec] = {
"hash": _HASH,
"jinja": _JINJA,
# "cstyle" handled by a state machine, not regex
# "python" handled by tokenize, not regex
}
@classmethod
def _lang_from_ext(cls, file_type: str) -> Optional[str]:
"""Map an extension like '.py' to an internal language key."""
ext = file_type.lower().strip()
return cls.EXT_TO_LANG.get(ext)
# -----------------------------
# Python stripping via tokenize
# -----------------------------
@staticmethod
def _strip_python_comments_tokenize(content: str) -> str:
"""
Remove comments and docstrings safely using tokenize.
Rules:
- Drop COMMENT tokens.
- Drop module docstring only if it's the very first statement at col 0.
- Drop the first STRING statement in a suite immediately after 'def'/'class'
header (':' NEWLINE INDENT).
"""
tokens = tokenize.generate_tokens(io.StringIO(content).readline)
out_tokens = []
indent_level = 0
module_docstring_candidate = True # until we see first real stmt at module level
expect_suite_docstring = False # just entered a suite after def/class
last_was_colon = False
seen_nontrivial_in_line = False # guards module docstring (start of logical line)
for tok_type, tok_str, start, end, line in tokens:
# Track indentation
if tok_type == tokenize.INDENT:
indent_level += 1
elif tok_type == tokenize.DEDENT:
indent_level = max(0, indent_level - 1)
# New logical line: reset guard
if tok_type in (tokenize.NEWLINE, tokenize.NL):
seen_nontrivial_in_line = False
out_tokens.append((tok_type, tok_str))
continue
# Comments are dropped
if tok_type == tokenize.COMMENT:
continue
# Detect ':' ending a def/class header
if tok_type == tokenize.OP and tok_str == ":":
last_was_colon = True
out_tokens.append((tok_type, tok_str))
continue
# After ':' + NEWLINE + INDENT comes a suite start -> allow docstring removal
if tok_type == tokenize.INDENT and last_was_colon:
expect_suite_docstring = True
last_was_colon = False
out_tokens.append((tok_type, tok_str))
continue
# Any non-INDENT token clears the last_was_colon flag
if tok_type != tokenize.NL:
last_was_colon = False
# STRING handling
if tok_type == tokenize.STRING:
at_line_start = (start[1] == 0) and not seen_nontrivial_in_line
if indent_level == 0:
# Potential module docstring only if first statement at col 0
if module_docstring_candidate and at_line_start:
module_docstring_candidate = False
# drop it
continue
# Any other top-level string is normal
module_docstring_candidate = False
out_tokens.append((tok_type, tok_str))
seen_nontrivial_in_line = True
continue
else:
# In a suite: if it's the first statement after def/class, drop regardless of column
if expect_suite_docstring:
expect_suite_docstring = False
# drop it
continue
expect_suite_docstring = False
out_tokens.append((tok_type, tok_str))
seen_nontrivial_in_line = True
continue
# Any other significant token disables module-docstring candidacy
if tok_type not in (tokenize.INDENT, tokenize.DEDENT):
if indent_level == 0:
module_docstring_candidate = False
# Mark we've seen something on this line
if tok_type not in (tokenize.NL, tokenize.NEWLINE):
seen_nontrivial_in_line = True
out_tokens.append((tok_type, tok_str))
return tokenize.untokenize(out_tokens)
# ---------------------------------
# C-style stripping via state machine
# ---------------------------------
@staticmethod
def _strip_cstyle_comments(content: str) -> str:
"""
Remove // line comments and /* ... */ block comments while preserving
string ("...") and char ('...') literals and their escape sequences.
"""
i = 0
n = len(content)
out = []
in_line_comment = False
in_block_comment = False
in_string = False
in_char = False
escape = False
while i < n:
c = content[i]
nxt = content[i + 1] if i + 1 < n else ""
# If inside line comment: consume until newline
if in_line_comment:
if c == "\n":
in_line_comment = False
out.append(c)
i += 1
continue
# If inside block comment: consume until '*/'
if in_block_comment:
if c == "*" and nxt == "/":
in_block_comment = False
i += 2
else:
i += 1
continue
# If inside string literal
if in_string:
out.append(c)
if escape:
escape = False
else:
if c == "\\":
escape = True
elif c == '"':
in_string = False
i += 1
continue
# If inside char literal
if in_char:
out.append(c)
if escape:
escape = False
else:
if c == "\\":
escape = True
elif c == "'":
in_char = False
i += 1
continue
# Not in any special state:
# Check for start of comments
if c == "/" and nxt == "/":
in_line_comment = True
i += 2
continue
if c == "/" and nxt == "*":
in_block_comment = True
i += 2
continue
# Check for start of string/char literals
if c == '"':
in_string = True
out.append(c)
i += 1
continue
if c == "'":
in_char = True
out.append(c)
i += 1
continue
# Normal character
out.append(c)
i += 1
return "".join(out)
# -------------------
# Public API
# -------------------
@classmethod
def remove_comments(cls, content: str, file_type: str) -> str:
"""
Remove comments based on file type/extension.
- Python: tokenize-based
- C/CPP/JS: state-machine
- Hash (sh/yaml): regex full-line
- Jinja: regex {# ... #}
"""
lang = cls._lang_from_ext(file_type)
if lang is None:
return content.strip()
if lang == "python":
return cls._strip_python_comments_tokenize(content).strip()
if lang == "cstyle":
return cls._strip_cstyle_comments(content).strip()
spec = cls.LANG_SPECS.get(lang)
if not spec:
return content.strip()
cleaned = content
for pat in spec.patterns:
cleaned = pat.sub("", cleaned)
return cleaned.strip()
@staticmethod @staticmethod
def remove_comments(content, file_type): def compress(content: str, level: int = 9) -> bytes:
"""Remove comments based on file type.""" """Compress code using zlib. Returns bytes."""
comment_patterns = { return zlib.compress(content.encode("utf-8"), level)
CodeProcessor.PYTHON: [
(r'\s*#.*', '', 0),
(r'\"\"\"(.*?)\"\"\"', '', re.DOTALL),
(r"\'\'\'(.*?)\'\'\'", '', re.DOTALL)
],
CodeProcessor.JS: [
(r'\s*//.*', '', 0),
(r'/\*.*?\*/', '', 0)
],
CodeProcessor.C: [
(r'\s*//.*', '', 0),
(r'/\*.*?\*/', '', 0)
],
CodeProcessor.CPP: [
(r'\s*//.*', '', 0),
(r'/\*.*?\*/', '', 0)
],
CodeProcessor.H: [
(r'\s*//.*', '', 0),
(r'/\*.*?\*/', '', 0)
],
CodeProcessor.BASH: [
(r'\s*#.*', '', 0)
],
CodeProcessor.SHELL: [
(r'\s*#.*', '', 0)
]
}
patterns = comment_patterns.get(file_type, [])
for pattern, repl, flags in patterns:
content = re.sub(pattern, repl, content, flags=flags)
return content.strip()
@staticmethod @staticmethod
def compress(content): def decompress(blob: bytes) -> str:
"""Compress code using zlib.""" """Decompress zlib-compressed code back to text."""
return zlib.compress(content.encode()) return zlib.decompress(blob).decode("utf-8")

0
tests/__init__.py Normal file
View File

0
tests/unit/__init__.py Normal file
View File

190
tests/unit/test_arc.py Normal file
View File

@@ -0,0 +1,190 @@
# tests/unit/test_arc.py
import io
import os
import sys
import tempfile
import unittest
from contextlib import redirect_stdout

# Ensure project root is on sys.path when running via discover
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Imported after the path tweak so the project-local modules resolve when the
# suite is launched from any working directory.
from code_processor import CodeProcessor
from directory_handler import DirectoryHandler
class TestCodeProcessor(unittest.TestCase):
    """Unit tests for CodeProcessor comment stripping and (de)compression."""

    def test_python_comment_and_docstring_stripping(self):
        # Docstrings and '#' comments must vanish; string literals that merely
        # contain '#' or triple quotes must survive.
        src = '''\
"""module docstring should go away"""
# a comment
x = 1 # inline comment
y = "string with # not a comment"
def f():
    """function docstring should go away"""
    s = """triple quoted but not a docstring"""
    return x
'''
        out = CodeProcessor.remove_comments(src, ".py")
        self.assertNotIn("module docstring", out)
        self.assertNotIn("function docstring", out)
        self.assertNotIn("# a comment", out)
        # tolerate whitespace normalization from tokenize.untokenize
        self.assertRegex(out, r'y\s*=\s*"string with # not a comment"')
        self.assertIn('triple quoted but not a docstring', out)

    def test_cstyle_comment_stripping(self):
        # Line and block comments are removed; comment-looking text inside a
        # string literal is preserved.
        src = '''\
// line comment
int main() {
    /* block
    comment */
    int x = 42; // end comment
    const char* s = "/* not a comment here */";
    return x;
}
'''
        out = CodeProcessor.remove_comments(src, ".c")
        # line comment and block comment gone
        self.assertNotIn("// line comment", out)
        self.assertNotIn("block\n comment", out)
        # string content with /* */ inside should remain
        self.assertIn('const char* s = "/* not a comment here */";', out)

    def test_hash_comment_stripping(self):
        # Hash style only removes whole comment lines, not trailing fragments.
        src = """\
# top comment
KEY=value # trailing comment should be kept by default
plain: value
"""
        out = CodeProcessor.remove_comments(src, ".yml")
        # Our regex removes full lines starting with optional spaces then '#'
        self.assertNotIn("top comment", out)
        # It does not remove trailing fragments after content for hash style
        self.assertIn("KEY=value", out)
        self.assertIn("plain: value", out)

    def test_jinja_comment_stripping(self):
        # {# ... #} blocks (including multi-line) are removed; {{ ... }}
        # expressions and plain text are untouched.
        src = """\
{# top jinja comment #}
Hello {{ name }}!
{#
multi-line
jinja comment
#}
Body text and {{ value }}.
"""
        out = CodeProcessor.remove_comments(src, ".j2")
        self.assertNotIn("top jinja comment", out)
        self.assertNotIn("multi-line", out)
        # Regular content and expressions remain
        self.assertIn("Hello {{ name }}!", out)
        self.assertIn("Body text and {{ value }}.", out)

    def test_unknown_extension_returns_stripped(self):
        # Unknown extensions: content is only whitespace-trimmed, never edited.
        src = " x = 1 # not removed for unknown "
        out = CodeProcessor.remove_comments(src, ".unknown")
        self.assertEqual(out, "x = 1 # not removed for unknown")

    def test_compress_decompress_roundtrip(self):
        # compress() must return bytes and decompress() must restore the
        # exact original text.
        src = "def x():\n return 42\n"
        blob = CodeProcessor.compress(src)
        self.assertIsInstance(blob, (bytes, bytearray))
        back = CodeProcessor.decompress(blob)
        self.assertEqual(src, back)
class TestDirectoryHandler(unittest.TestCase):
    """Unit tests for DirectoryHandler filtering, .gitignore and printing."""

    def test_is_binary_file(self):
        # A file starting with NUL bytes must be classified as binary.
        with tempfile.NamedTemporaryFile(delete=False) as tf:
            tf.write(b"\x00\x01\x02BINARY")
            path = tf.name
        try:
            self.assertTrue(DirectoryHandler.is_binary_file(path))
        finally:
            os.remove(path)

    def test_gitignore_matching(self):
        # Both a directory rule ("build/") and a glob rule ("*.log") from a
        # nested .gitignore must be honored; unmatched files stay visible.
        with tempfile.TemporaryDirectory() as root:
            # Create .gitignore ignoring build/ and *.log
            gi_dir = os.path.join(root, "a")
            os.makedirs(gi_dir, exist_ok=True)
            with open(os.path.join(gi_dir, ".gitignore"), "w") as f:
                f.write("build/\n*.log\n")
            # Files
            os.makedirs(os.path.join(gi_dir, "build"), exist_ok=True)
            ignored_dir_file = os.path.join(gi_dir, "build", "x.txt")
            with open(ignored_dir_file, "w") as f:
                f.write("ignored")
            ignored_log = os.path.join(gi_dir, "debug.log")
            with open(ignored_log, "w") as f:
                f.write("ignored log")
            kept_file = os.path.join(gi_dir, "src.txt")
            with open(kept_file, "w") as f:
                f.write("keep me")
            gi_data = DirectoryHandler.load_gitignore_patterns(root)
            self.assertTrue(DirectoryHandler.is_gitignored(ignored_dir_file, gi_data))
            self.assertTrue(DirectoryHandler.is_gitignored(ignored_log, gi_data))
            self.assertFalse(DirectoryHandler.is_gitignored(kept_file, gi_data))

    def test_should_print_file_filters_hidden_and_types(self):
        # Hidden files are rejected when ignore_hidden=True; visible files
        # with a matching extension pass the filter.
        with tempfile.TemporaryDirectory() as root:
            hidden = os.path.join(root, ".hidden.txt")
            plain = os.path.join(root, "keep.py")
            with open(hidden, "w") as f:
                f.write("data")
            with open(plain, "w") as f:
                f.write("print('hi')")
            self.assertFalse(
                DirectoryHandler.should_print_file(
                    hidden,
                    file_types=[".py"],
                    ignore_file_strings=[],
                    ignore_hidden=True,
                    path_contains=[],
                    content_contains=[],
                )
            )
            self.assertTrue(
                DirectoryHandler.should_print_file(
                    plain,
                    file_types=[".py"],
                    ignore_file_strings=[],
                    ignore_hidden=True,
                    path_contains=[],
                    content_contains=[],
                )
            )

    def test_print_file_content_no_comments_and_compress(self):
        # print_file_content writes framed output to stdout; with
        # no_comments=True the comment must be stripped, and with
        # compress=True a compressed-code section must be emitted.
        with tempfile.TemporaryDirectory() as root:
            p = os.path.join(root, "t.py")
            with open(p, "w") as f:
                f.write("# comment only\nx=1\n")
            buf = io.StringIO()
            with redirect_stdout(buf):
                DirectoryHandler.print_file_content(p, no_comments=True, compress=False)
            out = buf.getvalue()
            self.assertIn("<< START:", out)
            # be whitespace-tolerant (tokenize may insert spaces)
            self.assertRegex(out, r"x\s*=\s*1")
            self.assertNotIn("# comment only", out)
            buf = io.StringIO()
            with redirect_stdout(buf):
                DirectoryHandler.print_file_content(p, no_comments=True, compress=True)
            out = buf.getvalue()
            self.assertIn("COMPRESSED CODE:", out)
            self.assertIn("<< END >>", out)
# Allow running this file directly (outside `make test` / unittest discover).
if __name__ == "__main__":
    unittest.main()