Refactor CodeProcessor to use safe state-machine and tokenize-based stripping, add Jinja {# #} support, and introduce unit tests with Makefile targets

- Added LanguageSpec dataclass and mapping for extensions
- Implemented state machine for C/CPP/JS comment stripping (handles strings correctly)
- Improved Python comment/docstring removal using tokenize
- Added regex-based stripping for hash (#) and Jinja {# #} comments
- Added Makefile with test and install targets
- Added unit test suite under tests/unit covering Python, C-style, hash, and Jinja cases
- Added compress/decompress roundtrip test
- Added directory handler tests

See: https://chatgpt.com/share/68e0250f-40d4-800f-911d-2b4700246574
2025-11-24 13:44:59 +00:00 · 2025-10-03 21:34:02 +02:00
parent c5938cf482
commit b55576beb2
6 changed files with 484 additions and 46 deletions
--- a/17
+++ b/17
@@ -0,0 +1,17 @@
+# Makefile for ARC: 'test' runs the unittest suite; 'install' and 'help' only print text
+.PHONY: test install help
+
+help:
+	@echo "Targets:"
+	@echo "  make test     - Run unit tests"
+	@echo "  make install  - Show how to install via Kevin's Package Manager"
+
+test:
+	@python -m unittest discover -s tests -p "test_*.py" -t .
+
+install:
+	@echo "ARC is distributed via Kevin's Package Manager."
+	@echo "Install it with:"
+	@echo "    package-manager install arc"
+	@echo ""
+	@echo "(This 'make install' does not perform any other actions.)"
--- a/init.py
+++ b/init.py
--- a/code_processor.py
+++ b/code_processor.py
@@ -1,54 +1,285 @@
 import re
 import zlib
+from dataclasses import dataclass
+from typing import Dict, Tuple, Pattern, Optional
+import io
+import tokenize
+
+
+@dataclass(frozen=True)
+class LanguageSpec:
+    """Immutable bundle of compiled regexes; CodeProcessor.remove_comments deletes every match."""
+    patterns: Tuple[Pattern, ...]
+

 class CodeProcessor:
-    PYTHON = ".py"
-    JS = ".js"
-    C = ".c"
-    CPP = ".cpp"
-    H = ".h"
-    BASH = ".sh"
-    SHELL = ".bash"
-
-    @staticmethod
-    def remove_comments(content, file_type):
-        """Remove comments based on file type."""
-        comment_patterns = {
-            CodeProcessor.PYTHON: [
-                (r'\s*#.*', '', 0),
-                (r'\"\"\"(.*?)\"\"\"', '', re.DOTALL),
-                (r"\'\'\'(.*?)\'\'\'", '', re.DOTALL)
-            ],
-            CodeProcessor.JS: [
-                (r'\s*//.*', '', 0),
-                (r'/\*.*?\*/', '', 0)
-            ],
-            CodeProcessor.C: [
-                (r'\s*//.*', '', 0),
-                (r'/\*.*?\*/', '', 0)
-            ],
-            CodeProcessor.CPP: [
-                (r'\s*//.*', '', 0),
-                (r'/\*.*?\*/', '', 0)
-            ],
-            CodeProcessor.H: [
-                (r'\s*//.*', '', 0),
-                (r'/\*.*?\*/', '', 0)
-            ],
-            CodeProcessor.BASH: [
-                (r'\s*#.*', '', 0)
-            ],
-            CodeProcessor.SHELL: [
-                (r'\s*#.*', '', 0)
-            ]
+    """
+    Utilities to strip comments and (de)compress code.
+    - Python: tokenize-based (safe) with precise docstring removal.
+    - C/CPP/JS: state-machine comment stripper that respects string/char literals.
+    - Shell/YAML: remove full-line hash comments only.
+    - Jinja: remove {# ... #} blocks.
+    """
+    # File extensions (normalized to lowercase)
+    EXT_TO_LANG: Dict[str, str] = {
+        ".py": "python",
+        ".js": "cstyle",
+        ".c": "cstyle",
+        ".cpp": "cstyle",
+        ".h": "cstyle",
+        ".sh": "hash",
+        ".bash": "hash",
+        ".yml": "hash",
+        ".yaml": "hash",
+        ".j2": "jinja",
+        ".jinja": "jinja",
+        ".jinja2": "jinja",
+        ".tpl": "jinja",
    }

-        patterns = comment_patterns.get(file_type, [])
-        for pattern, repl, flags in patterns:
-            content = re.sub(pattern, repl, content, flags=flags)
+    # Regex specs for the languages stripped by plain substitution (hash-style and Jinja)
+    _HASH = LanguageSpec(patterns=(
+        re.compile(r"^\s*#.*$", flags=re.MULTILINE),   # full-line comments only. NOTE(review): \s* also crosses newlines, so blank lines before a comment vanish too, and '#!' shebang lines match
+    ))
+    _JINJA = LanguageSpec(patterns=(
+        re.compile(r"\{#.*?#\}", flags=re.DOTALL),     # non-greedy {# ... #}; DOTALL lets one comment span several lines
+    ))
+
+    LANG_SPECS: Dict[str, LanguageSpec] = {
+        "hash": _HASH,
+        "jinja": _JINJA,
+        # "cstyle" has no entry: it is stripped by a state machine, not regex
+        # "python" has no entry: it is stripped via tokenize, not regex
+    }
+
+    @classmethod
+    def _lang_from_ext(cls, file_type: str) -> Optional[str]:
+        """Look up an extension like '.py' in EXT_TO_LANG after lowercasing/stripping it; None if unknown."""
+        ext = file_type.lower().strip()
+        return cls.EXT_TO_LANG.get(ext)
+
+    # -----------------------------
+    # Python stripping via tokenize
+    # -----------------------------
+    @staticmethod
+    def _strip_python_comments_tokenize(content: str) -> str:
+        """
+        Remove comments and docstrings safely using tokenize.
+        Rules:
+          - Drop COMMENT tokens.
+          - Drop module docstring only if it's the very first statement at col 0.
+          - Drop the first STRING statement in a suite immediately after 'def'/'class'
+            header (':' NEWLINE INDENT).
+        """
+        tokens = tokenize.generate_tokens(io.StringIO(content).readline)
+        out_tokens = []
+
+        indent_level = 0
+        module_docstring_candidate = True  # stays True until the first non-comment token at indent 0
+        expect_suite_docstring = False     # armed on ':' NEWLINE INDENT; cleared only when a STRING is seen in a suite
+        last_was_colon = False
+        seen_nontrivial_in_line = False    # True once a kept token has appeared on the current line
+
+        for tok_type, tok_str, start, end, line in tokens:
+            # Track nesting depth; the INDENT/DEDENT tokens themselves are emitted further down
+            if tok_type == tokenize.INDENT:
+                indent_level += 1
+            elif tok_type == tokenize.DEDENT:
+                indent_level = max(0, indent_level - 1)
+
+            # NEWLINE/NL end a line: reset the per-line guard and keep the token (last_was_colon survives)
+            if tok_type in (tokenize.NEWLINE, tokenize.NL):
+                seen_nontrivial_in_line = False
+                out_tokens.append((tok_type, tok_str))
+                continue
+
+            # Comments are dropped without touching any state flag
+            if tok_type == tokenize.COMMENT:
+                continue
+
+            # Any ':' operator arms the flag: def/class headers, but also if/for/with, dict literals and slices
+            if tok_type == tokenize.OP and tok_str == ":":
+                last_was_colon = True
+                out_tokens.append((tok_type, tok_str))
+                continue
+
+            # ':' followed (after NEWLINE) by INDENT opens a suite -> the next STRING in it may be dropped
+            if tok_type == tokenize.INDENT and last_was_colon:
+                expect_suite_docstring = True
+                last_was_colon = False
+                out_tokens.append((tok_type, tok_str))
+                continue
+            # Every token that reaches this point disarms last_was_colon (NL already hit 'continue' above)
+            if tok_type != tokenize.NL:
+                last_was_colon = False
+
+            # STRING handling: decide between "docstring, drop" and "ordinary literal, keep"
+            if tok_type == tokenize.STRING:
+                at_line_start = (start[1] == 0) and not seen_nontrivial_in_line
+                if indent_level == 0:
+                    # Module docstring: only while no other token has been seen at indent 0, and at col 0
+                    if module_docstring_candidate and at_line_start:
+                        module_docstring_candidate = False
+                        # skip appending -> the docstring is removed from the output
+                        continue
+                    # Any other top-level string is an ordinary literal and is kept
+                    module_docstring_candidate = False
+                    out_tokens.append((tok_type, tok_str))
+                    seen_nontrivial_in_line = True
+                    continue
+                else:
+                    # In a suite: drop the first STRING seen while the flag is armed, regardless of column
+                    if expect_suite_docstring:
+                        expect_suite_docstring = False
+                        # skip appending -> this string is removed from the output
+                        continue
+                    expect_suite_docstring = False
+                    out_tokens.append((tok_type, tok_str))
+                    seen_nontrivial_in_line = True
+                    continue
+
+            # Any other significant token at indent 0 ends module-docstring candidacy
+            if tok_type not in (tokenize.INDENT, tokenize.DEDENT):
+                if indent_level == 0:
+                    module_docstring_candidate = False
+                # Mark the line as started. NOTE(review): expect_suite_docstring is NOT cleared on this path, so in a suite without a docstring (e.g. 'def f():' / 'return "x"') the first string literal is dropped as if it were one - confirm and fix
+                if tok_type not in (tokenize.NL, tokenize.NEWLINE):
+                    seen_nontrivial_in_line = True
+
+            out_tokens.append((tok_type, tok_str))
+
+        return tokenize.untokenize(out_tokens)
+
+    # ---------------------------------
+    # C-style stripping via state machine
+    # ---------------------------------
+    @staticmethod
+    def _strip_cstyle_comments(content: str) -> str:
+        """
+        Remove // line comments and /* ... */ block comments while preserving
+        string ("...") and char ('...') literals and their escape sequences.
+        """
+        i = 0
+        n = len(content)
+        out = []
+        in_line_comment = False
+        in_block_comment = False
+        in_string = False
+        in_char = False
+        escape = False
+
+        while i < n:
+            c = content[i]
+            nxt = content[i + 1] if i + 1 < n else ""
+
+            # Line comment: discard characters up to the newline, which itself is kept in the output
+            if in_line_comment:
+                if c == "\n":
+                    in_line_comment = False
+                    out.append(c)
+                i += 1
+                continue
+
+            # Block comment: discard through '*/'; nothing (not even a space) is emitted in its place
+            if in_block_comment:
+                if c == "*" and nxt == "/":
+                    in_block_comment = False
+                    i += 2
+                else:
+                    i += 1
+                continue
+
+            # String literal: copy verbatim; a backslash makes the next character literal
+            if in_string:
+                out.append(c)
+                if escape:
+                    escape = False
+                else:
+                    if c == "\\":
+                        escape = True
+                    elif c == '"':
+                        in_string = False
+                i += 1
+                continue
+
+            # Char literal: same escape handling, ended by an unescaped single quote
+            if in_char:
+                out.append(c)
+                if escape:
+                    escape = False
+                else:
+                    if c == "\\":
+                        escape = True
+                    elif c == "'":
+                        in_char = False
+                i += 1
+                continue
+
+            # Plain code from here on.
+            # Comment openers are two characters wide, so both are skipped at once
+            if c == "/" and nxt == "/":
+                in_line_comment = True
+                i += 2
+                continue
+            if c == "/" and nxt == "*":
+                in_block_comment = True
+                i += 2
+                continue
+
+            # Only " and ' open a literal. NOTE(review): backtick template literals are not recognised although '.js' is routed here, so '//' inside one is treated as a comment
+            if c == '"':
+                in_string = True
+                out.append(c)
+                i += 1
+                continue
+            if c == "'":
+                in_char = True
+                out.append(c)
+                i += 1
+                continue
+
+            # Any other character is ordinary code and is copied through
+            out.append(c)
+            i += 1
+
+        return "".join(out)
+
+    # -------------------
+    # Public API
+    # -------------------
+    @classmethod
+    def remove_comments(cls, content: str, file_type: str) -> str:
+        """
+        Remove comments based on file type/extension.
+          - Python: tokenize-based
+          - C/CPP/JS: state-machine
+          - Hash (sh/yaml): regex full-line
+          - Jinja: regex {# ... #}
+        """
+        lang = cls._lang_from_ext(file_type)
+        if lang is None:
            return content.strip()

+        if lang == "python":
+            return cls._strip_python_comments_tokenize(content).strip()
+        if lang == "cstyle":
+            return cls._strip_cstyle_comments(content).strip()
+
+        spec = cls.LANG_SPECS.get(lang)
+        if not spec:
+            return content.strip()
+
+        cleaned = content
+        for pat in spec.patterns:
+            cleaned = pat.sub("", cleaned)
+        return cleaned.strip()
+
    @staticmethod
-    def compress(content):
-        """Compress code using zlib."""
-        return zlib.compress(content.encode())
+    def compress(content: str, level: int = 9) -> bytes:
+        """Compress code using zlib. Returns bytes."""
+        return zlib.compress(content.encode("utf-8"), level)
+
+    @staticmethod
+    def decompress(blob: bytes) -> str:
+        """Decompress zlib-compressed code back to text."""
+        return zlib.decompress(blob).decode("utf-8")
--- a/tests/init.py
+++ b/tests/init.py
--- a/tests/unit/init.py
+++ b/tests/unit/init.py
--- a/tests/unit/test_arc.py
+++ b/tests/unit/test_arc.py
@@ -0,0 +1,190 @@
+# tests/unit/test_arc.py
+import io
+import os
+import sys
+import tempfile
+import unittest
+from contextlib import redirect_stdout
+
+# Ensure project root is on sys.path when running via discover
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+if PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, PROJECT_ROOT)
+
+from code_processor import CodeProcessor
+from directory_handler import DirectoryHandler
+
+
+class TestCodeProcessor(unittest.TestCase):
+    def test_python_comment_and_docstring_stripping(self):
+        src = '''\
+"""module docstring should go away"""
+
+# a comment
+x = 1  # inline comment
+y = "string with # not a comment"
+
+def f():
+    """function docstring should go away"""
+    s = """triple quoted but not a docstring"""
+    return x
+'''
+        out = CodeProcessor.remove_comments(src, ".py")
+        self.assertNotIn("module docstring", out)
+        self.assertNotIn("function docstring", out)
+        self.assertNotIn("# a comment", out)
+        # regex instead of assertIn: tokenize.untokenize may change spacing around '='
+        self.assertRegex(out, r'y\s*=\s*"string with # not a comment"')
+        self.assertIn('triple quoted but not a docstring', out)
+
+    def test_cstyle_comment_stripping(self):
+        src = '''\
+// line comment
+int main() {
+  /* block
+     comment */
+  int x = 42; // end comment
+  const char* s = "/* not a comment here */";
+  return x;
+}
+'''
+        out = CodeProcessor.remove_comments(src, ".c")
+        # leading line comment and block comment are gone (the trailing '// end comment' is not asserted)
+        self.assertNotIn("// line comment", out)
+        self.assertNotIn("block\n     comment", out)
+        # comment-like text inside a string literal must survive unchanged
+        self.assertIn('const char* s = "/* not a comment here */";', out)
+
+    def test_hash_comment_stripping(self):
+        src = """\
+# top comment
+KEY=value  # trailing comment should be kept by default
+plain: value
+"""
+        out = CodeProcessor.remove_comments(src, ".yml")
+        # The hash regex removes whole lines that start with optional whitespace then '#'
+        self.assertNotIn("top comment", out)
+        # Trailing '# ...' fragments are meant to survive; only the 'KEY=value' prefix is asserted here
+        self.assertIn("KEY=value", out)
+        self.assertIn("plain: value", out)
+
+    def test_jinja_comment_stripping(self):
+        src = """\
+{# top jinja comment #}
+Hello {{ name }}!
+{#
+  multi-line
+  jinja comment
+#}
+Body text and {{ value }}.
+"""
+        out = CodeProcessor.remove_comments(src, ".j2")
+        self.assertNotIn("top jinja comment", out)
+        self.assertNotIn("multi-line", out)
+        # Template text and {{ ... }} expressions are left untouched
+        self.assertIn("Hello {{ name }}!", out)
+        self.assertIn("Body text and {{ value }}.", out)
+
+    def test_unknown_extension_returns_stripped(self):
+        src = "  x = 1  # not removed for unknown  "
+        out = CodeProcessor.remove_comments(src, ".unknown")
+        self.assertEqual(out, "x = 1  # not removed for unknown")
+
+    def test_compress_decompress_roundtrip(self):
+        src = "def x():\n    return 42\n"
+        blob = CodeProcessor.compress(src)
+        self.assertIsInstance(blob, (bytes, bytearray))
+        back = CodeProcessor.decompress(blob)
+        self.assertEqual(src, back)
+
+
+class TestDirectoryHandler(unittest.TestCase):
+    def test_is_binary_file(self):
+        with tempfile.NamedTemporaryFile(delete=False) as tf:
+            tf.write(b"\x00\x01\x02BINARY")
+            path = tf.name
+        try:
+            self.assertTrue(DirectoryHandler.is_binary_file(path))
+        finally:
+            os.remove(path)
+
+    def test_gitignore_matching(self):
+        with tempfile.TemporaryDirectory() as root:
+            # Put a .gitignore (ignoring build/ and *.log) in subdirectory "a" of the scanned root
+            gi_dir = os.path.join(root, "a")
+            os.makedirs(gi_dir, exist_ok=True)
+            with open(os.path.join(gi_dir, ".gitignore"), "w") as f:
+                f.write("build/\n*.log\n")
+
+            # Two files that should match the patterns and one that should not
+            os.makedirs(os.path.join(gi_dir, "build"), exist_ok=True)
+            ignored_dir_file = os.path.join(gi_dir, "build", "x.txt")
+            with open(ignored_dir_file, "w") as f:
+                f.write("ignored")
+            ignored_log = os.path.join(gi_dir, "debug.log")
+            with open(ignored_log, "w") as f:
+                f.write("ignored log")
+            kept_file = os.path.join(gi_dir, "src.txt")
+            with open(kept_file, "w") as f:
+                f.write("keep me")
+
+            gi_data = DirectoryHandler.load_gitignore_patterns(root)
+
+            self.assertTrue(DirectoryHandler.is_gitignored(ignored_dir_file, gi_data))
+            self.assertTrue(DirectoryHandler.is_gitignored(ignored_log, gi_data))
+            self.assertFalse(DirectoryHandler.is_gitignored(kept_file, gi_data))
+
+    def test_should_print_file_filters_hidden_and_types(self):
+        with tempfile.TemporaryDirectory() as root:
+            hidden = os.path.join(root, ".hidden.txt")
+            plain = os.path.join(root, "keep.py")
+            with open(hidden, "w") as f:
+                f.write("data")
+            with open(plain, "w") as f:
+                f.write("print('hi')")
+
+            self.assertFalse(
+                DirectoryHandler.should_print_file(
+                    hidden,
+                    file_types=[".py"],
+                    ignore_file_strings=[],
+                    ignore_hidden=True,
+                    path_contains=[],
+                    content_contains=[],
+                )
+            )
+            self.assertTrue(
+                DirectoryHandler.should_print_file(
+                    plain,
+                    file_types=[".py"],
+                    ignore_file_strings=[],
+                    ignore_hidden=True,
+                    path_contains=[],
+                    content_contains=[],
+                )
+            )
+
+    def test_print_file_content_no_comments_and_compress(self):
+        with tempfile.TemporaryDirectory() as root:
+            p = os.path.join(root, "t.py")
+            with open(p, "w") as f:
+                f.write("# comment only\nx=1\n")
+            buf = io.StringIO()
+            with redirect_stdout(buf):
+                DirectoryHandler.print_file_content(p, no_comments=True, compress=False)
+            out = buf.getvalue()
+            self.assertIn("<< START:", out)
+            # regex instead of a literal match: the tokenize round-trip may insert spaces around '='
+            self.assertRegex(out, r"x\s*=\s*1")
+            self.assertNotIn("# comment only", out)
+
+            buf = io.StringIO()
+            with redirect_stdout(buf):
+                DirectoryHandler.print_file_content(p, no_comments=True, compress=True)
+            out = buf.getvalue()
+            self.assertIn("COMPRESSED CODE:", out)
+            self.assertIn("<< END >>", out)
+
+
+if __name__ == "__main__":
+    unittest.main()