diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3e781dd
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,17 @@
+# Makefile for ARC
+.PHONY: test install help
+
+help:
+	@echo "Targets:"
+	@echo " make test - Run unit tests"
+	@echo " make install - Show how to install via Kevin's Package Manager"
+
+test:
+	@python -m unittest discover -s tests -p "test_*.py" -t .
+
+install:
+	@echo "ARC is distributed via Kevin's Package Manager."
+	@echo "Install it with:"
+	@echo " package-manager install arc"
+	@echo ""
+	@echo "(This 'make install' target only prints these instructions; it does not install anything.)"
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/code_processor.py b/code_processor.py
index ff81cd3..7a89e24 100644
--- a/code_processor.py
+++ b/code_processor.py
@@ -1,54 +1,285 @@
 import re
 import zlib
+from dataclasses import dataclass
+from typing import Dict, Tuple, Pattern, Optional
+import io
+import tokenize
+
+
+@dataclass(frozen=True)
+class LanguageSpec:
+    """Holds compiled comment patterns for a language."""
+    patterns: Tuple[Pattern, ...]
+
 
 class CodeProcessor:
-    PYTHON = ".py"
-    JS = ".js"
-    C = ".c"
-    CPP = ".cpp"
-    H = ".h"
-    BASH = ".sh"
-    SHELL = ".bash"
+    """
+    Utilities to strip comments and (de)compress code.
+    - Python: tokenize-based (safe) with precise docstring removal.
+    - C/CPP/JS: state-machine comment stripper that respects string/char literals.
+    - Shell/YAML: remove full-line hash comments only.
+    - Jinja: remove {# ... #} blocks.
+    """
+    # File extensions (normalized to lowercase)
+    EXT_TO_LANG: Dict[str, str] = {
+        ".py": "python",
+        ".js": "cstyle",
+        ".c": "cstyle",
+        ".cpp": "cstyle",
+        ".h": "cstyle",
+        ".sh": "hash",
+        ".bash": "hash",
+        ".yml": "hash",
+        ".yaml": "hash",
+        ".j2": "jinja",
+        ".jinja": "jinja",
+        ".jinja2": "jinja",
+        ".tpl": "jinja",
+    }
+
+    # Regex-based specs for hash and jinja
+    _HASH = LanguageSpec(patterns=(
+        re.compile(r"^\s*#.*$", flags=re.MULTILINE),  # only full-line comments
+    ))
+    _JINJA = LanguageSpec(patterns=(
+        re.compile(r"\{#.*?#\}", flags=re.DOTALL),  # {# ... #} across lines
+    ))
+
+    LANG_SPECS: Dict[str, LanguageSpec] = {
+        "hash": _HASH,
+        "jinja": _JINJA,
+        # "cstyle" handled by a state machine, not regex
+        # "python" handled by tokenize, not regex
+    }
+
+    @classmethod
+    def _lang_from_ext(cls, file_type: str) -> Optional[str]:
+        """Map an extension like '.py' to an internal language key."""
+        ext = file_type.lower().strip()
+        return cls.EXT_TO_LANG.get(ext)
+
+    # -----------------------------
+    # Python stripping via tokenize
+    # -----------------------------
+    @staticmethod
+    def _strip_python_comments_tokenize(content: str) -> str:
+        """
+        Remove comments and docstrings safely using tokenize.
+        Rules:
+          - Drop COMMENT tokens.
+          - Drop the module docstring only if it is the very first statement at col 0.
+          - Drop the first STRING statement of any suite opened by a block
+            header (':' NEWLINE INDENT); this covers def/class docstrings.
+ """ + tokens = tokenize.generate_tokens(io.StringIO(content).readline) + out_tokens = [] + + indent_level = 0 + module_docstring_candidate = True # until we see first real stmt at module level + expect_suite_docstring = False # just entered a suite after def/class + last_was_colon = False + seen_nontrivial_in_line = False # guards module docstring (start of logical line) + + for tok_type, tok_str, start, end, line in tokens: + # Track indentation + if tok_type == tokenize.INDENT: + indent_level += 1 + elif tok_type == tokenize.DEDENT: + indent_level = max(0, indent_level - 1) + + # New logical line: reset guard + if tok_type in (tokenize.NEWLINE, tokenize.NL): + seen_nontrivial_in_line = False + out_tokens.append((tok_type, tok_str)) + continue + + # Comments are dropped + if tok_type == tokenize.COMMENT: + continue + + # Detect ':' ending a def/class header + if tok_type == tokenize.OP and tok_str == ":": + last_was_colon = True + out_tokens.append((tok_type, tok_str)) + continue + + # After ':' + NEWLINE + INDENT comes a suite start -> allow docstring removal + if tok_type == tokenize.INDENT and last_was_colon: + expect_suite_docstring = True + last_was_colon = False + out_tokens.append((tok_type, tok_str)) + continue + # Any non-INDENT token clears the last_was_colon flag + if tok_type != tokenize.NL: + last_was_colon = False + + # STRING handling + if tok_type == tokenize.STRING: + at_line_start = (start[1] == 0) and not seen_nontrivial_in_line + if indent_level == 0: + # Potential module docstring only if first statement at col 0 + if module_docstring_candidate and at_line_start: + module_docstring_candidate = False + # drop it + continue + # Any other top-level string is normal + module_docstring_candidate = False + out_tokens.append((tok_type, tok_str)) + seen_nontrivial_in_line = True + continue + else: + # In a suite: if it's the first statement after def/class, drop regardless of column + if expect_suite_docstring: + expect_suite_docstring = False + # drop it + continue + expect_suite_docstring = False + out_tokens.append((tok_type, tok_str)) + seen_nontrivial_in_line = True + continue + + # Any other significant token disables module-docstring candidacy + if tok_type not in (tokenize.INDENT, tokenize.DEDENT): + if indent_level == 0: + module_docstring_candidate = False + # Mark we've seen something on this line + if tok_type not in (tokenize.NL, tokenize.NEWLINE): + seen_nontrivial_in_line = True + + out_tokens.append((tok_type, tok_str)) + + return tokenize.untokenize(out_tokens) + + # --------------------------------- + # C-style stripping via state machine + # --------------------------------- + @staticmethod + def _strip_cstyle_comments(content: str) -> str: + """ + Remove // line comments and /* ... */ block comments while preserving + string ("...") and char ('...') literals and their escape sequences. 
+ """ + i = 0 + n = len(content) + out = [] + in_line_comment = False + in_block_comment = False + in_string = False + in_char = False + escape = False + + while i < n: + c = content[i] + nxt = content[i + 1] if i + 1 < n else "" + + # If inside line comment: consume until newline + if in_line_comment: + if c == "\n": + in_line_comment = False + out.append(c) + i += 1 + continue + + # If inside block comment: consume until '*/' + if in_block_comment: + if c == "*" and nxt == "/": + in_block_comment = False + i += 2 + else: + i += 1 + continue + + # If inside string literal + if in_string: + out.append(c) + if escape: + escape = False + else: + if c == "\\": + escape = True + elif c == '"': + in_string = False + i += 1 + continue + + # If inside char literal + if in_char: + out.append(c) + if escape: + escape = False + else: + if c == "\\": + escape = True + elif c == "'": + in_char = False + i += 1 + continue + + # Not in any special state: + # Check for start of comments + if c == "/" and nxt == "/": + in_line_comment = True + i += 2 + continue + if c == "/" and nxt == "*": + in_block_comment = True + i += 2 + continue + + # Check for start of string/char literals + if c == '"': + in_string = True + out.append(c) + i += 1 + continue + if c == "'": + in_char = True + out.append(c) + i += 1 + continue + + # Normal character + out.append(c) + i += 1 + + return "".join(out) + + # ------------------- + # Public API + # ------------------- + @classmethod + def remove_comments(cls, content: str, file_type: str) -> str: + """ + Remove comments based on file type/extension. + - Python: tokenize-based + - C/CPP/JS: state-machine + - Hash (sh/yaml): regex full-line + - Jinja: regex {# ... #} + """ + lang = cls._lang_from_ext(file_type) + if lang is None: + return content.strip() + + if lang == "python": + return cls._strip_python_comments_tokenize(content).strip() + if lang == "cstyle": + return cls._strip_cstyle_comments(content).strip() + + spec = cls.LANG_SPECS.get(lang) + if not spec: + return content.strip() + + cleaned = content + for pat in spec.patterns: + cleaned = pat.sub("", cleaned) + return cleaned.strip() @staticmethod - def remove_comments(content, file_type): - """Remove comments based on file type.""" - comment_patterns = { - CodeProcessor.PYTHON: [ - (r'\s*#.*', '', 0), - (r'\"\"\"(.*?)\"\"\"', '', re.DOTALL), - (r"\'\'\'(.*?)\'\'\'", '', re.DOTALL) - ], - CodeProcessor.JS: [ - (r'\s*//.*', '', 0), - (r'/\*.*?\*/', '', 0) - ], - CodeProcessor.C: [ - (r'\s*//.*', '', 0), - (r'/\*.*?\*/', '', 0) - ], - CodeProcessor.CPP: [ - (r'\s*//.*', '', 0), - (r'/\*.*?\*/', '', 0) - ], - CodeProcessor.H: [ - (r'\s*//.*', '', 0), - (r'/\*.*?\*/', '', 0) - ], - CodeProcessor.BASH: [ - (r'\s*#.*', '', 0) - ], - CodeProcessor.SHELL: [ - (r'\s*#.*', '', 0) - ] - } - - patterns = comment_patterns.get(file_type, []) - for pattern, repl, flags in patterns: - content = re.sub(pattern, repl, content, flags=flags) - return content.strip() + def compress(content: str, level: int = 9) -> bytes: + """Compress code using zlib. 
+        return zlib.compress(content.encode("utf-8"), level)
 
     @staticmethod
-    def compress(content):
-        """Compress code using zlib."""
-        return zlib.compress(content.encode())
\ No newline at end of file
+    def decompress(blob: bytes) -> str:
+        """Decompress zlib-compressed code back to text."""
+        return zlib.decompress(blob).decode("utf-8")
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/test_arc.py b/tests/unit/test_arc.py
new file mode 100644
index 0000000..db25775
--- /dev/null
+++ b/tests/unit/test_arc.py
@@ -0,0 +1,190 @@
+# tests/unit/test_arc.py
+import io
+import os
+import sys
+import tempfile
+import unittest
+from contextlib import redirect_stdout
+
+# Ensure project root is on sys.path when running via discover
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+if PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, PROJECT_ROOT)
+
+from code_processor import CodeProcessor
+from directory_handler import DirectoryHandler
+
+
+class TestCodeProcessor(unittest.TestCase):
+    def test_python_comment_and_docstring_stripping(self):
+        src = '''\
+"""module docstring should go away"""
+
+# a comment
+x = 1 # inline comment
+y = "string with # not a comment"
+
+def f():
+    """function docstring should go away"""
+    s = """triple quoted but not a docstring"""
+    return x
+'''
+        out = CodeProcessor.remove_comments(src, ".py")
+        self.assertNotIn("module docstring", out)
+        self.assertNotIn("function docstring", out)
+        self.assertNotIn("# a comment", out)
+        # tolerate whitespace normalization from tokenize.untokenize
+        self.assertRegex(out, r'y\s*=\s*"string with # not a comment"')
+        self.assertIn('triple quoted but not a docstring', out)
+
+    def test_cstyle_comment_stripping(self):
+        src = '''\
+// line comment
+int main() {
+    /* block
+ comment */
+    int x = 42; // end comment
+    const char* s = "/* not a comment here */";
+    return x;
+}
+'''
+        out = CodeProcessor.remove_comments(src, ".c")
+        # line comment and block comment gone
+        self.assertNotIn("// line comment", out)
+        self.assertNotIn("block\n comment", out)
+        # string content with /* */ inside should remain
+        self.assertIn('const char* s = "/* not a comment here */";', out)
+
+    def test_hash_comment_stripping(self):
+        src = """\
+# top comment
+KEY=value # trailing comment should be kept by default
+plain: value
+"""
+        out = CodeProcessor.remove_comments(src, ".yml")
+        # Our regex removes full lines starting with optional spaces then '#'
+        self.assertNotIn("top comment", out)
+        # It does not remove trailing fragments after content for hash style
+        self.assertIn("KEY=value", out)
+        self.assertIn("plain: value", out)
+
+    def test_jinja_comment_stripping(self):
+        src = """\
+{# top jinja comment #}
+Hello {{ name }}!
+{#
+   multi-line
+   jinja comment
+#}
+Body text and {{ value }}.
+""" + out = CodeProcessor.remove_comments(src, ".j2") + self.assertNotIn("top jinja comment", out) + self.assertNotIn("multi-line", out) + # Regular content and expressions remain + self.assertIn("Hello {{ name }}!", out) + self.assertIn("Body text and {{ value }}.", out) + + def test_unknown_extension_returns_stripped(self): + src = " x = 1 # not removed for unknown " + out = CodeProcessor.remove_comments(src, ".unknown") + self.assertEqual(out, "x = 1 # not removed for unknown") + + def test_compress_decompress_roundtrip(self): + src = "def x():\n return 42\n" + blob = CodeProcessor.compress(src) + self.assertIsInstance(blob, (bytes, bytearray)) + back = CodeProcessor.decompress(blob) + self.assertEqual(src, back) + + +class TestDirectoryHandler(unittest.TestCase): + def test_is_binary_file(self): + with tempfile.NamedTemporaryFile(delete=False) as tf: + tf.write(b"\x00\x01\x02BINARY") + path = tf.name + try: + self.assertTrue(DirectoryHandler.is_binary_file(path)) + finally: + os.remove(path) + + def test_gitignore_matching(self): + with tempfile.TemporaryDirectory() as root: + # Create .gitignore ignoring build/ and *.log + gi_dir = os.path.join(root, "a") + os.makedirs(gi_dir, exist_ok=True) + with open(os.path.join(gi_dir, ".gitignore"), "w") as f: + f.write("build/\n*.log\n") + + # Files + os.makedirs(os.path.join(gi_dir, "build"), exist_ok=True) + ignored_dir_file = os.path.join(gi_dir, "build", "x.txt") + with open(ignored_dir_file, "w") as f: + f.write("ignored") + ignored_log = os.path.join(gi_dir, "debug.log") + with open(ignored_log, "w") as f: + f.write("ignored log") + kept_file = os.path.join(gi_dir, "src.txt") + with open(kept_file, "w") as f: + f.write("keep me") + + gi_data = DirectoryHandler.load_gitignore_patterns(root) + + self.assertTrue(DirectoryHandler.is_gitignored(ignored_dir_file, gi_data)) + self.assertTrue(DirectoryHandler.is_gitignored(ignored_log, gi_data)) + self.assertFalse(DirectoryHandler.is_gitignored(kept_file, gi_data)) + + def test_should_print_file_filters_hidden_and_types(self): + with tempfile.TemporaryDirectory() as root: + hidden = os.path.join(root, ".hidden.txt") + plain = os.path.join(root, "keep.py") + with open(hidden, "w") as f: + f.write("data") + with open(plain, "w") as f: + f.write("print('hi')") + + self.assertFalse( + DirectoryHandler.should_print_file( + hidden, + file_types=[".py"], + ignore_file_strings=[], + ignore_hidden=True, + path_contains=[], + content_contains=[], + ) + ) + self.assertTrue( + DirectoryHandler.should_print_file( + plain, + file_types=[".py"], + ignore_file_strings=[], + ignore_hidden=True, + path_contains=[], + content_contains=[], + ) + ) + + def test_print_file_content_no_comments_and_compress(self): + with tempfile.TemporaryDirectory() as root: + p = os.path.join(root, "t.py") + with open(p, "w") as f: + f.write("# comment only\nx=1\n") + buf = io.StringIO() + with redirect_stdout(buf): + DirectoryHandler.print_file_content(p, no_comments=True, compress=False) + out = buf.getvalue() + self.assertIn("<< START:", out) + # be whitespace-tolerant (tokenize may insert spaces) + self.assertRegex(out, r"x\s*=\s*1") + self.assertNotIn("# comment only", out) + + buf = io.StringIO() + with redirect_stdout(buf): + DirectoryHandler.print_file_content(p, no_comments=True, compress=True) + out = buf.getvalue() + self.assertIn("COMPRESSED CODE:", out) + self.assertIn("<< END >>", out) + + +if __name__ == "__main__": + unittest.main()