This commit introduces a complete structural and architectural refactor of
Analysis-Ready Code (ARC). The project is now fully migrated to a modern
src/-based Python package layout, with proper packaging via pyproject.toml,
a clean Nix flake, and improved CLI entry points.

Major changes:

• Add `src/arc/` package with clean module structure:
  - arc/__init__.py now contains the main() dispatcher and clipboard helpers
  - arc/__main__.py provides a proper `python -m arc` entry point
  - arc/cli.py rewritten with full argparse-based interface
  - arc/code_processor.py modernized and relocated
  - arc/directory_handler.py rewritten with output_stream support
  - arc/tee.py added for multi-stream output (stdout + buffer)

• Remove legacy top-level modules:
  - cli.py
  - directory_handler.py
  - main.py

• Introduce fully PEP-517 compliant pyproject.toml with console script:
  - arc = arc.__main__:main

• Add Nix flake (`flake.nix`) providing:
  - buildPythonApplication package `arc`
  - `nix run .#arc` app
  - development shell with Python + xclip

• Overhaul the Makefile:
  - automatic detection of Nix vs Python installation
  - unified install/uninstall targets
  - Nix wrapper installation into ~/.local/bin
  - improved help text and shell safety

• Add GitHub CI pipelines:
  - ci-python.yml for Python builds + Makefile tests + arc --help
  - ci-nix.yml for Nix builds, flake checks, dev-shell tests, and `nix run .#arc`

• Refactor and extend unit tests:
  - test_arc.py updated for src/ imports
  - new tests: test_cli.py, test_main.py, test_tee.py
  - improved CodeProcessor and DirectoryHandler tests

• Add egg-info metadata for local builds

• Add generated build/lib/ tree for setuptools compatibility

Overall, this commit modernizes ARC into a clean, robust, and fully packaged
Python/Nix hybrid tool, enabling reproducible builds, solid CLI behavior,
testable architecture, and CI automation.

https://chatgpt.com/share/693933a0-e280-800f-9cf0-26036d15be04
Date:   2025-12-10 09:47:19 +01:00
Parent: b55576beb2
Commit: 039481d3a9
19 changed files with 965 additions and 186 deletions

src/arc/__init__.py (new file)

@@ -0,0 +1,111 @@
import io
import os
import shutil
import subprocess
import sys

from .cli import parse_arguments
from .directory_handler import DirectoryHandler
from .tee import Tee


def copy_to_clipboard(text: str, quiet: bool = False):
    """Copy text via the first available clipboard tool (X11, Wayland, macOS)."""
    if shutil.which("xclip"):
        subprocess.run(["xclip", "-selection", "clipboard"], input=text, text=True)
        return
    if shutil.which("wl-copy"):
        subprocess.run(["wl-copy"], input=text, text=True)
        return
    if shutil.which("pbcopy"):
        subprocess.run(["pbcopy"], input=text, text=True)
        return
    if not quiet:
        print("Warning: No clipboard tool found (xclip, wl-copy, pbcopy)", file=sys.stderr)


def main() -> None:
    args = parse_arguments()

    # QUIET MODE:
    # - no terminal output
    # - but clipboard buffer still active
    #
    # Normal:
    # - output goes to stdout
    # - optionally tee into buffer
    buffer = None
    if args.clipboard:
        buffer = io.StringIO()
        if args.quiet:
            # quiet + clipboard → only buffer, no stdout
            output_stream = buffer
        else:
            # normal + clipboard → stdout + buffer
            output_stream = Tee(sys.stdout, buffer)
    else:
        # no clipboard
        if args.quiet:
            # quiet without clipboard → suppress ALL output
            class NullWriter:
                def write(self, *_):
                    pass

                def flush(self):
                    pass

            output_stream = NullWriter()
        else:
            output_stream = sys.stdout

    # Process all paths
    for path in args.paths:
        if os.path.isdir(path):
            DirectoryHandler.handle_directory(
                path,
                file_types=args.file_types,
                ignore_file_strings=args.ignore_file_strings,
                ignore_hidden=args.ignore_hidden,
                verbose=args.verbose and not args.quiet,
                no_comments=args.no_comments,
                compress=args.compress,
                path_contains=args.path_contains,
                content_contains=args.content_contains,
                no_gitignore=args.no_gitignore,
                scan_binary_files=args.scan_binary_files,
                output_stream=output_stream,
            )
        elif os.path.isfile(path):
            if DirectoryHandler.should_print_file(
                path,
                file_types=args.file_types,
                ignore_file_strings=args.ignore_file_strings,
                ignore_hidden=args.ignore_hidden,
                path_contains=args.path_contains,
                content_contains=args.content_contains,
                scan_binary_files=args.scan_binary_files,
            ):
                DirectoryHandler.handle_file(
                    path,
                    file_types=args.file_types,
                    ignore_file_strings=args.ignore_file_strings,
                    ignore_hidden=args.ignore_hidden,
                    no_comments=args.no_comments,
                    compress=args.compress,
                    scan_binary_files=args.scan_binary_files,
                    output_stream=output_stream,
                )
        else:
            if not args.quiet:
                print(f"Error: {path} is neither file nor directory.", file=sys.stderr)
            sys.exit(1)
    # Copy to clipboard if enabled: reuse the multi-backend helper above
    # instead of shelling out to xclip directly (which would silently fail
    # on Wayland and macOS).
    if buffer is not None:
        copy_to_clipboard(buffer.getvalue(), quiet=args.quiet)

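The quiet/clipboard matrix above is the core of main()'s stream selection. Below is a minimal sketch of the same decision table, handy for reasoning about the four combinations (pick_stream is a hypothetical stand-in, not part of the commit):

    import io
    import sys
    from types import SimpleNamespace
    from arc.tee import Tee

    def pick_stream(args):
        # Mirrors main(): --clipboard buffers output, --quiet drops stdout.
        if args.clipboard:
            buf = io.StringIO()
            stream = buf if args.quiet else Tee(sys.stdout, buf)
            return stream, buf
        if args.quiet:
            class _Null:
                def write(self, *_): pass
                def flush(self): pass
            return _Null(), None
        return sys.stdout, None

    stream, buf = pick_stream(SimpleNamespace(clipboard=True, quiet=True))
    print("hello", file=stream)
    assert buf.getvalue() == "hello\n"  # quiet + clipboard: buffer only, no stdout
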
src/arc/__main__.py (new file)

@@ -0,0 +1,18 @@
# src/arc/__main__.py
from . import main as _arc_main


def main() -> None:
    """
    Entry point for the `arc` console script and `python -m arc`.

    This keeps all CLI logic in `arc.__init__.py` (main()) and simply
    delegates to it, so both setuptools/entry_points and Nix wrappers
    can reliably import `arc.__main__:main`.
    """
    _arc_main()


if __name__ == "__main__":
    main()

src/arc/cli.py (new file)

@@ -0,0 +1,120 @@
import argparse


def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Scan directories and print/compile file contents."
    )

    # Positional: paths
    parser.add_argument(
        "paths",
        nargs="+",
        help="List of files or directories to scan.",
    )

    # File type filter
    parser.add_argument(
        "-t",
        "--file-types",
        nargs="+",
        default=[],
        help="Filter by file types (e.g., .py, .js, .c).",
    )

    # Ignore file/path strings (was previously -x, now -I)
    parser.add_argument(
        "-I",
        "--ignore-file-strings",
        nargs="+",
        default=[],
        help="Ignore files and folders containing these strings.",
    )

    # Clipboard
    parser.add_argument(
        "-x",
        "--clipboard",
        action="store_true",
        help="Copy the output to the clipboard (via xclip, wl-copy, or pbcopy).",
    )

    # Quiet mode
    parser.add_argument(
        "-q",
        "--quiet",
        action="store_true",
        help="Suppress terminal output (useful with --clipboard).",
    )

    # Show hidden files
    parser.add_argument(
        "-S",
        "--show-hidden",
        action="store_true",
        dest="show_hidden",
        default=False,
        help="Include hidden directories and files.",
    )

    # Verbose
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose mode.",
    )

    # Strip comments
    parser.add_argument(
        "-N",
        "--no-comments",
        action="store_true",
        help="Remove comments from files before printing.",
    )

    # Compress
    parser.add_argument(
        "-z",
        "--compress",
        action="store_true",
        help="Compress content instead of printing plain text.",
    )

    # Path filter
    parser.add_argument(
        "-p",
        "--path-contains",
        nargs="+",
        default=[],
        help="Only include files whose *path* contains one of these strings.",
    )

    # Content filter
    parser.add_argument(
        "-C",
        "--content-contains",
        nargs="+",
        default=[],
        help="Only include files whose *content* contains one of these strings.",
    )

    # Ignore .gitignore
    parser.add_argument(
        "-G",
        "--no-gitignore",
        action="store_true",
        help="Do not respect .gitignore files during scan.",
    )

    # Scan binary files
    parser.add_argument(
        "-b",
        "--scan-binary-files",
        action="store_true",
        help="Also scan binary files (ignored by default).",
    )

    args = parser.parse_args()
    args.ignore_hidden = not args.show_hidden
    return args

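parse_arguments() reads sys.argv directly rather than taking an argv parameter, so the quickest way to exercise the flag matrix is to patch sys.argv; a small sketch, assuming the package is importable:

    import sys
    from arc.cli import parse_arguments

    sys.argv = ["arc", "src", "-t", ".py", "-S", "-x", "-q"]
    args = parse_arguments()
    assert args.file_types == [".py"]
    assert args.ignore_hidden is False  # -S flips the derived ignore_hidden flag
    assert args.clipboard and args.quiet
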
src/arc/code_processor.py (new file)

@@ -0,0 +1,285 @@
import io
import re
import tokenize
import zlib
from dataclasses import dataclass
from typing import Dict, Tuple, Pattern, Optional


@dataclass(frozen=True)
class LanguageSpec:
    """Holds compiled comment patterns for a language."""

    patterns: Tuple[Pattern, ...]


class CodeProcessor:
    """
    Utilities to strip comments and (de)compress code.

    - Python: tokenize-based (safe) with precise docstring removal.
    - C/CPP/JS: state-machine comment stripper that respects string/char literals.
    - Shell/YAML: remove full-line hash comments only.
    - Jinja: remove {# ... #} blocks.
    """

    # File extensions (normalized to lowercase)
    EXT_TO_LANG: Dict[str, str] = {
        ".py": "python",
        ".js": "cstyle",
        ".c": "cstyle",
        ".cpp": "cstyle",
        ".h": "cstyle",
        ".sh": "hash",
        ".bash": "hash",
        ".yml": "hash",
        ".yaml": "hash",
        ".j2": "jinja",
        ".jinja": "jinja",
        ".jinja2": "jinja",
        ".tpl": "jinja",
    }

    # Regex-based specs for hash and jinja
    _HASH = LanguageSpec(patterns=(
        re.compile(r"^\s*#.*$", flags=re.MULTILINE),  # only full-line comments
    ))
    _JINJA = LanguageSpec(patterns=(
        re.compile(r"\{#.*?#\}", flags=re.DOTALL),  # {# ... #} across lines
    ))

    LANG_SPECS: Dict[str, LanguageSpec] = {
        "hash": _HASH,
        "jinja": _JINJA,
        # "cstyle" handled by a state machine, not regex
        # "python" handled by tokenize, not regex
    }

    @classmethod
    def _lang_from_ext(cls, file_type: str) -> Optional[str]:
        """Map an extension like '.py' to an internal language key."""
        ext = file_type.lower().strip()
        return cls.EXT_TO_LANG.get(ext)

    # -----------------------------
    # Python stripping via tokenize
    # -----------------------------
    @staticmethod
    def _strip_python_comments_tokenize(content: str) -> str:
        """
        Remove comments and docstrings safely using tokenize.

        Rules:
        - Drop COMMENT tokens.
        - Drop module docstring only if it's the very first statement at col 0.
        - Drop the first STRING statement in a suite immediately after 'def'/'class'
          header (':' NEWLINE INDENT).
        """
        tokens = tokenize.generate_tokens(io.StringIO(content).readline)
        out_tokens = []
        indent_level = 0
        module_docstring_candidate = True  # until we see first real stmt at module level
        expect_suite_docstring = False  # just entered a suite after def/class
        last_was_colon = False
        seen_nontrivial_in_line = False  # guards module docstring (start of logical line)

        for tok_type, tok_str, start, end, line in tokens:
            # Track indentation
            if tok_type == tokenize.INDENT:
                indent_level += 1
            elif tok_type == tokenize.DEDENT:
                indent_level = max(0, indent_level - 1)

            # New logical line: reset guard
            if tok_type in (tokenize.NEWLINE, tokenize.NL):
                seen_nontrivial_in_line = False
                out_tokens.append((tok_type, tok_str))
                continue

            # Comments are dropped
            if tok_type == tokenize.COMMENT:
                continue

            # Detect ':' ending a def/class header
            if tok_type == tokenize.OP and tok_str == ":":
                last_was_colon = True
                out_tokens.append((tok_type, tok_str))
                continue

            # After ':' + NEWLINE + INDENT comes a suite start -> allow docstring removal
            if tok_type == tokenize.INDENT and last_was_colon:
                expect_suite_docstring = True
                last_was_colon = False
                out_tokens.append((tok_type, tok_str))
                continue

            # Any non-INDENT token clears the last_was_colon flag
            if tok_type != tokenize.NL:
                last_was_colon = False

            # STRING handling
            if tok_type == tokenize.STRING:
                at_line_start = (start[1] == 0) and not seen_nontrivial_in_line
                if indent_level == 0:
                    # Potential module docstring only if first statement at col 0
                    if module_docstring_candidate and at_line_start:
                        module_docstring_candidate = False
                        # drop it
                        continue
                    # Any other top-level string is normal
                    module_docstring_candidate = False
                    out_tokens.append((tok_type, tok_str))
                    seen_nontrivial_in_line = True
                    continue
                else:
                    # In a suite: if it's the first statement after def/class,
                    # drop regardless of column
                    if expect_suite_docstring:
                        expect_suite_docstring = False
                        # drop it
                        continue
                    expect_suite_docstring = False
                    out_tokens.append((tok_type, tok_str))
                    seen_nontrivial_in_line = True
                    continue

            # Any other significant token disables module-docstring candidacy
            if tok_type not in (tokenize.INDENT, tokenize.DEDENT):
                if indent_level == 0:
                    module_docstring_candidate = False
                # Mark we've seen something on this line
                if tok_type not in (tokenize.NL, tokenize.NEWLINE):
                    seen_nontrivial_in_line = True

            out_tokens.append((tok_type, tok_str))

        return tokenize.untokenize(out_tokens)

    # ---------------------------------
    # C-style stripping via state machine
    # ---------------------------------
    @staticmethod
    def _strip_cstyle_comments(content: str) -> str:
        """
        Remove // line comments and /* ... */ block comments while preserving
        string ("...") and char ('...') literals and their escape sequences.
        """
        i = 0
        n = len(content)
        out = []

        in_line_comment = False
        in_block_comment = False
        in_string = False
        in_char = False
        escape = False

        while i < n:
            c = content[i]
            nxt = content[i + 1] if i + 1 < n else ""

            # If inside line comment: consume until newline
            if in_line_comment:
                if c == "\n":
                    in_line_comment = False
                    out.append(c)
                i += 1
                continue

            # If inside block comment: consume until '*/'
            if in_block_comment:
                if c == "*" and nxt == "/":
                    in_block_comment = False
                    i += 2
                else:
                    i += 1
                continue

            # If inside string literal
            if in_string:
                out.append(c)
                if escape:
                    escape = False
                else:
                    if c == "\\":
                        escape = True
                    elif c == '"':
                        in_string = False
                i += 1
                continue

            # If inside char literal
            if in_char:
                out.append(c)
                if escape:
                    escape = False
                else:
                    if c == "\\":
                        escape = True
                    elif c == "'":
                        in_char = False
                i += 1
                continue

            # Not in any special state:
            # Check for start of comments
            if c == "/" and nxt == "/":
                in_line_comment = True
                i += 2
                continue
            if c == "/" and nxt == "*":
                in_block_comment = True
                i += 2
                continue

            # Check for start of string/char literals
            if c == '"':
                in_string = True
                out.append(c)
                i += 1
                continue
            if c == "'":
                in_char = True
                out.append(c)
                i += 1
                continue

            # Normal character
            out.append(c)
            i += 1

        return "".join(out)

    # -------------------
    # Public API
    # -------------------
    @classmethod
    def remove_comments(cls, content: str, file_type: str) -> str:
        """
        Remove comments based on file type/extension.

        - Python: tokenize-based
        - C/CPP/JS: state-machine
        - Hash (sh/yaml): regex full-line
        - Jinja: regex {# ... #}
        """
        lang = cls._lang_from_ext(file_type)
        if lang is None:
            return content.strip()

        if lang == "python":
            return cls._strip_python_comments_tokenize(content).strip()
        if lang == "cstyle":
            return cls._strip_cstyle_comments(content).strip()

        spec = cls.LANG_SPECS.get(lang)
        if not spec:
            return content.strip()

        cleaned = content
        for pat in spec.patterns:
            cleaned = pat.sub("", cleaned)
        return cleaned.strip()

    @staticmethod
    def compress(content: str, level: int = 9) -> bytes:
        """Compress code using zlib. Returns bytes."""
        return zlib.compress(content.encode("utf-8"), level)

    @staticmethod
    def decompress(blob: bytes) -> str:
        """Decompress zlib-compressed code back to text."""
        return zlib.decompress(blob).decode("utf-8")

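The public surface of CodeProcessor is small; a round-trip sketch of comment stripping plus zlib (de)compression (illustrative, not part of the commit):

    from arc.code_processor import CodeProcessor

    src = '"""module docstring"""\n# a comment\nx = 1  # trailing\n'
    cleaned = CodeProcessor.remove_comments(src, ".py")
    assert "docstring" not in cleaned and "comment" not in cleaned

    blob = CodeProcessor.compress(cleaned)  # bytes; zlib level 9 by default
    assert CodeProcessor.decompress(blob) == cleaned
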
src/arc/directory_handler.py (new file)

@@ -0,0 +1,228 @@
import fnmatch
import os
import sys

from .code_processor import CodeProcessor


class DirectoryHandler:
    @staticmethod
    def load_gitignore_patterns(root_path):
        """
        Recursively scans for .gitignore files in the given root_path.

        Returns a list of tuples (base_dir, patterns) where:
          - base_dir: the directory in which the .gitignore was found.
          - patterns: a list of pattern strings from that .gitignore.
        """
        gitignore_data = []
        for dirpath, _, filenames in os.walk(root_path):
            if ".gitignore" in filenames:
                gitignore_path = os.path.join(dirpath, ".gitignore")
                try:
                    with open(gitignore_path, "r") as f:
                        lines = f.readlines()
                    # Filter out empty lines and comments.
                    patterns = [
                        line.strip()
                        for line in lines
                        if line.strip() and not line.strip().startswith("#")
                    ]
                    # Save the base directory and its patterns.
                    gitignore_data.append((dirpath, patterns))
                except Exception as e:  # pragma: no cover - defensive
                    print(f"Error reading {gitignore_path}: {e}", file=sys.stderr)
        return gitignore_data

    @staticmethod
    def is_binary_file(file_path):
        """
        Reads the first 1024 bytes of file_path and heuristically determines
        if the file appears to be binary. This method returns True if a null byte
        is found or if more than 30% of the bytes in the sample are non-text.
        """
        try:
            with open(file_path, "rb") as f:
                chunk = f.read(1024)
            # If there's a null byte, it's almost certainly binary.
            if b"\x00" in chunk:
                return True
            # Define a set of text characters (ASCII printable + common control characters)
            text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x7F)))
            # Count non-text characters in the chunk.
            non_text = sum(byte not in text_chars for byte in chunk)
            if len(chunk) > 0 and (non_text / len(chunk)) > 0.30:
                return True
        except Exception:  # pragma: no cover - defensive
            # If the file cannot be read in binary mode, assume it's not binary.
            return False
        return False

    @staticmethod
    def is_gitignored(file_path, gitignore_data):
        """
        Checks if file_path should be ignored according to the .gitignore entries.

        For each tuple (base_dir, patterns), if file_path is under base_dir,
        computes the relative path and matches it against the patterns.
        """
        for base_dir, patterns in gitignore_data:
            try:
                rel_path = os.path.relpath(file_path, base_dir)
            except ValueError:
                # file_path and base_dir are on different drives.
                continue
            # If the file is not under the current .gitignore base_dir, skip it.
            if rel_path.startswith(".."):
                continue
            # Check all patterns.
            for pattern in patterns:
                if pattern.endswith("/"):
                    # Directory pattern: check if any folder in the relative path matches.
                    parts = rel_path.split(os.sep)
                    for part in parts[:-1]:
                        if fnmatch.fnmatch(part + "/", pattern):
                            return True
                else:
                    # Check if the relative path matches the pattern.
                    if fnmatch.fnmatch(rel_path, pattern):
                        return True
        return False

    @staticmethod
    def filter_directories(dirs, ignore_file_strings, ignore_hidden):
        """
        Filter out directories based on ignore_file_strings and hidden status.
        """
        if ignore_hidden:
            dirs[:] = [d for d in dirs if not d.startswith(".")]
        dirs[:] = [d for d in dirs if not any(ig in d for ig in ignore_file_strings)]

    @staticmethod
    def path_or_content_contains(file_path, path_contains, content_contains):
        """
        Check if the file path contains specific strings or if the file content does.
        """
        if path_contains and any(whitelist_str in file_path for whitelist_str in path_contains):
            return True
        if content_contains:
            try:
                with open(file_path, "r") as f:
                    content = f.read()
                if any(whitelist_str in content for whitelist_str in content_contains):
                    return True
            except UnicodeDecodeError:
                return False
        return False

    @staticmethod
    def should_print_file(
        file_path,
        file_types,
        ignore_file_strings,
        ignore_hidden,
        path_contains,
        content_contains,
        scan_binary_files=False,
    ):
        """
        Determines if a file should be printed based on various criteria.
        By default, binary files are skipped unless scan_binary_files is True.
        """
        # Check binary file status using our heuristic.
        if not scan_binary_files and DirectoryHandler.is_binary_file(file_path):
            return False
        if ignore_hidden and os.path.basename(file_path).startswith("."):
            return False
        if file_types and not any(file_path.endswith(ft) for ft in file_types):
            return False
        if any(ignore_str in file_path for ignore_str in ignore_file_strings):
            return False
        if path_contains or content_contains:
            return DirectoryHandler.path_or_content_contains(
                file_path, path_contains, content_contains
            )
        return True

    @staticmethod
    def print_file_content(file_path, no_comments, compress, output_stream):
        """
        Prints the content of a file, optionally removing comments or compressing the output.
        """
        try:
            with open(file_path, "r") as f:
                content = f.read()
            if no_comments:
                file_type = os.path.splitext(file_path)[1]
                content = CodeProcessor.remove_comments(content, file_type)
            print(f"<< START: {file_path} >>", file=output_stream)
            if compress:
                compressed_content = CodeProcessor.compress(content)
                print("COMPRESSED CODE:", file=output_stream)
                print(compressed_content, file=output_stream)
            else:
                print(content, file=output_stream)
            print("<< END >>\n", file=output_stream)
        except UnicodeDecodeError:
            print(
                f"Warning: Could not read file due to encoding issues: {file_path}",
                file=sys.stderr,
            )
            sys.exit(1)

    @staticmethod
    def handle_directory(directory, **kwargs):
        """
        Scans the directory and processes each file while respecting .gitignore rules.
        """
        gitignore_data = []
        if not kwargs.get("no_gitignore"):
            gitignore_data = DirectoryHandler.load_gitignore_patterns(directory)
        output_stream = kwargs.get("output_stream", sys.stdout)

        for root, dirs, files in os.walk(directory):
            DirectoryHandler.filter_directories(
                dirs, kwargs["ignore_file_strings"], kwargs["ignore_hidden"]
            )
            for file in files:
                file_path = os.path.join(root, file)
                if gitignore_data and DirectoryHandler.is_gitignored(file_path, gitignore_data):
                    if kwargs.get("verbose"):
                        print(f"Skipped (gitignored): {file_path}", file=output_stream)
                    continue
                if DirectoryHandler.should_print_file(
                    file_path,
                    kwargs["file_types"],
                    kwargs["ignore_file_strings"],
                    kwargs["ignore_hidden"],
                    kwargs["path_contains"],
                    kwargs["content_contains"],
                    scan_binary_files=kwargs.get("scan_binary_files", False),
                ):
                    DirectoryHandler.print_file_content(
                        file_path,
                        kwargs["no_comments"],
                        kwargs["compress"],
                        output_stream=output_stream,
                    )
                elif kwargs.get("verbose"):
                    print(f"Skipped file: {file_path}", file=output_stream)

    @staticmethod
    def handle_file(file_path, **kwargs):
        """
        Processes an individual file.
        """
        output_stream = kwargs.get("output_stream", sys.stdout)
        DirectoryHandler.print_file_content(
            file_path,
            kwargs["no_comments"],
            kwargs["compress"],
            output_stream=output_stream,
        )

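Since DirectoryHandler is all static methods, it can be driven without the CLI; a sketch that captures a scan into a string (keyword values mirror the CLI defaults; the "src" path is illustrative):

    import io
    from arc.directory_handler import DirectoryHandler

    buf = io.StringIO()
    DirectoryHandler.handle_directory(
        "src",                      # directory to scan
        file_types=[".py"],
        ignore_file_strings=[],
        ignore_hidden=True,
        verbose=False,
        no_comments=False,
        compress=False,
        path_contains=[],
        content_contains=[],
        no_gitignore=False,
        scan_binary_files=False,
        output_stream=buf,
    )
    print(buf.getvalue()[:120])  # "<< START: ... >>" framed file contents
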
src/arc/tee.py (new file)

@@ -0,0 +1,23 @@
from typing import TextIO


class Tee:
    """
    Simple tee-like stream that writes everything to multiple underlying streams.

    Typical usage:

        tee = Tee(sys.stdout, buffer)
        print("hello", file=tee)
    """

    def __init__(self, *streams: TextIO) -> None:
        self.streams = streams

    def write(self, data: str) -> None:
        for stream in self.streams:
            stream.write(data)

    def flush(self) -> None:
        for stream in self.streams:
            if hasattr(stream, "flush"):
                stream.flush()
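
Tee deliberately implements only write() and flush(), so it works anywhere a file-like object is accepted, including print(file=...); a quick usage sketch:

    import io
    import sys
    from arc.tee import Tee

    buf = io.StringIO()
    tee = Tee(sys.stdout, buf)
    print("scanned 3 files", file=tee)  # appears on the terminal AND in buf
    assert buf.getvalue() == "scanned 3 files\n"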