Skip to content

Commit 85b2bc4

Browse files
committed
Set up initial project structure
Also add bare-bones test which can be run like so: ``` uvx pytest tests/test_postprocess.py ``` as long as `uvx` is in path.
1 parent 8a3dab6 commit 85b2bc4

File tree

18 files changed

+1591
-0
lines changed

18 files changed

+1591
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
**/tests/
1111
/build
1212
*.pyc
13+
**/__pycache__
14+
*.egg-info/
1315
.vagrant
1416
**/compile_commands.json
1517
.python-version

c2rust-postprocess/README.md

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# LLM-based postprocessing of c2rust transpiler output
2+
3+
This is currently a prototype effort to gauge the extent to which LLMs can
4+
accelerate the types of translation and migration that help move C code to Rust.
5+
6+
# Prerequisites
7+
8+
- Python 3.12 or later
9+
- `uv` in path
10+
- A valid `GEMINI_API_KEY` set
11+
- A transpiled codebase with a correct `compile_commands.json`
12+
13+
# Running
14+
15+
- `c2rust-postprocess path/to/compile_commands.json`, or
16+
- `uv run postprocess path/to/compile_commands.json`
17+
18+
# Testing
19+
20+
## Test prerequisites
21+
22+
- `bear` and `c2rust` in path
23+
24+
```
25+
uv run pytest -v
26+
uv run pytest -v tests/test_utils.py # filter tests to run
27+
```
28+
29+
## Misc
30+
31+
- `uv run ruff check --fix .` to lint (use `uv run ruff format .` to format)
32+
33+
# TODOs
34+
35+
- testable prototype
36+
- [x] gemini api support
37+
+ using synchronous API, tabled async API for now
38+
- file-based caching of model responses
39+
+ storage format could be improved to make it easier to create
40+
golden input/output pairs for testing
41+
- pluggable support for getting definitions
42+
- verifying correctness of responses
43+
- filtering by file and function name
44+
- openai model support
45+
- anthropic model support
46+
- openrouter API support?
47+
- non-trivial: use async support to speed up postprocessing
48+
+ supported by the Gemini API; unconfirmed whether the other providers' APIs offer it
49+
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#!/bin/sh
# Thin wrapper around the Python entry point.
# Fix: forward all command-line arguments ("$@" was missing, so e.g. the
# compile_commands.json path was silently dropped); exec avoids a lingering
# shell process.
# NOTE(review): the README documents the entry point as `postprocess`, not
# `postproc` — confirm which name the project actually declares.
exec uv run postproc "$@"
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
"""
2+
c2rust-postprocess: Transfer comments from C functions to Rust functions using LLMs.
3+
"""
4+
5+
import logging
6+
from pathlib import Path
7+
from textwrap import dedent
8+
from typing import Any
9+
10+
from postprocess.cache import AbstractCache
11+
from postprocess.definitions import get_c_comments, get_function_span_pairs
12+
from postprocess.models import get_model_by_id
13+
from postprocess.utils import get_compile_commands, get_rust_files, read_chunk, remove_backticks
14+
15+
from pygments import highlight
16+
from pygments.lexers import RustLexer
17+
from pygments.formatters import TerminalFormatter
18+
19+
# TODO: could also include
20+
# - validation function to check result
21+
# - list of comments to check for
22+
class CommentTransferPrompt:
    """A single comment-transfer request: C source, Rust source, instructions.

    Rendering the object with ``str()`` produces the full prompt text that is
    sent to the model.
    """

    __slots__ = ("c_function", "rust_function", "prompt_text")

    # Declared for type checkers; storage is provided by __slots__.
    c_function: str
    rust_function: str
    prompt_text: str

    def __init__(self, c_function: str, rust_function: str, prompt_text: str) -> None:
        self.c_function = c_function
        self.rust_function = rust_function
        self.prompt_text = prompt_text

    def __str__(self) -> str:
        # Instruction first, then both definitions in fenced code blocks.
        return (
            f"{self.prompt_text}\n\n"
            f"C function:\n```c\n{self.c_function}```\n\n"
            f"Rust function:\n```rust\n{self.rust_function}```\n"
        )
38+
39+
40+
def generate_prompts(
    compile_commands: list[dict[str, Any]], rust_file: Path
) -> list[CommentTransferPrompt]:
    """Build one comment-transfer prompt per matched Rust/C function pair.

    Args:
        compile_commands: Parsed compile_commands.json entries used to locate
            the original C translation units.
        rust_file: Transpiled Rust source file to pair against the C code.

    Returns:
        Prompts for every pair whose C definition contains comments; pairs
        without C comments are skipped since there is nothing to transfer.
    """
    pairs = get_function_span_pairs(compile_commands, rust_file)

    # TODO: make this function take a model and get the prompt from the model.
    # The instruction text is identical for every pair, so build it once here
    # instead of once per loop iteration.
    prompt_text = dedent(
        """
        Transfer the comments from the following C function to the corresponding Rust function.
        Do not add any comments that are not present in the C function.
        Respond with the Rust function definition with the transferred comments; say nothing else.
        """  # noqa: E501
    ).strip()

    prompts = []

    for rust_fn, c_fn in pairs:
        c_def = read_chunk(c_fn["file"], c_fn["start_byte"], c_fn["end_byte"])
        c_comments = get_c_comments(c_def)
        if not c_comments:
            logging.info(f"Skipping C function without comments: {c_fn['name']}")
            continue

        # Emitted only at DEBUG level (was a commented-out print with a TODO).
        logging.debug(f"C function {c_fn['name']} definition:\n{c_def}\n")

        rust_def = read_chunk(
            rust_fn["file"], rust_fn["start_byte"], rust_fn["end_byte"]
        )
        logging.debug(f"Rust function {rust_fn['name']} definition:\n{rust_def}\n")

        prompts.append(
            CommentTransferPrompt(
                c_function=c_def, rust_function=rust_def, prompt_text=prompt_text
            )
        )

    return prompts
78+
79+
80+
# TODO: get from model
# System prompt sent with every request; per the TODO above, a model
# abstraction should eventually supply this instead of a module constant.
SYSTEM_INSTRUCTION = (
    "You are a helpful assistant that transfers comments from C code to Rust code."
)
84+
85+
def transfer_comments(compile_commands_path: Path, cache: AbstractCache) -> None:
    """Transfer C comments into transpiled Rust sources using an LLM.

    Scans the directory containing *compile_commands_path* for Rust sources,
    builds one prompt per matched Rust/C function pair, and prints each model
    response (syntax-highlighted when stdout is a terminal). Responses are
    cached so repeated runs do not re-query the model.

    Args:
        compile_commands_path: Path to compile_commands.json; its parent
            directory is scanned for transpiled Rust files.
        cache: Cache consulted before, and updated after, each model call.
    """
    import sys  # local import: only needed for the isatty() check below

    # TODO: instantiate the model based on command line args
    # TODO: avoid google-specific import here
    from google.genai import types

    model = get_model_by_id(
        "gemini-3-pro-preview",
        generation_config={
            "system_instruction": types.Content(
                role="system",
                parts=[types.Part.from_text(text=SYSTEM_INSTRUCTION)],
            )
        },
    )

    rust_sources = get_rust_files(compile_commands_path.parent)
    compile_commands = get_compile_commands(compile_commands_path)

    for rust_file in rust_sources:
        for prompt in generate_prompts(compile_commands, rust_file):
            messages = [
                {"role": "user", "content": str(prompt)},
            ]

            # Cache hit skips the model call entirely.
            if not (response := cache.lookup(messages)):
                response = model.generate_with_tools(messages)
                if response is None:
                    logging.error("Model returned no response")
                    continue
                cache.update(messages, response)

            response = remove_backticks(response)

            # Fix: the check was hard-coded to `if True:` with no plain-text
            # fallback, so skipping the branch would have raised NameError.
            # Colorize only when stdout is an actual terminal.
            if sys.stdout.isatty():
                output = highlight(response, RustLexer(), TerminalFormatter())
            else:
                output = response

            print("Response:\n", output)
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import argparse
2+
import logging
3+
import sys
4+
from collections.abc import Sequence
5+
6+
from postprocess import transfer_comments
7+
from postprocess.cache import DirectoryCache
8+
from postprocess.utils import existing_file
9+
10+
11+
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the postprocess CLI.

    Positional argument: path to compile_commands.json (validated to exist).
    Optional: --log-level, one of the standard logging level names.
    """
    parser = argparse.ArgumentParser(
        description="Transfer C function comments to Rust using LLMs.",
    )

    parser.add_argument(
        "compile_commands",
        type=existing_file,
        help="Path to compile_commands.json.",
    )
    # type=str and required=False are argparse defaults and were dropped.
    parser.add_argument(
        "--log-level",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        help="Logging level (default: INFO)",
    )

    return parser
31+
32+
33+
def main(argv: Sequence[str] | None = None) -> int:
    """CLI entry point: parse args, configure logging, run the transfer.

    Args:
        argv: Argument list for testing; None means use sys.argv.

    Returns:
        Process exit code (0 on success).
    """
    parser = build_arg_parser()
    args = parser.parse_args(argv)

    # Fix: basicConfig accepts level *names* directly; getLevelName() is
    # documented for level->name conversion, and the choices list already
    # guarantees an uppercase, valid name (so .upper() was redundant too).
    logging.basicConfig(level=args.log_level)

    cache = DirectoryCache()

    transfer_comments(args.compile_commands, cache)

    return 0
44+
45+
46+
if __name__ == "__main__":
    try:
        sys.exit(main())
    except KeyboardInterrupt:  # fix: the bound `e` was unused
        logging.warning("Interrupted by user, terminating...")
        # Fix: previously fell through and exited 0; 130 (128 + SIGINT) is
        # the conventional exit status for Ctrl-C.
        sys.exit(130)
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
from abc import ABC, abstractmethod
2+
from pathlib import Path
3+
from tempfile import gettempdir
4+
from typing import Any
5+
6+
7+
class AbstractCache(ABC):
8+
"""
9+
Abstract base class for caching of LLM interactions.
10+
"""
11+
12+
def __init__(self, path: Path, **kwargs: Any):
13+
self._path = path
14+
self._config = kwargs
15+
16+
@property
17+
def path(self) -> Path:
18+
return self._path
19+
20+
@abstractmethod
21+
def lookup(
22+
self,
23+
messages: list[dict[str, Any]],
24+
25+
) -> str | None:
26+
"""Lookup a cached response for the given messages.
27+
28+
Args:
29+
messages: The list of messages representing the conversation history.
30+
"""
31+
pass
32+
33+
@abstractmethod
34+
def update(
35+
self,
36+
messages: list[dict[str, Any]],
37+
response: str
38+
) -> None:
39+
"""Store a response in the cache for the given messages.
40+
41+
Args:
42+
messages: The list of messages representing the conversation history.
43+
response: The response text to cache.
44+
"""
45+
pass
46+
47+
@abstractmethod
48+
def clear(self) -> None:
49+
"""Clear the entire cache."""
50+
pass
51+
52+
def flush(self) -> None: # noqa: B027
53+
"""
54+
Optional: Persist cache to disk.
55+
Not abstract because not all implementations need it.
56+
"""
57+
pass
58+
59+
60+
class DirectoryCache(AbstractCache):
61+
"""
62+
Cache that stores cached responses in a directory.
63+
If no path is specified, a temporary directory is used.
64+
"""
65+
66+
def __init__(self, path: Path | None = None, **kwargs: Any):
67+
if path is None:
68+
path = Path(gettempdir()) / "c2rust_postprocess"
69+
super().__init__(path, **kwargs)
70+
self._path.mkdir(parents=True, exist_ok=True)
71+
72+
def get_cache_file_name(self, messages: list[dict[str, Any]]) -> Path:
73+
import hashlib
74+
import json
75+
76+
messages_str = json.dumps(messages, sort_keys=True)
77+
hash_digest = hashlib.sha256(messages_str.encode()).hexdigest()
78+
return self._path / f"{hash_digest}.txt"
79+
80+
def lookup(
81+
self,
82+
messages: list[dict[str, Any]],
83+
) -> str | None:
84+
cache_file = self.get_cache_file_name(messages)
85+
86+
if cache_file.exists():
87+
with open(cache_file, encoding='utf-8') as f:
88+
return f.read()
89+
return None
90+
91+
def update(
92+
self,
93+
messages: list[dict[str, Any]],
94+
response: str
95+
) -> None:
96+
cache_file = self.get_cache_file_name(messages)
97+
98+
with open(cache_file, 'w', encoding='utf-8') as f:
99+
f.write(response)
100+
101+
def clear(self) -> None:
102+
self._path.unlink(missing_ok=True)

0 commit comments

Comments
 (0)