"""Command-line front end for the CoRGI reasoning pipeline.

The module parses command-line arguments, loads the input image, runs the
pipeline on the image/question pair, prints the reasoning steps, grounded
evidence and final answer, and optionally writes the result to a JSON file.
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Callable, Optional, TextIO

from PIL import Image

from .pipeline import CoRGIPipeline
from .qwen_client import Qwen3VLClient, QwenGenerationConfig
from .types import GroundedEvidence, ReasoningStep

DEFAULT_MODEL_ID = "Qwen/Qwen3-VL-2B-Instruct"


def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``corgi-cli`` program.

    Returns:
        A parser that accepts ``--image`` and ``--question`` (both required)
        plus the optional ``--max-steps``, ``--max-regions``, ``--model-id``
        and ``--json-out`` flags.
    """
    parser = argparse.ArgumentParser(
        prog="corgi-cli",
        description="Run the CoRGI reasoning pipeline over an image/question pair.",
    )

    # Flags are registered in this order, which is also the order shown in
    # the generated --help output.
    argument_specs = [
        (
            "--image",
            {
                "type": Path,
                "required": True,
                "help": "Path to the input image (jpg/png/etc.)",
            },
        ),
        (
            "--question",
            {
                "type": str,
                "required": True,
                "help": "Visual question for the image",
            },
        ),
        (
            "--max-steps",
            {
                "type": int,
                "default": 3,
                "help": "Maximum number of reasoning steps to request",
            },
        ),
        (
            "--max-regions",
            {
                "type": int,
                "default": 3,
                "help": "Maximum number of grounded regions per visual step",
            },
        ),
        (
            "--model-id",
            {
                "type": str,
                "default": None,
                "help": "Optional override for the Qwen3-VL model identifier",
            },
        ),
        (
            "--json-out",
            {
                "type": Path,
                "default": None,
                "help": "Optional path to write the pipeline result as JSON",
            },
        ),
    ]
    for flag, options in argument_specs:
        parser.add_argument(flag, **options)
    return parser


def _format_step(step: ReasoningStep) -> str:
    """Render one reasoning step as a single human-readable line.

    The line contains the step index, its statement, whether the step needs
    vision and, when the step carries a non-empty reason, that reason.
    """
    if step.needs_vision:
        vision_flag = "yes"
    else:
        vision_flag = "no"

    detail = f"needs vision: {vision_flag}"
    if step.reason:
        detail += f"; reason: {step.reason}"
    return f"[{step.index}] {step.statement} ({detail})"


def _format_evidence_item(evidence: GroundedEvidence) -> str:
    """Render one grounded-evidence record as a ``|``-separated line.

    The bounding-box coordinates are printed with two decimals. The
    description is included only when truthy and the confidence only when
    it is not ``None``.
    """
    coords = [f"{value:.2f}" for value in evidence.bbox]
    segments = [f"Step {evidence.step_index} | bbox=({', '.join(coords)})"]

    if evidence.description:
        segments.append(f"desc: {evidence.description}")
    if evidence.confidence is not None:
        segments.append(f"conf: {evidence.confidence:.2f}")

    return " | ".join(segments)


def _default_pipeline_factory(model_id: Optional[str]) -> CoRGIPipeline:
    """Build a pipeline backed by a Qwen3-VL client.

    Args:
        model_id: Model identifier to use; a falsy value (``None`` or an
            empty string) selects ``DEFAULT_MODEL_ID``.
    """
    resolved_id = model_id if model_id else DEFAULT_MODEL_ID
    generation_config = QwenGenerationConfig(model_id=resolved_id)
    return CoRGIPipeline(vlm_client=Qwen3VLClient(config=generation_config))


def execute_cli(
    *,
    image_path: Path,
    question: str,
    max_steps: int,
    max_regions: int,
    model_id: Optional[str],
    json_out: Optional[Path],
    pipeline_factory: Callable[[Optional[str]], CoRGIPipeline] | None = None,
    output_stream: TextIO | None = None,
) -> None:
    """Run the pipeline on one image/question pair and report the result.

    Args:
        image_path: Location of the image to open; it is converted to RGB.
        question: Question passed to the pipeline and echoed in the report.
        max_steps: Forwarded to ``pipeline.run`` as ``max_steps``.
        max_regions: Forwarded to ``pipeline.run`` as ``max_regions``.
        model_id: Passed unchanged to the pipeline factory.
        json_out: When not ``None``, the result's ``to_json()`` payload is
            written there as UTF-8 JSON; missing parent directories are
            created first.
        pipeline_factory: Callable that builds the pipeline from
            ``model_id``; falls back to ``_default_pipeline_factory``.
        output_stream: Destination for the textual report; falls back to
            ``sys.stdout``.
    """
    stream = sys.stdout if output_stream is None else output_stream

    def emit(line: str) -> None:
        print(line, file=stream)

    make_pipeline = pipeline_factory or _default_pipeline_factory

    # The image is loaded before the pipeline is built, so an unreadable
    # image fails before the factory is ever called.
    with Image.open(image_path) as raw_image:
        rgb_image = raw_image.convert("RGB")

    outcome = make_pipeline(model_id).run(
        image=rgb_image,
        question=question,
        max_steps=max_steps,
        max_regions=max_regions,
    )

    emit(f"Question: {question}")

    emit("-- Steps --")
    for reasoning_step in outcome.steps:
        emit(_format_step(reasoning_step))
    if not outcome.steps:
        emit("(no reasoning steps returned)")

    emit("-- Evidence --")
    if not outcome.evidence:
        emit("(no visual evidence)")
    else:
        for item in outcome.evidence:
            emit(_format_evidence_item(item))

    emit("-- Answer --")
    emit(f"Answer: {outcome.answer}")

    if json_out is None:
        return

    json_out.parent.mkdir(parents=True, exist_ok=True)
    with json_out.open("w", encoding="utf-8") as sink:
        json.dump(outcome.to_json(), sink, ensure_ascii=False, indent=2)


def main(argv: Optional[list[str]] = None) -> int:
    """Parse ``argv`` and run the CLI.

    Args:
        argv: Argument list handed to the parser; ``None`` lets argparse
            read ``sys.argv``.

    Returns:
        Always ``0``; failures surface as exceptions (or as argparse's own
        exit on invalid arguments).
    """
    namespace = build_parser().parse_args(argv)
    execute_cli(
        image_path=namespace.image,
        question=namespace.question,
        max_steps=namespace.max_steps,
        max_regions=namespace.max_regions,
        model_id=namespace.model_id,
        json_out=namespace.json_out,
    )
    return 0


__all__ = ["build_parser", "execute_cli", "main"]