`Guardrail Evals`

Guardrail evaluation runner and CLI.

This script provides a command-line interface and class for running guardrail evaluations on datasets.

GuardrailEval

Class for running guardrail evaluations.

Source code in src/guardrails/evals/guardrail_evals.py

class GuardrailEval:
    """Class for running guardrail evaluations."""

    def __init__(
        self,
        config_path: Path,
        dataset_path: Path,
        stages: Sequence[str] | None = None,
        batch_size: int = DEFAULT_BATCH_SIZE,
        output_dir: Path = Path("results"),
        api_key: str | None = None,
        base_url: str | None = None,
        azure_endpoint: str | None = None,
        azure_api_version: str | None = None,
        mode: str = "evaluate",
        models: Sequence[str] | None = None,
        latency_iterations: int = DEFAULT_LATENCY_ITERATIONS,
        multi_turn: bool = False,
        max_parallel_models: int | None = None,
        benchmark_chunk_size: int | None = None,
    ) -> None:
        """Initialize the evaluator.

        Args:
            config_path: Path to pipeline configuration file.
            dataset_path: Path to evaluation dataset (JSONL).
            stages: Specific stages to evaluate (pre_flight, input, output).
            batch_size: Number of samples to process in parallel.
            output_dir: Directory to save evaluation results.
            api_key: API key for OpenAI, Azure OpenAI, or OpenAI-compatible API.
            base_url: Base URL for OpenAI-compatible API (e.g., http://localhost:11434/v1).
            azure_endpoint: Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com).
            azure_api_version: Azure OpenAI API version (e.g., 2025-01-01-preview).
            mode: Evaluation mode ("evaluate" or "benchmark").
            models: Models to test in benchmark mode.
            multi_turn: Whether to evaluate guardrails on multi-turn conversations.
            latency_iterations: Number of iterations for latency testing.
            max_parallel_models: Maximum number of models to benchmark concurrently.
            benchmark_chunk_size: Optional sample chunk size for per-model benchmarking.
        """
        self._validate_inputs(
            config_path,
            dataset_path,
            batch_size,
            mode,
            latency_iterations,
            max_parallel_models,
            benchmark_chunk_size,
        )

        self.config_path = config_path
        self.dataset_path = dataset_path
        self.stages = stages
        self.batch_size = batch_size
        self.output_dir = output_dir
        self.api_key = api_key
        self.base_url = base_url
        self.azure_endpoint = azure_endpoint
        self.azure_api_version = azure_api_version or "2025-01-01-preview"
        self.mode = mode
        self.models = list(models) if models else list(DEFAULT_BENCHMARK_MODELS)
        self.max_parallel_models = self._determine_parallel_model_limit(len(self.models), max_parallel_models)
        self.benchmark_chunk_size = benchmark_chunk_size
        self.latency_iterations = latency_iterations
        self.multi_turn = multi_turn

        # Validate Azure configuration
        if azure_endpoint and not AsyncAzureOpenAI:
            raise ValueError("Azure OpenAI support requires openai>=1.0.0. Please upgrade: pip install --upgrade openai")

    def _validate_inputs(
        self,
        config_path: Path,
        dataset_path: Path,
        batch_size: int,
        mode: str,
        latency_iterations: int,
        max_parallel_models: int | None,
        benchmark_chunk_size: int | None,
    ) -> None:
        """Validate input parameters."""
        if not config_path.exists():
            raise ValueError(f"Config file not found: {config_path}")

        if not dataset_path.exists():
            raise ValueError(f"Dataset file not found: {dataset_path}")

        if batch_size <= 0:
            raise ValueError(f"Batch size must be positive, got: {batch_size}")

        if mode not in ("evaluate", "benchmark"):
            raise ValueError(f"Invalid mode: {mode}. Must be 'evaluate' or 'benchmark'")

        if latency_iterations <= 0:
            raise ValueError(f"Latency iterations must be positive, got: {latency_iterations}")

        if max_parallel_models is not None and max_parallel_models <= 0:
            raise ValueError(f"max_parallel_models must be positive, got: {max_parallel_models}")

        if benchmark_chunk_size is not None and benchmark_chunk_size <= 0:
            raise ValueError(f"benchmark_chunk_size must be positive, got: {benchmark_chunk_size}")

    @staticmethod
    def _determine_parallel_model_limit(model_count: int, requested_limit: int | None) -> int:
        """Resolve the number of benchmark tasks that can run concurrently.

        Args:
            model_count: Total number of models scheduled for benchmarking.
            requested_limit: Optional user-provided parallelism limit.

        Returns:
            Number of concurrent benchmark tasks to run.

        Raises:
            ValueError: If either model_count or requested_limit is invalid.
        """
        if model_count <= 0:
            raise ValueError("model_count must be positive")

        if requested_limit is not None:
            if requested_limit <= 0:
                raise ValueError("max_parallel_models must be positive")
            return min(requested_limit, model_count)

        cpu_count = os.cpu_count() or 1
        return max(1, min(cpu_count, model_count))

    @staticmethod
    def _chunk_samples(samples: list[Sample], chunk_size: int | None) -> Iterator[list[Sample]]:
        """Yield contiguous sample chunks respecting the configured chunk size.

        Args:
            samples: Samples to evaluate.
            chunk_size: Optional maximum chunk size to enforce.

        Returns:
            Iterator yielding chunks of the provided samples.

        Raises:
            ValueError: If chunk_size is non-positive when provided.
        """
        if chunk_size is not None and chunk_size <= 0:
            raise ValueError("chunk_size must be positive when provided")

        if not samples or chunk_size is None or chunk_size >= len(samples):
            yield samples
            return

        for start in range(0, len(samples), chunk_size):
            yield samples[start : start + chunk_size]

    async def run(self) -> None:
        """Run the evaluation pipeline for all specified stages."""
        try:
            if self.mode == "benchmark":
                await self._run_benchmark()
            else:
                await self._run_evaluation()
        except Exception as e:
            logger.error("Evaluation failed: %s", e)
            raise

    async def _run_evaluation(self) -> None:
        """Run standard evaluation mode."""
        pipeline_bundles = load_pipeline_bundles(self.config_path)
        stages_to_evaluate = self._get_valid_stages(pipeline_bundles)

        if not stages_to_evaluate:
            raise ValueError("No valid stages found in configuration")

        logger.info("Evaluating stages: %s", ", ".join(stages_to_evaluate))

        loader = JsonlDatasetLoader()
        samples = loader.load(self.dataset_path)
        logger.info("Loaded %d samples from dataset", len(samples))

        context = self._create_context()
        calculator = GuardrailMetricsCalculator()
        reporter = JsonResultsReporter()

        all_results = {}
        all_metrics = {}

        for stage in stages_to_evaluate:
            logger.info("Starting %s stage evaluation", stage)

            try:
                stage_results = await self._evaluate_single_stage(stage, pipeline_bundles, samples, context, calculator)

                if stage_results:
                    all_results[stage] = stage_results["results"]
                    all_metrics[stage] = stage_results["metrics"]
                    logger.info("Completed %s stage evaluation", stage)
                else:
                    logger.warning("Stage '%s' evaluation returned no results", stage)

            except Exception as e:
                logger.error("Failed to evaluate stage '%s': %s", stage, e)

        if not all_results:
            raise ValueError("No stages were successfully evaluated")

        reporter.save_multi_stage(all_results, all_metrics, self.output_dir)
        logger.info("Evaluation completed. Results saved to: %s", self.output_dir)

    async def _run_benchmark(self) -> None:
        """Run benchmark mode comparing multiple models."""
        logger.info('event="benchmark_start" duration_ms=0 models="%s"', ", ".join(self.models))
        logger.info(
            'event="benchmark_parallel_config" duration_ms=0 parallel_limit=%d chunk_size=%s batch_size=%d',
            self.max_parallel_models,
            self.benchmark_chunk_size if self.benchmark_chunk_size else "dataset",
            self.batch_size,
        )

        pipeline_bundles = load_pipeline_bundles(self.config_path)
        stage_to_test, guardrail_name = self._get_benchmark_target(pipeline_bundles)

        # Validate guardrail has model configuration
        stage_bundle = getattr(pipeline_bundles, stage_to_test)
        if not self._has_model_configuration(stage_bundle):
            raise ValueError(
                f"Guardrail '{guardrail_name}' does not have a model configuration. "
                "Benchmark mode requires LLM-based guardrails with configurable models."
            )

        logger.info('event="benchmark_target" duration_ms=0 guardrail="%s" stage="%s"', guardrail_name, stage_to_test)

        loader = JsonlDatasetLoader()
        samples = loader.load(self.dataset_path)
        logger.info('event="benchmark_samples_loaded" duration_ms=0 count=%d', len(samples))

        benchmark_calculator = BenchmarkMetricsCalculator()
        basic_calculator = GuardrailMetricsCalculator()
        benchmark_reporter = BenchmarkReporter(self.output_dir)

        # Run benchmark for all models
        results_by_model, metrics_by_model = await self._benchmark_all_models(
            stage_to_test, guardrail_name, samples, benchmark_calculator, basic_calculator
        )

        # Run latency testing
        logger.info('event="benchmark_latency_start" duration_ms=0 model_count=%d', len(self.models))
        latency_results = await self._run_latency_tests(stage_to_test, samples)

        # Save benchmark results
        benchmark_dir = benchmark_reporter.save_benchmark_results(
            results_by_model, metrics_by_model, latency_results, guardrail_name, len(samples), self.latency_iterations
        )

        # Create visualizations
        logger.info('event="benchmark_visualization_start" duration_ms=0 guardrail="%s"', guardrail_name)
        visualizer = BenchmarkVisualizer(benchmark_dir / "graphs")
        visualization_files = visualizer.create_all_visualizations(
            results_by_model, metrics_by_model, latency_results, guardrail_name, samples[0].expected_triggers if samples else {}
        )

        logger.info('event="benchmark_complete" duration_ms=0 output="%s"', benchmark_dir)
        logger.info('event="benchmark_visualization_complete" duration_ms=0 count=%d', len(visualization_files))

    def _has_model_configuration(self, stage_bundle) -> bool:
        """Check if the guardrail has a model configuration."""
        if not stage_bundle.guardrails:
            return False

        guardrail_config = stage_bundle.guardrails[0].config
        if not guardrail_config:
            return False

        if isinstance(guardrail_config, dict) and "model" in guardrail_config:
            return True
        elif hasattr(guardrail_config, "model"):
            return True

        return False

    async def _run_latency_tests(self, stage_to_test: str, samples: list[Sample]) -> dict[str, Any]:
        """Run latency tests for all models."""
        latency_results = {}
        latency_tester = LatencyTester(iterations=self.latency_iterations)

        for model in self.models:
            model_stage_bundle = self._create_model_specific_stage_bundle(getattr(load_pipeline_bundles(self.config_path), stage_to_test), model)
            model_context = self._create_context()
            latency_results[model] = await latency_tester.test_guardrail_latency_for_model(
                model_context,
                model_stage_bundle,
                samples,
                self.latency_iterations,
                desc=f"Testing latency: {model}",
            )

        return latency_results

    def _create_context(self) -> Context:
        """Create evaluation context with OpenAI client.

        Supports OpenAI, Azure OpenAI, and OpenAI-compatible APIs.
        Used for both evaluation and benchmark modes.

        Returns:
            Context with configured AsyncOpenAI or AsyncAzureOpenAI client.
        """
        # Azure OpenAI
        if self.azure_endpoint:
            if not AsyncAzureOpenAI:
                raise ValueError("Azure OpenAI support requires openai>=1.0.0. Please upgrade: pip install --upgrade openai")

            azure_kwargs = {
                "azure_endpoint": self.azure_endpoint,
                "api_version": self.azure_api_version,
            }
            if self.api_key:
                azure_kwargs["api_key"] = self.api_key

            guardrail_llm = AsyncAzureOpenAI(**azure_kwargs)
            logger.info("Created Azure OpenAI client for endpoint: %s", self.azure_endpoint)
        # OpenAI or OpenAI-compatible API
        else:
            openai_kwargs = {}
            if self.api_key:
                openai_kwargs["api_key"] = self.api_key
            if self.base_url:
                openai_kwargs["base_url"] = self.base_url
                logger.info("Created OpenAI-compatible client for base_url: %s", self.base_url)

            guardrail_llm = AsyncOpenAI(**openai_kwargs)

        return Context(guardrail_llm=guardrail_llm)

    def _is_valid_stage(self, pipeline_bundles, stage: str) -> bool:
        """Check if a stage has valid guardrails configured.

        Args:
            pipeline_bundles: Pipeline bundles object.
            stage: Stage name to check.

        Returns:
            True if stage exists and has guardrails configured.
        """
        if not hasattr(pipeline_bundles, stage):
            return False

        stage_bundle = getattr(pipeline_bundles, stage)
        return stage_bundle is not None and hasattr(stage_bundle, "guardrails") and bool(stage_bundle.guardrails)

    def _create_model_specific_stage_bundle(self, stage_bundle, model: str):
        """Create a deep copy of the stage bundle with model-specific configuration."""
        try:
            modified_bundle = copy.deepcopy(stage_bundle)
        except Exception as e:
            logger.error("Failed to create deep copy of stage bundle: %s", e)
            raise ValueError(f"Failed to create deep copy of stage bundle: {e}") from e

        logger.info("Creating model-specific configuration for model: %s", model)

        guardrails_updated = 0
        for guardrail in modified_bundle.guardrails:
            try:
                if hasattr(guardrail, "config") and guardrail.config:
                    if isinstance(guardrail.config, dict) and "model" in guardrail.config:
                        original_model = guardrail.config["model"]
                        guardrail.config["model"] = model
                        logger.info("Updated guardrail '%s' model from '%s' to '%s'", guardrail.name, original_model, model)
                        guardrails_updated += 1
                    elif hasattr(guardrail.config, "model"):
                        original_model = getattr(guardrail.config, "model", "unknown")
                        guardrail.config.model = model
                        logger.info("Updated guardrail '%s' model from '%s' to '%s'", guardrail.name, original_model, model)
                        guardrails_updated += 1
            except Exception as e:
                logger.error("Failed to update guardrail '%s' configuration: %s", guardrail.name, e)
                raise ValueError(f"Failed to update guardrail '{guardrail.name}' configuration: {e}") from e

        if guardrails_updated == 0:
            logger.warning("No guardrails with model configuration were found")
        else:
            logger.info("Successfully updated %d guardrail(s) for model: %s", guardrails_updated, model)

        return modified_bundle

    def _get_valid_stages(self, pipeline_bundles) -> list[str]:
        """Get list of valid stages to evaluate."""
        if self.stages is None:
            # Auto-detect all valid stages
            available_stages = [stage for stage in VALID_STAGES if self._is_valid_stage(pipeline_bundles, stage)]

            if not available_stages:
                raise ValueError("No valid stages found in configuration")

            logger.info("No stages specified, evaluating all available stages: %s", ", ".join(available_stages))
            return available_stages
        else:
            # Validate requested stages
            valid_requested_stages = []
            for stage in self.stages:
                if stage not in VALID_STAGES:
                    logger.warning("Invalid stage '%s', skipping", stage)
                    continue

                if not self._is_valid_stage(pipeline_bundles, stage):
                    logger.warning("Stage '%s' not found or has no guardrails configured, skipping", stage)
                    continue

                valid_requested_stages.append(stage)

            if not valid_requested_stages:
                raise ValueError("No valid stages found in configuration")

            return valid_requested_stages

    async def _evaluate_single_stage(
        self, stage: str, pipeline_bundles, samples: list[Sample], context: Context, calculator: GuardrailMetricsCalculator
    ) -> dict[str, Any] | None:
        """Evaluate a single pipeline stage."""
        try:
            stage_bundle = getattr(pipeline_bundles, stage)
            guardrails = instantiate_guardrails(stage_bundle)

            engine = AsyncRunEngine(guardrails, multi_turn=self.multi_turn)

            stage_results = await engine.run(context, samples, self.batch_size, desc=f"Evaluating {stage} stage")

            stage_metrics = calculator.calculate(stage_results)

            return {"results": stage_results, "metrics": stage_metrics}

        except Exception as e:
            logger.error("Failed to evaluate stage '%s': %s", stage, e)
            return None

    def _get_benchmark_target(self, pipeline_bundles) -> tuple[str, str]:
        """Get the stage and guardrail to benchmark."""
        if self.stages:
            stage_to_test = self.stages[0]
            if not self._is_valid_stage(pipeline_bundles, stage_to_test):
                raise ValueError(f"Stage '{stage_to_test}' has no guardrails configured")
        else:
            # Find first valid stage
            stage_to_test = next((stage for stage in VALID_STAGES if self._is_valid_stage(pipeline_bundles, stage)), None)
            if not stage_to_test:
                raise ValueError("No valid stage found for benchmarking")

        stage_bundle = getattr(pipeline_bundles, stage_to_test)
        guardrail_name = stage_bundle.guardrails[0].name

        return stage_to_test, guardrail_name

    async def _benchmark_all_models(
        self,
        stage_to_test: str,
        guardrail_name: str,
        samples: list[Sample],
        benchmark_calculator: BenchmarkMetricsCalculator,
        basic_calculator: GuardrailMetricsCalculator,
    ) -> tuple[dict[str, list], dict[str, dict]]:
        """Benchmark all models for the specified stage and guardrail."""
        pipeline_bundles = load_pipeline_bundles(self.config_path)
        stage_bundle = getattr(pipeline_bundles, stage_to_test)

        semaphore = asyncio.Semaphore(self.max_parallel_models)
        total_models = len(self.models)

        async def run_model_task(index: int, model: str) -> tuple[str, dict[str, Any]]:
            """Execute a benchmark task under concurrency control.

            Args:
                index: One-based position of the model being benchmarked.
                model: Identifier of the model to benchmark.

            Returns:
                Tuple of (model_name, results_dict) where results_dict contains "results" and "metrics" keys.
            """
            async with semaphore:
                start_time = time.perf_counter()
                logger.info(
                    'event="benchmark_model_start" duration_ms=0 model="%s" position=%d total=%d',
                    model,
                    index,
                    total_models,
                )

                try:
                    modified_stage_bundle = self._create_model_specific_stage_bundle(stage_bundle, model)

                    model_results = await self._benchmark_single_model(
                        model,
                        modified_stage_bundle,
                        samples,
                        guardrail_name,
                        benchmark_calculator,
                        basic_calculator,
                    )

                    elapsed_ms = (time.perf_counter() - start_time) * 1000

                    if model_results:
                        logger.info(
                            'event="benchmark_model_complete" duration_ms=%.2f model="%s" status="success"',
                            elapsed_ms,
                            model,
                        )
                        return (model, model_results)
                    else:
                        logger.warning(
                            'event="benchmark_model_empty" duration_ms=%.2f model="%s" status="no_results"',
                            elapsed_ms,
                            model,
                        )
                        return (model, {"results": [], "metrics": {}})

                except Exception as e:
                    elapsed_ms = (time.perf_counter() - start_time) * 1000
                    logger.error(
                        'event="benchmark_model_failure" duration_ms=%.2f model="%s" error="%s"',
                        elapsed_ms,
                        model,
                        e,
                    )
                    return (model, {"results": [], "metrics": {}})

        task_results = await asyncio.gather(*(run_model_task(index, model) for index, model in enumerate(self.models, start=1)))

        # Build dictionaries from collected results
        results_by_model: dict[str, list] = {}
        metrics_by_model: dict[str, dict] = {}
        for model, result_dict in task_results:
            results_by_model[model] = result_dict["results"]
            metrics_by_model[model] = result_dict["metrics"]

        # Log summary
        successful_models = [model for model in self.models if results_by_model.get(model)]
        failed_models = [model for model in self.models if not results_by_model.get(model)]

        logger.info('event="benchmark_summary" duration_ms=0 successful=%d failed=%d', len(successful_models), len(failed_models))
        logger.info(
            'event="benchmark_successful_models" duration_ms=0 models="%s"',
            ", ".join(successful_models) if successful_models else "None",
        )
        if failed_models:
            logger.warning('event="benchmark_failed_models" duration_ms=0 models="%s"', ", ".join(failed_models))
        logger.info('event="benchmark_total_models" duration_ms=0 total=%d', len(self.models))

        return results_by_model, metrics_by_model

    async def _benchmark_single_model(
        self,
        model: str,
        stage_bundle,
        samples: list[Sample],
        guardrail_name: str,
        benchmark_calculator: BenchmarkMetricsCalculator,
        basic_calculator: GuardrailMetricsCalculator,
    ) -> dict[str, Any] | None:
        """Benchmark a single model."""
        try:
            model_context = self._create_context()

            guardrails = instantiate_guardrails(stage_bundle)
            engine = AsyncRunEngine(guardrails, multi_turn=self.multi_turn)
            chunk_total = 1
            if self.benchmark_chunk_size and len(samples) > 0:
                chunk_total = max(1, math.ceil(len(samples) / self.benchmark_chunk_size))

            model_results: list[Any] = []
            for chunk_index, chunk in enumerate(self._chunk_samples(samples, self.benchmark_chunk_size), start=1):
                chunk_desc = f"Benchmarking {model}" if chunk_total == 1 else f"Benchmarking {model} ({chunk_index}/{chunk_total})"
                chunk_results = await engine.run(model_context, chunk, self.batch_size, desc=chunk_desc)
                model_results.extend(chunk_results)

            guardrail_config = stage_bundle.guardrails[0].config if stage_bundle.guardrails else None

            advanced_metrics = benchmark_calculator.calculate_advanced_metrics(model_results, guardrail_name, guardrail_config)

            basic_metrics = basic_calculator.calculate(model_results)

            if guardrail_name in basic_metrics:
                guardrail_metrics = basic_metrics[guardrail_name]
                basic_metrics_dict = {
                    "precision": guardrail_metrics.precision,
                    "recall": guardrail_metrics.recall,
                    "f1_score": guardrail_metrics.f1_score,
                    "true_positives": guardrail_metrics.true_positives,
                    "false_positives": guardrail_metrics.false_positives,
                    "false_negatives": guardrail_metrics.false_negatives,
                    "true_negatives": guardrail_metrics.true_negatives,
                    "total_samples": guardrail_metrics.total_samples,
                }
            else:
                basic_metrics_dict = {}

            combined_metrics = {**basic_metrics_dict, **advanced_metrics}

            return {"results": model_results, "metrics": combined_metrics}

        except Exception as e:
            logger.error("Failed to benchmark model %s: %s", model, e)
            return None

init

__init__(
    config_path: Path,
    dataset_path: Path,
    stages: Sequence[str] | None = None,
    batch_size: int = DEFAULT_BATCH_SIZE,
    output_dir: Path = Path("results"),
    api_key: str | None = None,
    base_url: str | None = None,
    azure_endpoint: str | None = None,
    azure_api_version: str | None = None,
    mode: str = "evaluate",
    models: Sequence[str] | None = None,
    latency_iterations: int = DEFAULT_LATENCY_ITERATIONS,
    multi_turn: bool = False,
    max_parallel_models: int | None = None,
    benchmark_chunk_size: int | None = None,
) -> None

Initialize the evaluator.

Parameters:

Name	Type	Description	Default
`config_path`	`Path`	Path to pipeline configuration file.	required
`dataset_path`	`Path`	Path to evaluation dataset (JSONL).	required
`stages`	`Sequence[str] \| None`	Specific stages to evaluate (pre_flight, input, output).	`None`
`batch_size`	`int`	Number of samples to process in parallel.	`DEFAULT_BATCH_SIZE`
`output_dir`	`Path`	Directory to save evaluation results.	`Path('results')`
`api_key`	`str \| None`	API key for OpenAI, Azure OpenAI, or OpenAI-compatible API.	`None`
`base_url`	`str \| None`	Base URL for OpenAI-compatible API (e.g., http://localhost:11434/v1).	`None`
`azure_endpoint`	`str \| None`	Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com).	`None`
`azure_api_version`	`str \| None`	Azure OpenAI API version (e.g., 2025-01-01-preview).	`None`
`mode`	`str`	Evaluation mode ("evaluate" or "benchmark").	`'evaluate'`
`models`	`Sequence[str] \| None`	Models to test in benchmark mode.	`None`
`multi_turn`	`bool`	Whether to evaluate guardrails on multi-turn conversations.	`False`
`latency_iterations`	`int`	Number of iterations for latency testing.	`DEFAULT_LATENCY_ITERATIONS`
`max_parallel_models`	`int \| None`	Maximum number of models to benchmark concurrently.	`None`
`benchmark_chunk_size`	`int \| None`	Optional sample chunk size for per-model benchmarking.	`None`

Source code in src/guardrails/evals/guardrail_evals.py

def __init__(
    self,
    config_path: Path,
    dataset_path: Path,
    stages: Sequence[str] | None = None,
    batch_size: int = DEFAULT_BATCH_SIZE,
    output_dir: Path = Path("results"),
    api_key: str | None = None,
    base_url: str | None = None,
    azure_endpoint: str | None = None,
    azure_api_version: str | None = None,
    mode: str = "evaluate",
    models: Sequence[str] | None = None,
    latency_iterations: int = DEFAULT_LATENCY_ITERATIONS,
    multi_turn: bool = False,
    max_parallel_models: int | None = None,
    benchmark_chunk_size: int | None = None,
) -> None:
    """Initialize the evaluator.

    Args:
        config_path: Path to pipeline configuration file.
        dataset_path: Path to evaluation dataset (JSONL).
        stages: Specific stages to evaluate (pre_flight, input, output).
        batch_size: Number of samples to process in parallel.
        output_dir: Directory to save evaluation results.
        api_key: API key for OpenAI, Azure OpenAI, or OpenAI-compatible API.
        base_url: Base URL for OpenAI-compatible API (e.g., http://localhost:11434/v1).
        azure_endpoint: Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com).
        azure_api_version: Azure OpenAI API version (e.g., 2025-01-01-preview).
        mode: Evaluation mode ("evaluate" or "benchmark").
        models: Models to test in benchmark mode.
        multi_turn: Whether to evaluate guardrails on multi-turn conversations.
        latency_iterations: Number of iterations for latency testing.
        max_parallel_models: Maximum number of models to benchmark concurrently.
        benchmark_chunk_size: Optional sample chunk size for per-model benchmarking.
    """
    self._validate_inputs(
        config_path,
        dataset_path,
        batch_size,
        mode,
        latency_iterations,
        max_parallel_models,
        benchmark_chunk_size,
    )

    self.config_path = config_path
    self.dataset_path = dataset_path
    self.stages = stages
    self.batch_size = batch_size
    self.output_dir = output_dir
    self.api_key = api_key
    self.base_url = base_url
    self.azure_endpoint = azure_endpoint
    self.azure_api_version = azure_api_version or "2025-01-01-preview"
    self.mode = mode
    self.models = list(models) if models else list(DEFAULT_BENCHMARK_MODELS)
    self.max_parallel_models = self._determine_parallel_model_limit(len(self.models), max_parallel_models)
    self.benchmark_chunk_size = benchmark_chunk_size
    self.latency_iterations = latency_iterations
    self.multi_turn = multi_turn

    # Validate Azure configuration
    if azure_endpoint and not AsyncAzureOpenAI:
        raise ValueError("Azure OpenAI support requires openai>=1.0.0. Please upgrade: pip install --upgrade openai")

run `async`

run() -> None

Run the evaluation pipeline for all specified stages.

Source code in src/guardrails/evals/guardrail_evals.py

async def run(self) -> None:
    """Run the evaluation pipeline for all specified stages."""
    try:
        if self.mode == "benchmark":
            await self._run_benchmark()
        else:
            await self._run_evaluation()
    except Exception as e:
        logger.error("Evaluation failed: %s", e)
        raise

main

main() -> None

Run the evaluation script.

Source code in src/guardrails/evals/guardrail_evals.py

def main() -> None:
    """Run the evaluation script."""
    parser = argparse.ArgumentParser(
        description="Run guardrail evaluations",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Standard evaluation of all stages
  guardrails-evals --config-path config.json --dataset-path data.jsonl

  # Multi-stage evaluation
  guardrails-evals --config-path config.json --dataset-path data.jsonl --stages pre_flight input

  # Benchmark mode with OpenAI models
  guardrails-evals --config-path config.json --dataset-path data.jsonl --mode benchmark --models gpt-5 gpt-5-mini

  # Azure OpenAI benchmark
  guardrails-evals --config-path config.json --dataset-path data.jsonl --mode benchmark \\
    --azure-endpoint https://your-resource.openai.azure.com --api-key your-key \\
    --models gpt-4o gpt-4o-mini

  # Ollama local models
  guardrails-evals --config-path config.json --dataset-path data.jsonl --mode benchmark \\
    --base-url http://localhost:11434/v1 --api-key fake-key --models llama3 mistral

  # vLLM or other OpenAI-compatible API
  guardrails-evals --config-path config.json --dataset-path data.jsonl --mode benchmark \\
    --base-url http://your-server:8000/v1 --api-key your-key --models your-model

  # Module execution during local development
  python -m guardrails.evals.guardrail_evals --config-path config.json --dataset-path data.jsonl
        """,
    )

    # Required arguments
    parser.add_argument(
        "--config-path",
        type=Path,
        required=True,
        help="Path to the guardrail config file",
    )
    parser.add_argument(
        "--dataset-path",
        type=Path,
        required=True,
        help="Path to the evaluation dataset",
    )

    # Evaluation mode
    parser.add_argument(
        "--mode",
        choices=["evaluate", "benchmark"],
        default="evaluate",
        help="Evaluation mode: 'evaluate' for standard evaluation, 'benchmark' for model comparison (default: evaluate)",
    )

    # Optional evaluation arguments
    parser.add_argument(
        "--stages",
        nargs="+",
        choices=VALID_STAGES,
        help="Pipeline stages to evaluate. If not specified, evaluates all stages found in config.",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=DEFAULT_BATCH_SIZE,
        help=f"Number of samples to process in parallel (default: {DEFAULT_BATCH_SIZE})",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("results"),
        help="Directory to save evaluation results (default: results)",
    )
    parser.add_argument(
        "--multi-turn",
        action="store_true",
        help="Process conversation-aware guardrails incrementally turn-by-turn instead of a single pass.",
    )

    # API configuration
    parser.add_argument(
        "--api-key",
        type=str,
        help="API key for OpenAI, Azure OpenAI, or OpenAI-compatible API",
    )
    parser.add_argument(
        "--base-url",
        type=str,
        help="Base URL for OpenAI-compatible API (e.g., http://localhost:11434/v1 for Ollama)",
    )
    parser.add_argument(
        "--azure-endpoint",
        type=str,
        help="Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com)",
    )
    parser.add_argument(
        "--azure-api-version",
        type=str,
        default="2025-01-01-preview",
        help="Azure OpenAI API version (default: 2025-01-01-preview)",
    )

    # Benchmark-only arguments
    parser.add_argument(
        "--models",
        nargs="+",
        help="Models to test in benchmark mode (default: gpt-5, gpt-5-mini, gpt-4.1, gpt-4.1-mini)",
    )
    parser.add_argument(
        "--latency-iterations",
        type=int,
        default=DEFAULT_LATENCY_ITERATIONS,
        help=f"Number of iterations for latency testing in benchmark mode (default: {DEFAULT_LATENCY_ITERATIONS})",
    )
    parser.add_argument(
        "--max-parallel-models",
        type=int,
        help="Maximum number of models to benchmark concurrently (default: max(1, min(model_count, cpu_count)))",
    )
    parser.add_argument(
        "--benchmark-chunk-size",
        type=int,
        help="Optional number of samples per chunk when benchmarking to limit long-running runs.",
    )

    args = parser.parse_args()

    # Validate arguments
    try:
        if not args.config_path.exists():
            print(f"❌ Error: Config file not found: {args.config_path}")
            sys.exit(1)

        if not args.dataset_path.exists():
            print(f"❌ Error: Dataset file not found: {args.dataset_path}")
            sys.exit(1)

        if args.batch_size <= 0:
            print(f"❌ Error: Batch size must be positive, got: {args.batch_size}")
            sys.exit(1)

        if args.latency_iterations <= 0:
            print(f"❌ Error: Latency iterations must be positive, got: {args.latency_iterations}")
            sys.exit(1)

        if args.max_parallel_models is not None and args.max_parallel_models <= 0:
            print(f"❌ Error: max-parallel-models must be positive, got: {args.max_parallel_models}")
            sys.exit(1)

        if args.benchmark_chunk_size is not None and args.benchmark_chunk_size <= 0:
            print(f"❌ Error: benchmark-chunk-size must be positive, got: {args.benchmark_chunk_size}")
            sys.exit(1)

        if args.stages:
            invalid_stages = [stage for stage in args.stages if stage not in VALID_STAGES]
            if invalid_stages:
                print(f"❌ Error: Invalid stages: {invalid_stages}. Valid stages are: {', '.join(VALID_STAGES)}")
                sys.exit(1)

        if args.mode == "benchmark" and args.stages and len(args.stages) > 1:
            print("⚠️  Warning: Benchmark mode only uses the first specified stage. Additional stages will be ignored.")

        # Validate provider configuration
        azure_endpoint = getattr(args, "azure_endpoint", None)
        base_url = getattr(args, "base_url", None)

        if azure_endpoint and base_url:
            print("❌ Error: Cannot specify both --azure-endpoint and --base-url. Choose one provider.")
            sys.exit(1)

        if azure_endpoint and not args.api_key:
            print("❌ Error: --api-key is required when using --azure-endpoint")
            sys.exit(1)

    except Exception as e:
        print(f"❌ Error validating arguments: {e}")
        sys.exit(1)

    # Run evaluation
    try:
        print(f"🚀 Starting guardrail evaluation in {args.mode} mode...")
        print(f"   Config: {args.config_path}")
        print(f"   Dataset: {args.dataset_path}")
        print(f"   Output: {args.output_dir}")

        # Show provider configuration
        if getattr(args, "azure_endpoint", None):
            print(f"   Provider: Azure OpenAI ({args.azure_endpoint})")
        elif getattr(args, "base_url", None):
            print(f"   Provider: OpenAI-compatible API ({args.base_url})")
        else:
            print("   Provider: OpenAI")

        if args.mode == "benchmark":
            print(f"   Models: {', '.join(args.models or DEFAULT_BENCHMARK_MODELS)}")
            print(f"   Latency iterations: {args.latency_iterations}")
            model_count = len(args.models or DEFAULT_BENCHMARK_MODELS)
            parallel_setting = GuardrailEval._determine_parallel_model_limit(model_count, args.max_parallel_models)
            print(f"   Parallel models: {parallel_setting}")
            if args.benchmark_chunk_size:
                print(f"   Benchmark chunk size: {args.benchmark_chunk_size}")
            else:
                print("   Benchmark chunk size: dataset")

        if args.multi_turn:
            print("   Conversation handling: multi-turn incremental")
        else:
            print("   Conversation handling: single-pass")

        eval = GuardrailEval(
            config_path=args.config_path,
            dataset_path=args.dataset_path,
            stages=args.stages,
            batch_size=args.batch_size,
            output_dir=args.output_dir,
            api_key=args.api_key,
            base_url=getattr(args, "base_url", None),
            azure_endpoint=getattr(args, "azure_endpoint", None),
            azure_api_version=getattr(args, "azure_api_version", None),
            mode=args.mode,
            models=args.models,
            latency_iterations=args.latency_iterations,
            multi_turn=args.multi_turn,
            max_parallel_models=args.max_parallel_models,
            benchmark_chunk_size=args.benchmark_chunk_size,
        )

        asyncio.run(eval.run())
        print("✅ Evaluation completed successfully!")

    except KeyboardInterrupt:
        print("\n⚠️  Evaluation interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Evaluation failed: {e}")
        if logger.isEnabledFor(logging.DEBUG):
            import traceback

            traceback.print_exc()
        sys.exit(1)

Guardrail Evals

GuardrailEval

__init__

run async

main

`Guardrail Evals`

init

run `async`