feat: add optimization for duration

andrewklatzke · andrewklatzke · commit 5d762764fa6d · 2026-04-08T14:34:41.000-08:00
diff --git a/packages/optimization/src/ldai_optimization/client.py b/packages/optimization/src/ldai_optimization/client.py
@@ -33,6 +33,7 @@
     OptimizationResultPayload,
 )
 from ldai_optimization.prompts import (
+    _acceptance_criteria_implies_duration_optimization,
     build_message_history_text,
     build_new_variation_prompt,
     build_reasoning_history,
@@ -80,6 +81,12 @@ def _compute_validation_count(pool_size: int) -> int:
 # the variation step is treated as a failure.
 _MAX_VARIATION_RETRIES = 3
 
+# Duration gate: when acceptance criteria imply a latency optimization goal, a
+# candidate passes the duration check only if its duration is strictly below
+# baseline * _DURATION_TOLERANCE, where baseline is history[0].duration_ms.
+# 0.80 means under 80% of the baseline, i.e. more than a 20% improvement.
+_DURATION_TOLERANCE = 0.80
+
 # Maps SDK status strings to the API status/activity values expected by
 # agent_optimization_result records. Defined at module level to avoid
 # allocating the dict on every on_status_update invocation.
@@ -328,6 +335,7 @@ async def _call_judges(
         variables: Optional[Dict[str, Any]] = None,
         agent_tools: Optional[List[ToolDefinition]] = None,
         expected_response: Optional[str] = None,
+        agent_duration_ms: Optional[float] = None,
     ) -> Dict[str, JudgeResult]:
         """
         Call all judges in parallel (auto-path).
@@ -344,6 +352,9 @@ async def _call_judges(
         :param agent_tools: Normalised list of tool dicts that were available to the agent
         :param expected_response: Optional ground truth expected response. When provided,
             judges are instructed to factor it into their scoring alongside acceptance criteria.
+        :param agent_duration_ms: Wall-clock duration of the agent call in milliseconds.
+            Forwarded to acceptance judges whose statement implies a latency goal so they
+            can mention the duration change in their rationale.
         :return: Dictionary of judge results (score and rationale)
         """
         if not self._options.judges:
@@ -396,6 +407,7 @@ async def _call_judges(
                         variables=resolved_variables,
                         agent_tools=resolved_agent_tools,
                         expected_response=expected_response,
+                        agent_duration_ms=agent_duration_ms,
                     )
                     judge_results[judge_key] = result
 
@@ -613,6 +625,7 @@ async def _evaluate_acceptance_judge(
         variables: Optional[Dict[str, Any]] = None,
         agent_tools: Optional[List[ToolDefinition]] = None,
         expected_response: Optional[str] = None,
+        agent_duration_ms: Optional[float] = None,
     ) -> JudgeResult:
         """
         Evaluate using an acceptance statement judge.
@@ -627,6 +640,9 @@ async def _evaluate_acceptance_judge(
         :param agent_tools: Normalised list of tool dicts that were available to the agent
         :param expected_response: Optional ground truth expected response. When provided,
             injected into instructions and judge message so the judge can score actual vs. expected.
+        :param agent_duration_ms: Wall-clock duration of the agent call in milliseconds.
+            When the acceptance statement implies a latency goal, the judge is instructed
+            to mention the duration change in its rationale.
         :return: The judge result with score and rationale
         """
         if not optimization_judge.acceptance_statement:
@@ -662,6 +678,32 @@ async def _evaluate_acceptance_judge(
             'Example: {"score": 0.8, "rationale": "The response matches the acceptance statement well."}'
         )
 
+        if (
+            agent_duration_ms is not None
+            and _acceptance_criteria_implies_duration_optimization(
+                {judge_key: optimization_judge}
+            )
+        ):
+            baseline_ms = (
+                self._history[0].duration_ms
+                if self._history and self._history[0].duration_ms is not None
+                else None
+            )
+            instructions += (
+                f"\n\nThe acceptance criteria for this judge includes a latency/duration goal. "
+                f"The agent's response took {agent_duration_ms:.0f}ms to generate. "
+            )
+            if baseline_ms is not None:
+                delta_ms = agent_duration_ms - baseline_ms
+                direction = "faster" if delta_ms < 0 else "slower"
+                instructions += (
+                    f"The baseline duration (first iteration) was {baseline_ms:.0f}ms. "
+                    f"This response was {abs(delta_ms):.0f}ms {direction} than the baseline. "
+                )
+            instructions += (
+                "Please mention the duration and any change from baseline in your rationale."
+            )
+
         if resolved_variables:
             instructions += f"\n\nThe following variables were available to the agent: {json.dumps(resolved_variables)}"
 
@@ -911,6 +953,11 @@ async def _run_ground_truth_optimization(
                 else:
                     sample_passed = self._evaluate_response(optimize_context)
 
+                if sample_passed and _acceptance_criteria_implies_duration_optimization(
+                    self._options.judges
+                ):
+                    sample_passed = self._evaluate_duration(optimize_context)
+
                 if not sample_passed:
                     logger.info(
                         "[GT Attempt %d] -> Sample %d/%d FAILED",
@@ -1147,6 +1194,9 @@ async def _generate_new_variation(
         )
         self._safe_status_update("generating variation", status_ctx, iteration)
 
+        optimize_for_duration = _acceptance_criteria_implies_duration_optimization(
+            self._options.judges
+        )
         instructions = build_new_variation_prompt(
             self._history,
             self._options.judges,
@@ -1156,6 +1206,7 @@ async def _generate_new_variation(
             self._options.model_choices,
             self._options.variable_choices,
             self._initial_instructions,
+            optimize_for_duration=optimize_for_duration,
         )
 
         # Create a flat history list (without nested history) to avoid exponential growth
@@ -1486,6 +1537,7 @@ async def _execute_agent_turn(
                 variables=optimize_context.current_variables,
                 agent_tools=agent_tools,
                 expected_response=expected_response,
+                agent_duration_ms=agent_duration_ms,
             )
 
         return dataclasses.replace(
@@ -1523,6 +1575,38 @@ def _evaluate_response(self, optimize_context: OptimizationContext) -> bool:
 
         return True
 
+    def _evaluate_duration(self, optimize_context: OptimizationContext) -> bool:
+        """
+        Check whether the candidate's duration meets the improvement target vs. the baseline.
+
+        The baseline is self._history[0].duration_ms, the first entry in the history.
+        The candidate passes only when its duration_ms is strictly less than
+        baseline * _DURATION_TOLERANCE; a failed check is logged at WARNING level.
+
+        Returns True without blocking when no baseline is available (empty history or
+        history[0].duration_ms is None), or when the candidate's duration_ms was not
+        captured. This avoids penalising configurations when timing data is missing.
+
+        :param optimize_context: The completed turn context containing duration_ms
+        :return: True if the duration requirement is met or cannot be checked
+        """
+        if not self._history or self._history[0].duration_ms is None:
+            return True
+        if optimize_context.duration_ms is None:
+            return True
+        baseline = self._history[0].duration_ms
+        passed = optimize_context.duration_ms < baseline * _DURATION_TOLERANCE
+        if not passed:
+            logger.warning(
+                "[Iteration %d] -> Duration check failed: %.0fms >= baseline %.0fms * %.0f%% (%.0fms)",
+                optimize_context.iteration,
+                optimize_context.duration_ms,
+                baseline,
+                _DURATION_TOLERANCE * 100,
+                baseline * _DURATION_TOLERANCE,
+            )
+        return passed
+
     def _handle_success(
         self, optimize_context: OptimizationContext, iteration: int
     ) -> Any:
@@ -1691,6 +1775,11 @@ async def _run_validation_phase(
             else:
                 sample_passed = self._evaluate_response(val_ctx)
 
+            if sample_passed and _acceptance_criteria_implies_duration_optimization(
+                self._options.judges
+            ):
+                sample_passed = self._evaluate_duration(val_ctx)
+
             last_ctx = val_ctx
 
             if not sample_passed:
@@ -1798,6 +1887,11 @@ async def _run_optimization(
                         iteration,
                     )
 
+            if initial_passed and _acceptance_criteria_implies_duration_optimization(
+                self._options.judges
+            ):
+                initial_passed = self._evaluate_duration(optimize_context)
+
             if initial_passed:
                 all_valid, last_ctx = await self._run_validation_phase(
                     optimize_context, iteration
diff --git a/packages/optimization/src/ldai_optimization/prompts.py b/packages/optimization/src/ldai_optimization/prompts.py
@@ -1,12 +1,42 @@
 """Prompt-building functions for LaunchDarkly AI optimization."""
 
+import re
 from typing import Any, Dict, List, Optional
 
 from ldai_optimization.dataclasses import (
     OptimizationContext,
     OptimizationJudge,
 )
 
+_DURATION_KEYWORDS = re.compile(  # heuristic latency vocabulary, matched case-insensitively
+    r"\b(fast|faster|quickly|quick|latency|low-latency|duration|response\s+time|"
+    r"time\s+to\s+respond|milliseconds|performant|snappy|efficient|seconds)\b|"
+    r"(?<![a-zA-Z])ms\b",  # bare "ms" not preceded by a letter, e.g. "200ms" or "200 ms"
+    re.IGNORECASE,
+)
+
+
+def _acceptance_criteria_implies_duration_optimization(
+    judges: Optional[Dict[str, OptimizationJudge]],
+) -> bool:
+    """Return True if any judge acceptance statement implies a latency optimization goal.
+
+    Searches each judge's acceptance_statement with the case-insensitive
+    _DURATION_KEYWORDS pattern and returns on the first match. Returns False
+    when judges is None or empty, or when no acceptance statement matches.
+
+    :param judges: Judge configuration dict from OptimizationOptions, or None.
+    :return: True if duration optimization should be applied.
+    """
+    if not judges:
+        return False
+    for judge in judges.values():
+        if judge.acceptance_statement and _DURATION_KEYWORDS.search(
+            judge.acceptance_statement
+        ):
+            return True
+    return False
+
 
 def build_message_history_text(
     history: List[OptimizationContext],
@@ -82,6 +112,7 @@ def build_new_variation_prompt(
     model_choices: List[str],
     variable_choices: List[Dict[str, Any]],
     initial_instructions: str,
+    optimize_for_duration: bool = False,
 ) -> str:
     """
     Build the LLM prompt for generating an improved agent configuration.
@@ -99,6 +130,8 @@ def build_new_variation_prompt(
     :param model_choices: List of model IDs the LLM may select from
     :param variable_choices: List of variable dicts (used to derive placeholder names)
     :param initial_instructions: The original unmodified instructions template
+    :param optimize_for_duration: When True, appends a duration optimization section
+        instructing the LLM to prefer faster models and simpler instructions.
     :return: The assembled prompt string
     """
     sections = [
@@ -112,6 +145,7 @@ def build_new_variation_prompt(
         variation_prompt_improvement_instructions(
             history, model_choices, variable_choices, initial_instructions
         ),
+        variation_prompt_duration_optimization(model_choices) if optimize_for_duration else "",
     ]
 
     return "\n\n".join(s for s in sections if s)
@@ -211,6 +245,8 @@ def variation_prompt_configuration(
         if previous_ctx.user_input:
             lines.append(f"User question: {previous_ctx.user_input}")
         lines.append(f"Agent response: {previous_ctx.completion_response}")
+        if previous_ctx.duration_ms is not None:
+            lines.append(f"Agent duration: {previous_ctx.duration_ms:.0f}ms")
         return "\n".join(lines)
     else:
         return "\n".join(
@@ -262,6 +298,8 @@ def variation_prompt_feedback(
                 if result.rationale:
                     feedback_line += f"\n  Reasoning: {result.rationale}"
                 lines.append(feedback_line)
+        if ctx.duration_ms is not None:
+            lines.append(f"Agent duration: {ctx.duration_ms:.0f}ms")
     return "\n".join(lines)
 
 
@@ -487,3 +525,33 @@ def variation_prompt_improvement_instructions(
                 parameters_instructions,
             ]
         )
+
+
+def variation_prompt_duration_optimization(model_choices: List[str]) -> str:
+    """
+    Build the duration optimization section of the variation prompt.
+
+    Included when acceptance criteria imply a latency reduction goal. Tells the
+    LLM that quality criteria remain the primary objective and suggests three
+    ways to reduce latency: selecting a faster model, shortening the
+    instructions, and not increasing max_tokens or similar parameters.
+
+    :param model_choices: List of model IDs the LLM may select from; the list is
+        interpolated into the prompt as-is (its Python repr).
+    :return: The duration optimization prompt block as newline-joined text.
+    """
+    return "\n".join(
+        [
+            "## Duration Optimization:",
+            "The acceptance criteria for this optimization implies that response latency should be reduced.",
+            "In addition to improving quality, generate a variation that aims to reduce the agent's response time.",
+            "You may:",
+            "- Select a faster model from the available choices if quality requirements can still be met.",
+            f"  Available models: {model_choices}",
+            "  Use your knowledge of these models to prefer those that are known to respond more quickly.",
+            "- Simplify or shorten the instructions where this does not compromise the acceptance criteria.",
+            "  Shorter prompts reduce input token counts and typically yield faster responses.",
+            "- Avoid increasing max_tokens or other parameters that extend generation time.",
+            "Quality criteria remain the primary objective — do not sacrifice passing scores to achieve lower latency.",
+        ]
+    )
diff --git a/packages/optimization/tests/test_client.py b/packages/optimization/tests/test_client.py