3333 OptimizationResultPayload ,
3434)
3535from ldai_optimization .prompts import (
36+ _acceptance_criteria_implies_duration_optimization ,
3637 build_message_history_text ,
3738 build_new_variation_prompt ,
3839 build_reasoning_history ,
@@ -80,6 +81,12 @@ def _compute_validation_count(pool_size: int) -> int:
8081# the variation step is treated as a failure.
8182_MAX_VARIATION_RETRIES = 3
8283
84+ # Duration gate: the largest fraction of the baseline duration
85+ # (history[0].duration_ms) a candidate may take and still pass the duration check
86+ # when acceptance criteria imply a latency optimization goal. 0.80 means the
87+ # candidate must clock in at under 80% of the baseline — i.e. more than a 20% improvement.
88+ _DURATION_TOLERANCE = 0.80
89+
8390# Maps SDK status strings to the API status/activity values expected by
8491# agent_optimization_result records. Defined at module level to avoid
8592# allocating the dict on every on_status_update invocation.
@@ -328,6 +335,7 @@ async def _call_judges(
328335 variables : Optional [Dict [str , Any ]] = None ,
329336 agent_tools : Optional [List [ToolDefinition ]] = None ,
330337 expected_response : Optional [str ] = None ,
338+ agent_duration_ms : Optional [float ] = None ,
331339 ) -> Dict [str , JudgeResult ]:
332340 """
333341 Call all judges in parallel (auto-path).
@@ -344,6 +352,9 @@ async def _call_judges(
344352 :param agent_tools: Normalised list of tool dicts that were available to the agent
345353 :param expected_response: Optional ground truth expected response. When provided,
346354 judges are instructed to factor it into their scoring alongside acceptance criteria.
355+ :param agent_duration_ms: Wall-clock duration of the agent call in milliseconds.
356+ Forwarded to acceptance judges whose statement implies a latency goal so they
357+ can mention the duration change in their rationale.
347358 :return: Dictionary of judge results (score and rationale)
348359 """
349360 if not self ._options .judges :
@@ -396,6 +407,7 @@ async def _call_judges(
396407 variables = resolved_variables ,
397408 agent_tools = resolved_agent_tools ,
398409 expected_response = expected_response ,
410+ agent_duration_ms = agent_duration_ms ,
399411 )
400412 judge_results [judge_key ] = result
401413
@@ -613,6 +625,7 @@ async def _evaluate_acceptance_judge(
613625 variables : Optional [Dict [str , Any ]] = None ,
614626 agent_tools : Optional [List [ToolDefinition ]] = None ,
615627 expected_response : Optional [str ] = None ,
628+ agent_duration_ms : Optional [float ] = None ,
616629 ) -> JudgeResult :
617630 """
618631 Evaluate using an acceptance statement judge.
@@ -627,6 +640,9 @@ async def _evaluate_acceptance_judge(
627640 :param agent_tools: Normalised list of tool dicts that were available to the agent
628641 :param expected_response: Optional ground truth expected response. When provided,
629642 injected into instructions and judge message so the judge can score actual vs. expected.
643+ :param agent_duration_ms: Wall-clock duration of the agent call in milliseconds.
644+ When the acceptance statement implies a latency goal, the judge is instructed
645+ to mention the duration change in its rationale.
630646 :return: The judge result with score and rationale
631647 """
632648 if not optimization_judge .acceptance_statement :
@@ -662,6 +678,32 @@ async def _evaluate_acceptance_judge(
662678 'Example: {"score": 0.8, "rationale": "The response matches the acceptance statement well."}'
663679 )
664680
681+ if (
682+ agent_duration_ms is not None
683+ and _acceptance_criteria_implies_duration_optimization (
684+ {judge_key : optimization_judge }
685+ )
686+ ):
687+ baseline_ms = (
688+ self ._history [0 ].duration_ms
689+ if self ._history and self ._history [0 ].duration_ms is not None
690+ else None
691+ )
692+ instructions += (
693+ f"\n \n The acceptance criteria for this judge includes a latency/duration goal. "
694+ f"The agent's response took { agent_duration_ms :.0f} ms to generate. "
695+ )
696+ if baseline_ms is not None :
697+ delta_ms = agent_duration_ms - baseline_ms
698+ direction = "faster" if delta_ms < 0 else "slower"
699+ instructions += (
700+ f"The baseline duration (first iteration) was { baseline_ms :.0f} ms. "
701+ f"This response was { abs (delta_ms ):.0f} ms { direction } than the baseline. "
702+ )
703+ instructions += (
704+ "Please mention the duration and any change from baseline in your rationale."
705+ )
706+
665707 if resolved_variables :
666708 instructions += f"\n \n The following variables were available to the agent: { json .dumps (resolved_variables )} "
667709
@@ -911,6 +953,11 @@ async def _run_ground_truth_optimization(
911953 else :
912954 sample_passed = self ._evaluate_response (optimize_context )
913955
956+ if sample_passed and _acceptance_criteria_implies_duration_optimization (
957+ self ._options .judges
958+ ):
959+ sample_passed = self ._evaluate_duration (optimize_context )
960+
914961 if not sample_passed :
915962 logger .info (
916963 "[GT Attempt %d] -> Sample %d/%d FAILED" ,
@@ -1147,6 +1194,9 @@ async def _generate_new_variation(
11471194 )
11481195 self ._safe_status_update ("generating variation" , status_ctx , iteration )
11491196
1197+ optimize_for_duration = _acceptance_criteria_implies_duration_optimization (
1198+ self ._options .judges
1199+ )
11501200 instructions = build_new_variation_prompt (
11511201 self ._history ,
11521202 self ._options .judges ,
@@ -1156,6 +1206,7 @@ async def _generate_new_variation(
11561206 self ._options .model_choices ,
11571207 self ._options .variable_choices ,
11581208 self ._initial_instructions ,
1209+ optimize_for_duration = optimize_for_duration ,
11591210 )
11601211
11611212 # Create a flat history list (without nested history) to avoid exponential growth
@@ -1486,6 +1537,7 @@ async def _execute_agent_turn(
14861537 variables = optimize_context .current_variables ,
14871538 agent_tools = agent_tools ,
14881539 expected_response = expected_response ,
1540+ agent_duration_ms = agent_duration_ms ,
14891541 )
14901542
14911543 return dataclasses .replace (
@@ -1523,6 +1575,38 @@ def _evaluate_response(self, optimize_context: OptimizationContext) -> bool:
15231575
15241576 return True
15251577
def _evaluate_duration(self, optimize_context: OptimizationContext) -> bool:
    """
    Decide whether a candidate turn was fast enough relative to the baseline.

    The baseline is ``self._history[0].duration_ms``, i.e. the duration recorded
    on the first history entry. The candidate passes only when its own
    ``duration_ms`` is strictly below ``baseline * _DURATION_TOLERANCE``.

    The check is skipped (and reported as passing) whenever it cannot be
    performed: the history is empty, the first history entry has no
    ``duration_ms``, or the candidate context has no ``duration_ms``. Missing
    timing data therefore never causes a failure.

    A failed check is logged at WARNING level together with the candidate
    duration, the baseline, the tolerance as a percentage, and the resulting
    threshold in milliseconds.

    :param optimize_context: The completed turn context containing duration_ms
    :return: True if the duration requirement is met or cannot be checked
    """
    # No baseline recorded: nothing to compare against, so do not block.
    baseline_ms = self._history[0].duration_ms if self._history else None
    if baseline_ms is None:
        return True

    # Candidate timing was not captured: likewise treated as a pass.
    candidate_ms = optimize_context.duration_ms
    if candidate_ms is None:
        return True

    # The candidate has to land strictly under this many milliseconds.
    threshold_ms = baseline_ms * _DURATION_TOLERANCE
    meets_target = candidate_ms < threshold_ms
    if not meets_target:
        logger.warning(
            "[Iteration %d] -> Duration check failed: %.0fms >= baseline %.0fms * %.0f%% (%.0fms)",
            optimize_context.iteration,
            candidate_ms,
            baseline_ms,
            _DURATION_TOLERANCE * 100,
            threshold_ms,
        )
    return meets_target
1609+
15261610 def _handle_success (
15271611 self , optimize_context : OptimizationContext , iteration : int
15281612 ) -> Any :
@@ -1691,6 +1775,11 @@ async def _run_validation_phase(
16911775 else :
16921776 sample_passed = self ._evaluate_response (val_ctx )
16931777
1778+ if sample_passed and _acceptance_criteria_implies_duration_optimization (
1779+ self ._options .judges
1780+ ):
1781+ sample_passed = self ._evaluate_duration (val_ctx )
1782+
16941783 last_ctx = val_ctx
16951784
16961785 if not sample_passed :
@@ -1798,6 +1887,11 @@ async def _run_optimization(
17981887 iteration ,
17991888 )
18001889
1890+ if initial_passed and _acceptance_criteria_implies_duration_optimization (
1891+ self ._options .judges
1892+ ):
1893+ initial_passed = self ._evaluate_duration (optimize_context )
1894+
18011895 if initial_passed :
18021896 all_valid , last_ctx = await self ._run_validation_phase (
18031897 optimize_context , iteration
0 commit comments