data-fair · albanm · Jun 29, 2026 · Jun 29, 2026
diff --git a/api/doc/settings/put-req/.type/index.d.ts b/api/doc/settings/put-req/.type/index.d.ts
@@ -303,7 +303,7 @@ export type Model3 = {
  *
  * The "gatekeeper." Classifies each new user message for profanity, prompt-injection, persona override, and out-of-scope requests. Should be fast and cheap — it sits on the critical path to the first response token.
  *
- * Recommendations: a small/fast general-purpose model with structured (JSON) output support, e.g. Claude 4.5 Haiku, GPT-5.4 Mini, Mistral Small 4, Qwen3 (4B). Dedicated moderation classifiers (Llama Guard, moderation APIs) are not compatible: they use fixed taxonomies and output formats that cannot express this platform's custom policy. Avoid reasoning ("thinking") models: the verdict is capped at a small token budget and a short latency window, both of which a reasoning model spends on hidden reasoning instead of the JSON verdict — it then returns nothing usable in time and moderation silently fails open (every message is allowed).
+ * Recommendations: a small/fast general-purpose model with structured (JSON) output support, e.g. Claude 4.5 Haiku, GPT-5.4 Mini, Mistral Small 4, Qwen3 (4B). Dedicated moderation classifiers (Llama Guard, moderation APIs) are not compatible: they use fixed taxonomies and output formats that cannot express this platform's custom policy.
  */
 export type Moderator = {
   model?: Model4;

diff --git a/api/doc/settings/put-req/.type/validate.js b/api/doc/settings/put-req/.type/validate.js
diff --git a/api/src/models/mock-model.ts b/api/src/models/mock-model.ts
@@ -253,6 +253,18 @@ function processForModel (modelId: string, options: { prompt: string | Array<any
   // Reasoning seam: emit reasoning tokens before the answer (exercises the gateway's
   // reasoning_content forwarding and the client's reasoning capture).
   if (lastMessage.toLowerCase() === 'reason') return { type: 'text', text: 'world', reasoning: 'Let me think about it.' }
+  // Step-budget close-out seams (exercise the sub-agent loop → close-out path).
+  // A task of "loop forever" (trimmed, case-insensitive) makes the model emit a tool
+  // call on EVERY step (ignoring prior tool results), so a sub-agent's ToolLoopAgent
+  // runs to its stepCountIs cap and finishes on 'tool-calls'. The harness then issues a
+  // no-tools close-out turn; the second seam recognizes the close-out prompt and returns a
+  // distinctive best-effort answer the test asserts was recovered (not a bare truncation notice).
+  if (lastMessage.trim().toLowerCase() === 'loop forever') {
+    return { type: 'tool-call', toolName: 'get_schema', toolArgs: '{"dataset":"test"}' }
+  }
+  if (/reached your step budget/i.test(lastMessage)) {
+    return { type: 'text', text: 'Closed-out best-effort summary from gathered data.' }
+  }
   switch (modelId) {
     case 'mock-tools':
       return processMockToolsPrompt(lastMessage, options.prompt)
@@ -291,10 +303,13 @@ export function createMockLanguageModel (modelId: string = 'mock-model'): Langua
       if (result.type === 'tool-call') {
         const calls = result.toolCalls ?? [{ toolName: result.toolName!, toolArgs: result.toolArgs || '{}' }]
         const usage = buildUsage(promptText, JSON.stringify(calls))
+        // Salt the id with the prompt length so a multi-step loop (same tool every step)
+        // gets a distinct id per step instead of reusing one across the accumulated history.
+        const callIdSalt = Array.isArray(options.prompt) ? options.prompt.length : 0
         const stream = new ReadableStream<LanguageModelV3StreamPart>({
           start (controller) {
             calls.forEach((call, idx) => {
-              const toolCallId = `mock-tool-call-id-${idx}`
+              const toolCallId = `mock-tool-call-id-${callIdSalt}-${idx}`
               controller.enqueue({ type: 'tool-input-start', id: toolCallId, toolName: call.toolName })
               controller.enqueue({ type: 'tool-input-delta', id: toolCallId, delta: call.toolArgs || '{}' })
               controller.enqueue({ type: 'tool-input-end', id: toolCallId })
@@ -354,10 +369,11 @@ export function createMockLanguageModel (modelId: string = 'mock-model'): Langua
 
       if (result.type === 'tool-call') {
         const calls = result.toolCalls ?? [{ toolName: result.toolName!, toolArgs: result.toolArgs || '{}' }]
+        const callIdSalt = Array.isArray(options.prompt) ? options.prompt.length : 0
         return {
           content: calls.map((call, idx) => ({
             type: 'tool-call' as const,
-            toolCallId: `mock-tool-call-id-${idx}`,
+            toolCallId: `mock-tool-call-id-${callIdSalt}-${idx}`,
             toolName: call.toolName,
             input: call.toolArgs ? JSON.parse(call.toolArgs) : {}
           })),

diff --git a/docs/architecture/sub-agents.md b/docs/architecture/sub-agents.md
@@ -123,10 +123,23 @@ sequenceDiagram
 
 Each `ToolLoopAgent` is configured with:
 - **model** — resolved from `provider.chat(config.model ?? 'tools')`
-- **instructions** — the sub-agent's system prompt
+- **instructions** — the sub-agent's system prompt (the host-provided prompt verbatim)
 - **tools** — the reserved tool set (only tools listed in `config.tools`)
 - **stopWhen** — `stepCountIs(10)` (max 10 autonomous steps)
 
+### Step-budget close-out
+
+A weak worker can keep calling tools after it already has the answer, exhausting the 10-step
+cap with `finishReason: 'tool-calls'`. Killing it there would discard a result it already
+produced and report a bare truncation. Instead, on that finish reason the orchestrator runs
+**one final close-out turn with no tools** (`SUBAGENT_CLOSEOUT_PROMPT` via `generateText`):
+the model cannot loop, so it must synthesize a best-effort answer from its own transcript.
+That recovered answer is carried as the trailing message content (flagged `stepLimitReached`)
+and surfaced to the lead as a *partial* result rather than a failure. Only if the close-out
+call fails or yields no text does the worker fall back to the standalone step-limit notice.
+This mirrors AutoGen's `reflection_with_llm` summary mode: force a synthesis rather than
+discard the run. See [§6](#6-context-reduction) for how the partial result reaches the lead.
+
 Sub-agents run **concurrently** when the main agent requests several in one step — the AI SDK dispatches each tool call without awaiting the previous (`executeToolCall` is fired and tracked, not awaited). Each call streams into its own panel, keyed by the delegating `toolCallId`. Because workers are stateless (§5), there is nothing to serialize: even two concurrent calls to the same sub-agent run in parallel.
 
 ---
@@ -200,19 +213,24 @@ internal step loop (`stepCountIs(10)`), which is orthogonal to cross-call statel
 
 ## 6. Context Reduction
 
-The main agent never sees the full sub-agent trace. `toModelOutput()` extracts only the last assistant message's text:
+The main agent never sees the full sub-agent trace. `toModelOutput()` delegates to
+`subAgentModelOutput()`, which decides what single text the lead receives from the worker's
+message list. It is **not** a plain "last message content" read — the trailing message can
+carry flags that change the meaning:
 
 ```typescript
-toModelOutput: ({ output }) => {
-  const lastMsg = Array.isArray(output) ? output[output.length - 1] : null
-  return {
-    type: 'text',
-    value: (lastMsg as ChatMessage | null)?.content || 'Task completed.'
-  }
-}
+toModelOutput: ({ output }) => ({ type: 'text', value: subAgentModelOutput(output) })
+
+// subAgentModelOutput, by trailing-message flag:
+//   moderationBlocked → SUBAGENT_MODERATION_NOTICE  (a content-policy decision)
+//   stepLimitReached  → close-out answer prefixed with SUBAGENT_PARTIAL_PREFIX,
+//                       or SUBAGENT_STEP_LIMIT_NOTICE if nothing was recovered
+//   otherwise         → the trailing message text, or SUBAGENT_DONE_FALLBACK if empty
 ```
 
-This keeps the main agent's context window lean — a sub-agent that made 8 tool calls across 5 steps produces a single paragraph of text in the main conversation. The full trace is visible in the UI via `ChatMessage.subAgentPanels`, rendered per delegating tool-call as a bordered, state-colored panel that is **collapsed by default** and expanded on demand (its live activity shows in the header even while collapsed). A per-user **"Simplify sub-agent display"** flag (`simpleSubAgents`, on by default) instead renders each delegation as a plain status chip, like any other tool call; the full-panel view is the opt-in. Panels never auto-open.
+This mirrors OpenAI's `as_tool(custom_output_extractor=…)` hook: the deliverable is chosen
+from the run rather than blindly taken as the raw last message. It keeps the main agent's
+context window lean — a sub-agent that made 8 tool calls across 5 steps produces a single paragraph of text in the main conversation. The full trace is visible in the UI via `ChatMessage.subAgentPanels`, rendered per delegating tool-call as a bordered, state-colored panel that is **collapsed by default** and expanded on demand (its live activity shows in the header even while collapsed). A per-user **"Simplify sub-agent display"** flag (`simpleSubAgents`, on by default) instead renders each delegation as a plain status chip, like any other tool call; the full-panel view is the opt-in. Panels never auto-open.
 
 This also reduces pressure on the 24,000-character compaction threshold (see [Conversation history compaction](./compaction.md)).
 

diff --git a/tests/features/chat-subagent/chat-subagent.e2e.spec.ts b/tests/features/chat-subagent/chat-subagent.e2e.spec.ts
@@ -135,4 +135,30 @@ test.describe('Chat Sub-Agent UI', () => {
     await expect(body).toBeVisible({ timeout: 5000 })
     await expect(body.locator('.v-chip', { hasText: 'get_schema' }).first()).toBeVisible({ timeout: 5000 })
   })
+
+  test('Sub-agent that loops to its step cap recovers a close-out answer', async ({ page, goToWithAuth }) => {
+    await seedFullPanelCookie(page)
+    await goToWithAuth('/agents/_dev/chat-subagent', 'test-standalone1')
+    await waitForToolsReady(page, 'data_analyst (2 tools)', true)
+
+    // Task "loop forever" makes the sub-agent call a tool on every step until it hits
+    // the stepCountIs(10) cap (finishReason: tool-calls). The harness then runs ONE
+    // no-tools close-out turn that synthesizes a best-effort answer from what it
+    // gathered — instead of discarding the run and reporting a bare truncation.
+    await page.getByPlaceholder('Type your message...').fill('call tool subagent_data_analyst {"task":"loop forever"}')
+    await page.getByRole('button', { name: 'Send' }).click()
+
+    const panel = page.locator('.agent-chat').getByTestId('subagent-panel').first()
+    await expect(panel).toBeVisible({ timeout: 20000 })
+
+    // The looping + close-out must terminate the turn (input re-enabled), not hang.
+    await expect(page.getByPlaceholder('Type your message...')).toBeEnabled({ timeout: 20000 })
+
+    // Expand and confirm the recovered close-out answer is rendered in the panel —
+    // proving the answer survived the step cap rather than being lost to a notice.
+    await panel.getByTestId('subagent-panel-header').click()
+    const body = panel.getByTestId('subagent-panel-body')
+    await expect(body).toBeVisible({ timeout: 5000 })
+    await expect(body.getByText('Closed-out best-effort summary from gathered data.')).toBeVisible({ timeout: 5000 })
+  })
 })
diff --git a/tests/features/chat-subagent/subagent-output.unit.spec.ts b/tests/features/chat-subagent/subagent-output.unit.spec.ts
@@ -4,6 +4,7 @@ import {
   subAgentModelOutput,
   SUBAGENT_MODERATION_NOTICE,
   SUBAGENT_STEP_LIMIT_NOTICE,
+  SUBAGENT_PARTIAL_PREFIX,
   SUBAGENT_DONE_FALLBACK
 } from '../../../ui/src/composables/agent-subagent-output.ts'
 
@@ -16,12 +17,24 @@ test.describe('subAgentModelOutput (what the lead receives)', () => {
     assert.equal(out, 'Sales grew 15% YoY.')
   })
 
-  test('step-limit truncation reports a partial notice, not a success', () => {
-    // The regression: a worker cut off at its step cap ends on an empty/flagged
-    // message and must NOT be reported as 'Task completed.'.
+  test('step-limit with a recovered close-out answer hands the lead the data, flagged partial', () => {
+    // The forced close-out turn synthesized a real answer despite the step cap: the lead
+    // must receive that answer (not a bare notice), prefixed so it treats it as partial.
     const out = subAgentModelOutput([
       { role: 'assistant', content: 'querying…' },
-      { role: 'assistant', content: SUBAGENT_STEP_LIMIT_NOTICE, stepLimitReached: true }
+      { role: 'assistant', content: 'Found 68 school-calendar rows for the Rennes academy.', stepLimitReached: true }
+    ])
+    assert.ok(out.startsWith(SUBAGENT_PARTIAL_PREFIX))
+    assert.ok(out.includes('Found 68 school-calendar rows for the Rennes academy.'))
+    assert.notEqual(out, SUBAGENT_DONE_FALLBACK)
+  })
+
+  test('step-limit with no recovered content falls back to the standalone notice', () => {
+    // The close-out call itself failed (empty content): report the truncation rather
+    // than fabricate a result, and never disguise it as 'Task completed.'.
+    const out = subAgentModelOutput([
+      { role: 'assistant', content: 'querying…' },
+      { role: 'assistant', content: '', stepLimitReached: true }
     ])
     assert.equal(out, SUBAGENT_STEP_LIMIT_NOTICE)
     assert.notEqual(out, SUBAGENT_DONE_FALLBACK)

diff --git a/ui/dts/auto-imports.d.ts b/ui/dts/auto-imports.d.ts
@@ -21,6 +21,7 @@ declare global {
   const SELECT_TOOL_NAME: typeof import('../src/composables/tool-exploration')['SELECT_TOOL_NAME']
   const SUBAGENT_DONE_FALLBACK: typeof import('../src/composables/agent-subagent-output')['SUBAGENT_DONE_FALLBACK']
   const SUBAGENT_MODERATION_NOTICE: typeof import('../src/composables/agent-subagent-output')['SUBAGENT_MODERATION_NOTICE']
+  const SUBAGENT_PARTIAL_PREFIX: typeof import('../src/composables/agent-subagent-output')['SUBAGENT_PARTIAL_PREFIX']
   const SUBAGENT_STEP_LIMIT_NOTICE: typeof import('../src/composables/agent-subagent-output')['SUBAGENT_STEP_LIMIT_NOTICE']
   const activityLabelKey: typeof import('../src/composables/agent-activity')['activityLabelKey']
   const appendStreamingCaret: typeof import('../src/utils/markdown')['appendStreamingCaret']
@@ -184,6 +185,7 @@ declare module 'vue' {
     readonly SELECT_TOOL_NAME: UnwrapRef<typeof import('../src/composables/tool-exploration')['SELECT_TOOL_NAME']>
     readonly SUBAGENT_DONE_FALLBACK: UnwrapRef<typeof import('../src/composables/agent-subagent-output')['SUBAGENT_DONE_FALLBACK']>
     readonly SUBAGENT_MODERATION_NOTICE: UnwrapRef<typeof import('../src/composables/agent-subagent-output')['SUBAGENT_MODERATION_NOTICE']>
+    readonly SUBAGENT_PARTIAL_PREFIX: UnwrapRef<typeof import('../src/composables/agent-subagent-output')['SUBAGENT_PARTIAL_PREFIX']>
     readonly SUBAGENT_STEP_LIMIT_NOTICE: UnwrapRef<typeof import('../src/composables/agent-subagent-output')['SUBAGENT_STEP_LIMIT_NOTICE']>
     readonly activityLabelKey: UnwrapRef<typeof import('../src/composables/agent-activity')['activityLabelKey']>
     readonly appendStreamingCaret: UnwrapRef<typeof import('../src/utils/markdown')['appendStreamingCaret']>

diff --git a/ui/src/composables/agent-subagent-output.ts b/ui/src/composables/agent-subagent-output.ts
@@ -6,9 +6,11 @@
  * flags that change what the lead should be told, so this is not a plain
  * "last message content" read:
  *  - `moderationBlocked` → a content-policy decision, not a finished task.
- *  - `stepLimitReached`  → the worker was truncated at its step cap mid-tool-chain;
- *    the trailing message is empty, so without this it would fall through to the
- *    'Task completed.' fallback and report a truncation as a success.
+ *  - `stepLimitReached`  → the worker exhausted its step cap while still calling tools.
+ *    A forced close-out turn (see use-agent-chat) usually recovers a best-effort answer,
+ *    carried as the message content: the lead gets that answer, prefixed so it treats it
+ *    as possibly incomplete. Only when nothing was recovered (empty content) does it fall
+ *    back to the standalone notice — without which it would report a truncation as success.
  */
 
 // Handed to the main agent when the sub-agent's own gateway call was blocked by
@@ -23,6 +25,12 @@ export const SUBAGENT_STEP_LIMIT_NOTICE = 'The sub-agent reached its step limit
 // (e.g. ended on a tool result). A genuine, if uninformative, completion.
 export const SUBAGENT_DONE_FALLBACK = 'Task completed.'
 
+// Prepended to a recovered close-out answer. The worker exceeded its step budget, but a
+// final no-tools turn synthesized a best-effort answer from what it had already gathered.
+// The lead receives that answer flagged as possibly incomplete, so it can use the data
+// without treating it as authoritative or assuming the task fully succeeded.
+export const SUBAGENT_PARTIAL_PREFIX = '[Partial result — the sub-agent reached its step budget. The findings below are what it gathered; treat them as possibly incomplete and verify before relying on them.]'
+
 interface SubAgentOutputMessage {
   content?: string
   moderationBlocked?: boolean
@@ -34,6 +42,13 @@ export function subAgentModelOutput (output: unknown): string {
   const messages = Array.isArray(output) ? output as SubAgentOutputMessage[] : []
   const lastMsg = messages.length ? messages[messages.length - 1] : null
   if (lastMsg?.moderationBlocked) return SUBAGENT_MODERATION_NOTICE
-  if (lastMsg?.stepLimitReached) return SUBAGENT_STEP_LIMIT_NOTICE
+  if (lastMsg?.stepLimitReached) {
+    // The forced close-out turn (use-agent-chat) carries its synthesized answer as the
+    // message content. Hand that to the lead, prefixed as possibly incomplete. Only when
+    // the close-out produced nothing (content missing, empty, or whitespace-only) do we fall
+    // back to the standalone notice — i.e. report the truncation rather than fabricate a result.
+    const body = lastMsg.content?.trim()
+    return body ? `${SUBAGENT_PARTIAL_PREFIX}\n\n${body}` : SUBAGENT_STEP_LIMIT_NOTICE
+  }
   return lastMsg?.content || SUBAGENT_DONE_FALLBACK
 }