PrimeIntellect-ai · hallerite · Jun 4, 2026 · Jun 4, 2026
diff --git a/renderers/base.py b/renderers/base.py
@@ -1037,9 +1037,14 @@ def bridge_to_next_turn(self, *args: Any, **kwargs: Any) -> "RenderedTokens | No
     "moonshotai/Kimi-K2-Instruct": "kimi-k2",
     "moonshotai/Kimi-K2.5": "kimi-k2.5",
     "moonshotai/Kimi-K2.6": "kimi-k2.5",
-    # Nemotron 3.
+    # Nemotron 3. Nano / Super share one chat-template variant; the Ultra
+    # checkpoints use the Ultra variant — the renderer auto-selects it from
+    # the model name (see ``nemotron3._ULTRA_DEFAULTS``). BF16 and FP8 share the
+    # same tokenizer and template.
     "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "nemotron-3",
     "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": "nemotron-3",
+    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": "nemotron-3",
+    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": "nemotron-3",
     # Poolside Laguna.
     "poolside/Laguna-XS.2": "laguna-xs.2",
     # GPT-OSS.

diff --git a/renderers/configs.py b/renderers/configs.py
@@ -337,13 +337,42 @@ class Nemotron3RendererConfig(BaseRendererConfig):
     """When ``True``, the generation prompt includes ``<think>``. Mirrors
     the chat template's ``enable_thinking`` kwarg."""
 
+    ultra: bool | None = None
+    """Select the Nemotron-3 **Ultra** chat-template variant.
+
+    ``None`` (default) auto-detects from the model name (see
+    ``renderers.nemotron3._ULTRA_DEFAULTS``): the Ultra checkpoints resolve
+    to ``True``; Nano / Super and unknown checkpoints to ``False``. Set
+    explicitly to force a variant — e.g. an Ultra fine-tune or a
+    locally-pathed checkpoint whose ``name_or_path`` isn't in the table.
+
+    Ultra's template differs from Nano/Super: the reasoning block is glued
+    as ``<think>\\n{reasoning}</think>{content}`` (no ``\\n`` around
+    ``</think>``), truncated historical turns collapse to
+    ``<think></think>{content}`` (no ``\\n``), and the thinking-truncation
+    boundary follows the template's ``loop.index0 < last_user_idx`` rule
+    (drop thinking on every assistant turn before the last user message).
+
+    Not a chat-template kwarg — it picks which template the renderer
+    mirrors, not a variable passed into one — so it's listed in
+    ``_internal_fields`` and excluded from ``template_field_names()``."""
+
     truncate_history_thinking: bool = True
     """When ``False``, keep ``<think>{reasoning}</think>`` on past-cycle
     assistant turns instead of dropping them. Mirrors the chat
     template's ``truncate_history_thinking`` toggle. OR-composes with
     ``preserve_all_thinking`` / ``preserve_thinking_between_tool_calls``
     — see :class:`BaseRendererConfig` for the contract."""
 
+    # ``ultra`` is a template-variant SELECTOR — it picks which template the
+    # renderer mirrors (Ultra vs Nano/Super), not a variable passed into one;
+    # there is no ``ultra`` Jinja variable. Marked internal so the parity
+    # matrix doesn't cross it as a template field. Same ``_internal_fields``
+    # mechanism DeepSeek-V3 uses for its no-op ``enable_thinking``, for a
+    # different underlying reason (theirs is an ignored kwarg, this is a
+    # variant switch).
+    _internal_fields = frozenset({"ultra"})
+
 
 class DeepSeekV3RendererConfig(BaseRendererConfig):
     """DeepSeek V3 renderer config.

diff --git a/renderers/nemotron3.py b/renderers/nemotron3.py
@@ -75,6 +75,35 @@ def _render_extra_keys(obj: dict[str, Any], handled_keys: set[str]) -> list[str]
     return lines
 
 
+# Per-model ``ultra`` default, applied when the renderer config leaves it
+# ``None``. The Nemotron-3 family ships two chat-template variants: Nano /
+# Super share one; Ultra differs in the reasoning-block glue (no ``\n`` around
+# ``</think>``) and the thinking-truncation boundary (drop thinking on every
+# assistant turn before the last user message). BF16 and FP8 share the same
+# tokenizer and template. The table is hard-coded and keyed by the exact
+# ``tokenizer.name_or_path`` string rather than probed from the live template
+# — the same convention as Qwen3.5's ``_ENABLE_THINKING_DEFAULTS`` (avoids
+# pulling ``apply_chat_template`` onto the construction hot path and keeps
+# bring-your-own-tokenizer use working).
+_ULTRA_DEFAULTS: dict[str, bool] = {
+    "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": False,
+    "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": False,
+    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": True,
+    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": True,
+}
+
+
+def _default_ultra(tokenizer) -> bool:
+    """Return the built-in ``ultra`` default for ``tokenizer``'s checkpoint.
+
+    Looks ``tokenizer.name_or_path`` up in ``_ULTRA_DEFAULTS``. A tokenizer
+    with no such attribute, or a name missing from the table, gets ``False``
+    — callers wanting Ultra there must pass ``ultra=True`` explicitly.
+    """
+    model_name = getattr(tokenizer, "name_or_path", "")
+    return _ULTRA_DEFAULTS.get(model_name, False)
+
+
 class Nemotron3Renderer:
     """Deterministic message → token renderer for Nemotron 3 models."""
 
@@ -84,7 +113,14 @@ def __init__(
         config: Nemotron3RendererConfig | None = None,
     ):
         self._tokenizer = tokenizer
-        self.config = config or Nemotron3RendererConfig()
+        cfg = config or Nemotron3RendererConfig()
+        # ``ultra=None`` defers to the model's known default (see
+        # ``_ULTRA_DEFAULTS``). Materialise here so downstream reads see a
+        # concrete bool; rebind the frozen config with the resolved value so
+        # introspection sees the same.
+        if cfg.ultra is None:
+            cfg = cfg.model_copy(update={"ultra": _default_ultra(tokenizer)})
+        self.config = cfg
 
         # Look up special token IDs from the tokenizer (not hardcoded).
         # <|endoftext|> is optional: Nemotron-3 Nano / Super tokenizers ship
@@ -335,6 +371,17 @@ def emit_text_segments(
                 last_plain_assistant_idx = j
                 break
 
+        # Ultra truncates thinking on every assistant turn *before the last
+        # user message* (template rule ``loop.index0 < last_user_idx``),
+        # whereas Nano/Super preserve only the last plain assistant. Compute
+        # the last-user index over the normalized ``messages`` list (a leading
+        # system never holds a user, so the relative comparison is unaffected).
+        last_user_idx_norm = -1
+        for j in range(len(messages) - 1, -1, -1):
+            if messages[j].get("role") == "user":
+                last_user_idx_norm = j
+                break
+
         # ── 2. Iterate messages ─────────────────────────────────────
         for i, msg in enumerate(messages):
             role = msg["role"]
@@ -360,7 +407,10 @@ def emit_text_segments(
                 emit_text("\n", msg_orig_idx, is_sampled=False, is_content=False)
 
             elif role == "assistant":
-                is_last_turn = i >= last_plain_assistant_idx
+                if self.config.ultra:
+                    is_last_turn = i >= last_user_idx_norm
+                else:
+                    is_last_turn = i >= last_plain_assistant_idx
                 preserve_thinking = msg_orig_idx >= 0 and should_preserve_past_thinking(
                     original_messages,
                     msg_orig_idx,
@@ -617,6 +667,7 @@ def _render_assistant(
             content = after_think_end.lstrip("\n")
 
         reasoning_content = reasoning_content.strip()
+        ultra = self.config.ultra
 
         # ``<|im_start|>assistant\n`` is template-injected scaffolding —
         # at inference the chat template emits these as the generation
@@ -645,28 +696,36 @@ def _render_assistant(
             or not self.config.truncate_history_thinking
         ):
             emit_special(self._think, msg_idx, is_sampled=True, is_content=True)
+            # Ultra: <think>\n{reasoning}</think>{content} (no \n around </think>).
+            # Nano/Super: <think>\n{reasoning}\n</think>\n{content}.
             emit_text(
-                "\n" + reasoning_content + "\n",
+                ("\n" + reasoning_content)
+                if ultra
+                else ("\n" + reasoning_content + "\n"),
                 msg_idx,
                 is_sampled=True,
                 is_content=True,
             )
             emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True)
-            # Single \n separator (not \n\n like Qwen3.5)
+            # Single \n separator (not \n\n like Qwen3.5); Ultra glues directly.
             emit_text(
-                "\n" + content + content_suffix,
+                (content + content_suffix)
+                if ultra
+                else ("\n" + content + content_suffix),
                 msg_idx,
                 is_sampled=True,
                 is_content=True,
             )
         elif reasoning_content:
-            # Historical assistant whose reasoning got stripped — template
-            # keeps a single \n between the collapsed <think></think> and
-            # the content as a marker that reasoning existed.
+            # Historical assistant whose reasoning got stripped. Nano/Super keep
+            # a single \n between the collapsed <think></think> and the content
+            # as a marker that reasoning existed; Ultra glues content directly.
             emit_special(self._think, msg_idx, is_sampled=True, is_content=True)
             emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True)
             emit_text(
-                "\n" + content + content_suffix,
+                (content + content_suffix)
+                if ultra
+                else ("\n" + content + content_suffix),
                 msg_idx,
                 is_sampled=True,
                 is_content=True,

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -33,6 +33,8 @@
     ("moonshotai/Kimi-K2.6", "auto"),
     ("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"),
     ("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "auto"),
+    # Ultra resolves the Ultra template variant via name (auto → ultra=True).
+    ("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"),
     ("poolside/Laguna-XS.2", "auto"),
     ("openai/gpt-oss-20b", "gpt-oss"),
     ("Qwen/Qwen2.5-0.5B-Instruct", "default"),

diff --git a/tests/test_nemotron3_ultra.py b/tests/test_nemotron3_ultra.py
@@ -0,0 +1,59 @@
+"""Offline wiring tests for the Nemotron-3 Ultra template variant.
+
+Assert the name-based ``ultra`` auto-selection, the model→renderer mapping,
+and the typed-config surface WITHOUT loading any tokenizer (no network). This
+pins the wiring the parity matrix can't reach — in particular the FP8 entry,
+which no test loads a tokenizer for — so it can't silently rot.
+"""
+
+from types import SimpleNamespace
+
+from renderers.base import MODEL_RENDERER_MAP
+from renderers.configs import Nemotron3RendererConfig
+from renderers.nemotron3 import _ULTRA_DEFAULTS, _default_ultra
+
+_ULTRA_REPOS = [
+    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16",
+    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8",
+]
+_NON_ULTRA_REPOS = [
+    "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
+    "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16",
+]
+
+
+def _fake_tok(name: str) -> SimpleNamespace:
+    return SimpleNamespace(name_or_path=name)
+
+
+def test_ultra_and_non_ultra_models_map_to_nemotron3():
+    for checkpoint in (*_ULTRA_REPOS, *_NON_ULTRA_REPOS):
+        assert MODEL_RENDERER_MAP.get(checkpoint) == "nemotron-3", checkpoint
+
+
+def test_default_ultra_resolves_by_name():
+    # Every Ultra repo (the gated FP8 one included) is tabled as True.
+    for ultra_repo in _ULTRA_REPOS:
+        assert _ULTRA_DEFAULTS[ultra_repo] is True
+        assert _default_ultra(_fake_tok(ultra_repo)) is True
+    # Nano / Super stay on the shared Nano/Super template, i.e. False.
+    for plain_repo in _NON_ULTRA_REPOS:
+        assert _default_ultra(_fake_tok(plain_repo)) is False
+    # Names absent from the table (fine-tunes, local paths) default to False;
+    # such checkpoints must pass ultra= explicitly to get the Ultra template.
+    for unknown in ("acme/my-nemotron-ultra-ft", "/home/user/local-ckpt"):
+        assert _default_ultra(_fake_tok(unknown)) is False
+    assert _default_ultra(SimpleNamespace()) is False  # no name_or_path attr
+
+
+def test_ultra_is_not_a_template_kwarg():
+    assert "ultra" in Nemotron3RendererConfig._internal_fields
+    names = Nemotron3RendererConfig.template_field_names()
+    assert names == frozenset({"enable_thinking", "truncate_history_thinking"})
+    assert "ultra" not in names
+
+
+def test_ultra_config_default_is_none_and_overridable():
+    assert Nemotron3RendererConfig().ultra is None  # unset => resolved by name
+    for forced in (True, False):
+        assert Nemotron3RendererConfig(ultra=forced).ultra is forced
diff --git a/tests/test_renderer_config_parity.py b/tests/test_renderer_config_parity.py
@@ -55,6 +55,9 @@
     ("moonshotai/Kimi-K2.6", "auto"),
     ("deepseek-ai/DeepSeek-V3", "auto"),
     ("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"),
+    # Ultra: auto-resolves to the Ultra template variant (ultra=True) via the
+    # model name; parity asserted against the Ultra apply_chat_template.
+    ("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"),
     ("poolside/Laguna-XS.2", "auto"),
     ("openai/gpt-oss-20b", "gpt-oss"),
 ]

diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py
@@ -43,6 +43,9 @@
     ("moonshotai/Kimi-K2.6", "auto"),
     ("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"),
     ("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "auto"),
+    # Ultra: parse must recover content after a </think> glued directly to it
+    # (no separating newline) — the Ultra-specific glue stresses the round-trip.
+    ("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"),
     ("poolside/Laguna-XS.2", "auto"),
     ("openai/gpt-oss-20b", "gpt-oss"),
     ("Qwen/Qwen2.5-0.5B-Instruct", "default"),