diff --git a/renderers/base.py b/renderers/base.py
index 45768de..242adae 100644
--- a/renderers/base.py
+++ b/renderers/base.py
@@ -1037,9 +1037,14 @@ def bridge_to_next_turn(self, *args: Any, **kwargs: Any) -> "RenderedTokens | No
"moonshotai/Kimi-K2-Instruct": "kimi-k2",
"moonshotai/Kimi-K2.5": "kimi-k2.5",
"moonshotai/Kimi-K2.6": "kimi-k2.5",
- # Nemotron 3.
+ # Nemotron 3. Nano / Super share one chat-template variant; the Ultra
+ # checkpoints use the Ultra variant — the renderer auto-selects it from
+ # the model name (see ``nemotron3._ULTRA_DEFAULTS``). BF16 and FP8 share the
+ # same tokenizer and template.
"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "nemotron-3",
"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": "nemotron-3",
+ "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": "nemotron-3",
+ "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": "nemotron-3",
# Poolside Laguna.
"poolside/Laguna-XS.2": "laguna-xs.2",
# GPT-OSS.
diff --git a/renderers/configs.py b/renderers/configs.py
index e0098ba..2c18a17 100644
--- a/renderers/configs.py
+++ b/renderers/configs.py
@@ -337,6 +337,26 @@ class Nemotron3RendererConfig(BaseRendererConfig):
"""When ``True``, the generation prompt includes ``<think>``. Mirrors
the chat template's ``enable_thinking`` kwarg."""
+ ultra: bool | None = None
+ """Select the Nemotron-3 **Ultra** chat-template variant.
+
+ ``None`` (default) auto-detects from the model name (see
+ ``renderers.nemotron3._ULTRA_DEFAULTS``): the Ultra checkpoints resolve
+ to ``True``; Nano / Super and unknown checkpoints to ``False``. Set
+ explicitly to force a variant — e.g. an Ultra fine-tune or a
+ locally-pathed checkpoint whose ``name_or_path`` isn't in the table.
+
+ Ultra's template differs from Nano/Super: the reasoning block is glued
+ as ``<think>\\n{reasoning}</think>{content}`` (no ``\\n`` around
+ ``</think>``), truncated historical turns collapse to
+ ``<think></think>{content}`` (no ``\\n``), and the thinking-truncation
+ boundary follows the template's ``loop.index0 < last_user_idx`` rule
+ (drop thinking on every assistant turn before the last user message).
+
+ Not a chat-template kwarg — it picks which template the renderer
+ mirrors, not a variable passed into one — so it's listed in
+ ``_internal_fields`` and excluded from ``template_field_names()``."""
+
truncate_history_thinking: bool = True
"""When ``False``, keep ``<think>{reasoning}</think>`` on past-cycle
assistant turns instead of dropping them. Mirrors the chat
@@ -344,6 +364,15 @@ class Nemotron3RendererConfig(BaseRendererConfig):
``preserve_all_thinking`` / ``preserve_thinking_between_tool_calls``
— see :class:`BaseRendererConfig` for the contract."""
+ # ``ultra`` is a template-variant SELECTOR — it picks which template the
+ # renderer mirrors (Ultra vs Nano/Super), not a variable passed into one;
+ # there is no ``ultra`` Jinja variable. Marked internal so the parity
+ # matrix doesn't cross it as a template field. Same ``_internal_fields``
+ # mechanism DeepSeek-V3 uses for its no-op ``enable_thinking``, for a
+ # different underlying reason (theirs is an ignored kwarg, this is a
+ # variant switch).
+ _internal_fields = frozenset({"ultra"})
+
class DeepSeekV3RendererConfig(BaseRendererConfig):
"""DeepSeek V3 renderer config.
diff --git a/renderers/nemotron3.py b/renderers/nemotron3.py
index e6398b5..b735cde 100644
--- a/renderers/nemotron3.py
+++ b/renderers/nemotron3.py
@@ -75,6 +75,35 @@ def _render_extra_keys(obj: dict[str, Any], handled_keys: set[str]) -> list[str]
return lines
+# Per-model ``ultra`` default, applied when the renderer config leaves it
+# ``None``. The Nemotron-3 family ships two chat-template variants: Nano /
+# Super share one; Ultra differs in the reasoning-block glue (no ``\n`` around
+# ``</think>``) and the thinking-truncation boundary (drop thinking on every
+# assistant turn before the last user message). BF16 and FP8 share the same
+# tokenizer and template. Hard-coded and keyed by
+# ``tokenizer.name_or_path`` rather than probed from the live template — the
+# same convention as Qwen3.5's ``_ENABLE_THINKING_DEFAULTS`` (avoids pulling
+# ``apply_chat_template`` onto the construction hot path and keeps
+# bring-your-own-tokenizer use working).
+_ULTRA_DEFAULTS: dict[str, bool] = {
+ "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": False,
+ "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": False,
+ "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": True,
+ "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": True,
+}
+
+
+def _default_ultra(tokenizer) -> bool:
+    """Return the name-based ``ultra`` default for ``tokenizer``'s model.
+
+    ``tokenizer.name_or_path`` is looked up in ``_ULTRA_DEFAULTS``. A missing
+    attribute or an unlisted name (fine-tune, local path) yields ``False`` —
+    the Nano / Super template — so such Ultra checkpoints need ``ultra=True``.
+    """
+    model_name = getattr(tokenizer, "name_or_path", "")
+    return _ULTRA_DEFAULTS.get(model_name, False)
+
+
class Nemotron3Renderer:
"""Deterministic message → token renderer for Nemotron 3 models."""
@@ -84,7 +113,14 @@ def __init__(
config: Nemotron3RendererConfig | None = None,
):
self._tokenizer = tokenizer
- self.config = config or Nemotron3RendererConfig()
+ cfg = config or Nemotron3RendererConfig()
+ # ``ultra=None`` defers to the model's known default (see
+ # ``_ULTRA_DEFAULTS``). Materialise here so downstream reads see a
+ # concrete bool; rebind the frozen config with the resolved value so
+ # introspection sees the same.
+ if cfg.ultra is None:
+ cfg = cfg.model_copy(update={"ultra": _default_ultra(tokenizer)})
+ self.config = cfg
# Look up special token IDs from the tokenizer (not hardcoded).
# <|endoftext|> is optional: Nemotron-3 Nano / Super tokenizers ship
@@ -335,6 +371,17 @@ def emit_text_segments(
last_plain_assistant_idx = j
break
+ # Ultra truncates thinking on every assistant turn *before the last
+ # user message* (template rule ``loop.index0 < last_user_idx``),
+ # whereas Nano/Super preserve only the last plain assistant. Compute
+ # the last-user index over the normalized ``messages`` list (a leading
+ # system never holds a user, so the relative comparison is unaffected).
+ last_user_idx_norm = -1
+ for j in range(len(messages) - 1, -1, -1):
+ if messages[j].get("role") == "user":
+ last_user_idx_norm = j
+ break
+
# ── 2. Iterate messages ─────────────────────────────────────
for i, msg in enumerate(messages):
role = msg["role"]
@@ -360,7 +407,10 @@ def emit_text_segments(
emit_text("\n", msg_orig_idx, is_sampled=False, is_content=False)
elif role == "assistant":
- is_last_turn = i >= last_plain_assistant_idx
+ if self.config.ultra:
+ is_last_turn = i >= last_user_idx_norm
+ else:
+ is_last_turn = i >= last_plain_assistant_idx
preserve_thinking = msg_orig_idx >= 0 and should_preserve_past_thinking(
original_messages,
msg_orig_idx,
@@ -617,6 +667,7 @@ def _render_assistant(
content = after_think_end.lstrip("\n")
reasoning_content = reasoning_content.strip()
+ ultra = self.config.ultra
# ``<|im_start|>assistant\n`` is template-injected scaffolding —
# at inference the chat template emits these as the generation
@@ -645,28 +696,36 @@ def _render_assistant(
or not self.config.truncate_history_thinking
):
emit_special(self._think, msg_idx, is_sampled=True, is_content=True)
+ # Ultra: <think>\n{reasoning}</think>{content} (no \n around </think>).
+ # Nano/Super: <think>\n{reasoning}\n</think>\n{content}.
emit_text(
- "\n" + reasoning_content + "\n",
+ ("\n" + reasoning_content)
+ if ultra
+ else ("\n" + reasoning_content + "\n"),
msg_idx,
is_sampled=True,
is_content=True,
)
emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True)
- # Single \n separator (not \n\n like Qwen3.5)
+ # Single \n separator (not \n\n like Qwen3.5); Ultra glues directly.
emit_text(
- "\n" + content + content_suffix,
+ (content + content_suffix)
+ if ultra
+ else ("\n" + content + content_suffix),
msg_idx,
is_sampled=True,
is_content=True,
)
elif reasoning_content:
- # Historical assistant whose reasoning got stripped — template
- # keeps a single \n between the collapsed <think></think> and
- # the content as a marker that reasoning existed.
+ # Historical assistant whose reasoning got stripped. Nano/Super keep
+ # a single \n between the collapsed <think></think> and the content
+ # as a marker that reasoning existed; Ultra glues content directly.
emit_special(self._think, msg_idx, is_sampled=True, is_content=True)
emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True)
emit_text(
- "\n" + content + content_suffix,
+ (content + content_suffix)
+ if ultra
+ else ("\n" + content + content_suffix),
msg_idx,
is_sampled=True,
is_content=True,
diff --git a/tests/conftest.py b/tests/conftest.py
index c334430..4266487 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -33,6 +33,8 @@
("moonshotai/Kimi-K2.6", "auto"),
("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"),
("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "auto"),
+ # Ultra resolves the Ultra template variant via name (auto → ultra=True).
+ ("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"),
("poolside/Laguna-XS.2", "auto"),
("openai/gpt-oss-20b", "gpt-oss"),
("Qwen/Qwen2.5-0.5B-Instruct", "default"),
diff --git a/tests/test_nemotron3_ultra.py b/tests/test_nemotron3_ultra.py
new file mode 100644
index 0000000..7716d15
--- /dev/null
+++ b/tests/test_nemotron3_ultra.py
@@ -0,0 +1,59 @@
+"""Offline wiring tests for the Nemotron-3 Ultra template variant.
+
+Assert the name-based ``ultra`` auto-selection, the model→renderer mapping,
+and the typed-config surface WITHOUT loading any tokenizer (no network). This
+pins the wiring the parity matrix can't reach — in particular the FP8 entry,
+which no test loads a tokenizer for — so it can't silently rot.
+"""
+
+from types import SimpleNamespace
+
+from renderers.base import MODEL_RENDERER_MAP
+from renderers.configs import Nemotron3RendererConfig
+from renderers.nemotron3 import _ULTRA_DEFAULTS, _default_ultra
+
+_ULTRA_REPOS = [  # names expected to auto-resolve to ultra=True
+ "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16",
+ "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8",
+]
+_NON_ULTRA_REPOS = [  # names expected to auto-resolve to ultra=False
+ "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
+ "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16",
+]
+
+
+def _fake_tok(name: str) -> SimpleNamespace:
+ return SimpleNamespace(name_or_path=name)
+
+
+def test_ultra_and_non_ultra_models_map_to_nemotron3():
+    for model_id in [*_ULTRA_REPOS, *_NON_ULTRA_REPOS]:
+        assert MODEL_RENDERER_MAP.get(model_id) == "nemotron-3", model_id
+
+
+def test_default_ultra_resolves_by_name():
+    # Listed Ultra repos (BF16 and the gated FP8 one) auto-select the variant.
+    for ultra_repo in _ULTRA_REPOS:
+        assert _ULTRA_DEFAULTS[ultra_repo] is True
+        assert _default_ultra(_fake_tok(ultra_repo)) is True
+    # Nano / Super stay on the shared non-Ultra template.
+    for plain_repo in _NON_ULTRA_REPOS:
+        assert _default_ultra(_fake_tok(plain_repo)) is False
+    # Names missing from the table fall back to False, so fine-tunes and local
+    # paths that need the Ultra template must pass ``ultra=`` explicitly.
+    for unlisted in ("acme/my-nemotron-ultra-ft", "/home/user/local-ckpt"):
+        assert _default_ultra(_fake_tok(unlisted)) is False
+    assert _default_ultra(SimpleNamespace()) is False  # no name_or_path attr
+
+
+def test_ultra_is_not_a_template_kwarg():
+    cfg_cls = Nemotron3RendererConfig
+    names = cfg_cls.template_field_names()
+    assert "ultra" not in names and "ultra" in cfg_cls._internal_fields
+    assert names == frozenset({"enable_thinking", "truncate_history_thinking"})
+
+
+def test_ultra_config_default_is_none_and_overridable():
+    assert Nemotron3RendererConfig().ultra is None  # unset => auto-detect by name
+    for forced in (True, False):
+        assert Nemotron3RendererConfig(ultra=forced).ultra is forced
diff --git a/tests/test_renderer_config_parity.py b/tests/test_renderer_config_parity.py
index 8ca2da3..1827a9c 100644
--- a/tests/test_renderer_config_parity.py
+++ b/tests/test_renderer_config_parity.py
@@ -55,6 +55,9 @@
("moonshotai/Kimi-K2.6", "auto"),
("deepseek-ai/DeepSeek-V3", "auto"),
("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"),
+ # Ultra: auto-resolves to the Ultra template variant (ultra=True) via the
+ # model name; parity asserted against the Ultra apply_chat_template.
+ ("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"),
("poolside/Laguna-XS.2", "auto"),
("openai/gpt-oss-20b", "gpt-oss"),
]
diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py
index 383bc14..7d7ee36 100644
--- a/tests/test_roundtrip.py
+++ b/tests/test_roundtrip.py
@@ -43,6 +43,9 @@
("moonshotai/Kimi-K2.6", "auto"),
("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"),
("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "auto"),
+ # Ultra: parse must recover content after a ``</think>`` glued directly to it
+ # (no separating newline) — the Ultra-specific glue stresses the round-trip.
+ ("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"),
("poolside/Laguna-XS.2", "auto"),
("openai/gpt-oss-20b", "gpt-oss"),
("Qwen/Qwen2.5-0.5B-Instruct", "default"),