Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion renderers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1037,9 +1037,14 @@ def bridge_to_next_turn(self, *args: Any, **kwargs: Any) -> "RenderedTokens | No
"moonshotai/Kimi-K2-Instruct": "kimi-k2",
"moonshotai/Kimi-K2.5": "kimi-k2.5",
"moonshotai/Kimi-K2.6": "kimi-k2.5",
# Nemotron 3.
# Nemotron 3. Nano / Super share one chat-template variant; the Ultra
# checkpoints use the Ultra variant — the renderer auto-selects it from
# the model name (see ``nemotron3._ULTRA_DEFAULTS``). BF16 and FP8 share the
# same tokenizer and template.
"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "nemotron-3",
"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": "nemotron-3",
"nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": "nemotron-3",
"nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": "nemotron-3",
# Poolside Laguna.
"poolside/Laguna-XS.2": "laguna-xs.2",
# GPT-OSS.
Expand Down
29 changes: 29 additions & 0 deletions renderers/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,13 +337,42 @@ class Nemotron3RendererConfig(BaseRendererConfig):
"""When ``True``, the generation prompt includes ``<think>``. Mirrors
the chat template's ``enable_thinking`` kwarg."""

ultra: bool | None = None
"""Select the Nemotron-3 **Ultra** chat-template variant.

``None`` (default) auto-detects from the model name (see
``renderers.nemotron3._ULTRA_DEFAULTS``): the Ultra checkpoints resolve
to ``True``; Nano / Super and unknown checkpoints to ``False``. Set
explicitly to force a variant — e.g. an Ultra fine-tune or a
locally-pathed checkpoint whose ``name_or_path`` isn't in the table.

Ultra's template differs from Nano/Super: the reasoning block is glued
as ``<think>\\n{reasoning}</think>{content}`` (no ``\\n`` around
``</think>``), truncated historical turns collapse to
``<think></think>{content}`` (no ``\\n``), and the thinking-truncation
boundary follows the template's ``loop.index0 < last_user_idx`` rule
(drop thinking on every assistant turn before the last user message).

Not a chat-template kwarg — it picks which template the renderer
mirrors, not a variable passed into one — so it's listed in
``_internal_fields`` and excluded from ``template_field_names()``."""

truncate_history_thinking: bool = True
"""When ``False``, keep ``<think>{reasoning}</think>`` on past-cycle
assistant turns instead of dropping them. Mirrors the chat
template's ``truncate_history_thinking`` toggle. OR-composes with
``preserve_all_thinking`` / ``preserve_thinking_between_tool_calls``
— see :class:`BaseRendererConfig` for the contract."""

# ``ultra`` is a template-variant SELECTOR — it picks which template the
# renderer mirrors (Ultra vs Nano/Super), not a variable passed into one;
# there is no ``ultra`` Jinja variable. Marked internal so the parity
# matrix doesn't cross it as a template field. Same ``_internal_fields``
# mechanism DeepSeek-V3 uses for its no-op ``enable_thinking``, for a
# different underlying reason (theirs is an ignored kwarg, this is a
# variant switch).
_internal_fields = frozenset({"ultra"})


class DeepSeekV3RendererConfig(BaseRendererConfig):
"""DeepSeek V3 renderer config.
Expand Down
77 changes: 68 additions & 9 deletions renderers/nemotron3.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,35 @@ def _render_extra_keys(obj: dict[str, Any], handled_keys: set[str]) -> list[str]
return lines


# Per-model ``ultra`` default, applied when the renderer config leaves it
# ``None``. The Nemotron-3 family ships two chat-template variants: Nano /
# Super share one; Ultra differs in the reasoning-block glue (no ``\n`` around
# ``</think>``) and the thinking-truncation boundary (drop thinking on every
# assistant turn before the last user message). BF16 and FP8 share the same
# tokenizer and template.
#
# The table is hard-coded and keyed by ``tokenizer.name_or_path`` rather than
# probed from the live template — the same convention as Qwen3.5's
# ``_ENABLE_THINKING_DEFAULTS`` (avoids pulling ``apply_chat_template`` onto
# the construction hot path and keeps bring-your-own-tokenizer use working).
#
# Keys are matched exactly by ``_default_ultra``; a name that is not listed
# here resolves to ``False`` there, so fine-tunes and local paths need an
# explicit ``ultra=`` in the renderer config.
_ULTRA_DEFAULTS: dict[str, bool] = {
    "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": False,
    "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": False,
    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": True,
    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": True,
}


def _default_ultra(tokenizer) -> bool:
    """Resolve the ``ultra`` default from the tokenizer's model name.

    Reads ``tokenizer.name_or_path`` (treated as ``""`` when the attribute is
    absent) and looks it up in ``_ULTRA_DEFAULTS``. Names that are not in the
    table — unknown, fine-tuned or locally-pathed checkpoints — resolve to
    ``False``, i.e. the Nano / Super template. Pass an explicit ``ultra=True``
    in the renderer config for an Ultra fine-tune or a locally-pathed Ultra
    checkpoint.
    """
    model_name = getattr(tokenizer, "name_or_path", "")
    if model_name in _ULTRA_DEFAULTS:
        return _ULTRA_DEFAULTS[model_name]
    # Not a known checkpoint: stay on the Nano / Super variant.
    return False


class Nemotron3Renderer:
"""Deterministic message → token renderer for Nemotron 3 models."""

Expand All @@ -84,7 +113,14 @@ def __init__(
config: Nemotron3RendererConfig | None = None,
):
self._tokenizer = tokenizer
self.config = config or Nemotron3RendererConfig()
cfg = config or Nemotron3RendererConfig()
# ``ultra=None`` defers to the model's known default (see
# ``_ULTRA_DEFAULTS``). Materialise here so downstream reads see a
# concrete bool; rebind the frozen config with the resolved value so
# introspection sees the same.
if cfg.ultra is None:
cfg = cfg.model_copy(update={"ultra": _default_ultra(tokenizer)})
self.config = cfg

# Look up special token IDs from the tokenizer (not hardcoded).
# <|endoftext|> is optional: Nemotron-3 Nano / Super tokenizers ship
Expand Down Expand Up @@ -335,6 +371,17 @@ def emit_text_segments(
last_plain_assistant_idx = j
break

# Ultra truncates thinking on every assistant turn *before the last
# user message* (template rule ``loop.index0 < last_user_idx``),
# whereas Nano/Super preserve only the last plain assistant. Compute
# the last-user index over the normalized ``messages`` list (a leading
# system never holds a user, so the relative comparison is unaffected).
last_user_idx_norm = -1
for j in range(len(messages) - 1, -1, -1):
if messages[j].get("role") == "user":
last_user_idx_norm = j
break

# ── 2. Iterate messages ─────────────────────────────────────
for i, msg in enumerate(messages):
role = msg["role"]
Expand All @@ -360,7 +407,10 @@ def emit_text_segments(
emit_text("\n", msg_orig_idx, is_sampled=False, is_content=False)

elif role == "assistant":
is_last_turn = i >= last_plain_assistant_idx
if self.config.ultra:
is_last_turn = i >= last_user_idx_norm
else:
is_last_turn = i >= last_plain_assistant_idx
preserve_thinking = msg_orig_idx >= 0 and should_preserve_past_thinking(
original_messages,
msg_orig_idx,
Expand Down Expand Up @@ -617,6 +667,7 @@ def _render_assistant(
content = after_think_end.lstrip("\n")

reasoning_content = reasoning_content.strip()
ultra = self.config.ultra

# ``<|im_start|>assistant\n`` is template-injected scaffolding —
# at inference the chat template emits these as the generation
Expand Down Expand Up @@ -645,28 +696,36 @@ def _render_assistant(
or not self.config.truncate_history_thinking
):
emit_special(self._think, msg_idx, is_sampled=True, is_content=True)
# Ultra: <think>\n{reasoning}</think>{content} (no \n around </think>).
# Nano/Super: <think>\n{reasoning}\n</think>\n{content}.
emit_text(
"\n" + reasoning_content + "\n",
("\n" + reasoning_content)
if ultra
else ("\n" + reasoning_content + "\n"),
msg_idx,
is_sampled=True,
is_content=True,
)
emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True)
# Single \n separator (not \n\n like Qwen3.5)
# Single \n separator (not \n\n like Qwen3.5); Ultra glues directly.
emit_text(
"\n" + content + content_suffix,
(content + content_suffix)
if ultra
else ("\n" + content + content_suffix),
msg_idx,
is_sampled=True,
is_content=True,
)
elif reasoning_content:
# Historical assistant whose reasoning got stripped — template
# keeps a single \n between the collapsed <think></think> and
# the content as a marker that reasoning existed.
# Historical assistant whose reasoning got stripped. Nano/Super keep
# a single \n between the collapsed <think></think> and the content
# as a marker that reasoning existed; Ultra glues content directly.
emit_special(self._think, msg_idx, is_sampled=True, is_content=True)
emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True)
emit_text(
"\n" + content + content_suffix,
(content + content_suffix)
if ultra
else ("\n" + content + content_suffix),
msg_idx,
is_sampled=True,
is_content=True,
Expand Down
2 changes: 2 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
("moonshotai/Kimi-K2.6", "auto"),
("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"),
("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "auto"),
# Ultra resolves the Ultra template variant via name (auto → ultra=True).
("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"),
("poolside/Laguna-XS.2", "auto"),
("openai/gpt-oss-20b", "gpt-oss"),
("Qwen/Qwen2.5-0.5B-Instruct", "default"),
Expand Down
59 changes: 59 additions & 0 deletions tests/test_nemotron3_ultra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Offline wiring tests for the Nemotron-3 Ultra template variant.

Assert the name-based ``ultra`` auto-selection, the model→renderer mapping,
and the typed-config surface WITHOUT loading any tokenizer (no network). This
pins the wiring the parity matrix can't reach — in particular the FP8 entry,
which no test loads a tokenizer for — so it can't silently rot.
"""

from types import SimpleNamespace

from renderers.base import MODEL_RENDERER_MAP
from renderers.configs import Nemotron3RendererConfig
from renderers.nemotron3 import _ULTRA_DEFAULTS, _default_ultra

# Model names the tests below expect ``_default_ultra`` to resolve to ``True``
# (the Ultra template variant). The FP8 entry is exercised here by name only.
_ULTRA_REPOS = [
    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16",
    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8",
]
# Model names the tests below expect ``_default_ultra`` to resolve to ``False``
# (the shared Nano / Super template variant).
_NON_ULTRA_REPOS = [
    "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
    "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16",
]


def _fake_tok(name):
return SimpleNamespace(name_or_path=name)


def test_ultra_and_non_ultra_models_map_to_nemotron3():
    """Every listed Nemotron-3 repo, Ultra or not, maps to ``nemotron-3``."""
    for repo in (*_ULTRA_REPOS, *_NON_ULTRA_REPOS):
        renderer_name = MODEL_RENDERER_MAP.get(repo)
        # The repo name is the failure message so the offending entry is obvious.
        assert renderer_name == "nemotron-3", repo


def test_default_ultra_resolves_by_name():
    """``_default_ultra`` decides purely from the tokenizer's ``name_or_path``."""
    # Ultra checkpoints (incl. the gated FP8 repo) are tabled as True and
    # resolve True through the helper.
    for ultra_repo in _ULTRA_REPOS:
        assert _ULTRA_DEFAULTS[ultra_repo] is True
        assert _default_ultra(_fake_tok(ultra_repo)) is True

    # Nano / Super resolve False (the shared Nano/Super template).
    for plain_repo in _NON_ULTRA_REPOS:
        assert _default_ultra(_fake_tok(plain_repo)) is False

    # Unknown / fine-tuned / local-path checkpoints fall back to False;
    # those must pass an explicit ultra= if they need the Ultra template.
    for unlisted_name in ("acme/my-nemotron-ultra-ft", "/home/user/local-ckpt"):
        assert _default_ultra(_fake_tok(unlisted_name)) is False

    # An object with no name_or_path attribute at all also falls back to False.
    nameless_tok = SimpleNamespace()
    assert _default_ultra(nameless_tok) is False


def test_ultra_is_not_a_template_kwarg():
    """``ultra`` is an internal selector, not a chat-template field."""
    expected_template_fields = frozenset(
        {"enable_thinking", "truncate_history_thinking"}
    )
    template_fields = Nemotron3RendererConfig.template_field_names()
    assert "ultra" not in template_fields
    assert template_fields == expected_template_fields
    assert "ultra" in Nemotron3RendererConfig._internal_fields


def test_ultra_config_default_is_none_and_overridable():
    """``ultra`` defaults to ``None`` and keeps an explicitly passed bool."""
    unset_config = Nemotron3RendererConfig()
    assert unset_config.ultra is None  # None => auto-detect by name

    for forced in (True, False):
        assert Nemotron3RendererConfig(ultra=forced).ultra is forced
3 changes: 3 additions & 0 deletions tests/test_renderer_config_parity.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@
("moonshotai/Kimi-K2.6", "auto"),
("deepseek-ai/DeepSeek-V3", "auto"),
("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"),
# Ultra: auto-resolves to the Ultra template variant (ultra=True) via the
# model name; parity asserted against the Ultra apply_chat_template.
("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"),
("poolside/Laguna-XS.2", "auto"),
("openai/gpt-oss-20b", "gpt-oss"),
]
Expand Down
3 changes: 3 additions & 0 deletions tests/test_roundtrip.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@
("moonshotai/Kimi-K2.6", "auto"),
("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"),
("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "auto"),
# Ultra: parse must recover content after a </think> glued directly to it
# (no separating newline) — the Ultra-specific glue stresses the round-trip.
("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"),
("poolside/Laguna-XS.2", "auto"),
("openai/gpt-oss-20b", "gpt-oss"),
("Qwen/Qwen2.5-0.5B-Instruct", "default"),
Expand Down
Loading