diff --git a/renderers/base.py b/renderers/base.py index 45768de..242adae 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -1037,9 +1037,14 @@ def bridge_to_next_turn(self, *args: Any, **kwargs: Any) -> "RenderedTokens | No "moonshotai/Kimi-K2-Instruct": "kimi-k2", "moonshotai/Kimi-K2.5": "kimi-k2.5", "moonshotai/Kimi-K2.6": "kimi-k2.5", - # Nemotron 3. + # Nemotron 3. Nano / Super share one chat-template variant; the Ultra + # checkpoints use the Ultra variant — the renderer auto-selects it from + # the model name (see ``nemotron3._ULTRA_DEFAULTS``). BF16 and FP8 share the + # same tokenizer and template. "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "nemotron-3", "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": "nemotron-3", + "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": "nemotron-3", + "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": "nemotron-3", # Poolside Laguna. "poolside/Laguna-XS.2": "laguna-xs.2", # GPT-OSS. diff --git a/renderers/configs.py b/renderers/configs.py index e0098ba..2c18a17 100644 --- a/renderers/configs.py +++ b/renderers/configs.py @@ -337,6 +337,26 @@ class Nemotron3RendererConfig(BaseRendererConfig): """When ``True``, the generation prompt includes ``<think>``. Mirrors the chat template's ``enable_thinking`` kwarg.""" + ultra: bool | None = None + """Select the Nemotron-3 **Ultra** chat-template variant. + + ``None`` (default) auto-detects from the model name (see + ``renderers.nemotron3._ULTRA_DEFAULTS``): the Ultra checkpoints resolve + to ``True``; Nano / Super and unknown checkpoints to ``False``. Set + explicitly to force a variant — e.g. an Ultra fine-tune or a + locally-pathed checkpoint whose ``name_or_path`` isn't in the table.
+ + Ultra's template differs from Nano/Super: the reasoning block is glued + as ``<think>\\n{reasoning}</think>{content}`` (no ``\\n`` around + ``</think>``), truncated historical turns collapse to + ``<think></think>{content}`` (no ``\\n``), and the thinking-truncation + boundary follows the template's ``loop.index0 < last_user_idx`` rule + (drop thinking on every assistant turn before the last user message). + + Not a chat-template kwarg — it picks which template the renderer + mirrors, not a variable passed into one — so it's listed in + ``_internal_fields`` and excluded from ``template_field_names()``.""" + truncate_history_thinking: bool = True """When ``False``, keep ``{reasoning}`` on past-cycle assistant turns instead of dropping them. Mirrors the chat @@ -344,6 +364,15 @@ class Nemotron3RendererConfig(BaseRendererConfig): ``preserve_all_thinking`` / ``preserve_thinking_between_tool_calls`` — see :class:`BaseRendererConfig` for the contract.""" + # ``ultra`` is a template-variant SELECTOR — it picks which template the + # renderer mirrors (Ultra vs Nano/Super), not a variable passed into one; + # there is no ``ultra`` Jinja variable. Marked internal so the parity + # matrix doesn't cross it as a template field. Same ``_internal_fields`` + # mechanism DeepSeek-V3 uses for its no-op ``enable_thinking``, for a + # different underlying reason (theirs is an ignored kwarg, this is a + # variant switch). + _internal_fields = frozenset({"ultra"}) + class DeepSeekV3RendererConfig(BaseRendererConfig): """DeepSeek V3 renderer config. diff --git a/renderers/nemotron3.py b/renderers/nemotron3.py index e6398b5..b735cde 100644 --- a/renderers/nemotron3.py +++ b/renderers/nemotron3.py @@ -75,6 +75,35 @@ def _render_extra_keys(obj: dict[str, Any], handled_keys: set[str]) -> list[str] return lines +# Per-model ``ultra`` default, applied when the renderer config leaves it +# ``None``.
The Nemotron-3 family ships two chat-template variants: Nano / +# Super share one; Ultra differs in the reasoning-block glue (no ``\n`` around +# ``</think>``) and the thinking-truncation boundary (drop thinking on every +# assistant turn before the last user message). BF16 and FP8 share the same +# tokenizer and template. Hard-coded and keyed by +# ``tokenizer.name_or_path`` rather than probed from the live template — the +# same convention as Qwen3.5's ``_ENABLE_THINKING_DEFAULTS`` (avoids pulling +# ``apply_chat_template`` onto the construction hot path and keeps +# bring-your-own-tokenizer use working). +_ULTRA_DEFAULTS: dict[str, bool] = { + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": False, + "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": False, + "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": True, + "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": True, +} + + +def _default_ultra(tokenizer) -> bool: + """Hard-coded ``ultra`` default for ``tokenizer``'s model. + + Falls back to ``False`` (the Nano / Super template, and the majority of + the family) for unknown / fine-tuned checkpoints whose ``name_or_path`` + isn't in ``_ULTRA_DEFAULTS`` — pass an explicit ``ultra=True`` for an + Ultra fine-tune or a locally-pathed Ultra checkpoint. + """ + return _ULTRA_DEFAULTS.get(getattr(tokenizer, "name_or_path", ""), False) + + class Nemotron3Renderer: """Deterministic message → token renderer for Nemotron 3 models.""" @@ -84,7 +113,14 @@ def __init__( config: Nemotron3RendererConfig | None = None, ): self._tokenizer = tokenizer - self.config = config or Nemotron3RendererConfig() + cfg = config or Nemotron3RendererConfig() + # ``ultra=None`` defers to the model's known default (see + # ``_ULTRA_DEFAULTS``). Materialise here so downstream reads see a + # concrete bool; rebind the frozen config with the resolved value so + # introspection sees the same.
+ if cfg.ultra is None: + cfg = cfg.model_copy(update={"ultra": _default_ultra(tokenizer)}) + self.config = cfg # Look up special token IDs from the tokenizer (not hardcoded). # <|endoftext|> is optional: Nemotron-3 Nano / Super tokenizers ship @@ -335,6 +371,17 @@ def emit_text_segments( last_plain_assistant_idx = j break + # Ultra truncates thinking on every assistant turn *before the last + # user message* (template rule ``loop.index0 < last_user_idx``), + # whereas Nano/Super preserve only the last plain assistant. Compute + # the last-user index over the normalized ``messages`` list (a leading + # system never holds a user, so the relative comparison is unaffected). + last_user_idx_norm = -1 + for j in range(len(messages) - 1, -1, -1): + if messages[j].get("role") == "user": + last_user_idx_norm = j + break + # ── 2. Iterate messages ───────────────────────────────────── for i, msg in enumerate(messages): role = msg["role"] @@ -360,7 +407,10 @@ def emit_text_segments( emit_text("\n", msg_orig_idx, is_sampled=False, is_content=False) elif role == "assistant": - is_last_turn = i >= last_plain_assistant_idx + if self.config.ultra: + is_last_turn = i >= last_user_idx_norm + else: + is_last_turn = i >= last_plain_assistant_idx preserve_thinking = msg_orig_idx >= 0 and should_preserve_past_thinking( original_messages, msg_orig_idx, @@ -617,6 +667,7 @@ def _render_assistant( content = after_think_end.lstrip("\n") reasoning_content = reasoning_content.strip() + ultra = self.config.ultra # ``<|im_start|>assistant\n`` is template-injected scaffolding — # at inference the chat template emits these as the generation @@ -645,28 +696,36 @@ def _render_assistant( or not self.config.truncate_history_thinking ): emit_special(self._think, msg_idx, is_sampled=True, is_content=True) + # Ultra: <think>\n{reasoning}</think>{content} (no \n around </think>). + # Nano/Super: <think>\n{reasoning}\n</think>\n{content}.
emit_text( - "\n" + reasoning_content + "\n", + ("\n" + reasoning_content) + if ultra + else ("\n" + reasoning_content + "\n"), msg_idx, is_sampled=True, is_content=True, ) emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True) - # Single \n separator (not \n\n like Qwen3.5) + # Single \n separator (not \n\n like Qwen3.5); Ultra glues directly. emit_text( - "\n" + content + content_suffix, + (content + content_suffix) + if ultra + else ("\n" + content + content_suffix), msg_idx, is_sampled=True, is_content=True, ) elif reasoning_content: - # Historical assistant whose reasoning got stripped — template - # keeps a single \n between the collapsed <think></think> and - # the content as a marker that reasoning existed. + # Historical assistant whose reasoning got stripped. Nano/Super keep + # a single \n between the collapsed <think></think> and the content + # as a marker that reasoning existed; Ultra glues content directly. emit_special(self._think, msg_idx, is_sampled=True, is_content=True) emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True) emit_text( - "\n" + content + content_suffix, + (content + content_suffix) + if ultra + else ("\n" + content + content_suffix), msg_idx, is_sampled=True, is_content=True, diff --git a/tests/conftest.py b/tests/conftest.py index c334430..4266487 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,6 +33,8 @@ ("moonshotai/Kimi-K2.6", "auto"), ("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"), ("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "auto"), + # Ultra resolves the Ultra template variant via name (auto → ultra=True).
+ ("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"), ("poolside/Laguna-XS.2", "auto"), ("openai/gpt-oss-20b", "gpt-oss"), ("Qwen/Qwen2.5-0.5B-Instruct", "default"), diff --git a/tests/test_nemotron3_ultra.py b/tests/test_nemotron3_ultra.py new file mode 100644 index 0000000..7716d15 --- /dev/null +++ b/tests/test_nemotron3_ultra.py @@ -0,0 +1,59 @@ +"""Offline wiring tests for the Nemotron-3 Ultra template variant. + +Assert the name-based ``ultra`` auto-selection, the model→renderer mapping, +and the typed-config surface WITHOUT loading any tokenizer (no network). This +pins the wiring the parity matrix can't reach — in particular the FP8 entry, +which no test loads a tokenizer for — so it can't silently rot. +""" + +from types import SimpleNamespace + +from renderers.base import MODEL_RENDERER_MAP +from renderers.configs import Nemotron3RendererConfig +from renderers.nemotron3 import _ULTRA_DEFAULTS, _default_ultra + +_ULTRA_REPOS = [ + "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", + "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8", +] +_NON_ULTRA_REPOS = [ + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", +] + + +def _fake_tok(name): + return SimpleNamespace(name_or_path=name) + + +def test_ultra_and_non_ultra_models_map_to_nemotron3(): + for repo in _ULTRA_REPOS + _NON_ULTRA_REPOS: + assert MODEL_RENDERER_MAP.get(repo) == "nemotron-3", repo + + +def test_default_ultra_resolves_by_name(): + # Ultra checkpoints (incl. the gated FP8 repo) resolve True. + for repo in _ULTRA_REPOS: + assert _ULTRA_DEFAULTS[repo] is True + assert _default_ultra(_fake_tok(repo)) is True + # Nano / Super resolve False (the shared Nano/Super template). + for repo in _NON_ULTRA_REPOS: + assert _default_ultra(_fake_tok(repo)) is False + # Unknown / fine-tuned / local-path checkpoints fall back to False; + # those must pass an explicit ultra= if they need the Ultra template. 
+ assert _default_ultra(_fake_tok("acme/my-nemotron-ultra-ft")) is False + assert _default_ultra(_fake_tok("/home/user/local-ckpt")) is False + assert _default_ultra(SimpleNamespace()) is False # no name_or_path attr + + +def test_ultra_is_not_a_template_kwarg(): + fields = Nemotron3RendererConfig.template_field_names() + assert "ultra" not in fields + assert fields == frozenset({"enable_thinking", "truncate_history_thinking"}) + assert "ultra" in Nemotron3RendererConfig._internal_fields + + +def test_ultra_config_default_is_none_and_overridable(): + assert Nemotron3RendererConfig().ultra is None # None => auto-detect by name + assert Nemotron3RendererConfig(ultra=True).ultra is True + assert Nemotron3RendererConfig(ultra=False).ultra is False diff --git a/tests/test_renderer_config_parity.py b/tests/test_renderer_config_parity.py index 8ca2da3..1827a9c 100644 --- a/tests/test_renderer_config_parity.py +++ b/tests/test_renderer_config_parity.py @@ -55,6 +55,9 @@ ("moonshotai/Kimi-K2.6", "auto"), ("deepseek-ai/DeepSeek-V3", "auto"), ("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"), + # Ultra: auto-resolves to the Ultra template variant (ultra=True) via the + # model name; parity asserted against the Ultra apply_chat_template. + ("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"), ("poolside/Laguna-XS.2", "auto"), ("openai/gpt-oss-20b", "gpt-oss"), ] diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py index 383bc14..7d7ee36 100644 --- a/tests/test_roundtrip.py +++ b/tests/test_roundtrip.py @@ -43,6 +43,9 @@ ("moonshotai/Kimi-K2.6", "auto"), ("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"), ("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "auto"), + # Ultra: parse must recover content after a </think> glued directly to it + # (no separating newline) — the Ultra-specific glue stresses the round-trip.
+ ("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"), ("poolside/Laguna-XS.2", "auto"), ("openai/gpt-oss-20b", "gpt-oss"), ("Qwen/Qwen2.5-0.5B-Instruct", "default"),