Skip to content

Commit 7a4328e

Browse files
authored
chore: Add more benchmarks (#146)
1 parent e0780bf commit 7a4328e

5 files changed

Lines changed: 507 additions & 0 deletions

File tree

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
"""Benchmarks for ID generation.
2+
3+
get_span_id and get_trace_id are called on every span creation, so their
4+
cost accumulates in high-throughput tracing workloads. This module
5+
compares the two generators: UUIDGenerator (default) and OTELIDGenerator
6+
(enabled via BRAINTRUST_OTEL_COMPAT=true).
7+
"""
8+
9+
import pathlib
10+
import sys
11+
12+
import pyperf
13+
14+
15+
if __package__ in (None, ""):
16+
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2]))
17+
18+
from braintrust.id_gen import OTELIDGenerator, UUIDGenerator
19+
20+
from benchmarks._utils import disable_pyperf_psutil
21+
22+
23+
def main(runner: pyperf.Runner | None = None) -> None:
    """Register span-ID / trace-ID generation benchmarks.

    Creates a fresh ``pyperf.Runner`` (with psutil integration disabled)
    when *runner* is not supplied, then benchmarks both generator classes.
    """
    if runner is None:
        disable_pyperf_psutil()
        runner = pyperf.Runner()

    # Same registration order as before: uuid first, then otel.
    generators = (
        ("uuid", UUIDGenerator()),
        ("otel", OTELIDGenerator()),
    )
    for label, gen in generators:
        runner.bench_func(f"id_gen.{label}.span_id", gen.get_span_id)
        runner.bench_func(f"id_gen.{label}.trace_id", gen.get_trace_id)
35+
36+
37+
# Allow running this benchmark file directly as a script.
if __name__ == "__main__":
    main()
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
"""Benchmarks for merge_dicts.
2+
3+
merge_dicts is called on every span log update and during row merging,
4+
making it one of the most frequently executed SDK functions.
5+
6+
Note: merge_dicts mutates merge_into, so each benchmark wrapper creates a
7+
fresh copy of the target dict before calling. This means each bench_func
8+
measures a shallow/deep copy plus the merge itself — the copy cost is
9+
intentionally kept proportional to the input size so relative comparisons
10+
remain valid.
11+
"""
12+
13+
import copy
14+
import pathlib
15+
import sys
16+
from typing import Any
17+
18+
import pyperf
19+
20+
21+
if __package__ in (None, ""):
22+
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2]))
23+
24+
from braintrust.util import merge_dicts
25+
26+
from benchmarks._utils import disable_pyperf_psutil
27+
from benchmarks.fixtures import make_large_payload, make_medium_payload, make_small_payload
28+
29+
30+
# Updates are pre-built once; only merge_into is copied per iteration.
# (Assumes merge_dicts leaves its update argument untouched — the module
# docstring only documents mutation of merge_into; verify if that changes.)
_SMALL_UPDATE: dict[str, Any] = {
    "metadata": {"extra_key": "extra_value"},
    "scores": {"relevance": 0.8},
    "tags": ["new_tag"],
}

_MEDIUM_UPDATE: dict[str, Any] = {
    "metadata": {"workspace_id": "workspace-789", "new_flag": True},
    "metrics": {"cached_tokens": 64},
    "tags": ["updated", "benchmark"],
}

_LARGE_UPDATE: dict[str, Any] = {
    "metadata": {"routing": {"tier": "standard"}, "extra": "value"},
    "metrics": {"cached_tokens": 512},
    "tags": ["updated"],
    "output": {"summary": "revised"},
}

# Pre-built base payloads (copied per iteration, not mutated at module level).
_SMALL_BASE = make_small_payload()
_MEDIUM_BASE = make_medium_payload()
_LARGE_BASE = make_large_payload()

# Deeply nested base/update pair used to exercise the recursive merge path.
_NESTED_BASE: dict[str, Any] = {
    "a": {"b": {"c": {"d": 1, "e": 2}, "f": 3}, "g": 4},
    "h": {"i": {"j": {"k": 5}}},
}
_NESTED_UPDATE: dict[str, Any] = {
    "a": {"b": {"c": {"d": 99}, "new": "value"}, "g": 99},
    "h": {"i": {"j": {"new_key": "hello"}}},
}

# Tags set-union: top-level "tags" field uses set-union semantics in merge_dicts.
_TAGS_UPDATE: dict[str, Any] = {"tags": ["c", "d", "e"]}
66+
67+
68+
def _bench_small() -> None:
    """Merge a small update into a shallow copy of the small base payload."""
    target = dict(_SMALL_BASE)
    merge_dicts(target, _SMALL_UPDATE)


def _bench_medium() -> None:
    """Merge into the medium payload.

    A shallow copy suffices: _MEDIUM_UPDATE only touches top-level dict values.
    """
    target = dict(_MEDIUM_BASE)
    merge_dicts(target, _MEDIUM_UPDATE)


def _bench_large() -> None:
    """Merge a multi-field update into a shallow copy of the large payload."""
    target = dict(_LARGE_BASE)
    merge_dicts(target, _LARGE_UPDATE)


def _bench_nested() -> None:
    """Merge into a deep copy of the nested base.

    A deep copy is required because the update recurses into nested dicts.
    """
    target = copy.deepcopy(_NESTED_BASE)
    merge_dicts(target, _NESTED_UPDATE)


def _bench_tags_union() -> None:
    """Exercise top-level "tags" set-union; the target literal is rebuilt
    every call because merge_dicts grows the tags list in place."""
    merge_dicts({"tags": ["a", "b"], "value": 1}, _TAGS_UPDATE)
89+
90+
91+
def main(runner: pyperf.Runner | None = None) -> None:
    """Register all merge_dicts benchmarks on *runner* (created if absent)."""
    if runner is None:
        disable_pyperf_psutil()
        runner = pyperf.Runner()

    # Table-driven registration; order matches the reported benchmark names.
    benches = (
        ("merge_dicts[small]", _bench_small),
        ("merge_dicts[medium]", _bench_medium),
        ("merge_dicts[large]", _bench_large),
        ("merge_dicts[nested-deep]", _bench_nested),
        ("merge_dicts[tags-union]", _bench_tags_union),
    )
    for label, fn in benches:
        runner.bench_func(label, fn)
101+
102+
103+
# Allow running this benchmark file directly as a script.
if __name__ == "__main__":
    main()
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
"""Benchmarks for merge_row_batch and batch_items.
2+
3+
merge_row_batch is called before every flush to the Braintrust API to
4+
de-duplicate and merge rows in a pending batch. batch_items is used to
5+
split the resulting rows into API-request-sized chunks.
6+
7+
Both functions mutate their inputs, so each benchmark wrapper builds fresh
8+
row lists per iteration.
9+
"""
10+
11+
import pathlib
12+
import sys
13+
14+
import pyperf
15+
16+
17+
if __package__ in (None, ""):
18+
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2]))
19+
20+
from braintrust.db_fields import IS_MERGE_FIELD
21+
from braintrust.merge_row_batch import batch_items, merge_row_batch
22+
23+
from benchmarks._utils import disable_pyperf_psutil
24+
25+
26+
# ---------------------------------------------------------------------------
27+
# Row factories — called inside each benchmark wrapper to get fresh dicts.
28+
# ---------------------------------------------------------------------------
29+
30+
31+
def _unique_rows(n: int) -> list[dict]:
    """Build *n* rows with all-distinct IDs, so no merging is needed."""
    rows = []
    for i in range(n):
        rows.append({"id": f"row-{i}", "project_id": "proj-1", "value": i})
    return rows
34+
35+
36+
def _merge_rows(n: int) -> list[dict]:
    """Build n rows forming n//2 pairs: a base row followed by an IS_MERGE update."""
    rows: list[dict] = []
    for i in range(n // 2):
        row_id = f"row-{i}"
        rows.append({"id": row_id, "project_id": "proj-1", "payload": {"a": i}})
        rows.append(
            {
                "id": row_id,
                "project_id": "proj-1",
                "payload": {"b": i + 100},
                IS_MERGE_FIELD: True,
            }
        )
    return rows
50+
51+
52+
def _mixed_rows(n: int) -> list[dict]:
    """Build a mix of merge pairs and unique rows (roughly half each)."""
    rows: list[dict] = []
    for i in range(n // 4):
        # Pair sharing an ID: base row, then an IS_MERGE update for it.
        pair_id = f"merge-{i}"
        rows.append({"id": pair_id, "project_id": "proj-1", "payload": {"a": i}})
        rows.append(
            {
                "id": pair_id,
                "project_id": "proj-1",
                "payload": {"b": i + 100},
                IS_MERGE_FIELD: True,
            }
        )
    # Remaining rows are unique — no merging needed for these.
    rows.extend(
        {"id": f"unique-{i}", "project_id": "proj-1", "value": i}
        for i in range(n // 2)
    )
    return rows
69+
70+
71+
# ---------------------------------------------------------------------------
72+
# Benchmark wrappers
73+
# ---------------------------------------------------------------------------
74+
75+
# Batch sizes exercised by the merge_row_batch wrappers below.
_SMALL_N = 10
_MEDIUM_N = 50
_LARGE_N = 200


def _bench_no_conflict_small() -> None:
    """Merge a small batch where every row ID is unique."""
    batch = _unique_rows(_SMALL_N)
    merge_row_batch(batch)


def _bench_no_conflict_medium() -> None:
    """Merge a medium batch where every row ID is unique."""
    batch = _unique_rows(_MEDIUM_N)
    merge_row_batch(batch)


def _bench_no_conflict_large() -> None:
    """Merge a large batch where every row ID is unique."""
    batch = _unique_rows(_LARGE_N)
    merge_row_batch(batch)


def _bench_all_merge_small() -> None:
    """Merge a small batch built entirely of base/update pairs."""
    batch = _merge_rows(_SMALL_N)
    merge_row_batch(batch)


def _bench_all_merge_medium() -> None:
    """Merge a medium batch built entirely of base/update pairs."""
    batch = _merge_rows(_MEDIUM_N)
    merge_row_batch(batch)


def _bench_mixed_medium() -> None:
    """Merge a medium batch mixing unique rows and merge pairs."""
    batch = _mixed_rows(_MEDIUM_N)
    merge_row_batch(batch)
102+
103+
104+
# batch_items: split a list of strings by item-count and byte-count limits.
_BATCH_STRINGS = [f"item-payload-{i:04d}" * 4 for i in range(200)]
_ITEM_SIZE = len(_BATCH_STRINGS[0].encode())


def _utf8_len(s: str) -> int:
    """Byte size of *s* once UTF-8 encoded (passed as get_byte_size)."""
    return len(s.encode())


def _bench_batch_items_count_limit() -> None:
    """Split by item count only."""
    batch_items(_BATCH_STRINGS, batch_max_num_items=20)


def _bench_batch_items_byte_limit() -> None:
    """Split by byte budget only (~15 items per batch)."""
    batch_items(
        _BATCH_STRINGS,
        batch_max_num_bytes=_ITEM_SIZE * 15,
        get_byte_size=_utf8_len,
    )


def _bench_batch_items_both_limits() -> None:
    """Split with both the item-count and byte-budget limits active."""
    batch_items(
        _BATCH_STRINGS,
        batch_max_num_items=20,
        batch_max_num_bytes=_ITEM_SIZE * 15,
        get_byte_size=_utf8_len,
    )
128+
129+
130+
def main(runner: pyperf.Runner | None = None) -> None:
    """Register merge_row_batch and batch_items benchmarks on *runner*."""
    if runner is None:
        disable_pyperf_psutil()
        runner = pyperf.Runner()

    # Table-driven registration; order matches the reported benchmark names.
    benches = (
        ("merge_row_batch[no-conflict-small]", _bench_no_conflict_small),
        ("merge_row_batch[no-conflict-medium]", _bench_no_conflict_medium),
        ("merge_row_batch[no-conflict-large]", _bench_no_conflict_large),
        ("merge_row_batch[all-merge-small]", _bench_all_merge_small),
        ("merge_row_batch[all-merge-medium]", _bench_all_merge_medium),
        ("merge_row_batch[mixed-medium]", _bench_mixed_medium),
        ("batch_items[count-limit]", _bench_batch_items_count_limit),
        ("batch_items[byte-limit]", _bench_batch_items_byte_limit),
        ("batch_items[both-limits]", _bench_batch_items_both_limits),
    )
    for label, fn in benches:
        runner.bench_func(label, fn)
145+
146+
147+
# Allow running this benchmark file directly as a script.
if __name__ == "__main__":
    main()
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
"""Benchmarks for SpanComponentsV3 and SpanComponentsV4 encode/decode.
2+
3+
These are on the hot path: every span serializes/deserializes parent context.
4+
"""
5+
6+
import pathlib
7+
import secrets
8+
import sys
9+
import uuid
10+
11+
import pyperf
12+
13+
14+
if __package__ in (None, ""):
15+
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[2]))
16+
17+
from braintrust.span_identifier_v3 import SpanComponentsV3, SpanObjectTypeV3
18+
from braintrust.span_identifier_v4 import SpanComponentsV4
19+
20+
from benchmarks._utils import disable_pyperf_psutil
21+
22+
23+
def main(runner: pyperf.Runner | None = None) -> None:
    """Benchmark encode (to_str) and decode (from_str) for V3 and V4 components."""
    if runner is None:
        disable_pyperf_psutil()
        runner = pyperf.Runner()

    # V3 — UUID-based IDs
    minimal_v3 = SpanComponentsV3(
        object_type=SpanObjectTypeV3.PROJECT_LOGS,
        object_id=str(uuid.uuid4()),
    )
    full_v3 = SpanComponentsV3(
        object_type=SpanObjectTypeV3.EXPERIMENT,
        object_id=str(uuid.uuid4()),
        row_id=str(uuid.uuid4()),
        span_id=str(uuid.uuid4()),
        root_span_id=str(uuid.uuid4()),
    )
    # Encode once up front so the from_str benchmarks measure decode only.
    minimal_v3_encoded = minimal_v3.to_str()
    full_v3_encoded = full_v3.to_str()

    runner.bench_func("span_components.v3.to_str[object-only]", minimal_v3.to_str)
    runner.bench_func("span_components.v3.to_str[full-uuid]", full_v3.to_str)
    runner.bench_func("span_components.v3.from_str[object-only]", SpanComponentsV3.from_str, minimal_v3_encoded)
    runner.bench_func("span_components.v3.from_str[full-uuid]", SpanComponentsV3.from_str, full_v3_encoded)

    # V4 — OTEL hex IDs for span_id (8-byte) and root_span_id (16-byte)
    minimal_v4 = SpanComponentsV4(
        object_type=SpanObjectTypeV3.PROJECT_LOGS,
        object_id=str(uuid.uuid4()),
    )
    full_v4 = SpanComponentsV4(
        object_type=SpanObjectTypeV3.EXPERIMENT,
        object_id=str(uuid.uuid4()),
        row_id=str(uuid.uuid4()),
        span_id=secrets.token_hex(8),
        root_span_id=secrets.token_hex(16),
    )
    minimal_v4_encoded = minimal_v4.to_str()
    full_v4_encoded = full_v4.to_str()

    runner.bench_func("span_components.v4.to_str[object-only]", minimal_v4.to_str)
    runner.bench_func("span_components.v4.to_str[full-otel]", full_v4.to_str)
    runner.bench_func("span_components.v4.from_str[object-only]", SpanComponentsV4.from_str, minimal_v4_encoded)
    runner.bench_func("span_components.v4.from_str[full-otel]", SpanComponentsV4.from_str, full_v4_encoded)

    # Cross-version: V4 decoder reading a V3-encoded string (backwards-compat path)
    runner.bench_func("span_components.v4.from_str[v3-encoded]", SpanComponentsV4.from_str, full_v3_encoded)
70+
71+
72+
# Allow running this benchmark file directly as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)