@@ -125,7 +125,7 @@ async def test_agent_run_async(memory_logger):
125125 assert chat_span is not None , "chat span not found"
126126
127127 # Check agent span
128- assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
128+ assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
129129 assert agent_span ["metadata" ]["model" ] == "gpt-4o-mini"
130130 assert agent_span ["metadata" ]["provider" ] == "openai"
131131 assert TEST_PROMPT in str (agent_span ["input" ])
@@ -146,6 +146,18 @@ async def test_agent_run_async(memory_logger):
146146 assert agent_span ["metrics" ]["prompt_tokens" ] > 0
147147 assert agent_span ["metrics" ]["completion_tokens" ] > 0
148148
149+ # Regression: no double-counting of cost/tokens. Experiment-level aggregations
150+ # sum metrics across type='llm' spans, so a single agent turn must contribute
151+ # its tokens exactly once. The wrapper agent_run span logs the same usage as
152+ # the leaf chat span, so the wrapper must be type=TASK and only the leaf type=LLM.
153+ llm_spans = [s for s in spans if s ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM ]
154+ assert len (llm_spans ) == 1 , f"expected exactly one LLM-typed span, got { len (llm_spans )} "
155+ assert llm_spans [0 ]["span_id" ] == chat_span ["span_id" ]
156+ llm_prompt_tokens_sum = sum (s ["metrics" ].get ("prompt_tokens" , 0 ) for s in llm_spans )
157+ llm_completion_tokens_sum = sum (s ["metrics" ].get ("completion_tokens" , 0 ) for s in llm_spans )
158+ assert llm_prompt_tokens_sum == chat_span ["metrics" ]["prompt_tokens" ]
159+ assert llm_completion_tokens_sum == chat_span ["metrics" ]["completion_tokens" ]
160+
149161
150162@pytest .mark .vcr
151163@pytest .mark .asyncio
@@ -205,7 +217,7 @@ def test_agent_run_sync(memory_logger):
205217 assert chat_span is not None , "chat span not found"
206218
207219 # Check agent span
208- assert agent_sync_span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
220+ assert agent_sync_span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
209221 assert agent_sync_span ["metadata" ]["model" ] == "gpt-4o-mini"
210222 assert agent_sync_span ["metadata" ]["provider" ] == "openai"
211223 assert TEST_PROMPT in str (agent_sync_span ["input" ])
@@ -287,7 +299,7 @@ async def fake_run_chat(
287299 assert len (spans ) == 1 , f"Expected 1 CLI span, got { len (spans )} "
288300
289301 cli_span = spans [0 ]
290- assert cli_span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
302+ assert cli_span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
291303 assert cli_span ["span_attributes" ]["name" ] == "agent_to_cli_sync [cli-agent]"
292304 assert cli_span ["metadata" ]["model" ] == "gpt-4o-mini"
293305 assert cli_span ["metadata" ]["provider" ] == "openai"
@@ -497,7 +509,7 @@ async def test_agent_run_stream(memory_logger):
497509 assert chat_span is not None , "chat span not found"
498510
499511 # Check agent span
500- assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
512+ assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
501513 assert agent_span ["metadata" ]["model" ] == "gpt-4o-mini"
502514 assert "Count from 1 to 5" in str (agent_span ["input" ])
503515 _assert_metrics_are_valid (agent_span ["metrics" ], start , end )
@@ -607,7 +619,7 @@ async def test_direct_model_request(memory_logger, direct):
607619 direct_span = next ((s for s in spans if s ["span_attributes" ]["name" ] == "model_request" ), None )
608620 assert direct_span is not None
609621
610- assert direct_span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
622+ assert direct_span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
611623 assert direct_span ["metadata" ]["model" ] == "gpt-4o-mini"
612624 assert direct_span ["metadata" ]["provider" ] == "openai"
613625 assert TEST_PROMPT in str (direct_span ["input" ])
@@ -637,7 +649,7 @@ def test_direct_model_request_sync(memory_logger, direct):
637649 # Find the model_request_sync span
638650 span = next ((s for s in spans if s ["span_attributes" ]["name" ] == "model_request_sync" ), None )
639651 assert span is not None , "model_request_sync span not found"
640- assert span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
652+ assert span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
641653 assert span ["metadata" ]["model" ] == "gpt-4o-mini"
642654 assert TEST_PROMPT in str (span ["input" ])
643655 _assert_metrics_are_valid (span ["metrics" ], start , end )
@@ -668,7 +680,7 @@ async def test_direct_model_request_with_settings(memory_logger, direct):
668680 direct_span = next ((s for s in spans if s ["span_attributes" ]["name" ] == "model_request" ), None )
669681 assert direct_span is not None
670682
671- assert direct_span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
683+ assert direct_span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
672684
673685 # Verify model_settings is in input (NOT metadata)
674686 assert "model_settings" in direct_span ["input" ], "model_settings should be in input"
@@ -713,7 +725,7 @@ async def test_direct_model_request_stream(memory_logger, direct):
713725 direct_span = next ((s for s in spans if s ["span_attributes" ]["name" ] == "model_request_stream" ), None )
714726 assert direct_span is not None
715727
716- assert direct_span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
728+ assert direct_span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
717729 assert direct_span ["metadata" ]["model" ] == "gpt-4o-mini"
718730 _assert_metrics_are_valid (direct_span ["metrics" ], start , end )
719731
@@ -804,7 +816,7 @@ class MathAnswer(BaseModel):
804816 assert chat_span is not None , "chat span not found"
805817
806818 # Check agent span
807- assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
819+ assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
808820 assert agent_span ["metadata" ]["model" ] == "gpt-4o-mini"
809821 assert agent_span ["metadata" ]["provider" ] == "openai"
810822 assert "10 + 15" in str (agent_span ["input" ])
@@ -1092,7 +1104,7 @@ def test_agent_run_stream_sync(memory_logger):
10921104 assert chat_span is not None , "chat span not found"
10931105
10941106 # Check agent span
1095- assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
1107+ assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
10961108 assert agent_span ["metadata" ]["model" ] == "gpt-4o-mini"
10971109 assert "Count from 1 to 3" in str (agent_span ["input" ])
10981110 _assert_metrics_are_valid (agent_span ["metrics" ], start , end )
@@ -1165,7 +1177,7 @@ async def test_agent_run_stream_events(memory_logger):
11651177 assert agent_span is not None , "agent_run_stream_events span not found"
11661178
11671179 # Check agent span has basic structure
1168- assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
1180+ assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
11691181 assert agent_span ["metadata" ]["model" ] == "gpt-4o-mini"
11701182 assert "5+5" in str (agent_span ["input" ]) or "What" in str (agent_span ["input" ])
11711183 assert agent_span ["metrics" ]["event_count" ] == event_count
@@ -1194,7 +1206,7 @@ def test_direct_model_request_stream_sync(memory_logger, direct):
11941206 assert len (spans ) == 1
11951207
11961208 span = spans [0 ]
1197- assert span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
1209+ assert span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
11981210 assert span ["span_attributes" ]["name" ] == "model_request_stream_sync"
11991211 assert span ["metadata" ]["model" ] == "gpt-4o-mini"
12001212 _assert_metrics_are_valid (span ["metrics" ], start , end )
@@ -1258,7 +1270,7 @@ async def stream_wrapper():
12581270 assert len (spans ) >= 1 , "Should have at least one span even with early break"
12591271
12601272 span = spans [0 ]
1261- assert span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
1273+ assert span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
12621274 assert span ["span_attributes" ]["name" ] == "model_request_stream"
12631275
12641276
@@ -1297,7 +1309,7 @@ async def test_agent_stream_early_break(memory_logger):
12971309
12981310 # Verify at least agent_run_stream span exists and has basic structure
12991311 if agent_span :
1300- assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
1312+ assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
13011313 # Metrics may be incomplete due to early break
13021314 assert "start" in agent_span ["metrics" ]
13031315
@@ -1368,7 +1380,7 @@ async def _buffer_stream() -> LLMStreamResponse:
13681380 assert len (spans ) >= 1 , "Should have at least one span even with early return"
13691381
13701382 span = spans [0 ]
1371- assert span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
1383+ assert span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
13721384 assert span ["span_attributes" ]["name" ] == "model_request_stream"
13731385 assert "start" in span ["metrics" ]
13741386 assert span ["metrics" ]["start" ] >= start
@@ -1446,7 +1458,7 @@ async def _consume_until_final() -> StreamEvent:
14461458 # Find agent_run_stream span
14471459 agent_span = next ((s for s in spans if "agent_run_stream" in s ["span_attributes" ]["name" ]), None )
14481460 assert agent_span is not None , "agent_run_stream span should exist"
1449- assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
1461+ assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
14501462 assert "start" in agent_span ["metrics" ]
14511463
14521464
@@ -1500,7 +1512,7 @@ async def test_agent_with_binary_content(memory_logger):
15001512 assert chat_span is not None , "chat span not found"
15011513
15021514 # Verify basic span structure
1503- assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
1515+ assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
15041516 assert agent_span ["metadata" ]["model" ] == "gpt-4o-mini"
15051517 _assert_metrics_are_valid (agent_span ["metrics" ], start , end )
15061518
@@ -2113,7 +2125,7 @@ class Product(BaseModel):
21132125 assert chat_span is not None , "chat span not found"
21142126
21152127 # Check agent span
2116- assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
2128+ assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
21172129 assert agent_span ["metadata" ]["model" ] == "gpt-4o-mini"
21182130 _assert_metrics_are_valid (agent_span ["metrics" ], start , end )
21192131
@@ -2663,7 +2675,7 @@ async def test_no_model_agent_run(memory_logger):
26632675 assert agent_span is not None , "agent_run span not found"
26642676 assert chat_span is not None , "chat span not found"
26652677
2666- assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .LLM
2678+ assert agent_span ["span_attributes" ]["type" ] == SpanTypeAttribute .TASK
26672679 assert agent_span ["metadata" ]["model" ] == "gpt-4o-mini"
26682680 assert agent_span ["metadata" ]["provider" ] == "openai"
26692681 assert TEST_PROMPT in str (agent_span ["input" ])
0 commit comments