Skip to content

Commit 1f75464

Browse files
authored
Feat: separate tool_call_item and tool_call_output_item in stream events (#974)
### Summary During my analysis of the streaming API (run_streamed) results, I noticed that tool_call_item and tool_call_output_item events are currently being emitted concurrently upon tool call completion (as evidenced in [#831](#831)). This implementation conflates what should logically be distinct events. The current PR addresses this by properly separating these event triggers to better reflect the actual workflow. ### Test plan The test file is created in `tests` named `test_stream_events.py`. Run the test script below to test. ```bash pytest -s test_stream_events.py ``` The test result is: ```text ======================================================================== test session starts ======================================================================== platform win32 -- Python 3.12.10, pytest-8.3.5, pluggy-1.5.0 rootdir: D:\moon\projects\openai-agents-python configfile: pyproject.toml plugins: anyio-4.9.0, inline-snapshot-0.22.3, asyncio-0.26.0, mock-3.14.0 asyncio: mode=Mode.AUTO, asyncio_default_fixture_loop_scope=session, asyncio_default_test_loop_scope=function collected 1 item tests\test_stream_events.py === Run starting === Agent updated: Joker -- Message output: a_message -- Tool was called at 1751271106971851300 -- Tool output: success! at 1751271109987313900 -- Message output: done === Run complete === . ========================================================================= 1 passed in 3.07s ========================================================================= ``` ### Issue number [#831](#831) ### Checks - [x] I've added new tests (if relevant) - [x] I've run `make lint` and `make format` - [x] I've made sure tests pass
1 parent fb68e77 commit 1f75464

File tree

4 files changed

+1943
-1835
lines changed

4 files changed

+1943
-1835
lines changed

src/agents/_run_impl.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -914,12 +914,12 @@ async def run_single_output_guardrail(
914914
return result
915915

916916
@classmethod
917-
def stream_step_result_to_queue(
917+
def stream_step_items_to_queue(
918918
cls,
919-
step_result: SingleStepResult,
919+
new_step_items: list[RunItem],
920920
queue: asyncio.Queue[StreamEvent | QueueCompleteSentinel],
921921
):
922-
for item in step_result.new_step_items:
922+
for item in new_step_items:
923923
if isinstance(item, MessageOutputItem):
924924
event = RunItemStreamEvent(item=item, name="message_output_created")
925925
elif isinstance(item, HandoffCallItem):
@@ -944,6 +944,14 @@ def stream_step_result_to_queue(
944944
if event:
945945
queue.put_nowait(event)
946946

947+
@classmethod
def stream_step_result_to_queue(
    cls,
    step_result: SingleStepResult,
    queue: asyncio.Queue[StreamEvent | QueueCompleteSentinel],
):
    """Emit every new item of `step_result` as a stream event on `queue`.

    Thin backward-compatible wrapper around `stream_step_items_to_queue`,
    kept so existing callers that hold a full `SingleStepResult` keep working.
    """
    items = step_result.new_step_items
    cls.stream_step_items_to_queue(items, queue)
954+
947955
@classmethod
948956
async def _check_for_final_output_from_tools(
949957
cls,

src/agents/run.py

Lines changed: 53 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -904,10 +904,9 @@ async def _run_single_turn_streamed(
904904
raise ModelBehaviorError("Model did not produce a final response!")
905905

906906
# 3. Now, we can process the turn as we do in the non-streaming case
907-
single_step_result = await cls._get_single_step_result_from_response(
907+
return await cls._get_single_step_result_from_streamed_response(
908908
agent=agent,
909-
original_input=streamed_result.input,
910-
pre_step_items=streamed_result.new_items,
909+
streamed_result=streamed_result,
911910
new_response=final_response,
912911
output_schema=output_schema,
913912
all_tools=all_tools,
@@ -918,9 +917,6 @@ async def _run_single_turn_streamed(
918917
tool_use_tracker=tool_use_tracker,
919918
)
920919

921-
RunImpl.stream_step_result_to_queue(single_step_result, streamed_result._event_queue)
922-
return single_step_result
923-
924920
@classmethod
925921
async def _run_single_turn(
926922
cls,
@@ -1023,6 +1019,57 @@ async def _get_single_step_result_from_response(
10231019
run_config=run_config,
10241020
)
10251021

1022+
@classmethod
async def _get_single_step_result_from_streamed_response(
    cls,
    *,
    agent: Agent[TContext],
    all_tools: list[Tool],
    streamed_result: RunResultStreaming,
    new_response: ModelResponse,
    output_schema: AgentOutputSchemaBase | None,
    handoffs: list[Handoff],
    hooks: RunHooks[TContext],
    context_wrapper: RunContextWrapper[TContext],
    run_config: RunConfig,
    tool_use_tracker: AgentToolUseTracker,
) -> SingleStepResult:
    """Turn a streamed model response into a single-step result.

    Unlike the non-streaming path, stream events are emitted in two phases:
    items produced directly by the model response (e.g. `tool_call_item`) are
    queued *before* tools execute, and items produced by tool execution
    (e.g. `tool_call_output_item`) are queued only once they exist. This keeps
    the two event kinds temporally separated instead of arriving together.
    """
    original_input = streamed_result.input
    pre_step_items = streamed_result.new_items
    event_queue = streamed_result._event_queue

    processed_response = RunImpl.process_model_response(
        agent=agent,
        all_tools=all_tools,
        response=new_response,
        output_schema=output_schema,
        handoffs=handoffs,
    )
    new_items_processed_response = processed_response.new_items
    tool_use_tracker.add_tool_use(agent, processed_response.tools_used)
    # Phase 1: emit the model-produced items before any tool actually runs.
    RunImpl.stream_step_items_to_queue(new_items_processed_response, event_queue)

    single_step_result = await RunImpl.execute_tools_and_side_effects(
        agent=agent,
        original_input=original_input,
        pre_step_items=pre_step_items,
        new_response=new_response,
        processed_response=processed_response,
        output_schema=output_schema,
        hooks=hooks,
        context_wrapper=context_wrapper,
        run_config=run_config,
    )
    # Phase 2: emit only the items created by tool execution. Filter by object
    # identity (not `==`) so a new item that happens to compare equal to an
    # already-streamed one is never dropped, and use a set for O(1) lookups
    # instead of repeated O(n) list membership tests.
    already_streamed_ids = {id(item) for item in new_items_processed_response}
    new_step_items = [
        item
        for item in single_step_result.new_step_items
        if id(item) not in already_streamed_ids
    ]
    RunImpl.stream_step_items_to_queue(new_step_items, event_queue)

    return single_step_result
1072+
10261073
@classmethod
10271074
async def _run_input_guardrails(
10281075
cls,

tests/test_stream_events.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import asyncio
2+
import time
3+
4+
import pytest
5+
6+
from agents import Agent, Runner, function_tool
7+
8+
from .fake_model import FakeModel
9+
from .test_responses import get_function_tool_call, get_text_message
10+
11+
12+
@function_tool
async def foo() -> str:
    """Tool that yields control briefly before returning.

    The await guarantees the `tool_call_item` event is observed strictly
    before the `tool_call_output_item` event. A short sleep is ample for the
    nanosecond-resolution timestamps the test compares; the original 3 s
    sleep only made the test suite slow.
    """
    await asyncio.sleep(0.1)
    return "success!"
16+
17+
@pytest.mark.asyncio
async def test_stream_events_main():
    """Tool-call and tool-output stream events must arrive at distinct times,
    with the call event strictly before the output event."""
    model = FakeModel()
    agent = Agent(
        name="Joker",
        model=model,
        tools=[foo],
    )

    model.add_multiple_turn_outputs(
        [
            # Turn 1: the model emits a message and a tool call.
            [
                get_text_message("a_message"),
                get_function_tool_call("foo", ""),
            ],
            # Turn 2: the model emits the final text message.
            [get_text_message("done")],
        ]
    )

    result = Runner.run_streamed(
        agent,
        input="Hello",
    )

    # Timestamp (ns) at which each interesting item type was first observed.
    observed_at = {"tool_call_item": -1, "tool_call_output_item": -1}
    async for event in result.stream_events():
        if event.type != "run_item_stream_event":
            continue
        if event.item.type in observed_at:
            observed_at[event.item.type] = time.time_ns()

    tool_call_start_time = observed_at["tool_call_item"]
    tool_call_end_time = observed_at["tool_call_output_item"]
    assert tool_call_start_time > 0, "tool_call_item was not observed"
    assert tool_call_end_time > 0, "tool_call_output_item was not observed"
    assert tool_call_start_time < tool_call_end_time, "Tool call ended before or equals it started?"

0 commit comments

Comments
 (0)