Merge pull request #522 from sarinali/jagjeevan/344-merged

Jagjeevan's Fix + Merged Main for 4o Model Fix
2026-04-25 07:58:33 -05:00 · 2025-10-28 15:58:26 -07:00
parent c92ebbe817 11e8f891c3
commit d509d013af
4 changed files with 24 additions and 10 deletions
@@ -28,8 +28,12 @@ class AsyncComputerHandler(Protocol):
        """Get screen dimensions as (width, height)."""
        ...

-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
        ...

    async def click(self, x: int, y: int, button: str = "left") -> None:
@@ -36,8 +36,12 @@ class cuaComputerHandler(AsyncComputerHandler):
        screen_size = await self.interface.get_screen_size()
        return screen_size["width"], screen_size["height"]

-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
        assert self.interface is not None
        screenshot_bytes = await self.interface.screenshot()
        return base64.b64encode(screenshot_bytes).decode("utf-8")
@@ -122,8 +122,12 @@ class CustomComputerHandler(AsyncComputerHandler):

        return self._last_screenshot_size

-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
        result = await self._call_function(self.functions["screenshot"])
        b64_str = self._to_b64_str(result)  # type: ignore

@@ -243,18 +243,20 @@ async def replace_computer_call_with_function(
                "id": item.get("id"),
                "call_id": item.get("call_id"),
                "status": "completed",
-                # Fall back to string representation
-                "content": f"Used tool: {action_data.get("type")}({json.dumps(fn_args)})",
            }
        ]

    elif item_type == "computer_call_output":
-        # Simple conversion: computer_call_output -> function_call_output
+        output = item.get("output")
+
+        if isinstance(output, dict):
+            output = [output]
+
        return [
            {
                "type": "function_call_output",
                "call_id": item.get("call_id"),
-                "content": [item.get("output")],
+                "output": output,
                "id": item.get("id"),
                "status": "completed",
            }