Merge pull request #522 from sarinali/jagjeevan/344-merged

Jagjeevan's Fix + Merged Main for 4o Model Fix
This commit is contained in:
James Murdza
2025-10-28 15:58:26 -07:00
committed by GitHub
4 changed files with 24 additions and 10 deletions
+6 -2
View File
@@ -28,8 +28,12 @@ class AsyncComputerHandler(Protocol):
"""Get screen dimensions as (width, height)."""
...
async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
async def screenshot(self, text: Optional[str] = None) -> str:
"""Take a screenshot and return as base64 string.
Args:
text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
"""
...
async def click(self, x: int, y: int, button: str = "left") -> None:
+6 -2
View File
@@ -36,8 +36,12 @@ class cuaComputerHandler(AsyncComputerHandler):
screen_size = await self.interface.get_screen_size()
return screen_size["width"], screen_size["height"]
async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
async def screenshot(self, text: Optional[str] = None) -> str:
"""Take a screenshot and return as base64 string.
Args:
text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
"""
assert self.interface is not None
screenshot_bytes = await self.interface.screenshot()
return base64.b64encode(screenshot_bytes).decode("utf-8")
+6 -2
View File
@@ -122,8 +122,12 @@ class CustomComputerHandler(AsyncComputerHandler):
return self._last_screenshot_size
async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
async def screenshot(self, text: Optional[str] = None) -> str:
"""Take a screenshot and return as base64 string.
Args:
text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
"""
result = await self._call_function(self.functions["screenshot"])
b64_str = self._to_b64_str(result) # type: ignore
+6 -4
View File
@@ -243,18 +243,20 @@ async def replace_computer_call_with_function(
"id": item.get("id"),
"call_id": item.get("call_id"),
"status": "completed",
# Fall back to string representation
"content": f"Used tool: {action_data.get("type")}({json.dumps(fn_args)})",
}
]
elif item_type == "computer_call_output":
# Simple conversion: computer_call_output -> function_call_output
output = item.get("output")
if isinstance(output, dict):
output = [output]
return [
{
"type": "function_call_output",
"call_id": item.get("call_id"),
"content": [item.get("output")],
"output": output,
"id": item.get("id"),
"status": "completed",
}