App-usage stability fixes

This commit is contained in:
Dillon DuPont
2025-06-04 09:05:20 -04:00
parent 79bcf9d05d
commit 58a453dc49
3 changed files with 107 additions and 34 deletions

View File

@@ -36,11 +36,21 @@ class Diorama:
cls._ensure_scheduler()
return cls(args).computer
# Dictionary to store cursor positions for each unique app_list hash
_cursor_positions = {}
def __init__(self, app_list):
self.app_list = app_list
self.interface = self.Interface(self)
self.computer = DioramaComputer(self)
self.focus_context = None
# Create a hash for this app_list to use as a key
self.app_list_hash = hash(tuple(sorted(app_list)))
# Initialize cursor position for this app_list if it doesn't exist
if self.app_list_hash not in Diorama._cursor_positions:
Diorama._cursor_positions[self.app_list_hash] = (0, 0)
@classmethod
def _ensure_scheduler(cls):
@@ -67,10 +77,11 @@ class Diorama:
frontmost_app, active_app_to_use, active_app_pid = get_frontmost_and_active_app(all_windows, running_apps, app_whitelist)
focus_context = AppActivationContext(active_app_pid, active_app_to_use, logger)
app_list_hash = hash(tuple(sorted(app_whitelist)))
with focus_context:
try:
if action == "screenshot":
app_whitelist = list(args["app_list"])
logger.info(f"Taking screenshot for apps: {app_whitelist}")
result, img = capture_all_apps(
app_whitelist=app_whitelist,
@@ -82,8 +93,15 @@ class Diorama:
future.set_result((result, img))
# Mouse actions
elif action in ["left_click", "right_click", "double_click", "move_cursor", "drag_to"]:
x = args.get("x")
y = args.get("y")
# Get last cursor position for this app_list hash
last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
x = args.get("x", last_pos[0])
y = args.get("y", last_pos[1])
# Update the cursor position for this app_list hash
Diorama._cursor_positions[app_list_hash] = (x, y)
duration = args.get("duration", 0.5)
if action == "left_click":
await automation_handler.left_click(x, y)
@@ -98,6 +116,10 @@ class Diorama:
if future:
future.set_result(None)
elif action in ["scroll_up", "scroll_down"]:
# Move cursor to last known position for this app_list hash
last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
await automation_handler.move_cursor(*last_pos)
clicks = args.get("clicks", 1)
if action == "scroll_up":
await automation_handler.scroll_up(clicks)

View File

@@ -37,7 +37,7 @@ class DioramaComputerInterface:
raise RuntimeError("Computer interface not initialized. Call run() first.")
result = await iface.diorama_cmd(action, arguments)
if not result.get("success"):
raise RuntimeError(f"Diorama command failed: {result.get('error')}")
raise RuntimeError(f"Diorama command failed: {result.get('error')}\n{result.get('trace')}")
return result.get("result")
async def screenshot(self, as_bytes=True):

View File

@@ -463,7 +463,7 @@ async def execute(name, action, arguments):
elif action == "left_click":
if "x" in arguments and "y" in arguments:
await computer.interface.move_cursor(arguments["x"], arguments["y"])
await computer.interface.left_click()
await computer.interface.left_click(arguments["x"], arguments["y"])
await asyncio.sleep(0.5)
elif action == "right_click":
if "x" in arguments and "y" in arguments:
@@ -528,43 +528,75 @@ async def execute(name, action, arguments):
return results
async def handle_init_computer(os_choice: str):
"""Initialize the computer instance and tools for macOS or Ubuntu"""
async def handle_init_computer(os_choice: str, app_list=None, provider="lume"):
"""Initialize the computer instance and tools for macOS or Ubuntu
Args:
os_choice: The OS to use ("macOS" or "Ubuntu")
app_list: Optional list of apps to focus on using the app-use experiment
provider: The provider to use ("lume" or "self")
"""
global computer, tool_call_logs, tools
# Check if we should enable app-use experiment
use_app_experiment = app_list and len(app_list) > 0
experiments = ["app-use"] if use_app_experiment else None
# Determine if we should use host computer server
use_host_computer_server = provider == "self"
if os_choice == "Ubuntu":
computer = Computer(
image="ubuntu-noble-vanilla:latest",
os_type="linux",
provider_type=VMProviderType.LUME,
display="1024x768",
memory="8GB",
cpu="4"
)
os_type_str = "linux"
image_str = "ubuntu-noble-vanilla:latest"
else:
os_type_str = "macos"
image_str = "macos-sequoia-cua:latest"
# Create computer instance with appropriate configuration
if use_host_computer_server:
computer = Computer(
image="macos-sequoia-cua:latest",
os_type="macos",
os_type=os_type_str,
use_host_computer_server=True,
experiments=experiments
)
else:
computer = Computer(
image=image_str,
os_type=os_type_str,
provider_type=VMProviderType.LUME,
display="1024x768",
memory="8GB",
cpu="4"
cpu="4",
experiments=experiments
)
os_type_str = "macos"
image_str = "macos-sequoia-cua:latest"
await computer.run()
# If app list is provided, create desktop from apps
if use_app_experiment:
computer = computer.create_desktop_from_apps(app_list)
# Log computer initialization as a tool call
result = await execute("computer", "initialize", {
init_params = {
"os": os_type_str,
"image": image_str,
"display": "1024x768",
"memory": "8GB",
"cpu": "4"
})
"provider": provider
}
# Add VM-specific parameters if not using host computer server
if not use_host_computer_server:
init_params.update({
"image": image_str,
"display": "1024x768",
"memory": "8GB",
"cpu": "4"
})
# Add app list to the log if provided
if use_app_experiment:
init_params["apps"] = app_list
init_params["experiments"] = ["app-use"]
result = await execute("computer", "initialize", init_params)
return result["screenshot"], json.dumps(tool_call_logs, indent=2)
@@ -1029,12 +1061,31 @@ def create_gradio_ui():
setup_status = gr.Textbox(label="Setup Status", value="")
with gr.Group():
os_choice = gr.Radio(
label="OS",
choices=["macOS", "Ubuntu"],
value="macOS",
interactive=False # disable until the ubuntu image is ready
)
with gr.Accordion("Computer Configuration", open=False):
with gr.Row():
os_choice = gr.Radio(
label="OS",
choices=["macOS", "Ubuntu"],
value="macOS",
interactive=False # disable until the ubuntu image is ready
)
# Provider selection radio
provider_choice = gr.Radio(
label="Provider",
choices=["lume", "self"],
value="lume",
info="'lume' uses a VM, 'self' uses the host computer server"
)
# App filtering dropdown for app-use experiment
app_filter = gr.Dropdown(
label="Filter by apps (App-Use)",
multiselect=True,
allow_custom_value=True,
info="When apps are selected, the computer will focus on those apps using the app-use experiment"
)
start_btn = gr.Button("Initialize Computer")
with gr.Group():
@@ -1199,7 +1250,7 @@ def create_gradio_ui():
)
img.select(handle_click, inputs=[img, click_type], outputs=[img, action_log])
start_btn.click(handle_init_computer, inputs=[os_choice], outputs=[img, action_log])
start_btn.click(handle_init_computer, inputs=[os_choice, app_filter, provider_choice], outputs=[img, action_log])
wait_btn.click(handle_wait, outputs=[img, action_log])
# DONE and FAIL buttons just do a placeholder action