From 5846f0ef085794d10be185964d94cce545ac6782 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Sat, 31 May 2025 12:40:02 -0400 Subject: [PATCH] removed extra files --- notebooks/diorama/__init__.py | 3 - notebooks/diorama/agent_example.py | 68 -- notebooks/diorama/diorama.py | 392 -------- notebooks/diorama/diorama_computer.py | 27 - notebooks/diorama/draw.py | 1211 ------------------------- notebooks/diorama/safezone.py | 199 ---- 6 files changed, 1900 deletions(-) delete mode 100644 notebooks/diorama/__init__.py delete mode 100644 notebooks/diorama/agent_example.py delete mode 100644 notebooks/diorama/diorama.py delete mode 100644 notebooks/diorama/diorama_computer.py delete mode 100644 notebooks/diorama/draw.py delete mode 100644 notebooks/diorama/safezone.py diff --git a/notebooks/diorama/__init__.py b/notebooks/diorama/__init__.py deleted file mode 100644 index 3608c381..00000000 --- a/notebooks/diorama/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .diorama import Diorama - -__all__ = ["Diorama"] diff --git a/notebooks/diorama/agent_example.py b/notebooks/diorama/agent_example.py deleted file mode 100644 index 914306a7..00000000 --- a/notebooks/diorama/agent_example.py +++ /dev/null @@ -1,68 +0,0 @@ -import asyncio -from diorama import Diorama -from pathlib import Path -from dotenv import load_dotenv -load_dotenv(Path("~/cua/.env.local").expanduser()) - -from agent import AgentLoop, ComputerAgent as Agent, LLM, LLMProvider -from PIL import Image -import rpack - -async def make_mosaic(dioramas): - sizes = [] - for d in dioramas: - size = await d.interface.get_screen_size() - sizes.append((size['width'], size['height'])) - positions = rpack.pack(sizes) - max_x = max(x + w for (x, y), (w, h) in zip(positions, sizes)) - max_y = max(y + h for (x, y), (w, h) in zip(positions, sizes)) - mosaic = Image.new("RGBA", (max_x, max_y), (30, 30, 30, 255)) - draw_positions = positions - return mosaic, draw_positions - -async def main(): - # diorama's are virtual desktops, they allow you to control multiple apps at once - diorama1 = Diorama.create_from_apps("Safari") - diorama2 = Diorama.create_from_apps("Notes") - diorama3 = Diorama.create_from_apps("Calculator") - diorama4 = Diorama.create_from_apps("Terminal") - - # create agents - agents = [ - diorama1.agent.openai(), - diorama2.agent.openai(), - diorama3.agent.openai(),modif - diorama4.agent.openai() - ] - dioramas = [diorama1, diorama2, diorama3, diorama4] - mosaic, draw_positions = await make_mosaic(dioramas) - mosaic.save(Path("~/cua/notebooks/app_screenshots/mosaic.png").expanduser()) - - tasks = [ - "In Safari, find a cat picture", - "In Notes, make a note named 'Test' and draw an ASCII dog", - "In Calculator, add 2 + 2", - "In Terminal, type 'ls' and press enter" - ] - - async def run_agent(agent, task, diorama_idx): - diorama = dioramas[diorama_idx] - - # start with a screenshot - screenshot = await diorama.interface.screenshot(as_bytes=False) - mosaic.paste(screenshot, draw_positions[diorama_idx]) - mosaic.save(Path("~/cua/notebooks/app_screenshots/mosaic.png").expanduser()) - - async for response in agent.run(task): - print(response) - - # update mosaic - screenshot = await diorama.interface.screenshot(as_bytes=False) - mosaic.paste(screenshot, draw_positions[diorama_idx]) - mosaic.save(Path("~/cua/notebooks/app_screenshots/mosaic.png").expanduser()) - - # run agents - await asyncio.gather(*[run_agent(agent, task, idx) for idx, (agent, task) in enumerate(zip(agents, tasks))]) - -if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file diff --git a/notebooks/diorama/diorama.py b/notebooks/diorama/diorama.py deleted file mode 100644 index f4e5cbdf..00000000 --- a/notebooks/diorama/diorama.py +++ /dev/null @@ -1,392 +0,0 @@ -#!/usr/bin/env python3 -"""Diorama: A virtual desktop manager for macOS""" - -import os -import asyncio -import logging -import sys -import io -from typing import Union -from PIL import Image, ImageDraw - -from draw import capture_all_apps, AppActivationContext, get_frontmost_and_active_app, get_all_windows, get_running_apps - -from diorama_computer import DioramaComputer -from computer_server.handlers.macos import * -from agent import ComputerAgent, LLM, LLMProvider, AgentLoop - -# simple, nicely formatted logging -logging.basicConfig( - level=logging.INFO, - format='[%(asctime)s] [%(levelname)s] %(message)s', - datefmt='%H:%M:%S', - stream=sys.stdout -) -logger = logging.getLogger("diorama.virtual_desktop") - -automation_handler = MacOSAutomationHandler() - -class AgentFactory: - def __init__(self, diorama): - self.diorama = diorama - - def create_agent(self, loop: AgentLoop, model: LLM): - return ComputerAgent( - computer=self.diorama.computer, - loop=loop, - model=model - ) - - def openai(self): - return self.create_agent(AgentLoop.OPENAI, LLM( - provider=LLMProvider.OPENAI, - name="computer-use-preview" - )) - - def anthropic(self): - return self.create_agent(AgentLoop.ANTHROPIC, LLM( - provider=LLMProvider.ANTHROPIC, - )) - - def openai_omni(self, model_name): - return self.create_agent(AgentLoop.OMNI, LLM( - provider=LLMProvider.OPENAI, - name=model_name - )) - - def uitars(self): - return self.create_agent(AgentLoop.UITARS, LLM( - provider=LLMProvider.OAICOMPAT, - name="tgi", - provider_base_url=os.getenv("UITARS_BASE_URL") - )) - -class Diorama: - _scheduler_queue = None - _scheduler_task = None - _loop = None - _scheduler_started = False - - @classmethod - def create_from_apps(cls, *args) -> DioramaComputer: - cls._ensure_scheduler() - return cls(args).computer - - def __init__(self, app_list): - self.app_list = app_list - self.agent = AgentFactory(self) - self.interface = self.Interface(self) - self.computer = DioramaComputer(self) - self.focus_context = None - - @classmethod - def _ensure_scheduler(cls): - if not cls._scheduler_started: - logger.info("Starting Diorama scheduler loop…") - cls._scheduler_queue = asyncio.Queue() - cls._loop = asyncio.get_event_loop() - cls._scheduler_task = cls._loop.create_task(cls._scheduler_loop()) - cls._scheduler_started = True - - @classmethod - async def _scheduler_loop(cls): - while True: - cmd = await cls._scheduler_queue.get() - action = cmd.get("action") - args = cmd.get("arguments", {}) - future = cmd.get("future") - logger.info(f"Processing command: {action} | args={args}") - - app_whitelist = args.get("app_list", []) - - all_windows = get_all_windows() - running_apps = get_running_apps() - frontmost_app, active_app_to_use, active_app_pid = get_frontmost_and_active_app(all_windows, running_apps, app_whitelist) - focus_context = AppActivationContext(active_app_pid, active_app_to_use, logger) - - with focus_context: - try: - if action == "screenshot": - app_whitelist = list(args["app_list"]) - logger.info(f"Taking screenshot for apps: {app_whitelist}") - result, img = capture_all_apps( - app_whitelist=app_whitelist, - save_to_disk=False, - take_focus=False - ) - logger.info("Screenshot complete.") - if future: - future.set_result((result, img)) - # Mouse actions - elif action in ["left_click", "right_click", "double_click", "move_cursor", "drag_to"]: - x = args.get("x") - y = args.get("y") - duration = args.get("duration", 0.5) - if action == "left_click": - await automation_handler.left_click(x, y) - elif action == "right_click": - await automation_handler.right_click(x, y) - elif action == "double_click": - await automation_handler.double_click(x, y) - elif action == "move_cursor": - await automation_handler.move_cursor(x, y) - elif action == "drag_to": - await automation_handler.drag_to(x, y, duration=duration) - if future: - future.set_result(None) - # Keyboard actions - elif action == "type_text": - text = args.get("text") - await automation_handler.type_text(text) - if future: - future.set_result(None) - elif action == "press_key": - key = args.get("key") - await automation_handler.press_key(key) - if future: - future.set_result(None) - elif action == "hotkey": - keys = args.get("keys", []) - await automation_handler.hotkey(keys) - if future: - future.set_result(None) - elif action == "get_cursor_position": - pos = await automation_handler.get_cursor_position() - if future: - future.set_result(pos) - else: - logger.warning(f"Unknown action: {action}") - if future: - future.set_exception(ValueError(f"Unknown action: {action}")) - except Exception as e: - logger.error(f"Exception during {action}: {e}", exc_info=True) - if future: - future.set_exception(e) - - class Interface(): - def __init__(self, diorama): - self._diorama = diorama - - self._scene_hitboxes = [] - self._scene_size = None - - async def _send_cmd(self, action, arguments=None): - Diorama._ensure_scheduler() - loop = asyncio.get_event_loop() - future = loop.create_future() - logger.info(f"Enqueuing {action} command for apps: {self._diorama.app_list}") - await Diorama._scheduler_queue.put({ - "action": action, - "arguments": {"app_list": self._diorama.app_list, **(arguments or {})}, - "future": future - }) - try: - return await future - except asyncio.CancelledError: - logger.warning(f"Command was cancelled: {action}") - return None - - async def screenshot(self, as_bytes: bool = True) -> Union[bytes, Image]: - result, img = await self._send_cmd("screenshot") - self._scene_hitboxes = result.get("hitboxes", []) - self._scene_size = img.size - - if as_bytes: - # PIL Image to bytes - img_byte_arr = io.BytesIO() - img.save(img_byte_arr, format="PNG") - img_byte_arr = img_byte_arr.getvalue() - return img_byte_arr - else: - return img - - async def left_click(self, x, y): - sx, sy = await self.to_screen_coordinates(x, y) - await self._send_cmd("left_click", {"x": sx, "y": sy}) - - async def right_click(self, x, y): - sx, sy = await self.to_screen_coordinates(x, y) - await self._send_cmd("right_click", {"x": sx, "y": sy}) - - async def double_click(self, x, y): - sx, sy = await self.to_screen_coordinates(x, y) - await self._send_cmd("double_click", {"x": sx, "y": sy}) - - async def move_cursor(self, x, y): - sx, sy = await self.to_screen_coordinates(x, y) - await self._send_cmd("move_cursor", {"x": sx, "y": sy}) - - async def drag_to(self, x, y, duration=0.5): - sx, sy = await self.to_screen_coordinates(x, y) - await self._send_cmd("drag_to", {"x": sx, "y": sy, "duration": duration}) - - async def get_cursor_position(self): - return await self._send_cmd("get_cursor_position") - - async def type_text(self, text): - await self._send_cmd("type_text", {"text": text}) - - async def press_key(self, key): - await self._send_cmd("press_key", {"key": key}) - - async def hotkey(self, *keys): - await self._send_cmd("hotkey", {"keys": list(keys)}) - - async def get_screen_size(self) -> dict[str, int]: - if not self._scene_size: - await self.screenshot() - return { "width": self._scene_size[0], "height": self._scene_size[1] } - - async def to_screen_coordinates(self, x: float, y: float) -> tuple[float, float]: - """Convert screenshot coordinates to screen coordinates. - - Args: - x: X absolute coordinate in screenshot space - y: Y absolute coordinate in screenshot space - - Returns: - tuple[float, float]: (x, y) absolute coordinates in screen space - """ - if not self._scene_hitboxes: - await self.screenshot() # get hitboxes - # Try all hitboxes - for h in self._scene_hitboxes[::-1]: - rect_from = h.get("hitbox") - rect_to = h.get("target") - if not rect_from or len(rect_from) != 4: - continue - - # check if (x, y) is inside rect_from - x0, y0, x1, y1 = rect_from - if x0 <= x <= x1 and y0 <= y <= y1: - logger.info(f"Found hitbox: {h}") - # remap (x, y) to rect_to - tx0, ty0, tx1, ty1 = rect_to - - # calculate offset from x0, y0 - offset_x = x - x0 - offset_y = y - y0 - - # remap offset to rect_to - tx = tx0 + offset_x - ty = ty0 + offset_y - - return tx, ty - return x, y - - async def to_screenshot_coordinates(self, x: float, y: float) -> tuple[float, float]: - """Convert screen coordinates to screenshot coordinates. - - Args: - x: X absolute coordinate in screen space - y: Y absolute coordinate in screen space - - Returns: - tuple[float, float]: (x, y) absolute coordinates in screenshot space - """ - if not self._scene_hitboxes: - await self.screenshot() # get hitboxes - # Try all hitboxes - for h in self._scene_hitboxes[::-1]: - rect_from = h.get("target") - rect_to = h.get("hitbox") - if not rect_from or len(rect_from) != 4: - continue - - # check if (x, y) is inside rect_from - x0, y0, x1, y1 = rect_from - if x0 <= x <= x1 and y0 <= y <= y1: - # remap (x, y) to rect_to - tx0, ty0, tx1, ty1 = rect_to - - # calculate offset from x0, y0 - offset_x = x - x0 - offset_y = y - y0 - - # remap offset to rect_to - tx = tx0 + offset_x - ty = ty0 + offset_y - - return tx, ty - return x, y - -import pyautogui -import time - -async def main(): - desktop1 = Diorama.create_from_apps(["Discord", "Notes"]) - desktop2 = Diorama.create_from_apps(["Terminal"]) - - img1 = await desktop1.interface.screenshot(as_bytes=False) - img2 = await desktop2.interface.screenshot(as_bytes=False) - - img1.save("app_screenshots/desktop1.png") - img2.save("app_screenshots/desktop2.png") - # Initialize Diorama desktop - desktop3 = Diorama.create_from_apps("Safari") - screen_size = await desktop3.interface.get_screen_size() - print(screen_size) - - # Take initial screenshot - img = await desktop3.interface.screenshot(as_bytes=False) - img.save("app_screenshots/desktop3.png") - - # Prepare hitboxes and draw on the single screenshot - hitboxes = desktop3.interface._scene_hitboxes[::-1] - base_img = img.copy() - draw = ImageDraw.Draw(base_img) - for h in hitboxes: - rect = h.get("hitbox") - if not rect or len(rect) != 4: - continue - draw.rectangle(rect, outline="red", width=2) - - # Track and draw mouse position in real time (single screenshot size) - last_mouse_pos = None - print("Tracking mouse... Press Ctrl+C to stop.") - try: - while True: - mouse_x, mouse_y = pyautogui.position() - if last_mouse_pos != (mouse_x, mouse_y): - last_mouse_pos = (mouse_x, mouse_y) - # Map to screenshot coordinates - sx, sy = await desktop3.interface.to_screenshot_coordinates(mouse_x, mouse_y) - # Draw on a copy of the screenshot - frame = base_img.copy() - frame_draw = ImageDraw.Draw(frame) - frame_draw.ellipse((sx-5, sy-5, sx+5, sy+5), fill="blue", outline="blue") - # Save the frame - frame.save("app_screenshots/desktop3_mouse.png") - print(f"Mouse at screen ({mouse_x}, {mouse_y}) -> screenshot ({sx:.1f}, {sy:.1f})") - time.sleep(0.05) # Throttle updates to ~20 FPS - except KeyboardInterrupt: - print("Stopped tracking.") - - draw.text((rect[0], rect[1]), str(idx), fill="red") - - canvas.save("app_screenshots/desktop3_hitboxes.png") - - - - # move mouse in a square spiral around the screen - import math - import random - - step = 20 # pixels per move - dot_radius = 10 - width = screen_size["width"] - height = screen_size["height"] - x, y = 0, 10 - - while x < width and y < height: - await desktop3.interface.move_cursor(x, y) - img = await desktop3.interface.screenshot(as_bytes=False) - draw = ImageDraw.Draw(img) - draw.ellipse((x-dot_radius, y-dot_radius, x+dot_radius, y+dot_radius), fill="red") - img.save("current.png") - await asyncio.sleep(0.03) - x += step - y = math.sin(x / width * math.pi * 2) * 50 + 25 - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/notebooks/diorama/diorama_computer.py b/notebooks/diorama/diorama_computer.py deleted file mode 100644 index 1ddad0a7..00000000 --- a/notebooks/diorama/diorama_computer.py +++ /dev/null @@ -1,27 +0,0 @@ -import asyncio - -class DioramaComputer: - """ - A minimal Computer-like interface for Diorama, compatible with ComputerAgent. - Implements _initialized, run(), and __aenter__ for agent compatibility. - """ - def __init__(self, diorama): - self.diorama = diorama - self.interface = self.diorama.interface - self.agent = self.diorama.agent - self._initialized = False - - async def __aenter__(self): - # Ensure the event loop is running (for compatibility) - try: - asyncio.get_running_loop() - except RuntimeError: - asyncio.set_event_loop(asyncio.new_event_loop()) - self._initialized = True - return self - - async def run(self): - # This is a stub for compatibility - if not self._initialized: - await self.__aenter__() - return self diff --git a/notebooks/diorama/draw.py b/notebooks/diorama/draw.py deleted file mode 100644 index d4097164..00000000 --- a/notebooks/diorama/draw.py +++ /dev/null @@ -1,1211 +0,0 @@ -#!/usr/bin/env python3 -"""Diorama Renderer - A tool for rendering selective views of macOS desktops - -This script renders filtered views of the macOS desktop, preserving only selected applications -while maintaining system UI elements like menubar and dock. Each "diorama" shows a consistent -view of the system while isolating specific applications. - -The image is "smart resized" to remove any empty space around the menubar and dock. - -Key features: -- Captures shared window state, z-order and position information -- Filters windows by application based on whitelist -- Preserves system context (menubar, dock) in each view -- Preserves menu-owning / keyboard-focused window in each view -- Supports parallel views of the same desktop for multi-agent systems -""" - -import sys -import os -import time -import argparse -from typing import List, Dict, Any, Optional, Tuple -import json -from PIL import Image, ImageDraw -import io -import asyncio -import functools -import logging - -# simple, nicely formatted logging -logging.basicConfig( - level=logging.INFO, - format='[%(asctime)s] [%(levelname)s] %(message)s', - datefmt='%H:%M:%S', - stream=sys.stdout -) -logger = logging.getLogger("diorama.draw") - - -from safezone import ( - get_menubar_bounds, - get_dock_bounds, -) - -# Timing decorator for profiling -def timing_decorator(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - start_time = time.time() - result = func(*args, **kwargs) - end_time = time.time() - elapsed_time = end_time - start_time - logger.debug(f"Function {func.__name__} took {elapsed_time:.4f} seconds to run") - return result - return wrapper - -# Import Objective-C bridge libraries -try: - import Quartz - import AppKit - from ApplicationServices import ( - AXUIElementCreateSystemWide, # type: ignore - AXUIElementCreateApplication, # type: ignore - AXUIElementCopyAttributeValue, # type: ignore - AXUIElementCopyAttributeValues, # type: ignore - kAXFocusedWindowAttribute, # type: ignore - kAXWindowsAttribute, # type: ignore - kAXMainWindowAttribute, # type: ignore - kAXChildrenAttribute, # type: ignore - kAXRoleAttribute, # type: ignore - kAXTitleAttribute, # type: ignore - kAXValueAttribute, # type: ignore - kAXDescriptionAttribute, # type: ignore - kAXEnabledAttribute, # type: ignore - kAXPositionAttribute, # type: ignore - kAXSizeAttribute, # type: ignore - kAXErrorSuccess, # type: ignore - AXValueGetType, # type: ignore - kAXValueCGSizeType, # type: ignore - kAXValueCGPointType, # type: ignore - kAXValueCFRangeType, # type: ignore - AXUIElementGetTypeID, # type: ignore - AXValueGetValue, # type: ignore - kAXVisibleChildrenAttribute, # type: ignore - kAXRoleDescriptionAttribute, # type: ignore - kAXFocusedApplicationAttribute, # type: ignore - kAXFocusedUIElementAttribute, # type: ignore - kAXSelectedTextAttribute, # type: ignore - kAXSelectedTextRangeAttribute, # type: ignore - ) - from AppKit import NSWorkspace, NSApplication, NSApp, NSRunningApplication - import Foundation - from Foundation import NSObject, NSMakeRect - import objc -except ImportError: - logger.error("Error: This script requires PyObjC to be installed.") - logger.error("Please install it with: pip install pyobjc") - sys.exit(1) - -# Constants for accessibility API -kAXErrorSuccess = 0 -kAXRoleAttribute = "AXRole" -kAXTitleAttribute = "AXTitle" -kAXValueAttribute = "AXValue" -kAXWindowsAttribute = "AXWindows" -kAXFocusedAttribute = "AXFocused" -kAXPositionAttribute = "AXPosition" -kAXSizeAttribute = "AXSize" -kAXChildrenAttribute = "AXChildren" -kAXMenuBarAttribute = "AXMenuBar" -kAXMenuBarItemAttribute = "AXMenuBarItem" - -# Constants for window properties -kCGWindowLayer = "kCGWindowLayer" # Z-order information (lower values are higher in the stack) -kCGWindowAlpha = "kCGWindowAlpha" # Window opacity - -# Constants for application activation options -NSApplicationActivationOptions = { - "regular": 0, # Default activation - "bringing_all_windows_forward": 1 << 0, # NSApplicationActivateAllWindows - "ignoring_other_apps": 1 << 1 # NSApplicationActivateIgnoringOtherApps -} - - -def CFAttributeToPyObject(attrValue): - def list_helper(list_value): - list_builder = [] - for item in list_value: - list_builder.append(CFAttributeToPyObject(item)) - return list_builder - - def number_helper(number_value): - success, int_value = Foundation.CFNumberGetValue( # type: ignore - number_value, Foundation.kCFNumberIntType, None # type: ignore - ) - if success: - return int(int_value) - - success, float_value = Foundation.CFNumberGetValue( # type: ignore - number_value, Foundation.kCFNumberDoubleType, None # type: ignore - ) - if success: - return float(float_value) - return None - - def axuielement_helper(element_value): - return element_value - - cf_attr_type = Foundation.CFGetTypeID(attrValue) # type: ignore - cf_type_mapping = { - Foundation.CFStringGetTypeID(): str, # type: ignore - Foundation.CFBooleanGetTypeID(): bool, # type: ignore - Foundation.CFArrayGetTypeID(): list_helper, # type: ignore - Foundation.CFNumberGetTypeID(): number_helper, # type: ignore - AXUIElementGetTypeID(): axuielement_helper, # type: ignore - } - try: - return cf_type_mapping[cf_attr_type](attrValue) - except KeyError: - # did not get a supported CF type. Move on to AX type - pass - - ax_attr_type = AXValueGetType(attrValue) - ax_type_map = { - kAXValueCGSizeType: Foundation.NSSizeFromString, # type: ignore - kAXValueCGPointType: Foundation.NSPointFromString, # type: ignore - kAXValueCFRangeType: Foundation.NSRangeFromString, # type: ignore - } - try: - search_result = re.search("{.*}", attrValue.description()) - if search_result: - extracted_str = search_result.group() - return tuple(ax_type_map[ax_attr_type](extracted_str)) - return None - except KeyError: - return None - -def element_attribute(element, attribute): - if attribute == kAXChildrenAttribute: - err, value = AXUIElementCopyAttributeValues(element, attribute, 0, 999, None) - if err == kAXErrorSuccess: - if isinstance(value, Foundation.NSArray): # type: ignore - return CFAttributeToPyObject(value) - else: - return value - err, value = AXUIElementCopyAttributeValue(element, attribute, None) - if err == kAXErrorSuccess: - if isinstance(value, Foundation.NSArray): # type: ignore - return CFAttributeToPyObject(value) - else: - return value - return None - -def element_value(element, type): - err, value = AXValueGetValue(element, type, None) - if err == True: - return value - return None - - -@timing_decorator -def get_running_apps() -> List[NSRunningApplication]: - """Get list of all running applications - - Returns: - List of NSRunningApplication objects - """ - return NSWorkspace.sharedWorkspace().runningApplications() - -# @timing_decorator -def get_app_info(app: NSRunningApplication) -> Dict[str, Any]: - """Get information about an application - - Args: - app: NSRunningApplication object - - Returns: - Dictionary with application information - """ - return { - "name": app.localizedName(), - "bundle_id": app.bundleIdentifier(), - "pid": app.processIdentifier(), - "active": app.isActive(), - "hidden": app.isHidden(), - "terminated": app.isTerminated(), - } - -@timing_decorator -def get_all_windows() -> List[Dict[str, Any]]: - """Get all windows from all applications with z-order information - - Returns: - List of window dictionaries with z-order information - """ - # Get all windows from Quartz - # The kCGWindowListOptionOnScreenOnly flag gets only visible windows with preserved z-order - window_list = Quartz.CGWindowListCopyWindowInfo( - Quartz.kCGWindowListOptionOnScreenOnly, - Quartz.kCGNullWindowID - ) - - # Create a dictionary of window z-order - z_order = {window['kCGWindowNumber']: z_index for z_index, window in enumerate(window_list[::-1])} - - # The kCGWindowListOptionAll flag gets all windows *without* z-order preserved - window_list_all = Quartz.CGWindowListCopyWindowInfo( - Quartz.kCGWindowListOptionAll, - Quartz.kCGNullWindowID - ) - - # Process all windows - windows = [] - for window in window_list_all: - # We track z_index which is the index in the window list (0 is the desktop / background) - - # Get window properties - window_id = window.get('kCGWindowNumber', 0) - window_name = window.get('kCGWindowName', '') - window_pid = window.get('kCGWindowOwnerPID', 0) - window_bounds = window.get('kCGWindowBounds', {}) - window_owner = window.get('kCGWindowOwnerName', '') - window_is_on_screen = window.get('kCGWindowIsOnscreen', False) - - # Get z-order information - # Note: kCGWindowLayer provides the system's layer value (lower values are higher in the stack) - layer = window.get(kCGWindowLayer, 0) - opacity = window.get(kCGWindowAlpha, 1.0) - z_index = z_order.get(window_id, -1) - - # Determine window role (desktop, dock, menubar, app) - if window_name == "Dock" and window_owner == "Dock": - role = "dock" - elif window_name == "Menubar" and window_owner == "Window Server": - role = "menubar" - elif window_owner in ["Window Server", "Dock"]: - role = "desktop" - else: - role = "app" - - # Only include windows with valid bounds - if window_bounds: - windows.append({ - "id": window_id, - "name": window_name or "Unnamed Window", - "pid": window_pid, - "owner": window_owner, - "role": role, - "is_on_screen": window_is_on_screen, - "bounds": { - "x": window_bounds.get('X', 0), - "y": window_bounds.get('Y', 0), - "width": window_bounds.get('Width', 0), - "height": window_bounds.get('Height', 0) - }, - "layer": layer, # System layer (lower values are higher in stack) - "z_index": z_index, # Our z-index (0 is the desktop) - "opacity": opacity - }) - - windows = sorted(windows, key=lambda x: x["z_index"]) - - return windows - -def get_app_windows(app_pid: int, all_windows: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Get all windows for a specific application - - Args: - app_pid: Process ID of the application - all_windows: List of all windows with z-order information - - Returns: - List of window dictionaries for the app - """ - # Filter windows by PID - return [window for window in all_windows if window["pid"] == app_pid] - -@timing_decorator -def draw_desktop_screenshot(app_whitelist: List[str] = None, all_windows: List[Dict[str, Any]] = None, dock_bounds: Dict[str, float] = None, dock_items: List[Dict[str, Any]] = None, menubar_bounds: Dict[str, float] = None, menubar_items: List[Dict[str, Any]] = None) -> Tuple[Optional[Image.Image], List[Dict[str, Any]]]: - """Capture a screenshot of the entire desktop using Quartz compositing, including dock as a second pass. - Args: - app_whitelist: Optional list of app names to include in the screenshot - Returns: - PIL Image of the desktop or None if capture failed - """ - import ctypes - - if dock_bounds is None: - dock_bounds = get_dock_bounds() - if dock_items is None: - dock_items = get_dock_items() - if menubar_bounds is None: - menubar_bounds = get_menubar_bounds() - if menubar_items is None: - menubar_items = get_menubar_items() - if all_windows is None: - all_windows = get_all_windows() - all_windows = all_windows[::-1] - all_windows = [window for window in all_windows if window["is_on_screen"]] - - main_screen = AppKit.NSScreen.mainScreen() - if main_screen: - frame = main_screen.frame() - screen_rect = Quartz.CGRectMake(0, 0, frame.size.width, frame.size.height) - else: - screen_rect = Quartz.CGRectNull - - # Screenshot-to-screen hitboxes - hitboxes = [] - - if app_whitelist is None: - # Single pass: desktop, menubar, app, dock - window_list = Foundation.CFArrayCreateMutable(None, len(all_windows), None) - for window in all_windows: - Foundation.CFArrayAppendValue(window_list, window["id"]) - cg_image = Quartz.CGWindowListCreateImageFromArray( - screen_rect, window_list, Quartz.kCGWindowImageDefault - ) - if cg_image is None: - return None - - # Create CGContext for compositing - width = int(frame.size.width) - height = int(frame.size.height) - color_space = Quartz.CGColorSpaceCreateWithName(Quartz.kCGColorSpaceSRGB) - cg_context = Quartz.CGBitmapContextCreate( - None, width, height, 8, 0, color_space, Quartz.kCGImageAlphaPremultipliedLast - ) - Quartz.CGContextDrawImage(cg_context, screen_rect, cg_image) - hitboxes.append({ - "hitbox": [0, 0, width, height], - "target": [0, 0, width, height] - }) - else: - # Filter out windows that are not in the whitelist - all_windows = [window for window in all_windows if window["owner"] in app_whitelist or window["role"] != "app"] - app_windows = [window for window in all_windows if window["role"] == "app"] - - dock_orientation = "side" if dock_bounds["width"] < dock_bounds["height"] else "bottom" - - menubar_length = max(item["bounds"]["x"] + item["bounds"]["width"] for item in menubar_items) - - # Calculate bounds of app windows - app_bounds = { - "x": min(window["bounds"]["x"] for window in app_windows) if app_windows else 0, - "y": min(window["bounds"]["y"] for window in app_windows) if app_windows else 0, - } - app_bounds["width"] = max(window["bounds"]["x"] + window["bounds"]["width"] for window in app_windows) - app_bounds["x"] if app_windows else 0 - app_bounds["height"] = max(window["bounds"]["y"] + window["bounds"]["height"] for window in app_windows) - app_bounds["y"] if app_windows else 0 - - # Set minimum bounds of 256x256 - app_bounds["width"] = max(app_bounds["width"], 256) - app_bounds["height"] = max(app_bounds["height"], 256) - - # Add dock bounds to app bounds - if dock_orientation == "bottom": - app_bounds["height"] += dock_bounds["height"] + 4 - elif dock_orientation == "side": - if dock_bounds["x"] > frame.size.width / 2: - app_bounds["width"] += dock_bounds["width"] + 4 - else: - app_bounds["x"] -= dock_bounds["width"] + 4 - app_bounds["width"] += dock_bounds["width"] + 4 - - # Add menubar bounds to app bounds - app_bounds["height"] += menubar_bounds["height"] - - # Make sure app bounds contains menubar bounds - app_bounds["width"] = max(app_bounds["width"], menubar_length) - - # Clamp bounds to screen - app_bounds["x"] = max(app_bounds["x"], 0) - app_bounds["y"] = max(app_bounds["y"], 0) - app_bounds["width"] = min(app_bounds["width"], frame.size.width - app_bounds["x"]) - app_bounds["height"] = min(app_bounds["height"], frame.size.height - app_bounds["y"] + menubar_bounds["height"]) - - # Create CGContext for compositing - width = int(app_bounds["width"]) - height = int(app_bounds["height"]) - color_space = Quartz.CGColorSpaceCreateWithName(Quartz.kCGColorSpaceSRGB) - cg_context = Quartz.CGBitmapContextCreate( - None, width, height, 8, 0, color_space, Quartz.kCGImageAlphaPremultipliedLast - ) - - def _draw_layer(cg_context, all_windows, source_rect, target_rect): - """Draw a layer of windows from source_rect to target_rect on the given context.""" - window_list = Foundation.CFArrayCreateMutable(None, len(all_windows), None) - for window in all_windows: - Foundation.CFArrayAppendValue(window_list, window["id"]) - cg_image = Quartz.CGWindowListCreateImageFromArray( - source_rect, window_list, Quartz.kCGWindowImageDefault - ) - if cg_image is not None: - Quartz.CGContextDrawImage(cg_context, target_rect, cg_image) - - # --- FIRST PASS: desktop, apps --- - source_position = [app_bounds["x"], app_bounds["y"]] - source_size = [app_bounds["width"], app_bounds["height"]] - target_position = [ - 0, - min( - menubar_bounds["y"] + menubar_bounds["height"], - app_bounds["y"] - ) - ] - target_size = [app_bounds["width"], app_bounds["height"]] - - if dock_orientation == "bottom": - source_size[1] += dock_bounds["height"] - target_size[1] += dock_bounds["height"] - elif dock_orientation == "side": - if dock_bounds["x"] < frame.size.width / 2: - source_position[0] -= dock_bounds["width"] - target_position[0] -= dock_bounds["width"] - source_size[0] += dock_bounds["width"] - target_size[0] += dock_bounds["width"] - - app_source_rect = Quartz.CGRectMake( - source_position[0], source_position[1], source_size[0], source_size[1] - ) - app_target_rect = Quartz.CGRectMake( - target_position[0], app_bounds["height"] - target_position[1] - target_size[1], target_size[0], target_size[1] - ) - first_pass_windows = [w for w in all_windows if w["role"] == "app" or w["role"] == "desktop"] - _draw_layer(cg_context, first_pass_windows, app_source_rect, app_target_rect) - - hitboxes.append({ - "hitbox": [0, menubar_bounds["height"], app_bounds["width"], menubar_bounds["height"] + app_bounds["height"]], - "target": [ - app_source_rect.origin.x, - app_source_rect.origin.y, - app_source_rect.origin.x + app_bounds["width"], - app_source_rect.origin.y + app_bounds["height"] - ] - }) - - # --- SECOND PASS: menubar --- - allowed_roles = {"menubar"} - menubar_windows = [w for w in all_windows if w["role"] in allowed_roles] - menubar_source_rect = Quartz.CGRectMake( - 0, 0, app_bounds["width"], menubar_bounds["height"] - ) - menubar_target_rect = Quartz.CGRectMake( - 0, app_bounds["height"] - menubar_bounds["height"], app_bounds["width"], menubar_bounds["height"] - ) - _draw_layer(cg_context, menubar_windows, menubar_source_rect, menubar_target_rect) - - hitboxes.append({ - "hitbox": [0, 0, app_bounds["width"], menubar_bounds["height"]], - "target": [0, 0, app_bounds["width"], menubar_bounds["height"]] - }) - - # --- THIRD PASS: dock, filtered --- - # Step 1: Collect dock items to draw, with their computed target rects - dock_draw_items = [] - for index, item in enumerate(dock_items): - source_position = (item["bounds"]["x"], item["bounds"]["y"]) - source_size = (item["bounds"]["width"], item["bounds"]["height"]) - - # apply whitelist to middle items - if not (index == 0 or index == len(dock_items) - 1): - if item["subrole"] == "AXApplicationDockItem": - if item["title"] not in app_whitelist: - continue - elif item["subrole"] == "AXMinimizedWindowDockItem": - if not any(window["name"] == item["title"] and window["role"] == "app" and window["owner"] in app_whitelist for window in all_windows): - continue - elif item["subrole"] == "AXFolderDockItem": - continue - - # Preserve unscaled (original) source position and size before any modification - hitbox_position = source_position - hitbox_size = source_size - - screen_position = source_position - screen_size = source_size - - # stretch to screen size - padding = 32 - if dock_orientation == "bottom": - source_position = (source_position[0], 0) - source_size = (source_size[0], frame.size.height) - - hitbox_position = (source_position[0], app_bounds['height'] - hitbox_size[1]) - hitbox_size = (source_size[0], hitbox_size[1]) - - if index == 0: - source_size = (padding + source_size[0], source_size[1]) - source_position = (source_position[0] - padding, 0) - elif index == len(dock_items) - 1: - source_size = (source_size[0] + padding, source_size[1]) - source_position = (source_position[0], 0) - - elif dock_orientation == "side": - source_position = (0, source_position[1]) - source_size = (frame.size.width, source_size[1]) - - hitbox_position = ( - source_position[0] if dock_bounds['x'] < frame.size.width / 2 else app_bounds['width'] - hitbox_size[0], - source_position[1] - ) - hitbox_size = (hitbox_size[0], source_size[1]) - - if index == 0: - source_size = (source_size[0], padding + source_size[1]) - source_position = (0, source_position[1] - padding) - elif index == len(dock_items) - 1: - source_size = (source_size[0], source_size[1] + padding) - source_position = (0, source_position[1]) - - - # Compute the initial target position - target_position = source_position - target_size = source_size - - dock_draw_items.append({ - "item": item, - "index": index, - "source_position": source_position, - "source_size": source_size, - "target_size": target_size, - "target_position": target_position, # Will be updated after packing - "hitbox_position": hitbox_position, - "hitbox_size": hitbox_size, - "screen_position": screen_position, - "screen_size": screen_size, - }) - - # Step 2: Pack the target rects along the main axis, removing gaps - packed_positions = [] - if dock_orientation == "bottom": - # Pack left-to-right - x_cursor = 0 - for draw_item in dock_draw_items: - packed_positions.append((x_cursor, draw_item["target_position"][1])) - x_cursor += draw_item["target_size"][0] - packed_strip_length = x_cursor - # Center horizontally - x_offset = (app_bounds['width'] - packed_strip_length) / 2 - y_offset = (frame.size.height - app_bounds['height']) - for i, draw_item in enumerate(dock_draw_items): - px, py = packed_positions[i] - draw_item["target_position"] = (px + x_offset, py - y_offset) - - # Pack unscaled source rects - x_cursor = 0 - for draw_item in dock_draw_items: - draw_item["hitbox_position"] = (x_cursor, draw_item["hitbox_position"][1]) - x_cursor += draw_item["hitbox_size"][0] - packed_strip_length = x_cursor - # Center horizontally - x_offset = (app_bounds['width'] - packed_strip_length) / 2 - for i, draw_item in enumerate(dock_draw_items): - px, py = draw_item["hitbox_position"] - draw_item["hitbox_position"] = (px + x_offset, py) - elif dock_orientation == "side": - # Pack top-to-bottom - y_cursor = 0 - for draw_item in dock_draw_items: - packed_positions.append((draw_item["target_position"][0], y_cursor)) - y_cursor += draw_item["target_size"][1] - packed_strip_length = y_cursor - # Center vertically - y_offset = (app_bounds['height'] - packed_strip_length) / 2 - x_offset = 0 if dock_bounds['x'] < frame.size.width / 2 else frame.size.width - app_bounds['width'] - for i, draw_item in enumerate(dock_draw_items): - px, py = packed_positions[i] - draw_item["target_position"] = (px - x_offset, py + y_offset) - - # Pack unscaled source rects - y_cursor = 0 - for draw_item in dock_draw_items: - draw_item["hitbox_position"] = (draw_item["hitbox_position"][0], y_cursor) - y_cursor += draw_item["hitbox_size"][1] - packed_strip_length = y_cursor - # Center vertically - y_offset = (app_bounds['height'] - packed_strip_length) / 2 - for i, draw_item in enumerate(dock_draw_items): - px, py = draw_item["hitbox_position"] - draw_item["hitbox_position"] = (px, py + y_offset) - - dock_windows = [window for window in all_windows if window["role"] == "dock"] - # Step 3: Draw dock items using packed and recentered positions - for draw_item in dock_draw_items: - item = draw_item["item"] - source_position = draw_item["source_position"] - source_size = draw_item["source_size"] - target_position = draw_item["target_position"] - target_size = draw_item["target_size"] - - # flip target position y - target_position = (target_position[0], app_bounds['height'] - target_position[1] - target_size[1]) - - source_rect = Quartz.CGRectMake(*source_position, *source_size) - target_rect = Quartz.CGRectMake(*target_position, *target_size) - - _draw_layer(cg_context, dock_windows, source_rect, target_rect) - - hitbox_position = draw_item["hitbox_position"] - hitbox_size = draw_item["hitbox_size"] - - # Debug: Draw true hitbox rect (packed position, unscaled size) - # # Flip y like target_rect - # hitbox_position_flipped = ( - # hitbox_position[0], - # app_bounds['height'] - hitbox_position[1] - hitbox_size[1] - # ) - # hitbox_rect = Quartz.CGRectMake(*hitbox_position_flipped, *hitbox_size) - # Quartz.CGContextSetStrokeColorWithColor(cg_context, Quartz.CGColorCreateGenericRGB(0, 1, 0, 1)) - # Quartz.CGContextStrokeRect(cg_context, hitbox_rect) - - hitboxes.append({ - "hitbox": [*hitbox_position, hitbox_position[0] + hitbox_size[0], hitbox_position[1] + hitbox_size[1]], - "target": [*draw_item["screen_position"], draw_item["screen_position"][0] + draw_item["screen_size"][0], draw_item["screen_position"][1] + draw_item["screen_size"][1]] - }) - - - # Convert composited context to CGImage - final_cg_image = Quartz.CGBitmapContextCreateImage(cg_context) - ns_image = AppKit.NSImage.alloc().initWithCGImage_size_(final_cg_image, Foundation.NSZeroSize) - ns_data = ns_image.TIFFRepresentation() - bitmap_rep = AppKit.NSBitmapImageRep.imageRepWithData_(ns_data) - png_data = bitmap_rep.representationUsingType_properties_(AppKit.NSBitmapImageFileTypePNG, None) - image_data = io.BytesIO(png_data) - return Image.open(image_data), hitboxes - -@timing_decorator -def get_menubar_items(active_app_pid: int = None) -> List[Dict[str, Any]]: - """Get menubar items from the active application using Accessibility API - - Args: - active_app_pid: PID of the active application - - Returns: - List of dictionaries with menubar item information - """ - menubar_items = [] - - if active_app_pid is None: - # Get the frontmost application's PID if none provided - frontmost_app = NSWorkspace.sharedWorkspace().frontmostApplication() - if frontmost_app: - active_app_pid = frontmost_app.processIdentifier() - else: - logger.error("Error: Could not determine frontmost application") - return menubar_items - - # Create an accessibility element for the application - app_element = AXUIElementCreateApplication(active_app_pid) - if app_element is None: - logger.error(f"Error: Could not create accessibility element for PID {active_app_pid}") - return menubar_items - - # Get the menubar - menubar = element_attribute(app_element, kAXMenuBarAttribute) - if menubar is None: - logger.error(f"Error: Could not get menubar for application with PID {active_app_pid}") - return menubar_items - - # Get the menubar items - children = element_attribute(menubar, kAXChildrenAttribute) - if children is None: - logger.error("Error: Could not get menubar items") - return menubar_items - - # Process each menubar item - for i in range(len(children)): - item = children[i] - - # Get item title - title = element_attribute(item, kAXTitleAttribute) or "Untitled" - - # Create bounding box - bounds = { - "x": 0, - "y": 0, - "width": 0, - "height": 0 - } - - # Get item position - position_value = element_attribute(item, kAXPositionAttribute) - if position_value: - position_value = element_value(position_value, kAXValueCGPointType) - bounds["x"] = position_value.x - bounds["y"] = position_value.y - - # Get item size - size_value = element_attribute(item, kAXSizeAttribute) - if size_value: - size_value = element_value(size_value, kAXValueCGSizeType) - bounds["width"] = size_value.width - bounds["height"] = size_value.height - - - # Add to list - menubar_items.append({ - "title": title, - "bounds": bounds, - "index": i, - "app_pid": active_app_pid - }) - - return menubar_items - -@timing_decorator -def get_dock_items() -> List[Dict[str, Any]]: - """Get all items in the macOS Dock - - Returns: - List of dictionaries with Dock item information - """ - dock_items = [] - - # Find the Dock process - dock_pid = None - running_apps = get_running_apps() - for app in running_apps: - if app.localizedName() == "Dock" and app.bundleIdentifier() == "com.apple.dock": - dock_pid = app.processIdentifier() - break - - if dock_pid is None: - logger.error("Error: Could not find Dock process") - return dock_items - - # Create an accessibility element for the Dock - dock_element = AXUIElementCreateApplication(dock_pid) - if dock_element is None: - logger.error(f"Error: Could not create accessibility element for Dock (PID {dock_pid})") - return dock_items - - # Get the Dock's main element - dock_list = element_attribute(dock_element, kAXChildrenAttribute) - if dock_list is None or len(dock_list) == 0: - logger.error("Error: Could not get Dock children") - return dock_items - - # Find the Dock's application list (usually the first child) - dock_app_list = None - for child in dock_list: - role = element_attribute(child, kAXRoleAttribute) - if role == "AXList": - dock_app_list = child - break - - if dock_app_list is None: - logger.error("Error: Could not find Dock application list") - return dock_items - - # Get all items in the Dock - items = element_attribute(dock_app_list, kAXChildrenAttribute) - if items is None: - logger.error("Error: Could not get Dock items") - return dock_items - - # Process each Dock item - for i, item in enumerate(items): - # Get item attributes - title = element_attribute(item, kAXTitleAttribute) or "Untitled" - description = element_attribute(item, "AXDescription") or "" - role = element_attribute(item, kAXRoleAttribute) or "" - subrole = element_attribute(item, "AXSubrole") or "" - - # Create bounding box - bounds = { - "x": 0, - "y": 0, - "width": 0, - "height": 0 - } - - # Get item position - position_value = element_attribute(item, kAXPositionAttribute) - if position_value: - position_value = element_value(position_value, kAXValueCGPointType) - bounds["x"] = position_value.x - bounds["y"] = position_value.y - - # Get item size - size_value = element_attribute(item, kAXSizeAttribute) - if size_value: - size_value = element_value(size_value, kAXValueCGSizeType) - bounds["width"] = size_value.width - bounds["height"] = size_value.height - - # Determine if this is an application, file/folder, or separator - item_type = "unknown" - if subrole == "AXApplicationDockItem": - item_type = "application" - elif subrole == "AXFolderDockItem": - item_type = "folder" - elif subrole == "AXDocumentDockItem": - item_type = "document" - elif subrole == "AXSeparatorDockItem" or role == "AXSeparator": - item_type = "separator" - elif "trash" in title.lower(): - item_type = "trash" - - # Add to list - dock_items.append({ - "title": title, - "description": description, - "bounds": bounds, - "index": i, - "type": item_type, - "role": role, - "subrole": subrole - }) - - return dock_items - -class AppActivationContext: - def __init__(self, active_app_pid=None, active_app_to_use="", logger=None): - self.active_app_pid = active_app_pid - self.active_app_to_use = active_app_to_use - self.logger = logger - self.frontmost_app = None - - def __enter__(self): - from AppKit import NSWorkspace - if self.active_app_pid: - if self.logger and self.active_app_to_use: - self.logger.debug(f"Automatically activating app '{self.active_app_to_use}' for screenshot composition") - self.frontmost_app = NSWorkspace.sharedWorkspace().frontmostApplication() - running_apps_list = NSWorkspace.sharedWorkspace().runningApplications() - for app in running_apps_list: - if app.processIdentifier() == self.active_app_pid: - app.activateWithOptions_(0) - break - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.frontmost_app: - self.frontmost_app.activateWithOptions_(0) - -def get_frontmost_and_active_app(all_windows, running_apps, app_whitelist): - from AppKit import NSWorkspace - frontmost_app = NSWorkspace.sharedWorkspace().frontmostApplication() - - active_app_to_use = None - active_app_pid = None - - # Find the topmost (highest z_index) non-filtered app - for window in reversed(all_windows): - owner = window.get("owner") - role = window.get("role") - is_on_screen = window.get("is_on_screen") - - # Skip non-app windows - if role != "app": - continue - - # Skip not-on-screen windows - if not is_on_screen: - continue - - # Skip filtered apps - if app_whitelist is not None and owner not in app_whitelist: - continue - - # Found a suitable app - active_app_to_use = owner - active_app_pid = window.get("pid") - break - - # If no suitable app found, use Finder - if active_app_to_use is None: - active_app_to_use = "Finder" - for app in running_apps: - if app.localizedName() == "Finder": - active_app_pid = app.processIdentifier() - break - - return frontmost_app, active_app_to_use, active_app_pid - -def capture_all_apps(save_to_disk: bool = False, app_whitelist: List[str] = None, output_dir: str = None, take_focus: bool = True) -> Tuple[Dict[str, Any], Optional[Image.Image]]: - """Capture screenshots of all running applications - - Args: - save_to_disk: Whether to save screenshots to disk - app_whitelist: Optional list of app names to include in the recomposited screenshot - (will always include 'Window Server' and 'Dock') - - Returns: - Dictionary with application information and screenshots - Optional PIL Image of the recomposited screenshot - """ - result = { - "timestamp": time.time(), - "applications": [], - "windows": [], # New array to store all windows, including those without apps - "menubar_items": [], # New array to store menubar items - "dock_items": [] # New array to store dock items - } - - # Get all windows with z-order information - all_windows = get_all_windows() - - # Get all running applications - running_apps = get_running_apps() - - frontmost_app, active_app_to_use, active_app_pid = get_frontmost_and_active_app(all_windows, running_apps, app_whitelist) if take_focus else (None, None, None) - - # Use AppActivationContext to activate the app and restore focus - with AppActivationContext(active_app_pid, active_app_to_use, logger): - - # Process applications - for app in running_apps: - # Skip system apps without a bundle ID - if app.bundleIdentifier() is None: - continue - - app_info = get_app_info(app) - app_windows = get_app_windows(app.processIdentifier(), all_windows) - - app_data = { - "info": app_info, - "windows": [ window["id"] for window in app_windows ] - } - - result["applications"].append(app_data) - - # Add all windows to the result - result["windows"] = all_windows - - # Get menubar items from the active application - menubar_items = get_menubar_items(active_app_pid) - result["menubar_items"] = menubar_items - - # Get dock items - dock_items = get_dock_items() - result["dock_items"] = dock_items - - # Get menubar bounds - menubar_bounds = get_menubar_bounds() - result["menubar_bounds"] = menubar_bounds - - # Get dock bounds - dock_bounds = get_dock_bounds() - result["dock_bounds"] = dock_bounds - - # Capture the entire desktop using Quartz compositing - desktop_screenshot, hitboxes = draw_desktop_screenshot(app_whitelist, all_windows, dock_bounds, dock_items, menubar_bounds, menubar_items) - - result["hitboxes"] = hitboxes - - from PIL import Image, ImageDraw, ImageChops - def _draw_hitboxes(img, hitboxes, key="target"): - """ - Overlay opaque colored rectangles for each hitbox (using hitbox[key]) - with color depending on index, then multiply overlay onto img. - Args: - img: PIL.Image (RGBA or RGB) - hitboxes: list of dicts with 'hitbox' and 'target' keys - key: 'hitbox' or 'target' - Returns: - PIL.Image with overlayed hitboxes (same mode/size as input) - """ - # Ensure RGBA mode for blending - base = img.convert("RGBA") - overlay = Image.new("RGBA", base.size, (0, 0, 0, 0)) - draw = ImageDraw.Draw(overlay) - - # Distinct colors for order - colors = [ - (255, 0, 0, 180), # Red - (0, 255, 0, 180), # Green - (0, 0, 255, 180), # Blue - (255, 255, 0, 180), # Yellow - (0, 255, 255, 180), # Cyan - (255, 0, 255, 180), # Magenta - (255, 128, 0, 180), # Orange - (128, 0, 255, 180), # Purple - (0, 128, 255, 180), # Sky blue - (128, 255, 0, 180), # Lime - ] - # Set minimum brightness for colors - min_brightness = 0 - colors = [ - (max(min_brightness, c[0]), max(min_brightness, c[1]), max(min_brightness, c[2]), c[3]) for c in colors - ] - - for i, h in enumerate(hitboxes): - rect = h.get(key) - color = colors[i % len(colors)] - if rect: - draw.rectangle(rect, fill=color) - - # Multiply blend overlay onto base - result = ImageChops.multiply(base, overlay) - return result - - # DEBUG: Save hitboxes to disk - if desktop_screenshot and save_to_disk and output_dir: - desktop_path = os.path.join(output_dir, "desktop.png") - desktop_screenshot.save(desktop_path) - result["desktop_screenshot"] = desktop_path - - logger.info(f"Saved desktop screenshot to {desktop_path}") - - if app_whitelist: - # Take screenshot without whitelist - desktop_screenshot_full, hitboxes_full = draw_desktop_screenshot( - None, all_windows, dock_bounds, dock_items, menubar_bounds, menubar_items) - - # Draw hitboxes on both images using overlay - img1 = _draw_hitboxes(desktop_screenshot.copy(), hitboxes, key="hitbox") - img2 = _draw_hitboxes(desktop_screenshot_full.copy(), hitboxes, key="target") if desktop_screenshot_full else None - - if img2 and hitboxes_full: - - # Compose side-by-side - from PIL import Image - width = img1.width + img2.width - height = max(img1.height, img2.height) - combined = Image.new('RGBA', (width, height), (0, 0, 0, 0)) - combined.paste(img1, (0, 0)) - combined.paste(img2, (img1.width, 0)) - side_by_side_path = os.path.join(output_dir, "side_by_side_hitboxes.png") - combined.save(side_by_side_path) - result["side_by_side_hitboxes"] = side_by_side_path - else: - # Overlay hitboxes using new function - hitbox_img = _draw_hitboxes(desktop_screenshot.copy(), hitboxes, key="hitbox") - hitbox_path = os.path.join(output_dir, "hitboxes.png") - hitbox_img.save(hitbox_path) - result["hitbox_screenshot"] = hitbox_path - - # Focus restoration is now handled by AppActivationContext - - return result, desktop_screenshot - -async def run_capture(): - """Run the screenshot capture asynchronously""" - # Parse command line arguments - parser = argparse.ArgumentParser(description="Capture screenshots of running macOS applications") - parser.add_argument("--output", "-o", help="Output directory for screenshots", default="app_screenshots") - parser.add_argument("--filter", "-f", nargs="+", help="Filter recomposited screenshot to only include specified apps") - parser.add_argument("--menubar", "-m", action="store_true", help="List menubar and status items with their bounding boxes") - parser.add_argument("--dock", "-d", action="store_true", help="List Dock items with their bounding boxes") - parser.add_argument("--demo", nargs="*", help="Demo mode: pass app names to capture individual and combinations, create mosaic PNG") - args = parser.parse_args() - - # Create output directory in the current directory if not absolute - if not os.path.isabs(args.output): - output_dir = os.path.join(os.getcwd(), args.output) - else: - output_dir = args.output - - # DEMO MODE: capture each app and all non-empty combinations, then mosaic - if args.demo: - from PIL import Image - demo_apps = args.demo - print(f"Running in DEMO mode for apps: {demo_apps}") - groups = [] - for item in demo_apps: - if "/" in item: - group = [x.strip() for x in item.split("/") if x.strip()] - else: - group = [item.strip()] - if group: - groups.append(group) - screenshots = [] - for group in groups: - print(f"Capturing for apps: {group}") - _, img = capture_all_apps(app_whitelist=group) - if img: - screenshots.append((group, img)) - if not screenshots: - print("No screenshots captured in demo mode.") - return - # Mosaic-pack: grid (rows of sqrt(N)) - def make_mosaic(images, pad=64, bg=(30,30,30)): - import rpack - sizes = [(img.width + pad, img.height + pad) for _, img in images] - positions = rpack.pack(sizes) - # Find the bounding box for the mosaic - max_x = max(x + w for (x, y), (w, h) in zip(positions, sizes)) - max_y = max(y + h for (x, y), (w, h) in zip(positions, sizes)) - mosaic = Image.new("RGBA", (max_x, max_y), bg) - for (group, img), (x, y) in zip(images, positions): - mosaic.paste(img, (x, y)) - return mosaic - mosaic_img = make_mosaic(screenshots) - mosaic_path = os.path.join(output_dir, "demo_mosaic.png") - os.makedirs(output_dir, exist_ok=True) - mosaic_img.save(mosaic_path) - print(f"Demo mosaic saved to: {mosaic_path}") - return - - # Capture all apps and save to disk, including a recomposited screenshot - print(f"Capturing screenshots of all running applications...") - print(f"Saving screenshots to: {output_dir}") - - # If filter is provided, show what we're filtering by - if args.filter: - print(f"Filtering recomposited screenshot to only include: {', '.join(args.filter)} (plus Window Server and Dock)") - - result, img = capture_all_apps( - save_to_disk=True, - app_whitelist=args.filter, - output_dir=output_dir, - take_focus=True - ) - - # Print summary - print(f"\nCapture complete!") - print(f"Captured {len(result['applications'])} applications") - - total_app_windows = sum(len(app["windows"]) for app in result["applications"]) - print(f"Total application windows captured: {total_app_windows}") - print(f"Total standalone windows captured: {len(result['windows'])}") - - # Print details of each application - print("\nApplication details:") - for app in result["applications"]: - app_info = app["info"] - windows = app["windows"] - print(f" - {app_info['name']} ({len(windows)} windows)") - - # Print recomposited screenshot path if available - if "desktop_screenshot" in result: - print(f"\nRecomposited screenshot saved to: {result['desktop_screenshot']}") - - # Print menubar items if requested - if args.menubar and "menubar_items" in result: - print("\nMenubar items:") - - # Find app name for the PID - app_name_by_pid = {} - for app in result["applications"]: - app_info = app["info"] - app_name_by_pid[app_info["pid"]] = app_info["name"] - - for item in result["menubar_items"]: - print(f" - {item['title']}") - print(f" Bounds: x={item['bounds']['x']}, y={item['bounds']['y']}, width={item['bounds']['width']}, height={item['bounds']['height']}") - - if "app_pid" in item: - app_name = app_name_by_pid.get(item["app_pid"], f"Unknown App (PID: {item['app_pid']})") - print(f" App: {app_name} (PID: {item['app_pid']})") - - if "window_id" in item: - print(f" Window ID: {item['window_id']}") - if "owner" in item: - print(f" Owner: {item['owner']}") - if "layer" in item and "z_index" in item: - print(f" Layer: {item['layer']}, Z-Index: {item['z_index']}") - print("") - - # Print dock items if requested - if args.dock and "dock_items" in result: - print("\nDock items:") - for item in result["dock_items"]: - print(f" - {item['title']} ({item['type']})") - print(f" Description: {item['description']}") - print(f" Bounds: x={item['bounds']['x']}, y={item['bounds']['y']}, width={item['bounds']['width']}, height={item['bounds']['height']}") - print(f" Role: {item['role']}, Subrole: {item['subrole']}") - print(f" Index: {item['index']}") - print("") - - # Save the metadata to a JSON file - metadata_path = os.path.join(output_dir, "metadata.json") - with open(metadata_path, "w") as f: - json.dump(result, f, indent=2) - - print(f"\nMetadata saved to: {metadata_path}") - -if __name__ == "__main__": - asyncio.run(run_capture()) \ No newline at end of file diff --git a/notebooks/diorama/safezone.py b/notebooks/diorama/safezone.py deleted file mode 100644 index 122b668f..00000000 --- a/notebooks/diorama/safezone.py +++ /dev/null @@ -1,199 +0,0 @@ -#!/usr/bin/env python3 -""" -UI Safezone Helper - A utility to get accurate bounds for macOS UI elements - -This module provides helper functions to get accurate bounds for macOS UI elements -like the menubar and dock, which are needed for proper screenshot composition. -""" - -import sys -import time -from typing import Dict, Any, Optional, Tuple - -# Import Objective-C bridge libraries -try: - import AppKit - from ApplicationServices import ( - AXUIElementCreateSystemWide, - AXUIElementCreateApplication, - AXUIElementCopyAttributeValue, - AXUIElementCopyAttributeValues, - kAXChildrenAttribute, - kAXRoleAttribute, - kAXTitleAttribute, - kAXPositionAttribute, - kAXSizeAttribute, - kAXErrorSuccess, - AXValueGetType, - kAXValueCGSizeType, - kAXValueCGPointType, - AXUIElementGetTypeID, - AXValueGetValue, - kAXMenuBarAttribute, - ) - from AppKit import NSWorkspace, NSRunningApplication - import Foundation -except ImportError: - print("Error: This script requires PyObjC to be installed.") - print("Please install it with: pip install pyobjc") - sys.exit(1) - -# Constants for accessibility API -kAXErrorSuccess = 0 -kAXRoleAttribute = "AXRole" -kAXSubroleAttribute = "AXSubrole" -kAXTitleAttribute = "AXTitle" -kAXPositionAttribute = "AXPosition" -kAXSizeAttribute = "AXSize" -kAXChildrenAttribute = "AXChildren" -kAXMenuBarAttribute = "AXMenuBar" - - -def element_attribute(element, attribute): - """Get an attribute from an accessibility element""" - if attribute == kAXChildrenAttribute: - err, value = AXUIElementCopyAttributeValues(element, attribute, 0, 999, None) - if err == kAXErrorSuccess: - if isinstance(value, Foundation.NSArray): - return list(value) - else: - return value - err, value = AXUIElementCopyAttributeValue(element, attribute, None) - if err == kAXErrorSuccess: - return value - return None - - -def element_value(element, type): - """Get a value from an accessibility element""" - err, value = AXValueGetValue(element, type, None) - if err == True: - return value - return None - - -def get_element_bounds(element): - """Get the bounds of an accessibility element""" - bounds = { - "x": 0, - "y": 0, - "width": 0, - "height": 0 - } - - # Get position - position_value = element_attribute(element, kAXPositionAttribute) - if position_value: - position_value = element_value(position_value, kAXValueCGPointType) - if position_value: - bounds["x"] = position_value.x - bounds["y"] = position_value.y - - # Get size - size_value = element_attribute(element, kAXSizeAttribute) - if size_value: - size_value = element_value(size_value, kAXValueCGSizeType) - if size_value: - bounds["width"] = size_value.width - bounds["height"] = size_value.height - - return bounds - - -def find_dock_process(): - """Find the Dock process""" - running_apps = NSWorkspace.sharedWorkspace().runningApplications() - for app in running_apps: - if app.localizedName() == "Dock" and app.bundleIdentifier() == "com.apple.dock": - return app.processIdentifier() - return None - - -def get_menubar_bounds(): - """Get the bounds of the macOS menubar - - Returns: - Dictionary with x, y, width, height of the menubar - """ - # Get the system-wide accessibility element - system_element = AXUIElementCreateSystemWide() - - # Try to find the menubar - menubar = element_attribute(system_element, kAXMenuBarAttribute) - if menubar is None: - # If we can't get it directly, try through the frontmost app - frontmost_app = NSWorkspace.sharedWorkspace().frontmostApplication() - if frontmost_app: - app_pid = frontmost_app.processIdentifier() - app_element = AXUIElementCreateApplication(app_pid) - menubar = element_attribute(app_element, kAXMenuBarAttribute) - - if menubar is None: - print("Error: Could not get menubar") - # Return default menubar bounds as fallback - return {"x": 0, "y": 0, "width": 1800, "height": 24} - - # Get menubar bounds - return get_element_bounds(menubar) - - -def get_dock_bounds(): - """Get the bounds of the macOS Dock - - Returns: - Dictionary with x, y, width, height of the Dock - """ - dock_pid = find_dock_process() - if dock_pid is None: - print("Error: Could not find Dock process") - # Return empty bounds as fallback - return {"x": 0, "y": 0, "width": 0, "height": 0} - - # Create an accessibility element for the Dock - dock_element = AXUIElementCreateApplication(dock_pid) - if dock_element is None: - print(f"Error: Could not create accessibility element for Dock (PID {dock_pid})") - return {"x": 0, "y": 0, "width": 0, "height": 0} - - # Get the Dock's children - children = element_attribute(dock_element, kAXChildrenAttribute) - if not children or len(children) == 0: - print("Error: Could not get Dock children") - return {"x": 0, "y": 0, "width": 0, "height": 0} - - # Find the Dock's list (first child is usually the main dock list) - dock_list = None - for child in children: - role = element_attribute(child, kAXRoleAttribute) - if role == "AXList": - dock_list = child - break - - if dock_list is None: - print("Error: Could not find Dock list") - return {"x": 0, "y": 0, "width": 0, "height": 0} - - # Get the bounds of the dock list - return get_element_bounds(dock_list) - - -def get_ui_element_bounds(): - """Get the bounds of important UI elements like menubar and dock - - Returns: - Dictionary with menubar and dock bounds - """ - menubar_bounds = get_menubar_bounds() - dock_bounds = get_dock_bounds() - - return { - "menubar": menubar_bounds, - "dock": dock_bounds - } - - -if __name__ == "__main__": - # Example usage - bounds = get_ui_element_bounds() - print("Menubar bounds:", bounds["menubar"]) - print("Dock bounds:", bounds["dock"])