Create example for using @cua/computer with a cloud container and OpenAI

Morgan Dean
2025-06-25 15:54:24 -07:00
parent 8f49e0e2bf
commit 3e4620431d
7 changed files with 227 additions and 0 deletions
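Quickstart (a sketch inferred from the files in this commit; file names are not shown in this view, so the env template is assumed to be saved as .env.example):

    cp .env.example .env   # fill in OPENAI_KEY, CUA_KEY, and CUA_CONTAINER_NAME
    pnpm install
    pnpm dev               # runs "tsx watch src/index.ts" per package.json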

View File

@@ -0,0 +1,3 @@
OPENAI_KEY=
CUA_KEY=
CUA_CONTAINER_NAME=

View File

@@ -0,0 +1,3 @@
node_modules
.DS_Store
.env

View File

@@ -0,0 +1,7 @@
{
  "useTabs": false,
  "semi": true,
  "singleQuote": true,
  "trailingComma": "es5",
  "bracketSpacing": true
}

View File

@@ -0,0 +1,25 @@
{
  "name": "cua-cloud-openai",
  "version": "1.0.0",
  "description": "",
  "type": "module",
  "main": "index.js",
  "scripts": {
    "dev": "tsx watch src/index.ts",
    "start": "tsx src/index.ts"
  },
  "keywords": [],
  "author": "",
  "license": "MIT",
  "packageManager": "pnpm@10.12.3",
  "dependencies": {
    "@cua/computer": "link:../../computer",
    "dotenv": "^16.5.0",
    "openai": "^5.7.0"
  },
  "devDependencies": {
    "@types/node": "^22.15.33",
    "tsx": "^4.20.3",
    "typescript": "^5.8.3"
  }
}

View File

@@ -0,0 +1,56 @@
import { Computer } from '@cua/computer';
import OpenAI from 'openai';

// Translate a single computer-use action from the OpenAI Responses API
// into the corresponding @cua/computer interface calls.
export async function executeAction(
  computer: Computer,
  action: OpenAI.Responses.ResponseComputerToolCall['action']
) {
  switch (action.type) {
    case 'click': {
      const { x, y, button } = action;
      console.log(`Executing click at (${x}, ${y}) with button '${button}'.`);
      await computer.interface.moveCursor(x, y);
      if (button === 'right') await computer.interface.rightClick();
      else await computer.interface.leftClick();
      break;
    }
    case 'type': {
      const { text } = action;
      console.log(`Typing text: ${text}`);
      await computer.interface.typeText(text);
      break;
    }
    case 'scroll': {
      const { x: locX, y: locY, scroll_x, scroll_y } = action;
      console.log(
        `Scrolling at (${locX}, ${locY}) with offsets (scroll_x=${scroll_x}, scroll_y=${scroll_y}).`
      );
      await computer.interface.moveCursor(locX, locY);
      await computer.interface.scroll(scroll_x, scroll_y);
      break;
    }
    case 'keypress': {
      const { keys } = action;
      for (const key of keys) {
        console.log(`Pressing key: ${key}.`);
        // Map common key names to CUA equivalents
        if (key.toLowerCase() === 'enter') {
          await computer.interface.pressKey('return');
        } else if (key.toLowerCase() === 'space') {
          await computer.interface.pressKey('space');
        } else {
          await computer.interface.pressKey(key);
        }
      }
      break;
    }
    case 'wait':
      console.log('Waiting for 3 seconds.');
      await new Promise((resolve) => setTimeout(resolve, 3 * 1000));
      break;
    case 'screenshot': {
      console.log('Taking screenshot.');
      // This is handled automatically in the main loop, but we can take an extra one if requested
      const screenshot = await computer.interface.screenshot();
      return screenshot;
    }
    default:
      // Other action types (e.g. double_click, drag, move) are not handled in this example.
      console.log(`Unrecognized action: ${action.type}`);
      break;
  }
}
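For illustration only: executeAction can also be driven by a hand-written action literal matching OpenAI's click variant, e.g. await executeAction(computer, { type: 'click', x: 512, y: 384, button: 'left' }); (the coordinates are arbitrary). In this example it is only called from the agent loop in the next file.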

View File

@@ -0,0 +1,104 @@
import { Computer, OSType } from '@cua/computer';
import OpenAI from 'openai';
import { executeAction } from './helpers';
import 'dotenv/config';

const openai = new OpenAI({ apiKey: process.env.OPENAI_KEY });

const COMPUTER_USE_PROMPT = 'Open firefox and go to trycua.com';

// Initialize the Computer connection
const computer = new Computer({
  apiKey: process.env.CUA_KEY!,
  name: process.env.CUA_CONTAINER_NAME!,
  osType: OSType.LINUX,
});

await computer.run();

// Take the initial screenshot
const screenshot = await computer.interface.screenshot();
const screenshotBase64 = screenshot.toString('base64');

// Set up the OpenAI config for computer use
const computerUseConfig: OpenAI.Responses.ResponseCreateParamsNonStreaming = {
  model: 'computer-use-preview',
  tools: [
    {
      type: 'computer_use_preview',
      display_width: 1024,
      display_height: 768,
      environment: 'linux', // we're using a Linux VM
    },
  ],
  truncation: 'auto',
};

// Send the initial screenshot to the OpenAI computer use model
let res = await openai.responses.create({
  ...computerUseConfig,
  input: [
    {
      role: 'user',
      content: [
        // what we want the model to do
        { type: 'input_text', text: COMPUTER_USE_PROMPT },
        // current screenshot of the VM
        {
          type: 'input_image',
          image_url: `data:image/png;base64,${screenshotBase64}`,
          detail: 'auto',
        },
      ],
    },
  ],
});

// Loop until there are no more computer use actions.
while (true) {
  const computerCalls = res.output.filter((o) => o.type === 'computer_call');
  if (computerCalls.length < 1) {
    console.log('No more computer calls. Loop complete.');
    break;
  }

  // Get the first call (the model typically emits at most one computer_call per response)
  const call = computerCalls[0];
  const action = call.action;
  console.log('Received action from OpenAI Responses API:', action);

  let ackChecks: OpenAI.Responses.ResponseComputerToolCall.PendingSafetyCheck[] = [];
  if (call.pending_safety_checks.length > 0) {
    console.log('Safety checks pending:', call.pending_safety_checks);
    // In a real implementation, you would want to get user confirmation here
    ackChecks = call.pending_safety_checks;
  }

  // Execute the action in the container
  await executeAction(computer, action);
  // Wait for changes to process within the container (1 sec)
  await new Promise((resolve) => setTimeout(resolve, 1000));

  // Capture a new screenshot
  const newScreenshot = await computer.interface.screenshot();
  const newScreenshotBase64 = newScreenshot.toString('base64');

  // Send the new screenshot back as a computer_call_output
  res = await openai.responses.create({
    ...computerUseConfig,
    previous_response_id: res.id,
    input: [
      {
        type: 'computer_call_output',
        call_id: call.call_id,
        acknowledged_safety_checks: ackChecks,
        output: {
          type: 'computer_screenshot',
          image_url: `data:image/png;base64,${newScreenshotBase64}`,
        },
      },
    ],
  });
}

// Done; exit explicitly.
process.exit();

View File

@@ -0,0 +1,29 @@
{
  "compilerOptions": {
    "target": "esnext",
    "lib": [
      "es2023"
    ],
    "moduleDetection": "force",
    "module": "preserve",
    "moduleResolution": "bundler",
    "allowImportingTsExtensions": true,
    "resolveJsonModule": true,
    "types": [
      "node"
    ],
    "allowSyntheticDefaultImports": true,
    "strict": true,
    "noUnusedLocals": true,
    "declaration": true,
    "emitDeclarationOnly": true,
    "esModuleInterop": true,
    "isolatedModules": true,
    "verbatimModuleSyntax": true,
    "skipLibCheck": true,
    "outDir": "build"
  },
  "include": [
    "src"
  ]
}