Create example for using @cua/computer with a cloud container and OpenAI

Morgan Dean
2025-06-25 15:54:24 -07:00
parent 8f49e0e2bf
commit 3e4620431d
7 changed files with 227 additions and 0 deletions
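Quickstart (a sketch inferred from the files in this commit; file names are not shown in this view, so the env template is assumed to be saved as .env.example):

    cp .env.example .env   # fill in OPENAI_KEY, CUA_KEY, and CUA_CONTAINER_NAME
    pnpm install
    pnpm dev               # runs "tsx watch src/index.ts" per package.json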

View File

@@ -0,0 +1,3 @@
OPENAI_KEY=
CUA_KEY=
CUA_CONTAINER_NAME=

View File

@@ -0,0 +1,3 @@
node_modules
.DS_Store
.env

View File

@@ -0,0 +1,7 @@
{
  "useTabs": false,
  "semi": true,
  "singleQuote": true,
  "trailingComma": "es5",
  "bracketSpacing": true
}

View File

@@ -0,0 +1,25 @@
{
  "name": "cua-cloud-openai",
  "version": "1.0.0",
  "description": "",
  "type": "module",
  "main": "index.js",
  "scripts": {
    "dev": "tsx watch src/index.ts",
    "start": "tsx src/index.ts"
  },
  "keywords": [],
  "author": "",
  "license": "MIT",
  "packageManager": "pnpm@10.12.3",
  "dependencies": {
    "@cua/computer": "link:../../computer",
    "dotenv": "^16.5.0",
    "openai": "^5.7.0"
  },
  "devDependencies": {
    "@types/node": "^22.15.33",
    "tsx": "^4.20.3",
    "typescript": "^5.8.3"
  }
}

View File

@@ -0,0 +1,56 @@
import { Computer } from '@cua/computer';
import OpenAI from 'openai';

// Translate a single computer-use action from the OpenAI Responses API
// into the corresponding @cua/computer interface calls.
export async function executeAction(
  computer: Computer,
  action: OpenAI.Responses.ResponseComputerToolCall['action']
) {
  switch (action.type) {
    case 'click': {
      const { x, y, button } = action;
      console.log(`Executing click at (${x}, ${y}) with button '${button}'.`);
      await computer.interface.moveCursor(x, y);
      if (button === 'right') await computer.interface.rightClick();
      else await computer.interface.leftClick();
      break;
    }
    case 'type': {
      const { text } = action;
      console.log(`Typing text: ${text}`);
      await computer.interface.typeText(text);
      break;
    }
    case 'scroll': {
      const { x: locX, y: locY, scroll_x, scroll_y } = action;
      console.log(
        `Scrolling at (${locX}, ${locY}) with offsets (scroll_x=${scroll_x}, scroll_y=${scroll_y}).`
      );
      await computer.interface.moveCursor(locX, locY);
      await computer.interface.scroll(scroll_x, scroll_y);
      break;
    }
    case 'keypress': {
      const { keys } = action;
      for (const key of keys) {
        console.log(`Pressing key: ${key}.`);
        // Map common key names to CUA equivalents
        if (key.toLowerCase() === 'enter') {
          await computer.interface.pressKey('return');
        } else if (key.toLowerCase() === 'space') {
          await computer.interface.pressKey('space');
        } else {
          await computer.interface.pressKey(key);
        }
      }
      break;
    }
    case 'wait':
      console.log('Waiting for 3 seconds.');
      await new Promise((resolve) => setTimeout(resolve, 3 * 1000));
      break;
    case 'screenshot': {
      console.log('Taking screenshot.');
      // This is handled automatically in the main loop, but we can take an extra one if requested
      const screenshot = await computer.interface.screenshot();
      return screenshot;
    }
    default:
      // Other action types (e.g. double_click, drag, move) are not handled in this example.
      console.log(`Unrecognized action: ${action.type}`);
      break;
  }
}
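For illustration only: executeAction can also be driven by a hand-written action literal matching OpenAI's click variant, e.g. await executeAction(computer, { type: 'click', x: 512, y: 384, button: 'left' }); (the coordinates are arbitrary). In this example it is only called from the agent loop in the next file.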

View File

@@ -0,0 +1,104 @@
import { Computer, OSType } from '@cua/computer';
import OpenAI from 'openai';
import { executeAction } from './helpers';
import 'dotenv/config';

const openai = new OpenAI({ apiKey: process.env.OPENAI_KEY });

const COMPUTER_USE_PROMPT = 'Open firefox and go to trycua.com';

// Initialize the Computer connection
const computer = new Computer({
  apiKey: process.env.CUA_KEY!,
  name: process.env.CUA_CONTAINER_NAME!,
  osType: OSType.LINUX,
});

await computer.run();

// Take the initial screenshot
const screenshot = await computer.interface.screenshot();
const screenshotBase64 = screenshot.toString('base64');

// Set up the OpenAI config for computer use
const computerUseConfig: OpenAI.Responses.ResponseCreateParamsNonStreaming = {
  model: 'computer-use-preview',
  tools: [
    {
      type: 'computer_use_preview',
      display_width: 1024,
      display_height: 768,
      environment: 'linux', // we're using a Linux VM
    },
  ],
  truncation: 'auto',
};

// Send the initial screenshot to the OpenAI computer use model
let res = await openai.responses.create({
  ...computerUseConfig,
  input: [
    {
      role: 'user',
      content: [
        // what we want the model to do
        { type: 'input_text', text: COMPUTER_USE_PROMPT },
        // current screenshot of the VM
        {
          type: 'input_image',
          image_url: `data:image/png;base64,${screenshotBase64}`,
          detail: 'auto',
        },
      ],
    },
  ],
});

// Loop until there are no more computer use actions.
while (true) {
  const computerCalls = res.output.filter((o) => o.type === 'computer_call');
  if (computerCalls.length < 1) {
    console.log('No more computer calls. Loop complete.');
    break;
  }

  // Get the first call (the model typically emits at most one computer_call per response)
  const call = computerCalls[0];
  const action = call.action;
  console.log('Received action from OpenAI Responses API:', action);

  let ackChecks: OpenAI.Responses.ResponseComputerToolCall.PendingSafetyCheck[] = [];
  if (call.pending_safety_checks.length > 0) {
    console.log('Safety checks pending:', call.pending_safety_checks);
    // In a real implementation, you would want to get user confirmation here
    ackChecks = call.pending_safety_checks;
  }

  // Execute the action in the container
  await executeAction(computer, action);
  // Wait for changes to process within the container (1 sec)
  await new Promise((resolve) => setTimeout(resolve, 1000));

  // Capture a new screenshot
  const newScreenshot = await computer.interface.screenshot();
  const newScreenshotBase64 = newScreenshot.toString('base64');

  // Send the new screenshot back as a computer_call_output
  res = await openai.responses.create({
    ...computerUseConfig,
    previous_response_id: res.id,
    input: [
      {
        type: 'computer_call_output',
        call_id: call.call_id,
        acknowledged_safety_checks: ackChecks,
        output: {
          type: 'computer_screenshot',
          image_url: `data:image/png;base64,${newScreenshotBase64}`,
        },
      },
    ],
  });
}

// Done; exit explicitly.
process.exit();

View File

@@ -0,0 +1,29 @@
{
  "compilerOptions": {
    "target": "esnext",
    "lib": [
      "es2023"
    ],
    "moduleDetection": "force",
    "module": "preserve",
    "moduleResolution": "bundler",
    "allowImportingTsExtensions": true,
    "resolveJsonModule": true,
    "types": [
      "node"
    ],
    "allowSyntheticDefaultImports": true,
    "strict": true,
    "noUnusedLocals": true,
    "declaration": true,
    "emitDeclarationOnly": true,
    "esModuleInterop": true,
    "isolatedModules": true,
    "verbatimModuleSyntax": true,
    "skipLibCheck": true,
    "outDir": "build"
  },
  "include": [
    "src"
  ]
}