diff --git a/libs/typescript/examples/cua-cloud-openai/.env.example b/libs/typescript/examples/cua-cloud-openai/.env.example new file mode 100644 index 00000000..0496a574 --- /dev/null +++ b/libs/typescript/examples/cua-cloud-openai/.env.example @@ -0,0 +1,3 @@ +OPENAI_KEY= +CUA_KEY= +CUA_CONTAINER_NAME= \ No newline at end of file diff --git a/libs/typescript/examples/cua-cloud-openai/.gitignore b/libs/typescript/examples/cua-cloud-openai/.gitignore new file mode 100644 index 00000000..9bdf3559 --- /dev/null +++ b/libs/typescript/examples/cua-cloud-openai/.gitignore @@ -0,0 +1,3 @@ +node_modules +.DS_Store +.env \ No newline at end of file diff --git a/libs/typescript/examples/cua-cloud-openai/.prettierrc b/libs/typescript/examples/cua-cloud-openai/.prettierrc new file mode 100644 index 00000000..23eaef29 --- /dev/null +++ b/libs/typescript/examples/cua-cloud-openai/.prettierrc @@ -0,0 +1,7 @@ +{ + "useTabs": false, + "semi": true, + "singleQuote": true, + "trailingComma": "es5", + "bracketSpacing": true +} \ No newline at end of file diff --git a/libs/typescript/examples/cua-cloud-openai/package.json b/libs/typescript/examples/cua-cloud-openai/package.json new file mode 100644 index 00000000..3d769cb0 --- /dev/null +++ b/libs/typescript/examples/cua-cloud-openai/package.json @@ -0,0 +1,25 @@ +{ + "name": "cua-cloud-openai", + "version": "1.0.0", + "description": "", + "type": "module", + "main": "index.js", + "scripts": { + "dev": "tsx watch src/index.ts", + "start": "tsx src/index.ts" + }, + "keywords": [], + "author": "", + "license": "MIT", + "packageManager": "pnpm@10.12.3", + "dependencies": { + "@cua/computer": "link:../../computer", + "dotenv": "^16.5.0", + "openai": "^5.7.0" + }, + "devDependencies": { + "@types/node": "^22.15.33", + "tsx": "^4.20.3", + "typescript": "^5.8.3" + } +} \ No newline at end of file diff --git a/libs/typescript/examples/cua-cloud-openai/src/helpers.ts b/libs/typescript/examples/cua-cloud-openai/src/helpers.ts new file mode 
import type { Computer } from '@cua/computer';
import type OpenAI from 'openai';

/** How long the `wait` action pauses, in milliseconds. */
const WAIT_ACTION_MS = 3 * 1000;

/**
 * Lower-cased key names that are translated before being handed to
 * `pressKey`. Any key not listed here is forwarded exactly as received.
 * A `Map` is used (rather than an object literal) so that key names such as
 * `constructor` can never resolve to an inherited property.
 */
const KEY_ALIASES: ReadonlyMap<string, string> = new Map([
  ['enter', 'return'],
  ['space', 'space'],
]);

/**
 * Executes a single computer-use action, as returned by the OpenAI Responses
 * API, against the given computer.
 *
 * Handled action types: `click`, `move`, `type`, `scroll`, `keypress`,
 * `wait` and `screenshot`. Any other action type is logged and ignored.
 *
 * @param computer - The computer whose `interface` receives the input events.
 * @param action - The `action` payload of a `computer_call` output item.
 * @returns The value of `computer.interface.screenshot()` for a `screenshot`
 *   action; `undefined` for every other action.
 */
export async function executeAction(
  computer: Computer,
  action: OpenAI.Responses.ResponseComputerToolCall['action']
) {
  // Each case is wrapped in its own block so its `const` bindings stay local
  // to that case instead of sharing the switch-wide scope.
  switch (action.type) {
    case 'click': {
      const { x, y, button } = action;
      console.log(`Executing click at (${x}, ${y}) with button '${button}'.`);
      await computer.interface.moveCursor(x, y);
      if (button === 'right') {
        await computer.interface.rightClick();
      } else {
        if (button !== 'left') {
          // Only left and right clicks are issued here; make the fallback visible.
          console.warn(
            `Button '${button}' is not supported; falling back to a left click.`
          );
        }
        await computer.interface.leftClick();
      }
      break;
    }
    case 'move': {
      const { x, y } = action;
      console.log(`Moving cursor to (${x}, ${y}).`);
      await computer.interface.moveCursor(x, y);
      break;
    }
    case 'type': {
      const { text } = action;
      console.log(`Typing text: ${text}`);
      await computer.interface.typeText(text);
      break;
    }
    case 'scroll': {
      const { x, y, scroll_x, scroll_y } = action;
      console.log(
        `Scrolling at (${x}, ${y}) with offsets (scroll_x=${scroll_x}, scroll_y=${scroll_y}).`
      );
      // Position the cursor first so the scroll applies at the requested point.
      await computer.interface.moveCursor(x, y);
      await computer.interface.scroll(scroll_x, scroll_y);
      break;
    }
    case 'keypress': {
      // NOTE(review): keys are pressed one after another, not held together as
      // a chord. If the model requests a combination (e.g. ctrl+l) this may
      // not behave as intended — confirm against the computer interface API.
      for (const key of action.keys) {
        console.log(`Pressing key: ${key}.`);
        // Map common key names to CUA equivalents
        await computer.interface.pressKey(
          KEY_ALIASES.get(key.toLowerCase()) ?? key
        );
      }
      break;
    }
    case 'wait': {
      console.log(`Waiting for 3 seconds.`);
      await new Promise((resolve) => setTimeout(resolve, WAIT_ACTION_MS));
      break;
    }
    case 'screenshot': {
      console.log('Taking screenshot.');
      // The caller's main loop captures a screenshot after every action
      // anyway; this extra one is returned for callers that want it.
      return await computer.interface.screenshot();
    }
    default:
      console.log(`Unrecognized action: ${action.type}`);
      break;
  }
}
import { Computer, OSType } from '@cua/computer';
import OpenAI from 'openai';
import { executeAction } from './helpers';

import 'dotenv/config';

/*
 * Example: drive a CUA cloud container with OpenAI's computer-use model.
 *
 * Flow: connect to the container, send the task prompt together with an
 * initial screenshot, then repeatedly execute the action the model asks for
 * and send back a fresh screenshot until the model stops requesting actions.
 */

/**
 * Reads a required environment variable.
 *
 * @param name - Name of the variable to read from `process.env`.
 * @returns The variable's value.
 * @throws Error if the variable is unset or empty (`.env.example` ships with
 *   empty values, so an un-edited copy is caught here).
 */
function requireEnv(name: string): string {
  const value = process.env[name];
  if (value === undefined || value === '') {
    throw new Error(
      `Missing required environment variable ${name}; see .env.example.`
    );
  }
  return value;
}

const openai = new OpenAI({ apiKey: requireEnv('OPENAI_KEY') });

const COMPUTER_USE_PROMPT = 'Open firefox and go to trycua.com';

// Initialize the Computer Connection
const computer = new Computer({
  apiKey: requireEnv('CUA_KEY'),
  name: requireEnv('CUA_CONTAINER_NAME'),
  osType: OSType.LINUX,
});

/**
 * Captures the container's screen and encodes it as a PNG data URL, the form
 * in which screenshots are sent to the model below.
 */
async function captureScreenshotDataUrl(): Promise<string> {
  const screenshot = await computer.interface.screenshot();
  return `data:image/png;base64,${screenshot.toString('base64')}`;
}

await computer.run();

// Setup openai config for computer use
const computerUseConfig: OpenAI.Responses.ResponseCreateParamsNonStreaming = {
  model: 'computer-use-preview',
  tools: [
    {
      type: 'computer_use_preview',
      display_width: 1024,
      display_height: 768,
      environment: 'linux', // we're using a linux vm
    },
  ],
  truncation: 'auto',
};

// Send the task and the initial screenshot to the openai computer use model
let res = await openai.responses.create({
  ...computerUseConfig,
  input: [
    {
      role: 'user',
      content: [
        // what we want the ai to do
        { type: 'input_text', text: COMPUTER_USE_PROMPT },
        // current screenshot of the vm
        {
          type: 'input_image',
          image_url: await captureScreenshotDataUrl(),
          detail: 'auto',
        },
      ],
    },
  ],
});

// Loop until there are no more computer use actions.
while (true) {
  // Only the first computer call of each response is handled.
  const call = res.output.find((o) => o.type === 'computer_call');
  if (call === undefined) {
    console.log('No more computer calls. Loop complete.');
    break;
  }

  const action = call.action;
  console.log('Received action from OpenAI Responses API:', action);

  let ackChecks: OpenAI.Responses.ResponseComputerToolCall.PendingSafetyCheck[] =
    [];
  if (call.pending_safety_checks.length > 0) {
    console.log('Safety checks pending:', call.pending_safety_checks);
    // In a real implementation, you would want to get user confirmation here
    ackChecks = call.pending_safety_checks;
  }

  // Execute the action in the container
  await executeAction(computer, action);
  // Wait for changes to process within the container (1sec)
  await new Promise((resolve) => setTimeout(resolve, 1000));

  // Send the new screenshot back as computer_call_output
  res = await openai.responses.create({
    ...computerUseConfig,
    previous_response_id: res.id,
    input: [
      {
        type: 'computer_call_output',
        call_id: call.call_id,
        acknowledged_safety_checks: ackChecks,
        output: {
          type: 'computer_screenshot',
          image_url: await captureScreenshotDataUrl(),
        },
      },
    ],
  });
}

process.exit(0);
"verbatimModuleSyntax": true, + "skipLibCheck": true, + "outDir": "build", + }, + "include": [ + "src" + ] +} \ No newline at end of file