mirror of
https://github.com/trycua/computer.git
synced 2026-01-06 21:39:58 -06:00
Merge pull request #562 from YeIIcw/fix/agent-test-trigger
Gate local-model matrix entries in test workflow
This commit is contained in:
81
.github/workflows/test-cua-models.yml
vendored
81
.github/workflows/test-cua-models.yml
vendored
@@ -11,6 +11,11 @@ on:
|
||||
required: false
|
||||
default: true
|
||||
type: boolean
|
||||
include_local_models:
|
||||
description: "Also run huggingface-local models (requires large disk / self-hosted runner)"
|
||||
required: false
|
||||
default: false
|
||||
type: boolean
|
||||
schedule:
|
||||
# Runs daily at 3 PM UTC (8 AM PDT / 7 AM PST); GitHub cron schedules are evaluated in UTC
|
||||
- cron: "0 15 * * *"
|
||||
@@ -18,35 +23,47 @@ on:
|
||||
jobs:
|
||||
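# Test all CUA models - runs on PRs, schedules, or when manually triggered.
# Matrix entries marked requires_local_weights are intended to run only when the
# include_local_models input or the RUN_LOCAL_MODELS repository variable is enabled.
# NOTE(review): GitHub Actions documents only github/needs/vars/inputs as available in a
# job-level `if`; the `matrix` context is not listed, so confirm the per-entry gate
# actually evaluates (otherwise move the check to step level) - TODO verify.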
|
||||
test-all-models:
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false') }}
|
||||
if: ${{ (github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false')) && (!matrix.requires_local_weights || fromJSON(inputs.include_local_models || 'false') || vars.RUN_LOCAL_MODELS == 'true') }}
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
model:
|
||||
include:
|
||||
# Claude Sonnet/Haiku
|
||||
- anthropic/claude-sonnet-4-5-20250929
|
||||
- anthropic/claude-haiku-4-5-20251001
|
||||
- anthropic/claude-opus-4-1-20250805
|
||||
- model: anthropic/claude-sonnet-4-5-20250929
|
||||
requires_local_weights: false
|
||||
- model: anthropic/claude-haiku-4-5-20251001
|
||||
requires_local_weights: false
|
||||
- model: anthropic/claude-opus-4-1-20250805
|
||||
requires_local_weights: false
|
||||
|
||||
# OpenAI CU Preview
|
||||
- openai/computer-use-preview
|
||||
- model: openai/computer-use-preview
|
||||
requires_local_weights: false
|
||||
|
||||
# GLM-V
|
||||
- openrouter/z-ai/glm-4.5v
|
||||
# - huggingface-local/zai-org/GLM-4.5V # Requires local model setup
|
||||
- model: openrouter/z-ai/glm-4.5v
|
||||
requires_local_weights: false
|
||||
# - model: huggingface-local/zai-org/GLM-4.5V # Requires local model setup
|
||||
# requires_local_weights: true
|
||||
|
||||
# Gemini CU Preview
|
||||
- gemini-2.5-computer-use-preview-10-2025
|
||||
- model: gemini-2.5-computer-use-preview-10-2025
|
||||
requires_local_weights: false
|
||||
|
||||
# InternVL
|
||||
- huggingface-local/OpenGVLab/InternVL3_5-1B
|
||||
# - huggingface-local/OpenGVLab/InternVL3_5-2B
|
||||
# - huggingface-local/OpenGVLab/InternVL3_5-4B
|
||||
# - huggingface-local/OpenGVLab/InternVL3_5-8B
|
||||
- model: huggingface-local/OpenGVLab/InternVL3_5-1B
|
||||
requires_local_weights: true
|
||||
# - model: huggingface-local/OpenGVLab/InternVL3_5-2B
|
||||
# requires_local_weights: true
|
||||
# - model: huggingface-local/OpenGVLab/InternVL3_5-4B
|
||||
# requires_local_weights: true
|
||||
# - model: huggingface-local/OpenGVLab/InternVL3_5-8B
|
||||
# requires_local_weights: true
|
||||
|
||||
# UI-TARS (supports full computer-use, can run standalone)
|
||||
- huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
|
||||
- model: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
|
||||
requires_local_weights: true
|
||||
|
||||
# Note: OpenCUA, GTA, and Holo are grounding-only models
|
||||
# They only support predict_click(), not agent.run()
|
||||
@@ -54,21 +71,28 @@ jobs:
|
||||
|
||||
# Moondream (typically used in composed agents)
|
||||
# Format: moondream3+{any-llm-with-tools}
|
||||
- moondream3+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools
|
||||
# - moondream3+openai/gpt-4o # GPT-4o has VLM + Tools
|
||||
- model: moondream3+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools
|
||||
requires_local_weights: false
|
||||
# - model: moondream3+openai/gpt-4o # GPT-4o has VLM + Tools
|
||||
# requires_local_weights: false
|
||||
|
||||
# OmniParser (typically used in composed agents)
|
||||
# Format: omniparser+{any-vlm-with-tools}
|
||||
- omniparser+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools
|
||||
# - omniparser+openai/gpt-4o # GPT-4o has VLM + Tools
|
||||
- model: omniparser+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools
|
||||
requires_local_weights: false
|
||||
# - model: omniparser+openai/gpt-4o # GPT-4o has VLM + Tools
|
||||
# requires_local_weights: false
|
||||
|
||||
# Other grounding models + VLM with tools
|
||||
# Format: {grounding-model}+{any-vlm-with-tools}
|
||||
# These grounding-only models (OpenCUA, GTA, Holo) must be used in composed form
|
||||
# since they only support predict_click(), not full agent.run()
|
||||
- huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929
|
||||
- huggingface-local/xlangai/OpenCUA-7B+anthropic/claude-sonnet-4-5-20250929
|
||||
- huggingface-local/Hcompany/Holo1.5-3B+anthropic/claude-sonnet-4-5-20250929
|
||||
- model: huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929
|
||||
requires_local_weights: true
|
||||
- model: huggingface-local/xlangai/OpenCUA-7B+anthropic/claude-sonnet-4-5-20250929
|
||||
requires_local_weights: true
|
||||
- model: huggingface-local/Hcompany/Holo1.5-3B+anthropic/claude-sonnet-4-5-20250929
|
||||
requires_local_weights: true
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
@@ -218,6 +242,7 @@ jobs:
|
||||
tests/agent_loop_testing/test_images/
|
||||
*.log
|
||||
retention-days: 7
|
||||
if-no-files-found: ignore
|
||||
|
||||
- name: Upload test summary data
|
||||
if: always()
|
||||
@@ -227,6 +252,7 @@ jobs:
|
||||
name: test-summary-${{ env.SAFE_MODEL_NAME }}
|
||||
path: test_summary/
|
||||
retention-days: 1
|
||||
if-no-files-found: ignore
|
||||
|
||||
- name: Set default Slack color
|
||||
if: always() && env.SLACK_COLOR == ''
|
||||
@@ -268,10 +294,6 @@ jobs:
|
||||
# Create directory if it doesn't exist
|
||||
mkdir -p all_summaries
|
||||
|
||||
# Get list of models being tested in this run from the matrix
|
||||
# This helps filter out artifacts from previous runs when testing locally
|
||||
EXPECTED_MODELS="${{ join(matrix.model, ' ') }}"
|
||||
|
||||
# Aggregate all results
|
||||
PASSED_COUNT=0
|
||||
FAILED_COUNT=0
|
||||
@@ -295,15 +317,6 @@ jobs:
|
||||
continue
|
||||
fi
|
||||
|
||||
# Filter: Only include models that are in the current matrix
|
||||
# This prevents including artifacts from previous workflow runs
|
||||
if [ -n "$EXPECTED_MODELS" ]; then
|
||||
if ! echo "$EXPECTED_MODELS" | grep -q "$MODEL"; then
|
||||
echo "Skipping model from previous run: $MODEL"
|
||||
continue
|
||||
fi
|
||||
fi
|
||||
|
||||
# Mark as processed
|
||||
processed_models[$MODEL]="1"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user