diff --git a/.github/workflows/test-cua-models.yml b/.github/workflows/test-cua-models.yml
index 023abce3..cd29323a 100644
--- a/.github/workflows/test-cua-models.yml
+++ b/.github/workflows/test-cua-models.yml
@@ -11,6 +11,11 @@ on:
         required: false
         default: true
         type: boolean
+      include_local_models:
+        description: "Also run huggingface-local models (requires large disk / self-hosted runner)"
+        required: false
+        default: false
+        type: boolean
   schedule:
     # Runs at 3 PM UTC (8 AM PDT) daily
     - cron: "0 15 * * *"
@@ -18,35 +23,47 @@ on:
 jobs:
   # Test all CUA models - runs on PRs, schedules, or when manually triggered
   test-all-models:
-    if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false') }}
+    if: ${{ (github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false')) && (!matrix.requires_local_weights || fromJSON(inputs.include_local_models || 'false') || vars.RUN_LOCAL_MODELS == 'true') }}
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
-        model:
+        include:
           # Claude Sonnet/Haiku
-          - anthropic/claude-sonnet-4-5-20250929
-          - anthropic/claude-haiku-4-5-20251001
-          - anthropic/claude-opus-4-1-20250805
+          - model: anthropic/claude-sonnet-4-5-20250929
+            requires_local_weights: false
+          - model: anthropic/claude-haiku-4-5-20251001
+            requires_local_weights: false
+          - model: anthropic/claude-opus-4-1-20250805
+            requires_local_weights: false

           # OpenAI CU Preview
-          - openai/computer-use-preview
+          - model: openai/computer-use-preview
+            requires_local_weights: false

           # GLM-V
-          - openrouter/z-ai/glm-4.5v
-          # - huggingface-local/zai-org/GLM-4.5V # Requires local model setup
+          - model: openrouter/z-ai/glm-4.5v
+            requires_local_weights: false
+          # - model: huggingface-local/zai-org/GLM-4.5V # Requires local model setup
+          #   requires_local_weights: true

           # Gemini CU Preview
-          - gemini-2.5-computer-use-preview-10-2025
+          - model: gemini-2.5-computer-use-preview-10-2025
+            requires_local_weights: false

           # InternVL
-          - huggingface-local/OpenGVLab/InternVL3_5-1B
-          # - huggingface-local/OpenGVLab/InternVL3_5-2B
-          # - huggingface-local/OpenGVLab/InternVL3_5-4B
-          # - huggingface-local/OpenGVLab/InternVL3_5-8B
+          - model: huggingface-local/OpenGVLab/InternVL3_5-1B
+            requires_local_weights: true
+          # - model: huggingface-local/OpenGVLab/InternVL3_5-2B
+          #   requires_local_weights: true
+          # - model: huggingface-local/OpenGVLab/InternVL3_5-4B
+          #   requires_local_weights: true
+          # - model: huggingface-local/OpenGVLab/InternVL3_5-8B
+          #   requires_local_weights: true

           # UI-TARS (supports full computer-use, can run standalone)
-          - huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
+          - model: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
+            requires_local_weights: true

           # Note: OpenCUA, GTA, and Holo are grounding-only models
           # They only support predict_click(), not agent.run()
@@ -54,21 +71,28 @@ jobs:

           # Moondream (typically used in composed agents)
           # Format: moondream3+{any-llm-with-tools}
-          - moondream3+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools
-          # - moondream3+openai/gpt-4o # GPT-4o has VLM + Tools
+          - model: moondream3+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools
+            requires_local_weights: false
+          # - model: moondream3+openai/gpt-4o # GPT-4o has VLM + Tools
+          #   requires_local_weights: false

           # OmniParser (typically used in composed agents)
           # Format: omniparser+{any-vlm-with-tools}
-          - omniparser+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools
-          # - omniparser+openai/gpt-4o # GPT-4o has VLM + Tools
+          - model: omniparser+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools
+            requires_local_weights: false
+          # - model: omniparser+openai/gpt-4o # GPT-4o has VLM + Tools
+          #   requires_local_weights: false

           # Other grounding models + VLM with tools
           # Format: {grounding-model}+{any-vlm-with-tools}
           # These grounding-only models (OpenCUA, GTA, Holo) must be used in composed form
           # since they only support predict_click(), not full agent.run()
-          - huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929
-          - huggingface-local/xlangai/OpenCUA-7B+anthropic/claude-sonnet-4-5-20250929
-          - huggingface-local/Hcompany/Holo1.5-3B+anthropic/claude-sonnet-4-5-20250929
+          - model: huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929
+            requires_local_weights: true
+          - model: huggingface-local/xlangai/OpenCUA-7B+anthropic/claude-sonnet-4-5-20250929
+            requires_local_weights: true
+          - model: huggingface-local/Hcompany/Holo1.5-3B+anthropic/claude-sonnet-4-5-20250929
+            requires_local_weights: true

     steps:
       - name: Checkout repository
@@ -218,6 +242,7 @@ jobs:
             tests/agent_loop_testing/test_images/
             *.log
           retention-days: 7
+          if-no-files-found: ignore

       - name: Upload test summary data
         if: always()
@@ -227,6 +252,7 @@ jobs:
           name: test-summary-${{ env.SAFE_MODEL_NAME }}
           path: test_summary/
           retention-days: 1
+          if-no-files-found: ignore

       - name: Set default Slack color
         if: always() && env.SLACK_COLOR == ''
@@ -268,10 +294,6 @@ jobs:
          # Create directory if it doesn't exist
          mkdir -p all_summaries

-          # Get list of models being tested in this run from the matrix
-          # This helps filter out artifacts from previous runs when testing locally
-          EXPECTED_MODELS="${{ join(matrix.model, ' ') }}"
-
          # Aggregate all results
          PASSED_COUNT=0
          FAILED_COUNT=0
@@ -295,15 +317,6 @@ jobs:
              continue
            fi

-            # Filter: Only include models that are in the current matrix
-            # This prevents including artifacts from previous workflow runs
-            if [ -n "$EXPECTED_MODELS" ]; then
-              if ! echo "$EXPECTED_MODELS" | grep -q "$MODEL"; then
-                echo "Skipping model from previous run: $MODEL"
-                continue
-              fi
-            fi
-
            # Mark as processed
            processed_models[$MODEL]="1"
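
Note on the new job-level `if`: GitHub Actions does not expose the `matrix` context in `jobs.<job_id>.if`, so the `!matrix.requires_local_weights` clause cannot read the per-entry flag there. A minimal sketch of one alternative, keeping only the trigger check at the job level and evaluating the local-weights gate on each step (where `matrix`, `inputs`, and `vars` are all in scope), is shown below; the step shown and the guard placement are illustrative assumptions, not part of this change:

  test-all-models:
    if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false') }}
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          - model: anthropic/claude-sonnet-4-5-20250929
            requires_local_weights: false
          - model: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
            requires_local_weights: true
    steps:
      - name: Checkout repository
        # Same guard repeated on every step: local-weight entries are skipped
        # unless the include_local_models input or the RUN_LOCAL_MODELS
        # repository variable enables them.
        if: ${{ !matrix.requires_local_weights || fromJSON(inputs.include_local_models || 'false') || vars.RUN_LOCAL_MODELS == 'true' }}
        uses: actions/checkout@v4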