update supported models and slackbot announcement

2026-01-04 20:40:15 -06:00 · 2025-10-29 18:06:18 -07:00
parent 2ba64f018d
commit a2d7fc38dd
2 changed files with 302 additions and 43 deletions
--- a/.github/workflows/test-cua-models.yml
+++ b/.github/workflows/test-cua-models.yml
@@ -23,55 +23,51 @@ jobs:
      fail-fast: false
      matrix:
        model:
-          # Anthropic Claude Models
-          # - anthropic/claude-3-5-sonnet-20241022
-          # - anthropic/claude-3-7-sonnet-20250219
-          # - anthropic/claude-opus-4-20250514
-          # - anthropic/claude-sonnet-4-20250514
+          # Claude Sonnet/Haiku
+          # - anthropic/claude-sonnet-4-5-20250929
+          - anthropic/claude-haiku-4-5-20251001
          # - anthropic/claude-opus-4-1-20250805
-          - anthropic/claude-sonnet-4-5-20250929
-          # - anthropic/claude-haiku-4-5-20251001

-          # OpenAI Models
-          # - openai/computer-use-preview
+          # OpenAI CU Preview
+          - openai/computer-use-preview

-          # Gemini Models
+          # GLM-V
+          # - openrouter/z-ai/glm-4.5v
+          # - huggingface-local/zai-org/GLM-4.5V  # Requires local model setup
+
+          # Gemini CU Preview
          # - gemini-2.5-computer-use-preview-10-2025

-          # GLM-4.5V Models
-          # - openrouter/z-ai/glm-4.5v
-
-          # UI-TARS Models
-          # - huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
-
-          # OpenCUA Models
-          # - huggingface-local/xlangai/OpenCUA-7B
-          # - huggingface-local/xlangai/OpenCUA-32B
-
-          # GTA1 Family Models
-          # - huggingface-local/HelloKKMe/GTA1-7B
-          # - huggingface-local/HelloKKMe/GTA1-32B
-          # - huggingface-local/HelloKKMe/GTA1-72B
-
-          # Holo 1.5 Family Models
-          # - huggingface-local/Hcompany/Holo1.5-3B
-          # - huggingface-local/Hcompany/Holo1.5-7B
-          # - huggingface-local/Hcompany/Holo1.5-72B
-
-          # InternVL 3.5 Family Models
+          # InternVL
          # - huggingface-local/OpenGVLab/InternVL3_5-1B
          # - huggingface-local/OpenGVLab/InternVL3_5-2B
          # - huggingface-local/OpenGVLab/InternVL3_5-4B
          # - huggingface-local/OpenGVLab/InternVL3_5-8B

-          # GLM-4.5V Local
-          # - huggingface-local/zai-org/GLM-4.5V
+          # UI-TARS (supports full computer-use, can run standalone)
+          # - huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B

-          # Composed Models (Grounding + Planning)
-          # - omniparser+anthropic/claude-3-5-sonnet-20241022
-          # - omniparser+openai/gpt-4o-mini
-          # - moondream3+anthropic/claude-3-5-sonnet-20241022
-          # - moondream3+openai/gpt-4o-mini
+          # Note: OpenCUA, GTA, and Holo are grounding-only models
+          # They only support predict_click(), not agent.run()
+          # See composed agents section below for testing them
+
+          # Moondream (typically used in composed agents)
+          # Format: moondream3+{any-llm-with-tools}
+          # - moondream3+anthropic/claude-sonnet-4-5-20250929  # Claude has VLM + Tools
+          # - moondream3+openai/gpt-4o  # GPT-4o has VLM + Tools
+
+          # OmniParser (typically used in composed agents)
+          # Format: omniparser+{any-vlm-with-tools}
+          # - omniparser+anthropic/claude-sonnet-4-5-20250929  # Claude has VLM + Tools
+          # - omniparser+openai/gpt-4o  # GPT-4o has VLM + Tools
+
+          # Other grounding models + VLM with tools
+          # Format: {grounding-model}+{any-vlm-with-tools}
+          # These grounding-only models (OpenCUA, GTA, Holo) must be used in composed form
+          # since they only support predict_click(), not full agent.run()
+          # - huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929
+          # - huggingface-local/xlangai/OpenCUA-7B+anthropic/claude-sonnet-4-5-20250929
+          # - huggingface-local/Hcompany/Holo1.5-3B+anthropic/claude-sonnet-4-5-20250929

    steps:
      - name: Checkout repository
@@ -82,28 +78,134 @@ jobs:
        with:
          python-version: "3.12"

+      - name: Cache system packages  # persist apt's package cache across runs
+        uses: actions/cache@v4
+        with:
+          path: /var/cache/apt  # NOTE(review): usually root-owned; confirm restore works here
+          key: ${{ runner.os }}-apt-${{ hashFiles('**/Dockerfile') }}  # runner OS + hash of all Dockerfiles
+          restore-keys: |  # fallback: any cache sharing the "<os>-apt-" prefix
+            ${{ runner.os }}-apt-
+
       - name: Install system dependencies
+        timeout-minutes: 20  # cap the step so a stalled apt run cannot hang the job
         run: |
           sudo apt-get update
           sudo apt-get install -y libgl1-mesa-dri libglib2.0-0

+      - name: Cache Python dependencies (uv)
+        uses: actions/cache@v4
+        with:
+          # Only uv's package cache is persisted. The install step below always
+          # deletes and recreates .venv, so caching .venv was wasted transfer.
+          path: ~/.cache/uv
+          key: ${{ runner.os }}-uv-${{ hashFiles('pyproject.toml', 'uv.lock', 'libs/python/**/pyproject.toml') }}
+          restore-keys: |
+            ${{ runner.os }}-uv-
+
       - name: Install CUA dependencies (uv)
         run: |
-          uv venv
+          # Remove any existing venv first so `uv venv` never stops at its
+          # interactive prompt about replacing an existing environment
+          rm -rf .venv
+          uv venv --python 3.12
           uv pip install -e libs/python/agent -e libs/python/computer
           uv pip install -e libs/python/core
           uv pip install "cua-agent[uitars-hf]"
           uv pip install pytest

+      - name: Cache HuggingFace models  # keep downloaded model files between runs
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface
+          key: ${{ runner.os }}-hf-models-v1  # static key; "v1" is presumably bumped by hand to invalidate
+          restore-keys: |  # fallback: any cache sharing the "<os>-hf-models-" prefix
+            ${{ runner.os }}-hf-models-
+          # Large cache - models can be several GB each and are reused across runs
+
+      - name: Record test start time
+        run: |
+          # Epoch seconds, read back later to compute the test duration
+          echo "TEST_START_TIME=$(date +%s)" >> "$GITHUB_ENV"
+          # Ensure HuggingFace uses consistent cache location. A step-level
+          # `env:` would only apply to this step, so export through GITHUB_ENV
+          # to reach the following steps; the shell expands $HOME to an
+          # absolute path.
+          echo "HF_HOME=$HOME/.cache/huggingface" >> "$GITHUB_ENV"
+
       - name: Test model with agent loop
+        id: test_model  # lets later steps read steps.test_model.outcome
+        timeout-minutes: 20
+        continue-on-error: true  # NOTE(review): job stays green on test failure; confirm intended
         run: |
           cd tests/agent_loop_testing
           uv run python agent_test.py --model "${{ matrix.model }}"
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-          # OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          # GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
-          # OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}  # provider keys for the non-Anthropic matrix entries
+          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
+
+      - name: Calculate test duration and prepare message
+        if: always()
+        env:
+          # Pass workflow values through the environment instead of
+          # interpolating them into the script text
+          MODEL_NAME: ${{ matrix.model }}
+          TEST_OUTCOME: ${{ steps.test_model.outcome }}
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+        run: |
+          # Summarise the test step: export a Slack-ready message, a colour and
+          # a sanitised model name via GITHUB_ENV, and write a per-model JSON
+          # record under test_summary/.
+          TEST_END_TIME=$(date +%s)
+
+          # Handle case where TEST_START_TIME might not be set
+          TEST_START_TIME="${TEST_START_TIME:-$TEST_END_TIME}"
+          TEST_DURATION=$((TEST_END_TIME - TEST_START_TIME))
+
+          # Convert seconds to minutes and seconds. SECONDS is deliberately
+          # not used as a variable name: bash treats it as a running timer
+          # that keeps incrementing after assignment.
+          DURATION_MIN=$((TEST_DURATION / 60))
+          DURATION_SEC=$((TEST_DURATION % 60))
+
+          # Format duration
+          if [ "$DURATION_MIN" -gt 0 ]; then
+            DURATION_STR="${DURATION_MIN}m ${DURATION_SEC}s"
+          else
+            DURATION_STR="${DURATION_SEC}s"
+          fi
+
+          # Any test step outcome other than "success" is reported as failed
+          if [ "$TEST_OUTCOME" = "success" ]; then
+            STATUS_ICON="✅"
+            STATUS_TEXT="PASSED"
+            SLACK_COLOR="#36a64f"
+            PASSED_VAL="true"
+          else
+            STATUS_ICON="❌"
+            STATUS_TEXT="FAILED"
+            SLACK_COLOR="#dc3545"
+            PASSED_VAL="false"
+          fi
+
+          # Sanitize model name for filename and artifact naming
+          SAFE_MODEL_NAME=$(echo "$MODEL_NAME" | sed 's/[^a-zA-Z0-9]/_/g')
+
+          # Save result to JSON file for summary
+          # (printf keeps the JSON braces out of YAML's way)
+          mkdir -p test_summary
+          printf '{\n  "model": "%s",\n  "status": "%s",\n  "status_icon": "%s",\n  "duration": "%s",\n  "duration_seconds": %d,\n  "passed": %s\n}' \
+            "${MODEL_NAME}" "${STATUS_TEXT}" "${STATUS_ICON}" "${DURATION_STR}" "${TEST_DURATION}" "${PASSED_VAL}" \
+            > "test_summary/${SAFE_MODEL_NAME}.json"
+
+          # Export everything later steps need; the multi-line Slack message
+          # uses the NAME<<EOF ... EOF form
+          {
+            echo "TESTS_CONTENT<<EOF"
+            echo "*CUA Model Test Results*"
+            echo ""
+            echo "*Model:* ${MODEL_NAME}"
+            echo "*Status:* ${STATUS_ICON} ${STATUS_TEXT}"
+            echo "*Duration:* ${DURATION_STR}"
+            echo "*Run:* ${RUN_URL}"
+            echo "EOF"
+            echo "SLACK_COLOR=${SLACK_COLOR}"
+            echo "SAFE_MODEL_NAME=${SAFE_MODEL_NAME}"
+          } >> "$GITHUB_ENV"

      - name: Upload test results
        if: always()
@@ -114,3 +216,157 @@ jobs:
            tests/agent_loop_testing/test_images/
            *.log
          retention-days: 7
+
+      - name: Upload test summary data  # publish this matrix entry's JSON record as an artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          # Unique, slash-free artifact name per matrix entry
+          name: test-summary-${{ env.SAFE_MODEL_NAME }}  # SAFE_MODEL_NAME is exported by the duration step
+          path: test_summary/
+          retention-days: 1  # presumably only needed until the summary job has read it
+
+      - name: Set default Slack color
+        if: always() && env.SLACK_COLOR == ''  # only when no earlier step exported a colour
+        run: echo "SLACK_COLOR=#36a64f" >> $GITHUB_ENV  # NOTE(review): fallback is the "passed" green; confirm intended
+
+      # Individual model notifications disabled - only summary is sent
+      # - name: Notify Slack with test results
+      #   if: always()
+      #   uses: rtCamp/action-slack-notify@v2
+      #   env:
+      #     SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      #     SLACK_CHANNEL: ${{ vars.SLACK_CHANNEL }}
+      #     SLACK_TITLE: CUA Model Test Update
+      #     SLACK_COLOR: ${{ env.SLACK_COLOR }}
+      #     SLACK_MESSAGE: |
+      #       ${{ env.TESTS_CONTENT }}
+
+  # Summary job that aggregates all model test results
+  test-summary:
+    if: ${{ always() && (github.event_name == 'pull_request_target' || fromJSON(inputs.test_models || 'false')) }}
+    needs: test-all-models  # waits for the matrix job; always() above lets it run after failures too
+    runs-on: ubuntu-latest
+    steps:
+      - name: Install jq  # jq parses the per-model JSON records in the summary step
+        run: sudo apt-get update && sudo apt-get install -y jq
+
+      - name: Download all test summary artifacts  # collect every matrix entry's JSON record
+        continue-on-error: true  # a failed download is tolerated; the next step reports "no results"
+        uses: actions/download-artifact@v4
+        with:
+          pattern: test-summary-*  # matches the artifact names used by the upload step
+          merge-multiple: true  # extract all matching artifacts into the same directory
+          path: all_summaries
+
+      - name: Generate and send summary
+        if: always()
+        shell: bash
+        run: |
+          # Aggregate the per-model JSON records into one Slack message and
+          # export it, together with a colour, via GITHUB_ENV.
+
+          # Create directory if it doesn't exist
+          mkdir -p all_summaries
+
+          # Aggregate all results
+          PASSED_COUNT=0
+          FAILED_COUNT=0
+          TOTAL_DURATION=0
+          SUMMARY_MESSAGE="*🚀 Model Summaries*\n\n"
+
+          # Use associative array to deduplicate by model name
+          declare -A processed_models
+
+          # No filtering against the matrix is attempted: this job has no
+          # strategy, so `matrix.model` is empty here and such a filter could
+          # never run. Duplicates are handled by the array above.
+
+          # Process each JSON file (found recursively, sorted so the message
+          # order is stable). Process substitution keeps the loop in this
+          # shell so the counters survive it.
+          while IFS= read -r json_file; do
+            [ -f "$json_file" ] || continue
+
+            # Skip records that cannot be parsed or lack a model name,
+            # instead of aborting the whole summary
+            if ! MODEL=$(jq -er '.model' "$json_file" 2>/dev/null); then
+              echo "Skipping unreadable summary file: $json_file"
+              continue
+            fi
+
+            # Skip if we've already processed this model
+            if [ "${processed_models[$MODEL]}" = "1" ]; then
+              echo "Skipping duplicate model: $MODEL"
+              continue
+            fi
+
+            # Mark as processed
+            processed_models[$MODEL]="1"
+
+            STATUS_ICON=$(jq -r '.status_icon' "$json_file")
+            STATUS=$(jq -r '.status' "$json_file")
+            DURATION=$(jq -r '.duration' "$json_file")
+            DURATION_SEC=$(jq -r '.duration_seconds' "$json_file")
+            PASSED=$(jq -r '.passed' "$json_file")
+
+            # Add to summary as clean line format
+            SUMMARY_MESSAGE="${SUMMARY_MESSAGE}${STATUS_ICON} ${STATUS} - \`${MODEL}\` - ${DURATION}\n"
+
+            if [ "$PASSED" = "true" ]; then
+              PASSED_COUNT=$((PASSED_COUNT + 1))
+            else
+              FAILED_COUNT=$((FAILED_COUNT + 1))
+            fi
+            TOTAL_DURATION=$((TOTAL_DURATION + DURATION_SEC))
+          done < <(find all_summaries -name "*.json" -type f 2>/dev/null | sort)
+
+          # Check if we found any results
+          TOTAL_COUNT=$((PASSED_COUNT + FAILED_COUNT))
+          if [ "$TOTAL_COUNT" -eq 0 ]; then
+            SUMMARY_MESSAGE="${SUMMARY_MESSAGE}⚠️ No test results found (workflow may have been canceled)\n"
+            SLACK_COLOR="#ffa500"
+          else
+            # Add summary stats
+            SUMMARY_MESSAGE="${SUMMARY_MESSAGE}\n*Results:* ${PASSED_COUNT} passed, ${FAILED_COUNT} failed out of ${TOTAL_COUNT} models\n"
+
+            # Calculate total duration
+            TOTAL_MIN=$((TOTAL_DURATION / 60))
+            TOTAL_SEC=$((TOTAL_DURATION % 60))
+            if [ "$TOTAL_MIN" -gt 0 ]; then
+              TOTAL_DURATION_STR="${TOTAL_MIN}m ${TOTAL_SEC}s"
+            else
+              TOTAL_DURATION_STR="${TOTAL_SEC}s"
+            fi
+            SUMMARY_MESSAGE="${SUMMARY_MESSAGE}*Total Duration:* ${TOTAL_DURATION_STR}\n"
+
+            # Determine color based on results:
+            # green = all passed, red = all failed, orange = mixed
+            if [ "$FAILED_COUNT" -eq 0 ]; then
+              SLACK_COLOR="#36a64f"
+            elif [ "$PASSED_COUNT" -eq 0 ]; then
+              SLACK_COLOR="#dc3545"
+            else
+              SLACK_COLOR="#ffa500"
+            fi
+          fi
+
+          SUMMARY_MESSAGE="${SUMMARY_MESSAGE}*Run:* ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+
+          # Export for use in next step; echo -e turns the literal \n
+          # sequences accumulated above into real newlines
+          {
+            echo "SUMMARY_MESSAGE<<EOF"
+            echo -e "${SUMMARY_MESSAGE}"
+            echo "EOF"
+            echo "SLACK_COLOR=${SLACK_COLOR}"
+          } >> "$GITHUB_ENV"
+
+      - name: Send summary to Slack  # post the aggregated message built by the previous step
+        if: always()
+        uses: rtCamp/action-slack-notify@v2
+        env:
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_CHANNEL: ${{ vars.SLACK_CHANNEL }}
+          SLACK_TITLE: CUA Models Test Summary
+          SLACK_COLOR: ${{ env.SLACK_COLOR }}  # exported by the previous step via GITHUB_ENV
+          SLACK_MESSAGE: |
+            ${{ env.SUMMARY_MESSAGE }}
--- a/.gitignore
+++ b/.gitignore
@@ -259,4 +259,7 @@ storage/
 .Trashes
 .Trash-1000/

-post-provision
+post-provision
+
+# Local secrets for act
+.secrets