diff --git a/.github/workflows/test-cua-models.yml b/.github/workflows/test-cua-models.yml index d5681983..2fa3f206 100644 --- a/.github/workflows/test-cua-models.yml +++ b/.github/workflows/test-cua-models.yml @@ -23,55 +23,51 @@ jobs: fail-fast: false matrix: model: - # Anthropic Claude Models - # - anthropic/claude-3-5-sonnet-20241022 - # - anthropic/claude-3-7-sonnet-20250219 - # - anthropic/claude-opus-4-20250514 - # - anthropic/claude-sonnet-4-20250514 + # Claude Sonnet/Haiku + # - anthropic/claude-sonnet-4-5-20250929 + - anthropic/claude-haiku-4-5-20251001 # - anthropic/claude-opus-4-1-20250805 - - anthropic/claude-sonnet-4-5-20250929 - # - anthropic/claude-haiku-4-5-20251001 - # OpenAI Models - # - openai/computer-use-preview + # OpenAI CU Preview + - openai/computer-use-preview - # Gemini Models + # GLM-V + # - openrouter/z-ai/glm-4.5v + # - huggingface-local/zai-org/GLM-4.5V # Requires local model setup + + # Gemini CU Preview # - gemini-2.5-computer-use-preview-10-2025 - # GLM-4.5V Models - # - openrouter/z-ai/glm-4.5v - - # UI-TARS Models - # - huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B - - # OpenCUA Models - # - huggingface-local/xlangai/OpenCUA-7B - # - huggingface-local/xlangai/OpenCUA-32B - - # GTA1 Family Models - # - huggingface-local/HelloKKMe/GTA1-7B - # - huggingface-local/HelloKKMe/GTA1-32B - # - huggingface-local/HelloKKMe/GTA1-72B - - # Holo 1.5 Family Models - # - huggingface-local/Hcompany/Holo1.5-3B - # - huggingface-local/Hcompany/Holo1.5-7B - # - huggingface-local/Hcompany/Holo1.5-72B - - # InternVL 3.5 Family Models + # InternVL # - huggingface-local/OpenGVLab/InternVL3_5-1B # - huggingface-local/OpenGVLab/InternVL3_5-2B # - huggingface-local/OpenGVLab/InternVL3_5-4B # - huggingface-local/OpenGVLab/InternVL3_5-8B - # GLM-4.5V Local - # - huggingface-local/zai-org/GLM-4.5V + # UI-TARS (supports full computer-use, can run standalone) + # - huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B - # Composed Models (Grounding + Planning) - # - omniparser+anthropic/claude-3-5-sonnet-20241022 - # - omniparser+openai/gpt-4o-mini - # - moondream3+anthropic/claude-3-5-sonnet-20241022 - # - moondream3+openai/gpt-4o-mini + # Note: OpenCUA, GTA, and Holo are grounding-only models + # They only support predict_click(), not agent.run() + # See composed agents section below for testing them + + # Moondream (typically used in composed agents) + # Format: moondream3+{any-llm-with-tools} + # - moondream3+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools + # - moondream3+openai/gpt-4o # GPT-4o has VLM + Tools + + # OmniParser (typically used in composed agents) + # Format: omniparser+{any-vlm-with-tools} + # - omniparser+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools + # - omniparser+openai/gpt-4o # GPT-4o has VLM + Tools + + # Other grounding models + VLM with tools + # Format: {grounding-model}+{any-vlm-with-tools} + # These grounding-only models (OpenCUA, GTA, Holo) must be used in composed form + # since they only support predict_click(), not full agent.run() + # - huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929 + # - huggingface-local/xlangai/OpenCUA-7B+anthropic/claude-sonnet-4-5-20250929 + # - huggingface-local/Hcompany/Holo1.5-3B+anthropic/claude-sonnet-4-5-20250929 steps: - name: Checkout repository @@ -82,28 +78,134 @@ jobs: with: python-version: "3.12" + - name: Cache system packages + uses: actions/cache@v4 + with: + path: /var/cache/apt + key: ${{ runner.os }}-apt-${{ hashFiles('**/Dockerfile') }} + restore-keys: | + ${{ runner.os }}-apt- + - name: Install system dependencies + timeout-minutes: 20 run: | sudo apt-get update sudo apt-get install -y libgl1-mesa-dri libglib2.0-0 + - name: Cache Python dependencies (uv) + uses: actions/cache@v4 + with: + path: | + ~/.cache/uv + .venv + key: ${{ runner.os }}-uv-${{ hashFiles('pyproject.toml', 'uv.lock', 'libs/python/**/pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-uv- + - name: Install CUA dependencies (uv) run: | - uv venv + # Remove existing venv if it exists (from cache restore) to avoid interactive prompt + rm -rf .venv + uv venv --python 3.12 uv pip install -e libs/python/agent -e libs/python/computer uv pip install -e libs/python/core uv pip install "cua-agent[uitars-hf]" uv pip install pytest + - name: Cache HuggingFace models + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface + key: ${{ runner.os }}-hf-models-v1 + restore-keys: | + ${{ runner.os }}-hf-models- + # Large cache - models can be several GB each and are reused across runs + + - name: Record test start time + run: echo "TEST_START_TIME=$(date +%s)" >> $GITHUB_ENV + env: + # Ensure HuggingFace uses consistent cache location + HF_HOME: ~/.cache/huggingface + - name: Test model with agent loop + id: test_model + timeout-minutes: 20 + continue-on-error: true run: | cd tests/agent_loop_testing uv run python agent_test.py --model "${{ matrix.model }}" env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - # OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - # GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} - # OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} + OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + + - name: Calculate test duration and prepare message + if: always() + run: | + TEST_END_TIME=$(date +%s) + + # Handle case where TEST_START_TIME might not be set + if [ -z "$TEST_START_TIME" ]; then + TEST_START_TIME=$TEST_END_TIME + fi + + TEST_DURATION=$((TEST_END_TIME - TEST_START_TIME)) + + # Convert seconds to minutes and seconds + MINUTES=$((TEST_DURATION / 60)) + SECONDS=$((TEST_DURATION % 60)) + + # Format duration + if [ $MINUTES -gt 0 ]; then + DURATION_STR="${MINUTES}m ${SECONDS}s" + else + DURATION_STR="${SECONDS}s" + fi + + # Determine status icon based on test step outcome + if [ "${{ steps.test_model.outcome }}" == "success" ]; then + STATUS_ICON="✅" + STATUS_TEXT="PASSED" + SLACK_COLOR="#36a64f" + else + STATUS_ICON="❌" + STATUS_TEXT="FAILED" + SLACK_COLOR="#dc3545" + fi + + # Prepare Slack message + echo "TESTS_CONTENT<> $GITHUB_ENV + echo "*CUA Model Test Results*" >> $GITHUB_ENV + echo "" >> $GITHUB_ENV + echo "*Model:* ${{ matrix.model }}" >> $GITHUB_ENV + echo "*Status:* ${STATUS_ICON} ${STATUS_TEXT}" >> $GITHUB_ENV + echo "*Duration:* ${DURATION_STR}" >> $GITHUB_ENV + echo "*Run:* ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + # Set color based on outcome + echo "SLACK_COLOR=${SLACK_COLOR}" >> $GITHUB_ENV + + # Save result to JSON file for summary + mkdir -p test_summary + MODEL_NAME="${{ matrix.model }}" + # Sanitize model name for filename + SAFE_MODEL_NAME=$(echo "$MODEL_NAME" | sed 's/[^a-zA-Z0-9]/_/g') + + # Determine pass status + if [ "${{ steps.test_model.outcome }}" == "success" ]; then + PASSED_VAL="true" + else + PASSED_VAL="false" + fi + + # Create JSON file using printf to avoid YAML parsing issues + printf '{\n "model": "%s",\n "status": "%s",\n "status_icon": "%s",\n "duration": "%s",\n "duration_seconds": %d,\n "passed": %s\n}' \ + "${MODEL_NAME}" "${STATUS_TEXT}" "${STATUS_ICON}" "${DURATION_STR}" "${TEST_DURATION}" "${PASSED_VAL}" \ + > "test_summary/${SAFE_MODEL_NAME}.json" + # Expose safe model name for subsequent steps (artifact naming) + echo "SAFE_MODEL_NAME=${SAFE_MODEL_NAME}" >> $GITHUB_ENV - name: Upload test results if: always() @@ -114,3 +216,157 @@ jobs: tests/agent_loop_testing/test_images/ *.log retention-days: 7 + + - name: Upload test summary data + if: always() + uses: actions/upload-artifact@v4 + with: + # Unique, slash-free artifact name per matrix entry + name: test-summary-${{ env.SAFE_MODEL_NAME }} + path: test_summary/ + retention-days: 1 + + - name: Set default Slack color + if: always() && env.SLACK_COLOR == '' + run: echo "SLACK_COLOR=#36a64f" >> $GITHUB_ENV + + # Individual model notifications disabled - only summary is sent + # - name: Notify Slack with test results + # if: always() + # uses: rtCamp/action-slack-notify@v2 + # env: + # SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + # SLACK_CHANNEL: ${{ vars.SLACK_CHANNEL }} + # SLACK_TITLE: CUA Model Test Update + # SLACK_COLOR: ${{ env.SLACK_COLOR }} + # SLACK_MESSAGE: | + # ${{ env.TESTS_CONTENT }} + + # Summary job that aggregates all model test results + test-summary: + if: ${{ always() && (github.event_name == 'pull_request_target' || fromJSON(inputs.test_models || 'false')) }} + needs: test-all-models + runs-on: ubuntu-latest + steps: + - name: Install jq + run: sudo apt-get update && sudo apt-get install -y jq + + - name: Download all test summary artifacts + continue-on-error: true + uses: actions/download-artifact@v4 + with: + pattern: test-summary-* + merge-multiple: true + path: all_summaries + + - name: Generate and send summary + if: always() + shell: bash + run: | + # Create directory if it doesn't exist + mkdir -p all_summaries + + # Get list of models being tested in this run from the matrix + # This helps filter out artifacts from previous runs when testing locally + EXPECTED_MODELS="${{ join(matrix.model, ' ') }}" + + # Aggregate all results + PASSED_COUNT=0 + FAILED_COUNT=0 + TOTAL_DURATION=0 + SUMMARY_MESSAGE="*🚀 Model Summaries*\n\n" + + # Process each JSON file (find all JSON files recursively) + # Save to temp file first to avoid subshell issues + find all_summaries -name "*.json" -type f 2>/dev/null > /tmp/json_files.txt || true + + # Use associative array to deduplicate by model name + declare -A processed_models + + while IFS= read -r json_file; do + if [ -f "$json_file" ]; then + MODEL=$(jq -r '.model' "$json_file") + + # Skip if we've already processed this model + if [ "${processed_models[$MODEL]}" = "1" ]; then + echo "Skipping duplicate model: $MODEL" + continue + fi + + # Filter: Only include models that are in the current matrix + # This prevents including artifacts from previous workflow runs + if [ -n "$EXPECTED_MODELS" ]; then + if ! echo "$EXPECTED_MODELS" | grep -q "$MODEL"; then + echo "Skipping model from previous run: $MODEL" + continue + fi + fi + + # Mark as processed + processed_models[$MODEL]="1" + + STATUS_ICON=$(jq -r '.status_icon' "$json_file") + STATUS=$(jq -r '.status' "$json_file") + DURATION=$(jq -r '.duration' "$json_file") + DURATION_SEC=$(jq -r '.duration_seconds' "$json_file") + PASSED=$(jq -r '.passed' "$json_file") + + # Add to summary as clean line format + SUMMARY_MESSAGE="${SUMMARY_MESSAGE}${STATUS_ICON} ${STATUS} - \`${MODEL}\` - ${DURATION}\n" + + if [ "$PASSED" = "true" ]; then + PASSED_COUNT=$((PASSED_COUNT + 1)) + else + FAILED_COUNT=$((FAILED_COUNT + 1)) + fi + TOTAL_DURATION=$((TOTAL_DURATION + DURATION_SEC)) + fi + done < /tmp/json_files.txt + + # Check if we found any results + TOTAL_COUNT=$((PASSED_COUNT + FAILED_COUNT)) + if [ $TOTAL_COUNT -eq 0 ]; then + SUMMARY_MESSAGE="${SUMMARY_MESSAGE}⚠️ No test results found (workflow may have been canceled)\n" + SLACK_COLOR="#ffa500" + else + # Add summary stats + SUMMARY_MESSAGE="${SUMMARY_MESSAGE}\n*Results:* ${PASSED_COUNT} passed, ${FAILED_COUNT} failed out of ${TOTAL_COUNT} models\n" + + # Calculate total duration + TOTAL_MIN=$((TOTAL_DURATION / 60)) + TOTAL_SEC=$((TOTAL_DURATION % 60)) + if [ $TOTAL_MIN -gt 0 ]; then + TOTAL_DURATION_STR="${TOTAL_MIN}m ${TOTAL_SEC}s" + else + TOTAL_DURATION_STR="${TOTAL_SEC}s" + fi + SUMMARY_MESSAGE="${SUMMARY_MESSAGE}*Total Duration:* ${TOTAL_DURATION_STR}\n" + + # Determine color based on results + if [ $FAILED_COUNT -eq 0 ]; then + SLACK_COLOR="#36a64f" + elif [ $PASSED_COUNT -eq 0 ]; then + SLACK_COLOR="#dc3545" + else + SLACK_COLOR="#ffa500" + fi + fi + + SUMMARY_MESSAGE="${SUMMARY_MESSAGE}*Run:* ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + + # Export for use in next step + echo "SUMMARY_MESSAGE<> $GITHUB_ENV + echo -e "${SUMMARY_MESSAGE}" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + echo "SLACK_COLOR=${SLACK_COLOR}" >> $GITHUB_ENV + + - name: Send summary to Slack + if: always() + uses: rtCamp/action-slack-notify@v2 + env: + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + SLACK_CHANNEL: ${{ vars.SLACK_CHANNEL }} + SLACK_TITLE: CUA Models Test Summary + SLACK_COLOR: ${{ env.SLACK_COLOR }} + SLACK_MESSAGE: | + ${{ env.SUMMARY_MESSAGE }} diff --git a/.gitignore b/.gitignore index 8cae22ce..adacb39a 100644 --- a/.gitignore +++ b/.gitignore @@ -259,4 +259,7 @@ storage/ .Trashes .Trash-1000/ -post-provision \ No newline at end of file +post-provision + +# Local secrets for act +.secrets \ No newline at end of file