From 87c2f1e0e6919619d93cbaece2c954cc40f0b0a8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 6 Sep 2025 22:30:43 -0400 Subject: [PATCH 1/2] build(deps): bump the github-actions group across 1 directory with 8 updates (#1071) Co-authored-by: Jason Cameron --- .github/workflows/docker-pr.yml | 6 +++--- .github/workflows/docker.yml | 10 +++++----- .github/workflows/docs-deploy.yml | 10 +++++----- .github/workflows/docs-test.yml | 4 ++-- .github/workflows/go.yml | 8 ++++---- .github/workflows/package-builds-stable.yml | 6 +++--- .github/workflows/package-builds-unstable.yml | 6 +++--- .github/workflows/smoke-tests.yml | 2 +- .github/workflows/ssh-ci-runner-cron.yml | 4 ++-- .github/workflows/ssh-ci.yml | 2 +- .github/workflows/zizmor.yml | 6 +++--- 11 files changed, 32 insertions(+), 32 deletions(-) diff --git a/.github/workflows/docker-pr.yml b/.github/workflows/docker-pr.yml index 12fc6512..db83ff9e 100644 --- a/.github/workflows/docker-pr.yml +++ b/.github/workflows/docker-pr.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-24.04 steps: - name: Checkout code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-tags: true fetch-depth: 0 @@ -25,7 +25,7 @@ jobs: uses: Homebrew/actions/setup-homebrew@main - name: Setup Homebrew cellar cache - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 with: path: | /home/linuxbrew/.linuxbrew/Cellar @@ -47,7 +47,7 @@ jobs: - name: Docker meta id: meta - uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 # v5.7.0 + uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f # v5.8.0 with: images: ghcr.io/${{ github.repository }} diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 7e8db319..5da3be6b 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -21,7 +21,7 @@ jobs: runs-on: ubuntu-24.04 steps: - name: Checkout code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-tags: true fetch-depth: 0 @@ -35,7 +35,7 @@ jobs: uses: Homebrew/actions/setup-homebrew@main - name: Setup Homebrew cellar cache - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 with: path: | /home/linuxbrew/.linuxbrew/Cellar @@ -56,7 +56,7 @@ jobs: brew bundle - name: Log into registry - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 + uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0 with: registry: ghcr.io username: ${{ github.repository_owner }} @@ -64,7 +64,7 @@ jobs: - name: Docker meta id: meta - uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 # v5.7.0 + uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f # v5.8.0 with: images: ${{ env.IMAGE }} @@ -78,7 +78,7 @@ jobs: SLOG_LEVEL: debug - name: Generate artifact attestation - uses: actions/attest-build-provenance@e8998f949152b193b063cb0ec769d69d929409be # v2.4.0 + uses: actions/attest-build-provenance@977bb373ede98d70efdf65b84cb5f73e068dcc2a # v3.0.0 with: subject-name: ${{ env.IMAGE }} subject-digest: ${{ steps.build.outputs.digest }} diff --git a/.github/workflows/docs-deploy.yml b/.github/workflows/docs-deploy.yml index 93c41491..1d2fc878 100644 --- a/.github/workflows/docs-deploy.yml +++ b/.github/workflows/docs-deploy.yml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: persist-credentials: false @@ -25,7 +25,7 @@ jobs: uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1 - name: Log into registry - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 + uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0 with: registry: ghcr.io username: techarohq @@ -33,7 +33,7 @@ jobs: - name: Docker meta id: meta - uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 # v5.7.0 + uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f # v5.8.0 with: images: ghcr.io/techarohq/anubis/docs tags: | @@ -53,14 +53,14 @@ jobs: push: true - name: Apply k8s manifests to limsa lominsa - uses: actions-hub/kubectl@b5b19eeb6a0ffde16637e398f8b96ef01eb8fdb7 # v1.33.3 + uses: actions-hub/kubectl@af345ed727f0268738e65be48422e463cc67c220 # v1.34.0 env: KUBE_CONFIG: ${{ secrets.LIMSA_LOMINSA_KUBECONFIG }} with: args: apply -k docs/manifest - name: Apply k8s manifests to limsa lominsa - uses: actions-hub/kubectl@b5b19eeb6a0ffde16637e398f8b96ef01eb8fdb7 # v1.33.3 + uses: actions-hub/kubectl@af345ed727f0268738e65be48422e463cc67c220 # v1.34.0 env: KUBE_CONFIG: ${{ secrets.LIMSA_LOMINSA_KUBECONFIG }} with: diff --git a/.github/workflows/docs-test.yml b/.github/workflows/docs-test.yml index 0a795036..d51cb505 100644 --- a/.github/workflows/docs-test.yml +++ b/.github/workflows/docs-test.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: persist-credentials: false @@ -22,7 +22,7 @@ jobs: - name: Docker meta id: meta - uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 # v5.7.0 + uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f # v5.8.0 with: images: ghcr.io/techarohq/anubis/docs tags: | diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index e7448a60..45a2bb5e 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -15,7 +15,7 @@ jobs: #runs-on: alrest-techarohq runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: persist-credentials: false @@ -28,7 +28,7 @@ jobs: uses: Homebrew/actions/setup-homebrew@main - name: Setup Homebrew cellar cache - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 with: path: | /home/linuxbrew/.linuxbrew/Cellar @@ -49,7 +49,7 @@ jobs: brew bundle - name: Setup Golang caches - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 with: path: | ~/.cache/go-build @@ -59,7 +59,7 @@ jobs: ${{ runner.os }}-golang- - name: Cache playwright binaries - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 id: playwright-cache with: path: | diff --git a/.github/workflows/package-builds-stable.yml b/.github/workflows/package-builds-stable.yml index 4ee17065..ab8d7b24 100644 --- a/.github/workflows/package-builds-stable.yml +++ b/.github/workflows/package-builds-stable.yml @@ -14,7 +14,7 @@ jobs: #runs-on: alrest-techarohq runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: persist-credentials: false fetch-tags: true @@ -29,7 +29,7 @@ jobs: uses: Homebrew/actions/setup-homebrew@main - name: Setup Homebrew cellar cache - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 with: path: | /home/linuxbrew/.linuxbrew/Cellar @@ -50,7 +50,7 @@ jobs: brew bundle - name: Setup Golang caches - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 with: path: | ~/.cache/go-build diff --git a/.github/workflows/package-builds-unstable.yml b/.github/workflows/package-builds-unstable.yml index facc2e3d..e7225397 100644 --- a/.github/workflows/package-builds-unstable.yml +++ b/.github/workflows/package-builds-unstable.yml @@ -15,7 +15,7 @@ jobs: #runs-on: alrest-techarohq runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: persist-credentials: false fetch-tags: true @@ -30,7 +30,7 @@ jobs: uses: Homebrew/actions/setup-homebrew@main - name: Setup Homebrew cellar cache - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 with: path: | /home/linuxbrew/.linuxbrew/Cellar @@ -51,7 +51,7 @@ jobs: brew bundle - name: Setup Golang caches - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 with: path: | ~/.cache/go-build diff --git a/.github/workflows/smoke-tests.yml b/.github/workflows/smoke-tests.yml index 1248d4ce..2b46f9a1 100644 --- a/.github/workflows/smoke-tests.yml +++ b/.github/workflows/smoke-tests.yml @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: persist-credentials: false diff --git a/.github/workflows/ssh-ci-runner-cron.yml b/.github/workflows/ssh-ci-runner-cron.yml index 2e32541b..58db5278 100644 --- a/.github/workflows/ssh-ci-runner-cron.yml +++ b/.github/workflows/ssh-ci-runner-cron.yml @@ -18,13 +18,13 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-tags: true fetch-depth: 0 persist-credentials: false - name: Log into registry - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 + uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0 with: registry: ghcr.io username: ${{ github.repository_owner }} diff --git a/.github/workflows/ssh-ci.yml b/.github/workflows/ssh-ci.yml index 2e739065..29745e2b 100644 --- a/.github/workflows/ssh-ci.yml +++ b/.github/workflows/ssh-ci.yml @@ -20,7 +20,7 @@ jobs: - ci@ppc64le.techaro.lol steps: - name: Checkout code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-tags: true fetch-depth: 0 diff --git a/.github/workflows/zizmor.yml b/.github/workflows/zizmor.yml index eaf13a1d..f90a4f0f 100644 --- a/.github/workflows/zizmor.yml +++ b/.github/workflows/zizmor.yml @@ -16,12 +16,12 @@ jobs: security-events: write steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: persist-credentials: false - name: Install the latest version of uv - uses: astral-sh/setup-uv@e92bafb6253dcd438e0484186d7669ea7a8ca1cc # v6.4.3 + uses: astral-sh/setup-uv@4959332f0f014c5280e7eac8b70c90cb574c9f9b # v6.6.0 - name: Run zizmor 🌈 run: uvx zizmor --format sarif . > results.sarif @@ -29,7 +29,7 @@ jobs: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Upload SARIF file - uses: github/codeql-action/upload-sarif@4e828ff8d448a8a6e532957b1811f387a63867e8 # v3.29.4 + uses: github/codeql-action/upload-sarif@3c3833e0f8c1c83d449a7478aa59c036a9165498 # v3.29.11 with: sarif_file: results.sarif category: zizmor From 82099d9e05d154277c3121675968ea20d2b182b4 Mon Sep 17 00:00:00 2001 From: Jason Cameron Date: Sat, 6 Sep 2025 22:35:19 -0400 Subject: [PATCH 2/2] fix(robots2policy): handle multiple user agents under one block (#925) --- cmd/robots2policy/main.go | 149 +++++++++++++----- cmd/robots2policy/robots2policy_test.go | 6 + cmd/robots2policy/testdata/blacklist.yaml | 6 +- cmd/robots2policy/testdata/complex.yaml | 24 +-- .../testdata/consecutive.robots.txt | 25 +++ cmd/robots2policy/testdata/consecutive.yaml | 47 ++++++ cmd/robots2policy/testdata/simple.json | 8 +- docs/docs/CHANGELOG.md | 1 + 8 files changed, 208 insertions(+), 58 deletions(-) create mode 100644 cmd/robots2policy/testdata/consecutive.robots.txt create mode 100644 cmd/robots2policy/testdata/consecutive.yaml diff --git a/cmd/robots2policy/main.go b/cmd/robots2policy/main.go index eaa4d7fe..3bb7219d 100644 --- a/cmd/robots2policy/main.go +++ b/cmd/robots2policy/main.go @@ -29,7 +29,7 @@ var ( ) type RobotsRule struct { - UserAgent string + UserAgents []string Disallows []string Allows []string CrawlDelay int @@ -130,10 +130,26 @@ func main() { } } +func createRuleFromAccumulated(userAgents, disallows, allows []string, crawlDelay int) RobotsRule { + rule := RobotsRule{ + UserAgents: make([]string, len(userAgents)), + Disallows: make([]string, len(disallows)), + Allows: make([]string, len(allows)), + CrawlDelay: crawlDelay, + } + copy(rule.UserAgents, userAgents) + copy(rule.Disallows, disallows) + copy(rule.Allows, allows) + return rule +} + func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) { scanner := bufio.NewScanner(input) var rules []RobotsRule - var currentRule *RobotsRule + var currentUserAgents []string + var currentDisallows []string + var currentAllows []string + var currentCrawlDelay int for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) @@ -154,38 +170,42 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) { switch directive { case "user-agent": - // Start a new rule section - if currentRule != nil { - rules = append(rules, *currentRule) - } - currentRule = &RobotsRule{ - UserAgent: value, - Disallows: make([]string, 0), - Allows: make([]string, 0), + // If we have accumulated rules with directives and encounter a new user-agent, + // flush the current rules + if len(currentUserAgents) > 0 && (len(currentDisallows) > 0 || len(currentAllows) > 0 || currentCrawlDelay > 0) { + rule := createRuleFromAccumulated(currentUserAgents, currentDisallows, currentAllows, currentCrawlDelay) + rules = append(rules, rule) + // Reset for next group + currentUserAgents = nil + currentDisallows = nil + currentAllows = nil + currentCrawlDelay = 0 } + currentUserAgents = append(currentUserAgents, value) case "disallow": - if currentRule != nil && value != "" { - currentRule.Disallows = append(currentRule.Disallows, value) + if len(currentUserAgents) > 0 && value != "" { + currentDisallows = append(currentDisallows, value) } case "allow": - if currentRule != nil && value != "" { - currentRule.Allows = append(currentRule.Allows, value) + if len(currentUserAgents) > 0 && value != "" { + currentAllows = append(currentAllows, value) } case "crawl-delay": - if currentRule != nil { + if len(currentUserAgents) > 0 { if delay, err := parseIntSafe(value); err == nil { - currentRule.CrawlDelay = delay + currentCrawlDelay = delay } } } } - // Don't forget the last rule - if currentRule != nil { - rules = append(rules, *currentRule) + // Don't forget the last group of rules + if len(currentUserAgents) > 0 { + rule := createRuleFromAccumulated(currentUserAgents, currentDisallows, currentAllows, currentCrawlDelay) + rules = append(rules, rule) } // Mark blacklisted user agents (those with "Disallow: /") @@ -211,10 +231,11 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule { var anubisRules []AnubisRule ruleCounter := 0 + // Process each robots rule individually for _, robotsRule := range robotsRules { - userAgent := robotsRule.UserAgent + userAgents := robotsRule.UserAgents - // Handle crawl delay as weight adjustment (do this first before any continues) + // Handle crawl delay if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 { ruleCounter++ rule := AnubisRule{ @@ -223,20 +244,32 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule { Weight: &config.Weight{Adjust: *crawlDelay}, } - if userAgent == "*" { + if len(userAgents) == 1 && userAgents[0] == "*" { rule.Expression = &config.ExpressionOrList{ All: []string{"true"}, // Always applies } - } else { + } else if len(userAgents) == 1 { rule.Expression = &config.ExpressionOrList{ - All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)}, + All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])}, + } + } else { + // Multiple user agents - use any block + var expressions []string + for _, ua := range userAgents { + if ua == "*" { + expressions = append(expressions, "true") + } else { + expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua)) + } + } + rule.Expression = &config.ExpressionOrList{ + Any: expressions, } } - anubisRules = append(anubisRules, rule) } - // Handle blacklisted user agents (complete deny/challenge) + // Handle blacklisted user agents if robotsRule.IsBlacklist { ruleCounter++ rule := AnubisRule{ @@ -244,21 +277,36 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule { Action: *userAgentDeny, } - if userAgent == "*" { - // This would block everything - convert to a weight adjustment instead - rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter) - rule.Action = "WEIGH" - rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly - rule.Expression = &config.ExpressionOrList{ - All: []string{"true"}, // Always applies + if len(userAgents) == 1 { + userAgent := userAgents[0] + if userAgent == "*" { + // This would block everything - convert to a weight adjustment instead + rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter) + rule.Action = "WEIGH" + rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly + rule.Expression = &config.ExpressionOrList{ + All: []string{"true"}, // Always applies + } + } else { + rule.Expression = &config.ExpressionOrList{ + All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)}, + } } } else { + // Multiple user agents - use any block + var expressions []string + for _, ua := range userAgents { + if ua == "*" { + expressions = append(expressions, "true") + } else { + expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua)) + } + } rule.Expression = &config.ExpressionOrList{ - All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)}, + Any: expressions, } } anubisRules = append(anubisRules, rule) - continue } // Handle specific disallow rules @@ -276,9 +324,33 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule { // Build CEL expression var conditions []string - // Add user agent condition if not wildcard - if userAgent != "*" { - conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgent)) + // Add user agent conditions + if len(userAgents) == 1 && userAgents[0] == "*" { + // Wildcard user agent - no user agent condition needed + } else if len(userAgents) == 1 { + conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0])) + } else { + // For multiple user agents, we need to use a more complex expression + // This is a limitation - we can't easily combine any for user agents with all for path + // So we'll create separate rules for each user agent + for _, ua := range userAgents { + if ua == "*" { + continue // Skip wildcard as it's handled separately + } + ruleCounter++ + subRule := AnubisRule{ + Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter), + Action: *baseAction, + Expression: &config.ExpressionOrList{ + All: []string{ + fmt.Sprintf("userAgent.contains(%q)", ua), + buildPathCondition(disallow), + }, + }, + } + anubisRules = append(anubisRules, subRule) + } + continue } // Add path condition @@ -291,7 +363,6 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule { anubisRules = append(anubisRules, rule) } - } return anubisRules diff --git a/cmd/robots2policy/robots2policy_test.go b/cmd/robots2policy/robots2policy_test.go index aa73f6b6..e9d90e62 100644 --- a/cmd/robots2policy/robots2policy_test.go +++ b/cmd/robots2policy/robots2policy_test.go @@ -78,6 +78,12 @@ func TestDataFileConversion(t *testing.T) { expectedFile: "complex.yaml", options: TestOptions{format: "yaml", crawlDelayWeight: 5}, }, + { + name: "consecutive_user_agents", + robotsFile: "consecutive.robots.txt", + expectedFile: "consecutive.yaml", + options: TestOptions{format: "yaml", crawlDelayWeight: 3}, + }, } for _, tc := range testCases { diff --git a/cmd/robots2policy/testdata/blacklist.yaml b/cmd/robots2policy/testdata/blacklist.yaml index b22f06f6..a3096f51 100644 --- a/cmd/robots2policy/testdata/blacklist.yaml +++ b/cmd/robots2policy/testdata/blacklist.yaml @@ -25,6 +25,6 @@ - action: CHALLENGE expression: all: - - userAgent.contains("Googlebot") - - path.startsWith("/search") - name: robots-txt-policy-disallow-7 \ No newline at end of file + - userAgent.contains("Googlebot") + - path.startsWith("/search") + name: robots-txt-policy-disallow-7 diff --git a/cmd/robots2policy/testdata/complex.yaml b/cmd/robots2policy/testdata/complex.yaml index 2eb0d19a..6e677ad2 100644 --- a/cmd/robots2policy/testdata/complex.yaml +++ b/cmd/robots2policy/testdata/complex.yaml @@ -20,8 +20,8 @@ - action: CHALLENGE expression: all: - - userAgent.contains("Googlebot") - - path.startsWith("/search/") + - userAgent.contains("Googlebot") + - path.startsWith("/search/") name: robots-txt-policy-disallow-6 - action: WEIGH expression: userAgent.contains("Bingbot") @@ -31,14 +31,14 @@ - action: CHALLENGE expression: all: - - userAgent.contains("Bingbot") - - path.startsWith("/search/") + - userAgent.contains("Bingbot") + - path.startsWith("/search/") name: robots-txt-policy-disallow-8 - action: CHALLENGE expression: all: - - userAgent.contains("Bingbot") - - path.startsWith("/admin/") + - userAgent.contains("Bingbot") + - path.startsWith("/admin/") name: robots-txt-policy-disallow-9 - action: DENY expression: userAgent.contains("BadBot") @@ -54,18 +54,18 @@ - action: CHALLENGE expression: all: - - userAgent.contains("TestBot") - - path.matches("^/.*/admin") + - userAgent.contains("TestBot") + - path.matches("^/.*/admin") name: robots-txt-policy-disallow-13 - action: CHALLENGE expression: all: - - userAgent.contains("TestBot") - - path.matches("^/temp.*\\.html") + - userAgent.contains("TestBot") + - path.matches("^/temp.*\\.html") name: robots-txt-policy-disallow-14 - action: CHALLENGE expression: all: - - userAgent.contains("TestBot") - - path.matches("^/file.\\.log") + - userAgent.contains("TestBot") + - path.matches("^/file.\\.log") name: robots-txt-policy-disallow-15 diff --git a/cmd/robots2policy/testdata/consecutive.robots.txt b/cmd/robots2policy/testdata/consecutive.robots.txt new file mode 100644 index 00000000..e4f6cb5b --- /dev/null +++ b/cmd/robots2policy/testdata/consecutive.robots.txt @@ -0,0 +1,25 @@ +# Test consecutive user agents that should be grouped into any: blocks +User-agent: * +Disallow: /admin +Crawl-delay: 10 + +# Multiple consecutive user agents - should be grouped +User-agent: BadBot +User-agent: SpamBot +User-agent: EvilBot +Disallow: / + +# Single user agent - should be separate +User-agent: GoodBot +Disallow: /private + +# Multiple consecutive user agents with crawl delay +User-agent: SlowBot1 +User-agent: SlowBot2 +Crawl-delay: 5 + +# Multiple consecutive user agents with specific path +User-agent: SearchBot1 +User-agent: SearchBot2 +User-agent: SearchBot3 +Disallow: /search \ No newline at end of file diff --git a/cmd/robots2policy/testdata/consecutive.yaml b/cmd/robots2policy/testdata/consecutive.yaml new file mode 100644 index 00000000..144abda0 --- /dev/null +++ b/cmd/robots2policy/testdata/consecutive.yaml @@ -0,0 +1,47 @@ +- action: WEIGH + expression: "true" + name: robots-txt-policy-crawl-delay-1 + weight: + adjust: 3 +- action: CHALLENGE + expression: path.startsWith("/admin") + name: robots-txt-policy-disallow-2 +- action: DENY + expression: + any: + - userAgent.contains("BadBot") + - userAgent.contains("SpamBot") + - userAgent.contains("EvilBot") + name: robots-txt-policy-blacklist-3 +- action: CHALLENGE + expression: + all: + - userAgent.contains("GoodBot") + - path.startsWith("/private") + name: robots-txt-policy-disallow-4 +- action: WEIGH + expression: + any: + - userAgent.contains("SlowBot1") + - userAgent.contains("SlowBot2") + name: robots-txt-policy-crawl-delay-5 + weight: + adjust: 3 +- action: CHALLENGE + expression: + all: + - userAgent.contains("SearchBot1") + - path.startsWith("/search") + name: robots-txt-policy-disallow-7 +- action: CHALLENGE + expression: + all: + - userAgent.contains("SearchBot2") + - path.startsWith("/search") + name: robots-txt-policy-disallow-8 +- action: CHALLENGE + expression: + all: + - userAgent.contains("SearchBot3") + - path.startsWith("/search") + name: robots-txt-policy-disallow-9 diff --git a/cmd/robots2policy/testdata/simple.json b/cmd/robots2policy/testdata/simple.json index 20bdf0d9..c8e1de09 100644 --- a/cmd/robots2policy/testdata/simple.json +++ b/cmd/robots2policy/testdata/simple.json @@ -1,12 +1,12 @@ [ { - "action": "CHALLENGE", "expression": "path.startsWith(\"/admin/\")", - "name": "robots-txt-policy-disallow-1" + "name": "robots-txt-policy-disallow-1", + "action": "CHALLENGE" }, { - "action": "CHALLENGE", "expression": "path.startsWith(\"/private\")", - "name": "robots-txt-policy-disallow-2" + "name": "robots-txt-policy-disallow-2", + "action": "CHALLENGE" } ] \ No newline at end of file diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index 16b15029..d9633866 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - Document missing environment variables in installation guide: `SLOG_LEVEL`, `COOKIE_PREFIX`, `FORCED_LANGUAGE`, and `TARGET_DISABLE_KEEPALIVE` ([#1086](https://github.com/TecharoHQ/anubis/pull/1086)) +- Fixed `robots2policy` to properly group consecutive user agents into `any:` instead of only processing the last one ([#925](https://github.com/TecharoHQ/anubis/pull/925))