elastic · edsavage · Feb 20, 2026 · Feb 20, 2026 · Feb 27, 2026 · Feb 27, 2026
diff --git a/.buildkite/branch.json.py b/.buildkite/branch.json.py
@@ -67,6 +67,9 @@ def main():
     # Ingest step-level timings into Elasticsearch for anomaly detection
     pipeline_steps.append(pipeline_steps.generate_step("Ingest build timings",
                                                        ".buildkite/pipelines/ingest_build_timings.yml.sh"))
+    # Always add the AI failure-analysis step; the job itself exits early if the build state is "passed"
+    pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
+                                                       ".buildkite/pipelines/analyze_build_failure.yml.sh"))
 
     # Build the DRA artifacts and upload to S3 and GCS
     pipeline_steps.append(pipeline_steps.generate_step("Create daily releasable artifacts",

diff --git a/.buildkite/hooks/post-checkout b/.buildkite/hooks/post-checkout
@@ -33,6 +33,12 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then
     export ES_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/elasticsearch/ci_analytics 2>/dev/null || echo "")
   fi
 
+  if [[ "$BUILDKITE_STEP_KEY" == "analyze_build_failure" ]]; then  # secrets are exported only for the failure-analysis step; each is left empty if its Vault read fails
+    export BUILDKITE_API_READ_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/buildkite/api_read_token 2>/dev/null || echo "")
+    export ANTHROPIC_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/anthropic/claude 2>/dev/null || echo "")
+    export SLACK_WEBHOOK_URL=$(vault read -field=url secret/ci/elastic-ml-cpp/slack/build_failure_webhook 2>/dev/null || echo "")
+  fi
+
   # GCS service account — inject credentials for build and Java IT steps.
   # Build steps use it for sccache; Java IT steps use it for the Gradle
   # build cache.  The key is stored in Vault.

diff --git a/.buildkite/job-build-test-all-debug.json.py b/.buildkite/job-build-test-all-debug.json.py
@@ -24,11 +24,6 @@
     config as buildConfig,
 )
 
-env = {
-  "BUILD_SNAPSHOT": "true",
-  "VERSION_QUALIFIER": ""
-}
-
 def main():
     pipeline = {}
     pipeline_steps = step.PipelineStep([])
@@ -40,6 +35,23 @@ def main():
                                                        ".buildkite/pipelines/format_and_validation.yml.sh"))
     config = buildConfig.Config()
     config.parse()
+
+    build_step_keys = []
+    if config.build_linux and config.build_aarch64:
+        build_step_keys.append("build_test_linux-aarch64-RelWithDebInfo")
+    if config.build_linux and config.build_x86_64:
+        build_step_keys.append("build_test_linux-x86_64-RelWithDebInfo")
+    if config.build_macos and config.build_aarch64:
+        build_step_keys.append("build_test_macos-aarch64-RelWithDebInfo")
+    if config.build_windows and config.build_x86_64:
+        build_step_keys.append("build_test_Windows-x86_64-RelWithDebInfo")
+
+    env = {
+        "BUILD_SNAPSHOT": "true",
+        "VERSION_QUALIFIER": "",
+        "ML_BUILD_STEP_KEYS": ",".join(build_step_keys),
+    }
+
     if config.build_windows:
         debug_windows = pipeline_steps.generate_step_template("Windows", "debug", "", config.build_x86_64)
         pipeline_steps.append(debug_windows)
@@ -57,6 +69,9 @@ def main():
     # Ingest step-level timings into Elasticsearch for anomaly detection
     pipeline_steps.append(pipeline_steps.generate_step("Ingest build timings",
                                                        ".buildkite/pipelines/ingest_build_timings.yml.sh"))
+    # Always add the AI failure-analysis step; the job itself exits early if the build state is "passed"
+    pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
+                                                       ".buildkite/pipelines/analyze_build_failure.yml.sh"))
 
     pipeline["env"] = env
     pipeline["steps"] = pipeline_steps

diff --git a/.buildkite/ml_pipeline/config.py b/.buildkite/ml_pipeline/config.py
@@ -19,6 +19,7 @@ class Config:
     build_x86_64: str = ""
     run_qa_tests: bool = False
     run_pytorch_tests: bool = False
+    run_analyze: bool = False
     action: str = "build"
 
     def parse_comment(self):
@@ -37,7 +38,8 @@ def parse_comment(self):
             self.action = os.environ["GITHUB_PR_COMMENT_VAR_ACTION"]
             self.run_qa_tests = self.action == "run_qa_tests"
             self.run_pytorch_tests = self.action == "run_pytorch_tests"
-            if self.run_pytorch_tests or self.run_qa_tests:
+            self.run_analyze = self.action == "analyze"
+            if self.run_pytorch_tests or self.run_qa_tests or self.run_analyze:
                 self.action = "build"
 
         # If the ACTION is set to "run_qa_tests" then set some optional variables governing the ES branch to build, the

diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py
@@ -24,20 +24,31 @@
 )
 
 def main():
+    config = buildConfig.Config()
+    config.parse()
+
     pipeline = {}
     pipeline_steps = step.PipelineStep([])
+
+    # "buildkite analyze" triggers a lightweight pipeline that finds and
+    # analyzes the most recent failed build for this branch — no compilation.
+    if config.run_analyze:
+        pipeline["env"] = {"ML_ANALYZE_PREVIOUS": "true"}
+        pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
+                                                           ".buildkite/pipelines/analyze_build_failure.yml.sh"))
+        pipeline["steps"] = pipeline_steps
+        print(json.dumps(pipeline, indent=2))
+        return
+
     pipeline_steps.append(pipeline_steps.generate_step("Queue a :slack: notification for the pipeline",
                                                        ".buildkite/pipelines/send_slack_notification.sh"))
     pipeline_steps.append(pipeline_steps.generate_step("Queue a :email: notification for the pipeline",
                                                        ".buildkite/pipelines/send_email_notification.sh"))
     pipeline_steps.append(pipeline_steps.generate_step("Upload clang-format validation",
                                                        ".buildkite/pipelines/format_and_validation.yml.sh"))
-    config = buildConfig.Config()
-    config.parse()
 
-    # Compute which build step keys will exist so that analytics steps
-    # can emit a correct depends_on list (not all platforms are built
-    # for every PR, depending on labels/comments).
+    # Compute which build step keys will exist so that analytics and
+    # failure-analysis steps can emit a correct depends_on list.
     build_step_keys = []
     if config.build_linux and config.build_aarch64:
         build_step_keys.append("build_test_linux-aarch64-RelWithDebInfo")

diff --git a/.buildkite/pipelines/analyze_build_failure.yml.sh b/.buildkite/pipelines/analyze_build_failure.yml.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License
+# 2.0 and the following additional limitation. Functionality enabled by the
+# files subject to the Elastic License 2.0 may only be used in production when
+# invoked by an Elasticsearch process with a license key installed that permits
+# use of machine learning features. You may not use this file except in
+# compliance with the Elastic License 2.0 and the foregoing additional
+# limitation.
+
+# Emits, on stdout, the Buildkite step definition for the
+# "analyze_build_failure" step. When ML_ANALYZE_PREVIOUS is "true" the analysis
+# script is additionally passed --find-previous-failure.
+#
+# NOTE(review): the \$-escaped variables in the heredoc below reach the
+# generated YAML as plain $VAR references. If this output is fed to
+# "buildkite-agent pipeline upload" with interpolation enabled, they would be
+# substituted at upload time rather than at job run time — confirm that the
+# BUILDKITE_BUILD_STATE check is evaluated when intended.
+
+EXTRA_FLAGS=""
+if [ "${ML_ANALYZE_PREVIOUS:-}" = "true" ]; then
+    EXTRA_FLAGS=" --find-previous-failure"
+fi
+
+cat <<EOL
+steps:
+  - label: "Analyze build failure :mag:"
+    key: "analyze_build_failure"
+    command:
+        - |
+            set -eu
+            # Step-level if/build.state is evaluated at pipeline upload time, so it cannot
+            # reliably gate on the final build outcome. Skip at job start when the build already
+            # succeeded, except for the lightweight "find previous failure" pipeline.
+            bs="\${BUILDKITE_BUILD_STATE:-}"
+            if [ "\$bs" = "passed" ] && [ "\${ML_ANALYZE_PREVIOUS:-}" != "true" ]; then
+              echo "Build state is passed; skipping failure analysis."
+              exit 0
+            fi
+            python3 dev-tools/analyze_build_failure.py --pipeline "\$BUILDKITE_PIPELINE_SLUG" --build "\$BUILDKITE_BUILD_NUMBER"${EXTRA_FLAGS}
+EOL
+
+# Emit depends_on dynamically — ML_BUILD_STEP_KEYS and ML_TEST_STEP_KEYS are
+# comma-separated lists set by the pipeline generator (branch builds expose
+# both; PR pipelines may only set ML_BUILD_STEP_KEYS). In analyze-previous
+# mode there are no build/test steps so this block is skipped.
+DEPENDS_ON_KEYS=()
+if [ -n "${ML_BUILD_STEP_KEYS:-}" ]; then
+    IFS=',' read -ra STEP_KEYS <<< "$ML_BUILD_STEP_KEYS"
+    DEPENDS_ON_KEYS+=("${STEP_KEYS[@]}")
+fi
+if [ -n "${ML_TEST_STEP_KEYS:-}" ]; then
+    IFS=',' read -ra STEP_KEYS <<< "$ML_TEST_STEP_KEYS"
+    DEPENDS_ON_KEYS+=("${STEP_KEYS[@]}")
+fi
+if [ "${#DEPENDS_ON_KEYS[@]}" -gt 0 ]; then
+    echo '    depends_on:'
+    # Emit each key once; "seen" is a space-delimited set of keys already printed.
+    seen=" "
+    for key in "${DEPENDS_ON_KEYS[@]}"; do
+        [ -z "$key" ] && continue
+        case "$seen" in
+            *" ${key} "*) continue ;;
+        esac
+        seen+=" ${key} "
+        echo "        - \"${key}\""
+    done
+fi
+
+cat <<'EOL'
+    allow_dependency_failure: true
+    soft_fail: true
+    agents:
+      image: "python:3"
+EOL
diff --git a/.buildkite/pull-requests.json b/.buildkite/pull-requests.json
@@ -9,7 +9,7 @@
       "commit_status_context": "ml-cpp-ci",
       "build_on_commit": true,
       "build_on_comment": true,
-      "trigger_comment_regex": "^(?:(?:buildkite +)(?<action>build|debug|run_qa_tests|run_pytorch_tests)(=(?<args>(?:[^ ]+)))? *(?: for ES_BRANCH=(?<branch>([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?<version>([.0-9]+)))? *(?: *on *(?<platform>(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?<arch>(?:[, ]*aarch64|x86_64)+)?$",
+      "trigger_comment_regex": "^(?:(?:buildkite +)(?<action>build|debug|run_qa_tests|run_pytorch_tests|analyze)(=(?<args>(?:[^ ]+)))? *(?: for ES_BRANCH=(?<branch>([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?<version>([.0-9]+)))? *(?: *on *(?<platform>(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?<arch>(?:[, ]*aarch64|x86_64)+)?$",
       "always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
       "skip_ci_labels": ["skip-ci", "jenkins-ci", ">test-mute", ">docs"],
       "skip_target_branches": ["6.8", "7.11", "7.12"],

diff --git a/.github/workflows/post-build-analysis.yml b/.github/workflows/post-build-analysis.yml
@@ -0,0 +1,141 @@
+name: Post Build Failure Analysis
+
+# Triggered by commit status updates from Buildkite. When the
+# analyze_build_failure step completes, Buildkite posts a commit status
+# which fires this workflow. We fetch the AI analysis from Buildkite
+# build metadata and post it as a PR comment using the built-in
+# GITHUB_TOKEN (no PAT or GitHub App needed).
+
+on:
+  status:
+
+permissions:
+  pull-requests: write
+  statuses: read
+
+jobs:
+  post-analysis:
+    # Only run when the analyze step succeeds (soft_fail means Buildkite
+    # reports success even if the analysis itself had issues).
+    if: >-
+      github.event.state == 'success' &&
+      contains(github.event.context, 'Analyze build failure')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Find PR for commit  # maps the status event's SHA to a PR number; sets skip=true when none is found
+        id: find-pr
+        env:
+          GH_TOKEN: ${{ github.token }}
+          SHA: ${{ github.event.sha }}
+        run: |
+          PR_NUMBER=$(gh api "repos/${{ github.repository }}/commits/${SHA}/pulls" \
+            --jq '.[0].number // empty' 2>/dev/null || true)  # best-effort: only the first listed PR is used; errors are suppressed
+          if [ -z "$PR_NUMBER" ]; then
+            echo "No PR found for commit ${SHA} — skipping."
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "Found PR #${PR_NUMBER}"
+            echo "skip=false" >> "$GITHUB_OUTPUT"
+            echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Extract Buildkite build info  # parses the pipeline slug and build number out of the status target_url
+        if: steps.find-pr.outputs.skip != 'true'
+        id: bk-info
+        env:
+          TARGET_URL: ${{ github.event.target_url }}
+        run: |
+          # target_url looks like:
+          # https://buildkite.com/elastic/ml-cpp-pr-builds/builds/2361#step-key
+          # Extract pipeline slug and build number.
+          PIPELINE=$(echo "$TARGET_URL" | sed -n 's|.*/elastic/\([^/]*\)/builds/.*|\1|p')  # only matches URLs under the "elastic" org
+          BUILD_NUM=$(echo "$TARGET_URL" | sed -n 's|.*/builds/\([0-9]*\).*|\1|p')  # digits immediately after /builds/
+          if [ -z "$PIPELINE" ] || [ -z "$BUILD_NUM" ]; then
+            echo "Could not parse Buildkite URL: $TARGET_URL"
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "Pipeline: $PIPELINE, Build: $BUILD_NUM"
+            echo "skip=false" >> "$GITHUB_OUTPUT"
+            echo "pipeline=${PIPELINE}" >> "$GITHUB_OUTPUT"
+            echo "build_num=${BUILD_NUM}" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Fetch analysis from Buildkite  # writes the build-failure-analysis meta-data value to /tmp/analysis.md
+        if: >-
+          steps.find-pr.outputs.skip != 'true' &&
+          steps.bk-info.outputs.skip != 'true'
+        id: fetch
+        env:
+          BK_TOKEN: ${{ secrets.BUILDKITE_API_READ_TOKEN }}
+          PIPELINE: ${{ steps.bk-info.outputs.pipeline }}
+          BUILD_NUM: ${{ steps.bk-info.outputs.build_num }}
+        run: |
+          if [ -z "$BK_TOKEN" ]; then
+            echo "BUILDKITE_API_READ_TOKEN secret not set — skipping."
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          # Fetch build metadata containing the analysis (API returns JSON with a value field).
+          ANALYSIS_JSON=$(curl -sS -f \
+            -H "Authorization: Bearer ${BK_TOKEN}" \
+            "https://api.buildkite.com/v2/organizations/elastic/pipelines/${PIPELINE}/builds/${BUILD_NUM}/meta-data/build-failure-analysis" \
+            2>/dev/null) || true  # a failed request is tolerated; the emptiness check below handles it
+
+          if [ -z "$ANALYSIS_JSON" ]; then
+            echo "No analysis metadata found — skipping."
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          ANALYSIS=$(printf '%s' "$ANALYSIS_JSON" | jq -r \
+            'if type == "string" then . elif has("value") then .value else empty end' 2>/dev/null || true)
+          if [ -z "$ANALYSIS" ]; then  # jq produced nothing — fall back to the raw response body
+            ANALYSIS="$ANALYSIS_JSON"
+          fi
+
+          if [ -z "$ANALYSIS" ]; then  # NOTE(review): looks unreachable — ANALYSIS_JSON is non-empty here and is the fallback above
+            echo "Analysis metadata did not contain a value — skipping."
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          printf '%s\n' "$ANALYSIS" > /tmp/analysis.md
+          echo "skip=false" >> "$GITHUB_OUTPUT"
+
+      - name: Post or update PR comment  # upserts one marker-tagged comment on the PR with the fetched analysis
+        if: >-
+          steps.find-pr.outputs.skip != 'true' &&
+          steps.bk-info.outputs.skip != 'true' &&
+          steps.fetch.outputs.skip != 'true'
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUMBER: ${{ steps.find-pr.outputs.pr_number }}
+          PIPELINE: ${{ steps.bk-info.outputs.pipeline }}
+          BUILD_NUM: ${{ steps.bk-info.outputs.build_num }}
+        run: |
+          MARKER="<!-- build-failure-analysis -->"
+          BUILD_URL="https://buildkite.com/elastic/${PIPELINE}/builds/${BUILD_NUM}"
+
+          {
+            printf '%s\n\n' "$MARKER"
+            printf '## :mag: Build Failure Analysis\n\n'
+            cat /tmp/analysis.md
+            printf '\n\n---\n[View Buildkite build](%s) | *Analysis generated by Claude. Verify before acting.*\n' "$BUILD_URL"
+          } > /tmp/pr-body.md
+
+          # Find an existing marker comment to update; paginate so PRs with more than 100 comments are fully searched.
+          EXISTING_ID=$(gh api --paginate "repos/${{ github.repository }}/issues/${PR_NUMBER}/comments?per_page=100" \
+            --jq ".[] | select(.body | contains(\"${MARKER}\")) | .id" 2>/dev/null | head -1)
+
+          if [ -n "$EXISTING_ID" ]; then
+            jq -n --rawfile b /tmp/pr-body.md '{body: $b}' \
+              | gh api "repos/${{ github.repository }}/issues/comments/${EXISTING_ID}" \
+                -X PATCH --input -
+            echo "Updated existing comment on PR #${PR_NUMBER}."
+          else
+            jq -n --rawfile b /tmp/pr-body.md '{body: $b}' \
+              | gh api "repos/${{ github.repository }}/issues/${PR_NUMBER}/comments" \
+                -X POST --input -
+            echo "Posted new comment on PR #${PR_NUMBER}."
+          fi
diff --git a/catalog-info.yaml b/catalog-info.yaml
@@ -38,6 +38,8 @@ spec:
         publish_commit_status: false
         publish_commit_status_per_step: true
         trigger_mode: code
+      env:
+        ELASTIC_PR_COMMENTS_ENABLED: 'true'
       repository: elastic/ml-cpp
       skip_intermediate_builds: true
       teams: