From 773c53403df34fc2b57c07d7aceb0a315f4c919a Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Wed, 29 Apr 2026 10:50:27 +1200
Subject: [PATCH 1/5] [ML] Fix allowlist validation image when DOCKER_IMAGE is
 pytorch_latest

The PyTorch Docker nightly triggers ml-cpp-pr-builds with DOCKER_IMAGE set to
ml-linux-dependency-build:pytorch_latest for the compile steps. The
validate_pytorch_allowlist step incorrectly reused that image, in which
importing torch fails because MKL (libmkl_intel_lp64.so.2) cannot be loaded.

Only use DOCKER_IMAGE for allowlist validation when it is an ml-linux-build image;
otherwise keep the default ml-linux-build:34.

Made-with: Cursor
---
 .../validate_pytorch_allowlist.yml.sh          | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)
diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
index fc7fad301..c5e5cf7b7 100755
--- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
+++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
@@ -8,10 +8,20 @@
 # compliance with the Elastic License 2.0 and the foregoing additional
 # limitation.
 
-# Use the same Docker image as the build steps — it has Python 3.12 and
-# the source-built torch package, giving exact version parity with the
-# libtorch that pytorch_inference links against.
-VALIDATION_IMAGE="${DOCKER_IMAGE:-docker.elastic.co/ml-dev/ml-linux-build:34}"
+# Use an image that has Python 3.12, source-built torch, and MKL under
+# /usr/local/gcc133 so `import torch` matches ml-cpp's libtorch linkage.
+#
+# Child pipelines (e.g. PyTorch Docker nightly via build_pytorch_docker_image.yml.sh)
+# set DOCKER_IMAGE to ml-linux-dependency-build:pytorch_latest for *compile* agents.
+# That image does not ship MKL next to torch; reusing it here reproduces
+# libmkl_intel_lp64.so.2 errors. Only honour DOCKER_IMAGE when it is a ml-linux-build
+# image; otherwise default to the published ml-linux-build tag.
+DEFAULT_VALIDATION_IMAGE="docker.elastic.co/ml-dev/ml-linux-build:34"
+if [[ -n "${DOCKER_IMAGE:-}" && "${DOCKER_IMAGE}" == *ml-linux-build* ]]; then
+  VALIDATION_IMAGE="${DOCKER_IMAGE}"
+else
+  VALIDATION_IMAGE="${DEFAULT_VALIDATION_IMAGE}"
+fi
 
 cat <<EOL
 steps:

From 24b3781fc6120370bb23f2816ab7d5693c195827 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Wed, 29 Apr 2026 11:02:27 +1200
Subject: [PATCH 2/5] [ML] Stage Intel MKL into pytorch_latest image (match
 linux_image)

Install intel-oneapi-mkl-devel-2024.0 in the builder and copy MKL lib/ into
/usr/local/gcc133 before building PyTorch, mirroring dev-tools/docker/linux_image.

Set LD_LIBRARY_PATH (and PATH) in the final rockylinux stage so import torch
resolves MKL alongside libtorch_cpu.so in ml-linux-dependency-build:pytorch_latest.

Made-with: Cursor
---
 .../docker/pytorch_linux_image/Dockerfile     | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/dev-tools/docker/pytorch_linux_image/Dockerfile b/dev-tools/docker/pytorch_linux_image/Dockerfile
index d053d7a45..a6e4915c7 100644
--- a/dev-tools/docker/pytorch_linux_image/Dockerfile
+++ b/dev-tools/docker/pytorch_linux_image/Dockerfile
@@ -39,6 +39,21 @@ ENV LD_LIBRARY_PATH=/usr/local/gcc133/lib64:/usr/local/gcc133/lib:/usr/lib:/lib
 ENV PATH=/usr/local/gcc133/bin:/usr/bin:/bin:/usr/sbin:/sbin
 ENV CXX="g++ -std=gnu++17"
 
+# Install Intel MKL into gcc133 (same pattern as dev-tools/docker/linux_image/Dockerfile) so
+# libtorch_cpu.so resolves MKL at runtime in the final image after COPY --from=builder.
+RUN \
+  echo -e '[oneAPI]\n\
+name=Intel oneAPI repository\n\
+baseurl=https://yum.repos.intel.com/oneapi\n\
+enabled=1\n\
+gpgcheck=1\n\
+repo_gpgcheck=1\n\
+gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB\n' > /etc/yum.repos.d/oneAPI.repo && \
+  dnf install -y intel-oneapi-mkl-devel-2024.0 && \
+  (cd /opt/intel/oneapi/mkl/2024.0 && tar cf - lib) | (cd /usr/local/gcc133 && tar xvf -) && \
+  dnf clean all && \
+  rm -rf /var/cache/dnf /opt/intel/oneapi/mkl/2024.0/doc /tmp/*
+
 # Clone PyTorch and build LibTorch
 # PYTORCH_BUILD_VERSION is only set for tagged branches (e.g. v2.7.1);
 # for main/viable/strict PyTorch derives the version from version.txt.
@@ -97,6 +112,10 @@ RUN --mount=type=secret,id=gcs_key \
 
 FROM rockylinux:8
 COPY --from=builder /usr/local/gcc133 /usr/local/gcc133
+# Match linux_image final stage: MKL + libtorch under gcc133; needed for `import torch`
+# when running without the full compile-time shell (e.g. allowlist validation).
+ENV LD_LIBRARY_PATH=/usr/local/gcc133/lib64:/usr/local/gcc133/lib:/usr/lib:/lib
+ENV PATH=/usr/local/gcc133/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin
 # Python 3.12 + torch site-packages for allowlist validation
 COPY --from=builder /usr/local/bin/python3.12 /usr/local/bin/python3.12
 COPY --from=builder /usr/local/bin/pip3.12 /usr/local/bin/pip3.12

From 8ee589bc535569fc09ea3a181dab4a0c38055278 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Wed, 29 Apr 2026 11:24:20 +1200
Subject: [PATCH 3/5] [ML] Always run PyTorch allowlist validation on
 pytorch_latest image

Stop switching to ml-linux-build when DOCKER_IMAGE differs; always use
ml-linux-dependency-build:pytorch_latest (optional PYTORCH_ALLOWLIST_VALIDATION_IMAGE
override). Requires published pytorch_latest with MKL staged (pytorch_linux_image).

Keep LD_LIBRARY_PATH on the step for older tags until the image rolls out.

Made-with: Cursor
---
 .../validate_pytorch_allowlist.yml.sh         | 20 ++++++-------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
index c5e5cf7b7..467682049 100755
--- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
+++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
@@ -8,20 +8,10 @@
 # compliance with the Elastic License 2.0 and the foregoing additional
 # limitation.
 
-# Use an image that has Python 3.12, source-built torch, and MKL under
-# /usr/local/gcc133 so `import torch` matches ml-cpp's libtorch linkage.
-#
-# Child pipelines (e.g. PyTorch Docker nightly via build_pytorch_docker_image.yml.sh)
-# set DOCKER_IMAGE to ml-linux-dependency-build:pytorch_latest for *compile* agents.
-# That image does not ship MKL next to torch; reusing it here reproduces
-# libmkl_intel_lp64.so.2 errors. Only honour DOCKER_IMAGE when it is a ml-linux-build
-# image; otherwise default to the published ml-linux-build tag.
-DEFAULT_VALIDATION_IMAGE="docker.elastic.co/ml-dev/ml-linux-build:34"
-if [[ -n "${DOCKER_IMAGE:-}" && "${DOCKER_IMAGE}" == *ml-linux-build* ]]; then
-  VALIDATION_IMAGE="${DOCKER_IMAGE}"
-else
-  VALIDATION_IMAGE="${DEFAULT_VALIDATION_IMAGE}"
-fi
+# Always validate against the published PyTorch Linux dependency image (same tag as
+# Linux compile agents: torch + MKL under /usr/local/gcc133 per dev-tools/docker/pytorch_linux_image).
+# Optional override for experiments: PYTORCH_ALLOWLIST_VALIDATION_IMAGE.
+VALIDATION_IMAGE="${PYTORCH_ALLOWLIST_VALIDATION_IMAGE:-docker.elastic.co/ml-dev/ml-linux-dependency-build:pytorch_latest}"
 
 cat <<EOL
 steps:
@@ -30,6 +20,8 @@ steps:
     timeout_in_minutes: 60
     env:
         HF_HUB_DISABLE_XET: "1"
+        # Redundant with the image ENV once pytorch_latest includes MKL; kept for older tags.
+        LD_LIBRARY_PATH: "/usr/local/gcc133/lib64:/usr/local/gcc133/lib:/usr/lib:/lib"
     command:
         - "if [ ! -f dev-tools/extract_model_ops/validate_allowlist.py ]; then echo 'validate_allowlist.py not found, skipping'; exit 0; fi"
         - "python3 -c \"import torch; print(f'PyTorch version: {torch.__version__}')\""

From be5aa34076ad8bfe2350ae4ebb2448a2a37f5a2c Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Wed, 29 Apr 2026 11:28:43 +1200
Subject: [PATCH 4/5] [ML] Drop step LD_LIBRARY_PATH from allowlist validation

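The pytorch_linux_image Dockerfile now sets LD_LIBRARY_PATH in the final stage
of the pytorch_latest image, so the step-level setting is redundant once the
nightly republish picks that change up.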

Made-with: Cursor
---
 .buildkite/pipelines/validate_pytorch_allowlist.yml.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
index 467682049..3342c684f 100755
--- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
+++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
@@ -20,8 +20,6 @@ steps:
     timeout_in_minutes: 60
     env:
         HF_HUB_DISABLE_XET: "1"
-        # Redundant with the image ENV once pytorch_latest includes MKL; kept for older tags.
-        LD_LIBRARY_PATH: "/usr/local/gcc133/lib64:/usr/local/gcc133/lib:/usr/lib:/lib"
     command:
         - "if [ ! -f dev-tools/extract_model_ops/validate_allowlist.py ]; then echo 'validate_allowlist.py not found, skipping'; exit 0; fi"
         - "python3 -c \"import torch; print(f'PyTorch version: {torch.__version__}')\""

From b6339c4e41bceb3fafcf04768ceb94856bbf60eb Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Wed, 29 Apr 2026 11:59:01 +1200
Subject: [PATCH 5/5] [ML] Restore /usr/local/bin on PATH for PyTorch Docker
 builder

The MKL install block was added without keeping /usr/local/bin in PATH. sccache
lives there, so the BuildKit RUN step failed with 'sccache: command not found'
(exit 127) when GCS credentials were mounted.

Made-with: Cursor
---
 dev-tools/docker/pytorch_linux_image/Dockerfile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dev-tools/docker/pytorch_linux_image/Dockerfile b/dev-tools/docker/pytorch_linux_image/Dockerfile
index a6e4915c7..57999cd41 100644
--- a/dev-tools/docker/pytorch_linux_image/Dockerfile
+++ b/dev-tools/docker/pytorch_linux_image/Dockerfile
@@ -34,9 +34,10 @@ ENV LDFLAGS_FOR_TARGET="-Wl,-z,relro -Wl,-z,now"
 
 ARG build_dir=/usr/src
 
-# Update paths to use the compiler in C++17 mode
+# Update paths to use the compiler in C++17 mode. /usr/local/bin must stay on PATH so
+# the sccache binary (installed above) is visible during the PyTorch RUN steps.
 ENV LD_LIBRARY_PATH=/usr/local/gcc133/lib64:/usr/local/gcc133/lib:/usr/lib:/lib
-ENV PATH=/usr/local/gcc133/bin:/usr/bin:/bin:/usr/sbin:/sbin
+ENV PATH=/usr/local/gcc133/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin
 ENV CXX="g++ -std=gnu++17"
 
 # Install Intel MKL into gcc133 (same pattern as dev-tools/docker/linux_image/Dockerfile) so