From 773c53403df34fc2b57c07d7aceb0a315f4c919a Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Wed, 29 Apr 2026 10:50:27 +1200 Subject: [PATCH 1/5] [ML] Fix allowlist validation image when DOCKER_IMAGE is pytorch_latest PyTorch Docker nightly triggers ml-cpp-pr-builds with DOCKER_IMAGE set to ml-linux-dependency-build:pytorch_latest for compile steps. validate_pytorch_allowlist incorrectly reused that image; torch there cannot load MKL (libmkl_intel_lp64.so.2). Only use DOCKER_IMAGE for allowlist validation when it is an ml-linux-build image; otherwise keep the default ml-linux-build:34. Made-with: Cursor --- .../validate_pytorch_allowlist.yml.sh | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh index fc7fad301..c5e5cf7b7 100755 --- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh +++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh @@ -8,10 +8,20 @@ # compliance with the Elastic License 2.0 and the foregoing additional # limitation. -# Use the same Docker image as the build steps — it has Python 3.12 and -# the source-built torch package, giving exact version parity with the -# libtorch that pytorch_inference links against. -VALIDATION_IMAGE="${DOCKER_IMAGE:-docker.elastic.co/ml-dev/ml-linux-build:34}" +# Use an image that has Python 3.12, source-built torch, and MKL under +# /usr/local/gcc133 so `import torch` matches ml-cpp's libtorch linkage. +# +# Child pipelines (e.g. PyTorch Docker nightly via build_pytorch_docker_image.yml.sh) +# set DOCKER_IMAGE to ml-linux-dependency-build:pytorch_latest for *compile* agents. +# That image does not ship MKL next to torch; reusing it here reproduces +# libmkl_intel_lp64.so.2 errors. Only honour DOCKER_IMAGE when it is a ml-linux-build +# image; otherwise default to the published ml-linux-build tag. +DEFAULT_VALIDATION_IMAGE="docker.elastic.co/ml-dev/ml-linux-build:34" +if [[ -n "${DOCKER_IMAGE:-}" && "${DOCKER_IMAGE}" == *ml-linux-build* ]]; then + VALIDATION_IMAGE="${DOCKER_IMAGE}" +else + VALIDATION_IMAGE="${DEFAULT_VALIDATION_IMAGE}" +fi cat < Date: Wed, 29 Apr 2026 11:02:27 +1200 Subject: [PATCH 2/5] [ML] Stage Intel MKL into pytorch_latest image (match linux_image) Install intel-oneapi-mkl-devel-2024.0 in the builder and copy MKL lib/ into /usr/local/gcc133 before building PyTorch, mirroring dev-tools/docker/linux_image. Set LD_LIBRARY_PATH (and PATH) in the final rockylinux stage so import torch resolves MKL alongside libtorch_cpu.so in ml-linux-dependency-build:pytorch_latest. Made-with: Cursor --- .../docker/pytorch_linux_image/Dockerfile | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/dev-tools/docker/pytorch_linux_image/Dockerfile b/dev-tools/docker/pytorch_linux_image/Dockerfile index d053d7a45..a6e4915c7 100644 --- a/dev-tools/docker/pytorch_linux_image/Dockerfile +++ b/dev-tools/docker/pytorch_linux_image/Dockerfile @@ -39,6 +39,21 @@ ENV LD_LIBRARY_PATH=/usr/local/gcc133/lib64:/usr/local/gcc133/lib:/usr/lib:/lib ENV PATH=/usr/local/gcc133/bin:/usr/bin:/bin:/usr/sbin:/sbin ENV CXX="g++ -std=gnu++17" +# Install Intel MKL into gcc133 (same pattern as dev-tools/docker/linux_image/Dockerfile) so +# libtorch_cpu.so resolves MKL at runtime in the final image after COPY --from=builder. +RUN \ + echo -e '[oneAPI]\n\ +name=Intel oneAPI repository\n\ +baseurl=https://yum.repos.intel.com/oneapi\n\ +enabled=1\n\ +gpgcheck=1\n\ +repo_gpgcheck=1\n\ +gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB\n' > /etc/yum.repos.d/oneAPI.repo && \ + dnf install -y intel-oneapi-mkl-devel-2024.0 && \ + (cd /opt/intel/oneapi/mkl/2024.0 && tar cf - lib) | (cd /usr/local/gcc133 && tar xvf -) && \ + dnf clean all && \ + rm -rf /var/cache/dnf /opt/intel/oneapi/mkl/2024.0/doc /tmp/* + # Clone PyTorch and build LibTorch # PYTORCH_BUILD_VERSION is only set for tagged branches (e.g. v2.7.1); # for main/viable/strict PyTorch derives the version from version.txt. @@ -97,6 +112,10 @@ RUN --mount=type=secret,id=gcs_key \ FROM rockylinux:8 COPY --from=builder /usr/local/gcc133 /usr/local/gcc133 +# Match linux_image final stage: MKL + libtorch under gcc133; needed for `import torch` +# when running without the full compile-time shell (e.g. allowlist validation). +ENV LD_LIBRARY_PATH=/usr/local/gcc133/lib64:/usr/local/gcc133/lib:/usr/lib:/lib +ENV PATH=/usr/local/gcc133/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin # Python 3.12 + torch site-packages for allowlist validation COPY --from=builder /usr/local/bin/python3.12 /usr/local/bin/python3.12 COPY --from=builder /usr/local/bin/pip3.12 /usr/local/bin/pip3.12 From 8ee589bc535569fc09ea3a181dab4a0c38055278 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Wed, 29 Apr 2026 11:24:20 +1200 Subject: [PATCH 3/5] [ML] Always run PyTorch allowlist validation on pytorch_latest image Stop switching to ml-linux-build when DOCKER_IMAGE differs; always use ml-linux-dependency-build:pytorch_latest (optional PYTORCH_ALLOWLIST_VALIDATION_IMAGE override). Requires published pytorch_latest with MKL staged (pytorch_linux_image). Keep LD_LIBRARY_PATH on the step for older tags until the image rolls out. Made-with: Cursor --- .../validate_pytorch_allowlist.yml.sh | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh index c5e5cf7b7..467682049 100755 --- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh +++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh @@ -8,20 +8,10 @@ # compliance with the Elastic License 2.0 and the foregoing additional # limitation. -# Use an image that has Python 3.12, source-built torch, and MKL under -# /usr/local/gcc133 so `import torch` matches ml-cpp's libtorch linkage. -# -# Child pipelines (e.g. PyTorch Docker nightly via build_pytorch_docker_image.yml.sh) -# set DOCKER_IMAGE to ml-linux-dependency-build:pytorch_latest for *compile* agents. -# That image does not ship MKL next to torch; reusing it here reproduces -# libmkl_intel_lp64.so.2 errors. Only honour DOCKER_IMAGE when it is a ml-linux-build -# image; otherwise default to the published ml-linux-build tag. -DEFAULT_VALIDATION_IMAGE="docker.elastic.co/ml-dev/ml-linux-build:34" -if [[ -n "${DOCKER_IMAGE:-}" && "${DOCKER_IMAGE}" == *ml-linux-build* ]]; then - VALIDATION_IMAGE="${DOCKER_IMAGE}" -else - VALIDATION_IMAGE="${DEFAULT_VALIDATION_IMAGE}" -fi +# Always validate against the published PyTorch Linux dependency image (same tag as +# Linux compile agents: torch + MKL under /usr/local/gcc133 per dev-tools/docker/pytorch_linux_image). +# Optional override for experiments: PYTORCH_ALLOWLIST_VALIDATION_IMAGE. +VALIDATION_IMAGE="${PYTORCH_ALLOWLIST_VALIDATION_IMAGE:-docker.elastic.co/ml-dev/ml-linux-dependency-build:pytorch_latest}" cat < Date: Wed, 29 Apr 2026 11:28:43 +1200 Subject: [PATCH 4/5] [ML] Drop step LD_LIBRARY_PATH from allowlist validation pytorch_linux_image sets LD_LIBRARY_PATH in pytorch_latest; nightly republish picks it up. Made-with: Cursor --- .buildkite/pipelines/validate_pytorch_allowlist.yml.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh index 467682049..3342c684f 100755 --- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh +++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh @@ -20,8 +20,6 @@ steps: timeout_in_minutes: 60 env: HF_HUB_DISABLE_XET: "1" - # Redundant with the image ENV once pytorch_latest includes MKL; kept for older tags. - LD_LIBRARY_PATH: "/usr/local/gcc133/lib64:/usr/local/gcc133/lib:/usr/lib:/lib" command: - "if [ ! -f dev-tools/extract_model_ops/validate_allowlist.py ]; then echo 'validate_allowlist.py not found, skipping'; exit 0; fi" - "python3 -c \"import torch; print(f'PyTorch version: {torch.__version__}')\"" From b6339c4e41bceb3fafcf04768ceb94856bbf60eb Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Wed, 29 Apr 2026 11:59:01 +1200 Subject: [PATCH 5/5] [ML] Restore /usr/local/bin on PATH for PyTorch Docker builder MKL install block was added without keeping /usr/local/bin in PATH; sccache lives there so BuildKit RUN hit 'sccache: command not found' (exit 127) when GCS credentials were mounted. Made-with: Cursor --- dev-tools/docker/pytorch_linux_image/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dev-tools/docker/pytorch_linux_image/Dockerfile b/dev-tools/docker/pytorch_linux_image/Dockerfile index a6e4915c7..57999cd41 100644 --- a/dev-tools/docker/pytorch_linux_image/Dockerfile +++ b/dev-tools/docker/pytorch_linux_image/Dockerfile @@ -34,9 +34,10 @@ ENV LDFLAGS_FOR_TARGET="-Wl,-z,relro -Wl,-z,now" ARG build_dir=/usr/src -# Update paths to use the compiler in C++17 mode +# Update paths to use the compiler in C++17 mode. /usr/local/bin must stay on PATH so +# the sccache binary (installed above) is visible during the PyTorch RUN steps. ENV LD_LIBRARY_PATH=/usr/local/gcc133/lib64:/usr/local/gcc133/lib:/usr/lib:/lib -ENV PATH=/usr/local/gcc133/bin:/usr/bin:/bin:/usr/sbin:/sbin +ENV PATH=/usr/local/gcc133/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin ENV CXX="g++ -std=gnu++17" # Install Intel MKL into gcc133 (same pattern as dev-tools/docker/linux_image/Dockerfile) so