From c18f43078ac89924ed22d0c4fa1b9879499e63bf Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 23 Apr 2026 15:34:12 +1200 Subject: [PATCH 1/3] [ML] Fix PyTorch Docker build: install sccache under gcc133 bin Install the sccache release binary into /usr/local/gcc133/bin so it is on the existing PATH (no extra directories). The previous layout used /usr/local/bin without listing it on PATH, which broke the GCS-backed compile RUN with 'sccache: command not found' (exit 127). Remove sccache from the final runtime image after copying gcc133; it is only required during the builder compile step. Made-with: Cursor --- dev-tools/docker/pytorch_linux_image/Dockerfile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dev-tools/docker/pytorch_linux_image/Dockerfile b/dev-tools/docker/pytorch_linux_image/Dockerfile index d053d7a45..0cc5a8eca 100644 --- a/dev-tools/docker/pytorch_linux_image/Dockerfile +++ b/dev-tools/docker/pytorch_linux_image/Dockerfile @@ -20,11 +20,11 @@ RUN dnf -y update && \ dnf config-manager --set-enabled powertools && \ dnf install -y bzip2 curl gcc gcc-c++ git libffi-devel make texinfo unzip wget which xz zip zlib-devel findutils -# Install sccache for GCS-backed compilation caching across builds +# Install sccache for GCS-backed compilation caching across builds (same dir as the pinned GCC toolchain) ARG SCCACHE_VERSION=v0.14.0 RUN curl -fsSL "https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl.tar.gz" \ - | tar xz -C /usr/local/bin --strip-components=1 "sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl/sccache" && \ - chmod +x /usr/local/bin/sccache + | tar xz -C /usr/local/gcc133/bin --strip-components=1 "sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl/sccache" && \ + chmod +x /usr/local/gcc133/bin/sccache # For compiling with hardening and optimisation ENV CFLAGS="-g -O3 -fstack-protector -D_FORTIFY_SOURCE=2 -msse4.2 -mfpmath=sse" @@ -97,6 +97,7 @@ RUN --mount=type=secret,id=gcs_key \ FROM rockylinux:8 COPY --from=builder /usr/local/gcc133 /usr/local/gcc133 +RUN rm -f /usr/local/gcc133/bin/sccache # Python 3.12 + torch site-packages for allowlist validation COPY --from=builder /usr/local/bin/python3.12 /usr/local/bin/python3.12 COPY --from=builder /usr/local/bin/pip3.12 /usr/local/bin/pip3.12 From f3ec7da955728b888cf9409462a6e1009a5816e4 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 24 Apr 2026 10:43:25 +1200 Subject: [PATCH 2/3] [ML] Fix allowlist validation: LD_LIBRARY_PATH for MKL-linked torch PyTorch in ml-linux-build is linked against MKL under /usr/local/gcc133 but the final image stage did not export LD_LIBRARY_PATH, so import torch failed in CI (libmkl_intel_lp64.so.2 not found). Set LD_LIBRARY_PATH on the validate_pytorch_allowlist Buildkite step for existing agents, and bake the same env into linux_image for future image releases. Made-with: Cursor --- .buildkite/pipelines/validate_pytorch_allowlist.yml.sh | 3 +++ dev-tools/docker/linux_image/Dockerfile | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh index fc7fad301..936352d55 100755 --- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh +++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh @@ -20,6 +20,9 @@ steps: timeout_in_minutes: 60 env: HF_HUB_DISABLE_XET: "1" + # torch is linked against MKL under /usr/local/gcc133; importing torch fails without this + # (e.g. libmkl_intel_lp64.so.2: cannot open shared object file). + LD_LIBRARY_PATH: "/usr/local/gcc133/lib64:/usr/local/gcc133/lib:/usr/lib:/lib" command: - "if [ ! -f dev-tools/extract_model_ops/validate_allowlist.py ]; then echo 'validate_allowlist.py not found, skipping'; exit 0; fi" - "python3 -c \"import torch; print(f'PyTorch version: {torch.__version__}')\"" diff --git a/dev-tools/docker/linux_image/Dockerfile b/dev-tools/docker/linux_image/Dockerfile index 7099e1a6d..ade3d4cbc 100644 --- a/dev-tools/docker/linux_image/Dockerfile +++ b/dev-tools/docker/linux_image/Dockerfile @@ -211,6 +211,10 @@ RUN \ FROM rockylinux:8 COPY --from=builder /usr/local/gcc133 /usr/local/gcc133 +# Match the builder so dynamically loaded deps (MKL, libtorch_cpu, etc.) resolve when +# running tools under plain python3 without the full compile-time shell. +ENV LD_LIBRARY_PATH=/usr/local/gcc133/lib64:/usr/local/gcc133/lib:/usr/lib:/lib +ENV PATH=/usr/local/gcc133/bin:/usr/bin:/bin:/usr/sbin:/sbin RUN \ dnf -y update && \ dnf install -y bzip2 gcc git make unzip which zip zlib-devel findutils && \ From 581a9c3dea5f4bc68961c4af80497a41e9625315 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Tue, 28 Apr 2026 13:28:08 +1200 Subject: [PATCH 3/3] [ML] PyTorch Docker: install sccache under /usr/local/bin (builder only) Place sccache in /usr/local/bin and add that directory to PATH only in the builder stage so COPY --from=builder /usr/local/gcc133 no longer carries sccache into the runtime image. Removes the post-COPY rm workaround and avoids leaving sccache bytes in a gcc133 layer (per review feedback). Made-with: Cursor --- dev-tools/docker/pytorch_linux_image/Dockerfile | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/dev-tools/docker/pytorch_linux_image/Dockerfile b/dev-tools/docker/pytorch_linux_image/Dockerfile index 0cc5a8eca..80e1f0801 100644 --- a/dev-tools/docker/pytorch_linux_image/Dockerfile +++ b/dev-tools/docker/pytorch_linux_image/Dockerfile @@ -20,11 +20,13 @@ RUN dnf -y update && \ dnf config-manager --set-enabled powertools && \ dnf install -y bzip2 curl gcc gcc-c++ git libffi-devel make texinfo unzip wget which xz zip zlib-devel findutils -# Install sccache for GCS-backed compilation caching across builds (same dir as the pinned GCC toolchain) +# Install sccache for GCS-backed compilation caching. Keep it under /usr/local/bin (not under +# /usr/local/gcc133) so it is not copied into the final image with the toolchain tree. ARG SCCACHE_VERSION=v0.14.0 -RUN curl -fsSL "https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl.tar.gz" \ - | tar xz -C /usr/local/gcc133/bin --strip-components=1 "sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl/sccache" && \ - chmod +x /usr/local/gcc133/bin/sccache +RUN mkdir -p /usr/local/bin && \ + curl -fsSL "https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl.tar.gz" \ + | tar xz -C /usr/local/bin --strip-components=1 "sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl/sccache" && \ + chmod +x /usr/local/bin/sccache # For compiling with hardening and optimisation ENV CFLAGS="-g -O3 -fstack-protector -D_FORTIFY_SOURCE=2 -msse4.2 -mfpmath=sse" @@ -34,9 +36,10 @@ ENV LDFLAGS_FOR_TARGET="-Wl,-z,relro -Wl,-z,now" ARG build_dir=/usr/src -# Update paths to use the compiler in C++17 mode +# Update paths to use the compiler in C++17 mode. /usr/local/bin is on PATH in this stage only +# so the PyTorch compile step can invoke sccache without shipping it inside /usr/local/gcc133. ENV LD_LIBRARY_PATH=/usr/local/gcc133/lib64:/usr/local/gcc133/lib:/usr/lib:/lib -ENV PATH=/usr/local/gcc133/bin:/usr/bin:/bin:/usr/sbin:/sbin +ENV PATH=/usr/local/gcc133/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin ENV CXX="g++ -std=gnu++17" # Clone PyTorch and build LibTorch @@ -97,7 +100,6 @@ RUN --mount=type=secret,id=gcs_key \ FROM rockylinux:8 COPY --from=builder /usr/local/gcc133 /usr/local/gcc133 -RUN rm -f /usr/local/gcc133/bin/sccache # Python 3.12 + torch site-packages for allowlist validation COPY --from=builder /usr/local/bin/python3.12 /usr/local/bin/python3.12 COPY --from=builder /usr/local/bin/pip3.12 /usr/local/bin/pip3.12