diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh index fc7fad3013..936352d554 100755 --- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh +++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh @@ -20,6 +20,9 @@ steps: timeout_in_minutes: 60 env: HF_HUB_DISABLE_XET: "1" + # torch is linked against MKL under /usr/local/gcc133; importing torch fails without this + # (e.g. libmkl_intel_lp64.so.2: cannot open shared object file). + LD_LIBRARY_PATH: "/usr/local/gcc133/lib64:/usr/local/gcc133/lib:/usr/lib:/lib" command: - "if [ ! -f dev-tools/extract_model_ops/validate_allowlist.py ]; then echo 'validate_allowlist.py not found, skipping'; exit 0; fi" - "python3 -c \"import torch; print(f'PyTorch version: {torch.__version__}')\"" diff --git a/dev-tools/docker/linux_image/Dockerfile b/dev-tools/docker/linux_image/Dockerfile index 7099e1a6d7..ade3d4cbc7 100644 --- a/dev-tools/docker/linux_image/Dockerfile +++ b/dev-tools/docker/linux_image/Dockerfile @@ -211,6 +211,10 @@ RUN \ FROM rockylinux:8 COPY --from=builder /usr/local/gcc133 /usr/local/gcc133 +# Match the builder so dynamically loaded deps (MKL, libtorch_cpu, etc.) resolve when +# running tools under plain python3 without the full compile-time shell. +ENV LD_LIBRARY_PATH=/usr/local/gcc133/lib64:/usr/local/gcc133/lib:/usr/lib:/lib +ENV PATH=/usr/local/gcc133/bin:/usr/bin:/bin:/usr/sbin:/sbin RUN \ dnf -y update && \ dnf install -y bzip2 gcc git make unzip which zip zlib-devel findutils && \ diff --git a/dev-tools/docker/pytorch_linux_image/Dockerfile b/dev-tools/docker/pytorch_linux_image/Dockerfile index d053d7a459..80e1f08014 100644 --- a/dev-tools/docker/pytorch_linux_image/Dockerfile +++ b/dev-tools/docker/pytorch_linux_image/Dockerfile @@ -20,9 +20,11 @@ RUN dnf -y update && \ dnf config-manager --set-enabled powertools && \ dnf install -y bzip2 curl gcc gcc-c++ git libffi-devel make texinfo unzip wget which xz zip zlib-devel findutils -# Install sccache for GCS-backed compilation caching across builds +# Install sccache for GCS-backed compilation caching. Keep it under /usr/local/bin (not under +# /usr/local/gcc133) so it is not copied into the final image with the toolchain tree. ARG SCCACHE_VERSION=v0.14.0 -RUN curl -fsSL "https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl.tar.gz" \ +RUN mkdir -p /usr/local/bin && \ + curl -fsSL "https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl.tar.gz" \ | tar xz -C /usr/local/bin --strip-components=1 "sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl/sccache" && \ chmod +x /usr/local/bin/sccache @@ -34,9 +36,10 @@ ENV LDFLAGS_FOR_TARGET="-Wl,-z,relro -Wl,-z,now" ARG build_dir=/usr/src -# Update paths to use the compiler in C++17 mode +# Update paths to use the compiler in C++17 mode. /usr/local/bin is on PATH in this stage only +# so the PyTorch compile step can invoke sccache without shipping it inside /usr/local/gcc133. ENV LD_LIBRARY_PATH=/usr/local/gcc133/lib64:/usr/local/gcc133/lib:/usr/lib:/lib -ENV PATH=/usr/local/gcc133/bin:/usr/bin:/bin:/usr/sbin:/sbin +ENV PATH=/usr/local/gcc133/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin ENV CXX="g++ -std=gnu++17" # Clone PyTorch and build LibTorch