-
Notifications
You must be signed in to change notification settings - Fork 66
[ML] Fix PyTorch allowlist validation timeout on HF download stall #3022
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
bb8106b
a5ee643
e855f66
4379a65
f56b354
1765007
8353956
feec352
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -18,7 +18,13 @@ LABEL maintainer="Ed Savage <ed.savage@elastic.co>" | |||||||||||||||||
| RUN dnf -y update && \ | ||||||||||||||||||
| dnf install -y dnf-plugins-core && \ | ||||||||||||||||||
| dnf config-manager --set-enabled powertools && \ | ||||||||||||||||||
| dnf install -y bzip2 gcc gcc-c++ git libffi-devel make texinfo unzip wget which xz zip zlib-devel findutils | ||||||||||||||||||
| dnf install -y bzip2 curl gcc gcc-c++ git libffi-devel make texinfo unzip wget which xz zip zlib-devel findutils | ||||||||||||||||||
|
|
||||||||||||||||||
| # Install sccache for GCS-backed compilation caching across builds | ||||||||||||||||||
| ARG SCCACHE_VERSION=v0.14.0 | ||||||||||||||||||
| RUN curl -fsSL "https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl.tar.gz" \ | ||||||||||||||||||
| | tar xz -C /usr/local/bin --strip-components=1 "sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl/sccache" && \ | ||||||||||||||||||
| chmod +x /usr/local/bin/sccache | ||||||||||||||||||
|
Comment on lines
+23
to
+27
|
||||||||||||||||||
|
|
||||||||||||||||||
| # For compiling with hardening and optimisation | ||||||||||||||||||
| ENV CFLAGS="-g -O3 -fstack-protector -D_FORTIFY_SOURCE=2 -msse4.2 -mfpmath=sse" | ||||||||||||||||||
|
|
@@ -37,17 +43,24 @@ ENV CXX="g++ -std=gnu++17" | |||||||||||||||||
| # PYTORCH_BUILD_VERSION is only set for tagged branches (e.g. v2.7.1); | ||||||||||||||||||
| # for main/viable/strict PyTorch derives the version from version.txt. | ||||||||||||||||||
| ARG pytorch_branch=viable/strict | ||||||||||||||||||
| ARG pytorch_commit=unknown | ||||||||||||||||||
|
|
||||||||||||||||||
| ENV PYTORCH_BRANCH=${pytorch_branch} | ||||||||||||||||||
|
|
||||||||||||||||||
| # Split the build into clone + build so sccache (GCS-backed) can accelerate | ||||||||||||||||||
| # incremental rebuilds. The GCS key is passed via --mount=type=secret to | ||||||||||||||||||
| # avoid baking credentials into the image. | ||||||||||||||||||
| RUN \ | ||||||||||||||||||
| cd ${build_dir} && \ | ||||||||||||||||||
| git -c advice.detachedHead=false clone --depth=1 --branch=${PYTORCH_BRANCH} https://github.com/pytorch/pytorch.git && \ | ||||||||||||||||||
| cd pytorch && \ | ||||||||||||||||||
| git submodule sync && \ | ||||||||||||||||||
| git submodule update --init --recursive && \ | ||||||||||||||||||
| sed -i -e 's/system(/strlen(/' torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp && \ | ||||||||||||||||||
| sed -i -e '104 i set(PYTHON_EXECUTABLE "/usr/local/bin/python3.12")' ./third_party/onnx/CMakeLists.txt && \ | ||||||||||||||||||
| sed -i -e '104 i set(PYTHON_EXECUTABLE "/usr/local/bin/python3.12")' ./third_party/onnx/CMakeLists.txt | ||||||||||||||||||
|
|
||||||||||||||||||
| RUN --mount=type=secret,id=gcs_key \ | ||||||||||||||||||
| cd ${build_dir}/pytorch && \ | ||||||||||||||||||
| if [[ "$PYTORCH_BRANCH" != "main" && "$PYTORCH_BRANCH" != "viable/strict" ]]; then export PYTORCH_BUILD_VERSION=$(expr "$PYTORCH_BRANCH" : 'v\(.*\)'); fi && \ | ||||||||||||||||||
| export BLAS=MKL && \ | ||||||||||||||||||
| export BUILD_TEST=OFF && \ | ||||||||||||||||||
|
|
@@ -60,8 +73,21 @@ RUN \ | |||||||||||||||||
| export USE_XNNPACK=OFF && \ | ||||||||||||||||||
| export PYTORCH_BUILD_NUMBER=1 && \ | ||||||||||||||||||
| export MAX_JOBS=10 && \ | ||||||||||||||||||
| if [ -f /run/secrets/gcs_key ]; then \ | ||||||||||||||||||
| export SCCACHE_GCS_BUCKET=elastic-ml-cpp-sccache && \ | ||||||||||||||||||
| export SCCACHE_GCS_KEY_PREFIX=pytorch-build && \ | ||||||||||||||||||
| export SCCACHE_GCS_RW_MODE=READ_WRITE && \ | ||||||||||||||||||
| export SCCACHE_GCS_KEY_PATH=/run/secrets/gcs_key && \ | ||||||||||||||||||
| export CMAKE_C_COMPILER_LAUNCHER=sccache && \ | ||||||||||||||||||
| export CMAKE_CXX_COMPILER_LAUNCHER=sccache && \ | ||||||||||||||||||
| sccache --start-server && \ | ||||||||||||||||||
| echo "sccache: using GCS backend (bucket=$SCCACHE_GCS_BUCKET prefix=$SCCACHE_GCS_KEY_PREFIX)"; \ | ||||||||||||||||||
| else \ | ||||||||||||||||||
| echo "sccache: no GCS credentials — building without cache"; \ | ||||||||||||||||||
| fi && \ | ||||||||||||||||||
| /usr/local/bin/python3.12 -m pip install --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --trusted-host pypi.org -r requirements.txt && \ | ||||||||||||||||||
| /usr/local/bin/python3.12 setup.py install && \ | ||||||||||||||||||
| if command -v sccache &>/dev/null && sccache --show-stats &>/dev/null; then sccache --show-stats; sccache --stop-server; fi && \ | ||||||||||||||||||
| mkdir -p /usr/local/gcc133/include/pytorch && \ | ||||||||||||||||||
| /bin/cp -rf torch/include/* /usr/local/gcc133/include/pytorch/ && \ | ||||||||||||||||||
| /bin/cp -f torch/lib/libtorch_cpu.so /usr/local/gcc133/lib && \ | ||||||||||||||||||
|
|
@@ -71,8 +97,19 @@ RUN \ | |||||||||||||||||
|
|
||||||||||||||||||
| FROM rockylinux:8 | ||||||||||||||||||
| COPY --from=builder /usr/local/gcc133 /usr/local/gcc133 | ||||||||||||||||||
| # Python 3.12 + torch site-packages for allowlist validation | ||||||||||||||||||
| COPY --from=builder /usr/local/bin/python3.12 /usr/local/bin/python3.12 | ||||||||||||||||||
| COPY --from=builder /usr/local/bin/pip3.12 /usr/local/bin/pip3.12 | ||||||||||||||||||
| COPY --from=builder /usr/local/lib/python3.12 /usr/local/lib/python3.12 | ||||||||||||||||||
| COPY --from=builder /usr/local/lib/libpython3.12.a /usr/local/lib/libpython3.12.a | ||||||||||||||||||
| RUN ln -sf /usr/local/bin/python3.12 /usr/local/bin/python3 && \ | ||||||||||||||||||
| ln -sf /usr/local/bin/pip3.12 /usr/local/bin/pip3 | ||||||||||||||||||
|
Comment on lines
+105
to
+106
|
||||||||||||||||||
| RUN ln -sf /usr/local/bin/python3.12 /usr/local/bin/python3 && \ | |
| ln -sf /usr/local/bin/pip3.12 /usr/local/bin/pip3 | |
| COPY --from=builder /usr/local/lib/*.so* /usr/local/lib/ | |
| COPY --from=builder /usr/local/lib64 /usr/local/lib64 | |
| RUN ln -sf /usr/local/bin/python3.12 /usr/local/bin/python3 && \ | |
| ln -sf /usr/local/bin/pip3.12 /usr/local/bin/pip3 && \ | |
| ldconfig && \ | |
| python3.12 -c 'import ssl' |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| torch==2.7.1 | ||
| transformers>=4.40.0 | ||
| transformers>=4.40.0,<5.0.0 | ||
| sentencepiece>=0.2.0 | ||
| protobuf>=5.0.0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This step assumes the agent image provides `python3` and `pip3`. The current `dev-tools/docker/linux_image/Dockerfile` final stage only copies `/usr/local/gcc133` and does not include Python, so `ml-linux-build:34` may not have these binaries. Consider invoking `/usr/local/bin/python3.12` + `/usr/local/bin/pip3.12` explicitly, or ensure the image guarantees `python3`/`pip3` symlinks.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No change needed: the Dockerfile already creates the `python3`/`pip3` symlinks (lines 105-106).