diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile index 3ff79e1c8..a542b99e0 100644 --- a/.ci/images/ascend/Dockerfile +++ b/.ci/images/ascend/Dockerfile @@ -18,4 +18,12 @@ RUN pip install --no-cache-dir --progress off \ pytest-xdist \ ruff +ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest +ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64:${LD_LIBRARY_PATH} +ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH} +ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH} +ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME} +ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp +ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit + WORKDIR /workspace diff --git a/.ci/run.py b/.ci/run.py index 7330d9694..e293b4a28 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -8,6 +8,7 @@ import subprocess import sys import uuid +import xml.etree.ElementTree as ET from datetime import datetime from pathlib import Path @@ -24,6 +25,42 @@ _PYTEST_VALUE_FLAGS = {"-n", "-k", "-m", "-p", "--tb", "--junitxml", "--rootdir"} +def _junit_xml_indicates_pass(results_dir): + """Return True if `pytest` junit XML under `results_dir` reports no failures/errors. + + Used to distinguish a real CI failure from the docker 18.09 + container-teardown `SIGKILL` (exit code 137) that occurs on this host + after a child process exits successfully — bash returns 0 from inside + the container, but the docker daemon reports 137 due to a race in its + `--rm` cleanup path. The junit XML is written by pytest before that + teardown and reliably captures the real outcome of the test stage. + """ + for junit in Path(results_dir).rglob("test-results.xml"): + try: + root = ET.parse(junit).getroot() + except ET.ParseError: + continue + + suites = root.findall("testsuite") if root.tag == "testsuites" else [root] + + if not suites: + continue + + for suite in suites: + try: + if int(suite.get("failures", 0)) > 0: + return False + + if int(suite.get("errors", 0)) > 0: + return False + except ValueError: + return False + + return True + + return False + + def apply_test_override(run_cmd, test_path): """Replace positional test path(s) in a pytest stage command. @@ -437,8 +474,23 @@ def main(): pool.release(allocated_ids) if returncode != 0: - print(f"job {job_name} failed (exit code {returncode})", file=sys.stderr) - failed += 1 + # Docker 18.09 on this host occasionally SIGKILLs containers + # during `--rm` cleanup after the inner process already exited + # cleanly, producing exit code 137. Fall back to the pytest + # junit XML to recover the real outcome in that case. + if returncode == 137 and _junit_xml_indicates_pass(results_dir): + print( + f"[warn] job {job_name}: container exited with 137 " + f"(likely docker teardown SIGKILL after clean pytest); " + f"junit XML reports no failures — treating as success", + file=sys.stderr, + ) + else: + print( + f"job {job_name} failed (exit code {returncode})", + file=sys.stderr, + ) + failed += 1 sys.exit(1 if failed else 0) diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000..63af38e3b --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,81 @@ +--- +# Google C++ Style Guide enforcement. +# +# `clang-format` (via `.clang-format`) handles formatting; this file handles +# everything `clang-format` cannot see — primarily identifier naming. +# +# Usage: +# clang-tidy -p build/ --quiet (needs `compile_commands.json`) +# +# The project's scikit-build config sets `CMAKE_EXPORT_COMPILE_COMMANDS=ON`, +# so `pip install -e .[dev]` produces `build*/compile_commands.json`. +# +# Subdirectories may ship their own `.clang-tidy` with +# `InheritParentConfig: true` to override specific rules (e.g. +# `src/ascend/custom/.clang-tidy` relaxes `FunctionCase` for `extern "C"` +# `AscendC` kernel entries whose symbol names are dictated by +# `ascendc_add_operator(OP_NAME …)`). + +Checks: > + readability-identifier-naming, + google-explicit-constructor, + google-readability-braces-around-statements, + google-readability-casting, + modernize-use-nullptr, + modernize-use-override + +# Only naming violations fail the lint — the rest are advisory while the +# project ramps up on `clang-tidy`. +WarningsAsErrors: 'readability-identifier-naming.*' + +HeaderFilterRegex: '^(src|tests)/.*\.(h|hpp|cuh)$' + +CheckOptions: + # Types. + - {key: readability-identifier-naming.ClassCase, value: CamelCase} + - {key: readability-identifier-naming.StructCase, value: CamelCase} + - {key: readability-identifier-naming.UnionCase, value: CamelCase} + - {key: readability-identifier-naming.EnumCase, value: CamelCase} + - {key: readability-identifier-naming.TypeAliasCase, value: CamelCase} + - {key: readability-identifier-naming.TypedefCase, value: CamelCase} + - {key: readability-identifier-naming.TypeTemplateParameterCase, value: CamelCase} + + # Enumerators and constants use `k` + PascalCase. + - {key: readability-identifier-naming.EnumConstantCase, value: CamelCase} + - {key: readability-identifier-naming.EnumConstantPrefix, value: 'k'} + - {key: readability-identifier-naming.ConstexprVariableCase, value: CamelCase} + - {key: readability-identifier-naming.ConstexprVariablePrefix, value: 'k'} + - {key: readability-identifier-naming.GlobalConstantCase, value: CamelCase} + - {key: readability-identifier-naming.GlobalConstantPrefix, value: 'k'} + - {key: readability-identifier-naming.StaticConstantCase, value: CamelCase} + - {key: readability-identifier-naming.StaticConstantPrefix, value: 'k'} + + # Functions and methods use PascalCase. + - {key: readability-identifier-naming.FunctionCase, value: CamelCase} + - {key: readability-identifier-naming.MethodCase, value: CamelCase} + + # Variables and parameters use snake_case. + - {key: readability-identifier-naming.VariableCase, value: lower_case} + - {key: readability-identifier-naming.ParameterCase, value: lower_case} + - {key: readability-identifier-naming.LocalVariableCase, value: lower_case} + + # Class data members use snake_case with a trailing `_`. + - {key: readability-identifier-naming.PrivateMemberCase, value: lower_case} + - {key: readability-identifier-naming.PrivateMemberSuffix, value: '_'} + - {key: readability-identifier-naming.ProtectedMemberCase, value: lower_case} + - {key: readability-identifier-naming.ProtectedMemberSuffix, value: '_'} + - {key: readability-identifier-naming.PublicMemberCase, value: lower_case} + + # Macros use UPPER_CASE. + - {key: readability-identifier-naming.MacroDefinitionCase, value: UPPER_CASE} + # Include guards end in a trailing `_` (e.g. `INFINI_OPS_FOO_H_`), which + # the default `UPPER_CASE` style rejects — skip them by regex. + - {key: readability-identifier-naming.MacroDefinitionIgnoredRegexp, value: '^INFINI_OPS_.*_H_$'} + + # Namespaces use lower_case. + - {key: readability-identifier-naming.NamespaceCase, value: lower_case} + + # Exemptions. + # Type-trait variables (`IsFP16`, `IsBFloat16`, `HasKey`, …) mirror + # `std::is_*_v<>` naming — `PascalCase` without `k` prefix. + - {key: readability-identifier-naming.ConstexprVariableIgnoredRegexp, value: '^(Is|Has|Can)[A-Z].*'} diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..656ddd9e4 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,202 @@ + + +## Summary + + + +- +- + +## Motivation + + + +Closes # + +## Type of Change + + + +- [ ] `feat` — new feature / new operator / new platform +- [ ] `fix` — bug fix +- [ ] `perf` — performance improvement (no behavioral change) +- [ ] `refactor` — code restructuring without behavior change +- [ ] `test` — adding or fixing tests only +- [ ] `docs` — documentation only +- [ ] `build` / `ci` — build system or CI configuration +- [ ] `chore` — tooling, formatting, or other non-code changes +- [ ] Breaking change (requires a `!` in the Conventional Commits prefix or a `BREAKING CHANGE:` footer) + +## Platforms Affected + + + +- [ ] CPU (`WITH_CPU`) +- [ ] NVIDIA (`WITH_NVIDIA`) +- [ ] Iluvatar (`WITH_ILUVATAR`) +- [ ] MetaX (`WITH_METAX`) +- [ ] Cambricon (`WITH_CAMBRICON`) +- [ ] Moore (`WITH_MOORE`) +- [ ] Ascend (`WITH_ASCEND`) +- [ ] PyTorch C++ bindings (`WITH_TORCH`) +- [ ] Build system / CMake / CI +- [ ] Python bindings / user-facing API + +## Test Results on Supported Platforms + + + +| Platform | Built | `pytest` Result | Notes / Hardware | +| ---------- | :---: | --------------- | ---------------- | +| NVIDIA | | | | +| Iluvatar | | | | +| MetaX | | | | +| Cambricon | | | | +| Moore | | | | +| Ascend | | | | + +
+Full `pytest` output (optional) + +```text +paste here +``` + +
+ +## Benchmark / Performance Impact + + + +## Notes for Reviewers + + + +--- + +## Checklist + +> Every contributor **must** verify every item below before requesting +> review. Tick each box only after the check has actually been performed — +> do not tick speculatively. If an item truly does not apply, replace the +> checkbox with `N/A` and briefly explain why in an inline comment. + +### Title, Branch, and Commits + +- [ ] PR **title** follows [Conventional Commits](https://www.conventionalcommits.org/) (e.g. `feat(nvidia): …`, `fix(cuda/gemm): …`). +- [ ] Branch name follows `/xxx-yyyy-zzzz` where `` matches the PR title's Conventional Commits type and words are joined with hyphens (see `CONTRIBUTING.md` §Branches). +- [ ] Each **commit** message follows Conventional Commits. +- [ ] Small PR is a **single squashable commit**; or, for a large PR, every commit is meaningful, well-formed, and independently reviewable (see `CONTRIBUTING.md` §Pull Requests). +- [ ] No stray merge commits from `master` — the branch is rebased cleanly on top of the current `master`. +- [ ] No `fixup!` / `squash!` / `wip` commits remain. + +### Scope and Design + +- [ ] Changes are **minimal** — nothing unrelated to the stated motivation was added (`CONTRIBUTING.md` §Code/General). +- [ ] No dead code, commented-out blocks, debug prints, `printf`/`std::cout`/`print(...)` left behind, or `TODO` without an owner and issue link. +- [ ] No unrelated formatting churn that would obscure the diff. +- [ ] Public API changes (if any) are intentional, documented, and reflected in affected callers/tests. + +### General Code Hygiene (applies to all languages) + +- [ ] The code is self-explanatory; comments were added **only** where the *why* is non-obvious (`CONTRIBUTING.md` §Code/General). +- [ ] Every modified or added file **ends with a single trailing newline** (`CONTRIBUTING.md` §Code/General). +- [ ] No trailing whitespace, tab/space mixing, or stray BOMs. +- [ ] Identifiers in comments and error messages are wrapped in backticks (e.g. ``the `seqlens_k` tensor``) (`CONTRIBUTING.md` §Code/General). +- [ ] All comments and error messages are in **English** (`CONTRIBUTING.md` §Code/General). +- [ ] Comments and error messages are complete sentences — capitalized first letter, terminal punctuation — **unless** the language/framework convention says otherwise (`CONTRIBUTING.md` §Code/General; §Python). + +### C++ Specific (if C++ files changed) + +- [ ] Code follows the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) strictly. +- [ ] `clang-format` (version **21**, per `.github/workflows/clang-format.yml`) has been run against all modified `.h`, `.cc`, `.cuh`, and `.mlu` files; the diff is clean. +- [ ] `clang-tidy` concerns (per `.clang-tidy`) have been reviewed — no new warnings beyond the existing baseline. +- [ ] Operator parameter order is **inputs first, outputs last**; attributes are between inputs and outputs; naming follows PyTorch → ONNX → CUDA API precedence (`CONTRIBUTING.md` §C++). +- [ ] **No exceptions** are thrown. Error paths use `assert` with messages that include at least `__FILE__`, `__LINE__`, and `__func__` (`CONTRIBUTING.md` §C++). +- [ ] Error and warning message wording follows the [LLVM Coding Standards](https://llvm.org/docs/CodingStandards.html#error-and-warning-messages) (`CONTRIBUTING.md` §C++). +- [ ] Kernel files are named correctly: custom = `kernel` / `kernel_v2` / …; well-known algorithms use the algorithm name; library-based implementations use the library name (`CONTRIBUTING.md` §C++). +- [ ] Kernel and kernel launcher are in **separate files**: launcher `.h`, kernel follows platform conventions (e.g. `.cuh` + `.cu`) even when non-templated (`CONTRIBUTING.md` §C++). +- [ ] Constructor **initializer list order matches member declaration order** (`CONTRIBUTING.md` §C++). +- [ ] Exactly **one blank line** between classes, between classes and functions, and between functions (`CONTRIBUTING.md` §C++). +- [ ] Exactly **one blank line** between members (functions *and* variables) within a class (`CONTRIBUTING.md` §C++). +- [ ] Exactly **one blank line** before and after the contents of a namespace (`CONTRIBUTING.md` §C++). +- [ ] New operators added via `src/base/.h` (inheriting `Operator`) with platform implementations under `src///` inheriting the base (`CONTRIBUTING.md` §Adding an Operator). +- [ ] No raw `new`/`delete`; RAII / smart pointers / existing allocators are used. + +### Python Specific (if Python files changed) + +- [ ] Code is [PEP 8](https://peps.python.org/pep-0008/) compliant; `ruff check` passes cleanly on CI (see `.github/workflows/ruff.yml`). +- [ ] `ruff format --check` passes cleanly — if not, run `ruff format` and commit the result. +- [ ] Comments are complete English sentences, starting with a capital letter and ending with punctuation; Markdown backticks are used for code references (`CONTRIBUTING.md` §Python). +- [ ] Framework-specific conventions (e.g. lowercase `pytest.skip` messages without terminal period) are honored where applicable (`CONTRIBUTING.md` §Python). +- [ ] **No blank line** between the function signature and the body when there is no docstring or comment (`CONTRIBUTING.md` §Python). +- [ ] A blank line is present **before and after** `if`, `for`, and similar control-flow statements (`CONTRIBUTING.md` §Python). +- [ ] A blank line appears **before** each `return`, except when it directly follows a control-flow statement (`CONTRIBUTING.md` §Python). +- [ ] Docstrings (if any) follow [PEP 257](https://peps.python.org/pep-0257/) (`CONTRIBUTING.md` §Python). +- [ ] Type hints are added / kept consistent with the surrounding code. + +### Testing + +- [ ] `pytest` was run locally on **every supported platform** that this PR can affect, and the results are recorded in the "Test Results" table above (`CONTRIBUTING.md` §Pull Requests). +- [ ] For any platform that could not be tested, an explicit reason is given in the table and a reviewer with access has been tagged. +- [ ] New functionality has matching tests under `tests/` following `tests/test_add.py` / `tests/test_gemm.py` patterns (`CONTRIBUTING.md` §Adding an Operator). +- [ ] Tests use `pytest.mark.parametrize` correctly: dependent parameters share one decorator (e.g. `@pytest.mark.parametrize("dtype, rtol, atol", …)`), independent parameters use separate decorators ordered by parameter declaration. +- [ ] Where appropriate, `pytest.mark.auto_act_and_assert` is used and the test returns a `Payload` whose `func` and `ref` share the same calling convention. +- [ ] Default `dtype` / `device` parameterization is relied on, or overridden with an explicit `pytest.mark.parametrize` when necessary. +- [ ] Any new test that is flaky under parallelism is marked so, or documented to require `pytest -n 1`. +- [ ] For bug fixes: a regression test has been added that fails on `master` and passes with this PR. + +### Build, CI, and Tooling + +- [ ] The project builds cleanly from a fresh directory with `pip install .[dev]` on at least one affected platform. +- [ ] `compile_commands.json` still regenerates (CMake option `CMAKE_EXPORT_COMPILE_COMMANDS=ON` in `pyproject.toml` — required by the `code-lint` skill and `clang-tidy -p`). +- [ ] New backends / devices have been added to auto-detection in `CMakeLists.txt` under `if(AUTO_DETECT_DEVICES)` **and** to `if(AUTO_DETECT_BACKENDS)` if applicable. +- [ ] Only one CUDA-like GPU backend is selectable at a time — the existing mutual-exclusion check in `CMakeLists.txt` is not broken. +- [ ] Both CI workflows (`clang-format.yml`, `ruff.yml`) are green locally (or expected to be green on CI). +- [ ] No new runtime dependency was added without updating `pyproject.toml`'s `[project.optional-dependencies]` (or justified in the PR description). + +### Documentation + +- [ ] `README.md`, `CONTRIBUTING.md`, or inline docs updated when behavior, build flags, or developer workflow changed. +- [ ] New operators, new dispatch helpers, or new public utilities are documented (docstring, header comment, or an addition to `CONTRIBUTING.md` §Some Code Explanations). +- [ ] Any user-visible breaking change is called out explicitly under "Motivation" **and** in the commit/PR title with a `!` or `BREAKING CHANGE:` footer. + +### Security and Safety + +- [ ] No secrets, access tokens, internal URLs, customer data, or personal hardware identifiers have been committed. +- [ ] Third-party code is license-compatible and attributed. +- [ ] No unsafe pointer arithmetic, uninitialized reads, or missing bounds checks were introduced. diff --git a/CMakeLists.txt b/CMakeLists.txt index e88cc20c0..91c2b0154 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ project(InfiniOps LANGUAGES CXX) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) -# Internal variable to control pybind11's automatic optimization flags (like `-flto`). +# Internal variable to control `pybind11`'s automatic optimization flags (like `-flto`). set(PYBIND11_ENABLE_EXTRAS ON) # Options for backends. @@ -18,6 +18,13 @@ option(WITH_ASCEND "Enable Ascend backend" OFF) option(WITH_TORCH "Enable PyTorch C++ backend" OFF) +# Default OFF until CANN's `extract_host_stub.py` path handling is fixed for +# `scikit-build-core` temp-dir builds (triggers `KeyError` on the preprocessed +# object path). Enable explicitly with `-DBUILD_CUSTOM_KERNEL=ON` when the +# toolchain is compatible or when building via the standalone +# `src/ascend/custom/build.sh` script. +option(BUILD_CUSTOM_KERNEL "Build custom AscendC kernel PyTorch extension (requires `torch_npu`)" OFF) + option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF) option(AUTO_DETECT_BACKENDS "Automatically detect available backends" OFF) option(GENERATE_PYTHON_BINDINGS "Generate Python bindings" OFF) @@ -130,6 +137,28 @@ if(WITH_TORCH) find_library(C10_LIB c10 HINTS ${_torch_lib_dirs} REQUIRED) set(TORCH_LIBRARIES ${TORCH_LIB} ${TORCH_CPU_LIB} ${C10_LIB}) + # `auditwheel`-repaired `torch` wheels bundle transitive dependencies + # (e.g. `libgfortran-.so`, `libopenblasp-.so`) in a sibling + # `torch.libs/` directory that `library_paths()` does not return. When + # building against such a wheel, the linker needs this path to resolve + # the bundled `NEEDED` entries (otherwise: `undefined reference to + # _gfortran_etime@GFORTRAN_8` etc.). + execute_process( + COMMAND ${Python_EXECUTABLE} -c "import os, torch; d = os.path.dirname(torch.__file__); p = os.path.join(os.path.dirname(d), 'torch.libs'); print(p if os.path.isdir(p) else '')" + OUTPUT_VARIABLE TORCH_BUNDLED_LIBS_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + + if(TORCH_BUNDLED_LIBS_DIR) + list(APPEND CMAKE_BUILD_RPATH "${TORCH_BUNDLED_LIBS_DIR}") + list(APPEND CMAKE_INSTALL_RPATH "${TORCH_BUNDLED_LIBS_DIR}") + # `rpath-link` is linker-only: lets `ld` resolve the bundled + # transitive `NEEDED` entries at link time without adding them to our + # own binary's direct `NEEDED` list. + add_link_options("-Wl,-rpath-link,${TORCH_BUNDLED_LIBS_DIR}") + message(STATUS "PyTorch bundled libs: ${TORCH_BUNDLED_LIBS_DIR}") + endif() + # Query the `CXX11` ABI setting that `torch` was compiled with. # A mismatch causes linker errors (e.g. undefined reference to # `c10::Device::Device(std::string const&)`). @@ -182,7 +211,7 @@ if(WITH_ILUVATAR) # `-x ivcore` must not be in `CMAKE_CUDA_FLAGS` — CMake passes those flags # to both compile and link steps. During linking, `-x ivcore` causes # `clang++` to re-parse `.o` files as source code. - set(CMAKE_CUDA_FLAGS "--cuda-gpu-arch=${ILUVATAR_ARCH} -fPIC -Wno-error=unused-variable -Wno-error=unused-private-field -Wno-unused-variable" CACHE STRING "Iluvatar CUDA flags") + set(CMAKE_CUDA_FLAGS "--cuda-gpu-arch=${ILUVATAR_ARCH} -fPIC -Wno-error=unused-variable -Wno-error=unused-private-field -Wno-unused-variable -std=c++17" CACHE STRING "Iluvatar CUDA flags") set(CMAKE_CUDA_SEPARABLE_COMPILATION OFF CACHE BOOL "Disable RDC for Iluvatar") set(CMAKE_CUDA_ARCHITECTURES OFF CACHE STRING "Iluvatar CUDA architectures (passed via CMAKE_CUDA_FLAGS)") # Iluvatar does not ship `libcudadevrt`, which CMake's compiler test diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d131f0add..fcc19c2cc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -91,7 +91,7 @@ pytest ## Adding an Operator 1. **Base class** in `src/base/`: the class must inherit from `Operator` (e.g. `class Gemm : public Operator`). See `src/base/gemm.h`. -2. **Platform implementation** in `src///`: the class must inherit from the base (e.g. `class Blas : public Gemm`). See `src/cuda/gemm/blas.h` and `src/cuda/nvidia/gemm/cublas.h`. +2. **Platform implementation** in `src/native///ops//` (or `src/torch/ops//` for the PyTorch backend): the class must inherit from the base (e.g. `class Blas : public Gemm`). See `src/native/cuda/ops/gemm/blas.h` and `src/native/cuda/nvidia/ops/gemm/cublas.h`. 3. **Tests** in `tests/`: - Use `pytest.mark.parametrize` for parameterization. Dependent parameters go in one decorator (e.g. `@pytest.mark.parametrize("dtype, rtol, atol", ...)`); independent parameters use separate decorators, ordered by parameter declaration. - `dtype` and `device` parameterization is included by default. Override with explicit `pytest.mark.parametrize` if needed. diff --git a/examples/runtime_api.h b/examples/runtime_api.h index c1ef31612..101fcad59 100644 --- a/examples/runtime_api.h +++ b/examples/runtime_api.h @@ -4,27 +4,27 @@ #include "device.h" #ifdef WITH_NVIDIA -#include "cuda/nvidia/gemm/cublas.h" -#include "cuda/nvidia/gemm/cublaslt.h" -#include "cuda/nvidia/runtime_.h" +#include "native/cuda/nvidia/ops/gemm/cublas.h" +#include "native/cuda/nvidia/ops/gemm/cublaslt.h" +#include "native/cuda/nvidia/runtime_.h" #elif WITH_ILUVATAR -#include "cuda/iluvatar/gemm/cublas.h" -#include "cuda/iluvatar/runtime_.h" +#include "native/cuda/iluvatar/ops/gemm/cublas.h" +#include "native/cuda/iluvatar/runtime_.h" #elif WITH_METAX -#include "cuda/metax/gemm/mcblas.h" -#include "cuda/metax/runtime_.h" +#include "native/cuda/metax/ops/gemm/mcblas.h" +#include "native/cuda/metax/runtime_.h" #elif WITH_CAMBRICON -#include "cambricon/gemm/cnblas.h" -#include "cambricon/runtime_.h" +#include "native/cambricon/ops/gemm/cnblas.h" +#include "native/cambricon/runtime_.h" #elif WITH_MOORE -#include "cuda/moore/gemm/mublas.h" -#include "cuda/moore/runtime_.h" +#include "native/cuda/moore/ops/gemm/mublas.h" +#include "native/cuda/moore/runtime_.h" #elif WITH_ASCEND -#include "ascend/gemm/kernel.h" -#include "ascend/runtime_.h" +#include "native/ascend/ops/gemm/kernel.h" +#include "native/ascend/runtime_.h" #elif WITH_CPU -#include "cpu/gemm/gemm.h" -#include "cpu/runtime_.h" +#include "native/cpu/ops/gemm/gemm.h" +#include "native/cpu/runtime_.h" #else #error "One `WITH_*` backend must be enabled for the examples." #endif diff --git a/pyproject.toml b/pyproject.toml index 58740166e..959699f90 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,5 +17,10 @@ AUTO_DETECT_DEVICES = "ON" AUTO_DETECT_BACKENDS = "ON" GENERATE_PYTHON_BINDINGS = "ON" +# Enables `compile_commands.json` under `SKBUILD_BUILD_DIR` (or the default +# `scikit-build` build dir) for `clang-tidy -p ` used by +# the `code-lint` skill. +CMAKE_EXPORT_COMPILE_COMMANDS = "ON" + [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/scripts/generate_wrappers.py b/scripts/generate_wrappers.py index c050b31c0..effc0787d 100644 --- a/scripts/generate_wrappers.py +++ b/scripts/generate_wrappers.py @@ -103,8 +103,18 @@ def _find_optional_tensor_params(op_name): return set(re.findall(r"std::optional\s+(\w+)", source)) +def _find_vector_tensor_params(op_name): + """Return a set of parameter names declared as `std::vector` in + the base header. + """ + source = (_BASE_DIR / f"{op_name}.h").read_text() + + return set(re.findall(r"std::vector\s+(\w+)", source)) + + def _generate_pybind11(operator): optional_tensor_params = _find_optional_tensor_params(operator.name) + vector_tensor_params = _find_vector_tensor_params(operator.name) def _is_optional_tensor(arg): if arg.spelling in optional_tensor_params: @@ -112,6 +122,15 @@ def _is_optional_tensor(arg): return "std::optional" in arg.type.spelling and "Tensor" in arg.type.spelling + def _is_optional(arg): + return "std::optional" in arg.type.spelling + + def _is_vector_tensor(arg): + if arg.spelling in vector_tensor_params: + return True + + return "std::vector" in arg.type.spelling and "Tensor" in arg.type.spelling + def _generate_params(node): parts = [] @@ -121,6 +140,8 @@ def _generate_params(node): if _is_optional_tensor(arg): parts.append(f"std::optional {arg.spelling}") + elif _is_vector_tensor(arg): + parts.append(f"std::vector {arg.spelling}") else: param = arg.type.spelling.replace("const Tensor", "py::object").replace( "Tensor", "py::object" @@ -138,6 +159,8 @@ def _generate_arguments(node): if _is_optional_tensor(arg): args.append(f"OptionalTensorFromPybind11Handle({arg.spelling})") + elif _is_vector_tensor(arg): + args.append(f"VectorTensorFromPybind11Handle({arg.spelling})") elif "Tensor" in arg.type.spelling: args.append(f"TensorFromPybind11Handle({arg.spelling})") else: @@ -155,11 +178,18 @@ def _generate_init(constructor): }}))""" def _generate_py_args(node): - return ", ".join( - f'py::arg("{arg.spelling}")' - for arg in node.get_arguments() - if arg.spelling != "stream" - ) + parts = [] + + for arg in node.get_arguments(): + if arg.spelling == "stream": + continue + + if _is_optional(arg): + parts.append(f'py::arg("{arg.spelling}") = py::none()') + else: + parts.append(f'py::arg("{arg.spelling}")') + + return ", ".join(parts) def _generate_call(op_name, call, method=True): call_params = _generate_params(call) @@ -224,7 +254,8 @@ def _generate_call(op_name, call, method=True): {calls} .def_static("active_implementation_indices", [](const std::string& device) {{ return Self::active_implementation_indices(DeviceTypeFromString(device)); - }}); + }}) + .def_static("clear_cache", &Self::clear_cache); {callers} }} @@ -430,7 +461,7 @@ def _get_all_ops(devices, with_torch=False): ops[op_name] = [] for file_path in _SRC_DIR.rglob("*.h"): - if file_path.parent.parent.name not in scan_dirs: + if file_path.parent.parent.parent.name not in scan_dirs: continue if f"class Operator<{_snake_to_pascal(op_name)}" in file_path.read_text(): diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cbdae6745..ce888b4b5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -7,8 +7,8 @@ set(DEVICE_LIST "") if(WITH_CPU) set(CPU_PATTERNS - "cpu/*.cc" - "cpu/*.cpp" + "native/cpu/*.cc" + "native/cpu/*.cpp" ) file(GLOB_RECURSE CPU_SOURCES CONFIGURE_DEPENDS ${CPU_PATTERNS}) @@ -24,12 +24,12 @@ endif() if(WITH_NVIDIA) set(NVIDIA_PATTERNS - "cuda/*.cc" - "cuda/*.cpp" - "cuda/*.cu" - "cuda/nvidia/*.cc" - "cuda/nvidia/*.cpp" - "cuda/nvidia/*.cu" + "native/cuda/*.cc" + "native/cuda/*.cpp" + "native/cuda/*.cu" + "native/cuda/nvidia/*.cc" + "native/cuda/nvidia/*.cpp" + "native/cuda/nvidia/*.cu" ) file(GLOB_RECURSE NVIDIA_SOURCES CONFIGURE_DEPENDS ${NVIDIA_PATTERNS}) @@ -51,12 +51,12 @@ endif() if(WITH_ILUVATAR) set(ILUVATAR_PATTERNS - "cuda/*.cc" - "cuda/*.cpp" - "cuda/*.cu" - "cuda/iluvatar/*.cc" - "cuda/iluvatar/*.cpp" - "cuda/iluvatar/*.cu" + "native/cuda/*.cc" + "native/cuda/*.cpp" + "native/cuda/*.cu" + "native/cuda/iluvatar/*.cc" + "native/cuda/iluvatar/*.cpp" + "native/cuda/iluvatar/*.cu" ) file(GLOB_RECURSE ILUVATAR_SOURCES CONFIGURE_DEPENDS ${ILUVATAR_PATTERNS}) @@ -79,10 +79,10 @@ endif() if(WITH_METAX) set(METAX_PATTERNS - "cuda/*.cc" - "cuda/*.cpp" - "cuda/metax/*.cc" - "cuda/metax/*.maca" + "native/cuda/*.cc" + "native/cuda/*.cpp" + "native/cuda/metax/*.cc" + "native/cuda/metax/*.maca" ) file(GLOB_RECURSE METAX_SOURCES CONFIGURE_DEPENDS ${METAX_PATTERNS}) @@ -105,11 +105,11 @@ endif() if(WITH_MOORE) set(MOORE_PATTERNS - "cuda/*.cc" - "cuda/*.cpp" - "cuda/moore/*.cc" - "cuda/moore/*.cpp" - "cuda/moore/*.mu" + "native/cuda/*.cc" + "native/cuda/*.cpp" + "native/cuda/moore/*.cc" + "native/cuda/moore/*.cpp" + "native/cuda/moore/*.mu" ) file(GLOB_RECURSE MOORE_SOURCES CONFIGURE_DEPENDS ${MOORE_PATTERNS}) @@ -127,7 +127,7 @@ if(WITH_MOORE) endif() if(WITH_CAMBRICON) - file(GLOB_RECURSE CAMBRICON_MLU_SOURCES CONFIGURE_DEPENDS "cambricon/*/*.mlu") + file(GLOB_RECURSE CAMBRICON_MLU_SOURCES CONFIGURE_DEPENDS "native/cambricon/ops/*/*.mlu") find_program(CNCC_COMPILER cncc HINTS "${NEUWARE_HOME}/bin" "$ENV{NEUWARE_HOME}/bin" /usr/local/neuware/bin) if(CNCC_COMPILER) message(STATUS "Found cncc: ${CNCC_COMPILER}") @@ -175,11 +175,13 @@ endif() if(WITH_ASCEND) # ASCEND_HOME is set by the top-level CMakeLists.txt. file(GLOB_RECURSE ASCEND_SOURCES CONFIGURE_DEPENDS - "ascend/*.cc" - "ascend/*.cpp" + "native/ascend/*.cc" + "native/ascend/*.cpp" ) - # Exclude `kernel_impl.cpp` — AscendC device code, not compiled by the host C++ compiler. + # Exclude `kernel_impl.cpp` — `AscendC` device code, not compiled by the host C++ compiler. list(FILTER ASCEND_SOURCES EXCLUDE REGEX ".*kernel_impl\\.cpp$") + # Exclude custom/ — standalone PyTorch extension, built separately. + list(FILTER ASCEND_SOURCES EXCLUDE REGEX ".*/custom/.*") target_compile_definitions(infiniops PUBLIC WITH_ASCEND=1) target_sources(infiniops PRIVATE ${ASCEND_SOURCES}) @@ -215,7 +217,38 @@ if(WITH_ASCEND) "${ASCEND_HOME}/lib64/libopapi.so" "${ASCEND_HAL_LIB}") + # ATB (Ascend Transformer Boost) — provides fused operators like + # `PagedAttention` and `ReshapeAndCache` that are graph-capture safe. + set(ATB_HOME_DIR "$ENV{ATB_HOME_PATH}") + if(NOT ATB_HOME_DIR) + # Default search path under CANN nnal directory. + file(GLOB ATB_SEARCH_DIRS "/usr/local/Ascend/nnal/atb/*/atb/cxx_abi_1") + if(ATB_SEARCH_DIRS) + list(SORT ATB_SEARCH_DIRS ORDER DESCENDING) + list(GET ATB_SEARCH_DIRS 0 ATB_HOME_DIR) + endif() + endif() + + if(ATB_HOME_DIR AND EXISTS "${ATB_HOME_DIR}/include/atb/operation.h") + message(STATUS "ATB found: ${ATB_HOME_DIR}") + target_compile_definitions(infiniops PUBLIC INFINI_HAS_ATB=1) + target_include_directories(infiniops PUBLIC "${ATB_HOME_DIR}/include") + target_link_libraries(infiniops PUBLIC "${ATB_HOME_DIR}/lib/libatb.so") + else() + message(STATUS "ATB not found — ATB-based operators disabled") + endif() + list(APPEND DEVICE_LIST "ascend") + + # Custom `AscendC` kernels (PyTorch extension, requires `torch_npu`). + if(BUILD_CUSTOM_KERNEL) + add_subdirectory(native/ascend/custom) + + # Link the compiled `AscendC` kernel objects into `infiniops` so that + # custom kernel implementations (e.g. `RmsNorm` index 1) can call + # them via the generated launch functions. + target_compile_definitions(infiniops PUBLIC INFINI_HAS_CUSTOM_KERNELS=1) + endif() endif() if(WITH_TORCH) @@ -340,6 +373,17 @@ if(GENERATE_PYTHON_BINDINGS) target_include_directories(ops PRIVATE ${PROJECT_SOURCE_DIR}) target_link_libraries(ops PRIVATE infiniops) + # Custom `AscendC` kernel objects must be linked directly into ops + # because the `AscendC` toolchain compiles host stubs with hidden + # visibility — `libinfiniops.so` cannot re-export those symbols. + # The `Operator<..., 1>` template instantiations that call + # `aclrtlaunch_*` live in `ops.cc`, so link here with + # `--whole-archive` to ensure all launch functions are available. + if(BUILD_CUSTOM_KERNEL) + target_link_libraries(ops PRIVATE + -Wl,--whole-archive no_workspace_kernel -Wl,--no-whole-archive) + endif() + set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN") set_target_properties(ops PROPERTIES INSTALL_RPATH "$ORIGIN") diff --git a/src/ascend/common.h b/src/ascend/common.h deleted file mode 100644 index fba4766b4..000000000 --- a/src/ascend/common.h +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef INFINI_OPS_ASCEND_COMMON_H_ -#define INFINI_OPS_ASCEND_COMMON_H_ - -#include -#include - -#include "acl/acl.h" -#include "aclnn/acl_meta.h" -#include "ascend/data_type_.h" -#include "tensor.h" - -namespace infini::ops::ascend { - -// Build an `aclTensor` descriptor from an InfiniOps `Tensor`. -// -// When `transpose_last2` is true the last two dimensions are swapped in the -// descriptor (shape and strides) without copying data. This is used by `Gemm` -// and `MatMul` to express a transpose via the view. -inline aclTensor* buildAclTensor(const Tensor& t, - bool transpose_last2 = false) { - std::vector shape(t.shape().begin(), t.shape().end()); - std::vector strides(t.strides().begin(), t.strides().end()); - - if (transpose_last2 && shape.size() >= 2) { - auto n = shape.size(); - std::swap(shape[n - 2], shape[n - 1]); - std::swap(strides[n - 2], strides[n - 1]); - } - - // Compute the minimum physical storage needed for this strided view. - // For contiguous tensors this equals `numel()`; for non-contiguous (gapped) - // tensors it may be larger; for broadcast (stride-0) tensors it may be - // smaller. Passing the view shape as the storage shape causes - // "ViewShape overlap" errors in ACLNN for non-contiguous inputs. - int64_t storage_elems = 1; - for (size_t i = 0; i < shape.size(); ++i) { - if (shape[i] == 0) { - storage_elems = 0; - break; - } - if (strides[i] > 0 && shape[i] > 1) { - storage_elems += static_cast(shape[i] - 1) * strides[i]; - } - } - std::vector storage_shape = {storage_elems}; - - return aclCreateTensor( - shape.data(), static_cast(shape.size()), ToAclDtype(t.dtype()), - strides.data(), - /*storageOffset=*/0, ACL_FORMAT_ND, storage_shape.data(), - static_cast(storage_shape.size()), const_cast(t.data())); -} - -} // namespace infini::ops::ascend - -#endif diff --git a/src/ascend/gemm/kernel.h b/src/ascend/gemm/kernel.h deleted file mode 100644 index 3360e7930..000000000 --- a/src/ascend/gemm/kernel.h +++ /dev/null @@ -1,84 +0,0 @@ -#ifndef INFINI_OPS_ASCEND_GEMM_KERNEL_H_ -#define INFINI_OPS_ASCEND_GEMM_KERNEL_H_ - -#include "acl/acl.h" -#include "aclnn/aclnn_base.h" -#include "aclnnop/aclnn_addmm.h" -#include "aclnnop/aclnn_baddbmm.h" -#include "ascend/common.h" -#include "ascend/workspace_pool_.h" -#include "base/gemm.h" -#include "operator.h" - -namespace infini::ops { - -template <> -class Operator : public Gemm { - public: - Operator(const Tensor a, const Tensor b, std::optional alpha, - std::optional beta, std::optional trans_a, - std::optional trans_b, Tensor c) - : Gemm(a, b, alpha, beta, trans_a, trans_b, c), - batched_{batch_count_ > 1}, - alpha_val_{alpha.value_or(1.0f)}, - beta_val_{beta.value_or(1.0f)} { - alpha_scalar_ = aclCreateScalar(&alpha_val_, ACL_FLOAT); - beta_scalar_ = aclCreateScalar(&beta_val_, ACL_FLOAT); - } - - ~Operator() { - aclDestroyScalar(alpha_scalar_); - aclDestroyScalar(beta_scalar_); - } - - void operator()(const Tensor a, const Tensor b, std::optional alpha, - std::optional beta, std::optional trans_a, - std::optional trans_b, Tensor c) const override { - auto stream = static_cast(stream_); - - auto t_self = ascend::buildAclTensor(c); - auto t_a = ascend::buildAclTensor(a, trans_a_); - auto t_b = ascend::buildAclTensor(b, trans_b_); - auto t_out = ascend::buildAclTensor(c); - - uint64_t ws_needed = 0; - aclOpExecutor* executor = nullptr; - - if (batched_) { - aclnnBaddbmmGetWorkspaceSize(t_self, t_a, t_b, beta_scalar_, - alpha_scalar_, t_out, 0, &ws_needed, - &executor); - } else { - aclnnAddmmGetWorkspaceSize(t_self, t_a, t_b, beta_scalar_, alpha_scalar_, - t_out, 0, &ws_needed, &executor); - } - - auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_needed); - - if (batched_) { - aclnnBaddbmm(arena.buf, ws_needed, executor, stream); - } else { - aclnnAddmm(arena.buf, ws_needed, executor, stream); - } - - aclDestroyTensor(t_self); - aclDestroyTensor(t_a); - aclDestroyTensor(t_b); - aclDestroyTensor(t_out); - } - - private: - bool batched_; - - float alpha_val_; - - float beta_val_; - - aclScalar* alpha_scalar_ = nullptr; - - aclScalar* beta_scalar_ = nullptr; -}; - -} // namespace infini::ops - -#endif diff --git a/src/ascend/workspace_pool_.h b/src/ascend/workspace_pool_.h deleted file mode 100644 index 8c4c91961..000000000 --- a/src/ascend/workspace_pool_.h +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef INFINI_OPS_ASCEND_WORKSPACE_POOL__H_ -#define INFINI_OPS_ASCEND_WORKSPACE_POOL__H_ - -#include -#include -#include -#include - -#include "acl/acl.h" - -namespace infini::ops::ascend { - -struct WorkspaceArena { - void* buf = nullptr; - - uint64_t capacity = 0; -}; - -class WorkspacePool { - public: - WorkspaceArena& Ensure(aclrtStream stream, uint64_t needed) { - std::lock_guard lock(mutex_); - auto& arena = arenas_[stream]; - if (needed <= arena.capacity) return arena; - if (arena.capacity > 0) { - aclrtSynchronizeStream(stream); - aclrtFree(arena.buf); - } - if (needed > 0) { - auto ret = aclrtMalloc(&arena.buf, needed, ACL_MEM_MALLOC_NORMAL_ONLY); - assert(ret == ACL_SUCCESS && "`WorkspacePool`: `aclrtMalloc` failed"); - } - arena.capacity = needed; - return arena; - } - - ~WorkspacePool() { - for (auto& [stream, arena] : arenas_) { - if (arena.capacity > 0) aclrtFree(arena.buf); - } - } - - private: - std::unordered_map arenas_; - - std::mutex mutex_; -}; - -inline WorkspacePool& GetWorkspacePool() { - static WorkspacePool pool; - return pool; -} - -} // namespace infini::ops::ascend - -#endif diff --git a/src/base/cast.h b/src/base/cast.h new file mode 100644 index 000000000..29f1f40cf --- /dev/null +++ b/src/base/cast.h @@ -0,0 +1,52 @@ +#ifndef INFINI_OPS_BASE_CAST_H_ +#define INFINI_OPS_BASE_CAST_H_ + +#include "operator.h" + +namespace infini::ops { + +class Cast : public Operator { + public: + Cast(const Tensor input, Tensor out) + : ndim_{out.ndim()}, + output_size_{out.numel()}, + input_dtype_{input.dtype()}, + out_dtype_{out.dtype()}, + input_shape_{input.shape()}, + out_shape_{out.shape()}, + input_strides_{input.strides()}, + out_strides_{out.strides()}, + is_input_contiguous_{input.IsContiguous()}, + is_out_contiguous_{out.IsContiguous()} { + assert(input.numel() == out.numel() && + "the input and output of `Cast` must have the same number of " + "elements"); + } + + virtual void operator()(const Tensor input, Tensor out) const = 0; + + protected: + Tensor::Size ndim_{0}; + + Tensor::Size output_size_{0}; + + const DataType input_dtype_; + + const DataType out_dtype_; + + Tensor::Shape input_shape_; + + Tensor::Shape out_shape_; + + Tensor::Strides input_strides_; + + Tensor::Strides out_strides_; + + bool is_input_contiguous_{false}; + + bool is_out_contiguous_{false}; +}; + +} // namespace infini::ops + +#endif diff --git a/src/base/cat.h b/src/base/cat.h new file mode 100644 index 000000000..dcb0ba587 --- /dev/null +++ b/src/base/cat.h @@ -0,0 +1,35 @@ +#ifndef INFINI_OPS_BASE_CAT_H_ +#define INFINI_OPS_BASE_CAT_H_ + +#include + +#include "operator.h" + +namespace infini::ops { + +class Cat : public Operator { + public: + Cat(const Tensor first_input, std::vector rest_inputs, int64_t dim, + Tensor out) + : input_count_{1 + rest_inputs.size()} { + assert(input_count_ >= 2 && "`Cat` requires at least 2 input tensors"); + + auto ndim = static_cast(out.ndim()); + // Normalize negative dim (e.g. -1 means last dimension). + dim_ = dim < 0 ? dim + ndim : dim; + assert(dim_ >= 0 && dim_ < ndim && "`Cat` dim out of range"); + } + + virtual void operator()(const Tensor first_input, + std::vector rest_inputs, int64_t dim, + Tensor out) const = 0; + + protected: + int64_t dim_; + + size_t input_count_; +}; + +} // namespace infini::ops + +#endif diff --git a/src/base/linear.h b/src/base/linear.h new file mode 100644 index 000000000..a5276e612 --- /dev/null +++ b/src/base/linear.h @@ -0,0 +1,65 @@ +#ifndef INFINI_OPS_BASE_LINEAR_H_ +#define INFINI_OPS_BASE_LINEAR_H_ + +#include + +#include "operator.h" + +namespace infini::ops { + +// Fused linear projection: out = a @ b (+ bias). +// +// When bias is present, computes out = a @ b + bias in a single dispatch. +// When bias is absent, computes out = a @ b (equivalent to Matmul). +// `trans_a` / `trans_b`: If true, transpose the last two dims before +// multiplying. +class Linear : public Operator { + public: + Linear(const Tensor a, const Tensor b, std::optional bias, + bool trans_a, bool trans_b, Tensor out) + : a_shape_{a.shape()}, + b_shape_{b.shape()}, + out_shape_{out.shape()}, + a_strides_{a.strides()}, + b_strides_{b.strides()}, + out_strides_{out.strides()}, + trans_a_{trans_a}, + trans_b_{trans_b}, + has_bias_{bias.has_value()} { + assert(a.dtype() == b.dtype() && + "operator `Linear` requires a and b to have the same dtype"); + assert(a.dtype() == out.dtype() && + "operator `Linear` requires a and out to have the same dtype"); + if (has_bias_) { + assert(bias->dtype() == out.dtype() && + "operator `Linear` requires bias and out to have the same dtype"); + } + } + + virtual void operator()(const Tensor a, const Tensor b, + std::optional bias, bool trans_a, + bool trans_b, Tensor out) const = 0; + + protected: + Tensor::Shape a_shape_; + + Tensor::Shape b_shape_; + + Tensor::Shape out_shape_; + + Tensor::Strides a_strides_; + + Tensor::Strides b_strides_; + + Tensor::Strides out_strides_; + + bool trans_a_{false}; + + bool trans_b_{false}; + + bool has_bias_{false}; +}; + +} // namespace infini::ops + +#endif diff --git a/src/base/mat_mul.h b/src/base/mat_mul.h deleted file mode 100644 index 6180c8bf2..000000000 --- a/src/base/mat_mul.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef INFINI_OPS_BASE_MAT_MUL_H_ -#define INFINI_OPS_BASE_MAT_MUL_H_ - -#include "operator.h" -#include "tensor.h" - -namespace infini::ops { - -class MatMul : public Operator { - public: - MatMul(const Tensor input, const Tensor other, Tensor out) - : input_shape_{input.shape()}, - other_shape_{other.shape()}, - out_shape_{out.shape()} { - assert(input.dtype() == other.dtype()); - } - - virtual void operator()(const Tensor input, const Tensor other, - Tensor out) const = 0; - - protected: - Tensor::Shape input_shape_; - - Tensor::Shape other_shape_; - - Tensor::Shape out_shape_; -}; - -} // namespace infini::ops - -#endif diff --git a/src/base/matmul.h b/src/base/matmul.h new file mode 100644 index 000000000..071feaeaa --- /dev/null +++ b/src/base/matmul.h @@ -0,0 +1,41 @@ +#ifndef INFINI_OPS_BASE_MATMUL_H_ +#define INFINI_OPS_BASE_MATMUL_H_ + +#include "operator.h" +#include "tensor.h" + +namespace infini::ops { + +class Matmul : public Operator { + public: + // `trans_a` / `trans_b`: If true, transpose the last two dims of `a` / `b` + // before multiplying. These are constructor parameters so the `CacheKey` + // encodes the transposition and distinct descriptors are cached for each + // combination. + Matmul(const Tensor a, const Tensor b, Tensor c, bool trans_a, bool trans_b) + : a_shape_{a.shape()}, + b_shape_{b.shape()}, + c_shape_{c.shape()}, + trans_a_{trans_a}, + trans_b_{trans_b} { + assert(a.dtype() == b.dtype()); + } + + virtual void operator()(const Tensor a, const Tensor b, Tensor c, + bool trans_a, bool trans_b) const = 0; + + protected: + Tensor::Shape a_shape_; + + Tensor::Shape b_shape_; + + Tensor::Shape c_shape_; + + bool trans_a_{false}; + + bool trans_b_{false}; +}; + +} // namespace infini::ops + +#endif diff --git a/src/base/mul.h b/src/base/mul.h new file mode 100644 index 000000000..9e7be2239 --- /dev/null +++ b/src/base/mul.h @@ -0,0 +1,67 @@ +#ifndef INFINI_OPS_BASE_MUL_H_ +#define INFINI_OPS_BASE_MUL_H_ + +#include "operator.h" + +namespace infini::ops { + +class Mul : public Operator { + public: + Mul(const Tensor input, const Tensor other, Tensor out) + : ndim_{out.ndim()}, + output_size_{out.numel()}, + input_type_{input.dtype()}, + other_type_{other.dtype()}, + out_type_{out.dtype()}, + input_shape_{input.shape()}, + other_shape_{other.shape()}, + out_shape_{out.shape()}, + input_strides_{input.strides()}, + other_strides_{other.strides()}, + out_strides_{out.strides()}, + is_input_contiguous_{input.IsContiguous()}, + is_other_contiguous_{other.IsContiguous()}, + is_out_contiguous_{out.IsContiguous()} { + assert(!out.HasBroadcastDim() && + "the output of `Mul` should NOT have broadcasted dim!"); + assert(input_type_ == other_type_ && other_type_ == out_type_ && + "operator `Mul` requires all input and output tensors to have the " + "same dtype"); + } + + virtual void operator()(const Tensor input, const Tensor other, + Tensor out) const = 0; + + protected: + Tensor::Size ndim_{0}; + + Tensor::Size output_size_{0}; + + const DataType input_type_; + + const DataType other_type_; + + const DataType out_type_; + + Tensor::Shape input_shape_; + + Tensor::Shape other_shape_; + + Tensor::Shape out_shape_; + + Tensor::Strides input_strides_; + + Tensor::Strides other_strides_; + + Tensor::Strides out_strides_; + + bool is_input_contiguous_{false}; + + bool is_other_contiguous_{false}; + + bool is_out_contiguous_{false}; +}; + +} // namespace infini::ops + +#endif diff --git a/src/device.h b/src/device.h index 38e4bce11..688cd0dc2 100644 --- a/src/device.h +++ b/src/device.h @@ -1,6 +1,9 @@ #ifndef INFINI_OPS_DEVICE_H_ #define INFINI_OPS_DEVICE_H_ +#include +#include + #include "common/constexpr_map.h" #include "common/traits.h" #include "hash.h" diff --git a/src/hash.h b/src/hash.h index efb34f751..4721f33f3 100644 --- a/src/hash.h +++ b/src/hash.h @@ -2,6 +2,7 @@ #define INFINI_OPS_HASH_H_ #include +#include template inline void HashCombine(std::size_t& seed, const T& v) { @@ -9,4 +10,12 @@ inline void HashCombine(std::size_t& seed, const T& v) { seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); } +template +inline void HashCombine(std::size_t& seed, const std::vector& v) { + HashCombine(seed, v.size()); + for (const auto& elem : v) { + HashCombine(seed, elem); + } +} + #endif diff --git a/src/native/ascend/atb_common_.h b/src/native/ascend/atb_common_.h new file mode 100644 index 000000000..40b76698e --- /dev/null +++ b/src/native/ascend/atb_common_.h @@ -0,0 +1,95 @@ +#ifndef INFINI_OPS_ASCEND_ATB_COMMON__H_ +#define INFINI_OPS_ASCEND_ATB_COMMON__H_ + +#ifdef INFINI_HAS_ATB + +#include +#include +#include +#include + +#include "acl/acl.h" +#include "atb/context.h" +#include "atb/operation.h" +#include "atb/types.h" +#include "native/ascend/data_type_.h" +#include "tensor.h" + +namespace infini::ops::ascend { + +// Thread-local ATB context. +// +// ATB requires a `Context` for Setup/Execute. Creating one per call is +// expensive (internal tiling buffer allocation), so we cache one per thread. +// `SetExecuteStream` is called before every `Execute` to match the caller's +// stream. +inline atb::Context*& ThreadLocalAtbContext() { + thread_local atb::Context* ctx = nullptr; + + return ctx; +} + +inline atb::Context* GetAtbContext(aclrtStream stream) { + auto*& ctx = ThreadLocalAtbContext(); + + if (!ctx) { + atb::Status s = atb::CreateContext(&ctx); + assert(s == atb::NO_ERROR && "atb::CreateContext failed"); + } + + atb::Status s = ctx->SetExecuteStream(stream); + assert(s == atb::NO_ERROR && "atb::Context::SetExecuteStream failed"); + + return ctx; +} + +// Build an `atb::Tensor` from an InfiniOps Tensor. +// +// Sets dtype, ND format, shape dimensions, and the device data pointer. +// The caller must keep the InfiniOps Tensor alive for the duration of the +// ATB operation. +inline atb::Tensor ToAtbTensor(const Tensor& t) { + atb::Tensor out; + out.desc.dtype = ToAclDtype(t.dtype()); + out.desc.format = ACL_FORMAT_ND; + out.desc.shape.dimNum = t.ndim(); + assert(t.ndim() <= atb::MAX_DIM); + + for (uint64_t i = 0; i < t.ndim(); ++i) { + out.desc.shape.dims[i] = static_cast(t.size(i)); + } + + out.deviceData = const_cast(t.data()); + out.dataSize = static_cast(t.numel()) * t.element_size(); + + return out; +} + +// Build an `atb::Tensor` from explicit shape, dtype, and data pointer. +// +// Useful for sub-views of a larger buffer (e.g. K-cache and V-cache halves +// of a fused KV cache tensor). +inline atb::Tensor ToAtbTensor(const std::vector& shape, + aclDataType dtype, void* data, + uint64_t data_size) { + atb::Tensor out; + out.desc.dtype = dtype; + out.desc.format = ACL_FORMAT_ND; + out.desc.shape.dimNum = shape.size(); + assert(shape.size() <= atb::MAX_DIM); + + for (size_t i = 0; i < shape.size(); ++i) { + out.desc.shape.dims[i] = shape[i]; + } + + out.deviceData = data; + out.dataSize = data_size; + + return out; +} + +} // namespace infini::ops::ascend + +#endif // INFINI_HAS_ATB + +#endif // INFINI_OPS_ASCEND_ATB_COMMON__H_ diff --git a/src/native/ascend/common.h b/src/native/ascend/common.h new file mode 100644 index 000000000..46a914771 --- /dev/null +++ b/src/native/ascend/common.h @@ -0,0 +1,208 @@ +#ifndef INFINI_OPS_ASCEND_COMMON_H_ +#define INFINI_OPS_ASCEND_COMMON_H_ + +#include +#include + +#include "acl/acl.h" +#include "aclnn/acl_meta.h" +#include "native/ascend/data_type_.h" +#include "tensor.h" + +namespace infini::ops::ascend { + +// Check whether the ACL runtime is still usable. +// +// During process shutdown the CANN runtime may be torn down before C++ +// static destructors run. Calling `aclrtGetDevice` is the cheapest +// probe — it fails once the runtime is gone. Destructors that call +// ACL/ATB APIs must guard with this to avoid use-after-finalize crashes. +inline bool IsAclRuntimeAlive() { + int32_t dev_id = -1; + + return aclrtGetDevice(&dev_id) == ACL_SUCCESS; +} + +// Build an `aclTensor` descriptor from an InfiniOps `Tensor`. +// +// When `transpose_last2` is true the last two dimensions are swapped in the +// descriptor (shape and strides) without copying data. This is used by `Gemm` +// and `MatMul` to express a transpose via the view. +inline aclTensor* BuildAclTensor(const Tensor& t, + bool transpose_last2 = false) { + std::vector shape(t.shape().begin(), t.shape().end()); + std::vector strides(t.strides().begin(), t.strides().end()); + + if (transpose_last2 && shape.size() >= 2) { + auto n = shape.size(); + std::swap(shape[n - 2], shape[n - 1]); + std::swap(strides[n - 2], strides[n - 1]); + } + + // Compute the minimum physical storage needed for this strided view. + // For contiguous tensors this equals `numel()`; for non-contiguous (gapped) + // tensors it may be larger; for broadcast (stride-0) tensors it may be + // smaller. Passing the view shape as the storage shape causes + // "ViewShape overlap" errors in ACLNN for non-contiguous inputs. + int64_t storage_elems = 1; + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] == 0) { + storage_elems = 0; + break; + } + if (strides[i] > 0 && shape[i] > 1) { + storage_elems += static_cast(shape[i] - 1) * strides[i]; + } + } + std::vector storage_shape = {storage_elems}; + + return aclCreateTensor( + shape.data(), static_cast(shape.size()), ToAclDtype(t.dtype()), + strides.data(), + /*storageOffset=*/0, ACL_FORMAT_ND, storage_shape.data(), + static_cast(storage_shape.size()), const_cast(t.data())); +} + +// Pre-computed tensor metadata for descriptor reuse. +// +// Stores `shape`, `strides`, `storage_shape`, and `dtype` once (avoiding +// per-call heap allocations). The `aclTensor` descriptor is created on the +// first `get()` call and its data pointer is updated in-place via +// `aclSetRawTensorAddr` on subsequent calls. +class AclTensorCache { + public: + AclTensorCache() = default; + + // Construct from explicit metadata (for device buffers not wrapped in + // Tensor). Computes contiguous strides from shape. + AclTensorCache(std::vector shape, aclDataType dtype, void* data) + : shape_(std::move(shape)), dtype_(dtype) { + strides_.resize(shape_.size()); + int64_t stride = 1; + for (int i = static_cast(shape_.size()) - 1; i >= 0; --i) { + strides_[i] = stride; + stride *= shape_[i]; + } + storage_shape_ = {stride}; + + if (data) { + tensor_ = aclCreateTensor( + shape_.data(), static_cast(shape_.size()), dtype_, + strides_.data(), + /*storageOffset=*/0, ACL_FORMAT_ND, storage_shape_.data(), + static_cast(storage_shape_.size()), data); + } + } + + explicit AclTensorCache(const Tensor& t, bool transpose_last2 = false) + : dtype_{ToAclDtype(t.dtype())} { + shape_.assign(t.shape().begin(), t.shape().end()); + strides_.assign(t.strides().begin(), t.strides().end()); + + if (transpose_last2 && shape_.size() >= 2) { + auto n = shape_.size(); + std::swap(shape_[n - 2], shape_[n - 1]); + std::swap(strides_[n - 2], strides_[n - 1]); + } + + int64_t storage_elems = 1; + for (size_t i = 0; i < shape_.size(); ++i) { + if (shape_[i] == 0) { + storage_elems = 0; + break; + } + if (strides_[i] > 0 && shape_[i] > 1) { + storage_elems += static_cast(shape_[i] - 1) * strides_[i]; + } + } + storage_shape_ = {storage_elems}; + } + + ~AclTensorCache() { + if (tensor_ && IsAclRuntimeAlive()) { + aclDestroyTensor(tensor_); + } + } + + AclTensorCache(const AclTensorCache&) = delete; + + AclTensorCache& operator=(const AclTensorCache&) = delete; + + AclTensorCache(AclTensorCache&& o) noexcept + : shape_(std::move(o.shape_)), + strides_(std::move(o.strides_)), + storage_shape_(std::move(o.storage_shape_)), + dtype_(o.dtype_), + tensor_(o.tensor_) { + o.tensor_ = nullptr; + } + + AclTensorCache& operator=(AclTensorCache&& o) noexcept { + if (this != &o) { + if (tensor_) { + aclDestroyTensor(tensor_); + } + shape_ = std::move(o.shape_); + strides_ = std::move(o.strides_); + storage_shape_ = std::move(o.storage_shape_); + dtype_ = o.dtype_; + tensor_ = o.tensor_; + o.tensor_ = nullptr; + } + + return *this; + } + + // Null the cached descriptor pointer without calling `aclDestroyTensor`. + // Call from the owning operator's destructor: the descriptor is still + // referenced by a Repeatable `aclOpExecutor` which would be destroyed + // alongside the tensor, and per CANN 8.5 docs that destruction is our + // responsibility. In practice `aclDestroyAclOpExecutor` during process + // shutdown crashes even with `IsAclRuntimeAlive()` true — see `64c367c` — + // so operators leak the executor at shutdown; skipping `aclDestroyTensor` + // here keeps `~AclTensorCache` from double-freeing a descriptor the + // executor still holds. + void release() { tensor_ = nullptr; } + + // Explicitly destroy the cached tensor and clear the pointer. + // Use only when the descriptor is owned outside any executor (e.g. an + // intermediate tensor not passed to `aclnn*GetWorkspaceSize`). + void destroy() { + if (tensor_) { + aclDestroyTensor(tensor_); + tensor_ = nullptr; + } + } + + // Update the data pointer and return the cached descriptor. + aclTensor* get(void* data) const { + if (tensor_) { + aclSetRawTensorAddr(tensor_, data); + + return tensor_; + } + + tensor_ = aclCreateTensor( + shape_.data(), static_cast(shape_.size()), dtype_, + strides_.data(), + /*storageOffset=*/0, ACL_FORMAT_ND, storage_shape_.data(), + static_cast(storage_shape_.size()), data); + + return tensor_; + } + + private: + std::vector shape_; + + std::vector strides_; + + std::vector storage_shape_; + + aclDataType dtype_{ACL_DT_UNDEFINED}; + + mutable aclTensor* tensor_ = nullptr; +}; + +} // namespace infini::ops::ascend + +#endif diff --git a/src/native/ascend/custom/.clang-tidy b/src/native/ascend/custom/.clang-tidy new file mode 100644 index 000000000..d1f1296a6 --- /dev/null +++ b/src/native/ascend/custom/.clang-tidy @@ -0,0 +1,12 @@ +--- +# `CANN`'s `ascendc_add_operator(OP_NAME )` dictates the +# symbol name of the `extern "C" __global__ __aicore__ void (…)` +# kernel entry. The `aclrtlaunch_` wrapper and `op_host/` call site +# must match, so these entries cannot follow Google's `PascalCase` +# convention. Relax `FunctionCase` to `lower_case` within this subtree; +# class and method names still inherit `CamelCase` from the root config. + +InheritParentConfig: true + +CheckOptions: + - {key: readability-identifier-naming.FunctionCase, value: lower_case} diff --git a/src/native/ascend/custom/CMakeLists.txt b/src/native/ascend/custom/CMakeLists.txt new file mode 100644 index 000000000..ca6e6883f --- /dev/null +++ b/src/native/ascend/custom/CMakeLists.txt @@ -0,0 +1,97 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# Copyright (c) 2025 InfiniTensor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Adapted from https://github.com/vllm-project/vllm-ascend/blob/main/csrc/CMakeLists.txt + +cmake_minimum_required(VERSION 3.20 FATAL_ERROR) +project(ascend-kernel LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE RELEASE) +endif() + +add_compile_options(-Wunused-value -Wcast-align -Wcast-qual -Wwrite-strings + -Wsign-compare -Wextra) + +if(${CMAKE_BUILD_TYPE} MATCHES "RELEASE") + add_compile_options(-O3 -fvisibility=hidden -fvisibility-inlines-hidden + -fstack-protector-strong -fPIE -fPIC) + message(STATUS "Build type set to `RELEASE`.") +else() + add_compile_options(-g -rdynamic) +endif() + +set(PROJECT_OP_SRC_BASE ${PROJECT_SOURCE_DIR}) +set(PROJECT_BUILD_PATH ${PROJECT_SOURCE_DIR}/build) +set(PROJECT_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/output) + +include(cmake/config_envs.cmake) +include(cmake/config_ascend.cmake) + +find_program(CCACHE_PROGRAM ccache) +if(CCACHE_PROGRAM) + message(STATUS "Found `ccache`: ${CCACHE_PROGRAM}.") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") + set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") +endif() + +# Shared library output location. +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_OUTPUT_PATH}) + +# Host-side files. +file(GLOB OP_SRCS + ${PROJECT_OP_SRC_BASE}/torch_binding.cpp + ${PROJECT_OP_SRC_BASE}/rms_norm/op_host/rms_norm.cpp +) + +# Shared library name — consumed by `kernel_custom.h` variants and by the +# Python side via `import ascend_kernel`. +set(OP_PLUGIN_NAME ascend_kernel) + +# Kernel-side files (device code compiled by the `AscendC` toolchain). +ascendc_library(no_workspace_kernel STATIC + ${PROJECT_OP_SRC_BASE}/rms_norm/op_kernel/rms_norm.cpp +) + +# Create the shared library `libascend_kernel.so`. +add_library(${OP_PLUGIN_NAME} SHARED ${OP_SRCS}) + +target_link_libraries(${OP_PLUGIN_NAME} PRIVATE + no_workspace_kernel + torch_npu + ascendcl + tiling_api + nnopbase + opapi + register + platform + ascendalog + dl +) + +target_link_directories(${OP_PLUGIN_NAME} PRIVATE + ${TORCH_DIR}/lib + ${TORCH_NPU_DIR}/lib +) + +target_include_directories(${OP_PLUGIN_NAME} PRIVATE + ${PROJECT_OP_SRC_BASE}/utils + ${PROJECT_SOURCE_DIR}/include + ${TORCH_DIR}/include + ${TORCH_DIR}/include/torch/csrc/api/include + ${TORCH_NPU_DIR}/include/third_party/acl/inc + ${TORCH_NPU_DIR}/include/third_party/hccl/inc + ${TORCH_NPU_DIR}/include + ${PYTHON_INCLUDE_DIR} + ${ASCEND_INCLUDE_DIR}/external + ${ASCEND_INCLUDE_DIR}/experiment/platform + ${ASCEND_INCLUDE_DIR}/experiment/runtime +) diff --git a/src/native/ascend/custom/add_rms_norm/CMakeLists.txt b/src/native/ascend/custom/add_rms_norm/CMakeLists.txt new file mode 100644 index 000000000..1748afc06 --- /dev/null +++ b/src/native/ascend/custom/add_rms_norm/CMakeLists.txt @@ -0,0 +1 @@ +ascendc_add_operator(OP_NAME add_rms_norm) diff --git a/src/native/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp b/src/native/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp new file mode 100644 index 000000000..b8e0d504b --- /dev/null +++ b/src/native/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp @@ -0,0 +1,134 @@ +#include "aclrtlaunch_add_rms_norm.h" +#include "tiling/platform/platform_ascendc.h" +#include "torch_kernel_helper.h" + +namespace ascend::detail { + +std::vector AddRmsNorm(const at::Tensor& x1, const at::Tensor& x2, + const at::Tensor& weight, double eps) { + // Input validation. + TORCH_CHECK(x1.dim() > 0, + "`AddRmsNorm`: `x1` must have at least 1 dimension."); + TORCH_CHECK(x1.sizes() == x2.sizes(), + "`AddRmsNorm`: `x1` and `x2` must have the same shape."); + TORCH_CHECK(x1.scalar_type() == x2.scalar_type(), + "`AddRmsNorm`: `x1` and `x2` must have the same dtype."); + TORCH_CHECK(x1.scalar_type() == at::kHalf || x1.scalar_type() == at::kFloat, + "`AddRmsNorm`: only `float16` and `float32` are supported; got ", + x1.scalar_type(), "."); + TORCH_CHECK(weight.dim() == 1, + "`AddRmsNorm`: `weight` must be 1-dimensional."); + TORCH_CHECK(weight.size(0) == x1.size(-1), "`AddRmsNorm`: `weight` size (", + weight.size(0), ") must match input last dim (", x1.size(-1), + ")."); + + int64_t dim_length = x1.size(-1); + int64_t total_rows = x1.numel() / dim_length; + + if (total_rows == 0 || dim_length == 0) { + return {at::empty_like(x1), at::empty_like(x1)}; + } + + at::Tensor inp1 = x1.contiguous(); + at::Tensor inp2 = x2.contiguous(); + int64_t dtype_size = inp1.element_size(); + + // Hardware parameters. + auto ascendc_platform = + platform_ascendc::PlatformAscendCManager::GetInstance(); + int64_t core_num = static_cast(ascendc_platform->GetCoreNumAiv()); + uint64_t ub_size; + ascendc_platform->GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub_size); + int64_t ub_size_limit = static_cast(ub_size); + + // Alignment (32-byte boundary). + int64_t align_elements = 32 / dtype_size; + int64_t dim_length_align = + ((dim_length + align_elements - 1) / align_elements) * align_elements; + + // UB capacity check. + // + // - `fp16`: `inQ_x1` (*2*2) + `inQ_x2` (*2*2) + `outQ_y` (*2*2) + + // `outQ_xout` (*2*2) + `fp32Buf1` (*4) + `fp32Buf2` (*4) + + // `weight` (*4) = 16 + 12 = 28 + // - `fp32`: `inQ_x1` (*2*4) + `inQ_x2` (*2*4) + `outQ_y` (*2*4) + + // `outQ_xout` (*2*4) + `weight` (*4) = 32 + 4 = 36 + int64_t buffer_coefficient = (dtype_size == 2) ? 28 : 36; + int64_t max_dim_length = (ub_size_limit - 1024) / buffer_coefficient; + int64_t fp_align_elements = 32 / 4; + max_dim_length = (max_dim_length / fp_align_elements) * fp_align_elements; + TORCH_CHECK(dim_length_align <= max_dim_length, + "`AddRmsNorm`: `dim_length` ", dim_length, " (aligned ", + dim_length_align, ") exceeds UB capacity (max ", max_dim_length, + ")."); + + // Padding. + at::Tensor kernel_input1; + at::Tensor kernel_input2; + + if (dim_length != dim_length_align) { + kernel_input1 = inp1.reshape({total_rows, dim_length}); + kernel_input1 = at::constant_pad_nd( + kernel_input1, {0, dim_length_align - dim_length}, 0.0); + kernel_input1 = kernel_input1.contiguous(); + + kernel_input2 = inp2.reshape({total_rows, dim_length}); + kernel_input2 = at::constant_pad_nd( + kernel_input2, {0, dim_length_align - dim_length}, 0.0); + kernel_input2 = kernel_input2.contiguous(); + } else { + kernel_input1 = inp1.reshape({total_rows, dim_length_align}).contiguous(); + kernel_input2 = inp2.reshape({total_rows, dim_length_align}).contiguous(); + } + + at::Tensor kernel_output_y = at::empty_like(kernel_input1); + at::Tensor kernel_output_x_out = at::empty_like(kernel_input1); + + // Weight: always pass as fp32, padded to `dim_length_align`. + at::Tensor weight_float = weight.contiguous().to(at::kFloat); + + if (dim_length != dim_length_align) { + weight_float = at::constant_pad_nd( + weight_float, {0, dim_length_align - dim_length}, 0.0); + } + + weight_float = weight_float.contiguous(); + + // Block-level tiling (distribute rows across cores). + int64_t used_core_num = std::min(total_rows, core_num); + int64_t former_length = (total_rows + used_core_num - 1) / used_core_num; + int64_t tail_length = former_length - 1; + int64_t former_num = total_rows - tail_length * used_core_num; + uint32_t block_dim = static_cast(used_core_num); + + // All `EXEC_KERNEL_CMD` args must be lvalues. + float eps_float = static_cast(eps); + int64_t dtype_size_val = dtype_size; + + // The first arg `add_rms_norm` is the AscendC kernel entry-point name — it + // must match `ascendc_add_operator(OP_NAME add_rms_norm)` in `CMakeLists.txt`, + // the `__global__ __aicore__ void add_rms_norm(...)` definition in + // `op_kernel/`, and the generated `aclrtlaunch_add_rms_norm.h` header. + // Google C++ Style's PascalCase rule does NOT apply: this identifier is + // dictated by the AscendC toolchain's symbol convention. + EXEC_KERNEL_CMD(add_rms_norm, block_dim, kernel_input1, kernel_input2, + weight_float, kernel_output_y, kernel_output_x_out, + total_rows, dim_length, dim_length_align, former_num, + former_length, tail_length, eps_float, dtype_size_val); + + // Remove padding and reshape back to original shape. + at::Tensor output_y = kernel_output_y; + at::Tensor output_x_out = kernel_output_x_out; + + if (dim_length != dim_length_align) { + output_y = output_y.narrow(-1, 0, dim_length).contiguous(); + output_x_out = output_x_out.narrow(-1, 0, dim_length).contiguous(); + } + + output_y = output_y.reshape(x1.sizes()); + output_x_out = output_x_out.reshape(x1.sizes()); + + return {output_y, output_x_out}; +} + +} // namespace ascend::detail diff --git a/src/native/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp b/src/native/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp new file mode 100644 index 000000000..e2a08e555 --- /dev/null +++ b/src/native/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp @@ -0,0 +1,249 @@ +#include "kernel_operator.h" + +constexpr int32_t BUFFER_NUM = 2; + +template +class KernelAddRmsNorm { + public: + __aicore__ inline KernelAddRmsNorm() {} + + __aicore__ inline void Init(GM_ADDR x1, GM_ADDR x2, GM_ADDR weight, GM_ADDR y, + GM_ADDR x_out, int64_t totalRows, + int64_t dimLength, int64_t dimLengthAlign, + int64_t formerNum, int64_t formerLength, + int64_t tailLength, float eps) { + this->dimLength = dimLength; + this->dimLengthAlign = dimLengthAlign; + this->eps = eps; + + // Block-level tiling: determine row range for this core. + int64_t blockIdx = AscendC::GetBlockIdx(); + int64_t rowOffset; + + if (blockIdx < formerNum) { + this->blockRows = formerLength; + rowOffset = formerLength * blockIdx; + } else { + this->blockRows = tailLength; + int64_t tailIdx = blockIdx - formerNum; + rowOffset = formerLength * formerNum + tailLength * tailIdx; + } + + // Global memory pointers. + x1Gm.SetGlobalBuffer((__gm__ T*)x1 + rowOffset * dimLengthAlign, + this->blockRows * dimLengthAlign); + x2Gm.SetGlobalBuffer((__gm__ T*)x2 + rowOffset * dimLengthAlign, + this->blockRows * dimLengthAlign); + yGm.SetGlobalBuffer((__gm__ T*)y + rowOffset * dimLengthAlign, + this->blockRows * dimLengthAlign); + xOutGm.SetGlobalBuffer((__gm__ T*)x_out + rowOffset * dimLengthAlign, + this->blockRows * dimLengthAlign); + weightGm.SetGlobalBuffer((__gm__ float*)weight, dimLengthAlign); + + int32_t dimLenAlign = static_cast(this->dimLengthAlign); + + // I/O queues (double-buffered). + pipe.InitBuffer(inQueueX1, BUFFER_NUM, + dimLenAlign * static_cast(sizeof(T))); + pipe.InitBuffer(inQueueX2, BUFFER_NUM, + dimLenAlign * static_cast(sizeof(T))); + pipe.InitBuffer(outQueueY, BUFFER_NUM, + dimLenAlign * static_cast(sizeof(T))); + pipe.InitBuffer(outQueueXOut, BUFFER_NUM, + dimLenAlign * static_cast(sizeof(T))); + + // Weight buffer (fp32, loaded once, reused for all rows). + pipe.InitBuffer(weightBuf, + dimLenAlign * static_cast(sizeof(float))); + + // FP16 path needs extra fp32 compute buffers. + // buf1: holds x_out in fp32 (reused from x1_fp32 after Add). + // buf2: holds x2_fp32 initially, then x_out^2, then final result. + if constexpr (sizeof(T) == 2) { + pipe.InitBuffer(fp32Buf1, + dimLenAlign * static_cast(sizeof(float))); + pipe.InitBuffer(fp32Buf2, + dimLenAlign * static_cast(sizeof(float))); + } + + // ReduceSum temporary buffer (size per API formula). + constexpr int32_t ELEMS_PER_REPEAT = 256 / sizeof(float); + constexpr int32_t ELEMS_PER_BLOCK = 32 / sizeof(float); + int32_t firstMaxRepeat = + (dimLenAlign + ELEMS_PER_REPEAT - 1) / ELEMS_PER_REPEAT; + int32_t reduceTmpSize = + ((firstMaxRepeat + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK) * + ELEMS_PER_BLOCK; + pipe.InitBuffer(reduceTmpBuf, + reduceTmpSize * static_cast(sizeof(float))); + + // Scalar buffer for reduction result (8 floats = 32 bytes). + pipe.InitBuffer(sumBuf, 32); + + // Load weight (fp32) from GM into `weightBuf`. + AscendC::LocalTensor wLocal = weightBuf.Get(); + AscendC::DataCopyExtParams wParams{ + 1, static_cast(dimLenAlign * sizeof(float)), 0, 0, 0}; + AscendC::DataCopyPadExtParams wPad{false, 0, 0, 0.0f}; + AscendC::DataCopyPad(wLocal, weightGm, wParams, wPad); + + // Ensure weight DMA completes before compute. + AscendC::PipeBarrier(); + } + + __aicore__ inline void Process() { + for (int64_t row = 0; row < this->blockRows; ++row) { + CopyIn(row); + Compute(row); + CopyOut(row); + } + } + + private: + __aicore__ inline void CopyIn(int64_t row) { + AscendC::LocalTensor x1Local = inQueueX1.AllocTensor(); + AscendC::LocalTensor x2Local = inQueueX2.AllocTensor(); + AscendC::DataCopyExtParams params{ + 1, static_cast(this->dimLengthAlign * sizeof(T)), 0, 0, 0}; + AscendC::DataCopyPadExtParams pad{false, 0, 0, static_cast(0)}; + AscendC::DataCopyPad(x1Local, x1Gm[row * this->dimLengthAlign], params, + pad); + AscendC::DataCopyPad(x2Local, x2Gm[row * this->dimLengthAlign], params, + pad); + inQueueX1.EnQue(x1Local); + inQueueX2.EnQue(x2Local); + } + + __aicore__ inline void Compute(int64_t row) { + AscendC::LocalTensor x1Local = inQueueX1.DeQue(); + AscendC::LocalTensor x2Local = inQueueX2.DeQue(); + AscendC::LocalTensor yLocal = outQueueY.AllocTensor(); + AscendC::LocalTensor xOutLocal = outQueueXOut.AllocTensor(); + + AscendC::LocalTensor wLocal = weightBuf.Get(); + AscendC::LocalTensor rTmp = reduceTmpBuf.Get(); + AscendC::LocalTensor sLocal = sumBuf.Get(); + + int32_t dimLen = static_cast(this->dimLength); + int32_t dimLenAlign = static_cast(this->dimLengthAlign); + + if constexpr (sizeof(T) == 4) { + // ---- FP32 path: compute directly. ---- + + // Step 1: x_out = x1 + x2. + AscendC::Add(xOutLocal, x1Local, x2Local, dimLenAlign); + + // Step 2: x_out^2 into yLocal (reuse output buffer temporarily). + AscendC::Mul(yLocal, xOutLocal, xOutLocal, dimLenAlign); + + // Step 3: ReduceSum(x_out^2) -> sLocal[0]. + // ReduceSum may modify yLocal, but we overwrite it below. + AscendC::ReduceSum(sLocal, yLocal, rTmp, dimLenAlign); + + // Step 4-5: scale = 1 / sqrt(mean(x_out^2) + eps). + float sumVal = sLocal.GetValue(0); + float meanVal = sumVal / static_cast(dimLen) + this->eps; + sLocal.SetValue(0, meanVal); + AscendC::Sqrt(sLocal, sLocal, 8); + float scale = 1.0f / sLocal.GetValue(0); + + // Step 6: y = x_out * scale. + AscendC::Muls(yLocal, xOutLocal, scale, dimLenAlign); + + // Step 7: y = y * weight. + AscendC::Mul(yLocal, yLocal, wLocal, dimLenAlign); + + } else { + // ---- FP16 path: cast → fp32 compute → cast back. ---- + AscendC::LocalTensor b1 = fp32Buf1.Get(); + AscendC::LocalTensor b2 = fp32Buf2.Get(); + + // Cast inputs fp16 → fp32. + AscendC::Cast(b1, x1Local, AscendC::RoundMode::CAST_NONE, dimLenAlign); + AscendC::Cast(b2, x2Local, AscendC::RoundMode::CAST_NONE, dimLenAlign); + + // Step 1: x_out = x1 + x2 (fp32), stored in b1. + AscendC::Add(b1, b1, b2, dimLenAlign); + + // Cast x_out fp32 → fp16 for the x_out output. + AscendC::Cast(xOutLocal, b1, AscendC::RoundMode::CAST_ROUND, dimLenAlign); + + // Step 2: x_out^2 in fp32, stored in b2. + AscendC::Mul(b2, b1, b1, dimLenAlign); + + // Step 3: ReduceSum(x_out^2) -> sLocal[0]. + AscendC::ReduceSum(sLocal, b2, rTmp, dimLenAlign); + + // Step 4-5: scale = 1 / sqrt(mean(x_out^2) + eps). + float sumVal = sLocal.GetValue(0); + float meanVal = sumVal / static_cast(dimLen) + this->eps; + sLocal.SetValue(0, meanVal); + AscendC::Sqrt(sLocal, sLocal, 8); + float scale = 1.0f / sLocal.GetValue(0); + + // Step 6: y = x_out * scale (fp32), reuse b2. + AscendC::Muls(b2, b1, scale, dimLenAlign); + + // Step 7: y = y * weight (fp32). + AscendC::Mul(b2, b2, wLocal, dimLenAlign); + + // Cast result fp32 → fp16. + AscendC::Cast(yLocal, b2, AscendC::RoundMode::CAST_ROUND, dimLenAlign); + } + + inQueueX1.FreeTensor(x1Local); + inQueueX2.FreeTensor(x2Local); + outQueueY.EnQue(yLocal); + outQueueXOut.EnQue(xOutLocal); + } + + __aicore__ inline void CopyOut(int64_t row) { + AscendC::LocalTensor yLocal = outQueueY.DeQue(); + AscendC::LocalTensor xOutLocal = outQueueXOut.DeQue(); + AscendC::DataCopyExtParams params{ + 1, static_cast(this->dimLengthAlign * sizeof(T)), 0, 0, 0}; + AscendC::DataCopyPad(yGm[row * this->dimLengthAlign], yLocal, params); + AscendC::DataCopyPad(xOutGm[row * this->dimLengthAlign], xOutLocal, params); + outQueueY.FreeTensor(yLocal); + outQueueXOut.FreeTensor(xOutLocal); + } + + private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX1; + AscendC::TQue inQueueX2; + AscendC::TQue outQueueY; + AscendC::TQue outQueueXOut; + + AscendC::TBuf weightBuf; + AscendC::TBuf fp32Buf1; + AscendC::TBuf fp32Buf2; + AscendC::TBuf reduceTmpBuf; + AscendC::TBuf sumBuf; + + AscendC::GlobalTensor x1Gm, x2Gm, yGm, xOutGm; + AscendC::GlobalTensor weightGm; + + int64_t blockRows; + int64_t dimLength; + int64_t dimLengthAlign; + float eps; +}; + +extern "C" __global__ __aicore__ void add_rms_norm( + GM_ADDR x1, GM_ADDR x2, GM_ADDR weight, GM_ADDR y, GM_ADDR x_out, + int64_t totalRows, int64_t dimLength, int64_t dimLengthAlign, + int64_t formerNum, int64_t formerLength, int64_t tailLength, float eps, + int64_t dtypeSize) { + if (dtypeSize == 2) { + KernelAddRmsNorm op; + op.Init(x1, x2, weight, y, x_out, totalRows, dimLength, dimLengthAlign, + formerNum, formerLength, tailLength, eps); + op.Process(); + } else { + KernelAddRmsNorm op; + op.Init(x1, x2, weight, y, x_out, totalRows, dimLength, dimLengthAlign, + formerNum, formerLength, tailLength, eps); + op.Process(); + } +} diff --git a/src/native/ascend/custom/build.sh b/src/native/ascend/custom/build.sh new file mode 100755 index 000000000..258a88e4b --- /dev/null +++ b/src/native/ascend/custom/build.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Build custom `AscendC` kernels into `libascend_kernel.so`. +set -e + +SOC_VERSION="${1:-Ascend910_9382}" + +# Detect CANN toolkit path. +_CANN_TOOLKIT_INSTALL_PATH=$(grep "Toolkit_InstallPath" /etc/Ascend/ascend_cann_install.info | awk -F'=' '{print $2}') +source "${_CANN_TOOLKIT_INSTALL_PATH}/set_env.sh" +echo "CANN: ${ASCEND_TOOLKIT_HOME}" + +ASCEND_INCLUDE_DIR=${ASCEND_TOOLKIT_HOME}/$(arch)-linux/include +CURRENT_DIR=$(pwd) +OUTPUT_DIR=${CURRENT_DIR}/output +mkdir -p "${OUTPUT_DIR}" + +BUILD_DIR=build +rm -rf "${BUILD_DIR}" +mkdir -p "${BUILD_DIR}" + +cmake \ + -DASCEND_HOME_PATH="${ASCEND_HOME_PATH}" \ + -DASCEND_INCLUDE_DIR="${ASCEND_INCLUDE_DIR}" \ + -DSOC_VERSION="${SOC_VERSION}" \ + -B "${BUILD_DIR}" \ + -S . + +cmake --build "${BUILD_DIR}" -j 16 + +echo "Build complete. Output: ${OUTPUT_DIR}" diff --git a/src/native/ascend/custom/cmake/config_ascend.cmake b/src/native/ascend/custom/cmake/config_ascend.cmake new file mode 100644 index 000000000..1772e9e70 --- /dev/null +++ b/src/native/ascend/custom/cmake/config_ascend.cmake @@ -0,0 +1,40 @@ + +if(DEFINED ASCEND_HOME_PATH) +elseif(DEFINED ENV{ASCEND_HOME_PATH}) + set(ASCEND_HOME_PATH "$ENV{ASCEND_HOME_PATH}" CACHE PATH "ASCEND CANN package installation directory" FORCE) +endif() + +set(ASCEND_CANN_PACKAGE_PATH ${ASCEND_HOME_PATH}) + +# Auto-detect `SOC_VERSION` from `npu-smi info` if not set externally. +# Required by `CANN`'s `ascendc.cmake` for `AscendC` kernel compilation. +if(NOT DEFINED SOC_VERSION OR "${SOC_VERSION}" STREQUAL "") + execute_process( + COMMAND bash -c "npu-smi info 2>/dev/null | awk '/910B|910A|310/ {for (i=1;i<=NF;i++) if ($i ~ /^(910|310)/) {print \"Ascend\" $i; exit}}'" + OUTPUT_VARIABLE _DETECTED_SOC + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(_DETECTED_SOC) + set(SOC_VERSION "${_DETECTED_SOC}" CACHE STRING "Ascend SOC version" FORCE) + else() + set(SOC_VERSION "Ascend910B4" CACHE STRING "Ascend SOC version" FORCE) + endif() + + message(STATUS "SOC_VERSION auto-set to ${SOC_VERSION}") +endif() + +if(EXISTS ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake) +else() + message(FATAL_ERROR "`ascendc_kernel_cmake` does not exist; please check whether the `CANN` package is installed.") +endif() + +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + + +message(STATUS "ASCEND_CANN_PACKAGE_PATH = ${ASCEND_CANN_PACKAGE_PATH}") +message(STATUS "ASCEND_HOME_PATH = ${ASCEND_HOME_PATH}") diff --git a/src/native/ascend/custom/cmake/config_envs.cmake b/src/native/ascend/custom/cmake/config_envs.cmake new file mode 100644 index 000000000..e715bfc7e --- /dev/null +++ b/src/native/ascend/custom/cmake/config_envs.cmake @@ -0,0 +1,83 @@ +# Find the Python binary. +find_program(PYTHON_EXECUTABLE NAMES python3) + +if (NOT EXISTS ${PYTHON_EXECUTABLE}) + message(FATAL_ERROR "`python3` is not found; install Python first.") +endif () + +# Get `torch`, `torch_npu`, and `pybind11` paths via a Python helper. +execute_process( + COMMAND ${PYTHON_EXECUTABLE} "-c" + "import torch; import torch_npu; import os; import pybind11; import sysconfig; +torch_dir = os.path.realpath(os.path.dirname(torch.__file__)); +torch_npu_dir = os.path.realpath(os.path.dirname(torch_npu.__file__)); +pybind11_dir = os.path.realpath(os.path.dirname(pybind11.__file__)); +abi_enabled=torch.compiled_with_cxx11_abi(); +python_include_dir = sysconfig.get_path('include'); +print(torch_dir, torch_npu_dir, pybind11_dir, abi_enabled, python_include_dir, end=''); +quit(0) + " + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE OUTPUT_ENV_DEFINES) + +# Abort if the Python helper failed. +if (NOT ${EXEC_RESULT} EQUAL 0) + message(FATAL_ERROR "Failed to run Python script to probe env vars like `TORCH_DIR`.") +else () + message(STATUS "Python probe succeeded; output string is [${OUTPUT_ENV_DEFINES}].") +endif () + +# Extract `TORCH_DIR`. +execute_process( + COMMAND sh -c "echo \"${OUTPUT_ENV_DEFINES}\" | awk '{print $1}'" + OUTPUT_VARIABLE TORCH_DIR + RESULT_VARIABLE EXEC_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Extract `TORCH_NPU_DIR`. +execute_process( + COMMAND sh -c "echo \"${OUTPUT_ENV_DEFINES}\" | awk '{print $2}'" + OUTPUT_VARIABLE TORCH_NPU_DIR + RESULT_VARIABLE EXEC_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Extract `PYBIND11_DIR`. +execute_process( + COMMAND sh -c "echo \"${OUTPUT_ENV_DEFINES}\" | awk '{print $3}'" + OUTPUT_VARIABLE PYBIND11_DIR + RESULT_VARIABLE EXEC_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Extract the PyTorch C++11 ABI flag. +execute_process( + COMMAND sh -c "echo \"${OUTPUT_ENV_DEFINES}\" | awk '{print $4}'" + OUTPUT_VARIABLE TORCH_API_ENABLED + RESULT_VARIABLE EXEC_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Extract `PYTHON_INCLUDE_DIR`. +execute_process( + COMMAND sh -c "echo \"${OUTPUT_ENV_DEFINES}\" | awk '{print $5}'" + OUTPUT_VARIABLE PYTHON_INCLUDE_DIR + RESULT_VARIABLE EXEC_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +message(STATUS "SOC_VERSION=${SOC_VERSION}") +message(STATUS "TORCH_DIR=${TORCH_DIR}") +message(STATUS "TORCH_NPU_DIR=${TORCH_NPU_DIR}") +message(STATUS "PYBIND11_DIR=${PYBIND11_DIR}") +message(STATUS "PYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIR}") + +# Set `_GLIBCXX_USE_CXX11_ABI` to match the PyTorch build. +if (${TORCH_API_ENABLED} STREQUAL "True") + add_compile_options(-D_GLIBCXX_USE_CXX11_ABI=1) + message(STATUS "_GLIBCXX_USE_CXX11_ABI=1") +else () + add_compile_options(-D_GLIBCXX_USE_CXX11_ABI=0) + message(STATUS "_GLIBCXX_USE_CXX11_ABI=0") +endif () diff --git a/src/native/ascend/custom/ops.h b/src/native/ascend/custom/ops.h new file mode 100644 index 000000000..e9e6ad9d4 --- /dev/null +++ b/src/native/ascend/custom/ops.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * Copyright (c) 2025 InfiniTensor. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * Adapted from https://github.com/vllm-project/vllm-ascend/blob/main/csrc/ops.h + */ + +#ifndef OPS_H +#define OPS_H + +namespace ascend::detail { + +at::Tensor RmsNorm(const at::Tensor& input, const at::Tensor& weight, + double eps); + +} // namespace ascend::detail + +#endif // OPS_H diff --git a/src/native/ascend/custom/rms_norm/CMakeLists.txt b/src/native/ascend/custom/rms_norm/CMakeLists.txt new file mode 100644 index 000000000..94ceabaa6 --- /dev/null +++ b/src/native/ascend/custom/rms_norm/CMakeLists.txt @@ -0,0 +1 @@ +ascendc_add_operator(OP_NAME rms_norm) diff --git a/src/native/ascend/custom/rms_norm/op_host/rms_norm.cpp b/src/native/ascend/custom/rms_norm/op_host/rms_norm.cpp new file mode 100644 index 000000000..eb521c7b5 --- /dev/null +++ b/src/native/ascend/custom/rms_norm/op_host/rms_norm.cpp @@ -0,0 +1,120 @@ +#include "aclrtlaunch_rms_norm.h" +#include "tiling/platform/platform_ascendc.h" +#include "torch_kernel_helper.h" + +namespace ascend::detail { + +at::Tensor RmsNorm(const at::Tensor& input, const at::Tensor& weight, + double eps) { + // Input validation. + TORCH_CHECK(input.dim() > 0, + "`RmsNorm`: `input` must have at least 1 dimension."); + TORCH_CHECK( + input.scalar_type() == at::kHalf || input.scalar_type() == at::kFloat, + "`RmsNorm`: only `float16` and `float32` are supported; got ", + input.scalar_type(), "."); + TORCH_CHECK(weight.dim() == 1, "`RmsNorm`: `weight` must be 1-dimensional."); + TORCH_CHECK(weight.size(0) == input.size(-1), "`RmsNorm`: `weight` size (", + weight.size(0), ") must match input last dim (", input.size(-1), + ")."); + + int64_t dim_length = input.size(-1); + int64_t total_rows = input.numel() / dim_length; + + if (total_rows == 0 || dim_length == 0) { + return at::empty_like(input); + } + + at::Tensor x = input.contiguous(); + int64_t dtype_size = x.element_size(); + + // Hardware parameters. + auto ascendc_platform = + platform_ascendc::PlatformAscendCManager::GetInstance(); + int64_t core_num = static_cast(ascendc_platform->GetCoreNumAiv()); + uint64_t ub_size; + ascendc_platform->GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub_size); + int64_t ub_size_limit = static_cast(ub_size); + + // Alignment (32-byte boundary). + int64_t align_elements = 32 / dtype_size; + int64_t dim_length_align = + ((dim_length + align_elements - 1) / align_elements) * align_elements; + + // UB capacity check. + // + // - `fp32`: `inQ` (*2) + `outQ` (*2) + `weight` = 5 * `dim_length_align` + // * 4 = coeff 20. + // - `fp16`: `inQ` (*2) + `outQ` (*2) + `x_fp32` + `tmp_fp32` + `weight` + // = 2 * `dim_length_align` * 2 * 2 + 3 * `dim_length_align` * 4 = + // 8 + 12 = coeff 20. + int64_t buffer_coefficient = 20; + // Reserve 1024 bytes for reduce buffers. + int64_t max_dim_length = (ub_size_limit - 1024) / buffer_coefficient; + // `fp32` alignment. + int64_t fp_align_elements = 32 / 4; + max_dim_length = (max_dim_length / fp_align_elements) * fp_align_elements; + TORCH_CHECK(dim_length_align <= max_dim_length, + "`RmsNorm`: `dim_length` ", dim_length, " (aligned ", + dim_length_align, ") exceeds UB capacity (max ", max_dim_length, + ")."); + + // Padding. + at::Tensor kernel_input; + + if (dim_length != dim_length_align) { + kernel_input = x.reshape({total_rows, dim_length}); + kernel_input = at::constant_pad_nd( + kernel_input, {0, dim_length_align - dim_length}, 0.0); + kernel_input = kernel_input.contiguous(); + } else { + kernel_input = x.reshape({total_rows, dim_length_align}).contiguous(); + } + + at::Tensor kernel_output = at::empty_like(kernel_input); + + // Weight: always pass as fp32, padded to `dim_length_align`. + at::Tensor weight_float = weight.contiguous().to(at::kFloat); + + if (dim_length != dim_length_align) { + weight_float = at::constant_pad_nd( + weight_float, {0, dim_length_align - dim_length}, 0.0); + } + + weight_float = weight_float.contiguous(); + + // Block-level tiling (distribute rows across cores). + int64_t used_core_num = std::min(total_rows, core_num); + int64_t former_length = (total_rows + used_core_num - 1) / used_core_num; + int64_t tail_length = former_length - 1; + int64_t former_num = total_rows - tail_length * used_core_num; + uint32_t block_dim = static_cast(used_core_num); + + // All `EXEC_KERNEL_CMD` args must be lvalues. + float eps_float = static_cast(eps); + int64_t dtype_size_val = dtype_size; + + // The first arg `rms_norm` is the AscendC kernel entry-point name — it + // must match `ascendc_add_operator(OP_NAME rms_norm)` in `CMakeLists.txt`, + // the `__global__ __aicore__ void rms_norm(...)` definition in `op_kernel/`, + // and the generated `aclrtlaunch_rms_norm.h` header. Google C++ Style's + // PascalCase rule does NOT apply: this identifier is dictated by the + // AscendC toolchain's symbol convention. + EXEC_KERNEL_CMD(rms_norm, block_dim, kernel_input, weight_float, + kernel_output, total_rows, dim_length, dim_length_align, + former_num, former_length, tail_length, eps_float, + dtype_size_val); + + // Remove padding and reshape back to original shape. + at::Tensor output = kernel_output; + + if (dim_length != dim_length_align) { + output = output.narrow(-1, 0, dim_length).contiguous(); + } + + output = output.reshape(input.sizes()); + + return output; +} + +} // namespace ascend::detail diff --git a/src/native/ascend/custom/rms_norm/op_kernel/.clang-tidy b/src/native/ascend/custom/rms_norm/op_kernel/.clang-tidy new file mode 100644 index 000000000..ccf13972c --- /dev/null +++ b/src/native/ascend/custom/rms_norm/op_kernel/.clang-tidy @@ -0,0 +1,9 @@ +--- +# `op_kernel/*.cpp` is `AscendC` device code compiled by `ccec`, not by +# the host toolchain, so it has no entry in `compile_commands.json` and +# `clang-tidy` cannot parse it correctly (the `__aicore__` macro expands +# unexpectedly when `kernel_operator.h` is absent). Disable all checks +# here — the `op_host/` side and the `kernel_custom.h` launcher still +# enforce the full ruleset. + +Checks: '-*' diff --git a/src/native/ascend/custom/rms_norm/op_kernel/rms_norm.cpp b/src/native/ascend/custom/rms_norm/op_kernel/rms_norm.cpp new file mode 100644 index 000000000..5c8f4fc67 --- /dev/null +++ b/src/native/ascend/custom/rms_norm/op_kernel/rms_norm.cpp @@ -0,0 +1,215 @@ +#include "kernel_operator.h" + +constexpr int32_t BUFFER_NUM = 2; + +template +class KernelRmsNorm { + public: + __aicore__ inline KernelRmsNorm() {} + + __aicore__ inline void Init(GM_ADDR x, GM_ADDR weight, GM_ADDR y, + int64_t totalRows, int64_t dimLength, + int64_t dimLengthAlign, int64_t formerNum, + int64_t formerLength, int64_t tailLength, + float eps) { + this->dimLength = dimLength; + this->dimLengthAlign = dimLengthAlign; + this->eps = eps; + + // Block-level tiling: determine row range for this core. + int64_t blockIdx = AscendC::GetBlockIdx(); + int64_t rowOffset; + + if (blockIdx < formerNum) { + this->blockRows = formerLength; + rowOffset = formerLength * blockIdx; + } else { + this->blockRows = tailLength; + int64_t tailIdx = blockIdx - formerNum; + rowOffset = formerLength * formerNum + tailLength * tailIdx; + } + + // Global memory pointers. + xGm.SetGlobalBuffer((__gm__ T*)x + rowOffset * dimLengthAlign, + this->blockRows * dimLengthAlign); + yGm.SetGlobalBuffer((__gm__ T*)y + rowOffset * dimLengthAlign, + this->blockRows * dimLengthAlign); + weightGm.SetGlobalBuffer((__gm__ float*)weight, dimLengthAlign); + + int32_t dimLenAlign = static_cast(this->dimLengthAlign); + + // I/O queues (double-buffered). + pipe.InitBuffer(inQueueX, BUFFER_NUM, + dimLenAlign * static_cast(sizeof(T))); + pipe.InitBuffer(outQueueY, BUFFER_NUM, + dimLenAlign * static_cast(sizeof(T))); + + // Weight buffer (fp32, loaded once, reused for all rows). + pipe.InitBuffer(weightBuf, + dimLenAlign * static_cast(sizeof(float))); + + // FP16 path needs extra fp32 compute buffers. + if constexpr (sizeof(T) == 2) { + pipe.InitBuffer(xFp32Buf, + dimLenAlign * static_cast(sizeof(float))); + pipe.InitBuffer(tmpFp32Buf, + dimLenAlign * static_cast(sizeof(float))); + } + + // ReduceSum temporary buffer (size per API formula). + constexpr int32_t ELEMS_PER_REPEAT = 256 / sizeof(float); + constexpr int32_t ELEMS_PER_BLOCK = 32 / sizeof(float); + int32_t firstMaxRepeat = + (dimLenAlign + ELEMS_PER_REPEAT - 1) / ELEMS_PER_REPEAT; + int32_t reduceTmpSize = + ((firstMaxRepeat + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK) * + ELEMS_PER_BLOCK; + pipe.InitBuffer(reduceTmpBuf, + reduceTmpSize * static_cast(sizeof(float))); + + // Scalar buffer for reduction result (8 floats = 32 bytes). + pipe.InitBuffer(sumBuf, 32); + + // Load weight (fp32) from GM into `weightBuf`. + AscendC::LocalTensor wLocal = weightBuf.Get(); + AscendC::DataCopyExtParams wParams{ + 1, static_cast(dimLenAlign * sizeof(float)), 0, 0, 0}; + AscendC::DataCopyPadExtParams wPad{false, 0, 0, 0.0f}; + AscendC::DataCopyPad(wLocal, weightGm, wParams, wPad); + + // Ensure weight DMA completes before compute. + AscendC::PipeBarrier(); + } + + __aicore__ inline void Process() { + for (int64_t row = 0; row < this->blockRows; ++row) { + CopyIn(row); + Compute(row); + CopyOut(row); + } + } + + private: + __aicore__ inline void CopyIn(int64_t row) { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopyExtParams params{ + 1, static_cast(this->dimLengthAlign * sizeof(T)), 0, 0, 0}; + AscendC::DataCopyPadExtParams pad{false, 0, 0, static_cast(0)}; + AscendC::DataCopyPad(xLocal, xGm[row * this->dimLengthAlign], params, pad); + inQueueX.EnQue(xLocal); + } + + __aicore__ inline void Compute(int64_t row) { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = outQueueY.AllocTensor(); + + AscendC::LocalTensor wLocal = weightBuf.Get(); + AscendC::LocalTensor rTmp = reduceTmpBuf.Get(); + AscendC::LocalTensor sLocal = sumBuf.Get(); + + int32_t dimLen = static_cast(this->dimLength); + int32_t dimLenAlign = static_cast(this->dimLengthAlign); + + if constexpr (sizeof(T) == 4) { + // ---- FP32 path: compute directly. ---- + + // Step 1: x^2 into yLocal (reuse output buffer temporarily). + AscendC::Mul(yLocal, xLocal, xLocal, dimLenAlign); + + // Step 2: ReduceSum(x^2) -> sLocal[0]. + // ReduceSum may modify src (yLocal), but we overwrite it later. + AscendC::ReduceSum(sLocal, yLocal, rTmp, dimLenAlign); + + // Step 3-5: scale = 1 / sqrt(mean(x^2) + eps). + float sumVal = sLocal.GetValue(0); + float meanVal = sumVal / static_cast(dimLen) + this->eps; + sLocal.SetValue(0, meanVal); + AscendC::Sqrt(sLocal, sLocal, 8); + float scale = 1.0f / sLocal.GetValue(0); + + // Step 6: y = x * scale. + AscendC::Muls(yLocal, xLocal, scale, dimLenAlign); + + // Step 7: y = y * weight. + AscendC::Mul(yLocal, yLocal, wLocal, dimLenAlign); + + } else { + // ---- FP16 path: cast → fp32 compute → cast back. ---- + AscendC::LocalTensor xF32 = xFp32Buf.Get(); + AscendC::LocalTensor tmpF32 = tmpFp32Buf.Get(); + + // Cast input fp16 → fp32. + AscendC::Cast(xF32, xLocal, AscendC::RoundMode::CAST_NONE, dimLenAlign); + + // Step 1: x^2 in fp32. + AscendC::Mul(tmpF32, xF32, xF32, dimLenAlign); + + // Step 2: ReduceSum(x^2) -> sLocal[0]. + AscendC::ReduceSum(sLocal, tmpF32, rTmp, dimLenAlign); + + // Step 3-5: scale = 1 / sqrt(mean(x^2) + eps). + float sumVal = sLocal.GetValue(0); + float meanVal = sumVal / static_cast(dimLen) + this->eps; + sLocal.SetValue(0, meanVal); + AscendC::Sqrt(sLocal, sLocal, 8); + float scale = 1.0f / sLocal.GetValue(0); + + // Step 6: y = x * scale (fp32). + AscendC::Muls(tmpF32, xF32, scale, dimLenAlign); + + // Step 7: y = y * weight (fp32). + AscendC::Mul(tmpF32, tmpF32, wLocal, dimLenAlign); + + // Cast result fp32 → fp16. + AscendC::Cast(yLocal, tmpF32, AscendC::RoundMode::CAST_ROUND, + dimLenAlign); + } + + inQueueX.FreeTensor(xLocal); + outQueueY.EnQue(yLocal); + } + + __aicore__ inline void CopyOut(int64_t row) { + AscendC::LocalTensor yLocal = outQueueY.DeQue(); + AscendC::DataCopyExtParams params{ + 1, static_cast(this->dimLengthAlign * sizeof(T)), 0, 0, 0}; + AscendC::DataCopyPad(yGm[row * this->dimLengthAlign], yLocal, params); + outQueueY.FreeTensor(yLocal); + } + + private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueY; + + AscendC::TBuf weightBuf; + AscendC::TBuf xFp32Buf; + AscendC::TBuf tmpFp32Buf; + AscendC::TBuf reduceTmpBuf; + AscendC::TBuf sumBuf; + + AscendC::GlobalTensor xGm, yGm; + AscendC::GlobalTensor weightGm; + + int64_t blockRows; + int64_t dimLength; + int64_t dimLengthAlign; + float eps; +}; + +extern "C" __global__ __aicore__ void rms_norm( + GM_ADDR x, GM_ADDR weight, GM_ADDR y, int64_t totalRows, int64_t dimLength, + int64_t dimLengthAlign, int64_t formerNum, int64_t formerLength, + int64_t tailLength, float eps, int64_t dtypeSize) { + if (dtypeSize == 2) { + KernelRmsNorm op; + op.Init(x, weight, y, totalRows, dimLength, dimLengthAlign, formerNum, + formerLength, tailLength, eps); + op.Process(); + } else { + KernelRmsNorm op; + op.Init(x, weight, y, totalRows, dimLength, dimLengthAlign, formerNum, + formerLength, tailLength, eps); + op.Process(); + } +} diff --git a/src/native/ascend/custom/torch_binding.cpp b/src/native/ascend/custom/torch_binding.cpp new file mode 100644 index 000000000..1d343c064 --- /dev/null +++ b/src/native/ascend/custom/torch_binding.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * Copyright (c) 2025 InfiniTensor. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * Adapted from https://github.com/vllm-project/vllm-ascend/blob/main/csrc/torch_binding.cpp + */ + +#include +#include + +#include "ops.h" + +namespace { +TORCH_LIBRARY_FRAGMENT(npu, m) { + m.def("rms_norm(Tensor input, Tensor weight, float eps=1e-6) -> Tensor"); +} + +TORCH_LIBRARY_IMPL(npu, PrivateUse1, m) { + m.impl("rms_norm", TORCH_FN(ascend::detail::RmsNorm)); +} +} // namespace diff --git a/src/native/ascend/custom/utils/torch_kernel_helper.h b/src/native/ascend/custom/utils/torch_kernel_helper.h new file mode 100644 index 000000000..c4679d1dc --- /dev/null +++ b/src/native/ascend/custom/utils/torch_kernel_helper.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * Copyright (c) 2025 InfiniTensor. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * Adapted from https://github.com/vllm-project/vllm-ascend/tree/main/csrc + */ + +#ifndef TORCH_KERNEL_HELPER_H +#define TORCH_KERNEL_HELPER_H + +#include +#include + +#include "torch_npu/csrc/core/npu/NPUStream.h" +#include "torch_npu/csrc/framework/OpCommand.h" + +namespace ascend::detail { + +#define DEVICE_TYPE c10::DeviceType::PrivateUse1 + +class TorchNpuHelper { + public: + inline static at::Tensor CopyTensorHostToDevice( + const at::Tensor& cpu_tensor) { + at::Tensor cpuPinMemTensor = cpu_tensor.pin_memory(); + int deviceIndex = 0; + c10_npu::GetDevice(&deviceIndex); + return cpuPinMemTensor.to(c10::Device(DEVICE_TYPE, deviceIndex), + cpuPinMemTensor.scalar_type(), true, true); + } + + inline static at::Tensor CopyScalarToDevice(const c10::Scalar& cpu_scalar, + at::ScalarType scalar_data_type) { + return CopyTensorHostToDevice( + scalar_to_tensor(cpu_scalar).to(scalar_data_type)); + } + + inline static void* ConvertType(const at::Tensor& at_tensor) { + return const_cast(at_tensor.data_ptr()); + } + + template + inline static T ConvertType(T value) { + return value; + } + + template + inline static constexpr auto ConvertTypes(Ts&... args) { + return std::make_tuple(ConvertType(args)...); + } +}; +} // namespace ascend::detail + +/** + * @brief Launch real kernel function on NPU + * + * @param kernel_name [in] name of kernel + * @param blockdim [in] dim size of block + */ +#define EXEC_KERNEL_CMD(kernel_name, blockdim, ...) \ + do { \ + auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); \ + auto converted_params = \ + ascend::detail::TorchNpuHelper::ConvertTypes(__VA_ARGS__); \ + auto acl_call = [acl_stream, blockdim, converted_params]() -> int { \ + std::apply( \ + [&](auto&&... params) { \ + ACLRT_LAUNCH_KERNEL(kernel_name) \ + (blockdim, acl_stream, params...); \ + }, \ + converted_params); \ + return 0; \ + }; \ + at_npu::native::OpCommand::RunOpApi(#kernel_name, acl_call); \ + } while (false) + +#endif // TORCH_KERNEL_HELPER_H diff --git a/src/ascend/data_type_.h b/src/native/ascend/data_type_.h similarity index 88% rename from src/ascend/data_type_.h rename to src/native/ascend/data_type_.h index 9026f515f..81253a9b3 100644 --- a/src/ascend/data_type_.h +++ b/src/native/ascend/data_type_.h @@ -4,13 +4,19 @@ #include #include "acl/acl.h" -#include "ascend/device_.h" #include "data_type.h" +#include "native/ascend/device_.h" namespace infini::ops::ascend { inline aclDataType ToAclDtype(DataType dt) { switch (dt) { + case DataType::kFloat16: + return ACL_FLOAT16; + case DataType::kBFloat16: + return ACL_BF16; + case DataType::kFloat32: + return ACL_FLOAT; case DataType::kInt8: return ACL_INT8; case DataType::kInt16: @@ -27,19 +33,13 @@ inline aclDataType ToAclDtype(DataType dt) { return ACL_UINT32; case DataType::kUInt64: return ACL_UINT64; - case DataType::kFloat16: - return ACL_FLOAT16; - case DataType::kBFloat16: - return ACL_BF16; - case DataType::kFloat32: - return ACL_FLOAT; default: - assert(false && "Unsupported dtype for Ascend backend."); + assert(false && "unsupported dtype for Ascend backend"); return ACL_DT_UNDEFINED; } } -// Returns true for integer (signed or unsigned) `DataType` values. +// Returns true for integer (signed or unsigned) DataType values. inline bool IsIntegerDtype(DataType dt) { switch (dt) { case DataType::kInt8: diff --git a/src/ascend/device_.h b/src/native/ascend/device_.h similarity index 100% rename from src/ascend/device_.h rename to src/native/ascend/device_.h diff --git a/src/native/ascend/ops/add/kernel.h b/src/native/ascend/ops/add/kernel.h new file mode 100644 index 000000000..8bf4a2564 --- /dev/null +++ b/src/native/ascend/ops/add/kernel.h @@ -0,0 +1,93 @@ +#ifndef INFINI_OPS_ASCEND_ADD_KERNEL_H_ +#define INFINI_OPS_ASCEND_ADD_KERNEL_H_ + +#include "acl/acl.h" +#include "aclnn/aclnn_base.h" +#include "aclnn_add.h" +#include "base/add.h" +#include "data_type.h" +#include "native/ascend/common.h" +#include "native/ascend/workspace_pool_.h" +#include "operator.h" + +namespace infini::ops { + +template <> +class Operator : public Add { + public: + Operator(const Tensor input, const Tensor other, Tensor out) + : Add(input, other, out), + in_cache_(input), + oth_cache_(other), + out_cache_(out) { + // `aclCreateScalar` stores the pointer rather than copying the value, so + // `alpha_storage_*` must remain alive for the lifetime of `alpha_`. + // The alpha scalar type must match the tensor dtype: use int64 for integer + // dtypes and float for floating-point dtypes. + if (ascend::IsIntegerDtype(input.dtype())) { + alpha_ = aclCreateScalar(&alpha_int_storage_, ACL_INT64); + } else { + alpha_ = aclCreateScalar(&alpha_float_storage_, ACL_FLOAT); + } + } + + ~Operator() { + if (!ascend::IsAclRuntimeAlive()) return; + + // Null cached descriptors — see `AclTensorCache::release()`. The + // descriptors are still referenced by the Repeatable `executor_`, so + // skipping `aclDestroyTensor` (and leaking the executor at shutdown) + // avoids a double-free; see `64c367c`. + in_cache_.release(); + oth_cache_.release(); + out_cache_.release(); + + if (alpha_) aclDestroyScalar(alpha_); + } + + void operator()(const Tensor input, const Tensor other, + Tensor out) const override { + auto stream = static_cast(stream_); + auto t_in = in_cache_.get(const_cast(input.data())); + auto t_oth = oth_cache_.get(const_cast(other.data())); + auto t_out = out_cache_.get(out.data()); + + if (!executor_) { + aclnnAddGetWorkspaceSize(t_in, t_oth, alpha_, t_out, &ws_size_, + &executor_); + aclSetAclOpExecutorRepeatable(executor_); + } else { + aclSetInputTensorAddr(executor_, 0, t_in, + const_cast(input.data())); + aclSetInputTensorAddr(executor_, 1, t_oth, + const_cast(other.data())); + aclSetOutputTensorAddr(executor_, 0, t_out, out.data()); + } + + auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_); + aclnnAdd(arena.buf, ws_size_, executor_, stream); + } + + private: + mutable ascend::AclTensorCache in_cache_; + + mutable ascend::AclTensorCache oth_cache_; + + mutable ascend::AclTensorCache out_cache_; + + mutable aclOpExecutor* executor_ = nullptr; + + mutable uint64_t ws_size_ = 0; + + // Stable address for `aclCreateScalar` (float). + float alpha_float_storage_ = 1.0f; + + // Stable address for `aclCreateScalar` (int). + int64_t alpha_int_storage_ = 1; + + aclScalar* alpha_ = nullptr; +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/ascend/ops/cast/kernel.h b/src/native/ascend/ops/cast/kernel.h new file mode 100644 index 000000000..9fb6e7b5f --- /dev/null +++ b/src/native/ascend/ops/cast/kernel.h @@ -0,0 +1,64 @@ +#ifndef INFINI_OPS_ASCEND_CAST_KERNEL_H_ +#define INFINI_OPS_ASCEND_CAST_KERNEL_H_ + +#include "acl/acl.h" +#include "aclnn/aclnn_base.h" +#include "aclnnop/aclnn_cast.h" +#include "base/cast.h" +#include "native/ascend/common.h" +#include "native/ascend/workspace_pool_.h" +#include "operator.h" + +namespace infini::ops { + +template <> +class Operator : public Cast { + public: + Operator(const Tensor input, Tensor out) + : Cast(input, out), + in_cache_(input), + out_cache_(out), + acl_out_dtype_(ascend::ToAclDtype(out.dtype())) {} + + ~Operator() { + if (!ascend::IsAclRuntimeAlive()) return; + + // Null cached descriptors — see `AclTensorCache::release()`. + in_cache_.release(); + out_cache_.release(); + } + + void operator()(const Tensor input, Tensor out) const override { + auto stream = static_cast(stream_); + auto t_in = in_cache_.get(const_cast(input.data())); + auto t_out = out_cache_.get(out.data()); + + if (!executor_) { + aclnnCastGetWorkspaceSize(t_in, acl_out_dtype_, t_out, &ws_size_, + &executor_); + aclSetAclOpExecutorRepeatable(executor_); + } else { + aclSetInputTensorAddr(executor_, 0, t_in, + const_cast(input.data())); + aclSetOutputTensorAddr(executor_, 0, t_out, out.data()); + } + + auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_); + aclnnCast(arena.buf, ws_size_, executor_, stream); + } + + private: + mutable ascend::AclTensorCache in_cache_; + + mutable ascend::AclTensorCache out_cache_; + + aclDataType acl_out_dtype_; + + mutable aclOpExecutor* executor_ = nullptr; + + mutable uint64_t ws_size_ = 0; +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/ascend/ops/cat/kernel.h b/src/native/ascend/ops/cat/kernel.h new file mode 100644 index 000000000..32948181a --- /dev/null +++ b/src/native/ascend/ops/cat/kernel.h @@ -0,0 +1,105 @@ +#ifndef INFINI_OPS_ASCEND_CAT_KERNEL_H_ +#define INFINI_OPS_ASCEND_CAT_KERNEL_H_ + +#include + +#include "acl/acl.h" +#include "aclnn/acl_meta.h" +#include "aclnn/aclnn_base.h" +#include "aclnnop/aclnn_cat.h" +#include "base/cat.h" +#include "native/ascend/common.h" +#include "native/ascend/workspace_pool_.h" +#include "operator.h" + +namespace infini::ops { + +template <> +class Operator : public Cat { + public: + Operator(const Tensor first_input, std::vector rest_inputs, + int64_t dim, Tensor out) + : Cat(first_input, rest_inputs, dim, out), out_cache_(out) { + // Build `AclTensorCache` for each input tensor. + in_caches_.reserve(input_count_); + in_caches_.emplace_back(first_input); + for (const auto& t : rest_inputs) { + in_caches_.emplace_back(t); + } + } + + ~Operator() { + if (!ascend::IsAclRuntimeAlive()) return; + + // Null cached descriptors — see `AclTensorCache::release()`. The input + // descriptors are referenced by the Repeatable `executor_` via + // `tensor_list_`, so every `in_caches_[i]` must be released alongside + // `out_cache_`; otherwise `~AclTensorCache()` double-frees them when the + // vector destructs. + for (auto& c : in_caches_) { + c.release(); + } + out_cache_.release(); + + if (tensor_list_) aclDestroyTensorList(tensor_list_); + } + + void operator()(const Tensor first_input, std::vector rest_inputs, + int64_t /*dim*/, Tensor out) const override { + auto stream = static_cast(stream_); + + // Collect all input tensors in order. + std::vector inputs; + inputs.reserve(input_count_); + inputs.push_back(&first_input); + for (const auto& t : rest_inputs) { + inputs.push_back(&t); + } + + auto t_out = out_cache_.get(out.data()); + + if (!executor_) { + // First call: create descriptors, tensor list, and executor. + std::vector acl_tensors(input_count_); + for (size_t i = 0; i < input_count_; ++i) { + acl_tensors[i] = + in_caches_[i].get(const_cast(inputs[i]->data())); + } + + tensor_list_ = + aclCreateTensorList(const_cast(acl_tensors.data()), + static_cast(input_count_)); + + aclnnCatGetWorkspaceSize(tensor_list_, dim_, t_out, &ws_size_, + &executor_); + aclSetAclOpExecutorRepeatable(executor_); + } else { + // Subsequent calls: update data pointers on cached descriptors via + // `aclSetRawTensorAddr`. The executor holds references to the same + // `aclTensor*` objects inside `tensor_list_`, so updating their data + // pointers is sufficient — no `aclSetInputTensorAddr` needed. + for (size_t i = 0; i < input_count_; ++i) { + in_caches_[i].get(const_cast(inputs[i]->data())); + } + aclSetOutputTensorAddr(executor_, 0, t_out, out.data()); + } + + auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_); + aclnnCat(arena.buf, ws_size_, executor_, stream); + } + + private: + mutable std::vector in_caches_; + + mutable ascend::AclTensorCache out_cache_; + + mutable aclTensorList* tensor_list_ = nullptr; + + mutable aclOpExecutor* executor_ = nullptr; + + mutable uint64_t ws_size_ = 0; +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/ascend/ops/gemm/kernel.h b/src/native/ascend/ops/gemm/kernel.h new file mode 100644 index 000000000..34644cab7 --- /dev/null +++ b/src/native/ascend/ops/gemm/kernel.h @@ -0,0 +1,109 @@ +#ifndef INFINI_OPS_ASCEND_GEMM_KERNEL_H_ +#define INFINI_OPS_ASCEND_GEMM_KERNEL_H_ + +#include "acl/acl.h" +#include "aclnn/aclnn_base.h" +#include "aclnnop/aclnn_addmm.h" +#include "aclnnop/aclnn_baddbmm.h" +#include "base/gemm.h" +#include "native/ascend/common.h" +#include "native/ascend/workspace_pool_.h" +#include "operator.h" + +namespace infini::ops { + +template <> +class Operator : public Gemm { + public: + Operator(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, std::optional trans_a, + std::optional trans_b, Tensor c) + : Gemm(a, b, alpha, beta, trans_a, trans_b, c), + batched_{batch_count_ > 1}, + alpha_val_{alpha.value_or(1.0f)}, + beta_val_{beta.value_or(1.0f)}, + self_cache_(c), + a_cache_(a, trans_a_), + b_cache_(b, trans_b_), + out_cache_(c) { + alpha_scalar_ = aclCreateScalar(&alpha_val_, ACL_FLOAT); + beta_scalar_ = aclCreateScalar(&beta_val_, ACL_FLOAT); + } + + ~Operator() { + if (!ascend::IsAclRuntimeAlive()) return; + + // Null cached descriptors — see `AclTensorCache::release()`. + self_cache_.release(); + a_cache_.release(); + b_cache_.release(); + out_cache_.release(); + + if (alpha_scalar_) aclDestroyScalar(alpha_scalar_); + if (beta_scalar_) aclDestroyScalar(beta_scalar_); + } + + void operator()(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, std::optional trans_a, + std::optional trans_b, Tensor c) const override { + auto stream = static_cast(stream_); + + auto t_self = self_cache_.get(c.data()); + auto t_a = a_cache_.get(const_cast(a.data())); + auto t_b = b_cache_.get(const_cast(b.data())); + auto t_out = out_cache_.get(c.data()); + + if (!executor_) { + if (batched_) { + aclnnBaddbmmGetWorkspaceSize(t_self, t_a, t_b, beta_scalar_, + alpha_scalar_, t_out, 0, &ws_size_, + &executor_); + } else { + aclnnAddmmGetWorkspaceSize(t_self, t_a, t_b, beta_scalar_, + alpha_scalar_, t_out, 0, &ws_size_, + &executor_); + } + aclSetAclOpExecutorRepeatable(executor_); + } else { + aclSetInputTensorAddr(executor_, 0, t_self, c.data()); + aclSetInputTensorAddr(executor_, 1, t_a, const_cast(a.data())); + aclSetInputTensorAddr(executor_, 2, t_b, const_cast(b.data())); + aclSetOutputTensorAddr(executor_, 0, t_out, c.data()); + } + + auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_); + + if (batched_) { + aclnnBaddbmm(arena.buf, ws_size_, executor_, stream); + } else { + aclnnAddmm(arena.buf, ws_size_, executor_, stream); + } + } + + private: + bool batched_; + + float alpha_val_; + + float beta_val_; + + aclScalar* alpha_scalar_ = nullptr; + + aclScalar* beta_scalar_ = nullptr; + + mutable ascend::AclTensorCache self_cache_; + + mutable ascend::AclTensorCache a_cache_; + + mutable ascend::AclTensorCache b_cache_; + + mutable ascend::AclTensorCache out_cache_; + + mutable aclOpExecutor* executor_ = nullptr; + + mutable uint64_t ws_size_ = 0; +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/ascend/ops/linear/kernel.h b/src/native/ascend/ops/linear/kernel.h new file mode 100644 index 000000000..d6ce2a6da --- /dev/null +++ b/src/native/ascend/ops/linear/kernel.h @@ -0,0 +1,125 @@ +#ifndef INFINI_OPS_ASCEND_LINEAR_KERNEL_H_ +#define INFINI_OPS_ASCEND_LINEAR_KERNEL_H_ + +#include "acl/acl.h" +#include "aclnn/aclnn_base.h" +#include "aclnnop/aclnn_addmm.h" +#include "aclnnop/aclnn_baddbmm.h" +#include "aclnnop/aclnn_matmul.h" +#include "base/linear.h" +#include "native/ascend/common.h" +#include "native/ascend/workspace_pool_.h" +#include "operator.h" + +namespace infini::ops { + +template <> +class Operator : public Linear { + public: + Operator(const Tensor a, const Tensor b, std::optional bias, + bool trans_a, bool trans_b, Tensor out) + : Linear(a, b, bias, trans_a, trans_b, out), + batched_{out.ndim() > 2}, + a_cache_(a, trans_a), + b_cache_(b, trans_b), + out_cache_(out) { + if (has_bias_) { + bias_cache_ = ascend::AclTensorCache(*bias); + alpha_scalar_ = aclCreateScalar(&alpha_storage_, ACL_FLOAT); + beta_scalar_ = aclCreateScalar(&beta_storage_, ACL_FLOAT); + } + } + + ~Operator() { + if (!ascend::IsAclRuntimeAlive()) return; + + // Null cached descriptors — see `AclTensorCache::release()`. + bias_cache_.release(); + a_cache_.release(); + b_cache_.release(); + out_cache_.release(); + + if (alpha_scalar_) aclDestroyScalar(alpha_scalar_); + if (beta_scalar_) aclDestroyScalar(beta_scalar_); + } + + void operator()(const Tensor a, const Tensor b, std::optional bias, + bool trans_a, bool trans_b, Tensor out) const override { + auto stream = static_cast(stream_); + auto t_a = a_cache_.get(const_cast(a.data())); + auto t_b = b_cache_.get(const_cast(b.data())); + auto t_out = out_cache_.get(out.data()); + + if (has_bias_) { + auto t_bias = bias_cache_.get(const_cast(bias->data())); + + if (!executor_) { + if (batched_) { + aclnnBaddbmmGetWorkspaceSize(t_bias, t_a, t_b, beta_scalar_, + alpha_scalar_, t_out, 0, &ws_size_, + &executor_); + } else { + aclnnAddmmGetWorkspaceSize(t_bias, t_a, t_b, beta_scalar_, + alpha_scalar_, t_out, 0, &ws_size_, + &executor_); + } + aclSetAclOpExecutorRepeatable(executor_); + } else { + aclSetInputTensorAddr(executor_, 0, t_bias, + const_cast(bias->data())); + aclSetInputTensorAddr(executor_, 1, t_a, const_cast(a.data())); + aclSetInputTensorAddr(executor_, 2, t_b, const_cast(b.data())); + aclSetOutputTensorAddr(executor_, 0, t_out, out.data()); + } + + auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_); + + if (batched_) { + aclnnBaddbmm(arena.buf, ws_size_, executor_, stream); + } else { + aclnnAddmm(arena.buf, ws_size_, executor_, stream); + } + } else { + if (!executor_) { + int8_t cube_math_type = 1; + aclnnMatmulGetWorkspaceSize(t_a, t_b, t_out, cube_math_type, &ws_size_, + &executor_); + aclSetAclOpExecutorRepeatable(executor_); + } else { + aclSetInputTensorAddr(executor_, 0, t_a, const_cast(a.data())); + aclSetInputTensorAddr(executor_, 1, t_b, const_cast(b.data())); + aclSetOutputTensorAddr(executor_, 0, t_out, out.data()); + } + + auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_); + aclnnMatmul(arena.buf, ws_size_, executor_, stream); + } + } + + private: + bool batched_; + + mutable ascend::AclTensorCache bias_cache_; + + mutable ascend::AclTensorCache a_cache_; + + mutable ascend::AclTensorCache b_cache_; + + mutable ascend::AclTensorCache out_cache_; + + float alpha_storage_ = 1.0f; + + float beta_storage_ = 1.0f; + + aclScalar* alpha_scalar_ = nullptr; + + aclScalar* beta_scalar_ = nullptr; + + mutable aclOpExecutor* executor_ = nullptr; + + mutable uint64_t ws_size_ = 0; +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/ascend/ops/matmul/kernel.h b/src/native/ascend/ops/matmul/kernel.h new file mode 100644 index 000000000..c574e4e55 --- /dev/null +++ b/src/native/ascend/ops/matmul/kernel.h @@ -0,0 +1,68 @@ +#ifndef INFINI_OPS_ASCEND_MATMUL_KERNEL_H_ +#define INFINI_OPS_ASCEND_MATMUL_KERNEL_H_ + +#include "acl/acl.h" +#include "aclnn/aclnn_base.h" +#include "aclnnop/aclnn_matmul.h" +#include "base/matmul.h" +#include "native/ascend/common.h" +#include "native/ascend/workspace_pool_.h" +#include "operator.h" + +namespace infini::ops { + +template <> +class Operator : public Matmul { + public: + Operator(const Tensor a, const Tensor b, Tensor c, bool trans_a, bool trans_b) + : Matmul(a, b, c, trans_a, trans_b), + a_cache_(a, trans_a), + b_cache_(b, trans_b), + out_cache_(c) {} + + ~Operator() { + if (!ascend::IsAclRuntimeAlive()) return; + + // Null cached descriptors — see `AclTensorCache::release()`. + a_cache_.release(); + b_cache_.release(); + out_cache_.release(); + } + + void operator()(const Tensor a, const Tensor b, Tensor c, bool trans_a, + bool trans_b) const override { + auto stream = static_cast(stream_); + auto t_a = a_cache_.get(const_cast(a.data())); + auto t_b = b_cache_.get(const_cast(b.data())); + auto t_out = out_cache_.get(c.data()); + + if (!executor_) { + int8_t cube_math_type = 1; + aclnnMatmulGetWorkspaceSize(t_a, t_b, t_out, cube_math_type, &ws_size_, + &executor_); + aclSetAclOpExecutorRepeatable(executor_); + } else { + aclSetInputTensorAddr(executor_, 0, t_a, const_cast(a.data())); + aclSetInputTensorAddr(executor_, 1, t_b, const_cast(b.data())); + aclSetOutputTensorAddr(executor_, 0, t_out, c.data()); + } + + auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_); + aclnnMatmul(arena.buf, ws_size_, executor_, stream); + } + + private: + mutable ascend::AclTensorCache a_cache_; + + mutable ascend::AclTensorCache b_cache_; + + mutable ascend::AclTensorCache out_cache_; + + mutable aclOpExecutor* executor_ = nullptr; + + mutable uint64_t ws_size_ = 0; +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/ascend/ops/mul/kernel.h b/src/native/ascend/ops/mul/kernel.h new file mode 100644 index 000000000..311b85de8 --- /dev/null +++ b/src/native/ascend/ops/mul/kernel.h @@ -0,0 +1,68 @@ +#ifndef INFINI_OPS_ASCEND_MUL_KERNEL_H_ +#define INFINI_OPS_ASCEND_MUL_KERNEL_H_ + +#include "acl/acl.h" +#include "aclnn/aclnn_base.h" +#include "aclnn_mul.h" +#include "base/mul.h" +#include "native/ascend/common.h" +#include "native/ascend/workspace_pool_.h" +#include "operator.h" + +namespace infini::ops { + +template <> +class Operator : public Mul { + public: + Operator(const Tensor input, const Tensor other, Tensor out) + : Mul(input, other, out), + in_cache_(input), + oth_cache_(other), + out_cache_(out) {} + + ~Operator() { + if (!ascend::IsAclRuntimeAlive()) return; + + // Null cached descriptors — see `AclTensorCache::release()`. + in_cache_.release(); + oth_cache_.release(); + out_cache_.release(); + } + + void operator()(const Tensor input, const Tensor other, + Tensor out) const override { + auto stream = static_cast(stream_); + auto t_in = in_cache_.get(const_cast(input.data())); + auto t_oth = oth_cache_.get(const_cast(other.data())); + auto t_out = out_cache_.get(out.data()); + + if (!executor_) { + aclnnMulGetWorkspaceSize(t_in, t_oth, t_out, &ws_size_, &executor_); + aclSetAclOpExecutorRepeatable(executor_); + } else { + aclSetInputTensorAddr(executor_, 0, t_in, + const_cast(input.data())); + aclSetInputTensorAddr(executor_, 1, t_oth, + const_cast(other.data())); + aclSetOutputTensorAddr(executor_, 0, t_out, out.data()); + } + + auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_); + aclnnMul(arena.buf, ws_size_, executor_, stream); + } + + private: + mutable ascend::AclTensorCache in_cache_; + + mutable ascend::AclTensorCache oth_cache_; + + mutable ascend::AclTensorCache out_cache_; + + mutable aclOpExecutor* executor_ = nullptr; + + mutable uint64_t ws_size_ = 0; +}; + +} // namespace infini::ops + +#endif diff --git a/src/ascend/runtime_.h b/src/native/ascend/runtime_.h similarity index 97% rename from src/ascend/runtime_.h rename to src/native/ascend/runtime_.h index dca74258c..2b9e14136 100644 --- a/src/ascend/runtime_.h +++ b/src/native/ascend/runtime_.h @@ -5,7 +5,7 @@ #include "acl/acl.h" // clang-format on -#include "ascend/device_.h" +#include "native/ascend/device_.h" #include "runtime.h" namespace infini::ops { diff --git a/src/native/ascend/workspace_pool_.h b/src/native/ascend/workspace_pool_.h new file mode 100644 index 000000000..bd3774fab --- /dev/null +++ b/src/native/ascend/workspace_pool_.h @@ -0,0 +1,156 @@ +#ifndef INFINI_OPS_ASCEND_WORKSPACE_POOL__H_ +#define INFINI_OPS_ASCEND_WORKSPACE_POOL__H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +namespace infini::ops::ascend { + +struct WorkspaceArena { + void* buf = nullptr; + + uint64_t capacity = 0; +}; + +class WorkspacePool { + public: + // Ensure the arena for `(stream, slot)` has at least `needed` bytes. + // + // The `slot` parameter defaults to `"workspace"` for backward + // compatibility. Operators needing a separate temp arena pass + // `"temp"`. + WorkspaceArena& Ensure(aclrtStream stream, uint64_t needed, + const char* slot = "workspace") { + // Thread-local fast path: a small flat array of recently used + // `(stream, slot, arena*)` triples. In practice operators use at + // most 2-3 slots, so linear scan is sufficient — no heap + // allocation on the hot path. + struct CacheEntry { + aclrtStream stream = nullptr; + const char* slot = nullptr; + WorkspaceArena* arena = nullptr; + }; + static constexpr int kCacheSize = 4; + thread_local CacheEntry cache[kCacheSize] = {}; + + for (int i = 0; i < kCacheSize; ++i) { + auto& e = cache[i]; + + if (e.stream == stream && e.slot != nullptr && + std::strcmp(e.slot, slot) == 0 && e.arena != nullptr && + needed <= e.arena->capacity) { + return *e.arena; + } + } + + // Slow path: look up arena in the map under lock. + assert(!capturing_ && + "`WorkspacePool`: `aclrtMalloc` on slow path during graph " + "capture. Ensure all operators run at least once during " + "eager warmup."); + + std::lock_guard lock(mutex_); + + SlotKey key{stream, slot}; + auto& owned = arenas_[key]; + + if (!owned) { + owned = std::make_unique(); + } + + auto* arena = owned.get(); + + if (needed > arena->capacity) { + if (arena->capacity > 0) { + aclrtSynchronizeStream(stream); + aclrtFree(arena->buf); + } + + if (needed > 0) { + auto ret = aclrtMalloc(&arena->buf, needed, ACL_MEM_MALLOC_NORMAL_ONLY); + assert(ret == ACL_SUCCESS && "`WorkspacePool`: `aclrtMalloc` failed"); + } + + arena->capacity = needed; + } + + // Insert into the thread-local cache (evict oldest). + for (int i = kCacheSize - 1; i > 0; --i) { + cache[i] = cache[i - 1]; + } + cache[0] = {stream, slot, arena}; + + return *arena; + } + + // Set to true before NPUGraph capture, false after. When true, + // the slow path (which calls `aclrtMalloc`) triggers an assert + // failure — a safety net against accidental device allocations + // being recorded into the graph. + void set_capture_mode(bool capturing) { capturing_ = capturing; } + + ~WorkspacePool() { + for (auto& [key, arena] : arenas_) { + if (arena && arena->capacity > 0) { + // The CANN runtime may already be torn down when this static + // destructor runs. `aclrtGetDevice` fails in that case — + // skip the free to avoid glibc "double free" abort. + int32_t dev_id = -1; + + if (aclrtGetDevice(&dev_id) == ACL_SUCCESS) { + aclrtFree(arena->buf); + } else { + fprintf(stderr, + "[InfiniOps] `WorkspacePool`: CANN runtime already " + "finalized, skipping `aclrtFree` (%" PRIu64 + " bytes leaked).\n", + arena->capacity); + } + } + } + } + + private: + struct SlotKey { + aclrtStream stream; + std::string slot; + + bool operator==(const SlotKey& o) const { + return stream == o.stream && slot == o.slot; + } + }; + + struct SlotKeyHash { + size_t operator()(const SlotKey& k) const { + auto h1 = std::hash{}(static_cast(k.stream)); + auto h2 = std::hash{}(k.slot); + + return h1 ^ (h2 << 1); + } + }; + + std::unordered_map, SlotKeyHash> + arenas_; + + std::mutex mutex_; + + bool capturing_ = false; +}; + +inline WorkspacePool& GetWorkspacePool() { + static WorkspacePool pool; + + return pool; +} + +} // namespace infini::ops::ascend + +#endif diff --git a/src/cambricon/common.h b/src/native/cambricon/common.h similarity index 100% rename from src/cambricon/common.h rename to src/native/cambricon/common.h diff --git a/src/cambricon/data_type_.h b/src/native/cambricon/data_type_.h similarity index 91% rename from src/cambricon/data_type_.h rename to src/native/cambricon/data_type_.h index 1efa09a21..f4ca82da8 100644 --- a/src/cambricon/data_type_.h +++ b/src/native/cambricon/data_type_.h @@ -3,8 +3,8 @@ #include "bang_bf16.h" #include "bang_fp16.h" -#include "cambricon/device_.h" #include "data_type.h" +#include "native/cambricon/device_.h" namespace infini::ops { diff --git a/src/cambricon/device_.h b/src/native/cambricon/device_.h similarity index 100% rename from src/cambricon/device_.h rename to src/native/cambricon/device_.h diff --git a/src/cambricon/gemm/cnblas.h b/src/native/cambricon/ops/gemm/cnblas.h similarity index 99% rename from src/cambricon/gemm/cnblas.h rename to src/native/cambricon/ops/gemm/cnblas.h index ac95bd568..42248d4fa 100644 --- a/src/cambricon/gemm/cnblas.h +++ b/src/native/cambricon/ops/gemm/cnblas.h @@ -11,7 +11,7 @@ // clang-format on #include "base/gemm.h" -#include "cambricon/common.h" +#include "native/cambricon/common.h" namespace infini::ops { diff --git a/src/cambricon/rms_norm/kernel.mlu b/src/native/cambricon/ops/rms_norm/kernel.mlu similarity index 100% rename from src/cambricon/rms_norm/kernel.mlu rename to src/native/cambricon/ops/rms_norm/kernel.mlu diff --git a/src/cambricon/rms_norm/rms_norm.h b/src/native/cambricon/ops/rms_norm/rms_norm.h similarity index 96% rename from src/cambricon/rms_norm/rms_norm.h rename to src/native/cambricon/ops/rms_norm/rms_norm.h index 61906f730..6a9aed098 100644 --- a/src/cambricon/rms_norm/rms_norm.h +++ b/src/native/cambricon/ops/rms_norm/rms_norm.h @@ -6,8 +6,8 @@ #include #include "base/rms_norm.h" -#include "cambricon/common.h" -#include "cambricon/data_type_.h" +#include "native/cambricon/common.h" +#include "native/cambricon/data_type_.h" namespace infini::ops { diff --git a/src/cambricon/runtime_.h b/src/native/cambricon/runtime_.h similarity index 95% rename from src/cambricon/runtime_.h rename to src/native/cambricon/runtime_.h index f260c0c7a..7ff30fe9c 100644 --- a/src/cambricon/runtime_.h +++ b/src/native/cambricon/runtime_.h @@ -3,7 +3,7 @@ #include -#include "cambricon/device_.h" +#include "native/cambricon/device_.h" #include "runtime.h" namespace infini::ops { diff --git a/src/cpu/caster_.h b/src/native/cpu/caster_.h similarity index 98% rename from src/cpu/caster_.h rename to src/native/cpu/caster_.h index ab3afee45..7da1bf365 100644 --- a/src/cpu/caster_.h +++ b/src/native/cpu/caster_.h @@ -4,7 +4,7 @@ #include #include "caster.h" -#include "cpu/data_type_.h" +#include "native/cpu/data_type_.h" namespace infini::ops { diff --git a/src/cpu/data_type_.h b/src/native/cpu/data_type_.h similarity index 91% rename from src/cpu/data_type_.h rename to src/native/cpu/data_type_.h index f1195aac8..36231db51 100644 --- a/src/cpu/data_type_.h +++ b/src/native/cpu/data_type_.h @@ -1,8 +1,8 @@ #ifndef INFINI_OPS_CPU_DATA_TYPE__H_ #define INFINI_OPS_CPU_DATA_TYPE__H_ -#include "cpu/device_.h" #include "data_type.h" +#include "native/cpu/device_.h" namespace infini::ops { diff --git a/src/cpu/device_.h b/src/native/cpu/device_.h similarity index 100% rename from src/cpu/device_.h rename to src/native/cpu/device_.h diff --git a/src/cpu/add/add.h b/src/native/cpu/ops/add/add.h similarity index 98% rename from src/cpu/add/add.h rename to src/native/cpu/ops/add/add.h index c56d31f4f..8580c6df0 100644 --- a/src/cpu/add/add.h +++ b/src/native/cpu/ops/add/add.h @@ -5,7 +5,7 @@ #include "base/add.h" #include "common/generic_utils.h" -#include "cpu/caster_.h" +#include "native/cpu/caster_.h" namespace infini::ops { diff --git a/src/native/cpu/ops/cast/cast.h b/src/native/cpu/ops/cast/cast.h new file mode 100644 index 000000000..6e570d664 --- /dev/null +++ b/src/native/cpu/ops/cast/cast.h @@ -0,0 +1,52 @@ +#ifndef INFINI_OPS_CPU_CAST_CAST_H_ +#define INFINI_OPS_CPU_CAST_CAST_H_ + +#include "base/cast.h" +#include "common/generic_utils.h" +#include "native/cpu/caster_.h" + +namespace infini::ops { + +template <> +class Operator : public Cast { + public: + Operator(const Tensor input, Tensor out) : Cast{input, out} {} + + void operator()(const Tensor input, Tensor out) const override { + DispatchFunc( + {input_dtype_, out_dtype_}, + [&](auto in_tag, auto out_tag) { + using InT = typename decltype(in_tag)::type; + using OutT = typename decltype(out_tag)::type; + Compute(input, out); + }, + "`Operator::operator()`"); + } + + private: + template + void Compute(const Tensor input, Tensor out) const { + const auto* in_ptr = static_cast(input.data()); + auto* out_ptr = static_cast(out.data()); + + auto get_idx = [&](Tensor::Size i, bool is_contig, const auto* shape, + const auto* strides) { + return is_contig ? i : utils::IndexToOffset(i, ndim_, shape, strides); + }; + +#pragma omp parallel for + for (Tensor::Size i = 0; i < output_size_; ++i) { + auto in_idx = get_idx(i, is_input_contiguous_, input_shape_.data(), + input_strides_.data()); + auto out_idx = get_idx(i, is_out_contiguous_, out_shape_.data(), + out_strides_.data()); + + out_ptr[out_idx] = + Caster::template Cast(in_ptr[in_idx]); + } + } +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/cpu/ops/cat/cat.h b/src/native/cpu/ops/cat/cat.h new file mode 100644 index 000000000..a2f600e5c --- /dev/null +++ b/src/native/cpu/ops/cat/cat.h @@ -0,0 +1,71 @@ +#ifndef INFINI_OPS_CPU_CAT_CAT_H_ +#define INFINI_OPS_CPU_CAT_CAT_H_ + +#include +#include + +#include "base/cat.h" +#include "native/cpu/caster_.h" + +namespace infini::ops { + +template <> +class Operator : public Cat { + public: + Operator(const Tensor first_input, std::vector rest_inputs, + int64_t dim, Tensor out) + : Cat{first_input, rest_inputs, dim, out} {} + + void operator()(const Tensor first_input, std::vector rest_inputs, + int64_t /*dim*/, Tensor out) const override { + // Collect all input tensors. + std::vector inputs; + inputs.reserve(input_count_); + inputs.push_back(&first_input); + for (const auto& t : rest_inputs) { + inputs.push_back(&t); + } + + // Use normalized `dim_` from base class (handles negative dim). + auto dim = dim_; + auto elem_size = kDataTypeToSize.at(out.dtype()); + auto ndim = out.ndim(); + auto out_shape = out.shape(); + + // Compute outer and inner sizes relative to the cat dimension. + Tensor::Size outer = 1; + for (int64_t i = 0; i < dim; ++i) { + outer *= out_shape[i]; + } + + Tensor::Size inner = 1; + for (size_t i = static_cast(dim) + 1; i < ndim; ++i) { + inner *= out_shape[i]; + } + + auto* out_ptr = static_cast(out.data()); + Tensor::Size out_dim_size = out_shape[dim]; + + // For each outer index, copy slices from each input along the cat dim. + for (Tensor::Size o = 0; o < outer; ++o) { + Tensor::Size offset_in_dim = 0; + + for (size_t t = 0; t < input_count_; ++t) { + auto in_dim = inputs[t]->shape()[dim]; + auto in_ptr = static_cast(inputs[t]->data()); + + auto src_offset = (o * in_dim) * inner * elem_size; + auto dst_offset = + (o * out_dim_size + offset_in_dim) * inner * elem_size; + auto copy_size = in_dim * inner * elem_size; + + std::memcpy(out_ptr + dst_offset, in_ptr + src_offset, copy_size); + offset_in_dim += in_dim; + } + } + } +}; + +} // namespace infini::ops + +#endif diff --git a/src/cpu/causal_softmax/causal_softmax.h b/src/native/cpu/ops/causal_softmax/causal_softmax.h similarity index 98% rename from src/cpu/causal_softmax/causal_softmax.h rename to src/native/cpu/ops/causal_softmax/causal_softmax.h index 14848ee40..8b00c6535 100644 --- a/src/cpu/causal_softmax/causal_softmax.h +++ b/src/native/cpu/ops/causal_softmax/causal_softmax.h @@ -5,8 +5,8 @@ #include "base/causal_softmax.h" #include "common/generic_utils.h" -#include "cpu/caster_.h" #include "data_type.h" +#include "native/cpu/caster_.h" #include "tensor.h" namespace infini::ops { diff --git a/src/cpu/gemm/gemm.h b/src/native/cpu/ops/gemm/gemm.h similarity index 99% rename from src/cpu/gemm/gemm.h rename to src/native/cpu/ops/gemm/gemm.h index a4dfb9891..0eaa0a0bd 100644 --- a/src/cpu/gemm/gemm.h +++ b/src/native/cpu/ops/gemm/gemm.h @@ -5,7 +5,7 @@ #include "base/gemm.h" #include "common/generic_utils.h" -#include "cpu/caster_.h" +#include "native/cpu/caster_.h" namespace infini::ops { diff --git a/src/native/cpu/ops/linear/linear.h b/src/native/cpu/ops/linear/linear.h new file mode 100644 index 000000000..aa152658a --- /dev/null +++ b/src/native/cpu/ops/linear/linear.h @@ -0,0 +1,107 @@ +#ifndef INFINI_OPS_CPU_LINEAR_LINEAR_H_ +#define INFINI_OPS_CPU_LINEAR_LINEAR_H_ + +#include + +#include "base/linear.h" +#include "common/generic_utils.h" +#include "native/cpu/caster_.h" + +namespace infini::ops { + +template <> +class Operator : public Linear, + Caster { + public: + Operator(const Tensor a, const Tensor b, std::optional bias, + bool trans_a, bool trans_b, Tensor out) + : Linear{a, b, bias, trans_a, trans_b, out} {} + + void operator()(const Tensor a, const Tensor b, std::optional bias, + bool trans_a, bool trans_b, Tensor out) const override { + DispatchFunc( + out.dtype(), + [&](auto tag) { + using T = typename decltype(tag)::type; + Compute(a, b, bias, trans_a, trans_b, out); + }, + "`Operator::operator()`"); + } + + private: + template + void Compute(const Tensor a, const Tensor b, std::optional bias, + bool trans_a, bool trans_b, Tensor out) const { + const auto* a_ptr = static_cast(a.data()); + const auto* b_ptr = static_cast(b.data()); + auto* out_ptr = static_cast(out.data()); + const T* bias_ptr = bias ? static_cast(bias->data()) : nullptr; + + auto ndim_a = a_shape_.size(); + auto ndim_b = b_shape_.size(); + auto ndim_out = out_shape_.size(); + + Tensor::Size m = out_shape_[ndim_out - 2]; + Tensor::Size n = out_shape_[ndim_out - 1]; + Tensor::Size k = trans_a ? a_shape_[ndim_a - 2] : a_shape_[ndim_a - 1]; + + // Compute strides for the inner matrix dimensions after transpose. + Tensor::Stride stride_a_m = + trans_a ? a_strides_[ndim_a - 1] : a_strides_[ndim_a - 2]; + Tensor::Stride stride_a_k = + trans_a ? a_strides_[ndim_a - 2] : a_strides_[ndim_a - 1]; + Tensor::Stride stride_b_k = + trans_b ? b_strides_[ndim_b - 1] : b_strides_[ndim_b - 2]; + Tensor::Stride stride_b_n = + trans_b ? b_strides_[ndim_b - 2] : b_strides_[ndim_b - 1]; + Tensor::Stride stride_out_m = out_strides_[ndim_out - 2]; + Tensor::Stride stride_out_n = out_strides_[ndim_out - 1]; + + // Batch dimensions. + Tensor::Size batch_count = 1; + for (size_t i = 0; i + 2 < ndim_out; ++i) { + batch_count *= out_shape_[i]; + } + + Tensor::Stride batch_stride_a = ndim_a > 2 ? a_strides_[ndim_a - 3] : 0; + Tensor::Stride batch_stride_b = ndim_b > 2 ? b_strides_[ndim_b - 3] : 0; + Tensor::Stride batch_stride_out = + ndim_out > 2 ? out_strides_[ndim_out - 3] : 0; + + // Bias stride: for 1D bias `[n]`, stride is 1. For batched bias, use last + // stride. + Tensor::Stride bias_stride = 0; + if (bias_ptr) { + auto ndim_bias = bias->shape().size(); + bias_stride = bias->strides()[ndim_bias - 1]; + } + + for (Tensor::Size batch = 0; batch < batch_count; ++batch) { + const auto* a_batch = a_ptr + batch * batch_stride_a; + const auto* b_batch = b_ptr + batch * batch_stride_b; + auto* out_batch = out_ptr + batch * batch_stride_out; + + for (Tensor::Size i = 0; i < m; ++i) { + for (Tensor::Size j = 0; j < n; ++j) { + float sum = 0.0f; + + for (Tensor::Size l = 0; l < k; ++l) { + float a_val = Cast(a_batch[i * stride_a_m + l * stride_a_k]); + float b_val = Cast(b_batch[l * stride_b_k + j * stride_b_n]); + sum += a_val * b_val; + } + + if (bias_ptr) { + sum += Cast(bias_ptr[j * bias_stride]); + } + + out_batch[i * stride_out_m + j * stride_out_n] = Cast(sum); + } + } + } + } +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/cpu/ops/mul/mul.h b/src/native/cpu/ops/mul/mul.h new file mode 100644 index 000000000..c3039b005 --- /dev/null +++ b/src/native/cpu/ops/mul/mul.h @@ -0,0 +1,63 @@ +#ifndef INFINI_OPS_CPU_MUL_MUL_H_ +#define INFINI_OPS_CPU_MUL_MUL_H_ + +#include + +#include "base/mul.h" +#include "common/generic_utils.h" +#include "native/cpu/caster_.h" + +namespace infini::ops { + +template <> +class Operator : public Mul, + Caster { + public: + Operator(const Tensor input, const Tensor other, Tensor out) + : Mul{input, other, out} {} + + void operator()(const Tensor input, const Tensor other, + Tensor out) const override { + DispatchFunc( + out_type_, + [&](auto tag) { + using T = typename decltype(tag)::type; + Compute(input, other, out); + }, + "`Operator::operator()`"); + } + + private: + template + void Compute(const Tensor input, const Tensor other, Tensor out) const { + using ComputeType = std::conditional_t || + IsFP16, + float, T>; + + const auto* input_ptr = static_cast(input.data()); + const auto* other_ptr = static_cast(other.data()); + auto* out_ptr = static_cast(out.data()); + + auto get_idx = [&](Tensor::Size i, bool is_contig, const auto* shape, + const auto* strides) { + return is_contig ? i : utils::IndexToOffset(i, ndim_, shape, strides); + }; + +#pragma omp parallel for + for (Tensor::Size i = 0; i < output_size_; ++i) { + auto input_idx = get_idx(i, is_input_contiguous_, input_shape_.data(), + input_strides_.data()); + auto other_idx = get_idx(i, is_other_contiguous_, other_shape_.data(), + other_strides_.data()); + auto out_idx = get_idx(i, is_out_contiguous_, out_shape_.data(), + out_strides_.data()); + + out_ptr[out_idx] = Cast(Cast(input_ptr[input_idx]) * + Cast(other_ptr[other_idx])); + } + } +}; + +} // namespace infini::ops + +#endif diff --git a/src/cpu/rms_norm/rms_norm.h b/src/native/cpu/ops/rms_norm/rms_norm.h similarity index 98% rename from src/cpu/rms_norm/rms_norm.h rename to src/native/cpu/ops/rms_norm/rms_norm.h index 9cae419e2..c3f091cb4 100644 --- a/src/cpu/rms_norm/rms_norm.h +++ b/src/native/cpu/ops/rms_norm/rms_norm.h @@ -5,8 +5,8 @@ #include "base/rms_norm.h" #include "common/generic_utils.h" -#include "cpu/caster_.h" #include "data_type.h" +#include "native/cpu/caster_.h" #include "tensor.h" namespace infini::ops { diff --git a/src/cpu/swiglu/swiglu.h b/src/native/cpu/ops/swiglu/swiglu.h similarity index 98% rename from src/cpu/swiglu/swiglu.h rename to src/native/cpu/ops/swiglu/swiglu.h index 57dccf188..37ae46767 100644 --- a/src/cpu/swiglu/swiglu.h +++ b/src/native/cpu/ops/swiglu/swiglu.h @@ -5,7 +5,7 @@ #include "base/swiglu.h" #include "common/generic_utils.h" -#include "cpu/caster_.h" +#include "native/cpu/caster_.h" namespace infini::ops { diff --git a/src/cpu/runtime_.h b/src/native/cpu/runtime_.h similarity index 100% rename from src/cpu/runtime_.h rename to src/native/cpu/runtime_.h diff --git a/src/cuda/blas.h b/src/native/cuda/blas.h similarity index 100% rename from src/cuda/blas.h rename to src/native/cuda/blas.h diff --git a/src/cuda/blas_utils.h b/src/native/cuda/blas_utils.h similarity index 100% rename from src/cuda/blas_utils.h rename to src/native/cuda/blas_utils.h diff --git a/src/cuda/caster.cuh b/src/native/cuda/caster.cuh similarity index 100% rename from src/cuda/caster.cuh rename to src/native/cuda/caster.cuh diff --git a/src/cuda/iluvatar/blas.h b/src/native/cuda/iluvatar/blas.h similarity index 91% rename from src/cuda/iluvatar/blas.h rename to src/native/cuda/iluvatar/blas.h index 791f2dde5..7e7545a40 100644 --- a/src/cuda/iluvatar/blas.h +++ b/src/native/cuda/iluvatar/blas.h @@ -7,10 +7,10 @@ #include "cublas_v2.h" // clang-format on -#include "cuda/blas.h" -#include "cuda/iluvatar/blas_utils.h" -#include "cuda/iluvatar/runtime_.h" #include "data_type.h" +#include "native/cuda/blas.h" +#include "native/cuda/iluvatar/blas_utils.h" +#include "native/cuda/iluvatar/runtime_.h" namespace infini::ops { diff --git a/src/cuda/iluvatar/blas_utils.h b/src/native/cuda/iluvatar/blas_utils.h similarity index 94% rename from src/cuda/iluvatar/blas_utils.h rename to src/native/cuda/iluvatar/blas_utils.h index 6243d8d89..3654e3541 100644 --- a/src/cuda/iluvatar/blas_utils.h +++ b/src/native/cuda/iluvatar/blas_utils.h @@ -5,8 +5,8 @@ #include "cublas_v2.h" // clang-format on -#include "cuda/blas_utils.h" #include "data_type.h" +#include "native/cuda/blas_utils.h" namespace infini::ops { diff --git a/src/cuda/iluvatar/caster.cuh b/src/native/cuda/iluvatar/caster.cuh similarity index 93% rename from src/cuda/iluvatar/caster.cuh rename to src/native/cuda/iluvatar/caster.cuh index c2ff06774..a75369570 100644 --- a/src/cuda/iluvatar/caster.cuh +++ b/src/native/cuda/iluvatar/caster.cuh @@ -1,8 +1,8 @@ #ifndef INFINI_OPS_ILUVATAR_CASTER__H_ #define INFINI_OPS_ILUVATAR_CASTER__H_ -#include "cuda/caster.cuh" -#include "cuda/iluvatar/data_type_.h" +#include "native/cuda/caster.cuh" +#include "native/cuda/iluvatar/data_type_.h" namespace infini::ops { diff --git a/src/cuda/iluvatar/data_type_.h b/src/native/cuda/iluvatar/data_type_.h similarity index 92% rename from src/cuda/iluvatar/data_type_.h rename to src/native/cuda/iluvatar/data_type_.h index 533cc3364..ac997f992 100644 --- a/src/cuda/iluvatar/data_type_.h +++ b/src/native/cuda/iluvatar/data_type_.h @@ -6,8 +6,8 @@ #include // clang-format on -#include "cuda/iluvatar/device_.h" #include "data_type.h" +#include "native/cuda/iluvatar/device_.h" namespace infini::ops { diff --git a/src/cuda/iluvatar/device_.h b/src/native/cuda/iluvatar/device_.h similarity index 100% rename from src/cuda/iluvatar/device_.h rename to src/native/cuda/iluvatar/device_.h diff --git a/src/cuda/iluvatar/device_property.h b/src/native/cuda/iluvatar/device_property.h similarity index 100% rename from src/cuda/iluvatar/device_property.h rename to src/native/cuda/iluvatar/device_property.h diff --git a/src/cuda/iluvatar/add/kernel.h b/src/native/cuda/iluvatar/ops/add/kernel.h similarity index 73% rename from src/cuda/iluvatar/add/kernel.h rename to src/native/cuda/iluvatar/ops/add/kernel.h index 365ec9e2e..f41aa6be4 100644 --- a/src/cuda/iluvatar/add/kernel.h +++ b/src/native/cuda/iluvatar/ops/add/kernel.h @@ -3,9 +3,9 @@ #include -#include "cuda/add/kernel.h" -#include "cuda/iluvatar/caster.cuh" -#include "cuda/iluvatar/runtime_.h" +#include "native/cuda/iluvatar/caster.cuh" +#include "native/cuda/iluvatar/runtime_.h" +#include "native/cuda/ops/add/kernel.h" namespace infini::ops { diff --git a/src/cuda/iluvatar/causal_softmax/kernel.h b/src/native/cuda/iluvatar/ops/causal_softmax/kernel.h similarity index 75% rename from src/cuda/iluvatar/causal_softmax/kernel.h rename to src/native/cuda/iluvatar/ops/causal_softmax/kernel.h index a66fa2644..8c35c87ce 100644 --- a/src/cuda/iluvatar/causal_softmax/kernel.h +++ b/src/native/cuda/iluvatar/ops/causal_softmax/kernel.h @@ -3,9 +3,9 @@ #include -#include "cuda/causal_softmax/kernel.h" -#include "cuda/iluvatar/caster.cuh" -#include "cuda/iluvatar/runtime_.h" +#include "native/cuda/iluvatar/caster.cuh" +#include "native/cuda/iluvatar/runtime_.h" +#include "native/cuda/ops/causal_softmax/kernel.h" namespace infini::ops { diff --git a/src/cuda/iluvatar/gemm/cublas.h b/src/native/cuda/iluvatar/ops/gemm/cublas.h similarity index 81% rename from src/cuda/iluvatar/gemm/cublas.h rename to src/native/cuda/iluvatar/ops/gemm/cublas.h index 4575e5b15..d086cf4f4 100644 --- a/src/cuda/iluvatar/gemm/cublas.h +++ b/src/native/cuda/iluvatar/ops/gemm/cublas.h @@ -1,8 +1,8 @@ #ifndef INFINI_OPS_ILUVATAR_GEMM_CUBLAS_H_ #define INFINI_OPS_ILUVATAR_GEMM_CUBLAS_H_ -#include "cuda/gemm/blas.h" -#include "cuda/iluvatar/blas.h" +#include "native/cuda/iluvatar/blas.h" +#include "native/cuda/ops/gemm/blas.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/linear/kernel.h b/src/native/cuda/iluvatar/ops/linear/kernel.h new file mode 100644 index 000000000..c3976144a --- /dev/null +++ b/src/native/cuda/iluvatar/ops/linear/kernel.h @@ -0,0 +1,19 @@ +#ifndef INFINI_OPS_ILUVATAR_LINEAR_KERNEL_H_ +#define INFINI_OPS_ILUVATAR_LINEAR_KERNEL_H_ + +#include "native/cuda/iluvatar/ops/add/kernel.h" +#include "native/cuda/iluvatar/ops/gemm/cublas.h" +#include "native/cuda/ops/linear/kernel.h" + +namespace infini::ops { + +template <> +class Operator + : public CudaLinear { + public: + using CudaLinear::CudaLinear; +}; + +} // namespace infini::ops + +#endif diff --git a/src/cuda/iluvatar/rms_norm/kernel.h b/src/native/cuda/iluvatar/ops/rms_norm/kernel.h similarity index 74% rename from src/cuda/iluvatar/rms_norm/kernel.h rename to src/native/cuda/iluvatar/ops/rms_norm/kernel.h index ad9a0a1f4..3230fe8b0 100644 --- a/src/cuda/iluvatar/rms_norm/kernel.h +++ b/src/native/cuda/iluvatar/ops/rms_norm/kernel.h @@ -3,9 +3,9 @@ #include -#include "cuda/iluvatar/caster.cuh" -#include "cuda/iluvatar/runtime_.h" -#include "cuda/rms_norm/kernel.h" +#include "native/cuda/iluvatar/caster.cuh" +#include "native/cuda/iluvatar/runtime_.h" +#include "native/cuda/ops/rms_norm/kernel.h" namespace infini::ops { diff --git a/src/cuda/iluvatar/swiglu/kernel.h b/src/native/cuda/iluvatar/ops/swiglu/kernel.h similarity index 74% rename from src/cuda/iluvatar/swiglu/kernel.h rename to src/native/cuda/iluvatar/ops/swiglu/kernel.h index b28521f67..b03ef19c9 100644 --- a/src/cuda/iluvatar/swiglu/kernel.h +++ b/src/native/cuda/iluvatar/ops/swiglu/kernel.h @@ -3,9 +3,9 @@ #include -#include "cuda/iluvatar/caster.cuh" -#include "cuda/iluvatar/runtime_.h" -#include "cuda/swiglu/kernel.h" +#include "native/cuda/iluvatar/caster.cuh" +#include "native/cuda/iluvatar/runtime_.h" +#include "native/cuda/ops/swiglu/kernel.h" namespace infini::ops { diff --git a/src/cuda/iluvatar/runtime_.h b/src/native/cuda/iluvatar/runtime_.h similarity index 87% rename from src/cuda/iluvatar/runtime_.h rename to src/native/cuda/iluvatar/runtime_.h index acd41f868..1e41b68e9 100644 --- a/src/cuda/iluvatar/runtime_.h +++ b/src/native/cuda/iluvatar/runtime_.h @@ -7,9 +7,9 @@ #include // clang-format on -#include "cuda/iluvatar/device_.h" -#include "cuda/iluvatar/runtime_utils.h" -#include "cuda/runtime.h" +#include "native/cuda/iluvatar/device_.h" +#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/runtime_.h" namespace infini::ops { diff --git a/src/cuda/iluvatar/runtime_utils.h b/src/native/cuda/iluvatar/runtime_utils.h similarity index 74% rename from src/cuda/iluvatar/runtime_utils.h rename to src/native/cuda/iluvatar/runtime_utils.h index be52981cc..df88ed3e5 100644 --- a/src/cuda/iluvatar/runtime_utils.h +++ b/src/native/cuda/iluvatar/runtime_utils.h @@ -1,8 +1,8 @@ #ifndef INFINI_OPS_ILUVATAR_RUNTIME_UTILS_H_ #define INFINI_OPS_ILUVATAR_RUNTIME_UTILS_H_ -#include "cuda/iluvatar/device_property.h" -#include "cuda/runtime_utils.h" +#include "native/cuda/iluvatar/device_property.h" +#include "native/cuda/runtime_utils.h" namespace infini::ops { diff --git a/src/cuda/kernel_commons.cuh b/src/native/cuda/kernel_commons.cuh similarity index 100% rename from src/cuda/kernel_commons.cuh rename to src/native/cuda/kernel_commons.cuh diff --git a/src/cuda/metax/blas.h b/src/native/cuda/metax/blas.h similarity index 91% rename from src/cuda/metax/blas.h rename to src/native/cuda/metax/blas.h index 9c0dd2d05..68e5183e1 100644 --- a/src/cuda/metax/blas.h +++ b/src/native/cuda/metax/blas.h @@ -7,10 +7,10 @@ #include // clang-format on -#include "cuda/blas.h" -#include "cuda/metax/blas_utils.h" -#include "cuda/metax/runtime_.h" #include "data_type.h" +#include "native/cuda/blas.h" +#include "native/cuda/metax/blas_utils.h" +#include "native/cuda/metax/runtime_.h" namespace infini::ops { diff --git a/src/cuda/metax/blas_utils.h b/src/native/cuda/metax/blas_utils.h similarity index 94% rename from src/cuda/metax/blas_utils.h rename to src/native/cuda/metax/blas_utils.h index a84fd3cf8..6095cd70b 100644 --- a/src/cuda/metax/blas_utils.h +++ b/src/native/cuda/metax/blas_utils.h @@ -5,8 +5,8 @@ #include // clang-format on -#include "cuda/blas_utils.h" #include "data_type.h" +#include "native/cuda/blas_utils.h" namespace infini::ops { diff --git a/src/cuda/metax/caster.cuh b/src/native/cuda/metax/caster.cuh similarity index 96% rename from src/cuda/metax/caster.cuh rename to src/native/cuda/metax/caster.cuh index dc9c808df..6ad565870 100644 --- a/src/cuda/metax/caster.cuh +++ b/src/native/cuda/metax/caster.cuh @@ -1,8 +1,8 @@ #ifndef INFINI_OPS_METAX_CASTER__H_ #define INFINI_OPS_METAX_CASTER__H_ -#include "cuda/caster.cuh" -#include "cuda/metax/data_type_.h" +#include "native/cuda/caster.cuh" +#include "native/cuda/metax/data_type_.h" namespace infini::ops { diff --git a/src/cuda/metax/data_type_.h b/src/native/cuda/metax/data_type_.h similarity index 93% rename from src/cuda/metax/data_type_.h rename to src/native/cuda/metax/data_type_.h index f14620267..73e498774 100644 --- a/src/cuda/metax/data_type_.h +++ b/src/native/cuda/metax/data_type_.h @@ -5,8 +5,8 @@ #include #include -#include "cuda/metax/device_.h" #include "data_type.h" +#include "native/cuda/metax/device_.h" namespace infini::ops { diff --git a/src/cuda/metax/device_.h b/src/native/cuda/metax/device_.h similarity index 100% rename from src/cuda/metax/device_.h rename to src/native/cuda/metax/device_.h diff --git a/src/cuda/metax/device_property.h b/src/native/cuda/metax/device_property.h similarity index 100% rename from src/cuda/metax/device_property.h rename to src/native/cuda/metax/device_property.h diff --git a/src/cuda/metax/add/kernel.h b/src/native/cuda/metax/ops/add/kernel.h similarity index 73% rename from src/cuda/metax/add/kernel.h rename to src/native/cuda/metax/ops/add/kernel.h index e0c177426..7059bb7c1 100644 --- a/src/cuda/metax/add/kernel.h +++ b/src/native/cuda/metax/ops/add/kernel.h @@ -3,9 +3,9 @@ #include -#include "cuda/add/kernel.h" -#include "cuda/metax/caster.cuh" -#include "cuda/metax/runtime_.h" +#include "native/cuda/metax/caster.cuh" +#include "native/cuda/metax/runtime_.h" +#include "native/cuda/ops/add/kernel.h" namespace infini::ops { diff --git a/src/cuda/metax/causal_softmax/kernel.h b/src/native/cuda/metax/ops/causal_softmax/kernel.h similarity index 75% rename from src/cuda/metax/causal_softmax/kernel.h rename to src/native/cuda/metax/ops/causal_softmax/kernel.h index 55a54f632..426465984 100644 --- a/src/cuda/metax/causal_softmax/kernel.h +++ b/src/native/cuda/metax/ops/causal_softmax/kernel.h @@ -3,9 +3,9 @@ #include -#include "cuda/causal_softmax/kernel.h" -#include "cuda/metax/caster.cuh" -#include "cuda/metax/runtime_.h" +#include "native/cuda/metax/caster.cuh" +#include "native/cuda/metax/runtime_.h" +#include "native/cuda/ops/causal_softmax/kernel.h" namespace infini::ops { diff --git a/src/cuda/metax/gemm/mcblas.h b/src/native/cuda/metax/ops/gemm/mcblas.h similarity index 80% rename from src/cuda/metax/gemm/mcblas.h rename to src/native/cuda/metax/ops/gemm/mcblas.h index df1eac51a..c1ae08d08 100644 --- a/src/cuda/metax/gemm/mcblas.h +++ b/src/native/cuda/metax/ops/gemm/mcblas.h @@ -1,8 +1,8 @@ #ifndef INFINI_OPS_METAX_GEMM_MCBLAS_H_ #define INFINI_OPS_METAX_GEMM_MCBLAS_H_ -#include "cuda/gemm/blas.h" -#include "cuda/metax/blas.h" +#include "native/cuda/metax/blas.h" +#include "native/cuda/ops/gemm/blas.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/linear/kernel.h b/src/native/cuda/metax/ops/linear/kernel.h new file mode 100644 index 000000000..0f0a414fa --- /dev/null +++ b/src/native/cuda/metax/ops/linear/kernel.h @@ -0,0 +1,19 @@ +#ifndef INFINI_OPS_METAX_LINEAR_KERNEL_H_ +#define INFINI_OPS_METAX_LINEAR_KERNEL_H_ + +#include "native/cuda/metax/ops/add/kernel.h" +#include "native/cuda/metax/ops/gemm/mcblas.h" +#include "native/cuda/ops/linear/kernel.h" + +namespace infini::ops { + +template <> +class Operator + : public CudaLinear { + public: + using CudaLinear::CudaLinear; +}; + +} // namespace infini::ops + +#endif diff --git a/src/cuda/metax/rms_norm/kernel.h b/src/native/cuda/metax/ops/rms_norm/kernel.h similarity index 74% rename from src/cuda/metax/rms_norm/kernel.h rename to src/native/cuda/metax/ops/rms_norm/kernel.h index 38b29f4a9..38d84c830 100644 --- a/src/cuda/metax/rms_norm/kernel.h +++ b/src/native/cuda/metax/ops/rms_norm/kernel.h @@ -3,9 +3,9 @@ #include -#include "cuda/metax/caster.cuh" -#include "cuda/metax/runtime_.h" -#include "cuda/rms_norm/kernel.h" +#include "native/cuda/metax/caster.cuh" +#include "native/cuda/metax/runtime_.h" +#include "native/cuda/ops/rms_norm/kernel.h" namespace infini::ops { diff --git a/src/cuda/metax/swiglu/kernel.h b/src/native/cuda/metax/ops/swiglu/kernel.h similarity index 74% rename from src/cuda/metax/swiglu/kernel.h rename to src/native/cuda/metax/ops/swiglu/kernel.h index a4f9359d7..0bcfd3198 100644 --- a/src/cuda/metax/swiglu/kernel.h +++ b/src/native/cuda/metax/ops/swiglu/kernel.h @@ -3,9 +3,9 @@ #include -#include "cuda/metax/caster.cuh" -#include "cuda/metax/runtime_.h" -#include "cuda/swiglu/kernel.h" +#include "native/cuda/metax/caster.cuh" +#include "native/cuda/metax/runtime_.h" +#include "native/cuda/ops/swiglu/kernel.h" namespace infini::ops { diff --git a/src/cuda/metax/runtime_.h b/src/native/cuda/metax/runtime_.h similarity index 86% rename from src/cuda/metax/runtime_.h rename to src/native/cuda/metax/runtime_.h index 576e0fb38..6563b6b41 100644 --- a/src/cuda/metax/runtime_.h +++ b/src/native/cuda/metax/runtime_.h @@ -3,9 +3,9 @@ #include -#include "cuda/metax/device_.h" -#include "cuda/metax/runtime_utils.h" -#include "cuda/runtime.h" +#include "native/cuda/metax/device_.h" +#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/runtime_.h" namespace infini::ops { diff --git a/src/cuda/metax/runtime_utils.h b/src/native/cuda/metax/runtime_utils.h similarity index 74% rename from src/cuda/metax/runtime_utils.h rename to src/native/cuda/metax/runtime_utils.h index 3bd9eafef..94100549b 100644 --- a/src/cuda/metax/runtime_utils.h +++ b/src/native/cuda/metax/runtime_utils.h @@ -1,8 +1,8 @@ #ifndef INFINI_OPS_METAX_RUNTIME_UTILS_H_ #define INFINI_OPS_METAX_RUNTIME_UTILS_H_ -#include "cuda/metax/device_property.h" -#include "cuda/runtime_utils.h" +#include "native/cuda/metax/device_property.h" +#include "native/cuda/runtime_utils.h" namespace infini::ops { diff --git a/src/cuda/moore/blas.h b/src/native/cuda/moore/blas.h similarity index 89% rename from src/cuda/moore/blas.h rename to src/native/cuda/moore/blas.h index 91a86e507..b2531a1ca 100644 --- a/src/cuda/moore/blas.h +++ b/src/native/cuda/moore/blas.h @@ -5,10 +5,10 @@ #include -#include "cuda/blas.h" -#include "cuda/moore/blas_utils.h" -#include "cuda/moore/runtime_.h" #include "data_type.h" +#include "native/cuda/blas.h" +#include "native/cuda/moore/blas_utils.h" +#include "native/cuda/moore/runtime_.h" namespace infini::ops { diff --git a/src/cuda/moore/blas_utils.h b/src/native/cuda/moore/blas_utils.h similarity index 94% rename from src/cuda/moore/blas_utils.h rename to src/native/cuda/moore/blas_utils.h index b3a08d956..f897936ed 100644 --- a/src/cuda/moore/blas_utils.h +++ b/src/native/cuda/moore/blas_utils.h @@ -3,8 +3,8 @@ #include -#include "cuda/blas_utils.h" #include "data_type.h" +#include "native/cuda/blas_utils.h" namespace infini::ops { diff --git a/src/cuda/moore/caster.cuh b/src/native/cuda/moore/caster.cuh similarity index 95% rename from src/cuda/moore/caster.cuh rename to src/native/cuda/moore/caster.cuh index 9b45cd046..48154980d 100644 --- a/src/cuda/moore/caster.cuh +++ b/src/native/cuda/moore/caster.cuh @@ -1,8 +1,8 @@ #ifndef INFINI_OPS_MOORE_CASTER__H_ #define INFINI_OPS_MOORE_CASTER__H_ -#include "cuda/caster.cuh" -#include "cuda/moore/data_type_.h" +#include "native/cuda/caster.cuh" +#include "native/cuda/moore/data_type_.h" namespace infini::ops { diff --git a/src/cuda/moore/data_type_.h b/src/native/cuda/moore/data_type_.h similarity index 92% rename from src/cuda/moore/data_type_.h rename to src/native/cuda/moore/data_type_.h index bec9e473a..c97cb33dd 100644 --- a/src/cuda/moore/data_type_.h +++ b/src/native/cuda/moore/data_type_.h @@ -4,8 +4,8 @@ #include #include -#include "cuda/moore/device_.h" #include "data_type.h" +#include "native/cuda/moore/device_.h" namespace infini::ops { diff --git a/src/cuda/moore/device_.h b/src/native/cuda/moore/device_.h similarity index 100% rename from src/cuda/moore/device_.h rename to src/native/cuda/moore/device_.h diff --git a/src/cuda/moore/device_property.h b/src/native/cuda/moore/device_property.h similarity index 100% rename from src/cuda/moore/device_property.h rename to src/native/cuda/moore/device_property.h diff --git a/src/cuda/moore/add/kernel.h b/src/native/cuda/moore/ops/add/kernel.h similarity index 64% rename from src/cuda/moore/add/kernel.h rename to src/native/cuda/moore/ops/add/kernel.h index 2a1590dc3..abcea37c5 100644 --- a/src/cuda/moore/add/kernel.h +++ b/src/native/cuda/moore/ops/add/kernel.h @@ -4,13 +4,13 @@ #include // clang-format off -#include "cuda/moore/polyfills.cuh" +#include "native/cuda/moore/polyfills.cuh" // clang-format on -#include "cuda/add/kernel.h" -#include "cuda/moore/caster.cuh" -#include "cuda/moore/polyfills.cuh" -#include "cuda/moore/runtime_.h" +#include "native/cuda/moore/caster.cuh" +#include "native/cuda/moore/polyfills.cuh" +#include "native/cuda/moore/runtime_.h" +#include "native/cuda/ops/add/kernel.h" namespace infini::ops { diff --git a/src/cuda/moore/causal_softmax/kernel.h b/src/native/cuda/moore/ops/causal_softmax/kernel.h similarity index 79% rename from src/cuda/moore/causal_softmax/kernel.h rename to src/native/cuda/moore/ops/causal_softmax/kernel.h index 0f5de1b95..e13118763 100644 --- a/src/cuda/moore/causal_softmax/kernel.h +++ b/src/native/cuda/moore/ops/causal_softmax/kernel.h @@ -6,12 +6,12 @@ // clang-format on // clang-format off -#include "cuda/moore/device_.h" +#include "native/cuda/moore/device_.h" // clang-format on -#include "cuda/causal_softmax/kernel.h" -#include "cuda/moore/caster.cuh" -#include "cuda/moore/runtime_.h" +#include "native/cuda/moore/caster.cuh" +#include "native/cuda/moore/runtime_.h" +#include "native/cuda/ops/causal_softmax/kernel.h" namespace infini::ops { diff --git a/src/cuda/moore/gemm/mublas.h b/src/native/cuda/moore/ops/gemm/mublas.h similarity index 92% rename from src/cuda/moore/gemm/mublas.h rename to src/native/cuda/moore/ops/gemm/mublas.h index 2aadb68e8..a34cba523 100644 --- a/src/cuda/moore/gemm/mublas.h +++ b/src/native/cuda/moore/ops/gemm/mublas.h @@ -1,8 +1,8 @@ #ifndef INFINI_OPS_MOORE_GEMM_MUBLAS_H_ #define INFINI_OPS_MOORE_GEMM_MUBLAS_H_ -#include "cuda/gemm/blas.h" -#include "cuda/moore/blas.h" +#include "native/cuda/moore/blas.h" +#include "native/cuda/ops/gemm/blas.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/linear/kernel.h b/src/native/cuda/moore/ops/linear/kernel.h new file mode 100644 index 000000000..76a217bdf --- /dev/null +++ b/src/native/cuda/moore/ops/linear/kernel.h @@ -0,0 +1,19 @@ +#ifndef INFINI_OPS_MOORE_LINEAR_KERNEL_H_ +#define INFINI_OPS_MOORE_LINEAR_KERNEL_H_ + +#include "native/cuda/moore/ops/add/kernel.h" +#include "native/cuda/moore/ops/gemm/mublas.h" +#include "native/cuda/ops/linear/kernel.h" + +namespace infini::ops { + +template <> +class Operator + : public CudaLinear { + public: + using CudaLinear::CudaLinear; +}; + +} // namespace infini::ops + +#endif diff --git a/src/cuda/moore/rms_norm/kernel.h b/src/native/cuda/moore/ops/rms_norm/kernel.h similarity index 77% rename from src/cuda/moore/rms_norm/kernel.h rename to src/native/cuda/moore/ops/rms_norm/kernel.h index 5cdb1e58e..0f160017d 100644 --- a/src/cuda/moore/rms_norm/kernel.h +++ b/src/native/cuda/moore/ops/rms_norm/kernel.h @@ -7,9 +7,9 @@ #include // clang-format on -#include "cuda/moore/caster.cuh" -#include "cuda/moore/runtime_.h" -#include "cuda/rms_norm/kernel.h" +#include "native/cuda/moore/caster.cuh" +#include "native/cuda/moore/runtime_.h" +#include "native/cuda/ops/rms_norm/kernel.h" namespace infini::ops { diff --git a/src/cuda/moore/swiglu/kernel.h b/src/native/cuda/moore/ops/swiglu/kernel.h similarity index 65% rename from src/cuda/moore/swiglu/kernel.h rename to src/native/cuda/moore/ops/swiglu/kernel.h index daa72074f..ac54bff50 100644 --- a/src/cuda/moore/swiglu/kernel.h +++ b/src/native/cuda/moore/ops/swiglu/kernel.h @@ -4,13 +4,13 @@ #include // clang-format off -#include "cuda/moore/polyfills.cuh" +#include "native/cuda/moore/polyfills.cuh" // clang-format on -#include "cuda/moore/caster.cuh" -#include "cuda/moore/polyfills.cuh" -#include "cuda/moore/runtime_.h" -#include "cuda/swiglu/kernel.h" +#include "native/cuda/moore/caster.cuh" +#include "native/cuda/moore/polyfills.cuh" +#include "native/cuda/moore/runtime_.h" +#include "native/cuda/ops/swiglu/kernel.h" namespace infini::ops { diff --git a/src/cuda/moore/polyfills.cuh b/src/native/cuda/moore/polyfills.cuh similarity index 100% rename from src/cuda/moore/polyfills.cuh rename to src/native/cuda/moore/polyfills.cuh diff --git a/src/cuda/moore/runtime_.h b/src/native/cuda/moore/runtime_.h similarity index 89% rename from src/cuda/moore/runtime_.h rename to src/native/cuda/moore/runtime_.h index dd00c3d12..bc519b41e 100644 --- a/src/cuda/moore/runtime_.h +++ b/src/native/cuda/moore/runtime_.h @@ -5,9 +5,9 @@ #include -#include "cuda/moore/device_.h" -#include "cuda/moore/runtime_utils.h" -#include "cuda/runtime.h" +#include "native/cuda/moore/device_.h" +#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/runtime_.h" namespace infini::ops { diff --git a/src/cuda/moore/runtime_utils.h b/src/native/cuda/moore/runtime_utils.h similarity index 74% rename from src/cuda/moore/runtime_utils.h rename to src/native/cuda/moore/runtime_utils.h index 92d2dd926..4c5ae3f6c 100644 --- a/src/cuda/moore/runtime_utils.h +++ b/src/native/cuda/moore/runtime_utils.h @@ -1,8 +1,8 @@ #ifndef INFINI_OPS_MOORE_RUNTIME_UTILS_H_ #define INFINI_OPS_MOORE_RUNTIME_UTILS_H_ -#include "cuda/moore/device_property.h" -#include "cuda/runtime_utils.h" +#include "native/cuda/moore/device_property.h" +#include "native/cuda/runtime_utils.h" namespace infini::ops { diff --git a/src/cuda/nvidia/blas.h b/src/native/cuda/nvidia/blas.h similarity index 90% rename from src/cuda/nvidia/blas.h rename to src/native/cuda/nvidia/blas.h index 7319e86fd..7cdfd2c62 100644 --- a/src/cuda/nvidia/blas.h +++ b/src/native/cuda/nvidia/blas.h @@ -7,10 +7,10 @@ #include "cublas_v2.h" // clang-format on -#include "cuda/blas.h" -#include "cuda/nvidia/blas_utils.h" -#include "cuda/nvidia/runtime_.h" #include "data_type.h" +#include "native/cuda/blas.h" +#include "native/cuda/nvidia/blas_utils.h" +#include "native/cuda/nvidia/runtime_.h" namespace infini::ops { diff --git a/src/cuda/nvidia/blas_utils.h b/src/native/cuda/nvidia/blas_utils.h similarity index 94% rename from src/cuda/nvidia/blas_utils.h rename to src/native/cuda/nvidia/blas_utils.h index c9c81e974..45644ae38 100644 --- a/src/cuda/nvidia/blas_utils.h +++ b/src/native/cuda/nvidia/blas_utils.h @@ -5,8 +5,8 @@ #include "cublas_v2.h" // clang-format on -#include "cuda/blas_utils.h" #include "data_type.h" +#include "native/cuda/blas_utils.h" namespace infini::ops { diff --git a/src/cuda/nvidia/caster.cuh b/src/native/cuda/nvidia/caster.cuh similarity index 96% rename from src/cuda/nvidia/caster.cuh rename to src/native/cuda/nvidia/caster.cuh index a19740813..b8a366e26 100644 --- a/src/cuda/nvidia/caster.cuh +++ b/src/native/cuda/nvidia/caster.cuh @@ -1,8 +1,8 @@ #ifndef INFINI_OPS_NVIDIA_CASTER__H_ #define INFINI_OPS_NVIDIA_CASTER__H_ -#include "cuda/caster.cuh" -#include "cuda/nvidia/data_type_.h" +#include "native/cuda/caster.cuh" +#include "native/cuda/nvidia/data_type_.h" namespace infini::ops { diff --git a/src/cuda/nvidia/data_type_.h b/src/native/cuda/nvidia/data_type_.h similarity index 92% rename from src/cuda/nvidia/data_type_.h rename to src/native/cuda/nvidia/data_type_.h index 80424ba01..1266c0c8a 100644 --- a/src/cuda/nvidia/data_type_.h +++ b/src/native/cuda/nvidia/data_type_.h @@ -6,8 +6,8 @@ #include // clang-format on -#include "cuda/nvidia/device_.h" #include "data_type.h" +#include "native/cuda/nvidia/device_.h" namespace infini::ops { diff --git a/src/cuda/nvidia/device_.h b/src/native/cuda/nvidia/device_.h similarity index 100% rename from src/cuda/nvidia/device_.h rename to src/native/cuda/nvidia/device_.h diff --git a/src/cuda/nvidia/device_property.h b/src/native/cuda/nvidia/device_property.h similarity index 100% rename from src/cuda/nvidia/device_property.h rename to src/native/cuda/nvidia/device_property.h diff --git a/src/cuda/nvidia/add/kernel.h b/src/native/cuda/nvidia/ops/add/kernel.h similarity index 73% rename from src/cuda/nvidia/add/kernel.h rename to src/native/cuda/nvidia/ops/add/kernel.h index 56fc608cb..373f5a775 100644 --- a/src/cuda/nvidia/add/kernel.h +++ b/src/native/cuda/nvidia/ops/add/kernel.h @@ -3,9 +3,9 @@ #include -#include "cuda/add/kernel.h" -#include "cuda/nvidia/caster.cuh" -#include "cuda/nvidia/runtime_.h" +#include "native/cuda/nvidia/caster.cuh" +#include "native/cuda/nvidia/runtime_.h" +#include "native/cuda/ops/add/kernel.h" namespace infini::ops { diff --git a/src/cuda/nvidia/causal_softmax/kernel.h b/src/native/cuda/nvidia/ops/causal_softmax/kernel.h similarity index 75% rename from src/cuda/nvidia/causal_softmax/kernel.h rename to src/native/cuda/nvidia/ops/causal_softmax/kernel.h index 88c71fb49..5a7e18a94 100644 --- a/src/cuda/nvidia/causal_softmax/kernel.h +++ b/src/native/cuda/nvidia/ops/causal_softmax/kernel.h @@ -3,9 +3,9 @@ #include -#include "cuda/causal_softmax/kernel.h" -#include "cuda/nvidia/caster.cuh" -#include "cuda/nvidia/runtime_.h" +#include "native/cuda/nvidia/caster.cuh" +#include "native/cuda/nvidia/runtime_.h" +#include "native/cuda/ops/causal_softmax/kernel.h" namespace infini::ops { diff --git a/src/cuda/nvidia/gemm/cublas.h b/src/native/cuda/nvidia/ops/gemm/cublas.h similarity index 81% rename from src/cuda/nvidia/gemm/cublas.h rename to src/native/cuda/nvidia/ops/gemm/cublas.h index e5b2e37b9..eefe0b7af 100644 --- a/src/cuda/nvidia/gemm/cublas.h +++ b/src/native/cuda/nvidia/ops/gemm/cublas.h @@ -1,8 +1,8 @@ #ifndef INFINI_OPS_NVIDIA_GEMM_CUBLAS_H_ #define INFINI_OPS_NVIDIA_GEMM_CUBLAS_H_ -#include "cuda/gemm/blas.h" -#include "cuda/nvidia/blas.h" +#include "native/cuda/nvidia/blas.h" +#include "native/cuda/ops/gemm/blas.h" namespace infini::ops { diff --git a/src/cuda/nvidia/gemm/cublaslt.h b/src/native/cuda/nvidia/ops/gemm/cublaslt.h similarity index 98% rename from src/cuda/nvidia/gemm/cublaslt.h rename to src/native/cuda/nvidia/ops/gemm/cublaslt.h index c01a7528c..dee5d9a2e 100644 --- a/src/cuda/nvidia/gemm/cublaslt.h +++ b/src/native/cuda/nvidia/ops/gemm/cublaslt.h @@ -9,8 +9,8 @@ // clang-format on #include "base/gemm.h" -#include "cuda/nvidia/blas_utils.h" -#include "cuda/nvidia/runtime_.h" +#include "native/cuda/nvidia/blas_utils.h" +#include "native/cuda/nvidia/runtime_.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/linear/kernel.h b/src/native/cuda/nvidia/ops/linear/kernel.h new file mode 100644 index 000000000..be07883f4 --- /dev/null +++ b/src/native/cuda/nvidia/ops/linear/kernel.h @@ -0,0 +1,19 @@ +#ifndef INFINI_OPS_NVIDIA_LINEAR_KERNEL_H_ +#define INFINI_OPS_NVIDIA_LINEAR_KERNEL_H_ + +#include "native/cuda/nvidia/ops/add/kernel.h" +#include "native/cuda/nvidia/ops/gemm/cublas.h" +#include "native/cuda/ops/linear/kernel.h" + +namespace infini::ops { + +template <> +class Operator + : public CudaLinear { + public: + using CudaLinear::CudaLinear; +}; + +} // namespace infini::ops + +#endif diff --git a/src/cuda/nvidia/rms_norm/kernel.h b/src/native/cuda/nvidia/ops/rms_norm/kernel.h similarity index 74% rename from src/cuda/nvidia/rms_norm/kernel.h rename to src/native/cuda/nvidia/ops/rms_norm/kernel.h index bf25d9ac8..74cfab17a 100644 --- a/src/cuda/nvidia/rms_norm/kernel.h +++ b/src/native/cuda/nvidia/ops/rms_norm/kernel.h @@ -3,9 +3,9 @@ #include -#include "cuda/nvidia/caster.cuh" -#include "cuda/nvidia/runtime_.h" -#include "cuda/rms_norm/kernel.h" +#include "native/cuda/nvidia/caster.cuh" +#include "native/cuda/nvidia/runtime_.h" +#include "native/cuda/ops/rms_norm/kernel.h" namespace infini::ops { diff --git a/src/cuda/nvidia/swiglu/kernel.h b/src/native/cuda/nvidia/ops/swiglu/kernel.h similarity index 74% rename from src/cuda/nvidia/swiglu/kernel.h rename to src/native/cuda/nvidia/ops/swiglu/kernel.h index dba82110c..fd58f29ef 100644 --- a/src/cuda/nvidia/swiglu/kernel.h +++ b/src/native/cuda/nvidia/ops/swiglu/kernel.h @@ -3,9 +3,9 @@ #include -#include "cuda/nvidia/caster.cuh" -#include "cuda/nvidia/runtime_.h" -#include "cuda/swiglu/kernel.h" +#include "native/cuda/nvidia/caster.cuh" +#include "native/cuda/nvidia/runtime_.h" +#include "native/cuda/ops/swiglu/kernel.h" namespace infini::ops { diff --git a/src/cuda/nvidia/runtime_.h b/src/native/cuda/nvidia/runtime_.h similarity index 88% rename from src/cuda/nvidia/runtime_.h rename to src/native/cuda/nvidia/runtime_.h index b03a9c4b6..326ecdb24 100644 --- a/src/cuda/nvidia/runtime_.h +++ b/src/native/cuda/nvidia/runtime_.h @@ -7,9 +7,9 @@ #include // clang-format on -#include "cuda/nvidia/device_.h" -#include "cuda/nvidia/runtime_utils.h" -#include "cuda/runtime.h" +#include "native/cuda/nvidia/device_.h" +#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/runtime_.h" namespace infini::ops { diff --git a/src/cuda/nvidia/runtime_utils.h b/src/native/cuda/nvidia/runtime_utils.h similarity index 74% rename from src/cuda/nvidia/runtime_utils.h rename to src/native/cuda/nvidia/runtime_utils.h index 26bfbffb5..c881dabde 100644 --- a/src/cuda/nvidia/runtime_utils.h +++ b/src/native/cuda/nvidia/runtime_utils.h @@ -1,8 +1,8 @@ #ifndef INFINI_OPS_NVIDIA_RUNTIME_UTILS_H_ #define INFINI_OPS_NVIDIA_RUNTIME_UTILS_H_ -#include "cuda/nvidia/device_property.h" -#include "cuda/runtime_utils.h" +#include "native/cuda/nvidia/device_property.h" +#include "native/cuda/runtime_utils.h" namespace infini::ops { diff --git a/src/cuda/add/kernel.cuh b/src/native/cuda/ops/add/kernel.cuh similarity index 97% rename from src/cuda/add/kernel.cuh rename to src/native/cuda/ops/add/kernel.cuh index f4dcd3de8..ec11fa7ca 100644 --- a/src/cuda/add/kernel.cuh +++ b/src/native/cuda/ops/add/kernel.cuh @@ -1,7 +1,7 @@ #ifndef INFINI_OPS_CUDA_ADD_KERNEL_CUH_ #define INFINI_OPS_CUDA_ADD_KERNEL_CUH_ -#include "cuda/kernel_commons.cuh" +#include "native/cuda/kernel_commons.cuh" namespace infini::ops { diff --git a/src/cuda/add/kernel.h b/src/native/cuda/ops/add/kernel.h similarity index 96% rename from src/cuda/add/kernel.h rename to src/native/cuda/ops/add/kernel.h index 95d82c919..ac3e698d4 100644 --- a/src/cuda/add/kernel.h +++ b/src/native/cuda/ops/add/kernel.h @@ -8,9 +8,9 @@ #include "base/add.h" #include "common/generic_utils.h" -#include "cuda/add/kernel.cuh" -#include "cuda/kernel_commons.cuh" -#include "cuda/runtime_utils.h" +#include "native/cuda/kernel_commons.cuh" +#include "native/cuda/ops/add/kernel.cuh" +#include "native/cuda/runtime_utils.h" namespace infini::ops { diff --git a/src/cuda/causal_softmax/kernel.cuh b/src/native/cuda/ops/causal_softmax/kernel.cuh similarity index 97% rename from src/cuda/causal_softmax/kernel.cuh rename to src/native/cuda/ops/causal_softmax/kernel.cuh index 11b23b523..31f685e84 100644 --- a/src/cuda/causal_softmax/kernel.cuh +++ b/src/native/cuda/ops/causal_softmax/kernel.cuh @@ -5,8 +5,8 @@ #include #include -#include "cuda/caster.cuh" -#include "cuda/kernel_commons.cuh" +#include "native/cuda/caster.cuh" +#include "native/cuda/kernel_commons.cuh" namespace infini::ops { diff --git a/src/cuda/causal_softmax/kernel.h b/src/native/cuda/ops/causal_softmax/kernel.h similarity index 93% rename from src/cuda/causal_softmax/kernel.h rename to src/native/cuda/ops/causal_softmax/kernel.h index c060d834b..c77d97387 100644 --- a/src/cuda/causal_softmax/kernel.h +++ b/src/native/cuda/ops/causal_softmax/kernel.h @@ -6,11 +6,11 @@ #include #include "base/causal_softmax.h" -#include "cuda/causal_softmax/kernel.cuh" -#include "cuda/kernel_commons.cuh" -#include "cuda/runtime_utils.h" #include "data_type.h" #include "dispatcher.h" +#include "native/cuda/kernel_commons.cuh" +#include "native/cuda/ops/causal_softmax/kernel.cuh" +#include "native/cuda/runtime_utils.h" namespace infini::ops { diff --git a/src/cuda/gemm/blas.h b/src/native/cuda/ops/gemm/blas.h similarity index 99% rename from src/cuda/gemm/blas.h rename to src/native/cuda/ops/gemm/blas.h index 264d52c15..6bac0d6a6 100644 --- a/src/cuda/gemm/blas.h +++ b/src/native/cuda/ops/gemm/blas.h @@ -4,7 +4,7 @@ #include #include "base/gemm.h" -#include "cuda/blas_utils.h" +#include "native/cuda/blas_utils.h" namespace infini::ops { diff --git a/src/native/cuda/ops/linear/kernel.h b/src/native/cuda/ops/linear/kernel.h new file mode 100644 index 000000000..655721d5d --- /dev/null +++ b/src/native/cuda/ops/linear/kernel.h @@ -0,0 +1,72 @@ +#ifndef INFINI_OPS_CUDA_LINEAR_KERNEL_H_ +#define INFINI_OPS_CUDA_LINEAR_KERNEL_H_ + +#include +#include + +#include "base/add.h" +#include "base/gemm.h" +#include "base/linear.h" + +namespace infini::ops { + +template +class CudaLinear : public Linear { + public: + CudaLinear(const Tensor a, const Tensor b, std::optional bias, + bool trans_a, bool trans_b, Tensor out) + : Linear(a, b, bias, trans_a, trans_b, out), + gemm_(a, b, std::optional{1.0f}, std::optional{0.0f}, + std::optional{static_cast(trans_a)}, + std::optional{static_cast(trans_b)}, out) { + if (has_bias_) { + add_.emplace(out, BroadcastBias(*bias, out), out); + } + } + + void operator()(const Tensor a, const Tensor b, std::optional bias, + bool trans_a, bool trans_b, Tensor out) const override { + assert(has_bias_ == bias.has_value()); + + ConfigureSubOperator(gemm_); + gemm_(a, b, std::optional{1.0f}, std::optional{0.0f}, + std::optional{static_cast(trans_a)}, + std::optional{static_cast(trans_b)}, out); + + if (has_bias_) { + auto bias_view = BroadcastBias(*bias, out); + ConfigureSubOperator(*add_); + (*add_)(out, bias_view, out); + } + } + + private: + static Tensor BroadcastBias(const Tensor& bias, const Tensor& out) { + assert(bias.ndim() == 1); + assert(bias.size(0) == out.size(-1)); + + auto shape = out.shape(); + Tensor::Strides strides(out.ndim(), 0); + strides.back() = bias.stride(0); + + return Tensor{const_cast(bias.data()), shape, bias.dtype(), + bias.device(), strides}; + } + + template + void ConfigureSubOperator(Op& op) const { + op.set_handle(handle_); + op.set_config(config_); + op.set_stream(stream_); + op.set_workspace(workspace_); + op.set_workspace_size_in_bytes(workspace_size_in_bytes_); + } + + mutable Operator gemm_; + + mutable std::optional> add_; +}; + +} // namespace infini::ops + +#endif diff --git a/src/cuda/rms_norm/kernel.cuh b/src/native/cuda/ops/rms_norm/kernel.cuh similarity index 96% rename from src/cuda/rms_norm/kernel.cuh rename to src/native/cuda/ops/rms_norm/kernel.cuh index 980f776ec..91036a37f 100644 --- a/src/cuda/rms_norm/kernel.cuh +++ b/src/native/cuda/ops/rms_norm/kernel.cuh @@ -5,8 +5,8 @@ #include #include -#include "cuda/caster.cuh" -#include "cuda/kernel_commons.cuh" +#include "native/cuda/caster.cuh" +#include "native/cuda/kernel_commons.cuh" namespace infini::ops { diff --git a/src/cuda/rms_norm/kernel.h b/src/native/cuda/ops/rms_norm/kernel.h similarity index 93% rename from src/cuda/rms_norm/kernel.h rename to src/native/cuda/ops/rms_norm/kernel.h index 14146edca..00d2a8c2d 100644 --- a/src/cuda/rms_norm/kernel.h +++ b/src/native/cuda/ops/rms_norm/kernel.h @@ -5,11 +5,11 @@ #include #include "base/rms_norm.h" -#include "cuda/kernel_commons.cuh" -#include "cuda/rms_norm/kernel.cuh" -#include "cuda/runtime_utils.h" #include "data_type.h" #include "dispatcher.h" +#include "native/cuda/kernel_commons.cuh" +#include "native/cuda/ops/rms_norm/kernel.cuh" +#include "native/cuda/runtime_utils.h" namespace infini::ops { diff --git a/src/cuda/swiglu/kernel.cuh b/src/native/cuda/ops/swiglu/kernel.cuh similarity index 98% rename from src/cuda/swiglu/kernel.cuh rename to src/native/cuda/ops/swiglu/kernel.cuh index 36b9f9752..a782b6f6b 100644 --- a/src/cuda/swiglu/kernel.cuh +++ b/src/native/cuda/ops/swiglu/kernel.cuh @@ -3,7 +3,7 @@ #include -#include "cuda/kernel_commons.cuh" +#include "native/cuda/kernel_commons.cuh" namespace infini::ops { diff --git a/src/cuda/swiglu/kernel.h b/src/native/cuda/ops/swiglu/kernel.h similarity index 97% rename from src/cuda/swiglu/kernel.h rename to src/native/cuda/ops/swiglu/kernel.h index 5fcfe73bb..bfd19df48 100644 --- a/src/cuda/swiglu/kernel.h +++ b/src/native/cuda/ops/swiglu/kernel.h @@ -8,8 +8,8 @@ #include "base/swiglu.h" #include "common/generic_utils.h" -#include "cuda/runtime_utils.h" -#include "cuda/swiglu/kernel.cuh" +#include "native/cuda/ops/swiglu/kernel.cuh" +#include "native/cuda/runtime_utils.h" namespace infini::ops { diff --git a/src/cuda/runtime.h b/src/native/cuda/runtime_.h similarity index 97% rename from src/cuda/runtime.h rename to src/native/cuda/runtime_.h index fdc92cc9c..4e4e61404 100644 --- a/src/cuda/runtime.h +++ b/src/native/cuda/runtime_.h @@ -3,7 +3,7 @@ #include -#include "../runtime.h" +#include "runtime.h" namespace infini::ops { diff --git a/src/cuda/runtime_utils.h b/src/native/cuda/runtime_utils.h similarity index 100% rename from src/cuda/runtime_utils.h rename to src/native/cuda/runtime_utils.h diff --git a/src/operator.h b/src/operator.h index d4609e055..83fc4ec2c 100644 --- a/src/operator.h +++ b/src/operator.h @@ -37,6 +37,14 @@ struct CacheKey { tensors.push_back(t); } + void Absorb(const std::vector& ts) { + HashCombine(hash, ts.size()); + for (const auto& t : ts) { + HashCombine(hash, t); + tensors.push_back(t); + } + } + template void Absorb(const T& v) { HashCombine(hash, v); @@ -121,7 +129,16 @@ class OperatorBase { template class Operator : public OperatorBase { + // Generation counter for lazy cache invalidation. Bumped by + // `clear_cache()`; the next `call()` detects the mismatch and + // destroys all cached operator instances. + static inline std::size_t cache_generation_{0}; + public: + // Invalidate the operator cache. Cached operators are destroyed on the + // next `call()` invocation. Intended for test isolation — production + // code should never call this. + static void clear_cache() { ++cache_generation_; } template static auto Make(const Config& config, const Tensor tensor, Args&&... args) { std::unique_ptr op_ptr; @@ -166,6 +183,12 @@ class Operator : public OperatorBase { static auto Call(const Handle& handle, const Config& config, Args&&... args) { static std::unordered_map> cache; + static std::size_t generation{0}; + + if (generation != cache_generation_) { + cache.clear(); + generation = cache_generation_; + } auto key = detail::CacheKey::Build(config.implementation_index(), args...); @@ -174,7 +197,7 @@ class Operator : public OperatorBase { if (it == cache.end()) { // Pass args as lvalue refs so they remain valid for the `operator()` call // below. Forwarding rvalue temporaries into `Make()` would leave the args - // in a moved-from (empty) state before operator() can use them. + // in a moved-from (empty) state before `operator()` can use them. it = cache.emplace(std::move(key), Make(config, args...)).first; } diff --git a/src/pybind11_utils.h b/src/pybind11_utils.h index b595836cc..f13d3116c 100644 --- a/src/pybind11_utils.h +++ b/src/pybind11_utils.h @@ -66,10 +66,20 @@ inline Tensor TensorFromPybind11Handle(py::handle obj) { inline std::optional OptionalTensorFromPybind11Handle( const std::optional& obj) { - if (!obj.has_value()) return std::nullopt; + if (!obj.has_value() || obj->is_none()) return std::nullopt; return TensorFromPybind11Handle(*obj); } +inline std::vector VectorTensorFromPybind11Handle( + const std::vector& objs) { + std::vector result; + result.reserve(objs.size()); + for (const auto& obj : objs) { + result.push_back(TensorFromPybind11Handle(obj)); + } + return result; +} + } // namespace infini::ops #endif diff --git a/src/torch/add/add.cc b/src/torch/ops/add/add.cc similarity index 98% rename from src/torch/add/add.cc rename to src/torch/ops/add/add.cc index cd2f2cc92..2108e652c 100644 --- a/src/torch/add/add.cc +++ b/src/torch/ops/add/add.cc @@ -1,4 +1,4 @@ -#include "torch/add/add.h" +#include "torch/ops/add/add.h" #include "torch/tensor_.h" diff --git a/src/torch/add/add.h b/src/torch/ops/add/add.h similarity index 100% rename from src/torch/add/add.h rename to src/torch/ops/add/add.h diff --git a/src/torch/gemm/gemm.cc b/src/torch/ops/gemm/gemm.cc similarity index 98% rename from src/torch/gemm/gemm.cc rename to src/torch/ops/gemm/gemm.cc index d17353f15..e24da7a84 100644 --- a/src/torch/gemm/gemm.cc +++ b/src/torch/ops/gemm/gemm.cc @@ -1,4 +1,4 @@ -#include "torch/gemm/gemm.h" +#include "torch/ops/gemm/gemm.h" #include "torch/tensor_.h" diff --git a/src/torch/gemm/gemm.h b/src/torch/ops/gemm/gemm.h similarity index 100% rename from src/torch/gemm/gemm.h rename to src/torch/ops/gemm/gemm.h diff --git a/tests/conftest.py b/tests/conftest.py index 8a72355e5..86d01c249 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -44,6 +44,31 @@ def set_seed_per_test(request): _set_random_seed(_hash(_test_case_path_from_request(request))) +@pytest.fixture(scope="module", autouse=True) +def _clear_operator_caches(): + """Clear the C++ operator cache between test modules. + + The `Operator::call()` cache keys on tensor geometry (shape, strides, + dtype) but not data pointers. When different test modules create tensors + with identical geometry but different data content (e.g., random + `cos_sin_cache` tables), a stale cached operator from a prior module + silently returns wrong results. Clearing the cache at module boundaries + ensures each module starts with a cold cache. + """ + yield + + try: + import infini.ops as ops + + for name in dir(ops): + cls = getattr(ops, name) + + if hasattr(cls, "clear_cache"): + cls.clear_cache() + except ImportError: + pass + + _NPU_UNSUPPORTED_DTYPES = {torch.float64} # `torch_npu` does not implement random number generation for @@ -65,6 +90,43 @@ def skip_unsupported_dtypes(request): pytest.skip(f"{params['dtype']} not supported on Ascend 910B") +@pytest.fixture(autouse=True) +def skip_op_without_platform_impl(request): + """Skip `device=` parametrizations when the op has no + implementation on any of the corresponding platforms. + + Only runs for tests that parametrize `device` but not + `implementation_index` — joint `(device, impl_idx)` parametrize in + `pytest_generate_tests` already prunes empty-impl pairs at collection + time, making this check redundant (and wasteful) on those tests. + """ + if not hasattr(request.node, "callspec"): + return + + params = request.node.callspec.params + + if "implementation_index" in params: + return + + device_selectors = _active_device_selectors_for_torch_device( + request.config, params.get("device") + ) + + if not device_selectors: + return + + op_cls = _op_class_from_module(request.node.module) + + if op_cls is None or not hasattr(op_cls, "active_implementation_indices"): + return + + if not any(op_cls.active_implementation_indices(d) for d in device_selectors): + pytest.skip( + f"{op_cls.__name__} has no implementation on any " + f"`{params.get('device')}`-mapped platform/device" + ) + + def _set_random_seed(seed): random.seed(seed) torch.manual_seed(seed) @@ -80,6 +142,26 @@ def _set_random_seed(seed): } +def _active_device_selectors_for_torch_device(config, torch_device): + """Return platform or torch device names selected for a torch device type.""" + if not torch_device: + return () + + cli_devices = config.getoption("--devices") or () + requested_platforms = tuple( + name + for name in cli_devices + if _PLATFORM_TO_TORCH_DEVICE.get(name) == torch_device + ) + + if requested_platforms: + return requested_platforms + + # The pybind layer maps torch device names (e.g. "cuda") to the backend + # compiled into the current wheel, avoiding probes of inactive CUDA siblings. + return (torch_device,) + + def _resolve_device(name): """Map a platform name (e.g., `ascend`) to a PyTorch device type (e.g., `npu`).""" return _PLATFORM_TO_TORCH_DEVICE.get(name, name) @@ -109,7 +191,69 @@ def pytest_generate_tests(metafunc): else: devices = () - metafunc.parametrize("device", devices or available) + devices = devices or available + + # Joint `(device, implementation_index)` parametrize: generate only + # pairs where the op has an active implementation on that device. + # Avoids cross-device pollution — an impl active on `cpu` but not on + # `npu` no longer appears as a runtime skip in the npu column. + if ( + "implementation_index" in metafunc.fixturenames + and "implementation_index" not in already_parametrized + ): + op_cls = _op_class_from_module(metafunc.module) + + if op_cls is not None and hasattr(op_cls, "active_implementation_indices"): + pairs = [ + (dev, idx) + for dev in devices + for idx in op_cls.active_implementation_indices(dev) + ] + + if not pairs: + # Emit one skipped placeholder so test IDs read + # `[skip-dtype0-...]` instead of `[NOTSET-...]`. + # `get_available_devices()` always includes `"cpu"`, so + # `devices[0]` is safe. + pairs = [ + pytest.param( + devices[0], + 0, + marks=pytest.mark.skip( + reason=( + f"{op_cls.__name__} has no active " + "implementation on any available device" + ) + ), + id="skip", + ) + ] + + metafunc.parametrize("device, implementation_index", pairs) + + return + + metafunc.parametrize("device", devices) + + +def _op_class_from_module(module): + """Derive the `infini.ops.` class from a `tests/test_.py` module. + + Test modules have already imported `infini.ops` by the time this runs, so + no `try/except ImportError` is needed — a real import failure would have + aborted collection long before. + """ + module_name = module.__name__.rsplit(".", 1)[-1] + + if not module_name.startswith("test_"): + return None + + op_snake = module_name[len("test_") :] + op_pascal = "".join(part.capitalize() for part in op_snake.split("_")) + + import infini.ops as _ops + + return getattr(_ops, op_pascal, None) @pytest.hookimpl(tryfirst=True) diff --git a/tests/test_add.py b/tests/test_add.py index 825fc932c..e2266c30d 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -2,7 +2,13 @@ import pytest import torch -from tests.utils import Payload, empty_strided, randint_strided, randn_strided +from tests.utils import ( + Payload, + empty_strided, + get_stream, + randint_strided, + randn_strided, +) _INT_DTYPES = (torch.int16, torch.int32, torch.int64) @@ -29,9 +35,6 @@ ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), ), ) -# TODO: Generate implementation indices dynamically from -# `Add.active_implementation_indices` instead of hardcoding. -@pytest.mark.parametrize("implementation_index", (0, 1)) @pytest.mark.parametrize( ("dtype", "rtol", "atol"), ( @@ -57,11 +60,6 @@ def test_add( "The `torch.musa` test cloning path does not support `uint16`, `uint32`, or `uint64`." ) - active_indices = infini.ops.Add.active_implementation_indices(device) - - if implementation_index not in active_indices: - pytest.skip(f"implementation `{implementation_index}` not active on `{device}`") - if implementation_index == 1 and dtype in _UINT_DTYPES: pytest.skip("ATen `add` does not support unsigned integer types") @@ -89,7 +87,13 @@ def test_add( def _add(input, other, out, implementation_index=0): - infini.ops.add(input, other, out, implementation_index=implementation_index) + infini.ops.add( + input, + other, + out, + stream=get_stream(input.device), + implementation_index=implementation_index, + ) return out diff --git a/tests/test_cast.py b/tests/test_cast.py new file mode 100644 index 000000000..bd19d934b --- /dev/null +++ b/tests/test_cast.py @@ -0,0 +1,62 @@ +import infini.ops +import pytest +import torch + +from tests.utils import Payload, empty_strided, get_stream, randn_strided + + +@pytest.mark.auto_act_and_assert +@pytest.mark.parametrize( + "shape, input_strides, out_strides", + ( + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4, 4), None, None), + ((16, 5632), None, None), + ((4, 4, 5632), None, None), + ), +) +@pytest.mark.parametrize( + ("input_dtype", "out_dtype", "rtol", "atol"), + ( + (torch.float16, torch.float32, 1e-3, 1e-3), + (torch.float32, torch.float16, 1e-3, 1e-3), + (torch.bfloat16, torch.float32, 1e-2, 5e-3), + (torch.float32, torch.bfloat16, 1e-2, 5e-3), + (torch.float16, torch.bfloat16, 1e-2, 5e-3), + (torch.bfloat16, torch.float16, 1e-2, 5e-3), + ), +) +def test_cast( + shape, + input_strides, + out_strides, + input_dtype, + out_dtype, + device, + rtol, + atol, +): + input = randn_strided(shape, input_strides, dtype=input_dtype, device=device) + out = empty_strided(shape, out_strides, dtype=out_dtype, device=device) + + return Payload( + _cast, + _torch_cast, + (input, out), + {}, + rtol=rtol, + atol=atol, + ) + + +def _cast(input, out): + infini.ops.cast(input, out, stream=get_stream(input.device)) + + return out + + +def _torch_cast(input, out): + out.copy_(input.to(out.dtype)) + + return out diff --git a/tests/test_cat.py b/tests/test_cat.py new file mode 100644 index 000000000..85428b53f --- /dev/null +++ b/tests/test_cat.py @@ -0,0 +1,69 @@ +import infini.ops +import pytest +import torch + +from tests.utils import Payload, empty_strided, get_stream, randn_strided + + +@pytest.mark.auto_act_and_assert +@pytest.mark.parametrize( + "shapes, dim, out_shape", + ( + # 2 inputs, dim=0 + (((4, 64), (4, 64)), 0, (8, 64)), + # 2 inputs, dim=1 + (((4, 32), (4, 64)), 1, (4, 96)), + # 2 inputs, dim=-1 (negative dim) + (((4, 32), (4, 64)), -1, (4, 96)), + # 3 inputs, dim=1 + (((4, 16), (4, 32), (4, 16)), 1, (4, 64)), + # 2 inputs, dim=0, 3D + (((2, 4, 64), (2, 4, 64)), 0, (4, 4, 64)), + # 2 inputs, dim=2, 3D + (((2, 4, 32), (2, 4, 64)), 2, (2, 4, 96)), + # 4 inputs, dim=1 + (((1, 1024), (1, 1024), (1, 1024), (1, 1024)), 1, (1, 4096)), + ), +) +@pytest.mark.parametrize( + ("dtype", "rtol", "atol"), + ( + (torch.float32, 1e-7, 1e-7), + (torch.float16, 1e-3, 1e-3), + (torch.bfloat16, 1e-2, 5e-3), + ), +) +def test_cat(shapes, dim, out_shape, dtype, device, rtol, atol): + inputs = [randn_strided(s, None, dtype=dtype, device=device) for s in shapes] + out = empty_strided(out_shape, None, dtype=dtype, device=device) + + return Payload( + lambda *args: _cat(*args, dim=dim), + lambda *args: _torch_cat(*args, dim=dim), + (*inputs, out), + {}, + rtol=rtol, + atol=atol, + ) + + +def _cat(*args, dim): + inputs = list(args[:-1]) + out = args[-1] + + first = inputs[0] + rest = inputs[1:] + + infini.ops.cat(first, rest, dim, out, stream=get_stream(first.device)) + + return out + + +def _torch_cat(*args, dim): + inputs = list(args[:-1]) + out = args[-1] + + result = torch.cat(inputs, dim=dim) + out.copy_(result) + + return out diff --git a/tests/test_causal_softmax.py b/tests/test_causal_softmax.py index 8b35457a9..79e6be8e0 100644 --- a/tests/test_causal_softmax.py +++ b/tests/test_causal_softmax.py @@ -2,7 +2,7 @@ import pytest import torch -from tests.utils import Payload, empty_strided, randn_strided +from tests.utils import Payload, empty_strided, get_stream, randn_strided @pytest.mark.auto_act_and_assert @@ -40,7 +40,7 @@ def test_causal_softmax(shape, input_strides, out_strides, dtype, device, rtol, def _causal_softmax(input, out): - infini.ops.causal_softmax(input, out) + infini.ops.causal_softmax(input, out, stream=get_stream(input.device)) return out @@ -48,7 +48,7 @@ def _causal_softmax(input, out): def _torch_causal_softmax(input, out): mask = torch.tril(torch.ones_like(input), diagonal=-1).flip(dims=[-2, -1]) masked = torch.where(mask == 1, -torch.inf, input.to(torch.float32)) - result = torch.nn.functional.softmax(masked, dim=-1, dtype=input.dtype) + result = torch.nn.functional.softmax(masked, dim=-1) out.copy_(result) return out diff --git a/tests/test_gemm.py b/tests/test_gemm.py index 26e102d25..71e0e8fde 100644 --- a/tests/test_gemm.py +++ b/tests/test_gemm.py @@ -20,9 +20,6 @@ @pytest.mark.parametrize("beta", (-1, -0.5, 0, 0.5, 1)) @pytest.mark.parametrize("trans_a", (False, True)) @pytest.mark.parametrize("trans_b", (False, True)) -# TODO: Generate implementation indices dynamically from -# `Gemm.active_implementation_indices` instead of hardcoding. -@pytest.mark.parametrize("implementation_index", (0, 1, 2)) @pytest.mark.parametrize( ("dtype", "rtol", "atol"), ( @@ -56,11 +53,6 @@ def test_gemm( if device == "mlu" and dtype == torch.bfloat16: pytest.skip("`bfloat16` is not supported by `cnnlBatchMatMulEx`") - active_indices = infini.ops.Gemm.active_implementation_indices(device) - - if implementation_index not in active_indices: - pytest.skip(f"implementation `{implementation_index}` not active on `{device}`") - if implementation_index == 1 and dtype in (torch.float16, torch.bfloat16): pytest.skip("cuBLASLt half-precision exceeds current tolerances") @@ -71,6 +63,18 @@ def test_gemm( ): pytest.skip("ATen CPU `addmm`/`baddbmm` does not support half-precision") + if implementation_index == 2 and device == "npu": + # `src/torch/gemm/gemm.h` partial-specializes `Operator` + # for every `kDev` including `kAscend`, so the SFINAE-based + # `active_implementation_indices` reports `2` as active even though + # `torch/gemm/gemm.cc` only instantiates it for CPU/NVIDIA. + # Dispatching through the unused Ascend specialization reads from an + # uninitialized vtable and crashes. See PR #64 discussion. + pytest.skip( + "Gemm impl=2 on Ascend is a torch-fallback stub without an " + "instantiated specialization" + ) + a = randn_strided(a_shape, a_strides, dtype=dtype, device=device) b = randn_strided(b_shape, b_strides, dtype=dtype, device=device) diff --git a/tests/test_linear.py b/tests/test_linear.py new file mode 100644 index 000000000..364ba5fcf --- /dev/null +++ b/tests/test_linear.py @@ -0,0 +1,90 @@ +import infini.ops +import pytest +import torch + +from tests.utils import Payload, empty_strided, get_stream, randn_strided + + +@pytest.mark.auto_act_and_assert +@pytest.mark.parametrize( + "a_shape, b_shape, out_shape", + ( + ((4, 64), (64, 32), (4, 32)), + ((2, 128), (128, 256), (2, 256)), + ((1, 4096), (4096, 4096), (1, 4096)), + ((2, 4, 64), (2, 64, 32), (2, 4, 32)), + ((4, 8, 128), (4, 128, 64), (4, 8, 64)), + ), +) +@pytest.mark.parametrize("trans_a", (False, True)) +@pytest.mark.parametrize("trans_b", (False, True)) +@pytest.mark.parametrize("has_bias", (False, True)) +@pytest.mark.parametrize( + ("dtype", "rtol", "atol"), + ( + (torch.float32, 1e-2, 5e-2), + (torch.float16, 1e-2, 1e-2), + (torch.bfloat16, 1e-2, 1e-2), + ), +) +def test_linear( + a_shape, + b_shape, + out_shape, + trans_a, + trans_b, + has_bias, + dtype, + device, + rtol, + atol, +): + a = randn_strided(a_shape, None, dtype=dtype, device=device) + b = randn_strided(b_shape, None, dtype=dtype, device=device) + + if trans_a: + a = a.transpose(-2, -1) + + if trans_b: + b = b.transpose(-2, -1) + + # Bias shape is [N], the last dim of the output. + bias = None + + if has_bias: + N = out_shape[-1] + bias = randn_strided((N,), None, dtype=dtype, device=device) + + out = empty_strided(out_shape, None, dtype=dtype, device=device) + + return Payload( + lambda *args: _linear(*args, trans_a=trans_a, trans_b=trans_b), + lambda *args: _torch_linear(*args, trans_a=trans_a, trans_b=trans_b), + (a, b, bias, out), + {}, + rtol=rtol, + atol=atol, + ) + + +def _linear(a, b, bias, out, trans_a=False, trans_b=False): + infini.ops.linear(a, b, bias, trans_a, trans_b, out, stream=get_stream(a.device)) + + return out + + +def _torch_linear(a, b, bias, out, trans_a=False, trans_b=False): + if trans_a: + a = a.transpose(-2, -1) + + if trans_b: + b = b.transpose(-2, -1) + + result = torch.matmul(a.float(), b.float()) + + if bias is not None: + result = result + bias.float() + + out.copy_(result.to(out.dtype)) + + return out diff --git a/tests/test_matmul.py b/tests/test_matmul.py new file mode 100644 index 000000000..fea3822a8 --- /dev/null +++ b/tests/test_matmul.py @@ -0,0 +1,76 @@ +import infini.ops +import pytest +import torch + +from tests.utils import Payload, empty_strided, get_stream, randn_strided + + +@pytest.mark.auto_act_and_assert +@pytest.mark.parametrize( + "a_shape, b_shape, c_shape", + ( + ((4, 64), (64, 32), (4, 32)), + ((2, 128), (128, 256), (2, 256)), + ((2, 4, 64), (2, 64, 32), (2, 4, 32)), + ((4, 8, 128), (4, 128, 64), (4, 8, 64)), + ), +) +@pytest.mark.parametrize("trans_a", (False, True)) +@pytest.mark.parametrize("trans_b", (False, True)) +@pytest.mark.parametrize( + ("dtype", "rtol", "atol"), + ( + (torch.float32, 1e-2, 1e-2), + (torch.float16, 1e-2, 1e-2), + (torch.bfloat16, 1e-2, 1e-2), + ), +) +def test_matmul( + a_shape, + b_shape, + c_shape, + trans_a, + trans_b, + dtype, + device, + rtol, + atol, +): + a = randn_strided(a_shape, None, dtype=dtype, device=device) + b = randn_strided(b_shape, None, dtype=dtype, device=device) + + if trans_a: + a = a.transpose(-2, -1) + + if trans_b: + b = b.transpose(-2, -1) + + c = empty_strided(c_shape, None, dtype=dtype, device=device) + + return Payload( + lambda *args: _matmul(*args, trans_a=trans_a, trans_b=trans_b), + lambda *args: _torch_matmul(*args, trans_a=trans_a, trans_b=trans_b), + (a, b, c), + {}, + rtol=rtol, + atol=atol, + ) + + +def _matmul(a, b, c, trans_a=False, trans_b=False): + infini.ops.matmul(a, b, c, trans_a, trans_b, stream=get_stream(a.device)) + + return c + + +def _torch_matmul(a, b, c, trans_a=False, trans_b=False): + if trans_a: + a = a.transpose(-2, -1) + + if trans_b: + b = b.transpose(-2, -1) + + result = torch.matmul(a.float(), b.float()).to(c.dtype) + c.copy_(result) + + return c diff --git a/tests/test_mul.py b/tests/test_mul.py new file mode 100644 index 000000000..e368f96df --- /dev/null +++ b/tests/test_mul.py @@ -0,0 +1,87 @@ +import infini.ops +import pytest +import torch + +from tests.utils import ( + Payload, + empty_strided, + get_stream, + randint_strided, + randn_strided, +) + +_INT_DTYPES = (torch.int16, torch.int32, torch.int64) + +_UINT_DTYPES = tuple( + filter(None, (getattr(torch, f"uint{bits}", None) for bits in (16, 32, 64))) +) + + +@pytest.mark.auto_act_and_assert +@pytest.mark.parametrize( + "shape, input_strides, other_strides, out_strides", + ( + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), + ), +) +@pytest.mark.parametrize( + ("dtype", "rtol", "atol"), + ( + (torch.float32, 1e-7, 1e-7), + (torch.float16, 1e-3, 1e-3), + (torch.bfloat16, 1e-2, 5e-3), + ) + + tuple((dtype, 0, 0) for dtype in _INT_DTYPES + _UINT_DTYPES), +) +def test_mul( + shape, input_strides, other_strides, out_strides, dtype, device, rtol, atol +): + if device == "musa" and dtype in _UINT_DTYPES: + pytest.skip( + "The `torch.musa` test cloning path does not support `uint16`, `uint32`, or `uint64`." + ) + + if dtype in _INT_DTYPES or dtype in _UINT_DTYPES: + input = randint_strided( + 0, 100, shape, input_strides, dtype=dtype, device=device + ) + other = randint_strided( + 0, 100, shape, other_strides, dtype=dtype, device=device + ) + else: + input = randn_strided(shape, input_strides, dtype=dtype, device=device) + other = randn_strided(shape, other_strides, dtype=dtype, device=device) + + out = empty_strided(shape, out_strides, dtype=dtype, device=device) + + return Payload(_mul, _torch_mul, (input, other, out), {}, rtol=rtol, atol=atol) + + +def _mul(input, other, out): + infini.ops.mul(input, other, out, stream=get_stream(input.device)) + + return out + + +def _torch_mul(input, other, out): + if input.dtype in _UINT_DTYPES: + input = input.to(torch.int64) + + if other.dtype in _UINT_DTYPES: + other = other.to(torch.int64) + + res = torch.mul(input, other) + out.copy_(res.to(out.dtype)) + + return out diff --git a/tests/test_rms_norm.py b/tests/test_rms_norm.py index d6d4dff17..45f9199ba 100644 --- a/tests/test_rms_norm.py +++ b/tests/test_rms_norm.py @@ -2,7 +2,7 @@ import pytest import torch -from tests.utils import Payload, empty_strided, randn_strided +from tests.utils import Payload, empty_strided, get_stream, randn_strided @pytest.mark.auto_act_and_assert @@ -33,6 +33,7 @@ def test_rms_norm( weight_strides, out_strides, eps, + implementation_index, dtype, device, rtol, @@ -43,7 +44,9 @@ def test_rms_norm( out = empty_strided(input_shape, out_strides, dtype=dtype, device=device) return Payload( - _rms_norm, + lambda *args, **kwargs: _rms_norm( + *args, **kwargs, implementation_index=implementation_index + ), _torch_rms_norm, (input, weight), {"eps": eps, "out": out}, @@ -52,8 +55,15 @@ def test_rms_norm( ) -def _rms_norm(input, weight, *, eps=1e-6, out=None): - infini.ops.rms_norm(input, weight, eps, out) +def _rms_norm(input, weight, *, eps=1e-6, out=None, implementation_index=0): + infini.ops.rms_norm( + input, + weight, + eps, + out, + implementation_index=implementation_index, + stream=get_stream(input.device), + ) return out diff --git a/tests/test_swiglu.py b/tests/test_swiglu.py index 89c95f77f..f159742ca 100644 --- a/tests/test_swiglu.py +++ b/tests/test_swiglu.py @@ -2,7 +2,7 @@ import pytest import torch -from tests.utils import Payload, empty_strided, rand_strided +from tests.utils import Payload, empty_strided, get_stream, rand_strided @pytest.mark.auto_act_and_assert @@ -28,17 +28,40 @@ ), ) def test_swiglu( - shape, input_strides, gate_strides, out_strides, dtype, device, rtol, atol + shape, + input_strides, + gate_strides, + out_strides, + implementation_index, + dtype, + device, + rtol, + atol, ): input = rand_strided(shape, input_strides, dtype=dtype, device=device) gate = rand_strided(shape, gate_strides, dtype=dtype, device=device) out = empty_strided(shape, out_strides, dtype=dtype, device=device) - return Payload(_swiglu, _torch_swiglu, (input, gate, out), {}, rtol=rtol, atol=atol) + return Payload( + lambda *args, **kwargs: _swiglu( + *args, **kwargs, implementation_index=implementation_index + ), + _torch_swiglu, + (input, gate, out), + {}, + rtol=rtol, + atol=atol, + ) -def _swiglu(input, gate, out): - infini.ops.swiglu(input, gate, out) +def _swiglu(input, gate, out, implementation_index=0): + infini.ops.swiglu( + input, + gate, + out, + implementation_index=implementation_index, + stream=get_stream(input.device), + ) return out diff --git a/tests/utils.py b/tests/utils.py index 8f9532aa0..982d05aec 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -82,11 +82,21 @@ def randint_strided(low, high, shape, strides, *, dtype=None, device=None): return output +_STREAM_ACCESSORS = { + "npu": ("npu", "npu_stream"), + "cuda": ("cuda", "cuda_stream"), + "mlu": ("mlu", "mlu_stream"), + "musa": ("musa", "musa_stream"), +} + + def get_stream(device): """Return the raw stream handle for `device`, or 0 for CPU. - Uses `torch.accelerator.current_stream` when available, falling back to - device-specific APIs for older PyTorch versions. + Uses the device-specific `torch..current_stream()` API rather than + `torch.accelerator.current_stream()` — the latter returns a different + stream object on torch 2.9 + vllm-ascend, producing cross-stream data + hazards on cached-executor ops. """ if isinstance(device, torch.device): device = device.type @@ -97,32 +107,19 @@ def get_stream(device): if device == "cpu": return 0 - if hasattr(torch, "accelerator") and hasattr(torch.accelerator, "current_stream"): - stream = torch.accelerator.current_stream() - - # Each backend exposes the raw handle under a different attribute name. - for attr in ("npu_stream", "cuda_stream", "mlu_stream", "musa_stream"): - if hasattr(stream, attr): - return getattr(stream, attr) + mod_name, attr = _STREAM_ACCESSORS.get(device, (None, None)) + if mod_name is None: return 0 - # Fallback for older PyTorch builds without `torch.accelerator`. - _STREAM_ACCESSORS = { - "npu": ("npu", "npu_stream"), - "cuda": ("cuda", "cuda_stream"), - "mlu": ("mlu", "mlu_stream"), - "musa": ("musa", "musa_stream"), - } + mod = getattr(torch, mod_name, None) - if device in _STREAM_ACCESSORS: - mod_name, attr = _STREAM_ACCESSORS[device] - mod = getattr(torch, mod_name, None) + if mod is None: + return 0 - if mod is not None and hasattr(mod, "current_stream"): - return getattr(mod.current_stream(), attr) + stream = mod.current_stream() - return 0 + return getattr(stream, attr, 0) def clone_strided(input):