diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile
index 3ff79e1c8..a542b99e0 100644
--- a/.ci/images/ascend/Dockerfile
+++ b/.ci/images/ascend/Dockerfile
@@ -18,4 +18,12 @@ RUN pip install --no-cache-dir --progress off \
     pytest-xdist \
     ruff
 
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64:${LD_LIBRARY_PATH}
+ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
+ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
+
 WORKDIR /workspace
diff --git a/.ci/run.py b/.ci/run.py
index 7330d9694..e293b4a28 100644
--- a/.ci/run.py
+++ b/.ci/run.py
@@ -8,6 +8,7 @@
 import subprocess
 import sys
 import uuid
+import xml.etree.ElementTree as ET
 from datetime import datetime
 from pathlib import Path
 
@@ -24,6 +25,42 @@
 _PYTEST_VALUE_FLAGS = {"-n", "-k", "-m", "-p", "--tb", "--junitxml", "--rootdir"}
 
 
+def _junit_xml_indicates_pass(results_dir):
+    """Return True if `pytest` junit XML under `results_dir` reports no failures/errors.
+
+    Used to distinguish a real CI failure from the docker 18.09
+    container-teardown `SIGKILL` (exit code 137) that occurs on this host
+    after a child process exits successfully — bash returns 0 from inside
+    the container, but the docker daemon reports 137 due to a race in its
+    `--rm` cleanup path. The junit XML is written by pytest before that
+    teardown and reliably captures the real outcome of the test stage.
+    """
+    for junit in Path(results_dir).rglob("test-results.xml"):
+        try:
+            root = ET.parse(junit).getroot()
+        except ET.ParseError:
+            continue
+
+        suites = root.findall("testsuite") if root.tag == "testsuites" else [root]
+
+        if not suites:
+            continue
+
+        for suite in suites:
+            try:
+                if int(suite.get("failures", 0)) > 0:
+                    return False
+
+                if int(suite.get("errors", 0)) > 0:
+                    return False
+            except ValueError:
+                return False
+
+        return True
+
+    return False
+
+
 def apply_test_override(run_cmd, test_path):
     """Replace positional test path(s) in a pytest stage command.
 
@@ -437,8 +474,23 @@ def main():
             pool.release(allocated_ids)
 
         if returncode != 0:
-            print(f"job {job_name} failed (exit code {returncode})", file=sys.stderr)
-            failed += 1
+            # Docker 18.09 on this host occasionally SIGKILLs containers
+            # during `--rm` cleanup after the inner process already exited
+            # cleanly, producing exit code 137. Fall back to the pytest
+            # junit XML to recover the real outcome in that case.
+            if returncode == 137 and _junit_xml_indicates_pass(results_dir):
+                print(
+                    f"[warn] job {job_name}: container exited with 137 "
+                    f"(likely docker teardown SIGKILL after clean pytest); "
+                    f"junit XML reports no failures — treating as success",
+                    file=sys.stderr,
+                )
+            else:
+                print(
+                    f"job {job_name} failed (exit code {returncode})",
+                    file=sys.stderr,
+                )
+                failed += 1
 
     sys.exit(1 if failed else 0)
 
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 000000000..63af38e3b
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,81 @@
+---
+# Google C++ Style Guide enforcement.
+#
+# `clang-format` (via `.clang-format`) handles formatting; this file handles
+# everything `clang-format` cannot see — primarily identifier naming.
+#
+# Usage:
+#   clang-tidy -p build/ --quiet <file>   (needs `compile_commands.json`)
+#
+# The project's scikit-build config sets `CMAKE_EXPORT_COMPILE_COMMANDS=ON`,
+# so `pip install -e .[dev]` produces `build*/compile_commands.json`.
+#
+# Subdirectories may ship their own `.clang-tidy` with
+# `InheritParentConfig: true` to override specific rules (e.g.
+# `src/ascend/custom/.clang-tidy` relaxes `FunctionCase` for `extern "C"`
+# `AscendC` kernel entries whose symbol names are dictated by
+# `ascendc_add_operator(OP_NAME …)`).
+
+Checks: >
+  readability-identifier-naming,
+  google-explicit-constructor,
+  google-readability-braces-around-statements,
+  google-readability-casting,
+  modernize-use-nullptr,
+  modernize-use-override
+
+# Only naming violations fail the lint — the rest are advisory while the
+# project ramps up on `clang-tidy`.
+WarningsAsErrors: 'readability-identifier-naming.*'
+
+HeaderFilterRegex: '^(src|tests)/.*\.(h|hpp|cuh)$'
+
+CheckOptions:
+  # Types.
+  - {key: readability-identifier-naming.ClassCase,                 value: CamelCase}
+  - {key: readability-identifier-naming.StructCase,                value: CamelCase}
+  - {key: readability-identifier-naming.UnionCase,                 value: CamelCase}
+  - {key: readability-identifier-naming.EnumCase,                  value: CamelCase}
+  - {key: readability-identifier-naming.TypeAliasCase,             value: CamelCase}
+  - {key: readability-identifier-naming.TypedefCase,               value: CamelCase}
+  - {key: readability-identifier-naming.TypeTemplateParameterCase, value: CamelCase}
+
+  # Enumerators and constants use `k` + PascalCase.
+  - {key: readability-identifier-naming.EnumConstantCase,          value: CamelCase}
+  - {key: readability-identifier-naming.EnumConstantPrefix,        value: 'k'}
+  - {key: readability-identifier-naming.ConstexprVariableCase,     value: CamelCase}
+  - {key: readability-identifier-naming.ConstexprVariablePrefix,   value: 'k'}
+  - {key: readability-identifier-naming.GlobalConstantCase,        value: CamelCase}
+  - {key: readability-identifier-naming.GlobalConstantPrefix,      value: 'k'}
+  - {key: readability-identifier-naming.StaticConstantCase,        value: CamelCase}
+  - {key: readability-identifier-naming.StaticConstantPrefix,      value: 'k'}
+
+  # Functions and methods use PascalCase.
+  - {key: readability-identifier-naming.FunctionCase,              value: CamelCase}
+  - {key: readability-identifier-naming.MethodCase,                value: CamelCase}
+
+  # Variables and parameters use snake_case.
+  - {key: readability-identifier-naming.VariableCase,              value: lower_case}
+  - {key: readability-identifier-naming.ParameterCase,             value: lower_case}
+  - {key: readability-identifier-naming.LocalVariableCase,         value: lower_case}
+
+  # Class data members use snake_case with a trailing `_`.
+  - {key: readability-identifier-naming.PrivateMemberCase,         value: lower_case}
+  - {key: readability-identifier-naming.PrivateMemberSuffix,       value: '_'}
+  - {key: readability-identifier-naming.ProtectedMemberCase,       value: lower_case}
+  - {key: readability-identifier-naming.ProtectedMemberSuffix,     value: '_'}
+  - {key: readability-identifier-naming.PublicMemberCase,          value: lower_case}
+
+  # Macros use UPPER_CASE.
+  - {key: readability-identifier-naming.MacroDefinitionCase,       value: UPPER_CASE}
+  # Include guards end in a trailing `_` (e.g. `INFINI_OPS_FOO_H_`), which
+  # the default `UPPER_CASE` style rejects — skip them by regex.
+  - {key: readability-identifier-naming.MacroDefinitionIgnoredRegexp, value: '^INFINI_OPS_.*_H_$'}
+
+  # Namespaces use lower_case.
+  - {key: readability-identifier-naming.NamespaceCase,             value: lower_case}
+
+  # Exemptions.
+  # Type-trait variables (`IsFP16`, `IsBFloat16`, `HasKey`, …) mirror
+  # `std::is_*_v<>` naming — `PascalCase` without `k` prefix.
+  - {key: readability-identifier-naming.ConstexprVariableIgnoredRegexp, value: '^(Is|Has|Can)[A-Z].*'}
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 000000000..656ddd9e4
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,202 @@
+<!--
+Thanks for contributing to InfiniOps! Please read `CONTRIBUTING.md` before
+opening a pull request and fill out every section below. Delete any section
+that is genuinely not applicable (and note why), but do not delete the
+"Checklist" section — it must be filled in for every PR.
+
+The PR title MUST follow Conventional Commits, e.g.
+  feat: add fused RMSNorm kernel
+  fix: correct stride handling for batched matmul
+See: https://www.conventionalcommits.org/
+-->
+
+## Summary
+
+<!--
+A concise description of **what** this PR changes. Prefer bullet points over
+prose. Reference files with backtick-fenced paths (e.g. `src/cuda/gemm/blas.h`).
+-->
+
+-
+-
+
+## Motivation
+
+<!--
+Explain **why** this change is needed. Link to any related issue, bug, or
+discussion. If this is a performance change, include before/after numbers
+(hardware, shape, dtype, and the measurement methodology).
+-->
+
+Closes #
+
+## Type of Change
+
+<!-- Tick one or more. The type MUST match the Conventional Commits prefix
+in the PR title and the branch name (see `CONTRIBUTING.md` §Branches). -->
+
+- [ ] `feat` — new feature / new operator / new platform
+- [ ] `fix` — bug fix
+- [ ] `perf` — performance improvement (no behavioral change)
+- [ ] `refactor` — code restructuring without behavior change
+- [ ] `test` — adding or fixing tests only
+- [ ] `docs` — documentation only
+- [ ] `build` / `ci` — build system or CI configuration
+- [ ] `chore` — tooling, formatting, or other non-code changes
+- [ ] Breaking change (requires a `!` in the Conventional Commits prefix or a `BREAKING CHANGE:` footer)
+
+## Platforms Affected
+
+<!-- Tick every backend whose code is touched **or** whose behavior may change. -->
+
+- [ ] CPU (`WITH_CPU`)
+- [ ] NVIDIA (`WITH_NVIDIA`)
+- [ ] Iluvatar (`WITH_ILUVATAR`)
+- [ ] MetaX (`WITH_METAX`)
+- [ ] Cambricon (`WITH_CAMBRICON`)
+- [ ] Moore (`WITH_MOORE`)
+- [ ] Ascend (`WITH_ASCEND`)
+- [ ] PyTorch C++ bindings (`WITH_TORCH`)
+- [ ] Build system / CMake / CI
+- [ ] Python bindings / user-facing API
+
+## Test Results on Supported Platforms
+
+<!--
+Per `CONTRIBUTING.md` §Pull Requests, you MUST build and test on every
+supported platform touched by this PR (or explain why a platform is not
+reachable). Paste `pytest` output summaries below (a trimmed tail is fine —
+include pass/fail counts, skipped tests, and any warnings).
+
+If a platform was **not** tested, state the reason (e.g. "no hardware
+available"). Reviewers may block the PR until coverage is provided or an
+explicit owner signs off on a partial run.
+-->
+
+| Platform   | Built | `pytest` Result | Notes / Hardware |
+| ---------- | :---: | --------------- | ---------------- |
+| NVIDIA     |       |                 |                  |
+| Iluvatar   |       |                 |                  |
+| MetaX      |       |                 |                  |
+| Cambricon  |       |                 |                  |
+| Moore      |       |                 |                  |
+| Ascend     |       |                 |                  |
+
+<details>
+<summary>Full `pytest` output (optional)</summary>
+
+```text
+paste here
+```
+
+</details>
+
+## Benchmark / Performance Impact
+
+<!--
+Required for `perf` PRs; optional otherwise. Describe the benchmark harness,
+shapes, dtypes, hardware, and include baseline vs. new numbers. If the PR is
+not performance-sensitive, write "N/A".
+-->
+
+## Notes for Reviewers
+
+<!--
+Anything reviewers should focus on: subtle invariants, known trade-offs,
+follow-up work intentionally left out of scope, etc.
+-->
+
+---
+
+## Checklist
+
+> Every contributor **must** verify every item below before requesting
+> review. Tick each box only after the check has actually been performed —
+> do not tick speculatively. If an item truly does not apply, replace the
+> checkbox with `N/A` and briefly explain why in an inline comment.
+
+### Title, Branch, and Commits
+
+- [ ] PR **title** follows [Conventional Commits](https://www.conventionalcommits.org/) (e.g. `feat(nvidia): …`, `fix(cuda/gemm): …`).
+- [ ] Branch name follows `<type>/xxx-yyyy-zzzz` where `<type>` matches the PR title's Conventional Commits type and words are joined with hyphens (see `CONTRIBUTING.md` §Branches).
+- [ ] Each **commit** message follows Conventional Commits.
+- [ ] Small PR is a **single squashable commit**; or, for a large PR, every commit is meaningful, well-formed, and independently reviewable (see `CONTRIBUTING.md` §Pull Requests).
+- [ ] No stray merge commits from `master` — the branch is rebased cleanly on top of the current `master`.
+- [ ] No `fixup!` / `squash!` / `wip` commits remain.
+
+### Scope and Design
+
+- [ ] Changes are **minimal** — nothing unrelated to the stated motivation was added (`CONTRIBUTING.md` §Code/General).
+- [ ] No dead code, commented-out blocks, debug prints, `printf`/`std::cout`/`print(...)` left behind, or `TODO` without an owner and issue link.
+- [ ] No unrelated formatting churn that would obscure the diff.
+- [ ] Public API changes (if any) are intentional, documented, and reflected in affected callers/tests.
+
+### General Code Hygiene (applies to all languages)
+
+- [ ] The code is self-explanatory; comments were added **only** where the *why* is non-obvious (`CONTRIBUTING.md` §Code/General).
+- [ ] Every modified or added file **ends with a single trailing newline** (`CONTRIBUTING.md` §Code/General).
+- [ ] No trailing whitespace, tab/space mixing, or stray BOMs.
+- [ ] Identifiers in comments and error messages are wrapped in backticks (e.g. ``the `seqlens_k` tensor``) (`CONTRIBUTING.md` §Code/General).
+- [ ] All comments and error messages are in **English** (`CONTRIBUTING.md` §Code/General).
+- [ ] Comments and error messages are complete sentences — capitalized first letter, terminal punctuation — **unless** the language/framework convention says otherwise (`CONTRIBUTING.md` §Code/General; §Python).
+
+### C++ Specific (if C++ files changed)
+
+- [ ] Code follows the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) strictly.
+- [ ] `clang-format` (version **21**, per `.github/workflows/clang-format.yml`) has been run against all modified `.h`, `.cc`, `.cuh`, and `.mlu` files; the diff is clean.
+- [ ] `clang-tidy` concerns (per `.clang-tidy`) have been reviewed — no new warnings beyond the existing baseline.
+- [ ] Operator parameter order is **inputs first, outputs last**; attributes are between inputs and outputs; naming follows PyTorch → ONNX → CUDA API precedence (`CONTRIBUTING.md` §C++).
+- [ ] **No exceptions** are thrown. Error paths use `assert` with messages that include at least `__FILE__`, `__LINE__`, and `__func__` (`CONTRIBUTING.md` §C++).
+- [ ] Error and warning message wording follows the [LLVM Coding Standards](https://llvm.org/docs/CodingStandards.html#error-and-warning-messages) (`CONTRIBUTING.md` §C++).
+- [ ] Kernel files are named correctly: custom = `kernel` / `kernel_v2` / …; well-known algorithms use the algorithm name; library-based implementations use the library name (`CONTRIBUTING.md` §C++).
+- [ ] Kernel and kernel launcher are in **separate files**: launcher `.h`, kernel follows platform conventions (e.g. `.cuh` + `.cu`) even when non-templated (`CONTRIBUTING.md` §C++).
+- [ ] Constructor **initializer list order matches member declaration order** (`CONTRIBUTING.md` §C++).
+- [ ] Exactly **one blank line** between classes, between classes and functions, and between functions (`CONTRIBUTING.md` §C++).
+- [ ] Exactly **one blank line** between members (functions *and* variables) within a class (`CONTRIBUTING.md` §C++).
+- [ ] Exactly **one blank line** before and after the contents of a namespace (`CONTRIBUTING.md` §C++).
+- [ ] New operators added via `src/base/<op>.h` (inheriting `Operator<Op>`) with platform implementations under `src/<category>/<platform>/` inheriting the base (`CONTRIBUTING.md` §Adding an Operator).
+- [ ] No raw `new`/`delete`; RAII / smart pointers / existing allocators are used.
+
+### Python Specific (if Python files changed)
+
+- [ ] Code is [PEP 8](https://peps.python.org/pep-0008/) compliant; `ruff check` passes cleanly on CI (see `.github/workflows/ruff.yml`).
+- [ ] `ruff format --check` passes cleanly — if not, run `ruff format` and commit the result.
+- [ ] Comments are complete English sentences, starting with a capital letter and ending with punctuation; Markdown backticks are used for code references (`CONTRIBUTING.md` §Python).
+- [ ] Framework-specific conventions (e.g. lowercase `pytest.skip` messages without terminal period) are honored where applicable (`CONTRIBUTING.md` §Python).
+- [ ] **No blank line** between the function signature and the body when there is no docstring or comment (`CONTRIBUTING.md` §Python).
+- [ ] A blank line is present **before and after** `if`, `for`, and similar control-flow statements (`CONTRIBUTING.md` §Python).
+- [ ] A blank line appears **before** each `return`, except when it directly follows a control-flow statement (`CONTRIBUTING.md` §Python).
+- [ ] Docstrings (if any) follow [PEP 257](https://peps.python.org/pep-0257/) (`CONTRIBUTING.md` §Python).
+- [ ] Type hints are added / kept consistent with the surrounding code.
+
+### Testing
+
+- [ ] `pytest` was run locally on **every supported platform** that this PR can affect, and the results are recorded in the "Test Results" table above (`CONTRIBUTING.md` §Pull Requests).
+- [ ] For any platform that could not be tested, an explicit reason is given in the table and a reviewer with access has been tagged.
+- [ ] New functionality has matching tests under `tests/` following `tests/test_add.py` / `tests/test_gemm.py` patterns (`CONTRIBUTING.md` §Adding an Operator).
+- [ ] Tests use `pytest.mark.parametrize` correctly: dependent parameters share one decorator (e.g. `@pytest.mark.parametrize("dtype, rtol, atol", …)`), independent parameters use separate decorators ordered by parameter declaration.
+- [ ] Where appropriate, `pytest.mark.auto_act_and_assert` is used and the test returns a `Payload` whose `func` and `ref` share the same calling convention.
+- [ ] Default `dtype` / `device` parameterization is relied on, or overridden with an explicit `pytest.mark.parametrize` when necessary.
+- [ ] Any new test that is flaky under parallelism is marked so, or documented to require `pytest -n 1`.
+- [ ] For bug fixes: a regression test has been added that fails on `master` and passes with this PR.
+
+### Build, CI, and Tooling
+
+- [ ] The project builds cleanly from a fresh directory with `pip install .[dev]` on at least one affected platform.
+- [ ] `compile_commands.json` still regenerates (CMake option `CMAKE_EXPORT_COMPILE_COMMANDS=ON` in `pyproject.toml` — required by the `code-lint` skill and `clang-tidy -p`).
+- [ ] New backends / devices have been added to auto-detection in `CMakeLists.txt` under `if(AUTO_DETECT_DEVICES)` **and** to `if(AUTO_DETECT_BACKENDS)` if applicable.
+- [ ] Only one CUDA-like GPU backend is selectable at a time — the existing mutual-exclusion check in `CMakeLists.txt` is not broken.
+- [ ] Both CI workflows (`clang-format.yml`, `ruff.yml`) are green locally (or expected to be green on CI).
+- [ ] No new runtime dependency was added without updating `pyproject.toml`'s `[project.optional-dependencies]` (or justified in the PR description).
+
+### Documentation
+
+- [ ] `README.md`, `CONTRIBUTING.md`, or inline docs updated when behavior, build flags, or developer workflow changed.
+- [ ] New operators, new dispatch helpers, or new public utilities are documented (docstring, header comment, or an addition to `CONTRIBUTING.md` §Some Code Explanations).
+- [ ] Any user-visible breaking change is called out explicitly under "Motivation" **and** in the commit/PR title with a `!` or `BREAKING CHANGE:` footer.
+
+### Security and Safety
+
+- [ ] No secrets, access tokens, internal URLs, customer data, or personal hardware identifiers have been committed.
+- [ ] Third-party code is license-compatible and attributed.
+- [ ] No unsafe pointer arithmetic, uninitialized reads, or missing bounds checks were introduced.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e88cc20c0..91c2b0154 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@ project(InfiniOps LANGUAGES CXX)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
-# Internal variable to control pybind11's automatic optimization flags (like `-flto`).
+# Internal variable to control `pybind11`'s automatic optimization flags (like `-flto`).
 set(PYBIND11_ENABLE_EXTRAS ON)
 
 # Options for backends.
@@ -18,6 +18,13 @@ option(WITH_ASCEND "Enable Ascend backend" OFF)
 
 option(WITH_TORCH "Enable PyTorch C++ backend" OFF)
 
+# Default OFF until CANN's `extract_host_stub.py` path handling is fixed for
+# `scikit-build-core` temp-dir builds (triggers `KeyError` on the preprocessed
+# object path). Enable explicitly with `-DBUILD_CUSTOM_KERNEL=ON` when the
+# toolchain is compatible or when building via the standalone
+# `src/ascend/custom/build.sh` script.
+option(BUILD_CUSTOM_KERNEL "Build custom AscendC kernel PyTorch extension (requires `torch_npu`)" OFF)
+
 option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
 option(AUTO_DETECT_BACKENDS "Automatically detect available backends" OFF)
 option(GENERATE_PYTHON_BINDINGS "Generate Python bindings" OFF)
@@ -130,6 +137,28 @@ if(WITH_TORCH)
     find_library(C10_LIB c10 HINTS ${_torch_lib_dirs} REQUIRED)
     set(TORCH_LIBRARIES ${TORCH_LIB} ${TORCH_CPU_LIB} ${C10_LIB})
 
+    # `auditwheel`-repaired `torch` wheels bundle transitive dependencies
+    # (e.g. `libgfortran-<hash>.so`, `libopenblasp-<hash>.so`) in a sibling
+    # `torch.libs/` directory that `library_paths()` does not return.  When
+    # building against such a wheel, the linker needs this path to resolve
+    # the bundled `NEEDED` entries (otherwise: `undefined reference to
+    # _gfortran_etime@GFORTRAN_8` etc.).
+    execute_process(
+        COMMAND ${Python_EXECUTABLE} -c "import os, torch; d = os.path.dirname(torch.__file__); p = os.path.join(os.path.dirname(d), 'torch.libs'); print(p if os.path.isdir(p) else '')"
+        OUTPUT_VARIABLE TORCH_BUNDLED_LIBS_DIR
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+
+    if(TORCH_BUNDLED_LIBS_DIR)
+        list(APPEND CMAKE_BUILD_RPATH "${TORCH_BUNDLED_LIBS_DIR}")
+        list(APPEND CMAKE_INSTALL_RPATH "${TORCH_BUNDLED_LIBS_DIR}")
+        # `rpath-link` is linker-only: lets `ld` resolve the bundled
+        # transitive `NEEDED` entries at link time without adding them to our
+        # own binary's direct `NEEDED` list.
+        add_link_options("-Wl,-rpath-link,${TORCH_BUNDLED_LIBS_DIR}")
+        message(STATUS "PyTorch bundled libs: ${TORCH_BUNDLED_LIBS_DIR}")
+    endif()
+
     # Query the `CXX11` ABI setting that `torch` was compiled with.
     # A mismatch causes linker errors (e.g. undefined reference to
     # `c10::Device::Device(std::string const&)`).
@@ -182,7 +211,7 @@ if(WITH_ILUVATAR)
     # `-x ivcore` must not be in `CMAKE_CUDA_FLAGS` — CMake passes those flags
     # to both compile and link steps. During linking, `-x ivcore` causes
     # `clang++` to re-parse `.o` files as source code.
-    set(CMAKE_CUDA_FLAGS "--cuda-gpu-arch=${ILUVATAR_ARCH} -fPIC -Wno-error=unused-variable -Wno-error=unused-private-field -Wno-unused-variable" CACHE STRING "Iluvatar CUDA flags")
+    set(CMAKE_CUDA_FLAGS "--cuda-gpu-arch=${ILUVATAR_ARCH} -fPIC -Wno-error=unused-variable -Wno-error=unused-private-field -Wno-unused-variable -std=c++17" CACHE STRING "Iluvatar CUDA flags")
     set(CMAKE_CUDA_SEPARABLE_COMPILATION OFF CACHE BOOL "Disable RDC for Iluvatar")
     set(CMAKE_CUDA_ARCHITECTURES OFF CACHE STRING "Iluvatar CUDA architectures (passed via CMAKE_CUDA_FLAGS)")
     # Iluvatar does not ship `libcudadevrt`, which CMake's compiler test
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d131f0add..fcc19c2cc 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -91,7 +91,7 @@ pytest
 ## Adding an Operator
 
 1. **Base class** in `src/base/`: the class must inherit from `Operator<Op>` (e.g. `class Gemm : public Operator<Gemm>`). See `src/base/gemm.h`.
-2. **Platform implementation** in `src/<category>/<platform>/`: the class must inherit from the base (e.g. `class Blas : public Gemm`). See `src/cuda/gemm/blas.h` and `src/cuda/nvidia/gemm/cublas.h`.
+2. **Platform implementation** in `src/native/<category>/<platform>/ops/<op>/` (or `src/torch/ops/<op>/` for the PyTorch backend): the class must inherit from the base (e.g. `class Blas : public Gemm`). See `src/native/cuda/ops/gemm/blas.h` and `src/native/cuda/nvidia/ops/gemm/cublas.h`.
 3. **Tests** in `tests/`:
    - Use `pytest.mark.parametrize` for parameterization. Dependent parameters go in one decorator (e.g. `@pytest.mark.parametrize("dtype, rtol, atol", ...)`); independent parameters use separate decorators, ordered by parameter declaration.
    - `dtype` and `device` parameterization is included by default. Override with explicit `pytest.mark.parametrize` if needed.
diff --git a/examples/runtime_api.h b/examples/runtime_api.h
index c1ef31612..101fcad59 100644
--- a/examples/runtime_api.h
+++ b/examples/runtime_api.h
@@ -4,27 +4,27 @@
 #include "device.h"
 
 #ifdef WITH_NVIDIA
-#include "cuda/nvidia/gemm/cublas.h"
-#include "cuda/nvidia/gemm/cublaslt.h"
-#include "cuda/nvidia/runtime_.h"
+#include "native/cuda/nvidia/ops/gemm/cublas.h"
+#include "native/cuda/nvidia/ops/gemm/cublaslt.h"
+#include "native/cuda/nvidia/runtime_.h"
 #elif WITH_ILUVATAR
-#include "cuda/iluvatar/gemm/cublas.h"
-#include "cuda/iluvatar/runtime_.h"
+#include "native/cuda/iluvatar/ops/gemm/cublas.h"
+#include "native/cuda/iluvatar/runtime_.h"
 #elif WITH_METAX
-#include "cuda/metax/gemm/mcblas.h"
-#include "cuda/metax/runtime_.h"
+#include "native/cuda/metax/ops/gemm/mcblas.h"
+#include "native/cuda/metax/runtime_.h"
 #elif WITH_CAMBRICON
-#include "cambricon/gemm/cnblas.h"
-#include "cambricon/runtime_.h"
+#include "native/cambricon/ops/gemm/cnblas.h"
+#include "native/cambricon/runtime_.h"
 #elif WITH_MOORE
-#include "cuda/moore/gemm/mublas.h"
-#include "cuda/moore/runtime_.h"
+#include "native/cuda/moore/ops/gemm/mublas.h"
+#include "native/cuda/moore/runtime_.h"
 #elif WITH_ASCEND
-#include "ascend/gemm/kernel.h"
-#include "ascend/runtime_.h"
+#include "native/ascend/ops/gemm/kernel.h"
+#include "native/ascend/runtime_.h"
 #elif WITH_CPU
-#include "cpu/gemm/gemm.h"
-#include "cpu/runtime_.h"
+#include "native/cpu/ops/gemm/gemm.h"
+#include "native/cpu/runtime_.h"
 #else
 #error "One `WITH_*` backend must be enabled for the examples."
 #endif
diff --git a/pyproject.toml b/pyproject.toml
index 58740166e..959699f90 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,5 +17,10 @@ AUTO_DETECT_DEVICES = "ON"
 AUTO_DETECT_BACKENDS = "ON"
 GENERATE_PYTHON_BINDINGS = "ON"
 
+# Enables `compile_commands.json` under `SKBUILD_BUILD_DIR` (or the default
+# `scikit-build` build dir) for `clang-tidy -p <build-dir> <file>` used by
+# the `code-lint` skill.
+CMAKE_EXPORT_COMPILE_COMMANDS = "ON"
+
 [tool.pytest.ini_options]
 testpaths = ["tests"]
diff --git a/scripts/generate_wrappers.py b/scripts/generate_wrappers.py
index c050b31c0..effc0787d 100644
--- a/scripts/generate_wrappers.py
+++ b/scripts/generate_wrappers.py
@@ -103,8 +103,18 @@ def _find_optional_tensor_params(op_name):
     return set(re.findall(r"std::optional<Tensor>\s+(\w+)", source))
 
 
+def _find_vector_tensor_params(op_name):
+    """Return a set of parameter names declared as `std::vector<Tensor>` in
+    the base header.
+    """
+    source = (_BASE_DIR / f"{op_name}.h").read_text()
+
+    return set(re.findall(r"std::vector<Tensor>\s+(\w+)", source))
+
+
 def _generate_pybind11(operator):
     optional_tensor_params = _find_optional_tensor_params(operator.name)
+    vector_tensor_params = _find_vector_tensor_params(operator.name)
 
     def _is_optional_tensor(arg):
         if arg.spelling in optional_tensor_params:
@@ -112,6 +122,15 @@ def _is_optional_tensor(arg):
 
         return "std::optional" in arg.type.spelling and "Tensor" in arg.type.spelling
 
+    def _is_optional(arg):
+        return "std::optional" in arg.type.spelling
+
+    def _is_vector_tensor(arg):
+        if arg.spelling in vector_tensor_params:
+            return True
+
+        return "std::vector" in arg.type.spelling and "Tensor" in arg.type.spelling
+
     def _generate_params(node):
         parts = []
 
@@ -121,6 +140,8 @@ def _generate_params(node):
 
             if _is_optional_tensor(arg):
                 parts.append(f"std::optional<py::object> {arg.spelling}")
+            elif _is_vector_tensor(arg):
+                parts.append(f"std::vector<py::object> {arg.spelling}")
             else:
                 param = arg.type.spelling.replace("const Tensor", "py::object").replace(
                     "Tensor", "py::object"
@@ -138,6 +159,8 @@ def _generate_arguments(node):
 
             if _is_optional_tensor(arg):
                 args.append(f"OptionalTensorFromPybind11Handle({arg.spelling})")
+            elif _is_vector_tensor(arg):
+                args.append(f"VectorTensorFromPybind11Handle({arg.spelling})")
             elif "Tensor" in arg.type.spelling:
                 args.append(f"TensorFromPybind11Handle({arg.spelling})")
             else:
@@ -155,11 +178,18 @@ def _generate_init(constructor):
       }}))"""
 
     def _generate_py_args(node):
-        return ", ".join(
-            f'py::arg("{arg.spelling}")'
-            for arg in node.get_arguments()
-            if arg.spelling != "stream"
-        )
+        parts = []
+
+        for arg in node.get_arguments():
+            if arg.spelling == "stream":
+                continue
+
+            if _is_optional(arg):
+                parts.append(f'py::arg("{arg.spelling}") = py::none()')
+            else:
+                parts.append(f'py::arg("{arg.spelling}")')
+
+        return ", ".join(parts)
 
     def _generate_call(op_name, call, method=True):
         call_params = _generate_params(call)
@@ -224,7 +254,8 @@ def _generate_call(op_name, call, method=True):
 {calls}
       .def_static("active_implementation_indices", [](const std::string& device) {{
         return Self::active_implementation_indices(DeviceTypeFromString(device));
-      }});
+      }})
+      .def_static("clear_cache", &Self::clear_cache);
 
 {callers}
 }}
@@ -430,7 +461,7 @@ def _get_all_ops(devices, with_torch=False):
         ops[op_name] = []
 
         for file_path in _SRC_DIR.rglob("*.h"):
-            if file_path.parent.parent.name not in scan_dirs:
+            if file_path.parent.parent.parent.name not in scan_dirs:
                 continue
 
             if f"class Operator<{_snake_to_pascal(op_name)}" in file_path.read_text():
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cbdae6745..ce888b4b5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -7,8 +7,8 @@ set(DEVICE_LIST "")
 
 if(WITH_CPU)
     set(CPU_PATTERNS
-        "cpu/*.cc"
-        "cpu/*.cpp"
+        "native/cpu/*.cc"
+        "native/cpu/*.cpp"
     )
 
     file(GLOB_RECURSE CPU_SOURCES CONFIGURE_DEPENDS ${CPU_PATTERNS})
@@ -24,12 +24,12 @@ endif()
 
 if(WITH_NVIDIA)
     set(NVIDIA_PATTERNS
-        "cuda/*.cc"
-        "cuda/*.cpp"
-        "cuda/*.cu"
-        "cuda/nvidia/*.cc"
-        "cuda/nvidia/*.cpp"
-        "cuda/nvidia/*.cu"
+        "native/cuda/*.cc"
+        "native/cuda/*.cpp"
+        "native/cuda/*.cu"
+        "native/cuda/nvidia/*.cc"
+        "native/cuda/nvidia/*.cpp"
+        "native/cuda/nvidia/*.cu"
     )
 
     file(GLOB_RECURSE NVIDIA_SOURCES CONFIGURE_DEPENDS ${NVIDIA_PATTERNS})
@@ -51,12 +51,12 @@ endif()
 
 if(WITH_ILUVATAR)
     set(ILUVATAR_PATTERNS
-        "cuda/*.cc"
-        "cuda/*.cpp"
-        "cuda/*.cu"
-        "cuda/iluvatar/*.cc"
-        "cuda/iluvatar/*.cpp"
-        "cuda/iluvatar/*.cu"
+        "native/cuda/*.cc"
+        "native/cuda/*.cpp"
+        "native/cuda/*.cu"
+        "native/cuda/iluvatar/*.cc"
+        "native/cuda/iluvatar/*.cpp"
+        "native/cuda/iluvatar/*.cu"
     )
 
     file(GLOB_RECURSE ILUVATAR_SOURCES CONFIGURE_DEPENDS ${ILUVATAR_PATTERNS})
@@ -79,10 +79,10 @@ endif()
 
 if(WITH_METAX)
     set(METAX_PATTERNS
-        "cuda/*.cc"
-        "cuda/*.cpp"
-        "cuda/metax/*.cc"
-        "cuda/metax/*.maca"
+        "native/cuda/*.cc"
+        "native/cuda/*.cpp"
+        "native/cuda/metax/*.cc"
+        "native/cuda/metax/*.maca"
     )
 
     file(GLOB_RECURSE METAX_SOURCES CONFIGURE_DEPENDS ${METAX_PATTERNS})
@@ -105,11 +105,11 @@ endif()
 
 if(WITH_MOORE)
     set(MOORE_PATTERNS
-        "cuda/*.cc"
-        "cuda/*.cpp"
-        "cuda/moore/*.cc"
-        "cuda/moore/*.cpp"
-        "cuda/moore/*.mu"
+        "native/cuda/*.cc"
+        "native/cuda/*.cpp"
+        "native/cuda/moore/*.cc"
+        "native/cuda/moore/*.cpp"
+        "native/cuda/moore/*.mu"
     )
 
     file(GLOB_RECURSE MOORE_SOURCES CONFIGURE_DEPENDS ${MOORE_PATTERNS})
@@ -127,7 +127,7 @@ if(WITH_MOORE)
 endif()
 
 if(WITH_CAMBRICON)
-    file(GLOB_RECURSE CAMBRICON_MLU_SOURCES CONFIGURE_DEPENDS "cambricon/*/*.mlu")
+    file(GLOB_RECURSE CAMBRICON_MLU_SOURCES CONFIGURE_DEPENDS "native/cambricon/ops/*/*.mlu")
     find_program(CNCC_COMPILER cncc HINTS "${NEUWARE_HOME}/bin" "$ENV{NEUWARE_HOME}/bin" /usr/local/neuware/bin)
     if(CNCC_COMPILER)
         message(STATUS "Found cncc: ${CNCC_COMPILER}")
@@ -175,11 +175,13 @@ endif()
 if(WITH_ASCEND)
     # ASCEND_HOME is set by the top-level CMakeLists.txt.
     file(GLOB_RECURSE ASCEND_SOURCES CONFIGURE_DEPENDS
-        "ascend/*.cc"
-        "ascend/*.cpp"
+        "native/ascend/*.cc"
+        "native/ascend/*.cpp"
     )
-    # Exclude `kernel_impl.cpp` — AscendC device code, not compiled by the host C++ compiler.
+    # Exclude `kernel_impl.cpp` — `AscendC` device code, not compiled by the host C++ compiler.
     list(FILTER ASCEND_SOURCES EXCLUDE REGEX ".*kernel_impl\\.cpp$")
+    # Exclude custom/ — standalone PyTorch extension, built separately.
+    list(FILTER ASCEND_SOURCES EXCLUDE REGEX ".*/custom/.*")
 
     target_compile_definitions(infiniops PUBLIC WITH_ASCEND=1)
     target_sources(infiniops PRIVATE ${ASCEND_SOURCES})
@@ -215,7 +217,38 @@ if(WITH_ASCEND)
         "${ASCEND_HOME}/lib64/libopapi.so"
         "${ASCEND_HAL_LIB}")
 
+    # ATB (Ascend Transformer Boost) — provides fused operators like
+    # `PagedAttention` and `ReshapeAndCache` that are graph-capture safe.
+    set(ATB_HOME_DIR "$ENV{ATB_HOME_PATH}")
+    if(NOT ATB_HOME_DIR)
+        # Default search path under CANN nnal directory.
+        file(GLOB ATB_SEARCH_DIRS "/usr/local/Ascend/nnal/atb/*/atb/cxx_abi_1")
+        if(ATB_SEARCH_DIRS)
+            list(SORT ATB_SEARCH_DIRS ORDER DESCENDING)
+            list(GET ATB_SEARCH_DIRS 0 ATB_HOME_DIR)
+        endif()
+    endif()
+
+    if(ATB_HOME_DIR AND EXISTS "${ATB_HOME_DIR}/include/atb/operation.h")
+        message(STATUS "ATB found: ${ATB_HOME_DIR}")
+        target_compile_definitions(infiniops PUBLIC INFINI_HAS_ATB=1)
+        target_include_directories(infiniops PUBLIC "${ATB_HOME_DIR}/include")
+        target_link_libraries(infiniops PUBLIC "${ATB_HOME_DIR}/lib/libatb.so")
+    else()
+        message(STATUS "ATB not found — ATB-based operators disabled")
+    endif()
+
     list(APPEND DEVICE_LIST "ascend")
+
+    # Custom `AscendC` kernels (PyTorch extension, requires `torch_npu`).
+    if(BUILD_CUSTOM_KERNEL)
+        add_subdirectory(native/ascend/custom)
+
+        # Link the compiled `AscendC` kernel objects into `infiniops` so that
+        # custom kernel implementations (e.g. `RmsNorm` index 1) can call
+        # them via the generated launch functions.
+        target_compile_definitions(infiniops PUBLIC INFINI_HAS_CUSTOM_KERNELS=1)
+    endif()
 endif()
 
 if(WITH_TORCH)
@@ -340,6 +373,17 @@ if(GENERATE_PYTHON_BINDINGS)
     target_include_directories(ops PRIVATE ${PROJECT_SOURCE_DIR})
     target_link_libraries(ops PRIVATE infiniops)
 
+    # Custom `AscendC` kernel objects must be linked directly into ops
+    # because the `AscendC` toolchain compiles host stubs with hidden
+    # visibility — `libinfiniops.so` cannot re-export those symbols.
+    # The `Operator<..., 1>` template instantiations that call
+    # `aclrtlaunch_*` live in `ops.cc`, so link here with
+    # `--whole-archive` to ensure all launch functions are available.
+    if(BUILD_CUSTOM_KERNEL)
+        target_link_libraries(ops PRIVATE
+            -Wl,--whole-archive no_workspace_kernel -Wl,--no-whole-archive)
+    endif()
+
     set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN")
     set_target_properties(ops PROPERTIES INSTALL_RPATH "$ORIGIN")
 
diff --git a/src/ascend/common.h b/src/ascend/common.h
deleted file mode 100644
index fba4766b4..000000000
--- a/src/ascend/common.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef INFINI_OPS_ASCEND_COMMON_H_
-#define INFINI_OPS_ASCEND_COMMON_H_
-
-#include <cstdint>
-#include <vector>
-
-#include "acl/acl.h"
-#include "aclnn/acl_meta.h"
-#include "ascend/data_type_.h"
-#include "tensor.h"
-
-namespace infini::ops::ascend {
-
-// Build an `aclTensor` descriptor from an InfiniOps `Tensor`.
-//
-// When `transpose_last2` is true the last two dimensions are swapped in the
-// descriptor (shape and strides) without copying data.  This is used by `Gemm`
-// and `MatMul` to express a transpose via the view.
-inline aclTensor* buildAclTensor(const Tensor& t,
-                                 bool transpose_last2 = false) {
-  std::vector<int64_t> shape(t.shape().begin(), t.shape().end());
-  std::vector<int64_t> strides(t.strides().begin(), t.strides().end());
-
-  if (transpose_last2 && shape.size() >= 2) {
-    auto n = shape.size();
-    std::swap(shape[n - 2], shape[n - 1]);
-    std::swap(strides[n - 2], strides[n - 1]);
-  }
-
-  // Compute the minimum physical storage needed for this strided view.
-  // For contiguous tensors this equals `numel()`; for non-contiguous (gapped)
-  // tensors it may be larger; for broadcast (stride-0) tensors it may be
-  // smaller.  Passing the view shape as the storage shape causes
-  // "ViewShape overlap" errors in ACLNN for non-contiguous inputs.
-  int64_t storage_elems = 1;
-  for (size_t i = 0; i < shape.size(); ++i) {
-    if (shape[i] == 0) {
-      storage_elems = 0;
-      break;
-    }
-    if (strides[i] > 0 && shape[i] > 1) {
-      storage_elems += static_cast<int64_t>(shape[i] - 1) * strides[i];
-    }
-  }
-  std::vector<int64_t> storage_shape = {storage_elems};
-
-  return aclCreateTensor(
-      shape.data(), static_cast<int64_t>(shape.size()), ToAclDtype(t.dtype()),
-      strides.data(),
-      /*storageOffset=*/0, ACL_FORMAT_ND, storage_shape.data(),
-      static_cast<int64_t>(storage_shape.size()), const_cast<void*>(t.data()));
-}
-
-}  // namespace infini::ops::ascend
-
-#endif
diff --git a/src/ascend/gemm/kernel.h b/src/ascend/gemm/kernel.h
deleted file mode 100644
index 3360e7930..000000000
--- a/src/ascend/gemm/kernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-#ifndef INFINI_OPS_ASCEND_GEMM_KERNEL_H_
-#define INFINI_OPS_ASCEND_GEMM_KERNEL_H_
-
-#include "acl/acl.h"
-#include "aclnn/aclnn_base.h"
-#include "aclnnop/aclnn_addmm.h"
-#include "aclnnop/aclnn_baddbmm.h"
-#include "ascend/common.h"
-#include "ascend/workspace_pool_.h"
-#include "base/gemm.h"
-#include "operator.h"
-
-namespace infini::ops {
-
-template <>
-class Operator<Gemm, Device::Type::kAscend> : public Gemm {
- public:
-  Operator(const Tensor a, const Tensor b, std::optional<float> alpha,
-           std::optional<float> beta, std::optional<int> trans_a,
-           std::optional<int> trans_b, Tensor c)
-      : Gemm(a, b, alpha, beta, trans_a, trans_b, c),
-        batched_{batch_count_ > 1},
-        alpha_val_{alpha.value_or(1.0f)},
-        beta_val_{beta.value_or(1.0f)} {
-    alpha_scalar_ = aclCreateScalar(&alpha_val_, ACL_FLOAT);
-    beta_scalar_ = aclCreateScalar(&beta_val_, ACL_FLOAT);
-  }
-
-  ~Operator() {
-    aclDestroyScalar(alpha_scalar_);
-    aclDestroyScalar(beta_scalar_);
-  }
-
-  void operator()(const Tensor a, const Tensor b, std::optional<float> alpha,
-                  std::optional<float> beta, std::optional<int> trans_a,
-                  std::optional<int> trans_b, Tensor c) const override {
-    auto stream = static_cast<aclrtStream>(stream_);
-
-    auto t_self = ascend::buildAclTensor(c);
-    auto t_a = ascend::buildAclTensor(a, trans_a_);
-    auto t_b = ascend::buildAclTensor(b, trans_b_);
-    auto t_out = ascend::buildAclTensor(c);
-
-    uint64_t ws_needed = 0;
-    aclOpExecutor* executor = nullptr;
-
-    if (batched_) {
-      aclnnBaddbmmGetWorkspaceSize(t_self, t_a, t_b, beta_scalar_,
-                                   alpha_scalar_, t_out, 0, &ws_needed,
-                                   &executor);
-    } else {
-      aclnnAddmmGetWorkspaceSize(t_self, t_a, t_b, beta_scalar_, alpha_scalar_,
-                                 t_out, 0, &ws_needed, &executor);
-    }
-
-    auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_needed);
-
-    if (batched_) {
-      aclnnBaddbmm(arena.buf, ws_needed, executor, stream);
-    } else {
-      aclnnAddmm(arena.buf, ws_needed, executor, stream);
-    }
-
-    aclDestroyTensor(t_self);
-    aclDestroyTensor(t_a);
-    aclDestroyTensor(t_b);
-    aclDestroyTensor(t_out);
-  }
-
- private:
-  bool batched_;
-
-  float alpha_val_;
-
-  float beta_val_;
-
-  aclScalar* alpha_scalar_ = nullptr;
-
-  aclScalar* beta_scalar_ = nullptr;
-};
-
-}  // namespace infini::ops
-
-#endif
diff --git a/src/ascend/workspace_pool_.h b/src/ascend/workspace_pool_.h
deleted file mode 100644
index 8c4c91961..000000000
--- a/src/ascend/workspace_pool_.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef INFINI_OPS_ASCEND_WORKSPACE_POOL__H_
-#define INFINI_OPS_ASCEND_WORKSPACE_POOL__H_
-
-#include <cassert>
-#include <cstdint>
-#include <mutex>
-#include <unordered_map>
-
-#include "acl/acl.h"
-
-namespace infini::ops::ascend {
-
-struct WorkspaceArena {
-  void* buf = nullptr;
-
-  uint64_t capacity = 0;
-};
-
-class WorkspacePool {
- public:
-  WorkspaceArena& Ensure(aclrtStream stream, uint64_t needed) {
-    std::lock_guard<std::mutex> lock(mutex_);
-    auto& arena = arenas_[stream];
-    if (needed <= arena.capacity) return arena;
-    if (arena.capacity > 0) {
-      aclrtSynchronizeStream(stream);
-      aclrtFree(arena.buf);
-    }
-    if (needed > 0) {
-      auto ret = aclrtMalloc(&arena.buf, needed, ACL_MEM_MALLOC_NORMAL_ONLY);
-      assert(ret == ACL_SUCCESS && "`WorkspacePool`: `aclrtMalloc` failed");
-    }
-    arena.capacity = needed;
-    return arena;
-  }
-
-  ~WorkspacePool() {
-    for (auto& [stream, arena] : arenas_) {
-      if (arena.capacity > 0) aclrtFree(arena.buf);
-    }
-  }
-
- private:
-  std::unordered_map<aclrtStream, WorkspaceArena> arenas_;
-
-  std::mutex mutex_;
-};
-
-inline WorkspacePool& GetWorkspacePool() {
-  static WorkspacePool pool;
-  return pool;
-}
-
-}  // namespace infini::ops::ascend
-
-#endif
diff --git a/src/base/cast.h b/src/base/cast.h
new file mode 100644
index 000000000..29f1f40cf
--- /dev/null
+++ b/src/base/cast.h
@@ -0,0 +1,52 @@
+#ifndef INFINI_OPS_BASE_CAST_H_
+#define INFINI_OPS_BASE_CAST_H_
+
+#include "operator.h"
+
+namespace infini::ops {
+
+class Cast : public Operator<Cast> {
+ public:
+  Cast(const Tensor input, Tensor out)
+      : ndim_{out.ndim()},
+        output_size_{out.numel()},
+        input_dtype_{input.dtype()},
+        out_dtype_{out.dtype()},
+        input_shape_{input.shape()},
+        out_shape_{out.shape()},
+        input_strides_{input.strides()},
+        out_strides_{out.strides()},
+        is_input_contiguous_{input.IsContiguous()},
+        is_out_contiguous_{out.IsContiguous()} {
+    assert(input.numel() == out.numel() &&
+           "the input and output of `Cast` must have the same number of "
+           "elements");
+  }
+
+  virtual void operator()(const Tensor input, Tensor out) const = 0;
+
+ protected:
+  Tensor::Size ndim_{0};
+
+  Tensor::Size output_size_{0};
+
+  const DataType input_dtype_;
+
+  const DataType out_dtype_;
+
+  Tensor::Shape input_shape_;
+
+  Tensor::Shape out_shape_;
+
+  Tensor::Strides input_strides_;
+
+  Tensor::Strides out_strides_;
+
+  bool is_input_contiguous_{false};
+
+  bool is_out_contiguous_{false};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/base/cat.h b/src/base/cat.h
new file mode 100644
index 000000000..dcb0ba587
--- /dev/null
+++ b/src/base/cat.h
@@ -0,0 +1,35 @@
+#ifndef INFINI_OPS_BASE_CAT_H_
+#define INFINI_OPS_BASE_CAT_H_
+
+#include <vector>
+
+#include "operator.h"
+
+namespace infini::ops {
+
+class Cat : public Operator<Cat> {
+ public:
+  Cat(const Tensor first_input, std::vector<Tensor> rest_inputs, int64_t dim,
+      Tensor out)
+      : input_count_{1 + rest_inputs.size()} {
+    assert(input_count_ >= 2 && "`Cat` requires at least 2 input tensors");
+
+    auto ndim = static_cast<int64_t>(out.ndim());
+    // Normalize negative dim (e.g. -1 means last dimension).
+    dim_ = dim < 0 ? dim + ndim : dim;
+    assert(dim_ >= 0 && dim_ < ndim && "`Cat` dim out of range");
+  }
+
+  virtual void operator()(const Tensor first_input,
+                          std::vector<Tensor> rest_inputs, int64_t dim,
+                          Tensor out) const = 0;
+
+ protected:
+  int64_t dim_;
+
+  size_t input_count_;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/base/linear.h b/src/base/linear.h
new file mode 100644
index 000000000..a5276e612
--- /dev/null
+++ b/src/base/linear.h
@@ -0,0 +1,65 @@
+#ifndef INFINI_OPS_BASE_LINEAR_H_
+#define INFINI_OPS_BASE_LINEAR_H_
+
+#include <optional>
+
+#include "operator.h"
+
+namespace infini::ops {
+
+// Fused linear projection: out = a @ b (+ bias).
+//
+// When bias is present, computes out = a @ b + bias in a single dispatch.
+// When bias is absent, computes out = a @ b (equivalent to Matmul).
+// `trans_a` / `trans_b`: If true, transpose the last two dims before
+// multiplying.
+class Linear : public Operator<Linear> {
+ public:
+  Linear(const Tensor a, const Tensor b, std::optional<Tensor> bias,
+         bool trans_a, bool trans_b, Tensor out)
+      : a_shape_{a.shape()},
+        b_shape_{b.shape()},
+        out_shape_{out.shape()},
+        a_strides_{a.strides()},
+        b_strides_{b.strides()},
+        out_strides_{out.strides()},
+        trans_a_{trans_a},
+        trans_b_{trans_b},
+        has_bias_{bias.has_value()} {
+    assert(a.dtype() == b.dtype() &&
+           "operator `Linear` requires a and b to have the same dtype");
+    assert(a.dtype() == out.dtype() &&
+           "operator `Linear` requires a and out to have the same dtype");
+    if (has_bias_) {
+      assert(bias->dtype() == out.dtype() &&
+             "operator `Linear` requires bias and out to have the same dtype");
+    }
+  }
+
+  virtual void operator()(const Tensor a, const Tensor b,
+                          std::optional<Tensor> bias, bool trans_a,
+                          bool trans_b, Tensor out) const = 0;
+
+ protected:
+  Tensor::Shape a_shape_;
+
+  Tensor::Shape b_shape_;
+
+  Tensor::Shape out_shape_;
+
+  Tensor::Strides a_strides_;
+
+  Tensor::Strides b_strides_;
+
+  Tensor::Strides out_strides_;
+
+  bool trans_a_{false};
+
+  bool trans_b_{false};
+
+  bool has_bias_{false};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/base/mat_mul.h b/src/base/mat_mul.h
deleted file mode 100644
index 6180c8bf2..000000000
--- a/src/base/mat_mul.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef INFINI_OPS_BASE_MAT_MUL_H_
-#define INFINI_OPS_BASE_MAT_MUL_H_
-
-#include "operator.h"
-#include "tensor.h"
-
-namespace infini::ops {
-
-class MatMul : public Operator<MatMul> {
- public:
-  MatMul(const Tensor input, const Tensor other, Tensor out)
-      : input_shape_{input.shape()},
-        other_shape_{other.shape()},
-        out_shape_{out.shape()} {
-    assert(input.dtype() == other.dtype());
-  }
-
-  virtual void operator()(const Tensor input, const Tensor other,
-                          Tensor out) const = 0;
-
- protected:
-  Tensor::Shape input_shape_;
-
-  Tensor::Shape other_shape_;
-
-  Tensor::Shape out_shape_;
-};
-
-}  // namespace infini::ops
-
-#endif
diff --git a/src/base/matmul.h b/src/base/matmul.h
new file mode 100644
index 000000000..071feaeaa
--- /dev/null
+++ b/src/base/matmul.h
@@ -0,0 +1,41 @@
+#ifndef INFINI_OPS_BASE_MATMUL_H_
+#define INFINI_OPS_BASE_MATMUL_H_
+
+#include "operator.h"
+#include "tensor.h"
+
+namespace infini::ops {
+
+class Matmul : public Operator<Matmul> {
+ public:
+  // `trans_a` / `trans_b`: If true, transpose the last two dims of `a` / `b`
+  // before multiplying.  These are constructor parameters so the `CacheKey`
+  // encodes the transposition and distinct descriptors are cached for each
+  // combination.
+  Matmul(const Tensor a, const Tensor b, Tensor c, bool trans_a, bool trans_b)
+      : a_shape_{a.shape()},
+        b_shape_{b.shape()},
+        c_shape_{c.shape()},
+        trans_a_{trans_a},
+        trans_b_{trans_b} {
+    assert(a.dtype() == b.dtype());
+  }
+
+  virtual void operator()(const Tensor a, const Tensor b, Tensor c,
+                          bool trans_a, bool trans_b) const = 0;
+
+ protected:
+  Tensor::Shape a_shape_;
+
+  Tensor::Shape b_shape_;
+
+  Tensor::Shape c_shape_;
+
+  bool trans_a_{false};
+
+  bool trans_b_{false};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/base/mul.h b/src/base/mul.h
new file mode 100644
index 000000000..9e7be2239
--- /dev/null
+++ b/src/base/mul.h
@@ -0,0 +1,67 @@
+#ifndef INFINI_OPS_BASE_MUL_H_
+#define INFINI_OPS_BASE_MUL_H_
+
+#include "operator.h"
+
+namespace infini::ops {
+
+class Mul : public Operator<Mul> {
+ public:
+  Mul(const Tensor input, const Tensor other, Tensor out)
+      : ndim_{out.ndim()},
+        output_size_{out.numel()},
+        input_type_{input.dtype()},
+        other_type_{other.dtype()},
+        out_type_{out.dtype()},
+        input_shape_{input.shape()},
+        other_shape_{other.shape()},
+        out_shape_{out.shape()},
+        input_strides_{input.strides()},
+        other_strides_{other.strides()},
+        out_strides_{out.strides()},
+        is_input_contiguous_{input.IsContiguous()},
+        is_other_contiguous_{other.IsContiguous()},
+        is_out_contiguous_{out.IsContiguous()} {
+    assert(!out.HasBroadcastDim() &&
+           "the output of `Mul` should NOT have broadcasted dim!");
+    assert(input_type_ == other_type_ && other_type_ == out_type_ &&
+           "operator `Mul` requires all input and output tensors to have the "
+           "same dtype");
+  }
+
+  virtual void operator()(const Tensor input, const Tensor other,
+                          Tensor out) const = 0;
+
+ protected:
+  Tensor::Size ndim_{0};
+
+  Tensor::Size output_size_{0};
+
+  const DataType input_type_;
+
+  const DataType other_type_;
+
+  const DataType out_type_;
+
+  Tensor::Shape input_shape_;
+
+  Tensor::Shape other_shape_;
+
+  Tensor::Shape out_shape_;
+
+  Tensor::Strides input_strides_;
+
+  Tensor::Strides other_strides_;
+
+  Tensor::Strides out_strides_;
+
+  bool is_input_contiguous_{false};
+
+  bool is_other_contiguous_{false};
+
+  bool is_out_contiguous_{false};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/device.h b/src/device.h
index 38e4bce11..688cd0dc2 100644
--- a/src/device.h
+++ b/src/device.h
@@ -1,6 +1,9 @@
 #ifndef INFINI_OPS_DEVICE_H_
 #define INFINI_OPS_DEVICE_H_
 
+#include <string>
+#include <string_view>
+
 #include "common/constexpr_map.h"
 #include "common/traits.h"
 #include "hash.h"
diff --git a/src/hash.h b/src/hash.h
index efb34f751..4721f33f3 100644
--- a/src/hash.h
+++ b/src/hash.h
@@ -2,6 +2,7 @@
 #define INFINI_OPS_HASH_H_
 
 #include <functional>
+#include <vector>
 
 template <typename T>
 inline void HashCombine(std::size_t& seed, const T& v) {
@@ -9,4 +10,12 @@ inline void HashCombine(std::size_t& seed, const T& v) {
   seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
 }
 
+template <typename T>
+inline void HashCombine(std::size_t& seed, const std::vector<T>& v) {
+  HashCombine(seed, v.size());
+  for (const auto& elem : v) {
+    HashCombine(seed, elem);
+  }
+}
+
 #endif
diff --git a/src/native/ascend/atb_common_.h b/src/native/ascend/atb_common_.h
new file mode 100644
index 000000000..40b76698e
--- /dev/null
+++ b/src/native/ascend/atb_common_.h
@@ -0,0 +1,95 @@
+#ifndef INFINI_OPS_ASCEND_ATB_COMMON__H_
+#define INFINI_OPS_ASCEND_ATB_COMMON__H_
+
+#ifdef INFINI_HAS_ATB
+
+#include <cassert>
+#include <cstdint>
+#include <mutex>
+#include <vector>
+
+#include "acl/acl.h"
+#include "atb/context.h"
+#include "atb/operation.h"
+#include "atb/types.h"
+#include "native/ascend/data_type_.h"
+#include "tensor.h"
+
+namespace infini::ops::ascend {
+
+// Thread-local ATB context.
+//
+// ATB requires a `Context` for Setup/Execute.  Creating one per call is
+// expensive (internal tiling buffer allocation), so we cache one per thread.
+// `SetExecuteStream` is called before every `Execute` to match the caller's
+// stream.
+inline atb::Context*& ThreadLocalAtbContext() {
+  thread_local atb::Context* ctx = nullptr;
+
+  return ctx;
+}
+
+inline atb::Context* GetAtbContext(aclrtStream stream) {
+  auto*& ctx = ThreadLocalAtbContext();
+
+  if (!ctx) {
+    atb::Status s = atb::CreateContext(&ctx);
+    assert(s == atb::NO_ERROR && "atb::CreateContext failed");
+  }
+
+  atb::Status s = ctx->SetExecuteStream(stream);
+  assert(s == atb::NO_ERROR && "atb::Context::SetExecuteStream failed");
+
+  return ctx;
+}
+
+// Build an `atb::Tensor` from an InfiniOps Tensor.
+//
+// Sets dtype, ND format, shape dimensions, and the device data pointer.
+// The caller must keep the InfiniOps Tensor alive for the duration of the
+// ATB operation.
+inline atb::Tensor ToAtbTensor(const Tensor& t) {
+  atb::Tensor out;
+  out.desc.dtype = ToAclDtype(t.dtype());
+  out.desc.format = ACL_FORMAT_ND;
+  out.desc.shape.dimNum = t.ndim();
+  assert(t.ndim() <= atb::MAX_DIM);
+
+  for (uint64_t i = 0; i < t.ndim(); ++i) {
+    out.desc.shape.dims[i] = static_cast<int64_t>(t.size(i));
+  }
+
+  out.deviceData = const_cast<void*>(t.data());
+  out.dataSize = static_cast<uint64_t>(t.numel()) * t.element_size();
+
+  return out;
+}
+
+// Build an `atb::Tensor` from explicit shape, dtype, and data pointer.
+//
+// Useful for sub-views of a larger buffer (e.g. K-cache and V-cache halves
+// of a fused KV cache tensor).
+inline atb::Tensor ToAtbTensor(const std::vector<int64_t>& shape,
+                               aclDataType dtype, void* data,
+                               uint64_t data_size) {
+  atb::Tensor out;
+  out.desc.dtype = dtype;
+  out.desc.format = ACL_FORMAT_ND;
+  out.desc.shape.dimNum = shape.size();
+  assert(shape.size() <= atb::MAX_DIM);
+
+  for (size_t i = 0; i < shape.size(); ++i) {
+    out.desc.shape.dims[i] = shape[i];
+  }
+
+  out.deviceData = data;
+  out.dataSize = data_size;
+
+  return out;
+}
+
+}  // namespace infini::ops::ascend
+
+#endif  // INFINI_HAS_ATB
+
+#endif  // INFINI_OPS_ASCEND_ATB_COMMON__H_
diff --git a/src/native/ascend/common.h b/src/native/ascend/common.h
new file mode 100644
index 000000000..46a914771
--- /dev/null
+++ b/src/native/ascend/common.h
@@ -0,0 +1,208 @@
+#ifndef INFINI_OPS_ASCEND_COMMON_H_
+#define INFINI_OPS_ASCEND_COMMON_H_
+
+#include <cstdint>
+#include <vector>
+
+#include "acl/acl.h"
+#include "aclnn/acl_meta.h"
+#include "native/ascend/data_type_.h"
+#include "tensor.h"
+
+namespace infini::ops::ascend {
+
+// Check whether the ACL runtime is still usable.
+//
+// During process shutdown the CANN runtime may be torn down before C++
+// static destructors run.  Calling `aclrtGetDevice` is the cheapest
+// probe — it fails once the runtime is gone.  Destructors that call
+// ACL/ATB APIs must guard with this to avoid use-after-finalize crashes.
+inline bool IsAclRuntimeAlive() {
+  int32_t dev_id = -1;
+
+  return aclrtGetDevice(&dev_id) == ACL_SUCCESS;
+}
+
+// Build an `aclTensor` descriptor from an InfiniOps `Tensor`.
+//
+// When `transpose_last2` is true the last two dimensions are swapped in the
+// descriptor (shape and strides) without copying data.  This is used by `Gemm`
+// and `MatMul` to express a transpose via the view.
+inline aclTensor* BuildAclTensor(const Tensor& t,
+                                 bool transpose_last2 = false) {
+  std::vector<int64_t> shape(t.shape().begin(), t.shape().end());
+  std::vector<int64_t> strides(t.strides().begin(), t.strides().end());
+
+  if (transpose_last2 && shape.size() >= 2) {
+    auto n = shape.size();
+    std::swap(shape[n - 2], shape[n - 1]);
+    std::swap(strides[n - 2], strides[n - 1]);
+  }
+
+  // Compute the minimum physical storage needed for this strided view.
+  // For contiguous tensors this equals `numel()`; for non-contiguous (gapped)
+  // tensors it may be larger; for broadcast (stride-0) tensors it may be
+  // smaller.  Passing the view shape as the storage shape causes
+  // "ViewShape overlap" errors in ACLNN for non-contiguous inputs.
+  int64_t storage_elems = 1;
+  for (size_t i = 0; i < shape.size(); ++i) {
+    if (shape[i] == 0) {
+      storage_elems = 0;
+      break;
+    }
+    if (strides[i] > 0 && shape[i] > 1) {
+      storage_elems += static_cast<int64_t>(shape[i] - 1) * strides[i];
+    }
+  }
+  std::vector<int64_t> storage_shape = {storage_elems};
+
+  return aclCreateTensor(
+      shape.data(), static_cast<int64_t>(shape.size()), ToAclDtype(t.dtype()),
+      strides.data(),
+      /*storageOffset=*/0, ACL_FORMAT_ND, storage_shape.data(),
+      static_cast<int64_t>(storage_shape.size()), const_cast<void*>(t.data()));
+}
+
+// Pre-computed tensor metadata for descriptor reuse.
+//
+// Stores `shape`, `strides`, `storage_shape`, and `dtype` once (avoiding
+// per-call heap allocations).  The `aclTensor` descriptor is created on the
+// first `get()` call and its data pointer is updated in-place via
+// `aclSetRawTensorAddr` on subsequent calls.
+class AclTensorCache {
+ public:
+  AclTensorCache() = default;
+
+  // Construct from explicit metadata (for device buffers not wrapped in
+  // Tensor). Computes contiguous strides from shape.
+  AclTensorCache(std::vector<int64_t> shape, aclDataType dtype, void* data)
+      : shape_(std::move(shape)), dtype_(dtype) {
+    strides_.resize(shape_.size());
+    int64_t stride = 1;
+    for (int i = static_cast<int>(shape_.size()) - 1; i >= 0; --i) {
+      strides_[i] = stride;
+      stride *= shape_[i];
+    }
+    storage_shape_ = {stride};
+
+    if (data) {
+      tensor_ = aclCreateTensor(
+          shape_.data(), static_cast<int64_t>(shape_.size()), dtype_,
+          strides_.data(),
+          /*storageOffset=*/0, ACL_FORMAT_ND, storage_shape_.data(),
+          static_cast<int64_t>(storage_shape_.size()), data);
+    }
+  }
+
+  explicit AclTensorCache(const Tensor& t, bool transpose_last2 = false)
+      : dtype_{ToAclDtype(t.dtype())} {
+    shape_.assign(t.shape().begin(), t.shape().end());
+    strides_.assign(t.strides().begin(), t.strides().end());
+
+    if (transpose_last2 && shape_.size() >= 2) {
+      auto n = shape_.size();
+      std::swap(shape_[n - 2], shape_[n - 1]);
+      std::swap(strides_[n - 2], strides_[n - 1]);
+    }
+
+    int64_t storage_elems = 1;
+    for (size_t i = 0; i < shape_.size(); ++i) {
+      if (shape_[i] == 0) {
+        storage_elems = 0;
+        break;
+      }
+      if (strides_[i] > 0 && shape_[i] > 1) {
+        storage_elems += static_cast<int64_t>(shape_[i] - 1) * strides_[i];
+      }
+    }
+    storage_shape_ = {storage_elems};
+  }
+
+  ~AclTensorCache() {
+    if (tensor_ && IsAclRuntimeAlive()) {
+      aclDestroyTensor(tensor_);
+    }
+  }
+
+  AclTensorCache(const AclTensorCache&) = delete;
+
+  AclTensorCache& operator=(const AclTensorCache&) = delete;
+
+  AclTensorCache(AclTensorCache&& o) noexcept
+      : shape_(std::move(o.shape_)),
+        strides_(std::move(o.strides_)),
+        storage_shape_(std::move(o.storage_shape_)),
+        dtype_(o.dtype_),
+        tensor_(o.tensor_) {
+    o.tensor_ = nullptr;
+  }
+
+  AclTensorCache& operator=(AclTensorCache&& o) noexcept {
+    if (this != &o) {
+      if (tensor_) {
+        aclDestroyTensor(tensor_);
+      }
+      shape_ = std::move(o.shape_);
+      strides_ = std::move(o.strides_);
+      storage_shape_ = std::move(o.storage_shape_);
+      dtype_ = o.dtype_;
+      tensor_ = o.tensor_;
+      o.tensor_ = nullptr;
+    }
+
+    return *this;
+  }
+
+  // Null the cached descriptor pointer without calling `aclDestroyTensor`.
+  // Call from the owning operator's destructor: the descriptor is still
+  // referenced by a Repeatable `aclOpExecutor` which would be destroyed
+  // alongside the tensor, and per CANN 8.5 docs that destruction is our
+  // responsibility.  In practice `aclDestroyAclOpExecutor` during process
+  // shutdown crashes even with `IsAclRuntimeAlive()` true — see `64c367c` —
+  // so operators leak the executor at shutdown; skipping `aclDestroyTensor`
+  // here keeps `~AclTensorCache` from double-freeing a descriptor the
+  // executor still holds.
+  void release() { tensor_ = nullptr; }
+
+  // Explicitly destroy the cached tensor and clear the pointer.
+  // Use only when the descriptor is owned outside any executor (e.g. an
+  // intermediate tensor not passed to `aclnn*GetWorkspaceSize`).
+  void destroy() {
+    if (tensor_) {
+      aclDestroyTensor(tensor_);
+      tensor_ = nullptr;
+    }
+  }
+
+  // Update the data pointer and return the cached descriptor.
+  aclTensor* get(void* data) const {
+    if (tensor_) {
+      aclSetRawTensorAddr(tensor_, data);
+
+      return tensor_;
+    }
+
+    tensor_ = aclCreateTensor(
+        shape_.data(), static_cast<int64_t>(shape_.size()), dtype_,
+        strides_.data(),
+        /*storageOffset=*/0, ACL_FORMAT_ND, storage_shape_.data(),
+        static_cast<int64_t>(storage_shape_.size()), data);
+
+    return tensor_;
+  }
+
+ private:
+  std::vector<int64_t> shape_;
+
+  std::vector<int64_t> strides_;
+
+  std::vector<int64_t> storage_shape_;
+
+  aclDataType dtype_{ACL_DT_UNDEFINED};
+
+  mutable aclTensor* tensor_ = nullptr;
+};
+
+}  // namespace infini::ops::ascend
+
+#endif
diff --git a/src/native/ascend/custom/.clang-tidy b/src/native/ascend/custom/.clang-tidy
new file mode 100644
index 000000000..d1f1296a6
--- /dev/null
+++ b/src/native/ascend/custom/.clang-tidy
@@ -0,0 +1,12 @@
+---
+# `CANN`'s `ascendc_add_operator(OP_NAME <lower_snake>)` dictates the
+# symbol name of the `extern "C" __global__ __aicore__ void <op>(…)`
+# kernel entry.  The `aclrtlaunch_<op>` wrapper and `op_host/` call site
+# must match, so these entries cannot follow Google's `PascalCase`
+# convention.  Relax `FunctionCase` to `lower_case` within this subtree;
+# class and method names still inherit `CamelCase` from the root config.
+
+InheritParentConfig: true
+
+CheckOptions:
+  - {key: readability-identifier-naming.FunctionCase, value: lower_case}
diff --git a/src/native/ascend/custom/CMakeLists.txt b/src/native/ascend/custom/CMakeLists.txt
new file mode 100644
index 000000000..ca6e6883f
--- /dev/null
+++ b/src/native/ascend/custom/CMakeLists.txt
@@ -0,0 +1,97 @@
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# Copyright (c) 2025 InfiniTensor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Adapted from https://github.com/vllm-project/vllm-ascend/blob/main/csrc/CMakeLists.txt
+
+cmake_minimum_required(VERSION 3.20 FATAL_ERROR)
+project(ascend-kernel LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE RELEASE)
+endif()
+
+add_compile_options(-Wunused-value -Wcast-align -Wcast-qual -Wwrite-strings
+                    -Wsign-compare -Wextra)
+
+if(${CMAKE_BUILD_TYPE} MATCHES "RELEASE")
+    add_compile_options(-O3 -fvisibility=hidden -fvisibility-inlines-hidden
+                        -fstack-protector-strong -fPIE -fPIC)
+    message(STATUS "Build type set to `RELEASE`.")
+else()
+    add_compile_options(-g -rdynamic)
+endif()
+
+set(PROJECT_OP_SRC_BASE ${PROJECT_SOURCE_DIR})
+set(PROJECT_BUILD_PATH ${PROJECT_SOURCE_DIR}/build)
+set(PROJECT_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/output)
+
+include(cmake/config_envs.cmake)
+include(cmake/config_ascend.cmake)
+
+find_program(CCACHE_PROGRAM ccache)
+if(CCACHE_PROGRAM)
+    message(STATUS "Found `ccache`: ${CCACHE_PROGRAM}.")
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+endif()
+
+# Shared library output location.
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_OUTPUT_PATH})
+
+# Host-side files.
+file(GLOB OP_SRCS
+    ${PROJECT_OP_SRC_BASE}/torch_binding.cpp
+    ${PROJECT_OP_SRC_BASE}/rms_norm/op_host/rms_norm.cpp
+)
+
+# Shared library name — consumed by `kernel_custom.h` variants and by the
+# Python side via `import ascend_kernel`.
+set(OP_PLUGIN_NAME ascend_kernel)
+
+# Kernel-side files (device code compiled by the `AscendC` toolchain).
+ascendc_library(no_workspace_kernel STATIC
+    ${PROJECT_OP_SRC_BASE}/rms_norm/op_kernel/rms_norm.cpp
+)
+
+# Create the shared library `libascend_kernel.so`.
+add_library(${OP_PLUGIN_NAME} SHARED ${OP_SRCS})
+
+target_link_libraries(${OP_PLUGIN_NAME} PRIVATE
+    no_workspace_kernel
+    torch_npu
+    ascendcl
+    tiling_api
+    nnopbase
+    opapi
+    register
+    platform
+    ascendalog
+    dl
+)
+
+target_link_directories(${OP_PLUGIN_NAME} PRIVATE
+    ${TORCH_DIR}/lib
+    ${TORCH_NPU_DIR}/lib
+)
+
+target_include_directories(${OP_PLUGIN_NAME} PRIVATE
+    ${PROJECT_OP_SRC_BASE}/utils
+    ${PROJECT_SOURCE_DIR}/include
+    ${TORCH_DIR}/include
+    ${TORCH_DIR}/include/torch/csrc/api/include
+    ${TORCH_NPU_DIR}/include/third_party/acl/inc
+    ${TORCH_NPU_DIR}/include/third_party/hccl/inc
+    ${TORCH_NPU_DIR}/include
+    ${PYTHON_INCLUDE_DIR}
+    ${ASCEND_INCLUDE_DIR}/external
+    ${ASCEND_INCLUDE_DIR}/experiment/platform
+    ${ASCEND_INCLUDE_DIR}/experiment/runtime
+)
diff --git a/src/native/ascend/custom/add_rms_norm/CMakeLists.txt b/src/native/ascend/custom/add_rms_norm/CMakeLists.txt
new file mode 100644
index 000000000..1748afc06
--- /dev/null
+++ b/src/native/ascend/custom/add_rms_norm/CMakeLists.txt
@@ -0,0 +1 @@
+ascendc_add_operator(OP_NAME add_rms_norm)
diff --git a/src/native/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp b/src/native/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp
new file mode 100644
index 000000000..b8e0d504b
--- /dev/null
+++ b/src/native/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp
@@ -0,0 +1,134 @@
+#include "aclrtlaunch_add_rms_norm.h"
+#include "tiling/platform/platform_ascendc.h"
+#include "torch_kernel_helper.h"
+
+namespace ascend::detail {
+
+std::vector<at::Tensor> AddRmsNorm(const at::Tensor& x1, const at::Tensor& x2,
+                                   const at::Tensor& weight, double eps) {
+  // Input validation.
+  TORCH_CHECK(x1.dim() > 0,
+              "`AddRmsNorm`: `x1` must have at least 1 dimension.");
+  TORCH_CHECK(x1.sizes() == x2.sizes(),
+              "`AddRmsNorm`: `x1` and `x2` must have the same shape.");
+  TORCH_CHECK(x1.scalar_type() == x2.scalar_type(),
+              "`AddRmsNorm`: `x1` and `x2` must have the same dtype.");
+  TORCH_CHECK(x1.scalar_type() == at::kHalf || x1.scalar_type() == at::kFloat,
+              "`AddRmsNorm`: only `float16` and `float32` are supported; got ",
+              x1.scalar_type(), ".");
+  TORCH_CHECK(weight.dim() == 1,
+              "`AddRmsNorm`: `weight` must be 1-dimensional.");
+  TORCH_CHECK(weight.size(0) == x1.size(-1), "`AddRmsNorm`: `weight` size (",
+              weight.size(0), ") must match input last dim (", x1.size(-1),
+              ").");
+
+  int64_t dim_length = x1.size(-1);
+  int64_t total_rows = x1.numel() / dim_length;
+
+  if (total_rows == 0 || dim_length == 0) {
+    return {at::empty_like(x1), at::empty_like(x1)};
+  }
+
+  at::Tensor inp1 = x1.contiguous();
+  at::Tensor inp2 = x2.contiguous();
+  int64_t dtype_size = inp1.element_size();
+
+  // Hardware parameters.
+  auto ascendc_platform =
+      platform_ascendc::PlatformAscendCManager::GetInstance();
+  int64_t core_num = static_cast<int64_t>(ascendc_platform->GetCoreNumAiv());
+  uint64_t ub_size;
+  ascendc_platform->GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub_size);
+  int64_t ub_size_limit = static_cast<int64_t>(ub_size);
+
+  // Alignment (32-byte boundary).
+  int64_t align_elements = 32 / dtype_size;
+  int64_t dim_length_align =
+      ((dim_length + align_elements - 1) / align_elements) * align_elements;
+
+  // UB capacity check.
+  //
+  // - `fp16`: `inQ_x1` (*2*2) + `inQ_x2` (*2*2) + `outQ_y` (*2*2) +
+  //   `outQ_xout` (*2*2) + `fp32Buf1` (*4) + `fp32Buf2` (*4) +
+  //   `weight` (*4) = 16 + 12 = 28
+  // - `fp32`: `inQ_x1` (*2*4) + `inQ_x2` (*2*4) + `outQ_y` (*2*4) +
+  //   `outQ_xout` (*2*4) + `weight` (*4) = 32 + 4 = 36
+  int64_t buffer_coefficient = (dtype_size == 2) ? 28 : 36;
+  int64_t max_dim_length = (ub_size_limit - 1024) / buffer_coefficient;
+  int64_t fp_align_elements = 32 / 4;
+  max_dim_length = (max_dim_length / fp_align_elements) * fp_align_elements;
+  TORCH_CHECK(dim_length_align <= max_dim_length,
+              "`AddRmsNorm`: `dim_length` ", dim_length, " (aligned ",
+              dim_length_align, ") exceeds UB capacity (max ", max_dim_length,
+              ").");
+
+  // Padding.
+  at::Tensor kernel_input1;
+  at::Tensor kernel_input2;
+
+  if (dim_length != dim_length_align) {
+    kernel_input1 = inp1.reshape({total_rows, dim_length});
+    kernel_input1 = at::constant_pad_nd(
+        kernel_input1, {0, dim_length_align - dim_length}, 0.0);
+    kernel_input1 = kernel_input1.contiguous();
+
+    kernel_input2 = inp2.reshape({total_rows, dim_length});
+    kernel_input2 = at::constant_pad_nd(
+        kernel_input2, {0, dim_length_align - dim_length}, 0.0);
+    kernel_input2 = kernel_input2.contiguous();
+  } else {
+    kernel_input1 = inp1.reshape({total_rows, dim_length_align}).contiguous();
+    kernel_input2 = inp2.reshape({total_rows, dim_length_align}).contiguous();
+  }
+
+  at::Tensor kernel_output_y = at::empty_like(kernel_input1);
+  at::Tensor kernel_output_x_out = at::empty_like(kernel_input1);
+
+  // Weight: always pass as fp32, padded to `dim_length_align`.
+  at::Tensor weight_float = weight.contiguous().to(at::kFloat);
+
+  if (dim_length != dim_length_align) {
+    weight_float = at::constant_pad_nd(
+        weight_float, {0, dim_length_align - dim_length}, 0.0);
+  }
+
+  weight_float = weight_float.contiguous();
+
+  // Block-level tiling (distribute rows across cores).
+  int64_t used_core_num = std::min(total_rows, core_num);
+  int64_t former_length = (total_rows + used_core_num - 1) / used_core_num;
+  int64_t tail_length = former_length - 1;
+  int64_t former_num = total_rows - tail_length * used_core_num;
+  uint32_t block_dim = static_cast<uint32_t>(used_core_num);
+
+  // All `EXEC_KERNEL_CMD` args must be lvalues.
+  float eps_float = static_cast<float>(eps);
+  int64_t dtype_size_val = dtype_size;
+
+  // The first arg `add_rms_norm` is the AscendC kernel entry-point name — it
+  // must match `ascendc_add_operator(OP_NAME add_rms_norm)` in `CMakeLists.txt`,
+  // the `__global__ __aicore__ void add_rms_norm(...)` definition in
+  // `op_kernel/`, and the generated `aclrtlaunch_add_rms_norm.h` header.
+  // Google C++ Style's PascalCase rule does NOT apply: this identifier is
+  // dictated by the AscendC toolchain's symbol convention.
+  EXEC_KERNEL_CMD(add_rms_norm, block_dim, kernel_input1, kernel_input2,
+                  weight_float, kernel_output_y, kernel_output_x_out,
+                  total_rows, dim_length, dim_length_align, former_num,
+                  former_length, tail_length, eps_float, dtype_size_val);
+
+  // Remove padding and reshape back to original shape.
+  at::Tensor output_y = kernel_output_y;
+  at::Tensor output_x_out = kernel_output_x_out;
+
+  if (dim_length != dim_length_align) {
+    output_y = output_y.narrow(-1, 0, dim_length).contiguous();
+    output_x_out = output_x_out.narrow(-1, 0, dim_length).contiguous();
+  }
+
+  output_y = output_y.reshape(x1.sizes());
+  output_x_out = output_x_out.reshape(x1.sizes());
+
+  return {output_y, output_x_out};
+}
+
+}  // namespace ascend::detail
diff --git a/src/native/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp b/src/native/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp
new file mode 100644
index 000000000..e2a08e555
--- /dev/null
+++ b/src/native/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp
@@ -0,0 +1,249 @@
+#include "kernel_operator.h"
+
+constexpr int32_t BUFFER_NUM = 2;
+
+template <typename T>
+class KernelAddRmsNorm {
+ public:
+  __aicore__ inline KernelAddRmsNorm() {}
+
+  __aicore__ inline void Init(GM_ADDR x1, GM_ADDR x2, GM_ADDR weight, GM_ADDR y,
+                              GM_ADDR x_out, int64_t totalRows,
+                              int64_t dimLength, int64_t dimLengthAlign,
+                              int64_t formerNum, int64_t formerLength,
+                              int64_t tailLength, float eps) {
+    this->dimLength = dimLength;
+    this->dimLengthAlign = dimLengthAlign;
+    this->eps = eps;
+
+    // Block-level tiling: determine row range for this core.
+    int64_t blockIdx = AscendC::GetBlockIdx();
+    int64_t rowOffset;
+
+    if (blockIdx < formerNum) {
+      this->blockRows = formerLength;
+      rowOffset = formerLength * blockIdx;
+    } else {
+      this->blockRows = tailLength;
+      int64_t tailIdx = blockIdx - formerNum;
+      rowOffset = formerLength * formerNum + tailLength * tailIdx;
+    }
+
+    // Global memory pointers.
+    x1Gm.SetGlobalBuffer((__gm__ T*)x1 + rowOffset * dimLengthAlign,
+                         this->blockRows * dimLengthAlign);
+    x2Gm.SetGlobalBuffer((__gm__ T*)x2 + rowOffset * dimLengthAlign,
+                         this->blockRows * dimLengthAlign);
+    yGm.SetGlobalBuffer((__gm__ T*)y + rowOffset * dimLengthAlign,
+                        this->blockRows * dimLengthAlign);
+    xOutGm.SetGlobalBuffer((__gm__ T*)x_out + rowOffset * dimLengthAlign,
+                           this->blockRows * dimLengthAlign);
+    weightGm.SetGlobalBuffer((__gm__ float*)weight, dimLengthAlign);
+
+    int32_t dimLenAlign = static_cast<int32_t>(this->dimLengthAlign);
+
+    // I/O queues (double-buffered).
+    pipe.InitBuffer(inQueueX1, BUFFER_NUM,
+                    dimLenAlign * static_cast<int32_t>(sizeof(T)));
+    pipe.InitBuffer(inQueueX2, BUFFER_NUM,
+                    dimLenAlign * static_cast<int32_t>(sizeof(T)));
+    pipe.InitBuffer(outQueueY, BUFFER_NUM,
+                    dimLenAlign * static_cast<int32_t>(sizeof(T)));
+    pipe.InitBuffer(outQueueXOut, BUFFER_NUM,
+                    dimLenAlign * static_cast<int32_t>(sizeof(T)));
+
+    // Weight buffer (fp32, loaded once, reused for all rows).
+    pipe.InitBuffer(weightBuf,
+                    dimLenAlign * static_cast<int32_t>(sizeof(float)));
+
+    // FP16 path needs extra fp32 compute buffers.
+    // buf1: holds x_out in fp32 (reused from x1_fp32 after Add).
+    // buf2: holds x2_fp32 initially, then x_out^2, then final result.
+    if constexpr (sizeof(T) == 2) {
+      pipe.InitBuffer(fp32Buf1,
+                      dimLenAlign * static_cast<int32_t>(sizeof(float)));
+      pipe.InitBuffer(fp32Buf2,
+                      dimLenAlign * static_cast<int32_t>(sizeof(float)));
+    }
+
+    // ReduceSum temporary buffer (size per API formula).
+    constexpr int32_t ELEMS_PER_REPEAT = 256 / sizeof(float);
+    constexpr int32_t ELEMS_PER_BLOCK = 32 / sizeof(float);
+    int32_t firstMaxRepeat =
+        (dimLenAlign + ELEMS_PER_REPEAT - 1) / ELEMS_PER_REPEAT;
+    int32_t reduceTmpSize =
+        ((firstMaxRepeat + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK) *
+        ELEMS_PER_BLOCK;
+    pipe.InitBuffer(reduceTmpBuf,
+                    reduceTmpSize * static_cast<int32_t>(sizeof(float)));
+
+    // Scalar buffer for reduction result (8 floats = 32 bytes).
+    pipe.InitBuffer(sumBuf, 32);
+
+    // Load weight (fp32) from GM into `weightBuf`.
+    AscendC::LocalTensor<float> wLocal = weightBuf.Get<float>();
+    AscendC::DataCopyExtParams wParams{
+        1, static_cast<uint32_t>(dimLenAlign * sizeof(float)), 0, 0, 0};
+    AscendC::DataCopyPadExtParams<float> wPad{false, 0, 0, 0.0f};
+    AscendC::DataCopyPad(wLocal, weightGm, wParams, wPad);
+
+    // Ensure weight DMA completes before compute.
+    AscendC::PipeBarrier<PIPE_ALL>();
+  }
+
+  __aicore__ inline void Process() {
+    for (int64_t row = 0; row < this->blockRows; ++row) {
+      CopyIn(row);
+      Compute(row);
+      CopyOut(row);
+    }
+  }
+
+ private:
+  __aicore__ inline void CopyIn(int64_t row) {
+    AscendC::LocalTensor<T> x1Local = inQueueX1.AllocTensor<T>();
+    AscendC::LocalTensor<T> x2Local = inQueueX2.AllocTensor<T>();
+    AscendC::DataCopyExtParams params{
+        1, static_cast<uint32_t>(this->dimLengthAlign * sizeof(T)), 0, 0, 0};
+    AscendC::DataCopyPadExtParams<T> pad{false, 0, 0, static_cast<T>(0)};
+    AscendC::DataCopyPad(x1Local, x1Gm[row * this->dimLengthAlign], params,
+                         pad);
+    AscendC::DataCopyPad(x2Local, x2Gm[row * this->dimLengthAlign], params,
+                         pad);
+    inQueueX1.EnQue(x1Local);
+    inQueueX2.EnQue(x2Local);
+  }
+
+  __aicore__ inline void Compute(int64_t row) {
+    AscendC::LocalTensor<T> x1Local = inQueueX1.DeQue<T>();
+    AscendC::LocalTensor<T> x2Local = inQueueX2.DeQue<T>();
+    AscendC::LocalTensor<T> yLocal = outQueueY.AllocTensor<T>();
+    AscendC::LocalTensor<T> xOutLocal = outQueueXOut.AllocTensor<T>();
+
+    AscendC::LocalTensor<float> wLocal = weightBuf.Get<float>();
+    AscendC::LocalTensor<float> rTmp = reduceTmpBuf.Get<float>();
+    AscendC::LocalTensor<float> sLocal = sumBuf.Get<float>();
+
+    int32_t dimLen = static_cast<int32_t>(this->dimLength);
+    int32_t dimLenAlign = static_cast<int32_t>(this->dimLengthAlign);
+
+    if constexpr (sizeof(T) == 4) {
+      // ---- FP32 path: compute directly. ----
+
+      // Step 1: x_out = x1 + x2.
+      AscendC::Add(xOutLocal, x1Local, x2Local, dimLenAlign);
+
+      // Step 2: x_out^2 into yLocal (reuse output buffer temporarily).
+      AscendC::Mul(yLocal, xOutLocal, xOutLocal, dimLenAlign);
+
+      // Step 3: ReduceSum(x_out^2) -> sLocal[0].
+      // ReduceSum may modify yLocal, but we overwrite it below.
+      AscendC::ReduceSum(sLocal, yLocal, rTmp, dimLenAlign);
+
+      // Step 4-5: scale = 1 / sqrt(mean(x_out^2) + eps).
+      float sumVal = sLocal.GetValue(0);
+      float meanVal = sumVal / static_cast<float>(dimLen) + this->eps;
+      sLocal.SetValue(0, meanVal);
+      AscendC::Sqrt(sLocal, sLocal, 8);
+      float scale = 1.0f / sLocal.GetValue(0);
+
+      // Step 6: y = x_out * scale.
+      AscendC::Muls(yLocal, xOutLocal, scale, dimLenAlign);
+
+      // Step 7: y = y * weight.
+      AscendC::Mul(yLocal, yLocal, wLocal, dimLenAlign);
+
+    } else {
+      // ---- FP16 path: cast → fp32 compute → cast back. ----
+      AscendC::LocalTensor<float> b1 = fp32Buf1.Get<float>();
+      AscendC::LocalTensor<float> b2 = fp32Buf2.Get<float>();
+
+      // Cast inputs fp16 → fp32.
+      AscendC::Cast(b1, x1Local, AscendC::RoundMode::CAST_NONE, dimLenAlign);
+      AscendC::Cast(b2, x2Local, AscendC::RoundMode::CAST_NONE, dimLenAlign);
+
+      // Step 1: x_out = x1 + x2 (fp32), stored in b1.
+      AscendC::Add(b1, b1, b2, dimLenAlign);
+
+      // Cast x_out fp32 → fp16 for the x_out output.
+      AscendC::Cast(xOutLocal, b1, AscendC::RoundMode::CAST_ROUND, dimLenAlign);
+
+      // Step 2: x_out^2 in fp32, stored in b2.
+      AscendC::Mul(b2, b1, b1, dimLenAlign);
+
+      // Step 3: ReduceSum(x_out^2) -> sLocal[0].
+      AscendC::ReduceSum(sLocal, b2, rTmp, dimLenAlign);
+
+      // Step 4-5: scale = 1 / sqrt(mean(x_out^2) + eps).
+      float sumVal = sLocal.GetValue(0);
+      float meanVal = sumVal / static_cast<float>(dimLen) + this->eps;
+      sLocal.SetValue(0, meanVal);
+      AscendC::Sqrt(sLocal, sLocal, 8);
+      float scale = 1.0f / sLocal.GetValue(0);
+
+      // Step 6: y = x_out * scale (fp32), reuse b2.
+      AscendC::Muls(b2, b1, scale, dimLenAlign);
+
+      // Step 7: y = y * weight (fp32).
+      AscendC::Mul(b2, b2, wLocal, dimLenAlign);
+
+      // Cast result fp32 → fp16.
+      AscendC::Cast(yLocal, b2, AscendC::RoundMode::CAST_ROUND, dimLenAlign);
+    }
+
+    inQueueX1.FreeTensor(x1Local);
+    inQueueX2.FreeTensor(x2Local);
+    outQueueY.EnQue(yLocal);
+    outQueueXOut.EnQue(xOutLocal);
+  }
+
+  __aicore__ inline void CopyOut(int64_t row) {
+    AscendC::LocalTensor<T> yLocal = outQueueY.DeQue<T>();
+    AscendC::LocalTensor<T> xOutLocal = outQueueXOut.DeQue<T>();
+    AscendC::DataCopyExtParams params{
+        1, static_cast<uint32_t>(this->dimLengthAlign * sizeof(T)), 0, 0, 0};
+    AscendC::DataCopyPad(yGm[row * this->dimLengthAlign], yLocal, params);
+    AscendC::DataCopyPad(xOutGm[row * this->dimLengthAlign], xOutLocal, params);
+    outQueueY.FreeTensor(yLocal);
+    outQueueXOut.FreeTensor(xOutLocal);
+  }
+
+ private:
+  AscendC::TPipe pipe;
+  AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX1;
+  AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX2;
+  AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueY;
+  AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueXOut;
+
+  AscendC::TBuf<AscendC::TPosition::VECCALC> weightBuf;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> fp32Buf1;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> fp32Buf2;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> reduceTmpBuf;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> sumBuf;
+
+  AscendC::GlobalTensor<T> x1Gm, x2Gm, yGm, xOutGm;
+  AscendC::GlobalTensor<float> weightGm;
+
+  int64_t blockRows;
+  int64_t dimLength;
+  int64_t dimLengthAlign;
+  float eps;
+};
+
+extern "C" __global__ __aicore__ void add_rms_norm(
+    GM_ADDR x1, GM_ADDR x2, GM_ADDR weight, GM_ADDR y, GM_ADDR x_out,
+    int64_t totalRows, int64_t dimLength, int64_t dimLengthAlign,
+    int64_t formerNum, int64_t formerLength, int64_t tailLength, float eps,
+    int64_t dtypeSize) {
+  if (dtypeSize == 2) {
+    KernelAddRmsNorm<half> op;
+    op.Init(x1, x2, weight, y, x_out, totalRows, dimLength, dimLengthAlign,
+            formerNum, formerLength, tailLength, eps);
+    op.Process();
+  } else {
+    KernelAddRmsNorm<float> op;
+    op.Init(x1, x2, weight, y, x_out, totalRows, dimLength, dimLengthAlign,
+            formerNum, formerLength, tailLength, eps);
+    op.Process();
+  }
+}
diff --git a/src/native/ascend/custom/build.sh b/src/native/ascend/custom/build.sh
new file mode 100755
index 000000000..258a88e4b
--- /dev/null
+++ b/src/native/ascend/custom/build.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Build custom `AscendC` kernels into `libascend_kernel.so`.
+set -e
+
+SOC_VERSION="${1:-Ascend910_9382}"
+
+# Detect CANN toolkit path.
+_CANN_TOOLKIT_INSTALL_PATH=$(grep "Toolkit_InstallPath" /etc/Ascend/ascend_cann_install.info | awk -F'=' '{print $2}')
+source "${_CANN_TOOLKIT_INSTALL_PATH}/set_env.sh"
+echo "CANN: ${ASCEND_TOOLKIT_HOME}"
+
+ASCEND_INCLUDE_DIR=${ASCEND_TOOLKIT_HOME}/$(arch)-linux/include
+CURRENT_DIR=$(pwd)
+OUTPUT_DIR=${CURRENT_DIR}/output
+mkdir -p "${OUTPUT_DIR}"
+
+BUILD_DIR=build
+rm -rf "${BUILD_DIR}"
+mkdir -p "${BUILD_DIR}"
+
+cmake \
+    -DASCEND_HOME_PATH="${ASCEND_HOME_PATH}" \
+    -DASCEND_INCLUDE_DIR="${ASCEND_INCLUDE_DIR}" \
+    -DSOC_VERSION="${SOC_VERSION}" \
+    -B "${BUILD_DIR}" \
+    -S .
+
+cmake --build "${BUILD_DIR}" -j 16
+
+echo "Build complete. Output: ${OUTPUT_DIR}"
diff --git a/src/native/ascend/custom/cmake/config_ascend.cmake b/src/native/ascend/custom/cmake/config_ascend.cmake
new file mode 100644
index 000000000..1772e9e70
--- /dev/null
+++ b/src/native/ascend/custom/cmake/config_ascend.cmake
@@ -0,0 +1,40 @@
+
+if(DEFINED ASCEND_HOME_PATH)
+elseif(DEFINED ENV{ASCEND_HOME_PATH})
+    set(ASCEND_HOME_PATH "$ENV{ASCEND_HOME_PATH}" CACHE PATH "ASCEND CANN package installation directory" FORCE)
+endif()
+
+set(ASCEND_CANN_PACKAGE_PATH ${ASCEND_HOME_PATH})
+
+# Auto-detect `SOC_VERSION` from `npu-smi info` if not set externally.
+# Required by `CANN`'s `ascendc.cmake` for `AscendC` kernel compilation.
+if(NOT DEFINED SOC_VERSION OR "${SOC_VERSION}" STREQUAL "")
+    execute_process(
+        COMMAND bash -c "npu-smi info 2>/dev/null | awk '/910B|910A|310/ {for (i=1;i<=NF;i++) if ($i ~ /^(910|310)/) {print \"Ascend\" $i; exit}}'"
+        OUTPUT_VARIABLE _DETECTED_SOC
+        OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+    if(_DETECTED_SOC)
+        set(SOC_VERSION "${_DETECTED_SOC}" CACHE STRING "Ascend SOC version" FORCE)
+    else()
+        set(SOC_VERSION "Ascend910B4" CACHE STRING "Ascend SOC version" FORCE)
+    endif()
+
+    message(STATUS "SOC_VERSION auto-set to ${SOC_VERSION}")
+endif()
+
+if(EXISTS ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+else()
+    message(FATAL_ERROR "`ascendc_kernel_cmake` does not exist; please check whether the `CANN` package is installed.")
+endif()
+
+include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+
+
+message(STATUS "ASCEND_CANN_PACKAGE_PATH = ${ASCEND_CANN_PACKAGE_PATH}")
+message(STATUS "ASCEND_HOME_PATH = ${ASCEND_HOME_PATH}")
diff --git a/src/native/ascend/custom/cmake/config_envs.cmake b/src/native/ascend/custom/cmake/config_envs.cmake
new file mode 100644
index 000000000..e715bfc7e
--- /dev/null
+++ b/src/native/ascend/custom/cmake/config_envs.cmake
@@ -0,0 +1,83 @@
+# Find the Python binary.
+find_program(PYTHON_EXECUTABLE NAMES python3)
+
+if (NOT EXISTS ${PYTHON_EXECUTABLE})
+    message(FATAL_ERROR "`python3` is not found; install Python first.")
+endif ()
+
+# Get `torch`, `torch_npu`, and `pybind11` paths via a Python helper.
+execute_process(
+        COMMAND ${PYTHON_EXECUTABLE} "-c"
+        "import torch; import torch_npu; import os; import pybind11; import sysconfig;
+torch_dir = os.path.realpath(os.path.dirname(torch.__file__));
+torch_npu_dir = os.path.realpath(os.path.dirname(torch_npu.__file__));
+pybind11_dir = os.path.realpath(os.path.dirname(pybind11.__file__));
+abi_enabled=torch.compiled_with_cxx11_abi();
+python_include_dir = sysconfig.get_path('include');
+print(torch_dir, torch_npu_dir, pybind11_dir, abi_enabled, python_include_dir, end='');
+quit(0)
+        "
+        RESULT_VARIABLE EXEC_RESULT
+        OUTPUT_VARIABLE OUTPUT_ENV_DEFINES)
+
+# Abort if the Python helper failed.
+if (NOT ${EXEC_RESULT} EQUAL 0)
+    message(FATAL_ERROR "Failed to run Python script to probe env vars like `TORCH_DIR`.")
+else ()
+    message(STATUS "Python probe succeeded; output string is [${OUTPUT_ENV_DEFINES}].")
+endif ()
+
+# Extract `TORCH_DIR`.
+execute_process(
+        COMMAND sh -c "echo \"${OUTPUT_ENV_DEFINES}\" | awk '{print $1}'"
+        OUTPUT_VARIABLE TORCH_DIR
+        RESULT_VARIABLE EXEC_RESULT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+# Extract `TORCH_NPU_DIR`.
+execute_process(
+        COMMAND sh -c "echo \"${OUTPUT_ENV_DEFINES}\" | awk '{print $2}'"
+        OUTPUT_VARIABLE TORCH_NPU_DIR
+        RESULT_VARIABLE EXEC_RESULT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+# Extract `PYBIND11_DIR`.
+execute_process(
+        COMMAND sh -c "echo \"${OUTPUT_ENV_DEFINES}\" | awk '{print $3}'"
+        OUTPUT_VARIABLE PYBIND11_DIR
+        RESULT_VARIABLE EXEC_RESULT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+# Extract the PyTorch C++11 ABI flag.
+execute_process(
+        COMMAND sh -c "echo \"${OUTPUT_ENV_DEFINES}\" | awk '{print $4}'"
+        OUTPUT_VARIABLE TORCH_API_ENABLED
+        RESULT_VARIABLE EXEC_RESULT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+# Extract `PYTHON_INCLUDE_DIR`.
+execute_process(
+        COMMAND sh -c "echo \"${OUTPUT_ENV_DEFINES}\" | awk '{print $5}'"
+        OUTPUT_VARIABLE PYTHON_INCLUDE_DIR
+        RESULT_VARIABLE EXEC_RESULT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+message(STATUS "SOC_VERSION=${SOC_VERSION}")
+message(STATUS "TORCH_DIR=${TORCH_DIR}")
+message(STATUS "TORCH_NPU_DIR=${TORCH_NPU_DIR}")
+message(STATUS "PYBIND11_DIR=${PYBIND11_DIR}")
+message(STATUS "PYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIR}")
+
+# Set `_GLIBCXX_USE_CXX11_ABI` to match the PyTorch build.
+if (${TORCH_API_ENABLED} STREQUAL "True")
+    add_compile_options(-D_GLIBCXX_USE_CXX11_ABI=1)
+    message(STATUS "_GLIBCXX_USE_CXX11_ABI=1")
+else ()
+    add_compile_options(-D_GLIBCXX_USE_CXX11_ABI=0)
+    message(STATUS "_GLIBCXX_USE_CXX11_ABI=0")
+endif ()
diff --git a/src/native/ascend/custom/ops.h b/src/native/ascend/custom/ops.h
new file mode 100644
index 000000000..e9e6ad9d4
--- /dev/null
+++ b/src/native/ascend/custom/ops.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * Copyright (c) 2025 InfiniTensor.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *
+ * Adapted from https://github.com/vllm-project/vllm-ascend/blob/main/csrc/ops.h
+ */
+
+#ifndef OPS_H
+#define OPS_H
+
+namespace ascend::detail {
+
+at::Tensor RmsNorm(const at::Tensor& input, const at::Tensor& weight,
+                   double eps);
+
+}  // namespace ascend::detail
+
+#endif  // OPS_H
diff --git a/src/native/ascend/custom/rms_norm/CMakeLists.txt b/src/native/ascend/custom/rms_norm/CMakeLists.txt
new file mode 100644
index 000000000..94ceabaa6
--- /dev/null
+++ b/src/native/ascend/custom/rms_norm/CMakeLists.txt
@@ -0,0 +1 @@
+ascendc_add_operator(OP_NAME rms_norm)
diff --git a/src/native/ascend/custom/rms_norm/op_host/rms_norm.cpp b/src/native/ascend/custom/rms_norm/op_host/rms_norm.cpp
new file mode 100644
index 000000000..eb521c7b5
--- /dev/null
+++ b/src/native/ascend/custom/rms_norm/op_host/rms_norm.cpp
@@ -0,0 +1,120 @@
+#include "aclrtlaunch_rms_norm.h"
+#include "tiling/platform/platform_ascendc.h"
+#include "torch_kernel_helper.h"
+
+namespace ascend::detail {
+
+at::Tensor RmsNorm(const at::Tensor& input, const at::Tensor& weight,
+                   double eps) {
+  // Input validation.
+  TORCH_CHECK(input.dim() > 0,
+              "`RmsNorm`: `input` must have at least 1 dimension.");
+  TORCH_CHECK(
+      input.scalar_type() == at::kHalf || input.scalar_type() == at::kFloat,
+      "`RmsNorm`: only `float16` and `float32` are supported; got ",
+      input.scalar_type(), ".");
+  TORCH_CHECK(weight.dim() == 1, "`RmsNorm`: `weight` must be 1-dimensional.");
+  TORCH_CHECK(weight.size(0) == input.size(-1), "`RmsNorm`: `weight` size (",
+              weight.size(0), ") must match input last dim (", input.size(-1),
+              ").");
+
+  int64_t dim_length = input.size(-1);
+  int64_t total_rows = input.numel() / dim_length;
+
+  if (total_rows == 0 || dim_length == 0) {
+    return at::empty_like(input);
+  }
+
+  at::Tensor x = input.contiguous();
+  int64_t dtype_size = x.element_size();
+
+  // Hardware parameters.
+  auto ascendc_platform =
+      platform_ascendc::PlatformAscendCManager::GetInstance();
+  int64_t core_num = static_cast<int64_t>(ascendc_platform->GetCoreNumAiv());
+  uint64_t ub_size;
+  ascendc_platform->GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub_size);
+  int64_t ub_size_limit = static_cast<int64_t>(ub_size);
+
+  // Alignment (32-byte boundary).
+  int64_t align_elements = 32 / dtype_size;
+  int64_t dim_length_align =
+      ((dim_length + align_elements - 1) / align_elements) * align_elements;
+
+  // UB capacity check.
+  //
+  // - `fp32`: `inQ` (*2) + `outQ` (*2) + `weight` = 5 * `dim_length_align`
+  //   * 4 = coeff 20.
+  // - `fp16`: `inQ` (*2) + `outQ` (*2) + `x_fp32` + `tmp_fp32` + `weight`
+  //   = 2 * `dim_length_align` * 2 * 2 + 3 * `dim_length_align` * 4 =
+  //   8 + 12 = coeff 20.
+  int64_t buffer_coefficient = 20;
+  // Reserve 1024 bytes for reduce buffers.
+  int64_t max_dim_length = (ub_size_limit - 1024) / buffer_coefficient;
+  // `fp32` alignment.
+  int64_t fp_align_elements = 32 / 4;
+  max_dim_length = (max_dim_length / fp_align_elements) * fp_align_elements;
+  TORCH_CHECK(dim_length_align <= max_dim_length,
+              "`RmsNorm`: `dim_length` ", dim_length, " (aligned ",
+              dim_length_align, ") exceeds UB capacity (max ", max_dim_length,
+              ").");
+
+  // Padding.
+  at::Tensor kernel_input;
+
+  if (dim_length != dim_length_align) {
+    kernel_input = x.reshape({total_rows, dim_length});
+    kernel_input = at::constant_pad_nd(
+        kernel_input, {0, dim_length_align - dim_length}, 0.0);
+    kernel_input = kernel_input.contiguous();
+  } else {
+    kernel_input = x.reshape({total_rows, dim_length_align}).contiguous();
+  }
+
+  at::Tensor kernel_output = at::empty_like(kernel_input);
+
+  // Weight: always pass as fp32, padded to `dim_length_align`.
+  at::Tensor weight_float = weight.contiguous().to(at::kFloat);
+
+  if (dim_length != dim_length_align) {
+    weight_float = at::constant_pad_nd(
+        weight_float, {0, dim_length_align - dim_length}, 0.0);
+  }
+
+  weight_float = weight_float.contiguous();
+
+  // Block-level tiling (distribute rows across cores).
+  int64_t used_core_num = std::min(total_rows, core_num);
+  int64_t former_length = (total_rows + used_core_num - 1) / used_core_num;
+  int64_t tail_length = former_length - 1;
+  int64_t former_num = total_rows - tail_length * used_core_num;
+  uint32_t block_dim = static_cast<uint32_t>(used_core_num);
+
+  // All `EXEC_KERNEL_CMD` args must be lvalues.
+  float eps_float = static_cast<float>(eps);
+  int64_t dtype_size_val = dtype_size;
+
+  // The first arg `rms_norm` is the AscendC kernel entry-point name — it
+  // must match `ascendc_add_operator(OP_NAME rms_norm)` in `CMakeLists.txt`,
+  // the `__global__ __aicore__ void rms_norm(...)` definition in `op_kernel/`,
+  // and the generated `aclrtlaunch_rms_norm.h` header.  Google C++ Style's
+  // PascalCase rule does NOT apply: this identifier is dictated by the
+  // AscendC toolchain's symbol convention.
+  EXEC_KERNEL_CMD(rms_norm, block_dim, kernel_input, weight_float,
+                  kernel_output, total_rows, dim_length, dim_length_align,
+                  former_num, former_length, tail_length, eps_float,
+                  dtype_size_val);
+
+  // Remove padding and reshape back to original shape.
+  at::Tensor output = kernel_output;
+
+  if (dim_length != dim_length_align) {
+    output = output.narrow(-1, 0, dim_length).contiguous();
+  }
+
+  output = output.reshape(input.sizes());
+
+  return output;
+}
+
+}  // namespace ascend::detail
diff --git a/src/native/ascend/custom/rms_norm/op_kernel/.clang-tidy b/src/native/ascend/custom/rms_norm/op_kernel/.clang-tidy
new file mode 100644
index 000000000..ccf13972c
--- /dev/null
+++ b/src/native/ascend/custom/rms_norm/op_kernel/.clang-tidy
@@ -0,0 +1,9 @@
+---
+# `op_kernel/*.cpp` is `AscendC` device code compiled by `ccec`, not by
+# the host toolchain, so it has no entry in `compile_commands.json` and
+# `clang-tidy` cannot parse it correctly (the `__aicore__` macro expands
+# unexpectedly when `kernel_operator.h` is absent).  Disable all checks
+# here — the `op_host/` side and the `kernel_custom.h` launcher still
+# enforce the full ruleset.
+
+Checks: '-*'
diff --git a/src/native/ascend/custom/rms_norm/op_kernel/rms_norm.cpp b/src/native/ascend/custom/rms_norm/op_kernel/rms_norm.cpp
new file mode 100644
index 000000000..5c8f4fc67
--- /dev/null
+++ b/src/native/ascend/custom/rms_norm/op_kernel/rms_norm.cpp
@@ -0,0 +1,215 @@
+#include "kernel_operator.h"
+
+constexpr int32_t BUFFER_NUM = 2;
+
+template <typename T>
+class KernelRmsNorm {
+ public:
+  __aicore__ inline KernelRmsNorm() {}
+
+  __aicore__ inline void Init(GM_ADDR x, GM_ADDR weight, GM_ADDR y,
+                              int64_t totalRows, int64_t dimLength,
+                              int64_t dimLengthAlign, int64_t formerNum,
+                              int64_t formerLength, int64_t tailLength,
+                              float eps) {
+    this->dimLength = dimLength;
+    this->dimLengthAlign = dimLengthAlign;
+    this->eps = eps;
+
+    // Block-level tiling: determine row range for this core.
+    int64_t blockIdx = AscendC::GetBlockIdx();
+    int64_t rowOffset;
+
+    if (blockIdx < formerNum) {
+      this->blockRows = formerLength;
+      rowOffset = formerLength * blockIdx;
+    } else {
+      this->blockRows = tailLength;
+      int64_t tailIdx = blockIdx - formerNum;
+      rowOffset = formerLength * formerNum + tailLength * tailIdx;
+    }
+
+    // Global memory pointers.
+    xGm.SetGlobalBuffer((__gm__ T*)x + rowOffset * dimLengthAlign,
+                        this->blockRows * dimLengthAlign);
+    yGm.SetGlobalBuffer((__gm__ T*)y + rowOffset * dimLengthAlign,
+                        this->blockRows * dimLengthAlign);
+    weightGm.SetGlobalBuffer((__gm__ float*)weight, dimLengthAlign);
+
+    int32_t dimLenAlign = static_cast<int32_t>(this->dimLengthAlign);
+
+    // I/O queues (double-buffered).
+    pipe.InitBuffer(inQueueX, BUFFER_NUM,
+                    dimLenAlign * static_cast<int32_t>(sizeof(T)));
+    pipe.InitBuffer(outQueueY, BUFFER_NUM,
+                    dimLenAlign * static_cast<int32_t>(sizeof(T)));
+
+    // Weight buffer (fp32, loaded once, reused for all rows).
+    pipe.InitBuffer(weightBuf,
+                    dimLenAlign * static_cast<int32_t>(sizeof(float)));
+
+    // FP16 path needs extra fp32 compute buffers.
+    if constexpr (sizeof(T) == 2) {
+      pipe.InitBuffer(xFp32Buf,
+                      dimLenAlign * static_cast<int32_t>(sizeof(float)));
+      pipe.InitBuffer(tmpFp32Buf,
+                      dimLenAlign * static_cast<int32_t>(sizeof(float)));
+    }
+
+    // ReduceSum temporary buffer (size per API formula).
+    constexpr int32_t ELEMS_PER_REPEAT = 256 / sizeof(float);
+    constexpr int32_t ELEMS_PER_BLOCK = 32 / sizeof(float);
+    int32_t firstMaxRepeat =
+        (dimLenAlign + ELEMS_PER_REPEAT - 1) / ELEMS_PER_REPEAT;
+    int32_t reduceTmpSize =
+        ((firstMaxRepeat + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK) *
+        ELEMS_PER_BLOCK;
+    pipe.InitBuffer(reduceTmpBuf,
+                    reduceTmpSize * static_cast<int32_t>(sizeof(float)));
+
+    // Scalar buffer for reduction result (8 floats = 32 bytes).
+    pipe.InitBuffer(sumBuf, 32);
+
+    // Load weight (fp32) from GM into `weightBuf`.
+    AscendC::LocalTensor<float> wLocal = weightBuf.Get<float>();
+    AscendC::DataCopyExtParams wParams{
+        1, static_cast<uint32_t>(dimLenAlign * sizeof(float)), 0, 0, 0};
+    AscendC::DataCopyPadExtParams<float> wPad{false, 0, 0, 0.0f};
+    AscendC::DataCopyPad(wLocal, weightGm, wParams, wPad);
+
+    // Ensure weight DMA completes before compute.
+    AscendC::PipeBarrier<PIPE_ALL>();
+  }
+
+  __aicore__ inline void Process() {
+    for (int64_t row = 0; row < this->blockRows; ++row) {
+      CopyIn(row);
+      Compute(row);
+      CopyOut(row);
+    }
+  }
+
+ private:
+  __aicore__ inline void CopyIn(int64_t row) {
+    AscendC::LocalTensor<T> xLocal = inQueueX.AllocTensor<T>();
+    AscendC::DataCopyExtParams params{
+        1, static_cast<uint32_t>(this->dimLengthAlign * sizeof(T)), 0, 0, 0};
+    AscendC::DataCopyPadExtParams<T> pad{false, 0, 0, static_cast<T>(0)};
+    AscendC::DataCopyPad(xLocal, xGm[row * this->dimLengthAlign], params, pad);
+    inQueueX.EnQue(xLocal);
+  }
+
+  __aicore__ inline void Compute(int64_t row) {
+    AscendC::LocalTensor<T> xLocal = inQueueX.DeQue<T>();
+    AscendC::LocalTensor<T> yLocal = outQueueY.AllocTensor<T>();
+
+    AscendC::LocalTensor<float> wLocal = weightBuf.Get<float>();
+    AscendC::LocalTensor<float> rTmp = reduceTmpBuf.Get<float>();
+    AscendC::LocalTensor<float> sLocal = sumBuf.Get<float>();
+
+    int32_t dimLen = static_cast<int32_t>(this->dimLength);
+    int32_t dimLenAlign = static_cast<int32_t>(this->dimLengthAlign);
+
+    if constexpr (sizeof(T) == 4) {
+      // ---- FP32 path: compute directly. ----
+
+      // Step 1: x^2 into yLocal (reuse output buffer temporarily).
+      AscendC::Mul(yLocal, xLocal, xLocal, dimLenAlign);
+
+      // Step 2: ReduceSum(x^2) -> sLocal[0].
+      // ReduceSum may modify src (yLocal), but we overwrite it later.
+      AscendC::ReduceSum(sLocal, yLocal, rTmp, dimLenAlign);
+
+      // Step 3-5: scale = 1 / sqrt(mean(x^2) + eps).
+      float sumVal = sLocal.GetValue(0);
+      float meanVal = sumVal / static_cast<float>(dimLen) + this->eps;
+      sLocal.SetValue(0, meanVal);
+      AscendC::Sqrt(sLocal, sLocal, 8);
+      float scale = 1.0f / sLocal.GetValue(0);
+
+      // Step 6: y = x * scale.
+      AscendC::Muls(yLocal, xLocal, scale, dimLenAlign);
+
+      // Step 7: y = y * weight.
+      AscendC::Mul(yLocal, yLocal, wLocal, dimLenAlign);
+
+    } else {
+      // ---- FP16 path: cast → fp32 compute → cast back. ----
+      AscendC::LocalTensor<float> xF32 = xFp32Buf.Get<float>();
+      AscendC::LocalTensor<float> tmpF32 = tmpFp32Buf.Get<float>();
+
+      // Cast input fp16 → fp32.
+      AscendC::Cast(xF32, xLocal, AscendC::RoundMode::CAST_NONE, dimLenAlign);
+
+      // Step 1: x^2 in fp32.
+      AscendC::Mul(tmpF32, xF32, xF32, dimLenAlign);
+
+      // Step 2: ReduceSum(x^2) -> sLocal[0].
+      AscendC::ReduceSum(sLocal, tmpF32, rTmp, dimLenAlign);
+
+      // Step 3-5: scale = 1 / sqrt(mean(x^2) + eps).
+      float sumVal = sLocal.GetValue(0);
+      float meanVal = sumVal / static_cast<float>(dimLen) + this->eps;
+      sLocal.SetValue(0, meanVal);
+      AscendC::Sqrt(sLocal, sLocal, 8);
+      float scale = 1.0f / sLocal.GetValue(0);
+
+      // Step 6: y = x * scale (fp32).
+      AscendC::Muls(tmpF32, xF32, scale, dimLenAlign);
+
+      // Step 7: y = y * weight (fp32).
+      AscendC::Mul(tmpF32, tmpF32, wLocal, dimLenAlign);
+
+      // Cast result fp32 → fp16.
+      AscendC::Cast(yLocal, tmpF32, AscendC::RoundMode::CAST_ROUND,
+                    dimLenAlign);
+    }
+
+    inQueueX.FreeTensor(xLocal);
+    outQueueY.EnQue(yLocal);
+  }
+
+  __aicore__ inline void CopyOut(int64_t row) {
+    AscendC::LocalTensor<T> yLocal = outQueueY.DeQue<T>();
+    AscendC::DataCopyExtParams params{
+        1, static_cast<uint32_t>(this->dimLengthAlign * sizeof(T)), 0, 0, 0};
+    AscendC::DataCopyPad(yGm[row * this->dimLengthAlign], yLocal, params);
+    outQueueY.FreeTensor(yLocal);
+  }
+
+ private:
+  AscendC::TPipe pipe;
+  AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX;
+  AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueY;
+
+  AscendC::TBuf<AscendC::TPosition::VECCALC> weightBuf;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> xFp32Buf;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> tmpFp32Buf;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> reduceTmpBuf;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> sumBuf;
+
+  AscendC::GlobalTensor<T> xGm, yGm;
+  AscendC::GlobalTensor<float> weightGm;
+
+  int64_t blockRows;
+  int64_t dimLength;
+  int64_t dimLengthAlign;
+  float eps;
+};
+
+extern "C" __global__ __aicore__ void rms_norm(
+    GM_ADDR x, GM_ADDR weight, GM_ADDR y, int64_t totalRows, int64_t dimLength,
+    int64_t dimLengthAlign, int64_t formerNum, int64_t formerLength,
+    int64_t tailLength, float eps, int64_t dtypeSize) {
+  if (dtypeSize == 2) {
+    KernelRmsNorm<half> op;
+    op.Init(x, weight, y, totalRows, dimLength, dimLengthAlign, formerNum,
+            formerLength, tailLength, eps);
+    op.Process();
+  } else {
+    KernelRmsNorm<float> op;
+    op.Init(x, weight, y, totalRows, dimLength, dimLengthAlign, formerNum,
+            formerLength, tailLength, eps);
+    op.Process();
+  }
+}
diff --git a/src/native/ascend/custom/torch_binding.cpp b/src/native/ascend/custom/torch_binding.cpp
new file mode 100644
index 000000000..1d343c064
--- /dev/null
+++ b/src/native/ascend/custom/torch_binding.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * Copyright (c) 2025 InfiniTensor.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *
+ * Adapted from https://github.com/vllm-project/vllm-ascend/blob/main/csrc/torch_binding.cpp
+ */
+
+#include <torch/extension.h>
+#include <torch/library.h>
+
+#include "ops.h"
+
+namespace {
+TORCH_LIBRARY_FRAGMENT(npu, m) {
+  m.def("rms_norm(Tensor input, Tensor weight, float eps=1e-6) -> Tensor");
+}
+
+TORCH_LIBRARY_IMPL(npu, PrivateUse1, m) {
+  m.impl("rms_norm", TORCH_FN(ascend::detail::RmsNorm));
+}
+}  // namespace
diff --git a/src/native/ascend/custom/utils/torch_kernel_helper.h b/src/native/ascend/custom/utils/torch_kernel_helper.h
new file mode 100644
index 000000000..c4679d1dc
--- /dev/null
+++ b/src/native/ascend/custom/utils/torch_kernel_helper.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * Copyright (c) 2025 InfiniTensor.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *
+ * Adapted from https://github.com/vllm-project/vllm-ascend/tree/main/csrc
+ */
+
+#ifndef TORCH_KERNEL_HELPER_H
+#define TORCH_KERNEL_HELPER_H
+
+#include <ATen/ATen.h>
+#include <torch/library.h>
+
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "torch_npu/csrc/framework/OpCommand.h"
+
+namespace ascend::detail {
+
+#define DEVICE_TYPE c10::DeviceType::PrivateUse1
+
+class TorchNpuHelper {
+ public:
+  inline static at::Tensor CopyTensorHostToDevice(
+      const at::Tensor& cpu_tensor) {
+    at::Tensor cpuPinMemTensor = cpu_tensor.pin_memory();
+    int deviceIndex = 0;
+    c10_npu::GetDevice(&deviceIndex);
+    return cpuPinMemTensor.to(c10::Device(DEVICE_TYPE, deviceIndex),
+                              cpuPinMemTensor.scalar_type(), true, true);
+  }
+
+  inline static at::Tensor CopyScalarToDevice(const c10::Scalar& cpu_scalar,
+                                              at::ScalarType scalar_data_type) {
+    return CopyTensorHostToDevice(
+        scalar_to_tensor(cpu_scalar).to(scalar_data_type));
+  }
+
+  inline static void* ConvertType(const at::Tensor& at_tensor) {
+    return const_cast<void*>(at_tensor.data_ptr());
+  }
+
+  template <typename T>
+  inline static T ConvertType(T value) {
+    return value;
+  }
+
+  template <typename... Ts>
+  inline static constexpr auto ConvertTypes(Ts&... args) {
+    return std::make_tuple(ConvertType(args)...);
+  }
+};
+}  // namespace ascend::detail
+
+/**
+ * @brief Launch real kernel function on NPU
+ *
+ * @param kernel_name      [in] name of kernel
+ * @param blockdim         [in] dim size of block
+ */
+#define EXEC_KERNEL_CMD(kernel_name, blockdim, ...)                     \
+  do {                                                                  \
+    auto acl_stream = c10_npu::getCurrentNPUStream().stream(false);     \
+    auto converted_params =                                             \
+        ascend::detail::TorchNpuHelper::ConvertTypes(__VA_ARGS__);      \
+    auto acl_call = [acl_stream, blockdim, converted_params]() -> int { \
+      std::apply(                                                       \
+          [&](auto&&... params) {                                       \
+            ACLRT_LAUNCH_KERNEL(kernel_name)                            \
+            (blockdim, acl_stream, params...);                          \
+          },                                                            \
+          converted_params);                                            \
+      return 0;                                                         \
+    };                                                                  \
+    at_npu::native::OpCommand::RunOpApi(#kernel_name, acl_call);        \
+  } while (false)
+
+#endif  // TORCH_KERNEL_HELPER_H
diff --git a/src/ascend/data_type_.h b/src/native/ascend/data_type_.h
similarity index 88%
rename from src/ascend/data_type_.h
rename to src/native/ascend/data_type_.h
index 9026f515f..81253a9b3 100644
--- a/src/ascend/data_type_.h
+++ b/src/native/ascend/data_type_.h
@@ -4,13 +4,19 @@
 #include <cassert>
 
 #include "acl/acl.h"
-#include "ascend/device_.h"
 #include "data_type.h"
+#include "native/ascend/device_.h"
 
 namespace infini::ops::ascend {
 
 inline aclDataType ToAclDtype(DataType dt) {
   switch (dt) {
+    case DataType::kFloat16:
+      return ACL_FLOAT16;
+    case DataType::kBFloat16:
+      return ACL_BF16;
+    case DataType::kFloat32:
+      return ACL_FLOAT;
     case DataType::kInt8:
       return ACL_INT8;
     case DataType::kInt16:
@@ -27,19 +33,13 @@ inline aclDataType ToAclDtype(DataType dt) {
       return ACL_UINT32;
     case DataType::kUInt64:
       return ACL_UINT64;
-    case DataType::kFloat16:
-      return ACL_FLOAT16;
-    case DataType::kBFloat16:
-      return ACL_BF16;
-    case DataType::kFloat32:
-      return ACL_FLOAT;
     default:
-      assert(false && "Unsupported dtype for Ascend backend.");
+      assert(false && "unsupported dtype for Ascend backend");
       return ACL_DT_UNDEFINED;
   }
 }
 
-// Returns true for integer (signed or unsigned) `DataType` values.
+// Returns true for integer (signed or unsigned) DataType values.
 inline bool IsIntegerDtype(DataType dt) {
   switch (dt) {
     case DataType::kInt8:
diff --git a/src/ascend/device_.h b/src/native/ascend/device_.h
similarity index 100%
rename from src/ascend/device_.h
rename to src/native/ascend/device_.h
diff --git a/src/native/ascend/ops/add/kernel.h b/src/native/ascend/ops/add/kernel.h
new file mode 100644
index 000000000..8bf4a2564
--- /dev/null
+++ b/src/native/ascend/ops/add/kernel.h
@@ -0,0 +1,93 @@
+#ifndef INFINI_OPS_ASCEND_ADD_KERNEL_H_
+#define INFINI_OPS_ASCEND_ADD_KERNEL_H_
+
+#include "acl/acl.h"
+#include "aclnn/aclnn_base.h"
+#include "aclnn_add.h"
+#include "base/add.h"
+#include "data_type.h"
+#include "native/ascend/common.h"
+#include "native/ascend/workspace_pool_.h"
+#include "operator.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Add, Device::Type::kAscend> : public Add {
+ public:
+  Operator(const Tensor input, const Tensor other, Tensor out)
+      : Add(input, other, out),
+        in_cache_(input),
+        oth_cache_(other),
+        out_cache_(out) {
+    // `aclCreateScalar` stores the pointer rather than copying the value, so
+    // `alpha_storage_*` must remain alive for the lifetime of `alpha_`.
+    // The alpha scalar type must match the tensor dtype: use int64 for integer
+    // dtypes and float for floating-point dtypes.
+    if (ascend::IsIntegerDtype(input.dtype())) {
+      alpha_ = aclCreateScalar(&alpha_int_storage_, ACL_INT64);
+    } else {
+      alpha_ = aclCreateScalar(&alpha_float_storage_, ACL_FLOAT);
+    }
+  }
+
+  ~Operator() {
+    if (!ascend::IsAclRuntimeAlive()) return;
+
+    // Null cached descriptors — see `AclTensorCache::release()`.  The
+    // descriptors are still referenced by the Repeatable `executor_`, so
+    // skipping `aclDestroyTensor` (and leaking the executor at shutdown)
+    // avoids a double-free; see `64c367c`.
+    in_cache_.release();
+    oth_cache_.release();
+    out_cache_.release();
+
+    if (alpha_) aclDestroyScalar(alpha_);
+  }
+
+  void operator()(const Tensor input, const Tensor other,
+                  Tensor out) const override {
+    auto stream = static_cast<aclrtStream>(stream_);
+    auto t_in = in_cache_.get(const_cast<void*>(input.data()));
+    auto t_oth = oth_cache_.get(const_cast<void*>(other.data()));
+    auto t_out = out_cache_.get(out.data());
+
+    if (!executor_) {
+      aclnnAddGetWorkspaceSize(t_in, t_oth, alpha_, t_out, &ws_size_,
+                               &executor_);
+      aclSetAclOpExecutorRepeatable(executor_);
+    } else {
+      aclSetInputTensorAddr(executor_, 0, t_in,
+                            const_cast<void*>(input.data()));
+      aclSetInputTensorAddr(executor_, 1, t_oth,
+                            const_cast<void*>(other.data()));
+      aclSetOutputTensorAddr(executor_, 0, t_out, out.data());
+    }
+
+    auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_);
+    aclnnAdd(arena.buf, ws_size_, executor_, stream);
+  }
+
+ private:
+  mutable ascend::AclTensorCache in_cache_;
+
+  mutable ascend::AclTensorCache oth_cache_;
+
+  mutable ascend::AclTensorCache out_cache_;
+
+  mutable aclOpExecutor* executor_ = nullptr;
+
+  mutable uint64_t ws_size_ = 0;
+
+  // Stable address for `aclCreateScalar` (float).
+  float alpha_float_storage_ = 1.0f;
+
+  // Stable address for `aclCreateScalar` (int).
+  int64_t alpha_int_storage_ = 1;
+
+  aclScalar* alpha_ = nullptr;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/ascend/ops/cast/kernel.h b/src/native/ascend/ops/cast/kernel.h
new file mode 100644
index 000000000..9fb6e7b5f
--- /dev/null
+++ b/src/native/ascend/ops/cast/kernel.h
@@ -0,0 +1,64 @@
+#ifndef INFINI_OPS_ASCEND_CAST_KERNEL_H_
+#define INFINI_OPS_ASCEND_CAST_KERNEL_H_
+
+#include "acl/acl.h"
+#include "aclnn/aclnn_base.h"
+#include "aclnnop/aclnn_cast.h"
+#include "base/cast.h"
+#include "native/ascend/common.h"
+#include "native/ascend/workspace_pool_.h"
+#include "operator.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Cast, Device::Type::kAscend> : public Cast {
+ public:
+  Operator(const Tensor input, Tensor out)
+      : Cast(input, out),
+        in_cache_(input),
+        out_cache_(out),
+        acl_out_dtype_(ascend::ToAclDtype(out.dtype())) {}
+
+  ~Operator() {
+    if (!ascend::IsAclRuntimeAlive()) return;
+
+    // Null cached descriptors — see `AclTensorCache::release()`.
+    in_cache_.release();
+    out_cache_.release();
+  }
+
+  void operator()(const Tensor input, Tensor out) const override {
+    auto stream = static_cast<aclrtStream>(stream_);
+    auto t_in = in_cache_.get(const_cast<void*>(input.data()));
+    auto t_out = out_cache_.get(out.data());
+
+    if (!executor_) {
+      aclnnCastGetWorkspaceSize(t_in, acl_out_dtype_, t_out, &ws_size_,
+                                &executor_);
+      aclSetAclOpExecutorRepeatable(executor_);
+    } else {
+      aclSetInputTensorAddr(executor_, 0, t_in,
+                            const_cast<void*>(input.data()));
+      aclSetOutputTensorAddr(executor_, 0, t_out, out.data());
+    }
+
+    auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_);
+    aclnnCast(arena.buf, ws_size_, executor_, stream);
+  }
+
+ private:
+  mutable ascend::AclTensorCache in_cache_;
+
+  mutable ascend::AclTensorCache out_cache_;
+
+  aclDataType acl_out_dtype_;
+
+  mutable aclOpExecutor* executor_ = nullptr;
+
+  mutable uint64_t ws_size_ = 0;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/ascend/ops/cat/kernel.h b/src/native/ascend/ops/cat/kernel.h
new file mode 100644
index 000000000..32948181a
--- /dev/null
+++ b/src/native/ascend/ops/cat/kernel.h
@@ -0,0 +1,105 @@
+#ifndef INFINI_OPS_ASCEND_CAT_KERNEL_H_
+#define INFINI_OPS_ASCEND_CAT_KERNEL_H_
+
+#include <vector>
+
+#include "acl/acl.h"
+#include "aclnn/acl_meta.h"
+#include "aclnn/aclnn_base.h"
+#include "aclnnop/aclnn_cat.h"
+#include "base/cat.h"
+#include "native/ascend/common.h"
+#include "native/ascend/workspace_pool_.h"
+#include "operator.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Cat, Device::Type::kAscend> : public Cat {
+ public:
+  Operator(const Tensor first_input, std::vector<Tensor> rest_inputs,
+           int64_t dim, Tensor out)
+      : Cat(first_input, rest_inputs, dim, out), out_cache_(out) {
+    // Build `AclTensorCache` for each input tensor.
+    in_caches_.reserve(input_count_);
+    in_caches_.emplace_back(first_input);
+    for (const auto& t : rest_inputs) {
+      in_caches_.emplace_back(t);
+    }
+  }
+
+  ~Operator() {
+    if (!ascend::IsAclRuntimeAlive()) return;
+
+    // Null cached descriptors — see `AclTensorCache::release()`.  The input
+    // descriptors are referenced by the Repeatable `executor_` via
+    // `tensor_list_`, so every `in_caches_[i]` must be released alongside
+    // `out_cache_`; otherwise `~AclTensorCache()` double-frees them when the
+    // vector destructs.
+    for (auto& c : in_caches_) {
+      c.release();
+    }
+    out_cache_.release();
+
+    if (tensor_list_) aclDestroyTensorList(tensor_list_);
+  }
+
+  void operator()(const Tensor first_input, std::vector<Tensor> rest_inputs,
+                  int64_t /*dim*/, Tensor out) const override {
+    auto stream = static_cast<aclrtStream>(stream_);
+
+    // Collect all input tensors in order.
+    std::vector<const Tensor*> inputs;
+    inputs.reserve(input_count_);
+    inputs.push_back(&first_input);
+    for (const auto& t : rest_inputs) {
+      inputs.push_back(&t);
+    }
+
+    auto t_out = out_cache_.get(out.data());
+
+    if (!executor_) {
+      // First call: create descriptors, tensor list, and executor.
+      std::vector<aclTensor*> acl_tensors(input_count_);
+      for (size_t i = 0; i < input_count_; ++i) {
+        acl_tensors[i] =
+            in_caches_[i].get(const_cast<void*>(inputs[i]->data()));
+      }
+
+      tensor_list_ =
+          aclCreateTensorList(const_cast<const aclTensor**>(acl_tensors.data()),
+                              static_cast<uint64_t>(input_count_));
+
+      aclnnCatGetWorkspaceSize(tensor_list_, dim_, t_out, &ws_size_,
+                               &executor_);
+      aclSetAclOpExecutorRepeatable(executor_);
+    } else {
+      // Subsequent calls: update data pointers on cached descriptors via
+      // `aclSetRawTensorAddr`.  The executor holds references to the same
+      // `aclTensor*` objects inside `tensor_list_`, so updating their data
+      // pointers is sufficient — no `aclSetInputTensorAddr` needed.
+      for (size_t i = 0; i < input_count_; ++i) {
+        in_caches_[i].get(const_cast<void*>(inputs[i]->data()));
+      }
+      aclSetOutputTensorAddr(executor_, 0, t_out, out.data());
+    }
+
+    auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_);
+    aclnnCat(arena.buf, ws_size_, executor_, stream);
+  }
+
+ private:
+  mutable std::vector<ascend::AclTensorCache> in_caches_;
+
+  mutable ascend::AclTensorCache out_cache_;
+
+  mutable aclTensorList* tensor_list_ = nullptr;
+
+  mutable aclOpExecutor* executor_ = nullptr;
+
+  mutable uint64_t ws_size_ = 0;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/ascend/ops/gemm/kernel.h b/src/native/ascend/ops/gemm/kernel.h
new file mode 100644
index 000000000..34644cab7
--- /dev/null
+++ b/src/native/ascend/ops/gemm/kernel.h
@@ -0,0 +1,109 @@
+#ifndef INFINI_OPS_ASCEND_GEMM_KERNEL_H_
+#define INFINI_OPS_ASCEND_GEMM_KERNEL_H_
+
+#include "acl/acl.h"
+#include "aclnn/aclnn_base.h"
+#include "aclnnop/aclnn_addmm.h"
+#include "aclnnop/aclnn_baddbmm.h"
+#include "base/gemm.h"
+#include "native/ascend/common.h"
+#include "native/ascend/workspace_pool_.h"
+#include "operator.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Gemm, Device::Type::kAscend> : public Gemm {
+ public:
+  Operator(const Tensor a, const Tensor b, std::optional<float> alpha,
+           std::optional<float> beta, std::optional<int> trans_a,
+           std::optional<int> trans_b, Tensor c)
+      : Gemm(a, b, alpha, beta, trans_a, trans_b, c),
+        batched_{batch_count_ > 1},
+        alpha_val_{alpha.value_or(1.0f)},
+        beta_val_{beta.value_or(1.0f)},
+        self_cache_(c),
+        a_cache_(a, trans_a_),
+        b_cache_(b, trans_b_),
+        out_cache_(c) {
+    alpha_scalar_ = aclCreateScalar(&alpha_val_, ACL_FLOAT);
+    beta_scalar_ = aclCreateScalar(&beta_val_, ACL_FLOAT);
+  }
+
+  ~Operator() {
+    if (!ascend::IsAclRuntimeAlive()) return;
+
+    // Null cached descriptors — see `AclTensorCache::release()`.
+    self_cache_.release();
+    a_cache_.release();
+    b_cache_.release();
+    out_cache_.release();
+
+    if (alpha_scalar_) aclDestroyScalar(alpha_scalar_);
+    if (beta_scalar_) aclDestroyScalar(beta_scalar_);
+  }
+
+  void operator()(const Tensor a, const Tensor b, std::optional<float> alpha,
+                  std::optional<float> beta, std::optional<int> trans_a,
+                  std::optional<int> trans_b, Tensor c) const override {
+    auto stream = static_cast<aclrtStream>(stream_);
+
+    auto t_self = self_cache_.get(c.data());
+    auto t_a = a_cache_.get(const_cast<void*>(a.data()));
+    auto t_b = b_cache_.get(const_cast<void*>(b.data()));
+    auto t_out = out_cache_.get(c.data());
+
+    if (!executor_) {
+      if (batched_) {
+        aclnnBaddbmmGetWorkspaceSize(t_self, t_a, t_b, beta_scalar_,
+                                     alpha_scalar_, t_out, 0, &ws_size_,
+                                     &executor_);
+      } else {
+        aclnnAddmmGetWorkspaceSize(t_self, t_a, t_b, beta_scalar_,
+                                   alpha_scalar_, t_out, 0, &ws_size_,
+                                   &executor_);
+      }
+      aclSetAclOpExecutorRepeatable(executor_);
+    } else {
+      aclSetInputTensorAddr(executor_, 0, t_self, c.data());
+      aclSetInputTensorAddr(executor_, 1, t_a, const_cast<void*>(a.data()));
+      aclSetInputTensorAddr(executor_, 2, t_b, const_cast<void*>(b.data()));
+      aclSetOutputTensorAddr(executor_, 0, t_out, c.data());
+    }
+
+    auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_);
+
+    if (batched_) {
+      aclnnBaddbmm(arena.buf, ws_size_, executor_, stream);
+    } else {
+      aclnnAddmm(arena.buf, ws_size_, executor_, stream);
+    }
+  }
+
+ private:
+  bool batched_;
+
+  float alpha_val_;
+
+  float beta_val_;
+
+  aclScalar* alpha_scalar_ = nullptr;
+
+  aclScalar* beta_scalar_ = nullptr;
+
+  mutable ascend::AclTensorCache self_cache_;
+
+  mutable ascend::AclTensorCache a_cache_;
+
+  mutable ascend::AclTensorCache b_cache_;
+
+  mutable ascend::AclTensorCache out_cache_;
+
+  mutable aclOpExecutor* executor_ = nullptr;
+
+  mutable uint64_t ws_size_ = 0;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/ascend/ops/linear/kernel.h b/src/native/ascend/ops/linear/kernel.h
new file mode 100644
index 000000000..d6ce2a6da
--- /dev/null
+++ b/src/native/ascend/ops/linear/kernel.h
@@ -0,0 +1,125 @@
+#ifndef INFINI_OPS_ASCEND_LINEAR_KERNEL_H_
+#define INFINI_OPS_ASCEND_LINEAR_KERNEL_H_
+
+#include "acl/acl.h"
+#include "aclnn/aclnn_base.h"
+#include "aclnnop/aclnn_addmm.h"
+#include "aclnnop/aclnn_baddbmm.h"
+#include "aclnnop/aclnn_matmul.h"
+#include "base/linear.h"
+#include "native/ascend/common.h"
+#include "native/ascend/workspace_pool_.h"
+#include "operator.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Linear, Device::Type::kAscend> : public Linear {
+ public:
+  Operator(const Tensor a, const Tensor b, std::optional<Tensor> bias,
+           bool trans_a, bool trans_b, Tensor out)
+      : Linear(a, b, bias, trans_a, trans_b, out),
+        batched_{out.ndim() > 2},
+        a_cache_(a, trans_a),
+        b_cache_(b, trans_b),
+        out_cache_(out) {
+    if (has_bias_) {
+      bias_cache_ = ascend::AclTensorCache(*bias);
+      alpha_scalar_ = aclCreateScalar(&alpha_storage_, ACL_FLOAT);
+      beta_scalar_ = aclCreateScalar(&beta_storage_, ACL_FLOAT);
+    }
+  }
+
+  ~Operator() {
+    if (!ascend::IsAclRuntimeAlive()) return;
+
+    // Null cached descriptors — see `AclTensorCache::release()`.
+    bias_cache_.release();
+    a_cache_.release();
+    b_cache_.release();
+    out_cache_.release();
+
+    if (alpha_scalar_) aclDestroyScalar(alpha_scalar_);
+    if (beta_scalar_) aclDestroyScalar(beta_scalar_);
+  }
+
+  void operator()(const Tensor a, const Tensor b, std::optional<Tensor> bias,
+                  bool trans_a, bool trans_b, Tensor out) const override {
+    auto stream = static_cast<aclrtStream>(stream_);
+    auto t_a = a_cache_.get(const_cast<void*>(a.data()));
+    auto t_b = b_cache_.get(const_cast<void*>(b.data()));
+    auto t_out = out_cache_.get(out.data());
+
+    if (has_bias_) {
+      auto t_bias = bias_cache_.get(const_cast<void*>(bias->data()));
+
+      if (!executor_) {
+        if (batched_) {
+          aclnnBaddbmmGetWorkspaceSize(t_bias, t_a, t_b, beta_scalar_,
+                                       alpha_scalar_, t_out, 0, &ws_size_,
+                                       &executor_);
+        } else {
+          aclnnAddmmGetWorkspaceSize(t_bias, t_a, t_b, beta_scalar_,
+                                     alpha_scalar_, t_out, 0, &ws_size_,
+                                     &executor_);
+        }
+        aclSetAclOpExecutorRepeatable(executor_);
+      } else {
+        aclSetInputTensorAddr(executor_, 0, t_bias,
+                              const_cast<void*>(bias->data()));
+        aclSetInputTensorAddr(executor_, 1, t_a, const_cast<void*>(a.data()));
+        aclSetInputTensorAddr(executor_, 2, t_b, const_cast<void*>(b.data()));
+        aclSetOutputTensorAddr(executor_, 0, t_out, out.data());
+      }
+
+      auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_);
+
+      if (batched_) {
+        aclnnBaddbmm(arena.buf, ws_size_, executor_, stream);
+      } else {
+        aclnnAddmm(arena.buf, ws_size_, executor_, stream);
+      }
+    } else {
+      if (!executor_) {
+        int8_t cube_math_type = 1;
+        aclnnMatmulGetWorkspaceSize(t_a, t_b, t_out, cube_math_type, &ws_size_,
+                                    &executor_);
+        aclSetAclOpExecutorRepeatable(executor_);
+      } else {
+        aclSetInputTensorAddr(executor_, 0, t_a, const_cast<void*>(a.data()));
+        aclSetInputTensorAddr(executor_, 1, t_b, const_cast<void*>(b.data()));
+        aclSetOutputTensorAddr(executor_, 0, t_out, out.data());
+      }
+
+      auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_);
+      aclnnMatmul(arena.buf, ws_size_, executor_, stream);
+    }
+  }
+
+ private:
+  bool batched_;
+
+  mutable ascend::AclTensorCache bias_cache_;
+
+  mutable ascend::AclTensorCache a_cache_;
+
+  mutable ascend::AclTensorCache b_cache_;
+
+  mutable ascend::AclTensorCache out_cache_;
+
+  float alpha_storage_ = 1.0f;
+
+  float beta_storage_ = 1.0f;
+
+  aclScalar* alpha_scalar_ = nullptr;
+
+  aclScalar* beta_scalar_ = nullptr;
+
+  mutable aclOpExecutor* executor_ = nullptr;
+
+  mutable uint64_t ws_size_ = 0;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/ascend/ops/matmul/kernel.h b/src/native/ascend/ops/matmul/kernel.h
new file mode 100644
index 000000000..c574e4e55
--- /dev/null
+++ b/src/native/ascend/ops/matmul/kernel.h
@@ -0,0 +1,68 @@
+#ifndef INFINI_OPS_ASCEND_MATMUL_KERNEL_H_
+#define INFINI_OPS_ASCEND_MATMUL_KERNEL_H_
+
+#include "acl/acl.h"
+#include "aclnn/aclnn_base.h"
+#include "aclnnop/aclnn_matmul.h"
+#include "base/matmul.h"
+#include "native/ascend/common.h"
+#include "native/ascend/workspace_pool_.h"
+#include "operator.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Matmul, Device::Type::kAscend> : public Matmul {
+ public:
+  Operator(const Tensor a, const Tensor b, Tensor c, bool trans_a, bool trans_b)
+      : Matmul(a, b, c, trans_a, trans_b),
+        a_cache_(a, trans_a),
+        b_cache_(b, trans_b),
+        out_cache_(c) {}
+
+  ~Operator() {
+    if (!ascend::IsAclRuntimeAlive()) return;
+
+    // Null cached descriptors — see `AclTensorCache::release()`.
+    a_cache_.release();
+    b_cache_.release();
+    out_cache_.release();
+  }
+
+  void operator()(const Tensor a, const Tensor b, Tensor c, bool trans_a,
+                  bool trans_b) const override {
+    auto stream = static_cast<aclrtStream>(stream_);
+    auto t_a = a_cache_.get(const_cast<void*>(a.data()));
+    auto t_b = b_cache_.get(const_cast<void*>(b.data()));
+    auto t_out = out_cache_.get(c.data());
+
+    if (!executor_) {
+      int8_t cube_math_type = 1;
+      aclnnMatmulGetWorkspaceSize(t_a, t_b, t_out, cube_math_type, &ws_size_,
+                                  &executor_);
+      aclSetAclOpExecutorRepeatable(executor_);
+    } else {
+      aclSetInputTensorAddr(executor_, 0, t_a, const_cast<void*>(a.data()));
+      aclSetInputTensorAddr(executor_, 1, t_b, const_cast<void*>(b.data()));
+      aclSetOutputTensorAddr(executor_, 0, t_out, c.data());
+    }
+
+    auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_);
+    aclnnMatmul(arena.buf, ws_size_, executor_, stream);
+  }
+
+ private:
+  mutable ascend::AclTensorCache a_cache_;
+
+  mutable ascend::AclTensorCache b_cache_;
+
+  mutable ascend::AclTensorCache out_cache_;
+
+  mutable aclOpExecutor* executor_ = nullptr;
+
+  mutable uint64_t ws_size_ = 0;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/ascend/ops/mul/kernel.h b/src/native/ascend/ops/mul/kernel.h
new file mode 100644
index 000000000..311b85de8
--- /dev/null
+++ b/src/native/ascend/ops/mul/kernel.h
@@ -0,0 +1,68 @@
+#ifndef INFINI_OPS_ASCEND_MUL_KERNEL_H_
+#define INFINI_OPS_ASCEND_MUL_KERNEL_H_
+
+#include "acl/acl.h"
+#include "aclnn/aclnn_base.h"
+#include "aclnn_mul.h"
+#include "base/mul.h"
+#include "native/ascend/common.h"
+#include "native/ascend/workspace_pool_.h"
+#include "operator.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Mul, Device::Type::kAscend> : public Mul {
+ public:
+  Operator(const Tensor input, const Tensor other, Tensor out)
+      : Mul(input, other, out),
+        in_cache_(input),
+        oth_cache_(other),
+        out_cache_(out) {}
+
+  ~Operator() {
+    if (!ascend::IsAclRuntimeAlive()) return;
+
+    // Null cached descriptors — see `AclTensorCache::release()`.
+    in_cache_.release();
+    oth_cache_.release();
+    out_cache_.release();
+  }
+
+  void operator()(const Tensor input, const Tensor other,
+                  Tensor out) const override {
+    auto stream = static_cast<aclrtStream>(stream_);
+    auto t_in = in_cache_.get(const_cast<void*>(input.data()));
+    auto t_oth = oth_cache_.get(const_cast<void*>(other.data()));
+    auto t_out = out_cache_.get(out.data());
+
+    if (!executor_) {
+      aclnnMulGetWorkspaceSize(t_in, t_oth, t_out, &ws_size_, &executor_);
+      aclSetAclOpExecutorRepeatable(executor_);
+    } else {
+      aclSetInputTensorAddr(executor_, 0, t_in,
+                            const_cast<void*>(input.data()));
+      aclSetInputTensorAddr(executor_, 1, t_oth,
+                            const_cast<void*>(other.data()));
+      aclSetOutputTensorAddr(executor_, 0, t_out, out.data());
+    }
+
+    auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_);
+    aclnnMul(arena.buf, ws_size_, executor_, stream);
+  }
+
+ private:
+  mutable ascend::AclTensorCache in_cache_;
+
+  mutable ascend::AclTensorCache oth_cache_;
+
+  mutable ascend::AclTensorCache out_cache_;
+
+  mutable aclOpExecutor* executor_ = nullptr;
+
+  mutable uint64_t ws_size_ = 0;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/ascend/runtime_.h b/src/native/ascend/runtime_.h
similarity index 97%
rename from src/ascend/runtime_.h
rename to src/native/ascend/runtime_.h
index dca74258c..2b9e14136 100644
--- a/src/ascend/runtime_.h
+++ b/src/native/ascend/runtime_.h
@@ -5,7 +5,7 @@
 #include "acl/acl.h"
 // clang-format on
 
-#include "ascend/device_.h"
+#include "native/ascend/device_.h"
 #include "runtime.h"
 
 namespace infini::ops {
diff --git a/src/native/ascend/workspace_pool_.h b/src/native/ascend/workspace_pool_.h
new file mode 100644
index 000000000..bd3774fab
--- /dev/null
+++ b/src/native/ascend/workspace_pool_.h
@@ -0,0 +1,156 @@
+#ifndef INFINI_OPS_ASCEND_WORKSPACE_POOL__H_
+#define INFINI_OPS_ASCEND_WORKSPACE_POOL__H_
+
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+
+#include "acl/acl.h"
+
+namespace infini::ops::ascend {
+
+struct WorkspaceArena {
+  void* buf = nullptr;
+
+  uint64_t capacity = 0;
+};
+
+class WorkspacePool {
+ public:
+  // Ensure the arena for `(stream, slot)` has at least `needed` bytes.
+  //
+  // The `slot` parameter defaults to `"workspace"` for backward
+  // compatibility. Operators needing a separate temp arena pass
+  // `"temp"`.
+  WorkspaceArena& Ensure(aclrtStream stream, uint64_t needed,
+                         const char* slot = "workspace") {
+    // Thread-local fast path: a small flat array of recently used
+    // `(stream, slot, arena*)` triples. In practice operators use at
+    // most 2-3 slots, so linear scan is sufficient — no heap
+    // allocation on the hot path.
+    struct CacheEntry {
+      aclrtStream stream = nullptr;
+      const char* slot = nullptr;
+      WorkspaceArena* arena = nullptr;
+    };
+    static constexpr int kCacheSize = 4;
+    thread_local CacheEntry cache[kCacheSize] = {};
+
+    for (int i = 0; i < kCacheSize; ++i) {
+      auto& e = cache[i];
+
+      if (e.stream == stream && e.slot != nullptr &&
+          std::strcmp(e.slot, slot) == 0 && e.arena != nullptr &&
+          needed <= e.arena->capacity) {
+        return *e.arena;
+      }
+    }
+
+    // Slow path: look up arena in the map under lock.
+    assert(!capturing_ &&
+           "`WorkspacePool`: `aclrtMalloc` on slow path during graph "
+           "capture. Ensure all operators run at least once during "
+           "eager warmup.");
+
+    std::lock_guard<std::mutex> lock(mutex_);
+
+    SlotKey key{stream, slot};
+    auto& owned = arenas_[key];
+
+    if (!owned) {
+      owned = std::make_unique<WorkspaceArena>();
+    }
+
+    auto* arena = owned.get();
+
+    if (needed > arena->capacity) {
+      if (arena->capacity > 0) {
+        aclrtSynchronizeStream(stream);
+        aclrtFree(arena->buf);
+      }
+
+      if (needed > 0) {
+        auto ret = aclrtMalloc(&arena->buf, needed, ACL_MEM_MALLOC_NORMAL_ONLY);
+        assert(ret == ACL_SUCCESS && "`WorkspacePool`: `aclrtMalloc` failed");
+      }
+
+      arena->capacity = needed;
+    }
+
+    // Insert into the thread-local cache (evict oldest).
+    for (int i = kCacheSize - 1; i > 0; --i) {
+      cache[i] = cache[i - 1];
+    }
+    cache[0] = {stream, slot, arena};
+
+    return *arena;
+  }
+
+  // Set to true before NPUGraph capture, false after.  When true,
+  // the slow path (which calls `aclrtMalloc`) triggers an assert
+  // failure — a safety net against accidental device allocations
+  // being recorded into the graph.
+  void set_capture_mode(bool capturing) { capturing_ = capturing; }
+
+  ~WorkspacePool() {
+    for (auto& [key, arena] : arenas_) {
+      if (arena && arena->capacity > 0) {
+        // The CANN runtime may already be torn down when this static
+        // destructor runs.  `aclrtGetDevice` fails in that case —
+        // skip the free to avoid glibc "double free" abort.
+        int32_t dev_id = -1;
+
+        if (aclrtGetDevice(&dev_id) == ACL_SUCCESS) {
+          aclrtFree(arena->buf);
+        } else {
+          fprintf(stderr,
+                  "[InfiniOps] `WorkspacePool`: CANN runtime already "
+                  "finalized, skipping `aclrtFree` (%" PRIu64
+                  " bytes leaked).\n",
+                  arena->capacity);
+        }
+      }
+    }
+  }
+
+ private:
+  struct SlotKey {
+    aclrtStream stream;
+    std::string slot;
+
+    bool operator==(const SlotKey& o) const {
+      return stream == o.stream && slot == o.slot;
+    }
+  };
+
+  struct SlotKeyHash {
+    size_t operator()(const SlotKey& k) const {
+      auto h1 = std::hash<void*>{}(static_cast<void*>(k.stream));
+      auto h2 = std::hash<std::string>{}(k.slot);
+
+      return h1 ^ (h2 << 1);
+    }
+  };
+
+  std::unordered_map<SlotKey, std::unique_ptr<WorkspaceArena>, SlotKeyHash>
+      arenas_;
+
+  std::mutex mutex_;
+
+  bool capturing_ = false;
+};
+
+inline WorkspacePool& GetWorkspacePool() {
+  static WorkspacePool pool;
+
+  return pool;
+}
+
+}  // namespace infini::ops::ascend
+
+#endif
diff --git a/src/cambricon/common.h b/src/native/cambricon/common.h
similarity index 100%
rename from src/cambricon/common.h
rename to src/native/cambricon/common.h
diff --git a/src/cambricon/data_type_.h b/src/native/cambricon/data_type_.h
similarity index 91%
rename from src/cambricon/data_type_.h
rename to src/native/cambricon/data_type_.h
index 1efa09a21..f4ca82da8 100644
--- a/src/cambricon/data_type_.h
+++ b/src/native/cambricon/data_type_.h
@@ -3,8 +3,8 @@
 
 #include "bang_bf16.h"
 #include "bang_fp16.h"
-#include "cambricon/device_.h"
 #include "data_type.h"
+#include "native/cambricon/device_.h"
 
 namespace infini::ops {
 
diff --git a/src/cambricon/device_.h b/src/native/cambricon/device_.h
similarity index 100%
rename from src/cambricon/device_.h
rename to src/native/cambricon/device_.h
diff --git a/src/cambricon/gemm/cnblas.h b/src/native/cambricon/ops/gemm/cnblas.h
similarity index 99%
rename from src/cambricon/gemm/cnblas.h
rename to src/native/cambricon/ops/gemm/cnblas.h
index ac95bd568..42248d4fa 100644
--- a/src/cambricon/gemm/cnblas.h
+++ b/src/native/cambricon/ops/gemm/cnblas.h
@@ -11,7 +11,7 @@
 // clang-format on
 
 #include "base/gemm.h"
-#include "cambricon/common.h"
+#include "native/cambricon/common.h"
 
 namespace infini::ops {
 
diff --git a/src/cambricon/rms_norm/kernel.mlu b/src/native/cambricon/ops/rms_norm/kernel.mlu
similarity index 100%
rename from src/cambricon/rms_norm/kernel.mlu
rename to src/native/cambricon/ops/rms_norm/kernel.mlu
diff --git a/src/cambricon/rms_norm/rms_norm.h b/src/native/cambricon/ops/rms_norm/rms_norm.h
similarity index 96%
rename from src/cambricon/rms_norm/rms_norm.h
rename to src/native/cambricon/ops/rms_norm/rms_norm.h
index 61906f730..6a9aed098 100644
--- a/src/cambricon/rms_norm/rms_norm.h
+++ b/src/native/cambricon/ops/rms_norm/rms_norm.h
@@ -6,8 +6,8 @@
 #include <vector>
 
 #include "base/rms_norm.h"
-#include "cambricon/common.h"
-#include "cambricon/data_type_.h"
+#include "native/cambricon/common.h"
+#include "native/cambricon/data_type_.h"
 
 namespace infini::ops {
 
diff --git a/src/cambricon/runtime_.h b/src/native/cambricon/runtime_.h
similarity index 95%
rename from src/cambricon/runtime_.h
rename to src/native/cambricon/runtime_.h
index f260c0c7a..7ff30fe9c 100644
--- a/src/cambricon/runtime_.h
+++ b/src/native/cambricon/runtime_.h
@@ -3,7 +3,7 @@
 
 #include <cnrt.h>
 
-#include "cambricon/device_.h"
+#include "native/cambricon/device_.h"
 #include "runtime.h"
 
 namespace infini::ops {
diff --git a/src/cpu/caster_.h b/src/native/cpu/caster_.h
similarity index 98%
rename from src/cpu/caster_.h
rename to src/native/cpu/caster_.h
index ab3afee45..7da1bf365 100644
--- a/src/cpu/caster_.h
+++ b/src/native/cpu/caster_.h
@@ -4,7 +4,7 @@
 #include <type_traits>
 
 #include "caster.h"
-#include "cpu/data_type_.h"
+#include "native/cpu/data_type_.h"
 
 namespace infini::ops {
 
diff --git a/src/cpu/data_type_.h b/src/native/cpu/data_type_.h
similarity index 91%
rename from src/cpu/data_type_.h
rename to src/native/cpu/data_type_.h
index f1195aac8..36231db51 100644
--- a/src/cpu/data_type_.h
+++ b/src/native/cpu/data_type_.h
@@ -1,8 +1,8 @@
 #ifndef INFINI_OPS_CPU_DATA_TYPE__H_
 #define INFINI_OPS_CPU_DATA_TYPE__H_
 
-#include "cpu/device_.h"
 #include "data_type.h"
+#include "native/cpu/device_.h"
 
 namespace infini::ops {
 
diff --git a/src/cpu/device_.h b/src/native/cpu/device_.h
similarity index 100%
rename from src/cpu/device_.h
rename to src/native/cpu/device_.h
diff --git a/src/cpu/add/add.h b/src/native/cpu/ops/add/add.h
similarity index 98%
rename from src/cpu/add/add.h
rename to src/native/cpu/ops/add/add.h
index c56d31f4f..8580c6df0 100644
--- a/src/cpu/add/add.h
+++ b/src/native/cpu/ops/add/add.h
@@ -5,7 +5,7 @@
 
 #include "base/add.h"
 #include "common/generic_utils.h"
-#include "cpu/caster_.h"
+#include "native/cpu/caster_.h"
 
 namespace infini::ops {
 
diff --git a/src/native/cpu/ops/cast/cast.h b/src/native/cpu/ops/cast/cast.h
new file mode 100644
index 000000000..6e570d664
--- /dev/null
+++ b/src/native/cpu/ops/cast/cast.h
@@ -0,0 +1,52 @@
+#ifndef INFINI_OPS_CPU_CAST_CAST_H_
+#define INFINI_OPS_CPU_CAST_CAST_H_
+
+#include "base/cast.h"
+#include "common/generic_utils.h"
+#include "native/cpu/caster_.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Cast, Device::Type::kCpu> : public Cast {
+ public:
+  Operator(const Tensor input, Tensor out) : Cast{input, out} {}
+
+  void operator()(const Tensor input, Tensor out) const override {
+    DispatchFunc<Device::Type::kCpu, AllTypes, AllTypes>(
+        {input_dtype_, out_dtype_},
+        [&](auto in_tag, auto out_tag) {
+          using InT = typename decltype(in_tag)::type;
+          using OutT = typename decltype(out_tag)::type;
+          Compute<InT, OutT>(input, out);
+        },
+        "`Operator<Cast, Device::Type::kCpu>::operator()`");
+  }
+
+ private:
+  template <typename InT, typename OutT>
+  void Compute(const Tensor input, Tensor out) const {
+    const auto* in_ptr = static_cast<const InT*>(input.data());
+    auto* out_ptr = static_cast<OutT*>(out.data());
+
+    auto get_idx = [&](Tensor::Size i, bool is_contig, const auto* shape,
+                       const auto* strides) {
+      return is_contig ? i : utils::IndexToOffset(i, ndim_, shape, strides);
+    };
+
+#pragma omp parallel for
+    for (Tensor::Size i = 0; i < output_size_; ++i) {
+      auto in_idx = get_idx(i, is_input_contiguous_, input_shape_.data(),
+                            input_strides_.data());
+      auto out_idx = get_idx(i, is_out_contiguous_, out_shape_.data(),
+                             out_strides_.data());
+
+      out_ptr[out_idx] =
+          Caster<Device::Type::kCpu>::template Cast<OutT>(in_ptr[in_idx]);
+    }
+  }
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cpu/ops/cat/cat.h b/src/native/cpu/ops/cat/cat.h
new file mode 100644
index 000000000..a2f600e5c
--- /dev/null
+++ b/src/native/cpu/ops/cat/cat.h
@@ -0,0 +1,71 @@
+#ifndef INFINI_OPS_CPU_CAT_CAT_H_
+#define INFINI_OPS_CPU_CAT_CAT_H_
+
+#include <cstring>
+#include <vector>
+
+#include "base/cat.h"
+#include "native/cpu/caster_.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Cat, Device::Type::kCpu> : public Cat {
+ public:
+  Operator(const Tensor first_input, std::vector<Tensor> rest_inputs,
+           int64_t dim, Tensor out)
+      : Cat{first_input, rest_inputs, dim, out} {}
+
+  void operator()(const Tensor first_input, std::vector<Tensor> rest_inputs,
+                  int64_t /*dim*/, Tensor out) const override {
+    // Collect all input tensors.
+    std::vector<const Tensor*> inputs;
+    inputs.reserve(input_count_);
+    inputs.push_back(&first_input);
+    for (const auto& t : rest_inputs) {
+      inputs.push_back(&t);
+    }
+
+    // Use normalized `dim_` from base class (handles negative dim).
+    auto dim = dim_;
+    auto elem_size = kDataTypeToSize.at(out.dtype());
+    auto ndim = out.ndim();
+    auto out_shape = out.shape();
+
+    // Compute outer and inner sizes relative to the cat dimension.
+    Tensor::Size outer = 1;
+    for (int64_t i = 0; i < dim; ++i) {
+      outer *= out_shape[i];
+    }
+
+    Tensor::Size inner = 1;
+    for (size_t i = static_cast<size_t>(dim) + 1; i < ndim; ++i) {
+      inner *= out_shape[i];
+    }
+
+    auto* out_ptr = static_cast<char*>(out.data());
+    Tensor::Size out_dim_size = out_shape[dim];
+
+    // For each outer index, copy slices from each input along the cat dim.
+    for (Tensor::Size o = 0; o < outer; ++o) {
+      Tensor::Size offset_in_dim = 0;
+
+      for (size_t t = 0; t < input_count_; ++t) {
+        auto in_dim = inputs[t]->shape()[dim];
+        auto in_ptr = static_cast<const char*>(inputs[t]->data());
+
+        auto src_offset = (o * in_dim) * inner * elem_size;
+        auto dst_offset =
+            (o * out_dim_size + offset_in_dim) * inner * elem_size;
+        auto copy_size = in_dim * inner * elem_size;
+
+        std::memcpy(out_ptr + dst_offset, in_ptr + src_offset, copy_size);
+        offset_in_dim += in_dim;
+      }
+    }
+  }
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cpu/causal_softmax/causal_softmax.h b/src/native/cpu/ops/causal_softmax/causal_softmax.h
similarity index 98%
rename from src/cpu/causal_softmax/causal_softmax.h
rename to src/native/cpu/ops/causal_softmax/causal_softmax.h
index 14848ee40..8b00c6535 100644
--- a/src/cpu/causal_softmax/causal_softmax.h
+++ b/src/native/cpu/ops/causal_softmax/causal_softmax.h
@@ -5,8 +5,8 @@
 
 #include "base/causal_softmax.h"
 #include "common/generic_utils.h"
-#include "cpu/caster_.h"
 #include "data_type.h"
+#include "native/cpu/caster_.h"
 #include "tensor.h"
 
 namespace infini::ops {
diff --git a/src/cpu/gemm/gemm.h b/src/native/cpu/ops/gemm/gemm.h
similarity index 99%
rename from src/cpu/gemm/gemm.h
rename to src/native/cpu/ops/gemm/gemm.h
index a4dfb9891..0eaa0a0bd 100644
--- a/src/cpu/gemm/gemm.h
+++ b/src/native/cpu/ops/gemm/gemm.h
@@ -5,7 +5,7 @@
 
 #include "base/gemm.h"
 #include "common/generic_utils.h"
-#include "cpu/caster_.h"
+#include "native/cpu/caster_.h"
 
 namespace infini::ops {
 
diff --git a/src/native/cpu/ops/linear/linear.h b/src/native/cpu/ops/linear/linear.h
new file mode 100644
index 000000000..aa152658a
--- /dev/null
+++ b/src/native/cpu/ops/linear/linear.h
@@ -0,0 +1,107 @@
+#ifndef INFINI_OPS_CPU_LINEAR_LINEAR_H_
+#define INFINI_OPS_CPU_LINEAR_LINEAR_H_
+
+#include <utility>
+
+#include "base/linear.h"
+#include "common/generic_utils.h"
+#include "native/cpu/caster_.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Linear, Device::Type::kCpu> : public Linear,
+                                             Caster<Device::Type::kCpu> {
+ public:
+  Operator(const Tensor a, const Tensor b, std::optional<Tensor> bias,
+           bool trans_a, bool trans_b, Tensor out)
+      : Linear{a, b, bias, trans_a, trans_b, out} {}
+
+  void operator()(const Tensor a, const Tensor b, std::optional<Tensor> bias,
+                  bool trans_a, bool trans_b, Tensor out) const override {
+    DispatchFunc<Device::Type::kCpu, AllFloatTypes>(
+        out.dtype(),
+        [&](auto tag) {
+          using T = typename decltype(tag)::type;
+          Compute<T>(a, b, bias, trans_a, trans_b, out);
+        },
+        "`Operator<Linear, Device::Type::kCpu>::operator()`");
+  }
+
+ private:
+  template <typename T>
+  void Compute(const Tensor a, const Tensor b, std::optional<Tensor> bias,
+               bool trans_a, bool trans_b, Tensor out) const {
+    const auto* a_ptr = static_cast<const T*>(a.data());
+    const auto* b_ptr = static_cast<const T*>(b.data());
+    auto* out_ptr = static_cast<T*>(out.data());
+    const T* bias_ptr = bias ? static_cast<const T*>(bias->data()) : nullptr;
+
+    auto ndim_a = a_shape_.size();
+    auto ndim_b = b_shape_.size();
+    auto ndim_out = out_shape_.size();
+
+    Tensor::Size m = out_shape_[ndim_out - 2];
+    Tensor::Size n = out_shape_[ndim_out - 1];
+    Tensor::Size k = trans_a ? a_shape_[ndim_a - 2] : a_shape_[ndim_a - 1];
+
+    // Compute strides for the inner matrix dimensions after transpose.
+    Tensor::Stride stride_a_m =
+        trans_a ? a_strides_[ndim_a - 1] : a_strides_[ndim_a - 2];
+    Tensor::Stride stride_a_k =
+        trans_a ? a_strides_[ndim_a - 2] : a_strides_[ndim_a - 1];
+    Tensor::Stride stride_b_k =
+        trans_b ? b_strides_[ndim_b - 1] : b_strides_[ndim_b - 2];
+    Tensor::Stride stride_b_n =
+        trans_b ? b_strides_[ndim_b - 2] : b_strides_[ndim_b - 1];
+    Tensor::Stride stride_out_m = out_strides_[ndim_out - 2];
+    Tensor::Stride stride_out_n = out_strides_[ndim_out - 1];
+
+    // Batch dimensions.
+    Tensor::Size batch_count = 1;
+    for (size_t i = 0; i + 2 < ndim_out; ++i) {
+      batch_count *= out_shape_[i];
+    }
+
+    Tensor::Stride batch_stride_a = ndim_a > 2 ? a_strides_[ndim_a - 3] : 0;
+    Tensor::Stride batch_stride_b = ndim_b > 2 ? b_strides_[ndim_b - 3] : 0;
+    Tensor::Stride batch_stride_out =
+        ndim_out > 2 ? out_strides_[ndim_out - 3] : 0;
+
+    // Bias stride: for 1D bias `[n]`, stride is 1. For batched bias, use last
+    // stride.
+    Tensor::Stride bias_stride = 0;
+    if (bias_ptr) {
+      auto ndim_bias = bias->shape().size();
+      bias_stride = bias->strides()[ndim_bias - 1];
+    }
+
+    for (Tensor::Size batch = 0; batch < batch_count; ++batch) {
+      const auto* a_batch = a_ptr + batch * batch_stride_a;
+      const auto* b_batch = b_ptr + batch * batch_stride_b;
+      auto* out_batch = out_ptr + batch * batch_stride_out;
+
+      for (Tensor::Size i = 0; i < m; ++i) {
+        for (Tensor::Size j = 0; j < n; ++j) {
+          float sum = 0.0f;
+
+          for (Tensor::Size l = 0; l < k; ++l) {
+            float a_val = Cast<float>(a_batch[i * stride_a_m + l * stride_a_k]);
+            float b_val = Cast<float>(b_batch[l * stride_b_k + j * stride_b_n]);
+            sum += a_val * b_val;
+          }
+
+          if (bias_ptr) {
+            sum += Cast<float>(bias_ptr[j * bias_stride]);
+          }
+
+          out_batch[i * stride_out_m + j * stride_out_n] = Cast<T>(sum);
+        }
+      }
+    }
+  }
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cpu/ops/mul/mul.h b/src/native/cpu/ops/mul/mul.h
new file mode 100644
index 000000000..c3039b005
--- /dev/null
+++ b/src/native/cpu/ops/mul/mul.h
@@ -0,0 +1,63 @@
+#ifndef INFINI_OPS_CPU_MUL_MUL_H_
+#define INFINI_OPS_CPU_MUL_MUL_H_
+
+#include <utility>
+
+#include "base/mul.h"
+#include "common/generic_utils.h"
+#include "native/cpu/caster_.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Mul, Device::Type::kCpu> : public Mul,
+                                          Caster<Device::Type::kCpu> {
+ public:
+  Operator(const Tensor input, const Tensor other, Tensor out)
+      : Mul{input, other, out} {}
+
+  void operator()(const Tensor input, const Tensor other,
+                  Tensor out) const override {
+    DispatchFunc<Device::Type::kCpu, AllTypes>(
+        out_type_,
+        [&](auto tag) {
+          using T = typename decltype(tag)::type;
+          Compute<T>(input, other, out);
+        },
+        "`Operator<Mul, Device::Type::kCpu>::operator()`");
+  }
+
+ private:
+  template <typename T>
+  void Compute(const Tensor input, const Tensor other, Tensor out) const {
+    using ComputeType = std::conditional_t<IsBFloat16<Device::Type::kCpu, T> ||
+                                               IsFP16<Device::Type::kCpu, T>,
+                                           float, T>;
+
+    const auto* input_ptr = static_cast<const T*>(input.data());
+    const auto* other_ptr = static_cast<const T*>(other.data());
+    auto* out_ptr = static_cast<T*>(out.data());
+
+    auto get_idx = [&](Tensor::Size i, bool is_contig, const auto* shape,
+                       const auto* strides) {
+      return is_contig ? i : utils::IndexToOffset(i, ndim_, shape, strides);
+    };
+
+#pragma omp parallel for
+    for (Tensor::Size i = 0; i < output_size_; ++i) {
+      auto input_idx = get_idx(i, is_input_contiguous_, input_shape_.data(),
+                               input_strides_.data());
+      auto other_idx = get_idx(i, is_other_contiguous_, other_shape_.data(),
+                               other_strides_.data());
+      auto out_idx = get_idx(i, is_out_contiguous_, out_shape_.data(),
+                             out_strides_.data());
+
+      out_ptr[out_idx] = Cast<T>(Cast<ComputeType>(input_ptr[input_idx]) *
+                                 Cast<ComputeType>(other_ptr[other_idx]));
+    }
+  }
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cpu/rms_norm/rms_norm.h b/src/native/cpu/ops/rms_norm/rms_norm.h
similarity index 98%
rename from src/cpu/rms_norm/rms_norm.h
rename to src/native/cpu/ops/rms_norm/rms_norm.h
index 9cae419e2..c3f091cb4 100644
--- a/src/cpu/rms_norm/rms_norm.h
+++ b/src/native/cpu/ops/rms_norm/rms_norm.h
@@ -5,8 +5,8 @@
 
 #include "base/rms_norm.h"
 #include "common/generic_utils.h"
-#include "cpu/caster_.h"
 #include "data_type.h"
+#include "native/cpu/caster_.h"
 #include "tensor.h"
 
 namespace infini::ops {
diff --git a/src/cpu/swiglu/swiglu.h b/src/native/cpu/ops/swiglu/swiglu.h
similarity index 98%
rename from src/cpu/swiglu/swiglu.h
rename to src/native/cpu/ops/swiglu/swiglu.h
index 57dccf188..37ae46767 100644
--- a/src/cpu/swiglu/swiglu.h
+++ b/src/native/cpu/ops/swiglu/swiglu.h
@@ -5,7 +5,7 @@
 
 #include "base/swiglu.h"
 #include "common/generic_utils.h"
-#include "cpu/caster_.h"
+#include "native/cpu/caster_.h"
 
 namespace infini::ops {
 
diff --git a/src/cpu/runtime_.h b/src/native/cpu/runtime_.h
similarity index 100%
rename from src/cpu/runtime_.h
rename to src/native/cpu/runtime_.h
diff --git a/src/cuda/blas.h b/src/native/cuda/blas.h
similarity index 100%
rename from src/cuda/blas.h
rename to src/native/cuda/blas.h
diff --git a/src/cuda/blas_utils.h b/src/native/cuda/blas_utils.h
similarity index 100%
rename from src/cuda/blas_utils.h
rename to src/native/cuda/blas_utils.h
diff --git a/src/cuda/caster.cuh b/src/native/cuda/caster.cuh
similarity index 100%
rename from src/cuda/caster.cuh
rename to src/native/cuda/caster.cuh
diff --git a/src/cuda/iluvatar/blas.h b/src/native/cuda/iluvatar/blas.h
similarity index 91%
rename from src/cuda/iluvatar/blas.h
rename to src/native/cuda/iluvatar/blas.h
index 791f2dde5..7e7545a40 100644
--- a/src/cuda/iluvatar/blas.h
+++ b/src/native/cuda/iluvatar/blas.h
@@ -7,10 +7,10 @@
 #include "cublas_v2.h"
 // clang-format on
 
-#include "cuda/blas.h"
-#include "cuda/iluvatar/blas_utils.h"
-#include "cuda/iluvatar/runtime_.h"
 #include "data_type.h"
+#include "native/cuda/blas.h"
+#include "native/cuda/iluvatar/blas_utils.h"
+#include "native/cuda/iluvatar/runtime_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/iluvatar/blas_utils.h b/src/native/cuda/iluvatar/blas_utils.h
similarity index 94%
rename from src/cuda/iluvatar/blas_utils.h
rename to src/native/cuda/iluvatar/blas_utils.h
index 6243d8d89..3654e3541 100644
--- a/src/cuda/iluvatar/blas_utils.h
+++ b/src/native/cuda/iluvatar/blas_utils.h
@@ -5,8 +5,8 @@
 #include "cublas_v2.h"
 // clang-format on
 
-#include "cuda/blas_utils.h"
 #include "data_type.h"
+#include "native/cuda/blas_utils.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/iluvatar/caster.cuh b/src/native/cuda/iluvatar/caster.cuh
similarity index 93%
rename from src/cuda/iluvatar/caster.cuh
rename to src/native/cuda/iluvatar/caster.cuh
index c2ff06774..a75369570 100644
--- a/src/cuda/iluvatar/caster.cuh
+++ b/src/native/cuda/iluvatar/caster.cuh
@@ -1,8 +1,8 @@
 #ifndef INFINI_OPS_ILUVATAR_CASTER__H_
 #define INFINI_OPS_ILUVATAR_CASTER__H_
 
-#include "cuda/caster.cuh"
-#include "cuda/iluvatar/data_type_.h"
+#include "native/cuda/caster.cuh"
+#include "native/cuda/iluvatar/data_type_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/iluvatar/data_type_.h b/src/native/cuda/iluvatar/data_type_.h
similarity index 92%
rename from src/cuda/iluvatar/data_type_.h
rename to src/native/cuda/iluvatar/data_type_.h
index 533cc3364..ac997f992 100644
--- a/src/cuda/iluvatar/data_type_.h
+++ b/src/native/cuda/iluvatar/data_type_.h
@@ -6,8 +6,8 @@
 #include <cuda_fp16.h>
 // clang-format on
 
-#include "cuda/iluvatar/device_.h"
 #include "data_type.h"
+#include "native/cuda/iluvatar/device_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/iluvatar/device_.h b/src/native/cuda/iluvatar/device_.h
similarity index 100%
rename from src/cuda/iluvatar/device_.h
rename to src/native/cuda/iluvatar/device_.h
diff --git a/src/cuda/iluvatar/device_property.h b/src/native/cuda/iluvatar/device_property.h
similarity index 100%
rename from src/cuda/iluvatar/device_property.h
rename to src/native/cuda/iluvatar/device_property.h
diff --git a/src/cuda/iluvatar/add/kernel.h b/src/native/cuda/iluvatar/ops/add/kernel.h
similarity index 73%
rename from src/cuda/iluvatar/add/kernel.h
rename to src/native/cuda/iluvatar/ops/add/kernel.h
index 365ec9e2e..f41aa6be4 100644
--- a/src/cuda/iluvatar/add/kernel.h
+++ b/src/native/cuda/iluvatar/ops/add/kernel.h
@@ -3,9 +3,9 @@
 
 #include <utility>
 
-#include "cuda/add/kernel.h"
-#include "cuda/iluvatar/caster.cuh"
-#include "cuda/iluvatar/runtime_.h"
+#include "native/cuda/iluvatar/caster.cuh"
+#include "native/cuda/iluvatar/runtime_.h"
+#include "native/cuda/ops/add/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/iluvatar/causal_softmax/kernel.h b/src/native/cuda/iluvatar/ops/causal_softmax/kernel.h
similarity index 75%
rename from src/cuda/iluvatar/causal_softmax/kernel.h
rename to src/native/cuda/iluvatar/ops/causal_softmax/kernel.h
index a66fa2644..8c35c87ce 100644
--- a/src/cuda/iluvatar/causal_softmax/kernel.h
+++ b/src/native/cuda/iluvatar/ops/causal_softmax/kernel.h
@@ -3,9 +3,9 @@
 
 #include <utility>
 
-#include "cuda/causal_softmax/kernel.h"
-#include "cuda/iluvatar/caster.cuh"
-#include "cuda/iluvatar/runtime_.h"
+#include "native/cuda/iluvatar/caster.cuh"
+#include "native/cuda/iluvatar/runtime_.h"
+#include "native/cuda/ops/causal_softmax/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/iluvatar/gemm/cublas.h b/src/native/cuda/iluvatar/ops/gemm/cublas.h
similarity index 81%
rename from src/cuda/iluvatar/gemm/cublas.h
rename to src/native/cuda/iluvatar/ops/gemm/cublas.h
index 4575e5b15..d086cf4f4 100644
--- a/src/cuda/iluvatar/gemm/cublas.h
+++ b/src/native/cuda/iluvatar/ops/gemm/cublas.h
@@ -1,8 +1,8 @@
 #ifndef INFINI_OPS_ILUVATAR_GEMM_CUBLAS_H_
 #define INFINI_OPS_ILUVATAR_GEMM_CUBLAS_H_
 
-#include "cuda/gemm/blas.h"
-#include "cuda/iluvatar/blas.h"
+#include "native/cuda/iluvatar/blas.h"
+#include "native/cuda/ops/gemm/blas.h"
 
 namespace infini::ops {
 
diff --git a/src/native/cuda/iluvatar/ops/linear/kernel.h b/src/native/cuda/iluvatar/ops/linear/kernel.h
new file mode 100644
index 000000000..c3976144a
--- /dev/null
+++ b/src/native/cuda/iluvatar/ops/linear/kernel.h
@@ -0,0 +1,19 @@
+#ifndef INFINI_OPS_ILUVATAR_LINEAR_KERNEL_H_
+#define INFINI_OPS_ILUVATAR_LINEAR_KERNEL_H_
+
+#include "native/cuda/iluvatar/ops/add/kernel.h"
+#include "native/cuda/iluvatar/ops/gemm/cublas.h"
+#include "native/cuda/ops/linear/kernel.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Linear, Device::Type::kIluvatar>
+    : public CudaLinear<Device::Type::kIluvatar> {
+ public:
+  using CudaLinear<Device::Type::kIluvatar>::CudaLinear;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/iluvatar/rms_norm/kernel.h b/src/native/cuda/iluvatar/ops/rms_norm/kernel.h
similarity index 74%
rename from src/cuda/iluvatar/rms_norm/kernel.h
rename to src/native/cuda/iluvatar/ops/rms_norm/kernel.h
index ad9a0a1f4..3230fe8b0 100644
--- a/src/cuda/iluvatar/rms_norm/kernel.h
+++ b/src/native/cuda/iluvatar/ops/rms_norm/kernel.h
@@ -3,9 +3,9 @@
 
 #include <utility>
 
-#include "cuda/iluvatar/caster.cuh"
-#include "cuda/iluvatar/runtime_.h"
-#include "cuda/rms_norm/kernel.h"
+#include "native/cuda/iluvatar/caster.cuh"
+#include "native/cuda/iluvatar/runtime_.h"
+#include "native/cuda/ops/rms_norm/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/iluvatar/swiglu/kernel.h b/src/native/cuda/iluvatar/ops/swiglu/kernel.h
similarity index 74%
rename from src/cuda/iluvatar/swiglu/kernel.h
rename to src/native/cuda/iluvatar/ops/swiglu/kernel.h
index b28521f67..b03ef19c9 100644
--- a/src/cuda/iluvatar/swiglu/kernel.h
+++ b/src/native/cuda/iluvatar/ops/swiglu/kernel.h
@@ -3,9 +3,9 @@
 
 #include <utility>
 
-#include "cuda/iluvatar/caster.cuh"
-#include "cuda/iluvatar/runtime_.h"
-#include "cuda/swiglu/kernel.h"
+#include "native/cuda/iluvatar/caster.cuh"
+#include "native/cuda/iluvatar/runtime_.h"
+#include "native/cuda/ops/swiglu/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/iluvatar/runtime_.h b/src/native/cuda/iluvatar/runtime_.h
similarity index 87%
rename from src/cuda/iluvatar/runtime_.h
rename to src/native/cuda/iluvatar/runtime_.h
index acd41f868..1e41b68e9 100644
--- a/src/cuda/iluvatar/runtime_.h
+++ b/src/native/cuda/iluvatar/runtime_.h
@@ -7,9 +7,9 @@
 #include <cuda_runtime.h>
 // clang-format on
 
-#include "cuda/iluvatar/device_.h"
-#include "cuda/iluvatar/runtime_utils.h"
-#include "cuda/runtime.h"
+#include "native/cuda/iluvatar/device_.h"
+#include "native/cuda/iluvatar/runtime_utils.h"
+#include "native/cuda/runtime_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/iluvatar/runtime_utils.h b/src/native/cuda/iluvatar/runtime_utils.h
similarity index 74%
rename from src/cuda/iluvatar/runtime_utils.h
rename to src/native/cuda/iluvatar/runtime_utils.h
index be52981cc..df88ed3e5 100644
--- a/src/cuda/iluvatar/runtime_utils.h
+++ b/src/native/cuda/iluvatar/runtime_utils.h
@@ -1,8 +1,8 @@
 #ifndef INFINI_OPS_ILUVATAR_RUNTIME_UTILS_H_
 #define INFINI_OPS_ILUVATAR_RUNTIME_UTILS_H_
 
-#include "cuda/iluvatar/device_property.h"
-#include "cuda/runtime_utils.h"
+#include "native/cuda/iluvatar/device_property.h"
+#include "native/cuda/runtime_utils.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/kernel_commons.cuh b/src/native/cuda/kernel_commons.cuh
similarity index 100%
rename from src/cuda/kernel_commons.cuh
rename to src/native/cuda/kernel_commons.cuh
diff --git a/src/cuda/metax/blas.h b/src/native/cuda/metax/blas.h
similarity index 91%
rename from src/cuda/metax/blas.h
rename to src/native/cuda/metax/blas.h
index 9c0dd2d05..68e5183e1 100644
--- a/src/cuda/metax/blas.h
+++ b/src/native/cuda/metax/blas.h
@@ -7,10 +7,10 @@
 #include <mcblas/mcblas.h>
 // clang-format on
 
-#include "cuda/blas.h"
-#include "cuda/metax/blas_utils.h"
-#include "cuda/metax/runtime_.h"
 #include "data_type.h"
+#include "native/cuda/blas.h"
+#include "native/cuda/metax/blas_utils.h"
+#include "native/cuda/metax/runtime_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/metax/blas_utils.h b/src/native/cuda/metax/blas_utils.h
similarity index 94%
rename from src/cuda/metax/blas_utils.h
rename to src/native/cuda/metax/blas_utils.h
index a84fd3cf8..6095cd70b 100644
--- a/src/cuda/metax/blas_utils.h
+++ b/src/native/cuda/metax/blas_utils.h
@@ -5,8 +5,8 @@
 #include <mcblas/mcblas.h>
 // clang-format on
 
-#include "cuda/blas_utils.h"
 #include "data_type.h"
+#include "native/cuda/blas_utils.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/metax/caster.cuh b/src/native/cuda/metax/caster.cuh
similarity index 96%
rename from src/cuda/metax/caster.cuh
rename to src/native/cuda/metax/caster.cuh
index dc9c808df..6ad565870 100644
--- a/src/cuda/metax/caster.cuh
+++ b/src/native/cuda/metax/caster.cuh
@@ -1,8 +1,8 @@
 #ifndef INFINI_OPS_METAX_CASTER__H_
 #define INFINI_OPS_METAX_CASTER__H_
 
-#include "cuda/caster.cuh"
-#include "cuda/metax/data_type_.h"
+#include "native/cuda/caster.cuh"
+#include "native/cuda/metax/data_type_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/metax/data_type_.h b/src/native/cuda/metax/data_type_.h
similarity index 93%
rename from src/cuda/metax/data_type_.h
rename to src/native/cuda/metax/data_type_.h
index f14620267..73e498774 100644
--- a/src/cuda/metax/data_type_.h
+++ b/src/native/cuda/metax/data_type_.h
@@ -5,8 +5,8 @@
 #include <common/maca_fp16.h>
 #include <mcr/mc_runtime.h>
 
-#include "cuda/metax/device_.h"
 #include "data_type.h"
+#include "native/cuda/metax/device_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/metax/device_.h b/src/native/cuda/metax/device_.h
similarity index 100%
rename from src/cuda/metax/device_.h
rename to src/native/cuda/metax/device_.h
diff --git a/src/cuda/metax/device_property.h b/src/native/cuda/metax/device_property.h
similarity index 100%
rename from src/cuda/metax/device_property.h
rename to src/native/cuda/metax/device_property.h
diff --git a/src/cuda/metax/add/kernel.h b/src/native/cuda/metax/ops/add/kernel.h
similarity index 73%
rename from src/cuda/metax/add/kernel.h
rename to src/native/cuda/metax/ops/add/kernel.h
index e0c177426..7059bb7c1 100644
--- a/src/cuda/metax/add/kernel.h
+++ b/src/native/cuda/metax/ops/add/kernel.h
@@ -3,9 +3,9 @@
 
 #include <utility>
 
-#include "cuda/add/kernel.h"
-#include "cuda/metax/caster.cuh"
-#include "cuda/metax/runtime_.h"
+#include "native/cuda/metax/caster.cuh"
+#include "native/cuda/metax/runtime_.h"
+#include "native/cuda/ops/add/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/metax/causal_softmax/kernel.h b/src/native/cuda/metax/ops/causal_softmax/kernel.h
similarity index 75%
rename from src/cuda/metax/causal_softmax/kernel.h
rename to src/native/cuda/metax/ops/causal_softmax/kernel.h
index 55a54f632..426465984 100644
--- a/src/cuda/metax/causal_softmax/kernel.h
+++ b/src/native/cuda/metax/ops/causal_softmax/kernel.h
@@ -3,9 +3,9 @@
 
 #include <utility>
 
-#include "cuda/causal_softmax/kernel.h"
-#include "cuda/metax/caster.cuh"
-#include "cuda/metax/runtime_.h"
+#include "native/cuda/metax/caster.cuh"
+#include "native/cuda/metax/runtime_.h"
+#include "native/cuda/ops/causal_softmax/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/metax/gemm/mcblas.h b/src/native/cuda/metax/ops/gemm/mcblas.h
similarity index 80%
rename from src/cuda/metax/gemm/mcblas.h
rename to src/native/cuda/metax/ops/gemm/mcblas.h
index df1eac51a..c1ae08d08 100644
--- a/src/cuda/metax/gemm/mcblas.h
+++ b/src/native/cuda/metax/ops/gemm/mcblas.h
@@ -1,8 +1,8 @@
 #ifndef INFINI_OPS_METAX_GEMM_MCBLAS_H_
 #define INFINI_OPS_METAX_GEMM_MCBLAS_H_
 
-#include "cuda/gemm/blas.h"
-#include "cuda/metax/blas.h"
+#include "native/cuda/metax/blas.h"
+#include "native/cuda/ops/gemm/blas.h"
 
 namespace infini::ops {
 
diff --git a/src/native/cuda/metax/ops/linear/kernel.h b/src/native/cuda/metax/ops/linear/kernel.h
new file mode 100644
index 000000000..0f0a414fa
--- /dev/null
+++ b/src/native/cuda/metax/ops/linear/kernel.h
@@ -0,0 +1,19 @@
+#ifndef INFINI_OPS_METAX_LINEAR_KERNEL_H_
+#define INFINI_OPS_METAX_LINEAR_KERNEL_H_
+
+#include "native/cuda/metax/ops/add/kernel.h"
+#include "native/cuda/metax/ops/gemm/mcblas.h"
+#include "native/cuda/ops/linear/kernel.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Linear, Device::Type::kMetax>
+    : public CudaLinear<Device::Type::kMetax> {
+ public:
+  using CudaLinear<Device::Type::kMetax>::CudaLinear;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/metax/rms_norm/kernel.h b/src/native/cuda/metax/ops/rms_norm/kernel.h
similarity index 74%
rename from src/cuda/metax/rms_norm/kernel.h
rename to src/native/cuda/metax/ops/rms_norm/kernel.h
index 38b29f4a9..38d84c830 100644
--- a/src/cuda/metax/rms_norm/kernel.h
+++ b/src/native/cuda/metax/ops/rms_norm/kernel.h
@@ -3,9 +3,9 @@
 
 #include <utility>
 
-#include "cuda/metax/caster.cuh"
-#include "cuda/metax/runtime_.h"
-#include "cuda/rms_norm/kernel.h"
+#include "native/cuda/metax/caster.cuh"
+#include "native/cuda/metax/runtime_.h"
+#include "native/cuda/ops/rms_norm/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/metax/swiglu/kernel.h b/src/native/cuda/metax/ops/swiglu/kernel.h
similarity index 74%
rename from src/cuda/metax/swiglu/kernel.h
rename to src/native/cuda/metax/ops/swiglu/kernel.h
index a4f9359d7..0bcfd3198 100644
--- a/src/cuda/metax/swiglu/kernel.h
+++ b/src/native/cuda/metax/ops/swiglu/kernel.h
@@ -3,9 +3,9 @@
 
 #include <utility>
 
-#include "cuda/metax/caster.cuh"
-#include "cuda/metax/runtime_.h"
-#include "cuda/swiglu/kernel.h"
+#include "native/cuda/metax/caster.cuh"
+#include "native/cuda/metax/runtime_.h"
+#include "native/cuda/ops/swiglu/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/metax/runtime_.h b/src/native/cuda/metax/runtime_.h
similarity index 86%
rename from src/cuda/metax/runtime_.h
rename to src/native/cuda/metax/runtime_.h
index 576e0fb38..6563b6b41 100644
--- a/src/cuda/metax/runtime_.h
+++ b/src/native/cuda/metax/runtime_.h
@@ -3,9 +3,9 @@
 
 #include <mcr/mc_runtime.h>
 
-#include "cuda/metax/device_.h"
-#include "cuda/metax/runtime_utils.h"
-#include "cuda/runtime.h"
+#include "native/cuda/metax/device_.h"
+#include "native/cuda/metax/runtime_utils.h"
+#include "native/cuda/runtime_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/metax/runtime_utils.h b/src/native/cuda/metax/runtime_utils.h
similarity index 74%
rename from src/cuda/metax/runtime_utils.h
rename to src/native/cuda/metax/runtime_utils.h
index 3bd9eafef..94100549b 100644
--- a/src/cuda/metax/runtime_utils.h
+++ b/src/native/cuda/metax/runtime_utils.h
@@ -1,8 +1,8 @@
 #ifndef INFINI_OPS_METAX_RUNTIME_UTILS_H_
 #define INFINI_OPS_METAX_RUNTIME_UTILS_H_
 
-#include "cuda/metax/device_property.h"
-#include "cuda/runtime_utils.h"
+#include "native/cuda/metax/device_property.h"
+#include "native/cuda/runtime_utils.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/moore/blas.h b/src/native/cuda/moore/blas.h
similarity index 89%
rename from src/cuda/moore/blas.h
rename to src/native/cuda/moore/blas.h
index 91a86e507..b2531a1ca 100644
--- a/src/cuda/moore/blas.h
+++ b/src/native/cuda/moore/blas.h
@@ -5,10 +5,10 @@
 
 #include <utility>
 
-#include "cuda/blas.h"
-#include "cuda/moore/blas_utils.h"
-#include "cuda/moore/runtime_.h"
 #include "data_type.h"
+#include "native/cuda/blas.h"
+#include "native/cuda/moore/blas_utils.h"
+#include "native/cuda/moore/runtime_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/moore/blas_utils.h b/src/native/cuda/moore/blas_utils.h
similarity index 94%
rename from src/cuda/moore/blas_utils.h
rename to src/native/cuda/moore/blas_utils.h
index b3a08d956..f897936ed 100644
--- a/src/cuda/moore/blas_utils.h
+++ b/src/native/cuda/moore/blas_utils.h
@@ -3,8 +3,8 @@
 
 #include <mublas.h>
 
-#include "cuda/blas_utils.h"
 #include "data_type.h"
+#include "native/cuda/blas_utils.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/moore/caster.cuh b/src/native/cuda/moore/caster.cuh
similarity index 95%
rename from src/cuda/moore/caster.cuh
rename to src/native/cuda/moore/caster.cuh
index 9b45cd046..48154980d 100644
--- a/src/cuda/moore/caster.cuh
+++ b/src/native/cuda/moore/caster.cuh
@@ -1,8 +1,8 @@
 #ifndef INFINI_OPS_MOORE_CASTER__H_
 #define INFINI_OPS_MOORE_CASTER__H_
 
-#include "cuda/caster.cuh"
-#include "cuda/moore/data_type_.h"
+#include "native/cuda/caster.cuh"
+#include "native/cuda/moore/data_type_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/moore/data_type_.h b/src/native/cuda/moore/data_type_.h
similarity index 92%
rename from src/cuda/moore/data_type_.h
rename to src/native/cuda/moore/data_type_.h
index bec9e473a..c97cb33dd 100644
--- a/src/cuda/moore/data_type_.h
+++ b/src/native/cuda/moore/data_type_.h
@@ -4,8 +4,8 @@
 #include <musa_bf16.h>
 #include <musa_fp16.h>
 
-#include "cuda/moore/device_.h"
 #include "data_type.h"
+#include "native/cuda/moore/device_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/moore/device_.h b/src/native/cuda/moore/device_.h
similarity index 100%
rename from src/cuda/moore/device_.h
rename to src/native/cuda/moore/device_.h
diff --git a/src/cuda/moore/device_property.h b/src/native/cuda/moore/device_property.h
similarity index 100%
rename from src/cuda/moore/device_property.h
rename to src/native/cuda/moore/device_property.h
diff --git a/src/cuda/moore/add/kernel.h b/src/native/cuda/moore/ops/add/kernel.h
similarity index 64%
rename from src/cuda/moore/add/kernel.h
rename to src/native/cuda/moore/ops/add/kernel.h
index 2a1590dc3..abcea37c5 100644
--- a/src/cuda/moore/add/kernel.h
+++ b/src/native/cuda/moore/ops/add/kernel.h
@@ -4,13 +4,13 @@
 #include <utility>
 
 // clang-format off
-#include "cuda/moore/polyfills.cuh"
+#include "native/cuda/moore/polyfills.cuh"
 // clang-format on
 
-#include "cuda/add/kernel.h"
-#include "cuda/moore/caster.cuh"
-#include "cuda/moore/polyfills.cuh"
-#include "cuda/moore/runtime_.h"
+#include "native/cuda/moore/caster.cuh"
+#include "native/cuda/moore/polyfills.cuh"
+#include "native/cuda/moore/runtime_.h"
+#include "native/cuda/ops/add/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/moore/causal_softmax/kernel.h b/src/native/cuda/moore/ops/causal_softmax/kernel.h
similarity index 79%
rename from src/cuda/moore/causal_softmax/kernel.h
rename to src/native/cuda/moore/ops/causal_softmax/kernel.h
index 0f5de1b95..e13118763 100644
--- a/src/cuda/moore/causal_softmax/kernel.h
+++ b/src/native/cuda/moore/ops/causal_softmax/kernel.h
@@ -6,12 +6,12 @@
 // clang-format on
 
 // clang-format off
-#include "cuda/moore/device_.h"
+#include "native/cuda/moore/device_.h"
 // clang-format on
 
-#include "cuda/causal_softmax/kernel.h"
-#include "cuda/moore/caster.cuh"
-#include "cuda/moore/runtime_.h"
+#include "native/cuda/moore/caster.cuh"
+#include "native/cuda/moore/runtime_.h"
+#include "native/cuda/ops/causal_softmax/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/moore/gemm/mublas.h b/src/native/cuda/moore/ops/gemm/mublas.h
similarity index 92%
rename from src/cuda/moore/gemm/mublas.h
rename to src/native/cuda/moore/ops/gemm/mublas.h
index 2aadb68e8..a34cba523 100644
--- a/src/cuda/moore/gemm/mublas.h
+++ b/src/native/cuda/moore/ops/gemm/mublas.h
@@ -1,8 +1,8 @@
 #ifndef INFINI_OPS_MOORE_GEMM_MUBLAS_H_
 #define INFINI_OPS_MOORE_GEMM_MUBLAS_H_
 
-#include "cuda/gemm/blas.h"
-#include "cuda/moore/blas.h"
+#include "native/cuda/moore/blas.h"
+#include "native/cuda/ops/gemm/blas.h"
 
 namespace infini::ops {
 
diff --git a/src/native/cuda/moore/ops/linear/kernel.h b/src/native/cuda/moore/ops/linear/kernel.h
new file mode 100644
index 000000000..76a217bdf
--- /dev/null
+++ b/src/native/cuda/moore/ops/linear/kernel.h
@@ -0,0 +1,19 @@
+#ifndef INFINI_OPS_MOORE_LINEAR_KERNEL_H_
+#define INFINI_OPS_MOORE_LINEAR_KERNEL_H_
+
+#include "native/cuda/moore/ops/add/kernel.h"
+#include "native/cuda/moore/ops/gemm/mublas.h"
+#include "native/cuda/ops/linear/kernel.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Linear, Device::Type::kMoore>
+    : public CudaLinear<Device::Type::kMoore> {
+ public:
+  using CudaLinear<Device::Type::kMoore>::CudaLinear;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/moore/rms_norm/kernel.h b/src/native/cuda/moore/ops/rms_norm/kernel.h
similarity index 77%
rename from src/cuda/moore/rms_norm/kernel.h
rename to src/native/cuda/moore/ops/rms_norm/kernel.h
index 5cdb1e58e..0f160017d 100644
--- a/src/cuda/moore/rms_norm/kernel.h
+++ b/src/native/cuda/moore/ops/rms_norm/kernel.h
@@ -7,9 +7,9 @@
 #include <musa_runtime.h>
 // clang-format on
 
-#include "cuda/moore/caster.cuh"
-#include "cuda/moore/runtime_.h"
-#include "cuda/rms_norm/kernel.h"
+#include "native/cuda/moore/caster.cuh"
+#include "native/cuda/moore/runtime_.h"
+#include "native/cuda/ops/rms_norm/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/moore/swiglu/kernel.h b/src/native/cuda/moore/ops/swiglu/kernel.h
similarity index 65%
rename from src/cuda/moore/swiglu/kernel.h
rename to src/native/cuda/moore/ops/swiglu/kernel.h
index daa72074f..ac54bff50 100644
--- a/src/cuda/moore/swiglu/kernel.h
+++ b/src/native/cuda/moore/ops/swiglu/kernel.h
@@ -4,13 +4,13 @@
 #include <utility>
 
 // clang-format off
-#include "cuda/moore/polyfills.cuh"
+#include "native/cuda/moore/polyfills.cuh"
 // clang-format on
 
-#include "cuda/moore/caster.cuh"
-#include "cuda/moore/polyfills.cuh"
-#include "cuda/moore/runtime_.h"
-#include "cuda/swiglu/kernel.h"
+#include "native/cuda/moore/caster.cuh"
+#include "native/cuda/moore/polyfills.cuh"
+#include "native/cuda/moore/runtime_.h"
+#include "native/cuda/ops/swiglu/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/moore/polyfills.cuh b/src/native/cuda/moore/polyfills.cuh
similarity index 100%
rename from src/cuda/moore/polyfills.cuh
rename to src/native/cuda/moore/polyfills.cuh
diff --git a/src/cuda/moore/runtime_.h b/src/native/cuda/moore/runtime_.h
similarity index 89%
rename from src/cuda/moore/runtime_.h
rename to src/native/cuda/moore/runtime_.h
index dd00c3d12..bc519b41e 100644
--- a/src/cuda/moore/runtime_.h
+++ b/src/native/cuda/moore/runtime_.h
@@ -5,9 +5,9 @@
 
 #include <utility>
 
-#include "cuda/moore/device_.h"
-#include "cuda/moore/runtime_utils.h"
-#include "cuda/runtime.h"
+#include "native/cuda/moore/device_.h"
+#include "native/cuda/moore/runtime_utils.h"
+#include "native/cuda/runtime_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/moore/runtime_utils.h b/src/native/cuda/moore/runtime_utils.h
similarity index 74%
rename from src/cuda/moore/runtime_utils.h
rename to src/native/cuda/moore/runtime_utils.h
index 92d2dd926..4c5ae3f6c 100644
--- a/src/cuda/moore/runtime_utils.h
+++ b/src/native/cuda/moore/runtime_utils.h
@@ -1,8 +1,8 @@
 #ifndef INFINI_OPS_MOORE_RUNTIME_UTILS_H_
 #define INFINI_OPS_MOORE_RUNTIME_UTILS_H_
 
-#include "cuda/moore/device_property.h"
-#include "cuda/runtime_utils.h"
+#include "native/cuda/moore/device_property.h"
+#include "native/cuda/runtime_utils.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/nvidia/blas.h b/src/native/cuda/nvidia/blas.h
similarity index 90%
rename from src/cuda/nvidia/blas.h
rename to src/native/cuda/nvidia/blas.h
index 7319e86fd..7cdfd2c62 100644
--- a/src/cuda/nvidia/blas.h
+++ b/src/native/cuda/nvidia/blas.h
@@ -7,10 +7,10 @@
 #include "cublas_v2.h"
 // clang-format on
 
-#include "cuda/blas.h"
-#include "cuda/nvidia/blas_utils.h"
-#include "cuda/nvidia/runtime_.h"
 #include "data_type.h"
+#include "native/cuda/blas.h"
+#include "native/cuda/nvidia/blas_utils.h"
+#include "native/cuda/nvidia/runtime_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/nvidia/blas_utils.h b/src/native/cuda/nvidia/blas_utils.h
similarity index 94%
rename from src/cuda/nvidia/blas_utils.h
rename to src/native/cuda/nvidia/blas_utils.h
index c9c81e974..45644ae38 100644
--- a/src/cuda/nvidia/blas_utils.h
+++ b/src/native/cuda/nvidia/blas_utils.h
@@ -5,8 +5,8 @@
 #include "cublas_v2.h"
 // clang-format on
 
-#include "cuda/blas_utils.h"
 #include "data_type.h"
+#include "native/cuda/blas_utils.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/nvidia/caster.cuh b/src/native/cuda/nvidia/caster.cuh
similarity index 96%
rename from src/cuda/nvidia/caster.cuh
rename to src/native/cuda/nvidia/caster.cuh
index a19740813..b8a366e26 100644
--- a/src/cuda/nvidia/caster.cuh
+++ b/src/native/cuda/nvidia/caster.cuh
@@ -1,8 +1,8 @@
 #ifndef INFINI_OPS_NVIDIA_CASTER__H_
 #define INFINI_OPS_NVIDIA_CASTER__H_
 
-#include "cuda/caster.cuh"
-#include "cuda/nvidia/data_type_.h"
+#include "native/cuda/caster.cuh"
+#include "native/cuda/nvidia/data_type_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/nvidia/data_type_.h b/src/native/cuda/nvidia/data_type_.h
similarity index 92%
rename from src/cuda/nvidia/data_type_.h
rename to src/native/cuda/nvidia/data_type_.h
index 80424ba01..1266c0c8a 100644
--- a/src/cuda/nvidia/data_type_.h
+++ b/src/native/cuda/nvidia/data_type_.h
@@ -6,8 +6,8 @@
 #include <cuda_fp16.h>
 // clang-format on
 
-#include "cuda/nvidia/device_.h"
 #include "data_type.h"
+#include "native/cuda/nvidia/device_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/nvidia/device_.h b/src/native/cuda/nvidia/device_.h
similarity index 100%
rename from src/cuda/nvidia/device_.h
rename to src/native/cuda/nvidia/device_.h
diff --git a/src/cuda/nvidia/device_property.h b/src/native/cuda/nvidia/device_property.h
similarity index 100%
rename from src/cuda/nvidia/device_property.h
rename to src/native/cuda/nvidia/device_property.h
diff --git a/src/cuda/nvidia/add/kernel.h b/src/native/cuda/nvidia/ops/add/kernel.h
similarity index 73%
rename from src/cuda/nvidia/add/kernel.h
rename to src/native/cuda/nvidia/ops/add/kernel.h
index 56fc608cb..373f5a775 100644
--- a/src/cuda/nvidia/add/kernel.h
+++ b/src/native/cuda/nvidia/ops/add/kernel.h
@@ -3,9 +3,9 @@
 
 #include <utility>
 
-#include "cuda/add/kernel.h"
-#include "cuda/nvidia/caster.cuh"
-#include "cuda/nvidia/runtime_.h"
+#include "native/cuda/nvidia/caster.cuh"
+#include "native/cuda/nvidia/runtime_.h"
+#include "native/cuda/ops/add/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/nvidia/causal_softmax/kernel.h b/src/native/cuda/nvidia/ops/causal_softmax/kernel.h
similarity index 75%
rename from src/cuda/nvidia/causal_softmax/kernel.h
rename to src/native/cuda/nvidia/ops/causal_softmax/kernel.h
index 88c71fb49..5a7e18a94 100644
--- a/src/cuda/nvidia/causal_softmax/kernel.h
+++ b/src/native/cuda/nvidia/ops/causal_softmax/kernel.h
@@ -3,9 +3,9 @@
 
 #include <utility>
 
-#include "cuda/causal_softmax/kernel.h"
-#include "cuda/nvidia/caster.cuh"
-#include "cuda/nvidia/runtime_.h"
+#include "native/cuda/nvidia/caster.cuh"
+#include "native/cuda/nvidia/runtime_.h"
+#include "native/cuda/ops/causal_softmax/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/nvidia/gemm/cublas.h b/src/native/cuda/nvidia/ops/gemm/cublas.h
similarity index 81%
rename from src/cuda/nvidia/gemm/cublas.h
rename to src/native/cuda/nvidia/ops/gemm/cublas.h
index e5b2e37b9..eefe0b7af 100644
--- a/src/cuda/nvidia/gemm/cublas.h
+++ b/src/native/cuda/nvidia/ops/gemm/cublas.h
@@ -1,8 +1,8 @@
 #ifndef INFINI_OPS_NVIDIA_GEMM_CUBLAS_H_
 #define INFINI_OPS_NVIDIA_GEMM_CUBLAS_H_
 
-#include "cuda/gemm/blas.h"
-#include "cuda/nvidia/blas.h"
+#include "native/cuda/nvidia/blas.h"
+#include "native/cuda/ops/gemm/blas.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/nvidia/gemm/cublaslt.h b/src/native/cuda/nvidia/ops/gemm/cublaslt.h
similarity index 98%
rename from src/cuda/nvidia/gemm/cublaslt.h
rename to src/native/cuda/nvidia/ops/gemm/cublaslt.h
index c01a7528c..dee5d9a2e 100644
--- a/src/cuda/nvidia/gemm/cublaslt.h
+++ b/src/native/cuda/nvidia/ops/gemm/cublaslt.h
@@ -9,8 +9,8 @@
 // clang-format on
 
 #include "base/gemm.h"
-#include "cuda/nvidia/blas_utils.h"
-#include "cuda/nvidia/runtime_.h"
+#include "native/cuda/nvidia/blas_utils.h"
+#include "native/cuda/nvidia/runtime_.h"
 
 namespace infini::ops {
 
diff --git a/src/native/cuda/nvidia/ops/linear/kernel.h b/src/native/cuda/nvidia/ops/linear/kernel.h
new file mode 100644
index 000000000..be07883f4
--- /dev/null
+++ b/src/native/cuda/nvidia/ops/linear/kernel.h
@@ -0,0 +1,19 @@
+#ifndef INFINI_OPS_NVIDIA_LINEAR_KERNEL_H_
+#define INFINI_OPS_NVIDIA_LINEAR_KERNEL_H_
+
+#include "native/cuda/nvidia/ops/add/kernel.h"
+#include "native/cuda/nvidia/ops/gemm/cublas.h"
+#include "native/cuda/ops/linear/kernel.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Linear, Device::Type::kNvidia>
+    : public CudaLinear<Device::Type::kNvidia> {
+ public:
+  using CudaLinear<Device::Type::kNvidia>::CudaLinear;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/nvidia/rms_norm/kernel.h b/src/native/cuda/nvidia/ops/rms_norm/kernel.h
similarity index 74%
rename from src/cuda/nvidia/rms_norm/kernel.h
rename to src/native/cuda/nvidia/ops/rms_norm/kernel.h
index bf25d9ac8..74cfab17a 100644
--- a/src/cuda/nvidia/rms_norm/kernel.h
+++ b/src/native/cuda/nvidia/ops/rms_norm/kernel.h
@@ -3,9 +3,9 @@
 
 #include <utility>
 
-#include "cuda/nvidia/caster.cuh"
-#include "cuda/nvidia/runtime_.h"
-#include "cuda/rms_norm/kernel.h"
+#include "native/cuda/nvidia/caster.cuh"
+#include "native/cuda/nvidia/runtime_.h"
+#include "native/cuda/ops/rms_norm/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/nvidia/swiglu/kernel.h b/src/native/cuda/nvidia/ops/swiglu/kernel.h
similarity index 74%
rename from src/cuda/nvidia/swiglu/kernel.h
rename to src/native/cuda/nvidia/ops/swiglu/kernel.h
index dba82110c..fd58f29ef 100644
--- a/src/cuda/nvidia/swiglu/kernel.h
+++ b/src/native/cuda/nvidia/ops/swiglu/kernel.h
@@ -3,9 +3,9 @@
 
 #include <utility>
 
-#include "cuda/nvidia/caster.cuh"
-#include "cuda/nvidia/runtime_.h"
-#include "cuda/swiglu/kernel.h"
+#include "native/cuda/nvidia/caster.cuh"
+#include "native/cuda/nvidia/runtime_.h"
+#include "native/cuda/ops/swiglu/kernel.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/nvidia/runtime_.h b/src/native/cuda/nvidia/runtime_.h
similarity index 88%
rename from src/cuda/nvidia/runtime_.h
rename to src/native/cuda/nvidia/runtime_.h
index b03a9c4b6..326ecdb24 100644
--- a/src/cuda/nvidia/runtime_.h
+++ b/src/native/cuda/nvidia/runtime_.h
@@ -7,9 +7,9 @@
 #include <cuda_runtime.h>
 // clang-format on
 
-#include "cuda/nvidia/device_.h"
-#include "cuda/nvidia/runtime_utils.h"
-#include "cuda/runtime.h"
+#include "native/cuda/nvidia/device_.h"
+#include "native/cuda/nvidia/runtime_utils.h"
+#include "native/cuda/runtime_.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/nvidia/runtime_utils.h b/src/native/cuda/nvidia/runtime_utils.h
similarity index 74%
rename from src/cuda/nvidia/runtime_utils.h
rename to src/native/cuda/nvidia/runtime_utils.h
index 26bfbffb5..c881dabde 100644
--- a/src/cuda/nvidia/runtime_utils.h
+++ b/src/native/cuda/nvidia/runtime_utils.h
@@ -1,8 +1,8 @@
 #ifndef INFINI_OPS_NVIDIA_RUNTIME_UTILS_H_
 #define INFINI_OPS_NVIDIA_RUNTIME_UTILS_H_
 
-#include "cuda/nvidia/device_property.h"
-#include "cuda/runtime_utils.h"
+#include "native/cuda/nvidia/device_property.h"
+#include "native/cuda/runtime_utils.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/add/kernel.cuh b/src/native/cuda/ops/add/kernel.cuh
similarity index 97%
rename from src/cuda/add/kernel.cuh
rename to src/native/cuda/ops/add/kernel.cuh
index f4dcd3de8..ec11fa7ca 100644
--- a/src/cuda/add/kernel.cuh
+++ b/src/native/cuda/ops/add/kernel.cuh
@@ -1,7 +1,7 @@
 #ifndef INFINI_OPS_CUDA_ADD_KERNEL_CUH_
 #define INFINI_OPS_CUDA_ADD_KERNEL_CUH_
 
-#include "cuda/kernel_commons.cuh"
+#include "native/cuda/kernel_commons.cuh"
 
 namespace infini::ops {
 
diff --git a/src/cuda/add/kernel.h b/src/native/cuda/ops/add/kernel.h
similarity index 96%
rename from src/cuda/add/kernel.h
rename to src/native/cuda/ops/add/kernel.h
index 95d82c919..ac3e698d4 100644
--- a/src/cuda/add/kernel.h
+++ b/src/native/cuda/ops/add/kernel.h
@@ -8,9 +8,9 @@
 
 #include "base/add.h"
 #include "common/generic_utils.h"
-#include "cuda/add/kernel.cuh"
-#include "cuda/kernel_commons.cuh"
-#include "cuda/runtime_utils.h"
+#include "native/cuda/kernel_commons.cuh"
+#include "native/cuda/ops/add/kernel.cuh"
+#include "native/cuda/runtime_utils.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/causal_softmax/kernel.cuh b/src/native/cuda/ops/causal_softmax/kernel.cuh
similarity index 97%
rename from src/cuda/causal_softmax/kernel.cuh
rename to src/native/cuda/ops/causal_softmax/kernel.cuh
index 11b23b523..31f685e84 100644
--- a/src/cuda/causal_softmax/kernel.cuh
+++ b/src/native/cuda/ops/causal_softmax/kernel.cuh
@@ -5,8 +5,8 @@
 #include <cstddef>
 #include <cub/block/block_reduce.cuh>
 
-#include "cuda/caster.cuh"
-#include "cuda/kernel_commons.cuh"
+#include "native/cuda/caster.cuh"
+#include "native/cuda/kernel_commons.cuh"
 
 namespace infini::ops {
 
diff --git a/src/cuda/causal_softmax/kernel.h b/src/native/cuda/ops/causal_softmax/kernel.h
similarity index 93%
rename from src/cuda/causal_softmax/kernel.h
rename to src/native/cuda/ops/causal_softmax/kernel.h
index c060d834b..c77d97387 100644
--- a/src/cuda/causal_softmax/kernel.h
+++ b/src/native/cuda/ops/causal_softmax/kernel.h
@@ -6,11 +6,11 @@
 #include <cstdint>
 
 #include "base/causal_softmax.h"
-#include "cuda/causal_softmax/kernel.cuh"
-#include "cuda/kernel_commons.cuh"
-#include "cuda/runtime_utils.h"
 #include "data_type.h"
 #include "dispatcher.h"
+#include "native/cuda/kernel_commons.cuh"
+#include "native/cuda/ops/causal_softmax/kernel.cuh"
+#include "native/cuda/runtime_utils.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/gemm/blas.h b/src/native/cuda/ops/gemm/blas.h
similarity index 99%
rename from src/cuda/gemm/blas.h
rename to src/native/cuda/ops/gemm/blas.h
index 264d52c15..6bac0d6a6 100644
--- a/src/cuda/gemm/blas.h
+++ b/src/native/cuda/ops/gemm/blas.h
@@ -4,7 +4,7 @@
 #include <utility>
 
 #include "base/gemm.h"
-#include "cuda/blas_utils.h"
+#include "native/cuda/blas_utils.h"
 
 namespace infini::ops {
 
diff --git a/src/native/cuda/ops/linear/kernel.h b/src/native/cuda/ops/linear/kernel.h
new file mode 100644
index 000000000..655721d5d
--- /dev/null
+++ b/src/native/cuda/ops/linear/kernel.h
@@ -0,0 +1,72 @@
+#ifndef INFINI_OPS_CUDA_LINEAR_KERNEL_H_
+#define INFINI_OPS_CUDA_LINEAR_KERNEL_H_
+
+#include <cassert>
+#include <optional>
+
+#include "base/add.h"
+#include "base/gemm.h"
+#include "base/linear.h"
+
+namespace infini::ops {
+
+template <Device::Type kDev>
+class CudaLinear : public Linear {
+ public:
+  CudaLinear(const Tensor a, const Tensor b, std::optional<Tensor> bias,
+             bool trans_a, bool trans_b, Tensor out)
+      : Linear(a, b, bias, trans_a, trans_b, out),
+        gemm_(a, b, std::optional<float>{1.0f}, std::optional<float>{0.0f},
+              std::optional<int>{static_cast<int>(trans_a)},
+              std::optional<int>{static_cast<int>(trans_b)}, out) {
+    if (has_bias_) {
+      add_.emplace(out, BroadcastBias(*bias, out), out);
+    }
+  }
+
+  void operator()(const Tensor a, const Tensor b, std::optional<Tensor> bias,
+                  bool trans_a, bool trans_b, Tensor out) const override {
+    assert(has_bias_ == bias.has_value());
+
+    ConfigureSubOperator(gemm_);
+    gemm_(a, b, std::optional<float>{1.0f}, std::optional<float>{0.0f},
+          std::optional<int>{static_cast<int>(trans_a)},
+          std::optional<int>{static_cast<int>(trans_b)}, out);
+
+    if (has_bias_) {
+      auto bias_view = BroadcastBias(*bias, out);
+      ConfigureSubOperator(*add_);
+      (*add_)(out, bias_view, out);
+    }
+  }
+
+ private:
+  static Tensor BroadcastBias(const Tensor& bias, const Tensor& out) {
+    assert(bias.ndim() == 1);
+    assert(bias.size(0) == out.size(-1));
+
+    auto shape = out.shape();
+    Tensor::Strides strides(out.ndim(), 0);
+    strides.back() = bias.stride(0);
+
+    return Tensor{const_cast<void*>(bias.data()), shape, bias.dtype(),
+                  bias.device(), strides};
+  }
+
+  template <typename Op>
+  void ConfigureSubOperator(Op& op) const {
+    op.set_handle(handle_);
+    op.set_config(config_);
+    op.set_stream(stream_);
+    op.set_workspace(workspace_);
+    op.set_workspace_size_in_bytes(workspace_size_in_bytes_);
+  }
+
+  mutable Operator<Gemm, kDev> gemm_;
+
+  mutable std::optional<Operator<Add, kDev>> add_;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/rms_norm/kernel.cuh b/src/native/cuda/ops/rms_norm/kernel.cuh
similarity index 96%
rename from src/cuda/rms_norm/kernel.cuh
rename to src/native/cuda/ops/rms_norm/kernel.cuh
index 980f776ec..91036a37f 100644
--- a/src/cuda/rms_norm/kernel.cuh
+++ b/src/native/cuda/ops/rms_norm/kernel.cuh
@@ -5,8 +5,8 @@
 #include <cstdint>
 #include <cub/block/block_reduce.cuh>
 
-#include "cuda/caster.cuh"
-#include "cuda/kernel_commons.cuh"
+#include "native/cuda/caster.cuh"
+#include "native/cuda/kernel_commons.cuh"
 
 namespace infini::ops {
 
diff --git a/src/cuda/rms_norm/kernel.h b/src/native/cuda/ops/rms_norm/kernel.h
similarity index 93%
rename from src/cuda/rms_norm/kernel.h
rename to src/native/cuda/ops/rms_norm/kernel.h
index 14146edca..00d2a8c2d 100644
--- a/src/cuda/rms_norm/kernel.h
+++ b/src/native/cuda/ops/rms_norm/kernel.h
@@ -5,11 +5,11 @@
 #include <cstdint>
 
 #include "base/rms_norm.h"
-#include "cuda/kernel_commons.cuh"
-#include "cuda/rms_norm/kernel.cuh"
-#include "cuda/runtime_utils.h"
 #include "data_type.h"
 #include "dispatcher.h"
+#include "native/cuda/kernel_commons.cuh"
+#include "native/cuda/ops/rms_norm/kernel.cuh"
+#include "native/cuda/runtime_utils.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/swiglu/kernel.cuh b/src/native/cuda/ops/swiglu/kernel.cuh
similarity index 98%
rename from src/cuda/swiglu/kernel.cuh
rename to src/native/cuda/ops/swiglu/kernel.cuh
index 36b9f9752..a782b6f6b 100644
--- a/src/cuda/swiglu/kernel.cuh
+++ b/src/native/cuda/ops/swiglu/kernel.cuh
@@ -3,7 +3,7 @@
 
 #include <cmath>
 
-#include "cuda/kernel_commons.cuh"
+#include "native/cuda/kernel_commons.cuh"
 
 namespace infini::ops {
 
diff --git a/src/cuda/swiglu/kernel.h b/src/native/cuda/ops/swiglu/kernel.h
similarity index 97%
rename from src/cuda/swiglu/kernel.h
rename to src/native/cuda/ops/swiglu/kernel.h
index 5fcfe73bb..bfd19df48 100644
--- a/src/cuda/swiglu/kernel.h
+++ b/src/native/cuda/ops/swiglu/kernel.h
@@ -8,8 +8,8 @@
 
 #include "base/swiglu.h"
 #include "common/generic_utils.h"
-#include "cuda/runtime_utils.h"
-#include "cuda/swiglu/kernel.cuh"
+#include "native/cuda/ops/swiglu/kernel.cuh"
+#include "native/cuda/runtime_utils.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/runtime.h b/src/native/cuda/runtime_.h
similarity index 97%
rename from src/cuda/runtime.h
rename to src/native/cuda/runtime_.h
index fdc92cc9c..4e4e61404 100644
--- a/src/cuda/runtime.h
+++ b/src/native/cuda/runtime_.h
@@ -3,7 +3,7 @@
 
 #include <type_traits>
 
-#include "../runtime.h"
+#include "runtime.h"
 
 namespace infini::ops {
 
diff --git a/src/cuda/runtime_utils.h b/src/native/cuda/runtime_utils.h
similarity index 100%
rename from src/cuda/runtime_utils.h
rename to src/native/cuda/runtime_utils.h
diff --git a/src/operator.h b/src/operator.h
index d4609e055..83fc4ec2c 100644
--- a/src/operator.h
+++ b/src/operator.h
@@ -37,6 +37,14 @@ struct CacheKey {
     tensors.push_back(t);
   }
 
+  void Absorb(const std::vector<Tensor>& ts) {
+    HashCombine(hash, ts.size());
+    for (const auto& t : ts) {
+      HashCombine(hash, t);
+      tensors.push_back(t);
+    }
+  }
+
   template <typename T>
   void Absorb(const T& v) {
     HashCombine(hash, v);
@@ -121,7 +129,16 @@ class OperatorBase {
 template <typename Key, Device::Type device_type = Device::Type::kCount,
           std::size_t implementation_index = 0>
 class Operator : public OperatorBase {
+  // Generation counter for lazy cache invalidation.  Bumped by
+  // `clear_cache()`; the next `call()` detects the mismatch and
+  // destroys all cached operator instances.
+  static inline std::size_t cache_generation_{0};
+
  public:
+  // Invalidate the operator cache.  Cached operators are destroyed on the
+  // next `call()` invocation.  Intended for test isolation — production
+  // code should never call this.
+  static void clear_cache() { ++cache_generation_; }
   template <typename... Args>
   static auto Make(const Config& config, const Tensor tensor, Args&&... args) {
     std::unique_ptr<Operator> op_ptr;
@@ -166,6 +183,12 @@ class Operator : public OperatorBase {
   static auto Call(const Handle& handle, const Config& config, Args&&... args) {
     static std::unordered_map<detail::CacheKey, std::unique_ptr<Operator>>
         cache;
+    static std::size_t generation{0};
+
+    if (generation != cache_generation_) {
+      cache.clear();
+      generation = cache_generation_;
+    }
 
     auto key = detail::CacheKey::Build(config.implementation_index(), args...);
 
@@ -174,7 +197,7 @@ class Operator : public OperatorBase {
     if (it == cache.end()) {
       // Pass args as lvalue refs so they remain valid for the `operator()` call
       // below. Forwarding rvalue temporaries into `Make()` would leave the args
-      // in a moved-from (empty) state before operator() can use them.
+      // in a moved-from (empty) state before `operator()` can use them.
       it = cache.emplace(std::move(key), Make(config, args...)).first;
     }
 
diff --git a/src/pybind11_utils.h b/src/pybind11_utils.h
index b595836cc..f13d3116c 100644
--- a/src/pybind11_utils.h
+++ b/src/pybind11_utils.h
@@ -66,10 +66,20 @@ inline Tensor TensorFromPybind11Handle(py::handle obj) {
 
 inline std::optional<Tensor> OptionalTensorFromPybind11Handle(
     const std::optional<py::object>& obj) {
-  if (!obj.has_value()) return std::nullopt;
+  if (!obj.has_value() || obj->is_none()) return std::nullopt;
   return TensorFromPybind11Handle(*obj);
 }
 
+inline std::vector<Tensor> VectorTensorFromPybind11Handle(
+    const std::vector<py::object>& objs) {
+  std::vector<Tensor> result;
+  result.reserve(objs.size());
+  for (const auto& obj : objs) {
+    result.push_back(TensorFromPybind11Handle(obj));
+  }
+  return result;
+}
+
 }  // namespace infini::ops
 
 #endif
diff --git a/src/torch/add/add.cc b/src/torch/ops/add/add.cc
similarity index 98%
rename from src/torch/add/add.cc
rename to src/torch/ops/add/add.cc
index cd2f2cc92..2108e652c 100644
--- a/src/torch/add/add.cc
+++ b/src/torch/ops/add/add.cc
@@ -1,4 +1,4 @@
-#include "torch/add/add.h"
+#include "torch/ops/add/add.h"
 
 #include "torch/tensor_.h"
 
diff --git a/src/torch/add/add.h b/src/torch/ops/add/add.h
similarity index 100%
rename from src/torch/add/add.h
rename to src/torch/ops/add/add.h
diff --git a/src/torch/gemm/gemm.cc b/src/torch/ops/gemm/gemm.cc
similarity index 98%
rename from src/torch/gemm/gemm.cc
rename to src/torch/ops/gemm/gemm.cc
index d17353f15..e24da7a84 100644
--- a/src/torch/gemm/gemm.cc
+++ b/src/torch/ops/gemm/gemm.cc
@@ -1,4 +1,4 @@
-#include "torch/gemm/gemm.h"
+#include "torch/ops/gemm/gemm.h"
 
 #include "torch/tensor_.h"
 
diff --git a/src/torch/gemm/gemm.h b/src/torch/ops/gemm/gemm.h
similarity index 100%
rename from src/torch/gemm/gemm.h
rename to src/torch/ops/gemm/gemm.h
diff --git a/tests/conftest.py b/tests/conftest.py
index 8a72355e5..86d01c249 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -44,6 +44,31 @@ def set_seed_per_test(request):
     _set_random_seed(_hash(_test_case_path_from_request(request)))
 
 
+@pytest.fixture(scope="module", autouse=True)
+def _clear_operator_caches():
+    """Clear the C++ operator cache between test modules.
+
+    The `Operator::call()` cache keys on tensor geometry (shape, strides,
+    dtype) but not data pointers. When different test modules create tensors
+    with identical geometry but different data content (e.g., random
+    `cos_sin_cache` tables), a stale cached operator from a prior module
+    silently returns wrong results. Clearing the cache at module boundaries
+    ensures each module starts with a cold cache.
+    """
+    yield
+
+    try:
+        import infini.ops as ops
+
+        for name in dir(ops):
+            cls = getattr(ops, name)
+
+            if hasattr(cls, "clear_cache"):
+                cls.clear_cache()
+    except ImportError:
+        pass
+
+
 _NPU_UNSUPPORTED_DTYPES = {torch.float64}
 
 # `torch_npu` does not implement random number generation for
@@ -65,6 +90,43 @@ def skip_unsupported_dtypes(request):
         pytest.skip(f"{params['dtype']} not supported on Ascend 910B")
 
 
+@pytest.fixture(autouse=True)
+def skip_op_without_platform_impl(request):
+    """Skip `device=<torch_type>` parametrizations when the op has no
+    implementation on any of the corresponding platforms.
+
+    Only runs for tests that parametrize `device` but not
+    `implementation_index` — joint `(device, impl_idx)` parametrize in
+    `pytest_generate_tests` already prunes empty-impl pairs at collection
+    time, making this check redundant (and wasteful) on those tests.
+    """
+    if not hasattr(request.node, "callspec"):
+        return
+
+    params = request.node.callspec.params
+
+    if "implementation_index" in params:
+        return
+
+    device_selectors = _active_device_selectors_for_torch_device(
+        request.config, params.get("device")
+    )
+
+    if not device_selectors:
+        return
+
+    op_cls = _op_class_from_module(request.node.module)
+
+    if op_cls is None or not hasattr(op_cls, "active_implementation_indices"):
+        return
+
+    if not any(op_cls.active_implementation_indices(d) for d in device_selectors):
+        pytest.skip(
+            f"{op_cls.__name__} has no implementation on any "
+            f"`{params.get('device')}`-mapped platform/device"
+        )
+
+
 def _set_random_seed(seed):
     random.seed(seed)
     torch.manual_seed(seed)
@@ -80,6 +142,26 @@ def _set_random_seed(seed):
 }
 
 
+def _active_device_selectors_for_torch_device(config, torch_device):
+    """Return platform or torch device names selected for a torch device type."""
+    if not torch_device:
+        return ()
+
+    cli_devices = config.getoption("--devices") or ()
+    requested_platforms = tuple(
+        name
+        for name in cli_devices
+        if _PLATFORM_TO_TORCH_DEVICE.get(name) == torch_device
+    )
+
+    if requested_platforms:
+        return requested_platforms
+
+    # The pybind layer maps torch device names (e.g. "cuda") to the backend
+    # compiled into the current wheel, avoiding probes of inactive CUDA siblings.
+    return (torch_device,)
+
+
 def _resolve_device(name):
     """Map a platform name (e.g., `ascend`) to a PyTorch device type (e.g., `npu`)."""
     return _PLATFORM_TO_TORCH_DEVICE.get(name, name)
@@ -109,7 +191,69 @@ def pytest_generate_tests(metafunc):
         else:
             devices = ()
 
-        metafunc.parametrize("device", devices or available)
+        devices = devices or available
+
+        # Joint `(device, implementation_index)` parametrize: generate only
+        # pairs where the op has an active implementation on that device.
+        # Avoids cross-device pollution — an impl active on `cpu` but not on
+        # `npu` no longer appears as a runtime skip in the npu column.
+        if (
+            "implementation_index" in metafunc.fixturenames
+            and "implementation_index" not in already_parametrized
+        ):
+            op_cls = _op_class_from_module(metafunc.module)
+
+            if op_cls is not None and hasattr(op_cls, "active_implementation_indices"):
+                pairs = [
+                    (dev, idx)
+                    for dev in devices
+                    for idx in op_cls.active_implementation_indices(dev)
+                ]
+
+                if not pairs:
+                    # Emit one skipped placeholder so test IDs read
+                    # `[skip-dtype0-...]` instead of `[NOTSET-...]`.
+                    # `get_available_devices()` always includes `"cpu"`, so
+                    # `devices[0]` is safe.
+                    pairs = [
+                        pytest.param(
+                            devices[0],
+                            0,
+                            marks=pytest.mark.skip(
+                                reason=(
+                                    f"{op_cls.__name__} has no active "
+                                    "implementation on any available device"
+                                )
+                            ),
+                            id="skip",
+                        )
+                    ]
+
+                metafunc.parametrize("device, implementation_index", pairs)
+
+                return
+
+        metafunc.parametrize("device", devices)
+
+
+def _op_class_from_module(module):
+    """Derive the `infini.ops.<Op>` class from a `tests/test_<snake>.py` module.
+
+    Test modules have already imported `infini.ops` by the time this runs, so
+    no `try/except ImportError` is needed — a real import failure would have
+    aborted collection long before.
+    """
+    module_name = module.__name__.rsplit(".", 1)[-1]
+
+    if not module_name.startswith("test_"):
+        return None
+
+    op_snake = module_name[len("test_") :]
+    op_pascal = "".join(part.capitalize() for part in op_snake.split("_"))
+
+    import infini.ops as _ops
+
+    return getattr(_ops, op_pascal, None)
 
 
 @pytest.hookimpl(tryfirst=True)
diff --git a/tests/test_add.py b/tests/test_add.py
index 825fc932c..e2266c30d 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -2,7 +2,13 @@
 import pytest
 import torch
 
-from tests.utils import Payload, empty_strided, randint_strided, randn_strided
+from tests.utils import (
+    Payload,
+    empty_strided,
+    get_stream,
+    randint_strided,
+    randn_strided,
+)
 
 _INT_DTYPES = (torch.int16, torch.int32, torch.int64)
 
@@ -29,9 +35,6 @@
         ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
     ),
 )
-# TODO: Generate implementation indices dynamically from
-# `Add.active_implementation_indices` instead of hardcoding.
-@pytest.mark.parametrize("implementation_index", (0, 1))
 @pytest.mark.parametrize(
     ("dtype", "rtol", "atol"),
     (
@@ -57,11 +60,6 @@ def test_add(
             "The `torch.musa` test cloning path does not support `uint16`, `uint32`, or `uint64`."
         )
 
-    active_indices = infini.ops.Add.active_implementation_indices(device)
-
-    if implementation_index not in active_indices:
-        pytest.skip(f"implementation `{implementation_index}` not active on `{device}`")
-
     if implementation_index == 1 and dtype in _UINT_DTYPES:
         pytest.skip("ATen `add` does not support unsigned integer types")
 
@@ -89,7 +87,13 @@ def test_add(
 
 
 def _add(input, other, out, implementation_index=0):
-    infini.ops.add(input, other, out, implementation_index=implementation_index)
+    infini.ops.add(
+        input,
+        other,
+        out,
+        stream=get_stream(input.device),
+        implementation_index=implementation_index,
+    )
 
     return out
 
diff --git a/tests/test_cast.py b/tests/test_cast.py
new file mode 100644
index 000000000..bd19d934b
--- /dev/null
+++ b/tests/test_cast.py
@@ -0,0 +1,62 @@
+import infini.ops
+import pytest
+import torch
+
+from tests.utils import Payload, empty_strided, get_stream, randn_strided
+
+
+@pytest.mark.auto_act_and_assert
+@pytest.mark.parametrize(
+    "shape, input_strides, out_strides",
+    (
+        ((13, 4), None, None),
+        ((13, 4), (10, 1), (10, 1)),
+        ((13, 4, 4), None, None),
+        ((16, 5632), None, None),
+        ((4, 4, 5632), None, None),
+    ),
+)
+@pytest.mark.parametrize(
+    ("input_dtype", "out_dtype", "rtol", "atol"),
+    (
+        (torch.float16, torch.float32, 1e-3, 1e-3),
+        (torch.float32, torch.float16, 1e-3, 1e-3),
+        (torch.bfloat16, torch.float32, 1e-2, 5e-3),
+        (torch.float32, torch.bfloat16, 1e-2, 5e-3),
+        (torch.float16, torch.bfloat16, 1e-2, 5e-3),
+        (torch.bfloat16, torch.float16, 1e-2, 5e-3),
+    ),
+)
+def test_cast(
+    shape,
+    input_strides,
+    out_strides,
+    input_dtype,
+    out_dtype,
+    device,
+    rtol,
+    atol,
+):
+    input = randn_strided(shape, input_strides, dtype=input_dtype, device=device)
+    out = empty_strided(shape, out_strides, dtype=out_dtype, device=device)
+
+    return Payload(
+        _cast,
+        _torch_cast,
+        (input, out),
+        {},
+        rtol=rtol,
+        atol=atol,
+    )
+
+
+def _cast(input, out):
+    infini.ops.cast(input, out, stream=get_stream(input.device))
+
+    return out
+
+
+def _torch_cast(input, out):
+    out.copy_(input.to(out.dtype))
+
+    return out
diff --git a/tests/test_cat.py b/tests/test_cat.py
new file mode 100644
index 000000000..85428b53f
--- /dev/null
+++ b/tests/test_cat.py
@@ -0,0 +1,69 @@
+import infini.ops
+import pytest
+import torch
+
+from tests.utils import Payload, empty_strided, get_stream, randn_strided
+
+
+@pytest.mark.auto_act_and_assert
+@pytest.mark.parametrize(
+    "shapes, dim, out_shape",
+    (
+        # 2 inputs, dim=0
+        (((4, 64), (4, 64)), 0, (8, 64)),
+        # 2 inputs, dim=1
+        (((4, 32), (4, 64)), 1, (4, 96)),
+        # 2 inputs, dim=-1 (negative dim)
+        (((4, 32), (4, 64)), -1, (4, 96)),
+        # 3 inputs, dim=1
+        (((4, 16), (4, 32), (4, 16)), 1, (4, 64)),
+        # 2 inputs, dim=0, 3D
+        (((2, 4, 64), (2, 4, 64)), 0, (4, 4, 64)),
+        # 2 inputs, dim=2, 3D
+        (((2, 4, 32), (2, 4, 64)), 2, (2, 4, 96)),
+        # 4 inputs, dim=1
+        (((1, 1024), (1, 1024), (1, 1024), (1, 1024)), 1, (1, 4096)),
+    ),
+)
+@pytest.mark.parametrize(
+    ("dtype", "rtol", "atol"),
+    (
+        (torch.float32, 1e-7, 1e-7),
+        (torch.float16, 1e-3, 1e-3),
+        (torch.bfloat16, 1e-2, 5e-3),
+    ),
+)
+def test_cat(shapes, dim, out_shape, dtype, device, rtol, atol):
+    inputs = [randn_strided(s, None, dtype=dtype, device=device) for s in shapes]
+    out = empty_strided(out_shape, None, dtype=dtype, device=device)
+
+    return Payload(
+        lambda *args: _cat(*args, dim=dim),
+        lambda *args: _torch_cat(*args, dim=dim),
+        (*inputs, out),
+        {},
+        rtol=rtol,
+        atol=atol,
+    )
+
+
+def _cat(*args, dim):
+    inputs = list(args[:-1])
+    out = args[-1]
+
+    first = inputs[0]
+    rest = inputs[1:]
+
+    infini.ops.cat(first, rest, dim, out, stream=get_stream(first.device))
+
+    return out
+
+
+def _torch_cat(*args, dim):
+    inputs = list(args[:-1])
+    out = args[-1]
+
+    result = torch.cat(inputs, dim=dim)
+    out.copy_(result)
+
+    return out
diff --git a/tests/test_causal_softmax.py b/tests/test_causal_softmax.py
index 8b35457a9..79e6be8e0 100644
--- a/tests/test_causal_softmax.py
+++ b/tests/test_causal_softmax.py
@@ -2,7 +2,7 @@
 import pytest
 import torch
 
-from tests.utils import Payload, empty_strided, randn_strided
+from tests.utils import Payload, empty_strided, get_stream, randn_strided
 
 
 @pytest.mark.auto_act_and_assert
@@ -40,7 +40,7 @@ def test_causal_softmax(shape, input_strides, out_strides, dtype, device, rtol,
 
 
 def _causal_softmax(input, out):
-    infini.ops.causal_softmax(input, out)
+    infini.ops.causal_softmax(input, out, stream=get_stream(input.device))
 
     return out
 
@@ -48,7 +48,7 @@ def _causal_softmax(input, out):
 def _torch_causal_softmax(input, out):
     mask = torch.tril(torch.ones_like(input), diagonal=-1).flip(dims=[-2, -1])
     masked = torch.where(mask == 1, -torch.inf, input.to(torch.float32))
-    result = torch.nn.functional.softmax(masked, dim=-1, dtype=input.dtype)
+    result = torch.nn.functional.softmax(masked, dim=-1)
     out.copy_(result)
 
     return out
diff --git a/tests/test_gemm.py b/tests/test_gemm.py
index 26e102d25..71e0e8fde 100644
--- a/tests/test_gemm.py
+++ b/tests/test_gemm.py
@@ -20,9 +20,6 @@
 @pytest.mark.parametrize("beta", (-1, -0.5, 0, 0.5, 1))
 @pytest.mark.parametrize("trans_a", (False, True))
 @pytest.mark.parametrize("trans_b", (False, True))
-# TODO: Generate implementation indices dynamically from
-# `Gemm.active_implementation_indices` instead of hardcoding.
-@pytest.mark.parametrize("implementation_index", (0, 1, 2))
 @pytest.mark.parametrize(
     ("dtype", "rtol", "atol"),
     (
@@ -56,11 +53,6 @@ def test_gemm(
     if device == "mlu" and dtype == torch.bfloat16:
         pytest.skip("`bfloat16` is not supported by `cnnlBatchMatMulEx`")
 
-    active_indices = infini.ops.Gemm.active_implementation_indices(device)
-
-    if implementation_index not in active_indices:
-        pytest.skip(f"implementation `{implementation_index}` not active on `{device}`")
-
     if implementation_index == 1 and dtype in (torch.float16, torch.bfloat16):
         pytest.skip("cuBLASLt half-precision exceeds current tolerances")
 
@@ -71,6 +63,18 @@ def test_gemm(
     ):
         pytest.skip("ATen CPU `addmm`/`baddbmm` does not support half-precision")
 
+    if implementation_index == 2 and device == "npu":
+        # `src/torch/gemm/gemm.h` partial-specializes `Operator<Gemm, kDev, 2>`
+        # for every `kDev` including `kAscend`, so the SFINAE-based
+        # `active_implementation_indices` reports `2` as active even though
+        # `torch/gemm/gemm.cc` only instantiates it for CPU/NVIDIA.
+        # Dispatching through the unused Ascend specialization reads from an
+        # uninitialized vtable and crashes. See PR #64 discussion.
+        pytest.skip(
+            "Gemm impl=2 on Ascend is a torch-fallback stub without an "
+            "instantiated specialization"
+        )
+
     a = randn_strided(a_shape, a_strides, dtype=dtype, device=device)
     b = randn_strided(b_shape, b_strides, dtype=dtype, device=device)
 
diff --git a/tests/test_linear.py b/tests/test_linear.py
new file mode 100644
index 000000000..364ba5fcf
--- /dev/null
+++ b/tests/test_linear.py
@@ -0,0 +1,90 @@
+import infini.ops
+import pytest
+import torch
+
+from tests.utils import Payload, empty_strided, get_stream, randn_strided
+
+
+@pytest.mark.auto_act_and_assert
+@pytest.mark.parametrize(
+    "a_shape, b_shape, out_shape",
+    (
+        ((4, 64), (64, 32), (4, 32)),
+        ((2, 128), (128, 256), (2, 256)),
+        ((1, 4096), (4096, 4096), (1, 4096)),
+        ((2, 4, 64), (2, 64, 32), (2, 4, 32)),
+        ((4, 8, 128), (4, 128, 64), (4, 8, 64)),
+    ),
+)
+@pytest.mark.parametrize("trans_a", (False, True))
+@pytest.mark.parametrize("trans_b", (False, True))
+@pytest.mark.parametrize("has_bias", (False, True))
+@pytest.mark.parametrize(
+    ("dtype", "rtol", "atol"),
+    (
+        (torch.float32, 1e-2, 5e-2),
+        (torch.float16, 1e-2, 1e-2),
+        (torch.bfloat16, 1e-2, 1e-2),
+    ),
+)
+def test_linear(
+    a_shape,
+    b_shape,
+    out_shape,
+    trans_a,
+    trans_b,
+    has_bias,
+    dtype,
+    device,
+    rtol,
+    atol,
+):
+    a = randn_strided(a_shape, None, dtype=dtype, device=device)
+    b = randn_strided(b_shape, None, dtype=dtype, device=device)
+
+    if trans_a:
+        a = a.transpose(-2, -1)
+
+    if trans_b:
+        b = b.transpose(-2, -1)
+
+    # Bias shape is [N], the last dim of the output.
+    bias = None
+
+    if has_bias:
+        N = out_shape[-1]
+        bias = randn_strided((N,), None, dtype=dtype, device=device)
+
+    out = empty_strided(out_shape, None, dtype=dtype, device=device)
+
+    return Payload(
+        lambda *args: _linear(*args, trans_a=trans_a, trans_b=trans_b),
+        lambda *args: _torch_linear(*args, trans_a=trans_a, trans_b=trans_b),
+        (a, b, bias, out),
+        {},
+        rtol=rtol,
+        atol=atol,
+    )
+
+
+def _linear(a, b, bias, out, trans_a=False, trans_b=False):
+    infini.ops.linear(a, b, bias, trans_a, trans_b, out, stream=get_stream(a.device))
+
+    return out
+
+
+def _torch_linear(a, b, bias, out, trans_a=False, trans_b=False):
+    if trans_a:
+        a = a.transpose(-2, -1)
+
+    if trans_b:
+        b = b.transpose(-2, -1)
+
+    result = torch.matmul(a.float(), b.float())
+
+    if bias is not None:
+        result = result + bias.float()
+
+    out.copy_(result.to(out.dtype))
+
+    return out
diff --git a/tests/test_matmul.py b/tests/test_matmul.py
new file mode 100644
index 000000000..fea3822a8
--- /dev/null
+++ b/tests/test_matmul.py
@@ -0,0 +1,76 @@
+import infini.ops
+import pytest
+import torch
+
+from tests.utils import Payload, empty_strided, get_stream, randn_strided
+
+
+@pytest.mark.auto_act_and_assert
+@pytest.mark.parametrize(
+    "a_shape, b_shape, c_shape",
+    (
+        ((4, 64), (64, 32), (4, 32)),
+        ((2, 128), (128, 256), (2, 256)),
+        ((2, 4, 64), (2, 64, 32), (2, 4, 32)),
+        ((4, 8, 128), (4, 128, 64), (4, 8, 64)),
+    ),
+)
+@pytest.mark.parametrize("trans_a", (False, True))
+@pytest.mark.parametrize("trans_b", (False, True))
+@pytest.mark.parametrize(
+    ("dtype", "rtol", "atol"),
+    (
+        (torch.float32, 1e-2, 1e-2),
+        (torch.float16, 1e-2, 1e-2),
+        (torch.bfloat16, 1e-2, 1e-2),
+    ),
+)
+def test_matmul(
+    a_shape,
+    b_shape,
+    c_shape,
+    trans_a,
+    trans_b,
+    dtype,
+    device,
+    rtol,
+    atol,
+):
+    a = randn_strided(a_shape, None, dtype=dtype, device=device)
+    b = randn_strided(b_shape, None, dtype=dtype, device=device)
+
+    if trans_a:
+        a = a.transpose(-2, -1)
+
+    if trans_b:
+        b = b.transpose(-2, -1)
+
+    c = empty_strided(c_shape, None, dtype=dtype, device=device)
+
+    return Payload(
+        lambda *args: _matmul(*args, trans_a=trans_a, trans_b=trans_b),
+        lambda *args: _torch_matmul(*args, trans_a=trans_a, trans_b=trans_b),
+        (a, b, c),
+        {},
+        rtol=rtol,
+        atol=atol,
+    )
+
+
+def _matmul(a, b, c, trans_a=False, trans_b=False):
+    infini.ops.matmul(a, b, c, trans_a, trans_b, stream=get_stream(a.device))
+
+    return c
+
+
+def _torch_matmul(a, b, c, trans_a=False, trans_b=False):
+    if trans_a:
+        a = a.transpose(-2, -1)
+
+    if trans_b:
+        b = b.transpose(-2, -1)
+
+    result = torch.matmul(a.float(), b.float()).to(c.dtype)
+    c.copy_(result)
+
+    return c
diff --git a/tests/test_mul.py b/tests/test_mul.py
new file mode 100644
index 000000000..e368f96df
--- /dev/null
+++ b/tests/test_mul.py
@@ -0,0 +1,87 @@
+import infini.ops
+import pytest
+import torch
+
+from tests.utils import (
+    Payload,
+    empty_strided,
+    get_stream,
+    randint_strided,
+    randn_strided,
+)
+
+_INT_DTYPES = (torch.int16, torch.int32, torch.int64)
+
+_UINT_DTYPES = tuple(
+    filter(None, (getattr(torch, f"uint{bits}", None) for bits in (16, 32, 64)))
+)
+
+
+@pytest.mark.auto_act_and_assert
+@pytest.mark.parametrize(
+    "shape, input_strides, other_strides, out_strides",
+    (
+        ((13, 4), None, None, None),
+        ((13, 4), (10, 1), (10, 1), (10, 1)),
+        ((13, 4), (0, 1), None, None),
+        ((13, 4, 4), None, None, None),
+        ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
+        ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
+        ((16, 5632), None, None, None),
+        ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
+        ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)),
+        ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)),
+        ((4, 4, 5632), None, None, None),
+        ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
+    ),
+)
+@pytest.mark.parametrize(
+    ("dtype", "rtol", "atol"),
+    (
+        (torch.float32, 1e-7, 1e-7),
+        (torch.float16, 1e-3, 1e-3),
+        (torch.bfloat16, 1e-2, 5e-3),
+    )
+    + tuple((dtype, 0, 0) for dtype in _INT_DTYPES + _UINT_DTYPES),
+)
+def test_mul(
+    shape, input_strides, other_strides, out_strides, dtype, device, rtol, atol
+):
+    if device == "musa" and dtype in _UINT_DTYPES:
+        pytest.skip(
+            "The `torch.musa` test cloning path does not support `uint16`, `uint32`, or `uint64`."
+        )
+
+    if dtype in _INT_DTYPES or dtype in _UINT_DTYPES:
+        input = randint_strided(
+            0, 100, shape, input_strides, dtype=dtype, device=device
+        )
+        other = randint_strided(
+            0, 100, shape, other_strides, dtype=dtype, device=device
+        )
+    else:
+        input = randn_strided(shape, input_strides, dtype=dtype, device=device)
+        other = randn_strided(shape, other_strides, dtype=dtype, device=device)
+
+    out = empty_strided(shape, out_strides, dtype=dtype, device=device)
+
+    return Payload(_mul, _torch_mul, (input, other, out), {}, rtol=rtol, atol=atol)
+
+
+def _mul(input, other, out):
+    infini.ops.mul(input, other, out, stream=get_stream(input.device))
+
+    return out
+
+
+def _torch_mul(input, other, out):
+    if input.dtype in _UINT_DTYPES:
+        input = input.to(torch.int64)
+
+    if other.dtype in _UINT_DTYPES:
+        other = other.to(torch.int64)
+
+    res = torch.mul(input, other)
+    out.copy_(res.to(out.dtype))
+
+    return out
diff --git a/tests/test_rms_norm.py b/tests/test_rms_norm.py
index d6d4dff17..45f9199ba 100644
--- a/tests/test_rms_norm.py
+++ b/tests/test_rms_norm.py
@@ -2,7 +2,7 @@
 import pytest
 import torch
 
-from tests.utils import Payload, empty_strided, randn_strided
+from tests.utils import Payload, empty_strided, get_stream, randn_strided
 
 
 @pytest.mark.auto_act_and_assert
@@ -33,6 +33,7 @@ def test_rms_norm(
     weight_strides,
     out_strides,
     eps,
+    implementation_index,
     dtype,
     device,
     rtol,
@@ -43,7 +44,9 @@ def test_rms_norm(
     out = empty_strided(input_shape, out_strides, dtype=dtype, device=device)
 
     return Payload(
-        _rms_norm,
+        lambda *args, **kwargs: _rms_norm(
+            *args, **kwargs, implementation_index=implementation_index
+        ),
         _torch_rms_norm,
         (input, weight),
         {"eps": eps, "out": out},
@@ -52,8 +55,15 @@ def test_rms_norm(
     )
 
 
-def _rms_norm(input, weight, *, eps=1e-6, out=None):
-    infini.ops.rms_norm(input, weight, eps, out)
+def _rms_norm(input, weight, *, eps=1e-6, out=None, implementation_index=0):
+    infini.ops.rms_norm(
+        input,
+        weight,
+        eps,
+        out,
+        implementation_index=implementation_index,
+        stream=get_stream(input.device),
+    )
 
     return out
 
diff --git a/tests/test_swiglu.py b/tests/test_swiglu.py
index 89c95f77f..f159742ca 100644
--- a/tests/test_swiglu.py
+++ b/tests/test_swiglu.py
@@ -2,7 +2,7 @@
 import pytest
 import torch
 
-from tests.utils import Payload, empty_strided, rand_strided
+from tests.utils import Payload, empty_strided, get_stream, rand_strided
 
 
 @pytest.mark.auto_act_and_assert
@@ -28,17 +28,40 @@
     ),
 )
 def test_swiglu(
-    shape, input_strides, gate_strides, out_strides, dtype, device, rtol, atol
+    shape,
+    input_strides,
+    gate_strides,
+    out_strides,
+    implementation_index,
+    dtype,
+    device,
+    rtol,
+    atol,
 ):
     input = rand_strided(shape, input_strides, dtype=dtype, device=device)
     gate = rand_strided(shape, gate_strides, dtype=dtype, device=device)
     out = empty_strided(shape, out_strides, dtype=dtype, device=device)
 
-    return Payload(_swiglu, _torch_swiglu, (input, gate, out), {}, rtol=rtol, atol=atol)
+    return Payload(
+        lambda *args, **kwargs: _swiglu(
+            *args, **kwargs, implementation_index=implementation_index
+        ),
+        _torch_swiglu,
+        (input, gate, out),
+        {},
+        rtol=rtol,
+        atol=atol,
+    )
 
 
-def _swiglu(input, gate, out):
-    infini.ops.swiglu(input, gate, out)
+def _swiglu(input, gate, out, implementation_index=0):
+    infini.ops.swiglu(
+        input,
+        gate,
+        out,
+        implementation_index=implementation_index,
+        stream=get_stream(input.device),
+    )
 
     return out
 
diff --git a/tests/utils.py b/tests/utils.py
index 8f9532aa0..982d05aec 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -82,11 +82,21 @@ def randint_strided(low, high, shape, strides, *, dtype=None, device=None):
     return output
 
 
+_STREAM_ACCESSORS = {
+    "npu": ("npu", "npu_stream"),
+    "cuda": ("cuda", "cuda_stream"),
+    "mlu": ("mlu", "mlu_stream"),
+    "musa": ("musa", "musa_stream"),
+}
+
+
 def get_stream(device):
     """Return the raw stream handle for `device`, or 0 for CPU.
 
-    Uses `torch.accelerator.current_stream` when available, falling back to
-    device-specific APIs for older PyTorch versions.
+    Uses the device-specific `torch.<dev>.current_stream()` API rather than
+    `torch.accelerator.current_stream()` — the latter returns a different
+    stream object on torch 2.9 + vllm-ascend, producing cross-stream data
+    hazards on cached-executor ops.
     """
     if isinstance(device, torch.device):
         device = device.type
@@ -97,32 +107,19 @@ def get_stream(device):
     if device == "cpu":
         return 0
 
-    if hasattr(torch, "accelerator") and hasattr(torch.accelerator, "current_stream"):
-        stream = torch.accelerator.current_stream()
-
-        # Each backend exposes the raw handle under a different attribute name.
-        for attr in ("npu_stream", "cuda_stream", "mlu_stream", "musa_stream"):
-            if hasattr(stream, attr):
-                return getattr(stream, attr)
+    mod_name, attr = _STREAM_ACCESSORS.get(device, (None, None))
 
+    if mod_name is None:
         return 0
 
-    # Fallback for older PyTorch builds without `torch.accelerator`.
-    _STREAM_ACCESSORS = {
-        "npu": ("npu", "npu_stream"),
-        "cuda": ("cuda", "cuda_stream"),
-        "mlu": ("mlu", "mlu_stream"),
-        "musa": ("musa", "musa_stream"),
-    }
+    mod = getattr(torch, mod_name, None)
 
-    if device in _STREAM_ACCESSORS:
-        mod_name, attr = _STREAM_ACCESSORS[device]
-        mod = getattr(torch, mod_name, None)
+    if mod is None:
+        return 0
 
-        if mod is not None and hasattr(mod, "current_stream"):
-            return getattr(mod.current_stream(), attr)
+    stream = mod.current_stream()
 
-    return 0
+    return getattr(stream, attr, 0)
 
 
 def clone_strided(input):