JamePeng · alcoftTAO · May 4, 2026 · May 4, 2026 · May 5, 2026 · May 5, 2026
diff --git a/.github/workflows/build-wheels-cu131-win.yml b/.github/workflows/build-wheels-cu131-win.yml
@@ -67,6 +67,31 @@ jobs:
           echo LIB=%LIB%>>%GITHUB_ENV%
           echo LIBPATH=%LIBPATH%>>%GITHUB_ENV%
 
+      - name: Copy LLVM OpenMP runtime
+        shell: pwsh
+        run: |
+          # GGML CPU all-variant backends are built with LLVM OpenMP on Windows.
+          # The dynamically loaded ggml-cpu-*.dll files depend on this runtime.
+          # If it is missing from the wheel, ggml_backend_load_all_from_path()
+          # may fail to load CPU backend DLLs at runtime.
+          $packageLibDir = Join-Path $env:GITHUB_WORKSPACE "llama_cpp\lib"
+          New-Item -ItemType Directory -Force $packageLibDir | Out-Null
+          # NOTE(review): search root is hard-coded to the VS 2022 Enterprise install; confirm the runner image has it.
+          $omp = Get-ChildItem "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC" `
+            -Recurse `
+            -Filter "libomp140.x86_64.dll" `
+            -ErrorAction SilentlyContinue |
+            Where-Object { $_.FullName -match "OpenMP\.LLVM" } |
+            Select-Object -First 1
+          # Fail the step rather than continue without the runtime DLL.
+          if (!$omp) {
+            Write-Error "Could not find libomp140.x86_64.dll in Visual Studio LLVM OpenMP redistributables."
+            exit 1
+          }
+          # Place the DLL in the llama_cpp\lib directory created above, overwriting any existing copy.
+          Copy-Item $omp.FullName (Join-Path $packageLibDir "libomp140.x86_64.dll") -Force
+          Write-Output "Copied LLVM OpenMP runtime: $($omp.FullName)"
+
       - name: Build wheel
         run: |
           $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '')

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
@@ -96,6 +96,7 @@ class Llama:
     def __init__(
         self,
         model_path: str,
+        clip_model_path: Optional[str] = None,
         *,
         # Model Params
         n_gpu_layers: Union[int, Literal["auto", "all"]] = "auto",
@@ -171,6 +172,7 @@ def __init__(
         log_filters: Optional[Sequence[str]] = None,
         log_filters_case_sensitive: bool = True,
         # Extra Params
+        chat_handler_kwargs: Dict[str, Any] = {},
         **kwargs,  # type: ignore
     ):
         """Load a llama.cpp model from `model_path`.
@@ -703,6 +705,17 @@ def __init__(
 
         if self.verbose:
             print(f"Model metadata: {self.metadata}", file=sys.stderr)
+
+        if clip_model_path is not None:
+            if self.chat_handler is not None and self.verbose:
+                print("Warning: Both `chat_handler` and `clip_model_path` are not null. Chat handler will be overwritten.", flush = True)
+
+            self.chat_handler = llama_chat_format.GenericMTMDChatHandler(
+                gguf_metadata = self.metadata,
+                clip_model_path = clip_model_path,
+                verbose = self.verbose,
+                **chat_handler_kwargs
+            )
 
         eos_token_id = self.token_eos()
         bos_token_id = self.token_bos()

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
@@ -2887,10 +2887,14 @@ def __init__(
             raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}")
 
         # Pre-compile Jinja template
+        if not hasattr(self, "chat_format") or self.chat_format is None:
+            self.chat_format = self.CHAT_FORMAT
+
+        self._chat_format_parser_tags = []
         self.chat_template = ImmutableSandboxedEnvironment(
             trim_blocks=True,
             lstrip_blocks=True,
-        ).from_string(self.CHAT_FORMAT)
+        ).from_string(self.chat_format)
 
         self._exit_stack = ExitStack()
 
@@ -3117,6 +3121,13 @@ def _process_mtmd_prompt(
             tool_choice=tool_choice,
             **getattr(self, 'extra_template_arguments', {})
         )
+
+        for tag in self._chat_format_parser_tags:
+            if tag not in text:
+                continue
+
+            text = text.replace(tag, media_marker)
+
         # Replace image_url by media_marker in text
         for item in media_items:
             text = text.replace(item["url"], media_marker)
@@ -3828,6 +3839,43 @@ def from_pretrained(
             **kwargs,
         )
 
+class GenericMTMDChatHandler(MTMDChatHandler):
+    """MTMD chat handler that uses the chat template stored in the model's GGUF metadata."""
+
+    # Media placeholder tags looked up in the model's chat template on every call.
+    KNOWN_MEDIA_TAGS = [
+        "<|image_pad|>", "<|audio_pad|>", "<|video_pad|>",
+        "<|image|>", "<|audio|>", "<|video|>",
+        "[IMG]",
+    ]
+
+    def __init__(
+        self,
+        gguf_metadata: Dict[str, Any],
+        clip_model_path: str,
+        verbose: bool = True,
+        **kwargs
+    ) -> None:
+        """Take the chat template from `gguf_metadata["tokenizer.chat_template"]`.
+
+        Raises:
+            ValueError: If the metadata provides no chat template.
+        """
+        self.model_metadata = gguf_metadata
+        self.chat_format = self.model_metadata.get("tokenizer.chat_template", None)
+        # Validate before logging so a missing template is not echoed as "None".
+        if self.chat_format is None:
+            raise ValueError("Failed to get model chat template automatically.")
+        if verbose:
+            print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush = True)
+        super().__init__(clip_model_path = clip_model_path, verbose = verbose, **kwargs)
+
+    def __call__(self, **kwargs):
+        """Record which known media tags occur in the template, then run the parent handler."""
+        self._chat_format_parser_tags = [tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format]
+        if self.verbose:
+            print(f"{self.log_prefix} - Start processing")
+        return super().__call__(**kwargs)
 
 class Llava15ChatHandler(MTMDChatHandler):
     CHAT_FORMAT = (

diff --git a/vendor/llama.cpp b/vendor/llama.cpp