Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/workflows/build-wheels-cu131-win.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,31 @@ jobs:
echo LIB=%LIB%>>%GITHUB_ENV%
echo LIBPATH=%LIBPATH%>>%GITHUB_ENV%

- name: Copy LLVM OpenMP runtime
shell: pwsh
run: |
# GGML CPU all-variant backends are built with LLVM OpenMP on Windows.
# The dynamically loaded ggml-cpu-*.dll files depend on this runtime.
# If it is missing from the wheel, ggml_backend_load_all_from_path()
# may fail to load CPU backend DLLs at runtime.
# Destination is llama_cpp\lib under the checked-out workspace; -Force makes
# directory creation a no-op when the directory already exists.
$packageLibDir = Join-Path $env:GITHUB_WORKSPACE "llama_cpp\lib"
New-Item -ItemType Directory -Force $packageLibDir | Out-Null

# Search the Visual Studio 2022 Enterprise MSVC redist tree recursively and keep
# the first libomp140.x86_64.dll whose full path contains "OpenMP.LLVM".
# Lookup errors are silenced here; a failed search is handled by the check below.
# NOTE(review): the edition ("Enterprise") is hard-coded - confirm it matches the runner image.
$omp = Get-ChildItem "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC" `
-Recurse `
-Filter "libomp140.x86_64.dll" `
-ErrorAction SilentlyContinue |
Where-Object { $_.FullName -match "OpenMP\.LLVM" } |
Select-Object -First 1

# Fail the step explicitly when the DLL was not found, rather than continuing
# without the runtime.
if (!$omp) {
Write-Error "Could not find libomp140.x86_64.dll in Visual Studio LLVM OpenMP redistributables."
exit 1
}

# Copy the DLL into the package lib directory, overwriting any existing copy.
Copy-Item $omp.FullName (Join-Path $packageLibDir "libomp140.x86_64.dll") -Force
Write-Output "Copied LLVM OpenMP runtime: $($omp.FullName)"

- name: Build wheel
run: |
$cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.', '')
Expand Down
13 changes: 13 additions & 0 deletions llama_cpp/llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ class Llama:
def __init__(
self,
model_path: str,
clip_model_path: Optional[str] = None,
*,
# Model Params
n_gpu_layers: Union[int, Literal["auto", "all"]] = "auto",
Expand Down Expand Up @@ -171,6 +172,7 @@ def __init__(
log_filters: Optional[Sequence[str]] = None,
log_filters_case_sensitive: bool = True,
# Extra Params
chat_handler_kwargs: Dict[str, Any] = {},
**kwargs, # type: ignore
):
"""Load a llama.cpp model from `model_path`.
Expand Down Expand Up @@ -703,6 +705,17 @@ def __init__(

if self.verbose:
print(f"Model metadata: {self.metadata}", file=sys.stderr)

if clip_model_path is not None:
if self.chat_handler is not None and self.verbose:
print("Warning: Both `chat_handler` and `clip_model_path` are not null. Chat handler will be overwritten.", flush = True)

self.chat_handler = llama_chat_format.GenericMTMDChatHandler(
gguf_metadata = self.metadata,
clip_model_path = clip_model_path,
verbose = self.verbose,
**chat_handler_kwargs
)

eos_token_id = self.token_eos()
bos_token_id = self.token_bos()
Expand Down
50 changes: 49 additions & 1 deletion llama_cpp/llama_chat_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -2887,10 +2887,14 @@ def __init__(
raise ValueError(f"{self.log_prefix}(__init__): Clip model path does not exist: {clip_model_path}")

# Pre-compile Jinja template
if not hasattr(self, "chat_format") or self.chat_format is None:
self.chat_format = self.CHAT_FORMAT

self._chat_format_parser_tags = []
self.chat_template = ImmutableSandboxedEnvironment(
trim_blocks=True,
lstrip_blocks=True,
).from_string(self.CHAT_FORMAT)
).from_string(self.chat_format)

self._exit_stack = ExitStack()

Expand Down Expand Up @@ -3117,6 +3121,13 @@ def _process_mtmd_prompt(
tool_choice=tool_choice,
**getattr(self, 'extra_template_arguments', {})
)

for tag in self._chat_format_parser_tags:
if tag not in text:
continue

text = text.replace(tag, media_marker)

# Replace image_url by media_marker in text
for item in media_items:
text = text.replace(item["url"], media_marker)
Expand Down Expand Up @@ -3828,6 +3839,43 @@ def from_pretrained(
**kwargs,
)

class GenericMTMDChatHandler(MTMDChatHandler):
    """Multimodal chat handler that uses the chat template stored in the model.

    The Jinja template is read from the ``tokenizer.chat_template`` entry of
    the supplied GGUF metadata and stored on ``self.chat_format`` before the
    parent constructor runs, instead of relying on a class-level
    ``CHAT_FORMAT`` constant.
    """

    # Media placeholder strings looked for in the model's chat template.
    # Those present in the template are published to
    # ``self._chat_format_parser_tags`` on every call.
    KNOWN_MEDIA_TAGS = [
        "<|image_pad|>",
        "<|audio_pad|>",
        "<|video_pad|>",
        "<|image|>",
        "<|audio|>",
        "<|video|>",
        "[IMG]",
    ]

    def __init__(
        self,
        gguf_metadata: Dict[str, Any],
        clip_model_path: str,
        verbose: bool = True,
        **kwargs
    ) -> None:
        """Create a handler from GGUF metadata and a clip model path.

        Args:
            gguf_metadata: Model metadata mapping; must contain a non-empty
                ``tokenizer.chat_template`` entry.
            clip_model_path: Forwarded to the parent constructor.
            verbose: When true, the discovered template is printed; also
                forwarded to the parent constructor.
            **kwargs: Extra keyword arguments forwarded to the parent
                constructor unchanged.

        Raises:
            ValueError: If the metadata has no chat template, or the template
                is empty.
        """
        self.model_metadata = gguf_metadata
        self.chat_format = self.model_metadata.get("tokenizer.chat_template", None)

        # Validate before logging so a missing template fails immediately
        # instead of first printing a misleading "None" template block.
        if not self.chat_format:
            raise ValueError("Failed to get model chat template automatically.")

        if verbose:
            print(f"Got chat template from model:\n```jinja\n{self.chat_format}\n```", flush=True)

        # chat_format must be set before this call.
        # NOTE(review): presumably the parent compiles self.chat_format when
        # it is already set - confirm against MTMDChatHandler.__init__.
        super().__init__(clip_model_path=clip_model_path, verbose=verbose, **kwargs)

    def __call__(self, **kwargs):
        """Record the media tags used by the template, then run the parent handler.

        Args:
            **kwargs: Forwarded unchanged to the parent ``__call__``.

        Returns:
            Whatever the parent ``__call__`` returns.
        """
        # Recomputed on every call so the tag list always matches the current
        # value of self.chat_format.
        self._chat_format_parser_tags = [
            tag for tag in self.KNOWN_MEDIA_TAGS if tag in self.chat_format
        ]

        # NOTE(review): self.verbose and self.log_prefix are not set in this
        # class; presumably the parent constructor provides them - confirm.
        if self.verbose:
            print(f"{self.log_prefix} - Start processing")

        # Use parent implementation
        return super().__call__(**kwargs)

class Llava15ChatHandler(MTMDChatHandler):
CHAT_FORMAT = (
Expand Down
2 changes: 1 addition & 1 deletion vendor/llama.cpp
Submodule llama.cpp updated 110 files
Loading