From 35ad690198801edf90c7d7ba8f7f82236ac19ed7 Mon Sep 17 00:00:00 2001
From: BuildTools
Date: Thu, 15 May 2025 19:01:51 -0700
Subject: [PATCH] feat(core): update llama.cpp, improve backend UI, logging, and task handling

- Updated llama.cpp Python conversion scripts to `bc098c3` (adds support for Qwen3, Llama 4, etc.)
- Updated requirements and performed general maintenance
- Fixed UI issues in AutoGGUF
- Updated backend selection box to sort by newest version
- Fixed the log information box inserting extra newlines on open and added autoscroll
- Modified task deletion behavior
- Fixed logging for task cancellation/deletion
- Updated README information

---
 README.md                   |   27 +-
 requirements.txt            |   12 +-
 setup.py                    |    2 +-
 src/AutoGGUF.py             |   50 +-
 src/Localizations.py        |   11 +-
 src/TaskListItem.py         |   45 +-
 src/convert_hf_to_gguf.py   | 2948 +++++++++++++++++++++++++----------
 src/convert_lora_to_gguf.py |   65 +-
 src/gguf/constants.py       |  330 ++++
 src/gguf/gguf.py            |    4 -
 src/gguf/gguf_reader.py     |    4 -
 src/gguf/gguf_writer.py     |   67 +
 src/gguf/lazy.py            |   21 +
 src/gguf/tensor_mapping.py  |  176 ++-
 src/gguf/utility.py         |  215 +++
 15 files changed, 3070 insertions(+), 907 deletions(-)

diff --git a/README.md b/README.md
index a617853..1167121 100644
--- a/README.md
+++ b/README.md
@@ -29,16 +29,16 @@ # AutoGGUF - automated GGUF model quantizer
 ## Features
-- 📩 Download and manage llama.cpp backends
-- 🗃️ Select and quantize GGUF models
+- 📩 Update and manage llama.cpp backends
+- 🗃️ Download and quantize GGUF/safetensors models
 - 📐 Configure quantization parameters
-- 💻 Monitor system resources during quantization
+- 💻 Monitor system resources in real time during quantization
 - ⏳ Parallel quantization + imatrix generation
 - 🎉 LoRA conversion and merging
 - 📁 Preset saving and loading
 - 8️⃣ AutoFP8 quantization
 - 🪓 GGUF splitting and merging
-- 🌐 HTTP API for automated monitoring
+- 🌐 HTTP API for automation and monitoring
 ## Why AutoGGUF?
 - Fast: Saves time on manual configuration
@@ -138,17 +138,20 @@ ## Localizations
 ## Issues
-- Some inconsistent logging
-- Missing translations
+- Some inconsistent logging and signal handling
+- Missing or duplicated translations
+- Buggy/incomplete API interfaces
 ## Planned Features
-- Time estimation for quantization
-- Quantization file size estimate
-- Perplexity testing
-- bitsandbytes
+- [ ] Time estimation for quantization
+- [ ] Quantization file size estimate
+- [ ] Perplexity testing
+- [ ] bitsandbytes support
-Due to my limited availability and a lack of time, I won't be actively developing new features for this project as much. While I'll continue to publish builds from time to time, I strongly recommend running from source if you want to stay up to date with the latest changes. I'm still committed to keeping dependencies updated weekly and making small maintenance fixes to ensure everything runs smoothly. If you run into any problems or notice issues, please don't hesitate to let me know - I appreciate your feedback and will do my best to address them.
+#### Project Status
+
+AutoGGUF has now entered maintenance mode. It's considered stable and feature-complete for most use cases, so I'm not actively developing new features, but I'll continue to publish occasional builds, update dependencies regularly, and fix critical bugs as needed. If you encounter issues or have suggestions, feel free to open an issue.
## Support @@ -162,3 +165,5 @@ ## Contributing ## Stargazers [![Star History Chart](https://api.star-history.com/svg?repos=leafspark/AutoGGUF&type=Date)](https://star-history.com/#leafspark/AutoGGUF&Date) + +`Last Updated: 5/15/2025` diff --git a/requirements.txt b/requirements.txt index 9919d4b..2abf5da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,13 @@ PyYAML~=6.0.2 psutil~=7.0.0 pynvml~=12.0.0 -PySide6~=6.8.2 -safetensors~=0.5.2 +PySide6~=6.9.0 +safetensors~=0.5.3 numpy<2.0.0 torch~=2.5.1 sentencepiece~=0.2.0 -setuptools~=75.6.0 -huggingface-hub~=0.29.2 -transformers~=4.48.0 -fastapi~=0.115.6 +setuptools~=80.4.0 +huggingface-hub~=0.31.1 +transformers~=4.51.3 +fastapi~=0.115.12 uvicorn~=0.34.0 diff --git a/setup.py b/setup.py index 3d7d923..57b6fa5 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ url="https://github.com/leafspark/AutoGGUF", license="apache-2.0", author="leafspark", - author_email="leafspark@duck.com", + author_email="leafspark@proton.me", description="automatically quant GGUF models", install_requires=required, entry_points={"console_scripts": ["autogguf-gui = main:main"]}, diff --git a/src/AutoGGUF.py b/src/AutoGGUF.py index 4e813a3..12709e9 100644 --- a/src/AutoGGUF.py +++ b/src/AutoGGUF.py @@ -1,11 +1,10 @@ import json -import os import shutil import urllib.error import urllib.request from datetime import datetime from functools import partial, wraps -from typing import Any, List, Union +from typing import List from PySide6.QtCore import * from PySide6.QtGui import * @@ -339,15 +338,15 @@ def __init__(self, args: List[str]) -> None: output_layout.addWidget(output_button) self.merge_gguf_layout.addLayout(output_layout) - # Split button - split_button = QPushButton(MERGE_GGUF) - split_button.clicked.connect( + # Merge button + merge_button = QPushButton(MERGE_GGUF) + merge_button.clicked.connect( lambda: self.merge_gguf( self.merge_gguf_input.text(), self.merge_gguf_output.text(), ) ) - self.merge_gguf_layout.addWidget(split_button) + self.merge_gguf_layout.addWidget(merge_button) self.merge_gguf_dialog.setLayout(self.merge_gguf_layout) # HF Upload Window @@ -763,7 +762,7 @@ def __init__(self, args: List[str]) -> None: self.extra_arguments = QLineEdit() quant_options_layout.addRow( - self.create_label(EXTRA_ARGUMENTS, EXTRA_COMMAND_ARGUMENTS), + self.create_label(EXTRA_ARGUMENTS, EXTRA_ARGUMENTS_LABEL), self.extra_arguments, ) @@ -1202,15 +1201,25 @@ def refresh_backends(self) -> None: and "cudart-llama" not in item.lower() ] + def extract_b_val(name: str) -> int: + match = re.search(r"b(\d+)", name) + return int(match.group(1)) if match else -1 + if valid_backends: + # Sort by newest version + valid_backends.sort(key=lambda x: extract_b_val(x[0]), reverse=True) + for name, path in valid_backends: self.backend_combo.addItem(name, userData=path) - self.backend_combo.setEnabled( - True - ) # Enable the combo box if there are valid backends + + self.backend_combo.setEnabled(True) + + # Selects the newest version (now at index 0) + self.backend_combo.setCurrentIndex(0) else: self.backend_combo.addItem(NO_BACKENDS_AVAILABLE) self.backend_combo.setEnabled(False) + self.logger.info(FOUND_VALID_BACKENDS.format(len(valid_backends))) def save_task_preset(self, task_item) -> None: @@ -1252,13 +1261,13 @@ def download_finished(self, extract_dir) -> None: ) else: QMessageBox.warning( - self, CUDA_EXTRACTION_FAILED, NO_SUITABLE_CUDA_BACKEND_FOUND + self, CUDA_EXTRACTION_FAILED, NO_SUITABLE_CUDA_BACKEND_EXTRACTION ) else: QMessageBox.information( self, 
DOWNLOAD_COMPLETE, - LLAMACPP_BINARY_DOWNLOADED_AND_EXTRACTED.format(extract_dir), + LLAMACPP_DOWNLOADED_AND_EXTRACTED.format(extract_dir), ) self.refresh_backends() # Refresh the backends after successful download @@ -1906,12 +1915,25 @@ def show_task_details(self, item) -> None: # Load existing content if os.path.exists(task_item.log_file): with open_file_safe(task_item.log_file, "r") as f: - log_text.setPlainText(f.read()) + content = f.read().rstrip("\n") # Remove trailing newlines + log_text.setPlainText(content) + + # Scroll to the end + log_text.moveCursor(QTextCursor.End) # Connect to the thread if it's still running for thread in self.quant_threads: if thread.log_file == task_item.log_file: - thread.output_signal.connect(log_text.appendPlainText) + # Create a local slot function that updates the text + def update_log(text): + log_text.appendPlainText(text) + log_text.moveCursor(QTextCursor.End) + + thread.output_signal.connect(update_log) + # Disconnect the signal when the dialog is destroyed + log_dialog.destroyed.connect( + lambda: thread.output_signal.disconnect(update_log) + ) break log_dialog.exec() diff --git a/src/Localizations.py b/src/Localizations.py index 7b3f672..0dfadaa 100644 --- a/src/Localizations.py +++ b/src/Localizations.py @@ -1,7 +1,7 @@ import os import re -AUTOGGUF_VERSION = "v2.0.0" +AUTOGGUF_VERSION = "v2.0.1" class _Localization: @@ -53,13 +53,11 @@ def __init__(self): self.QUANTIZE_TO_FP8_DYNAMIC = "Quantize to FP8 Dynamic" self.OPEN_MODEL_FOLDER = "Open Model Folder" self.QUANTIZE = "Quantize" - self.OPEN_MODEL_FOLDER = "Open Model Folder" self.INPUT_MODEL = "Input Model:" # GGUF Verification self.INVALID_GGUF_FILE = "Invalid GGUF file: {}" self.SHARDED_MODEL_NAME = "{} (Sharded)" - self.IMPORTED_MODEL_TOOLTIP = "Imported model: {}" self.CONCATENATED_FILE_WARNING = "This is a concatenated file part. It will not work with llama-quantize; please concat the file first." self.CONCATENATED_FILES_FOUND = ( "Found {} concatenated file parts. Please concat the files first." 
@@ -250,12 +248,6 @@ def __init__(self): self.LLAMACPP_DOWNLOADED_AND_EXTRACTED = ( "llama.cpp binary downloaded and extracted to {0}" ) - self.NO_SUITABLE_CUDA_BACKEND_FOUND = ( - "No suitable CUDA backend found for extraction" - ) - self.LLAMACPP_BINARY_DOWNLOADED_AND_EXTRACTED = ( - "llama.cpp binary downloaded and extracted to {0}" - ) self.REFRESHING_LLAMACPP_RELEASES = "Refreshing llama.cpp releases" self.UPDATING_ASSET_LIST = "Updating asset list" self.UPDATING_CUDA_OPTIONS = "Updating CUDA options" @@ -454,7 +446,6 @@ def __init__(self): self.UPLOAD = "Upload" self.INFO = "Info" - self.EXTRA_COMMAND_ARGUMENTS = "Additional command-line arguments" self.COPIED_COMMAND_TO_CLIPBOARD = "Copied command to clipboard:" # Repository diff --git a/src/TaskListItem.py b/src/TaskListItem.py index 293a7cd..826abff 100644 --- a/src/TaskListItem.py +++ b/src/TaskListItem.py @@ -104,20 +104,35 @@ def show_task_properties(self, item) -> None: break def cancel_task(self, item) -> None: - self.logger.info(CANCELLING_TASK.format(item.text())) + # TODO: fix possibly buggy signal behavior task_item = self.task_list.itemWidget(item) - for thread in self.quant_threads: - if thread.log_file == task_item.log_file: - thread.terminate() - task_item.update_status(CANCELED) - self.quant_threads.remove(thread) - break + if task_item: + task_name = task_item.task_name # Store the name before any changes + self.logger.info(CANCELLING_TASK.format(task_name)) + + # Find the thread and disconnect signals before terminating + for thread in self.quant_threads: + if thread.log_file == task_item.log_file: + # Disconnect all signals from this thread first + try: + thread.error_signal.disconnect() # Disconnect all error signal connections + thread.output_signal.disconnect() # Disconnect all output signal connections + except TypeError: + # No connections to disconnect + pass + + # Now terminate the thread + thread.terminate() + self.quant_threads.remove(thread) + break def delete_task(self, item) -> None: - self.logger.info(DELETING_TASK.format(item.text())) + task_item = self.task_list.itemWidget(item) + if not task_item: + return - # Cancel the task first - self.cancel_task(item) + task_name = task_item.task_name # Store task_name before deletion + self.logger.info(DELETING_TASK.format(task_name)) reply = QMessageBox.question( self, @@ -126,13 +141,17 @@ def delete_task(self, item) -> None: QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No, QMessageBox.StandardButton.No, ) + if reply == QMessageBox.StandardButton.Yes: - task_item = self.task_list.itemWidget(item) + # Cancel the task first (which disconnects signals) + self.cancel_task(item) + + # Now remove from list and delete row = self.task_list.row(item) self.task_list.takeItem(row) - if task_item: - task_item.deleteLater() + # Delete the widget after removing from list + task_item.deleteLater() def update_status(self, status) -> None: self.status = status diff --git a/src/convert_hf_to_gguf.py b/src/convert_hf_to_gguf.py index 765187f..ffd1f76 100644 --- a/src/convert_hf_to_gguf.py +++ b/src/convert_hf_to_gguf.py @@ -24,6 +24,7 @@ cast, ) from itertools import chain +from transformers import AutoConfig import math import numpy as np @@ -37,6 +38,9 @@ logger = logging.getLogger("hf-to-gguf") +###### MODEL DEFINITIONS ###### + + class SentencePieceTokenTypes(IntEnum): NORMAL = 1 UNKNOWN = 2 @@ -46,11 +50,19 @@ class SentencePieceTokenTypes(IntEnum): BYTE = 6 -AnyModel = TypeVar("AnyModel", bound="type[Model]") +class ModelType(IntEnum): + TEXT = 1 + VISION 
= 2 -class Model: - _model_classes: dict[str, type[Model]] = {} +AnyModel = TypeVar("AnyModel", bound="type[ModelBase]") + + +class ModelBase: + _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = { + ModelType.TEXT: {}, + ModelType.VISION: {}, + } dir_model: Path ftype: gguf.LlamaFileType @@ -62,21 +74,26 @@ class Model: part_names: list[str] is_safetensors: bool hparams: dict[str, Any] - block_count: int - tensor_map: gguf.TensorNameMap tensor_names: set[str] | None gguf_writer: gguf.GGUFWriter model_name: str | None metadata_override: Path | None dir_model_card: Path + remote_hf_model_id: str | None + # subclasses should define this! model_arch: gguf.MODEL_ARCH + # subclasses should initialize this! + block_count: int + tensor_map: gguf.TensorNameMap + def __init__( self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, + *, is_big_endian: bool = False, use_temp_file: bool = False, eager: bool = False, @@ -87,8 +104,13 @@ def __init__( dry_run: bool = False, small_first_shard: bool = False, hparams: dict[str, Any] | None = None, + remote_hf_model_id: str | None = None, ): - if type(self) is Model: + if ( + type(self) is ModelBase + or type(self) is TextModel + or type(self) is VisionModel + ): raise TypeError( f"{type(self).__name__!r} should not be directly instantiated" ) @@ -101,29 +123,50 @@ def __init__( gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE ) self.use_temp_file = use_temp_file - self.lazy = not eager - self.part_names = Model.get_model_part_names( - self.dir_model, "model", ".safetensors" - ) - self.is_safetensors = len(self.part_names) > 0 - if not self.is_safetensors: - self.part_names = Model.get_model_part_names( - self.dir_model, "pytorch_model", ".bin" + self.lazy = not eager or (remote_hf_model_id is not None) + self.remote_hf_model_id = remote_hf_model_id + if remote_hf_model_id is not None: + self.is_safetensors = True + + def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: + logger.info( + f"Using remote model with HuggingFace id: {remote_hf_model_id}" + ) + remote_tensors = ( + gguf.utility.SafetensorRemote.get_list_tensors_hf_model( + remote_hf_model_id + ) + ) + self.tensor_names = set(name for name in remote_tensors.keys()) + for ( + name, + remote_tensor, + ) in gguf.utility.SafetensorRemote.get_list_tensors_hf_model( + remote_hf_model_id + ).items(): + yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor)) + + self.get_tensors = get_remote_tensors + else: + self.part_names = ModelBase.get_model_part_names( + self.dir_model, "model", ".safetensors" ) + self.is_safetensors = len(self.part_names) > 0 + if not self.is_safetensors: + self.part_names = ModelBase.get_model_part_names( + self.dir_model, "pytorch_model", ".bin" + ) self.hparams = ( - Model.load_hparams(self.dir_model) if hparams is None else hparams + ModelBase.load_hparams(self.dir_model) if hparams is None else hparams ) - self.block_count = self.find_hparam( - ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] - ) - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None self.metadata_override = metadata_override self.model_name = model_name - self.dir_model_card = dir_model + self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py + # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: - + # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. 
_, first_tensor = next(self.get_tensors()) if first_tensor.dtype == torch.float16: logger.info( @@ -136,6 +179,7 @@ def __init__( ) self.ftype = gguf.LlamaFileType.MOSTLY_BF16 + # Configure GGUF Writer self.gguf_writer = gguf.GGUFWriter( path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], @@ -148,10 +192,10 @@ def __init__( ) @classmethod - def __init_subclass__(cls): - - if "model_arch" not in cls.__dict__: - raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") + def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path: + stem, suffix = path.stem, path.suffix + new_name = f"{prefix}{stem}{suffix}" + return path.with_name(new_name) def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: key = next((k for k in keys if k in self.hparams), None) @@ -161,9 +205,6 @@ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: return None raise KeyError(f"could not find any of: {keys}") - def set_vocab(self): - self._set_vocab_gpt2() - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_names_from_parts: set[str] = set() @@ -220,6 +261,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: data = LazyTorchTensor.from_eager(data) yield name, data + # verify tensor name presence and identify potentially missing files if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: missing = sorted(self.tensor_names.difference(tensor_names_from_parts)) extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) @@ -278,12 +320,342 @@ def map_tensor_name( raise ValueError(f"Can not map tensor {name!r}") return new_name + def set_gguf_parameters(self): + raise NotImplementedError( + "set_gguf_parameters() must be implemented in subclasses" + ) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + return [(self.map_tensor_name(name), data_torch)] + + def tensor_force_quant( + self, name: str, new_name: str, bid: int | None, n_dims: int + ) -> gguf.GGMLQuantizationType | bool: + del name, new_name, bid, n_dims # unused + + return False + + # some models need extra generated tensors (like rope_freqs) + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + return () + + def prepare_tensors(self): + max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len( + ".weight," + ) + + for name, data_torch in chain( + self.generate_extra_tensors(), self.get_tensors() + ): + # we don't need these + if name.endswith( + (".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq") + ): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # use the first number-like part of the tensor name as the block id + bid = None + for part in name.split("."): + if part.isdecimal(): + bid = int(part) + break + + for new_name, data_torch in self.modify_tensors(data_torch, name, bid): + # TODO: why do we squeeze here? 
+ # data = data_torch.squeeze().numpy() + data = data_torch.numpy() + + # if data ends up empty, it means data_torch was a scalar tensor -> restore + if len(data.shape) == 0: + data = data_torch.numpy() + + n_dims = len(data.shape) + data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant( + name, new_name, bid, n_dims + ) + + # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors + if n_dims <= 1 or new_name.endswith("_norm.weight"): + data_qtype = gguf.GGMLQuantizationType.F32 + + # Conditions should closely match those in llama_model_quantize_internal in llama.cpp + # Some tensor types are always in float32 + if data_qtype is False and ( + any( + self.match_model_tensor_name(new_name, key, bid) + for key in ( + gguf.MODEL_TENSOR.FFN_GATE_INP, + gguf.MODEL_TENSOR.POS_EMBD, + gguf.MODEL_TENSOR.TOKEN_TYPES, + gguf.MODEL_TENSOR.SSM_CONV1D, + gguf.MODEL_TENSOR.TIME_MIX_FIRST, + gguf.MODEL_TENSOR.TIME_MIX_W1, + gguf.MODEL_TENSOR.TIME_MIX_W2, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, + gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED, + gguf.MODEL_TENSOR.POSNET_NORM1, + gguf.MODEL_TENSOR.POSNET_NORM2, + ) + ) + or not new_name.endswith(".weight") + ): + data_qtype = gguf.GGMLQuantizationType.F32 + + if data_qtype is False and any( + self.match_model_tensor_name(new_name, key, bid) + for key in ( + gguf.MODEL_TENSOR.TOKEN_EMBD, + gguf.MODEL_TENSOR.OUTPUT, + ) + ): + if self.ftype in ( + gguf.LlamaFileType.MOSTLY_TQ1_0, + gguf.LlamaFileType.MOSTLY_TQ2_0, + ): + # TODO: use Q4_K and Q6_K + data_qtype = gguf.GGMLQuantizationType.F16 + + # No override (data_qtype is False), or wants to be quantized (data_qtype is True) + if isinstance(data_qtype, bool): + if self.ftype == gguf.LlamaFileType.ALL_F32: + data_qtype = gguf.GGMLQuantizationType.F32 + elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: + data_qtype = gguf.GGMLQuantizationType.F16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data_qtype = gguf.GGMLQuantizationType.BF16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + data_qtype = gguf.GGMLQuantizationType.Q8_0 + elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0: + data_qtype = gguf.GGMLQuantizationType.TQ1_0 + elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: + data_qtype = gguf.GGMLQuantizationType.TQ2_0 + else: + raise ValueError(f"Unknown file type: {self.ftype.name}") + + try: + data = gguf.quants.quantize(data, data_qtype) + except gguf.QuantError as e: + logger.warning("%s, %s", e, "falling back to F16") + data_qtype = gguf.GGMLQuantizationType.F16 + data = gguf.quants.quantize(data, data_qtype) + + shape = ( + gguf.quant_shape_from_byte_shape(data.shape, data_qtype) + if data.dtype == np.uint8 + else data.shape + ) + + # reverse shape to make it similar to the internal ggml dimension order + shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" + + # n_dims is implicit in the shape + logger.info( + f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}" + ) + + self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.MODEL) + + def prepare_metadata(self, vocab_only: bool): + + total_params, shared_params, expert_params, expert_count = ( + self.gguf_writer.get_total_parameter_count() + ) + + self.metadata = gguf.Metadata.load( + self.metadata_override, self.dir_model_card, self.model_name, total_params + ) + + # If we are using HF model id, set the metadata name to the model 
id + if self.remote_hf_model_id: + self.metadata.name = self.remote_hf_model_id + + # Fallback to model directory name if metadata name is still missing + if self.metadata.name is None: + self.metadata.name = self.dir_model.name + + # Generate parameter weight class (useful for leader boards) if not yet determined + if self.metadata.size_label is None and total_params > 0: + self.metadata.size_label = gguf.size_label( + total_params, shared_params, expert_params, expert_count + ) + + self.set_type() + + logger.info("Set meta model") + self.metadata.set_gguf_meta_model(self.gguf_writer) + + logger.info("Set model parameters") + self.set_gguf_parameters() + + logger.info("Set model quantization version") + self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + + def write_vocab(self): + raise NotImplementedError("write_vocab() must be implemented in subclasses") + + def write(self): + self.prepare_tensors() + self.prepare_metadata(vocab_only=False) + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) + self.gguf_writer.close() + + @staticmethod + def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: + part_names: list[str] = [] + for filename in os.listdir(dir_model): + if filename.startswith(prefix) and filename.endswith(suffix): + part_names.append(filename) + + part_names.sort() + + return part_names + + @staticmethod + def load_hparams(dir_model: Path): + try: + # for security reason, we don't allow loading remote code by default + # if a model need remote code, we will fallback to config.json + return AutoConfig.from_pretrained( + dir_model, trust_remote_code=False + ).to_dict() + except Exception as e: + logger.warning(f"Failed to load model config from {dir_model}: {e}") + logger.warning("Trying to load config.json instead") + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + config = json.load(f) + if "llm_config" in config: + # rename for InternVL + config["text_config"] = config["llm_config"] + return config + + @classmethod + def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: + assert names + + def func(modelcls: AnyModel) -> AnyModel: + model_type = ( + ModelType.VISION + if modelcls.model_arch == gguf.MODEL_ARCH.CLIP_VISION + else ModelType.TEXT + ) + for name in names: + cls._model_classes[model_type][name] = modelcls + return modelcls + + return func + + @classmethod + def print_registered_models(cls): + for model_type, model_classes in cls._model_classes.items(): + logger.error(f"{model_type.name} models:") + for name in sorted(model_classes.keys()): + logger.error(f" - {name}") + + @classmethod + def from_model_architecture( + cls, arch: str, model_type=ModelType.TEXT + ) -> type[ModelBase]: + try: + return cls._model_classes[model_type][arch] + except KeyError: + raise NotImplementedError(f"Architecture {arch!r} not supported!") from None + + +class TextModel(ModelBase): + model_type = ModelType.TEXT + hf_arch: str + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.hf_arch = get_model_architecture(self.hparams, self.model_type) + + if "text_config" in self.hparams: + # move the text_config to the root level + self.hparams = {**self.hparams, **self.hparams["text_config"]} + + self.block_count = self.find_hparam( + ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] + ) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + 
@classmethod + def __init_subclass__(cls): + # can't use an abstract property, because overriding it without type errors + # would require using decorated functions instead of simply defining the property + if "model_arch" not in cls.__dict__: + raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") + + def set_vocab(self): + self._set_vocab_gpt2() + + def prepare_metadata(self, vocab_only: bool): + super().prepare_metadata(vocab_only=vocab_only) + + total_params = self.gguf_writer.get_total_parameter_count()[0] + # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' + output_type: str = self.ftype.name.partition("_")[2] + + # Filename Output + if self.fname_out.is_dir(): + # Generate default filename based on model specification and available metadata + if not vocab_only: + fname_default: str = gguf.naming_convention( + self.metadata.name, + self.metadata.basename, + self.metadata.finetune, + self.metadata.version, + self.metadata.size_label, + output_type, + model_type="LoRA" if total_params < 0 else None, + ) + else: + fname_default: str = gguf.naming_convention( + self.metadata.name, + self.metadata.basename, + self.metadata.finetune, + self.metadata.version, + size_label=None, + output_type=None, + model_type="vocab", + ) + + # Use the default filename + self.fname_out = self.fname_out / f"{fname_default}.gguf" + else: + # Output path is a custom defined templated filename + # Note: `not is_dir()` is used because `.is_file()` will not detect + # file template strings as it doesn't actually exist as a file + + # Process templated file name with the output ftype, useful with the "auto" ftype + self.fname_out = self.fname_out.parent / gguf.fill_templated_filename( + self.fname_out.name, output_type + ) + + logger.info("Set model tokenizer") + self.set_vocab() + def set_gguf_parameters(self): self.gguf_writer.add_block_count(self.block_count) if ( n_ctx := self.find_hparam( - ["max_position_embeddings", "n_ctx"], optional=True + ["max_position_embeddings", "n_ctx", "n_positions"], optional=True ) ) is not None: self.gguf_writer.add_context_length(n_ctx) @@ -338,211 +710,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) logger.info(f"gguf: file type = {self.ftype}") - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: - del bid - - return [(self.map_tensor_name(name), data_torch)] - - def tensor_force_quant( - self, name: str, new_name: str, bid: int | None, n_dims: int - ) -> gguf.GGMLQuantizationType | bool: - del name, new_name, bid, n_dims - - return False - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - return () - - def prepare_tensors(self): - max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len( - ".weight," - ) - - for name, data_torch in chain( - self.generate_extra_tensors(), self.get_tensors() - ): - - if name.endswith( - (".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq") - ): - continue - - old_dtype = data_torch.dtype - - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - bid = None - for part in name.split("."): - if part.isdecimal(): - bid = int(part) - break - - for new_name, data_torch in self.modify_tensors(data_torch, name, bid): - - data = data_torch.numpy() - - if len(data.shape) == 0: - data = data_torch.numpy() - - n_dims = len(data.shape) - data_qtype: gguf.GGMLQuantizationType | bool = 
self.tensor_force_quant( - name, new_name, bid, n_dims - ) - - if n_dims <= 1 or new_name.endswith("_norm.weight"): - data_qtype = gguf.GGMLQuantizationType.F32 - - if data_qtype is False and ( - any( - self.match_model_tensor_name(new_name, key, bid) - for key in ( - gguf.MODEL_TENSOR.FFN_GATE_INP, - gguf.MODEL_TENSOR.POS_EMBD, - gguf.MODEL_TENSOR.TOKEN_TYPES, - gguf.MODEL_TENSOR.SSM_CONV1D, - gguf.MODEL_TENSOR.TIME_MIX_FIRST, - gguf.MODEL_TENSOR.TIME_MIX_W1, - gguf.MODEL_TENSOR.TIME_MIX_W2, - gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, - gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, - gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED, - gguf.MODEL_TENSOR.POSNET_NORM1, - gguf.MODEL_TENSOR.POSNET_NORM2, - ) - ) - or not new_name.endswith(".weight") - ): - data_qtype = gguf.GGMLQuantizationType.F32 - - if data_qtype is False and any( - self.match_model_tensor_name(new_name, key, bid) - for key in ( - gguf.MODEL_TENSOR.TOKEN_EMBD, - gguf.MODEL_TENSOR.OUTPUT, - ) - ): - if self.ftype in ( - gguf.LlamaFileType.MOSTLY_TQ1_0, - gguf.LlamaFileType.MOSTLY_TQ2_0, - ): - - data_qtype = gguf.GGMLQuantizationType.F16 - - if isinstance(data_qtype, bool): - if self.ftype == gguf.LlamaFileType.ALL_F32: - data_qtype = gguf.GGMLQuantizationType.F32 - elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: - data_qtype = gguf.GGMLQuantizationType.F16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: - data_qtype = gguf.GGMLQuantizationType.BF16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: - data_qtype = gguf.GGMLQuantizationType.Q8_0 - elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0: - data_qtype = gguf.GGMLQuantizationType.TQ1_0 - elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: - data_qtype = gguf.GGMLQuantizationType.TQ2_0 - else: - raise ValueError(f"Unknown file type: {self.ftype.name}") - - try: - data = gguf.quants.quantize(data, data_qtype) - except gguf.QuantError as e: - logger.warning("%s, %s", e, "falling back to F16") - data_qtype = gguf.GGMLQuantizationType.F16 - data = gguf.quants.quantize(data, data_qtype) - - shape = ( - gguf.quant_shape_from_byte_shape(data.shape, data_qtype) - if data.dtype == np.uint8 - else data.shape - ) - - shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" - - logger.info( - f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}" - ) - - self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) - - def set_type(self): - self.gguf_writer.add_type(gguf.GGUFType.MODEL) - - def prepare_metadata(self, vocab_only: bool): - - total_params, shared_params, expert_params, expert_count = ( - self.gguf_writer.get_total_parameter_count() - ) - - self.metadata = gguf.Metadata.load( - self.metadata_override, self.dir_model_card, self.model_name, total_params - ) - - if self.metadata.name is None: - self.metadata.name = self.dir_model.name - - if self.metadata.size_label is None and total_params > 0: - self.metadata.size_label = gguf.size_label( - total_params, shared_params, expert_params, expert_count - ) - - output_type: str = self.ftype.name.partition("_")[2] - - if self.fname_out.is_dir(): - - if not vocab_only: - fname_default: str = gguf.naming_convention( - self.metadata.name, - self.metadata.basename, - self.metadata.finetune, - self.metadata.version, - self.metadata.size_label, - output_type, - model_type="LoRA" if total_params < 0 else None, - ) - else: - fname_default: str = gguf.naming_convention( - self.metadata.name, - self.metadata.basename, - self.metadata.finetune, - self.metadata.version, - size_label=None, - 
output_type=None, - model_type="vocab", - ) - - self.fname_out = self.fname_out / f"{fname_default}.gguf" - else: - - self.fname_out = self.fname_out.parent / gguf.fill_templated_filename( - self.fname_out.name, output_type - ) - - self.set_type() - - logger.info("Set meta model") - self.metadata.set_gguf_meta_model(self.gguf_writer) - - logger.info("Set model parameters") - self.set_gguf_parameters() - - logger.info("Set model tokenizer") - self.set_vocab() - - logger.info("Set model quantization version") - self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - - def write(self): - self.prepare_tensors() - self.prepare_metadata(vocab_only=False) - self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file(progress=True) - self.gguf_writer.close() - def write_vocab(self): if len(self.gguf_writer.tensors) != 1: raise ValueError("Splitting the vocabulary is not supported") @@ -552,45 +719,6 @@ def write_vocab(self): self.gguf_writer.write_kv_data_to_file() self.gguf_writer.close() - @staticmethod - def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: - part_names: list[str] = [] - for filename in os.listdir(dir_model): - if filename.startswith(prefix) and filename.endswith(suffix): - part_names.append(filename) - - part_names.sort() - - return part_names - - @staticmethod - def load_hparams(dir_model: Path): - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - return json.load(f) - - @classmethod - def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: - assert names - - def func(modelcls: AnyModel) -> AnyModel: - for name in names: - cls._model_classes[name] = modelcls - return modelcls - - return func - - @classmethod - def print_registered_models(cls): - for name in sorted(cls._model_classes.keys()): - logger.error(f"- {name}") - - @classmethod - def from_model_architecture(cls, arch: str) -> type[Model]: - try: - return cls._model_classes[arch] - except KeyError: - raise NotImplementedError(f"Architecture {arch!r} not supported!") from None - def does_token_look_special(self, token: str | bytes) -> bool: if isinstance(token, (bytes, bytearray)): token_text = token.decode(encoding="utf-8") @@ -599,11 +727,13 @@ def does_token_look_special(self, token: str | bytes) -> bool: else: token_text = token + # Some models mark some added tokens which ought to be control tokens as not special. + # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2}) seems_special = token_text in ( - "", + "", # deepseek-coder "", "<2mass>", - "[@BOS@]", + "[@BOS@]", # gemma{,-2} ) seems_special = seems_special or ( @@ -611,14 +741,16 @@ def does_token_look_special(self, token: str | bytes) -> bool: ) seems_special = seems_special or ( token_text.startswith("<|") and token_text.endswith("|>") - ) + ) # deepseek-coder + # TODO: should these be marked as UNUSED instead? (maybe not) seems_special = seems_special or ( token_text.startswith("") - ) + ) # gemma{,-2} return seems_special + # used for GPT-2 BPE and WordPiece vocabs def get_vocab_base(self) -> tuple[list[str], list[int], str]: tokens: list[str] = [] toktypes: list[int] = [] @@ -645,7 +777,8 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: else: token: str = reverse_vocab[i] if token in added_vocab: - + # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. 
+ # To avoid unexpected issues - we make sure to normalize non-normalized tokens if not added_tokens_decoder[i].normalized: previous_token = token token = tokenizer.decode( @@ -661,8 +794,11 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: ): toktypes.append(gguf.TokenType.CONTROL) else: - - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") + # NOTE: this was added for Gemma. + # Encoding and decoding the tokens above isn't sufficient for this case. + token = token.replace( + b"\xe2\x96\x81".decode("utf-8"), " " + ) # pre-normalize user-defined spaces toktypes.append(gguf.TokenType.USER_DEFINED) else: toktypes.append(gguf.TokenType.NORMAL) @@ -670,7 +806,15 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: return tokens, toktypes, tokpre + # NOTE: this function is generated by convert_hf_to_gguf_update.py + # do not modify it manually! + # ref: https://github.com/ggml-org/llama.cpp/pull/6920 + # Marker: Start get_vocab_base_pre def get_vocab_base_pre(self, tokenizer) -> str: + # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that + # is specific for the BPE pre-tokenizer used by the model + # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can + # use in llama.cpp to implement the same pre-tokenizer chktxt = "\n \n\n \n\n\n \t \t\t \t\n \n \n \n \nπŸš€ (normal) 😢\u200d🌫️ (multiple emojis concatenated) βœ… πŸ¦™πŸ¦™ 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…πŸ˜ ?ζˆ‘ζƒ³εœ¨appleε·₯作1314151倩~ ------======= Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ ''''''```````\"\"\"\"......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL" @@ -682,133 +826,157 @@ def get_vocab_base_pre(self, tokenizer) -> str: res = None + # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script + # or pull the latest version of the model from Huggingface + # don't edit the hashes manually! 
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": - + # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B res = "llama-bpe" if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": - + # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base res = "deepseek-llm" if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": - + # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base res = "deepseek-coder" if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": - + # ref: https://huggingface.co/tiiuae/falcon-7b res = "falcon" if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": - + # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base res = "falcon3" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - + # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 res = "bert-bge" if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": - + # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5 res = "bert-bge-large" if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - + # ref: https://huggingface.co/mosaicml/mpt-7b res = "mpt" if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34": - + # ref: https://huggingface.co/bigcode/starcoder2-3b res = "starcoder" if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": - + # ref: https://huggingface.co/openai-community/gpt2 res = "gpt-2" if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": - + # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b res = "stablelm2" if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": - + # ref: https://huggingface.co/smallcloudai/Refact-1_6-base res = "refact" if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": - + # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 res = "command-r" if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": - + # ref: https://huggingface.co/Qwen/Qwen1.5-7B res = "qwen2" if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - + # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf res = "olmo" if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": - + # ref: https://huggingface.co/databricks/dbrx-base res = "dbrx" if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448": - + # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en res = "jina-v1-en" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en res = "jina-v2-en" if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643": - + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es res = "jina-v2-es" if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": - + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de res = "jina-v2-de" if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": - + # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct res = "smaug-bpe" if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360": - + # ref: https://huggingface.co/LumiOpen/Poro-34B-chat res = "poro-chat" if chkhsh == 
"7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": - + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code res = "jina-v2-code" if ( chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516" ): - + # ref: https://huggingface.co/THUDM/glm-4-9b-chat res = "chatglm-bpe" if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": - + # ref: https://huggingface.co/LumiOpen/Viking-7B res = "viking" if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": - + # ref: https://huggingface.co/core42/jais-13b res = "jais" if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f": - + # ref: https://huggingface.co/WisdomShell/CodeShell-7B res = "codeshell" if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e": - + # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407 res = "tekken" if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": - + # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M res = "smollm" if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7": - + # ref: https://huggingface.co/bigscience/bloom res = "bloom" if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21": - + # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small res = "gpt3-finnish" if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae": - + # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct res = "exaone" if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085": - + # ref: https://huggingface.co/microsoft/phi-2 res = "phi-2" if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": - + # ref: https://huggingface.co/facebook/chameleon-7b res = "chameleon" if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": - + # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 res = "minerva-7b" if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65": - + # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base res = "roberta-bpe" if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb": - + # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct res = "gigachat" if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1": - + # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct res = "megrez" if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": - + # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3 res = "deepseek-v3" if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5": - + # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B res = "deepseek-r1-qwen" if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e": - + # ref: https://huggingface.co/Xenova/gpt-4o res = "gpt-4o" + if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f": + # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k + res = "superbpe" + if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15": + # ref: https://huggingface.co/trillionlabs/Trillion-7B-preview + res = "trillion" + if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224": + # ref: 
https://huggingface.co/inclusionAI/Ling-lite + res = "bailingmoe" + if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406": + # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct + res = "llama4" + if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2": + # ref: https://huggingface.co/THUDM/glm-4-9b-hf + res = "glm4" + if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3": + # ref: https://huggingface.co/mistral-community/pixtral-12b + res = "pixtral" + if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec": + # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base + res = "seed-coder" if res is None: logger.warning("\n") @@ -843,6 +1011,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: logger.debug(f"chkhsh: {chkhsh}") return res + # Marker: End get_vocab_base_pre def _set_vocab_none(self) -> None: self.gguf_writer.add_tokenizer_model("none") @@ -882,6 +1051,7 @@ def _set_vocab_qwen(self): assert len(merged) == 2 merges.append(" ".join(map(QwenModel.token_bytes_to_string, merged))) + # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined added_vocab = tokenizer.special_tokens reverse_vocab = { id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items() @@ -905,7 +1075,7 @@ def _set_vocab_qwen(self): special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) special_vocab.merges = merges - + # only add special tokens when they were not already loaded from config.json if len(special_vocab.special_token_ids) == 0: special_vocab._set_special_token( "bos", tokenizer.special_tokens["<|endoftext|>"] @@ -913,7 +1083,7 @@ def _set_vocab_qwen(self): special_vocab._set_special_token( "eos", tokenizer.special_tokens["<|endoftext|>"] ) - + # this one is usually not in config.json anyway special_vocab._set_special_token( "unk", tokenizer.special_tokens["<|endoftext|>"] ) @@ -1006,7 +1176,9 @@ def _create_vocab_sentencepiece(self): if token_data.get("special") or self.does_token_look_special(token): toktypes[token_id] = SentencePieceTokenTypes.CONTROL else: - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") + token = token.replace( + b"\xe2\x96\x81".decode("utf-8"), " " + ) # pre-normalize user-defined spaces toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED scores[token_id] = -1000.0 @@ -1066,7 +1238,7 @@ def _set_vocab_rwkv_world(self): token = token.encode("utf-8") if isinstance(token, str) else token assert isinstance(token, bytes) assert len(token) == token_len - token_text: str = repr(token)[2:-1] + token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" tokens.append(token_text.encode("utf-8")) toktypes.append(gguf.TokenType.NORMAL) remainder = vocab_size - len(tokens) @@ -1080,7 +1252,7 @@ def _set_vocab_rwkv_world(self): self.gguf_writer.add_token_types(toktypes) special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) special_vocab.chat_template = "rwkv-world" - + # hack: Add '\n\n' as the EOT token to make it chat normally special_vocab._set_special_token("eot", 261) special_vocab.add_to_gguf(self.gguf_writer) @@ -1096,7 +1268,7 @@ def _set_vocab_builtin( default_pre = "mpt" if model_name == "gpt-neox" else "default" field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL) - assert field + assert field # tokenizer model self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8")) field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE) @@ -1105,27 +1277,27 
@@ def _set_vocab_builtin( ) field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST) - assert field + assert field # token list self.gguf_writer.add_token_list( [bytes(field.parts[i]) for i in field.data][:vocab_size] ) if model_name == "llama-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) - assert field + assert field # token scores self.gguf_writer.add_token_scores( [field.parts[i].tolist()[0] for i in field.data][:vocab_size] ) field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) - assert field + assert field # token types self.gguf_writer.add_token_types( [field.parts[i].tolist()[0] for i in field.data][:vocab_size] ) if model_name != "llama-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) - assert field + assert field # token merges self.gguf_writer.add_token_merges( [bytes(field.parts[i]) for i in field.data] ) @@ -1143,9 +1315,107 @@ def _set_vocab_builtin( if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None: self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) + def _try_set_pooling_type(self) -> None: + # get pooling path + pooling_path = None + module_path = self.dir_model / "modules.json" + if module_path.is_file(): + with open(module_path, encoding="utf-8") as f: + modules = json.load(f) + for mod in modules: + if mod["type"] == "sentence_transformers.models.Pooling": + pooling_path = mod["path"] + break -@Model.register("GPTNeoXForCausalLM") -class GPTNeoXModel(Model): + # get pooling type + if pooling_path is not None: + with open( + self.dir_model / pooling_path / "config.json", encoding="utf-8" + ) as f: + pooling = json.load(f) + if pooling["pooling_mode_mean_tokens"]: + pooling_type = gguf.PoolingType.MEAN + elif pooling["pooling_mode_cls_token"]: + pooling_type = gguf.PoolingType.CLS + elif pooling["pooling_mode_lasttoken"]: + pooling_type = gguf.PoolingType.LAST + else: + raise NotImplementedError( + "Only MEAN, CLS, and LAST pooling types supported" + ) + self.gguf_writer.add_pooling_type(pooling_type) + + +class VisionModel(ModelBase): + model_type = ModelType.VISION + model_arch = gguf.MODEL_ARCH.CLIP_VISION + preprocessor_config: dict[str, Any] + global_config: dict[str, Any] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION: + raise TypeError( + "VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION" + ) + + # get n_embd of the text model + if "text_config" not in self.hparams: + self.hparams["text_config"] = {} + text_config = {**self.hparams, **self.hparams["text_config"]} + self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0)) + assert self.n_embd_text > 0, "n_embd not found in hparams" + + if "vision_config" not in self.hparams: + raise ValueError("vision_config not found in hparams") + # move vision config to the top level, while preserving the original hparams in global_config + self.global_config = self.hparams + self.hparams = self.hparams["vision_config"] + + self.block_count = self.find_hparam( + ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"] + ) + self.tensor_map = gguf.get_tensor_name_map( + gguf.MODEL_ARCH.CLIP_VISION, self.block_count + ) + + # load preprocessor config + with open( + self.dir_model / "preprocessor_config.json", "r", encoding="utf-8" + ) as f: + self.preprocessor_config = json.load(f) + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION) + + def set_gguf_parameters(self): + 
self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_vision_projection_dim(self.n_embd_text) + self.gguf_writer.add_vision_has_vision_encoder(True) + + # vision config + self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"])) + self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"])) + self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"])) + self.gguf_writer.add_vision_feed_forward_length( + self.find_hparam(["intermediate_size"]) + ) + self.gguf_writer.add_vision_block_count(self.block_count) + self.gguf_writer.add_vision_head_count( + self.find_hparam(["num_attention_heads"]) + ) + + # preprocessor config + self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"]) + self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"]) + + def write_vocab(self): + raise ValueError("VisionModel does not support vocab writing") + + +@ModelBase.register("GPTNeoXForCausalLM") +class GPTNeoXModel(TextModel): model_arch = gguf.MODEL_ARCH.GPTNEOX def set_gguf_parameters(self): @@ -1170,7 +1440,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) @@ -1178,7 +1448,9 @@ def modify_tensors( tensors: list[tuple[str, Tensor]] = [] if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): - + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) data_torch = torch.cat( ( @@ -1206,8 +1478,8 @@ def modify_tensors( return tensors -@Model.register("BloomForCausalLM", "BloomModel") -class BloomModel(Model): +@ModelBase.register("BloomForCausalLM", "BloomModel") +class BloomModel(TextModel): model_arch = gguf.MODEL_ARCH.BLOOM def set_gguf_parameters(self): @@ -1225,7 +1497,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) @@ -1235,7 +1507,9 @@ def modify_tensors( tensors: list[tuple[str, Tensor]] = [] if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) data_torch = torch.cat( ( @@ -1263,15 +1537,15 @@ def modify_tensors( return tensors -@Model.register("MPTForCausalLM") -class MPTModel(Model): +@ModelBase.register("MPTForCausalLM") +class MPTModel(TextModel): model_arch = gguf.MODEL_ARCH.MPT def set_vocab(self): try: self._set_vocab_gpt2() except Exception: - + # Fallback for SEA-LION model self._set_vocab_sentencepiece() 
self.gguf_writer.add_add_bos_token(False) self.gguf_writer.add_pad_token_id(3) @@ -1300,7 +1574,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused if "scales" in name: new_name = self.map_tensor_name( @@ -1313,8 +1587,8 @@ def modify_tensors( return [(new_name, data_torch)] -@Model.register("OrionForCausalLM") -class OrionModel(Model): +@ModelBase.register("OrionForCausalLM") +class OrionModel(TextModel): model_arch = gguf.MODEL_ARCH.ORION def set_vocab(self): @@ -1343,12 +1617,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) - + # note: config provides rms norm but it is actually layer norm + # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) -@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") -class BaichuanModel(Model): +@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM") +class BaichuanModel(TextModel): model_arch = gguf.MODEL_ARCH.BAICHUAN def set_vocab(self): @@ -1382,15 +1657,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) + rope_scaling = self.hparams.get("rope_scaling") or {} if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] + rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" + and "factor" in rope_scaling ): - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None @@ -1456,8 +1729,8 @@ def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: return weights[r * n_part : r * n_part + r, ...] -@Model.register("XverseForCausalLM") -class XverseModel(Model): +@ModelBase.register("XverseForCausalLM") +class XverseModel(TextModel): model_arch = gguf.MODEL_ARCH.XVERSE def set_vocab(self): @@ -1472,7 +1745,8 @@ def set_vocab(self): tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - + # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size, + # because vocab_size is the count of items, and indexes start at 0. 
max_vocab_index = max(tokenizer.get_vocab().values()) if max_vocab_index >= vocab_size: raise ValueError("Vocabulary size exceeds expected maximum size.") @@ -1484,12 +1758,12 @@ def set_vocab(self): for token_id in range(vocab_size): token_text = reverse_vocab[token_id].encode("utf-8") - + # replace "\x00" to string with length > 0 if token_text == b"\x00": - toktype = gguf.TokenType.BYTE + toktype = gguf.TokenType.BYTE # special token_text = f"<{token_text}>".encode("utf-8") elif re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text): - toktype = gguf.TokenType.BYTE + toktype = gguf.TokenType.BYTE # special elif reverse_vocab[token_id] in added_vocab: if tokenizer.added_tokens_decoder[token_id].special: toktype = gguf.TokenType.CONTROL @@ -1537,24 +1811,23 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) + rope_scaling = self.hparams.get("rope_scaling") or {} if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] + rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" + and "factor" in rope_scaling ): - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) + # HF models permute some of the tensors, so we need to undo that if name.endswith("q_proj.weight"): data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) if name.endswith("k_proj.weight"): @@ -1577,25 +1850,25 @@ def _reverse_hf_permute( ) -@Model.register("FalconForCausalLM", "RWForCausalLM") -class FalconModel(Model): +@ModelBase.register("FalconForCausalLM", "RWForCausalLM") +class FalconModel(TextModel): model_arch = gguf.MODEL_ARCH.FALCON def set_gguf_parameters(self): block_count = self.hparams.get("num_hidden_layers") if block_count is None: - block_count = self.hparams["n_layer"] + block_count = self.hparams["n_layer"] # old name n_head = self.hparams.get("num_attention_heads") if n_head is None: - n_head = self.hparams["n_head"] + n_head = self.hparams["n_head"] # old name n_head_kv = self.hparams.get("num_kv_heads") if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) + n_head_kv = self.hparams.get("n_head_kv", 1) # old name - self.gguf_writer.add_context_length(2048) - self.gguf_writer.add_tensor_data_layout("jploski") + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -1607,7 +1880,17 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + + # QKV tensor transform + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key 
+ # and one value weight (shared by all query heads in the kv group). + # This layout makes it a big pain to work with in GGML. + # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py if "query_key_value" in name: n_head = self.find_hparam(["num_attention_heads", "n_head"]) @@ -1627,8 +1910,8 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("GPTBigCodeForCausalLM") -class StarCoderModel(Model): +@ModelBase.register("GPTBigCodeForCausalLM") +class StarCoderModel(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER def set_gguf_parameters(self): @@ -1644,13 +1927,14 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) -@Model.register("GPTRefactForCausalLM") -class RefactModel(Model): +@ModelBase.register("GPTRefactForCausalLM") +class RefactModel(TextModel): model_arch = gguf.MODEL_ARCH.REFACT def set_vocab(self): super().set_vocab() + # TODO: how to determine special FIM tokens automatically? special_vocab = gguf.SpecialVocab( self.dir_model, load_merges=False, @@ -1659,7 +1943,7 @@ def set_vocab(self): special_vocab._set_special_token("prefix", 1) special_vocab._set_special_token("suffix", 3) special_vocab._set_special_token("middle", 2) - special_vocab.chat_template = None + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): @@ -1671,6 +1955,7 @@ def set_gguf_parameters(self): block_count = self.hparams["n_layer"] + # refact uses Alibi. So this is from config.json which might be used by training. self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -1733,17 +2018,17 @@ def modify_tensors( return tensors -@Model.register( +@ModelBase.register( "StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM" ) -class StableLMModel(Model): +class StableLMModel(TextModel): model_arch = gguf.MODEL_ARCH.STABLELM def set_vocab(self): if (self.dir_model / "tokenizer.json").is_file(): self._set_vocab_gpt2() else: - + # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab self._set_vocab_qwen() def set_gguf_parameters(self): @@ -1822,7 +2107,7 @@ def _stack_qk_norm( layer_name: str = "q_layernorm", ): datas: list[Tensor] = [] - + # extract the norms in order for xid in range(n_head): ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" datas.append(norms[ename]) @@ -1838,7 +2123,7 @@ def prepare_tensors(self): super().prepare_tensors() if self._q_norms is not None or self._k_norms is not None: - + # flatten two `list[dict[str, Tensor]]` into a single `list[str]` norms = ( [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None @@ -1852,11 +2137,25 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register( - "LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM" +@ModelBase.register( + "LLaMAForCausalLM", + "LlamaForCausalLM", + "MistralForCausalLM", + "MixtralForCausalLM", + "VLlama3ForCausalLM", + "LlavaForConditionalGeneration", ) -class LlamaModel(Model): +class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA + undo_permute = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # fix for 
SmolVLM2, missing `num_attention_heads` in config.json + if self.hf_arch == "VLlama3ForCausalLM": + self.hparams["num_attention_heads"] = self.hparams.get( + "num_attention_heads", 32 + ) def set_vocab(self): try: @@ -1865,9 +2164,10 @@ def set_vocab(self): try: self._set_vocab_llama_hf() except (FileNotFoundError, TypeError): - + # Llama 3 self._set_vocab_gpt2() + # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) if self.hparams.get("vocab_size", 32000) == 32016: special_vocab = gguf.SpecialVocab( self.dir_model, @@ -1889,6 +2189,7 @@ def set_vocab(self): tokenizer_config_json["add_prefix_space"] ) + # Apply to granite small models only if self.hparams.get("vocab_size", 32000) == 49152: self.gguf_writer.add_add_bos_token(False) @@ -1903,15 +2204,13 @@ def set_gguf_parameters(self): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) + rope_scaling = self.hparams.get("rope_scaling") or {} if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] + rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" + and "factor" in rope_scaling ): - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): @@ -1932,12 +2231,27 @@ def modify_tensors( ) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") + is_vision_tensor = ( + "vision_tower" in name + or "vision_model" in name + or "model.connector" in name + or "multi_modal_projector" in name + ) - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + if is_vision_tensor: + return [] # skip vision tensors + elif name.startswith("model.text_model"): + name = name.replace("text_model.", "") # for SmolVLM + elif name.startswith("language_model."): + name = name.replace("language_model.", "") # for the rest + if self.undo_permute: + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately if name.find("block_sparse_moe.experts") != -1: n_experts = self.hparams["num_local_experts"] @@ -1951,6 +2265,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for wid in ["w1", "w2", "w3"]: datas: list[Tensor] = [] @@ -1993,7 +2308,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor - assert low_freq_wavelen != high_freq_wavelen + # assert low_freq_wavelen != high_freq_wavelen # Errors for Llama4 rope_factors = [] for freq in freqs: @@ -2017,22 +2332,191 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - + # flatten 
`list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("Mistral3ForConditionalGeneration") -class Mistral3Model(LlamaModel): - model_arch = gguf.MODEL_ARCH.LLAMA +@ModelBase.register( + "LlavaForConditionalGeneration", # pixtral + "Mistral3ForConditionalGeneration", # mistral small 3.1 +) +class LlavaVisionModel(VisionModel): + img_break_tok_id = -1 def __init__(self, *args, **kwargs): - hparams = Model.load_hparams(kwargs["dir_model"]) - if "text_config" in hparams: - hparams = {**hparams, **hparams["text_config"]} - kwargs["hparams"] = hparams super().__init__(*args, **kwargs) + if self.hparams["model_type"] == "pixtral": + # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py + self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5) + self.img_break_tok_id = self.get_token_id("[IMG_BREAK]") + logger.info(f"Image break token id: {self.img_break_tok_id}") + else: + raise ValueError(f"Unsupported model type: {self.hparams['model_type']}") + + def get_token_id(self, token: str) -> int: + tokenizer_config_file = self.dir_model / "tokenizer_config.json" + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + added_tokens_decoder = json.load(f)["added_tokens_decoder"] + for id_, token_data in added_tokens_decoder.items(): + if token_data["content"] == token: + return int(id_) + raise ValueError(f"Token '{token}' not found in tokenizer config.") + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if hparams["model_type"] == "pixtral": + self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL) + self.gguf_writer.add_vision_attention_layernorm_eps( + hparams["layer_norm_eps"] + ) + + # hidden_act + if hparams["hidden_act"] == "silu": + self.gguf_writer.add_vision_use_silu(True) + elif hparams["hidden_act"] == "gelu": + self.gguf_writer.add_vision_use_gelu(True) + else: + raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}") + + # spatial_merge_size + if "spatial_merge_size" in self.global_config: + self.gguf_writer.add_vision_spatial_merge_size( + self.global_config["spatial_merge_size"] + ) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + del bid # unused + n_head = self.hparams["num_attention_heads"] + n_kv_head = n_head + + if name.startswith("multi_modal_projector.") or name.startswith( + "vision_tower." 
+ ): + # process vision tensors + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + return [(self.map_tensor_name(name), data_torch)] + + if self.img_break_tok_id > 0 and "embed_tokens.weight" in name: + logger.info(f"Extracting [IMG_BREAK] token embedding from {name}") + # for pixtral model, we need to extract the [IMG_BREAK] token embedding + img_break_embd = data_torch[self.img_break_tok_id] + name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK] + return [(self.map_tensor_name(name), img_break_embd)] + + return [] # skip other tensors + + +@ModelBase.register( + "Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration" +) +class SmolVLMModel(VisionModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.hparams["model_type"] == "smolvlm_vision": + # fix for SmolVLM2, missing some keys in config.json + # default values are taken from transformers code + self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152) + self.hparams["num_attention_heads"] = self.hparams.get( + "num_attention_heads", 16 + ) + self.hparams["intermediate_size"] = self.hparams.get( + "intermediate_size", 3072 + ) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3) + self.gguf_writer.add_vision_attention_layernorm_eps( + self.hparams.get("layer_norm_eps", 1e-5) + ) + self.gguf_writer.add_vision_projector_scale_factor( + self.global_config.get("scale_factor", 2) + ) + self.gguf_writer.add_vision_use_gelu(True) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, new_name, n_dims # unused + if ".embeddings." 
in name: + return gguf.GGMLQuantizationType.F32 + return False + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + del bid # unused + is_vision_tensor = ( + "vision_tower" in name + or "vision_model" in name + or "model.connector" in name + ) + + if is_vision_tensor: + return [(self.map_tensor_name(name), data_torch)] + + return [] # skip other tensors + + +@ModelBase.register("Llama4ForConditionalGeneration") +class Llama4Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA4 + undo_permute = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this + self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"] + self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"] + + def set_vocab(self): + self._set_vocab_gpt2() + self.gguf_writer.add_add_bos_token(True) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_interleave_moe_layer_step( + self.hparams["interleave_moe_layer_step"] + ) + self.gguf_writer.add_expert_feed_forward_length( + self.hparams["intermediate_size_moe"] + ) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + if name.startswith("language_model."): + name = name.replace("language_model.", "") + + # split the gate_up into gate and up + if "gate_up_proj" in name: + name_up = name.replace("gate_up_proj", "up_proj.weight") + name_gate = name.replace("gate_up_proj", "gate_proj.weight") + dim_half = data_torch.shape[-1] // 2 + gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split( + dim_half, dim=-2 + ) + return [ + (self.map_tensor_name(name_gate), gate_proj_weight), + (self.map_tensor_name(name_up), up_proj_weight), + ] + + if name.endswith("down_proj"): + name += ".weight" + data_torch = data_torch.transpose(-1, -2) + + if "multi_modal_projector" in name or "vision_model" in name: + return [] + return super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Mistral3ForConditionalGeneration") +class Mistral3Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): name = name.replace("language_model.", "") @@ -2041,19 +2525,19 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): return super().modify_tensors(data_torch, name, bid) -@Model.register("DeciLMForCausalLM") -class DeciModel(Model): +@ModelBase.register("DeciLMForCausalLM") +class DeciModel(TextModel): model_arch = gguf.MODEL_ARCH.DECI @staticmethod def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int: - + # DeciLM-specific code intermediate_size = int(2 * ffn_mult * n_embd / 3) return DeciModel._find_multiple(intermediate_size, 256) @staticmethod def _find_multiple(n: int, k: int) -> int: - + # DeciLM-specific code if n % k == 0: return n return n + k - (n % k) @@ -2061,13 +2545,25 @@ def _find_multiple(n: int, k: int) -> int: def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - if "block_configs" in self.hparams: + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B _block_configs: list[dict[str, Any]] = self.hparams["block_configs"] assert self.block_count == len(_block_configs) self._num_kv_heads = list() self._num_heads = list() _ffn_multipliers = list() - + # ***linear attention layer*** + # if n_heads_in_group is None and replace_with_linear is 
True + # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads + # ***attention-free layer*** + # if n_heads_in_group is None and replace_with_linear is False + # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 + # ***normal attention-layer*** + # if n_heads_in_group is not None, then + # _num_kv_heads[il] is num_attention_head // n_heads_in_group and + # _num_heads[il] is num_attention_head + # ***dummy layer*** for nemotron 253B + # if n_heads_in_group is None and ffn_mult is None + # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 and _ffn_dims is 0 for il in range(len(_block_configs)): if _block_configs[il]["attention"]["n_heads_in_group"] is None: if _block_configs[il]["attention"]["replace_with_linear"] is True: @@ -2082,7 +2578,10 @@ def __init__(self, *args, **kwargs): // _block_configs[il]["attention"]["n_heads_in_group"] ) self._num_heads.append(self.hparams["num_attention_heads"]) - _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"]) + if _block_configs[il]["ffn"]["ffn_mult"] is None: # dummy layer + _ffn_multipliers.append(0.0) + else: + _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"]) assert self.block_count == len(self._num_kv_heads) assert self.block_count == len(self._num_heads) assert self.block_count == len(_ffn_multipliers) @@ -2103,7 +2602,8 @@ def __init__(self, *args, **kwargs): ] def set_vocab(self): - + # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's + # eos_token from '|eot_id|' to '|end_of_text|' if self.hparams.get("vocab_size", 128256) == 128256: tokens, toktypes, tokpre = self.get_vocab_base() self.gguf_writer.add_tokenizer_model("gpt2") @@ -2114,11 +2614,11 @@ def set_vocab(self): special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) special_vocab.add_to_gguf(self.gguf_writer) else: - + # DeciLM-7B self._set_vocab_llama_hf() def set_gguf_parameters(self): - if "block_configs" in self.hparams: + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B assert self.block_count == len(self._num_kv_heads) assert self.block_count == len(self._num_heads) assert self.block_count == len(self._ffn_dims) @@ -2138,9 +2638,9 @@ def set_gguf_parameters(self): self.hparams["hidden_size"] // self.hparams["num_attention_heads"] ) self.gguf_writer.add_file_type(self.ftype) - else: + else: # DeciLM-7B super().set_gguf_parameters() - if "num_key_value_heads_per_layer" in self.hparams: + if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B self._num_kv_heads: list[int] = self.hparams[ "num_key_value_heads_per_layer" ] @@ -2155,15 +2655,13 @@ def set_gguf_parameters(self): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) + rope_scaling = self.hparams.get("rope_scaling") or {} if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] + rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" + and "factor" in rope_scaling ): - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): @@ -2243,8 +2741,8 @@ def prepare_tensors(self): super().prepare_tensors() -@Model.register("BitnetForCausalLM") 
-class BitnetModel(Model): +@ModelBase.register("BitnetForCausalLM") +class BitnetModel(TextModel): model_arch = gguf.MODEL_ARCH.BITNET def set_vocab(self): @@ -2260,7 +2758,9 @@ def weight_quant(self, weight: Tensor) -> Tensor: weight = weight.float() scale = weight.abs().mean().clamp(min=1e-5) iscale = 1 / scale - + # TODO: multiply by the scale directly instead of inverting it twice + # (this is also unnecessarily doubly inverted upstream) + # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 result = (weight * iscale).round().clamp(-1, 1) / iscale return result.type(dtype) @@ -2281,14 +2781,14 @@ def modify_tensors( gguf.MODEL_TENSOR.FFN_GATE, ] ): - + # transform weight into 1/0/-1 (in fp32) data_torch = self.weight_quant(data_torch) yield (new_name, data_torch) -@Model.register("GrokForCausalLM") -class GrokModel(Model): +@ModelBase.register("GrokForCausalLM") +class GrokModel(TextModel): model_arch = gguf.MODEL_ARCH.GROK def set_vocab(self): @@ -2305,7 +2805,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # process the experts separately if name.find(".moe.") != -1: n_experts = self.hparams["num_local_experts"] @@ -2319,6 +2819,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for wid in ["linear", "linear_1", "linear_v"]: datas: list[Tensor] = [] @@ -2343,8 +2844,8 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("DbrxForCausalLM") -class DbrxModel(Model): +@ModelBase.register("DbrxForCausalLM") +class DbrxModel(TextModel): model_arch = gguf.MODEL_ARCH.DBRX def set_gguf_parameters(self): @@ -2374,17 +2875,26 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused n_expert = self.hparams["ffn_config"]["moe_num_experts"] n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] n_embd = self.hparams["d_model"] + # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose + # original implementation expects (n_expert, n_ff, n_embd) for all experts weights + # But llama.cpp moe graph works differently + # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions + # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor exp_tensor_names = { - "ffn.experts.mlp.w1": None, - "ffn.experts.mlp.w2": (0, 2, 1), + "ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + "ffn.experts.mlp.w2": ( + 0, + 2, + 1, + ), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} "ffn.experts.mlp.v1": None, - } + } # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} experts = False for exp_tensor_name in exp_tensor_names.keys(): @@ -2395,6 +2905,12 @@ def modify_tensors( data_torch = data_torch.permute(*permute_tensor) break + # map tensor names + # In MoE models the ffn tensors are typically most of the model weights, + # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. 
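
For the DBRX expert tensors discussed in the comment above, the view-then-permute step can be pictured on its own. The sizes and the flat `(n_expert * n_ff, n_embd)` starting layout below are assumptions for illustration only:

```python
# Rough sketch of the expert-tensor reshaping; "w2" uses the (0, 2, 1)
# permutation from exp_tensor_names. Toy sizes, hypothetical flat layout.
import torch

n_expert, n_ff, n_embd = 4, 8, 6
flat_w2 = torch.randn(n_expert * n_ff, n_embd)

w2 = flat_w2.view(n_expert, n_ff, n_embd)  # (n_expert, n_ff, n_embd)
w2 = w2.permute(0, 2, 1)                   # -> (n_expert, n_embd, n_ff)

# GGML lists dimensions in reverse order of PyTorch, so this corresponds to
# ne = {n_ff, n_embd, n_expert} on the llama.cpp side.
assert w2.shape == (n_expert, n_embd, n_ff)
```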
+ # Every other model has the weight names ending in .weight, + # let's assume that is the convention which is not the case for dbrx: + # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 new_name = self.map_tensor_name( name if not experts else name + ".weight", try_suffixes=(".weight",) ) @@ -2404,13 +2920,13 @@ def modify_tensors( def tensor_force_quant( self, name: str, new_name: str, bid: int | None, n_dims: int ) -> gguf.GGMLQuantizationType | bool: - del name, new_name, bid + del name, new_name, bid # unused return n_dims > 1 -@Model.register("MiniCPMForCausalLM") -class MiniCPMModel(Model): +@ModelBase.register("MiniCPMForCausalLM") +class MiniCPMModel(TextModel): model_arch = gguf.MODEL_ARCH.MINICPM def set_gguf_parameters(self): @@ -2426,12 +2942,12 @@ def set_gguf_parameters(self): logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] self.gguf_writer.add_logit_scale(logit_scale) logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") - if self.hparams.get("rope_scaling") is not None: - if self.hparams["rope_scaling"].get("type") == "longrope": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) - logger.info( - f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}" - ) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "longrope": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) + logger.info( + f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}" + ) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] @@ -2469,11 +2985,12 @@ def set_vocab(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") + # HF models permute some of the tensors, so we need to undo that if name.endswith(("q_proj.weight")): data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight")): @@ -2482,8 +2999,8 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("MiniCPM3ForCausalLM") -class MiniCPM3Model(Model): +@ModelBase.register("MiniCPM3ForCausalLM") +class MiniCPM3Model(TextModel): model_arch = gguf.MODEL_ARCH.MINICPM3 def set_gguf_parameters(self): @@ -2554,8 +3071,8 @@ def _reverse_hf_permute( ) -@Model.register("QWenLMHeadModel") -class QwenModel(Model): +@ModelBase.register("QWenLMHeadModel") +class QwenModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN @staticmethod @@ -2605,8 +3122,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) -@Model.register("Qwen2ForCausalLM") -class Qwen2Model(Model): +@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM") +class Qwen2Model(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2 def set_vocab(self): @@ -2617,22 +3134,35 @@ def set_vocab(self): def set_gguf_parameters(self): super().set_gguf_parameters() + self._try_set_pooling_type() + rope_scaling = self.hparams.get("rope_scaling") or {} if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] + rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" + and "factor" in rope_scaling ): - if self.hparams["rope_scaling"].get("type") == "yarn": - 
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) - self.gguf_writer.add_rope_scaling_orig_ctx_len( - self.hparams["rope_scaling"]["original_max_position_embeddings"] - ) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len( + rope_scaling["original_max_position_embeddings"] + ) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + if self.hf_arch == "Qwen2Model": + name = f"model.{name}" # map to Qwen2ForCausalLM tensors + if "language_model." in name: + name = name.replace("language_model.", "") # for InternVL + if name.startswith("mlp") or name.startswith("vision_model"): + # skip visual tensors + return [] + yield from super().modify_tensors(data_torch, name, bid) -@Model.register("Qwen2VLForConditionalGeneration") -class Qwen2VLModel(Model): +@ModelBase.register( + "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration" +) +class Qwen2VLModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2VL def set_gguf_parameters(self): @@ -2647,21 +3177,202 @@ def set_vocab(self): except FileNotFoundError: self._set_vocab_gpt2() - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - for name, data in super().get_tensors(): - if name.startswith("visual."): - continue - yield name, data + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + del bid # unused + if name.startswith("visual."): + # skip visual tensors + return [] + return [(self.map_tensor_name(name), data_torch)] -@Model.register("WavTokenizerDec") -class WavTokenizerDecModel(Model): +@ModelBase.register( + "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration" +) +class Qwen2VLVisionModel(VisionModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.hparams["image_size"] = self.hparams.get("image_size", 560) + # rename config.json values + self.hparams["num_attention_heads"] = self.hparams.get("num_heads") + self.hparams["num_hidden_layers"] = self.hparams.get("depth") + if "embed_dim" in self.hparams: # qwen2vl + self.hparams["intermediate_size"] = self.hparams.get("hidden_size") + self.hparams["hidden_size"] = self.hparams.get("embed_dim") + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if self.global_config["model_type"] == "qwen2_vl": + self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN2VL) + elif self.global_config["model_type"] == "qwen2_5_vl": + self.gguf_writer.add_vision_projector_type( + gguf.VisionProjectorType.QWEN25VL + ) + self.gguf_writer.add_vision_use_silu(True) + # find n_wa_pattern (window attention pattern) + fullatt_block_indexes = hparams.get("fullatt_block_indexes") + assert ( + fullatt_block_indexes is not None + ), "fullatt_block_indexes is required for qwen2_5_vl" + n_wa_pattern = fullatt_block_indexes[0] + 1 + # validate n_wa_pattern + for i in range(1, len(fullatt_block_indexes)): + if ( + fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] + != n_wa_pattern + ): + raise ValueError( + f"Invalid fullatt_block_indexes: {fullatt_block_indexes}" + ) + self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern) + else: + raise ValueError( + f"Unknown QwenVL model type: {self.global_config['model_type']}" + ) + # default 
values below are taken from HF tranformers code + self.gguf_writer.add_vision_attention_layernorm_eps( + self.global_config.get("rms_norm_eps", 1e-6) + ) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, name, n_dims # unused + if ".patch_embd." in new_name: + return gguf.GGMLQuantizationType.F16 + if ".position_embd." in new_name: + return gguf.GGMLQuantizationType.F32 + return False + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + del bid # unused + if name.startswith("visual."): + # process visual tensors + # split QKV tensors if needed + if ".qkv." in name: + if data_torch.ndim == 2: # weight + c3, _ = data_torch.shape + else: # bias + c3 = data_torch.shape[0] + assert c3 % 3 == 0 + c = c3 // 3 + wq = data_torch[:c] + wk = data_torch[c : c * 2] + wv = data_torch[c * 2 :] + return [ + (self.map_tensor_name(name.replace("qkv", "q")), wq), + (self.map_tensor_name(name.replace("qkv", "k")), wk), + (self.map_tensor_name(name.replace("qkv", "v")), wv), + ] + elif "patch_embed.proj.weight" in name: + # split Conv3D into Conv2Ds + c1, c2, kt, kh, kw = data_torch.shape + del c1, c2, kh, kw # unused + assert ( + kt == 2 + ), "Current implmentation only support temporal_patch_size of 2" + return [ + ( + gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + + ".weight", + data_torch[:, :, 0, ...], + ), + ( + gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + + ".weight.1", + data_torch[:, :, 1, ...], + ), + ] + else: + return [(self.map_tensor_name(name), data_torch)] + return [] # skip other tensors + + +@ModelBase.register("InternVisionModel") +class InternVisionModel(VisionModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL) + self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) + # hidden_act + if hparams["hidden_act"] == "silu": + self.gguf_writer.add_vision_use_silu(True) + elif hparams["hidden_act"] == "gelu": + self.gguf_writer.add_vision_use_gelu(True) + else: + raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}") + # downsample_ratio + downsample_ratio = self.global_config.get("downsample_ratio") + assert downsample_ratio is not None + self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio)) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, name, n_dims # unused + if ".patch_embd." in new_name: + return gguf.GGMLQuantizationType.F16 + if ".position_embd." in new_name: + return gguf.GGMLQuantizationType.F32 + return False + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + del bid # unused + if name.startswith("vision_model") or name.startswith("mlp"): + # process visual tensors + # correct name + if name.startswith("vision_model"): + name = "vision_tower." + name + if (".ls" in name or "position_embedding" in name) and not name.endswith( + ".weight" + ): + name += ".weight" + # split QKV tensors if needed + if ".qkv." 
in name: + if data_torch.ndim == 2: # weight + c3, _ = data_torch.shape + else: # bias + c3 = data_torch.shape[0] + assert c3 % 3 == 0 + c = c3 // 3 + wq = data_torch[:c] + wk = data_torch[c : c * 2] + wv = data_torch[c * 2 :] + return [ + ( + self.map_tensor_name( + name.replace("attn.qkv", "self_attn.q_proj") + ), + wq, + ), + ( + self.map_tensor_name( + name.replace("attn.qkv", "self_attn.k_proj") + ), + wk, + ), + ( + self.map_tensor_name( + name.replace("attn.qkv", "self_attn.v_proj") + ), + wv, + ), + ] + return [(self.map_tensor_name(name), data_torch)] + return [] # skip other tensors + + +@ModelBase.register("WavTokenizerDec") +class WavTokenizerDecModel(TextModel): model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused if ( name.endswith("codebook.cluster_size") @@ -2697,8 +3408,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_causal_attention(False) -@Model.register("Qwen2MoeForCausalLM") -class Qwen2MoeModel(Model): +@ModelBase.register("Qwen2MoeForCausalLM") +class Qwen2MoeModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2MOE def set_gguf_parameters(self): @@ -2721,13 +3432,25 @@ def set_gguf_parameters(self): logger.info( f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}" ) + # YaRN is not enabled by default + # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts + rope_scaling = self.hparams.get("rope_scaling") or {} + if ( + rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" + and "factor" in rope_scaling + ): + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len( + rope_scaling["original_max_position_embeddings"] + ) _experts: list[dict[str, Tensor]] | None = None def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # process the experts separately if name.find("experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None @@ -2740,6 +3463,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -2765,14 +3489,24 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("GPT2LMHeadModel") -class GPT2Model(Model): +@ModelBase.register("Qwen3ForCausalLM") +class Qwen3Model(Qwen2Model): + model_arch = gguf.MODEL_ARCH.QWEN3 + + +@ModelBase.register("Qwen3MoeForCausalLM") +class Qwen3MoeModel(Qwen2MoeModel): + model_arch = gguf.MODEL_ARCH.QWEN3MOE + + +@ModelBase.register("GPT2LMHeadModel") +class GPT2Model(TextModel): model_arch = gguf.MODEL_ARCH.GPT2 def set_gguf_parameters(self): @@ -2787,10 +3521,11 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused tensors: list[tuple[str, Tensor]] = [] + # we don't need these if name.endswith((".attn.bias", ".attn.masked_bias")): return tensors @@ -2806,8 +3541,8 @@ def modify_tensors( 
return tensors -@Model.register("PhiForCausalLM") -class Phi2Model(Model): +@ModelBase.register("PhiForCausalLM") +class Phi2Model(TextModel): model_arch = gguf.MODEL_ARCH.PHI2 def set_gguf_parameters(self): @@ -2834,12 +3569,12 @@ def set_gguf_parameters(self): self.gguf_writer.add_add_bos_token(False) -@Model.register("Phi3ForCausalLM") -class Phi3MiniModel(Model): +@ModelBase.register("Phi3ForCausalLM") +class Phi3MiniModel(TextModel): model_arch = gguf.MODEL_ARCH.PHI3 def set_vocab(self): - + # Phi-4 model uses GPT2Tokenizer tokenizer_config_file = self.dir_model / "tokenizer_config.json" if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: @@ -2976,7 +3711,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) self.gguf_writer.add_file_type(self.ftype) sliding_window = self.hparams.get("sliding_window") - + # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models if sliding_window is None: sliding_window = 0 self.gguf_writer.add_sliding_window(sliding_window) @@ -2989,13 +3724,16 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rot_pct = self.hparams.get("partial_rotary_factor", 1.0) rope_dims = int(rot_pct * n_embd) // n_head + # write rope scaling for long context (128k) model rope_scaling = self.find_hparam(["rope_scaling"], True) if rope_scaling is None: return scale = max_pos_embds / orig_max_pos_embds - rope_scaling_type = rope_scaling.get("type", "").lower() + rope_scaling_type = rope_scaling.get( + "rope_type", rope_scaling.get("type", "") + ).lower() if len(rope_scaling_type) == 0: raise KeyError("Missing the required key rope_scaling.type") @@ -3040,7 +3778,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: ) -@Model.register("PhiMoEForCausalLM") +@ModelBase.register("PhiMoEForCausalLM") class PhiMoeModel(Phi3MiniModel): model_arch = gguf.MODEL_ARCH.PHIMOE @@ -3054,7 +3792,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # process the experts separately if name.find("block_sparse_moe.experts") != -1: n_experts = self.hparams["num_local_experts"] assert bid is not None @@ -3067,6 +3805,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for w_name in ["w1", "w2", "w3"]: datas: list[Tensor] = [] @@ -3094,14 +3833,14 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("PlamoForCausalLM") -class PlamoModel(Model): +@ModelBase.register("PlamoForCausalLM") +class PlamoModel(TextModel): model_arch = gguf.MODEL_ARCH.PLAMO def set_vocab(self): @@ -3111,12 +3850,14 @@ def set_gguf_parameters(self): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_context_length(4096) + self.gguf_writer.add_context_length(4096) # not in config.json self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(5) + self.gguf_writer.add_head_count_kv( + 5 + ) # 
hparams["num_key_value_heads"]) is wrong self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) @@ -3137,10 +3878,11 @@ def shuffle_attn_output_weight(self, data_torch): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused new_name = self.map_tensor_name(name) + # shuffle for broadcasting of gqa in ggml_mul_mat if new_name.endswith("attn_q.weight"): data_torch = self.shuffle_attn_q_weight(data_torch) elif new_name.endswith("attn_output.weight"): @@ -3149,8 +3891,8 @@ def modify_tensors( return [(new_name, data_torch)] -@Model.register("CodeShellForCausalLM") -class CodeShellModel(Model): +@ModelBase.register("CodeShellForCausalLM") +class CodeShellModel(TextModel): model_arch = gguf.MODEL_ARCH.CODESHELL def set_gguf_parameters(self): @@ -3173,17 +3915,18 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) new_name = self.map_tensor_name(name) + # assuming token_embd.weight is seen before output.weight if not self._has_tok_embd and new_name == self.format_tensor_name( gguf.MODEL_TENSOR.OUTPUT ): - + # even though the tensor file(s) does not contain the word embeddings they are still in the weight map if self.tensor_names and "transformer.wte.weight" in self.tensor_names: logger.debug( f"{tok_embd_name} not found before {output_name}, assuming they are tied" @@ -3195,12 +3938,15 @@ def modify_tensors( return [(new_name, data_torch)] -@Model.register("InternLM2ForCausalLM") -class InternLM2Model(Model): +@ModelBase.register("InternLM2ForCausalLM") +class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 def set_vocab(self): - + # (TODO): Is there a better way? + # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character + # \x00 specially and convert it into an emoji character to prevent it from being mistakenly + # recognized as an empty string in C++. from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model @@ -3214,7 +3960,9 @@ def set_vocab(self): logger.error(f"Error: Missing {tokenizer_path}") sys.exit(1) - sentencepiece_model = model.ModelProto() + sentencepiece_model = ( + model.ModelProto() + ) # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix @@ -3228,7 +3976,8 @@ def set_vocab(self): text = piece.encode("utf-8") score = tokenizer.GetScore(token_id) if text == b"\x00": - + # (TODO): fixme + # Hack here and replace the \x00 characters. logger.warning(f"InternLM2 convert token '{text}' to 'πŸ‰'!") text = "πŸ‰".encode("utf-8") @@ -3241,7 +3990,7 @@ def set_vocab(self): toktype = SentencePieceTokenTypes.UNUSED elif tokenizer.IsByte(token_id): toktype = SentencePieceTokenTypes.BYTE - + # take care of ununsed raw token if piece.startswith("[UNUSED"): toktype = SentencePieceTokenTypes.UNUSED @@ -3318,7 +4067,9 @@ def set_vocab(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) old_eos = special_vocab.special_token_ids["eos"] if chat_eos_token_id is not None: - + # For the chat model, we replace the eos with '<|im_end|>'. 
+ # TODO: this is a hack, should be fixed + # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048 special_vocab.special_token_ids["eos"] = chat_eos_token_id logger.warning( f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" @@ -3337,15 +4088,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) self.gguf_writer.add_file_type(self.ftype) + rope_scaling = self.hparams.get("rope_scaling") or {} if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] + rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" + and "factor" in rope_scaling ): - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None @@ -3357,12 +4106,18 @@ def modify_tensors( head_dim = n_embd // num_heads num_groups = num_heads // q_per_kv + name = name.replace("language_model.", "") # InternVL + if name.startswith("mlp") or name.startswith("vision_model"): + # skip visual tensors + return [] + if bid is not None and f"model.layers.{bid}.attention.wqkv" in name: qkv = data_torch qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd)) q, k, v = qkv[:, :q_per_kv], qkv[:, -2], qkv[:, -1] + # The model weights of q and k equire additional reshape. q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) k = LlamaModel.permute( k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads @@ -3378,8 +4133,8 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("InternLM3ForCausalLM") -class InternLM3Model(Model): +@ModelBase.register("InternLM3ForCausalLM") +class InternLM3Model(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA def set_vocab(self): @@ -3410,7 +4165,7 @@ def set_vocab(self): token_id = int(token_id) token = token_data["content"] special_vocab._set_special_token(token, token_id) - + # update eos token if ( token == "<|im_end|>" and "eos" in special_vocab.special_token_ids @@ -3430,24 +4185,23 @@ def set_gguf_parameters(self): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) + rope_scaling = self.hparams.get("rope_scaling") or {} if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] + rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" + and "factor" in rope_scaling ): - if ( - self.hparams["rope_scaling"].get("type") == "linear" - or self.hparams["rope_scaling"].get("rope_type") == "linear" - ): - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") + name = name.replace("language_model.", "") # InternVL + if 
name.startswith("mlp") or name.startswith("vision_model"): + # skip visual tensors + return [] if name.endswith(("q_proj.weight", "q_proj.bias")): data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")): @@ -3455,8 +4209,8 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("BertModel", "BertForMaskedLM", "CamembertModel") -class BertModel(Model): +@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel") +class BertModel(TextModel): model_arch = gguf.MODEL_ARCH.BERT def __init__(self, *args, **kwargs): @@ -3466,36 +4220,18 @@ def __init__(self, *args, **kwargs): def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_causal_attention(False) - - pooling_path = None - module_path = self.dir_model / "modules.json" - if module_path.is_file(): - with open(module_path, encoding="utf-8") as f: - modules = json.load(f) - for mod in modules: - if mod["type"] == "sentence_transformers.models.Pooling": - pooling_path = mod["path"] - break - - if pooling_path is not None: - with open( - self.dir_model / pooling_path / "config.json", encoding="utf-8" - ) as f: - pooling = json.load(f) - if pooling["pooling_mode_mean_tokens"]: - pooling_type = gguf.PoolingType.MEAN - elif pooling["pooling_mode_cls_token"]: - pooling_type = gguf.PoolingType.CLS - else: - raise NotImplementedError("Only MEAN and CLS pooling types supported") - self.gguf_writer.add_pooling_type(pooling_type) + self._try_set_pooling_type() def set_vocab(self): tokens, toktypes, tokpre = self.get_vocab_base() self.vocab_size = len(tokens) + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + # convert to phantom space vocab def phantom(tok): if tok.startswith("[") and tok.endswith("]"): return tok @@ -3505,18 +4241,20 @@ def phantom(tok): tokens = list(map(phantom, tokens)) + # add vocab to gguf self.gguf_writer.add_tokenizer_model("bert") self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) + # handle special tokens special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused if name.startswith("bert."): name = name[5:] @@ -3527,12 +4265,13 @@ def modify_tensors( if name.endswith(".beta"): name = name[:-5] + ".bias" + # we are only using BERT for embeddings so we don't need the pooling layer if name in ( "embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias", ): - return [] + return [] # we don't need these if name.startswith("cls.predictions"): return [] @@ -3542,14 +4281,8 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] - -@Model.register("RobertaModel") -class RobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - + def _xlmroberta_tokenizer_init(self) -> None: + # we need the pad_token_id to know how to chop down position_embd matrix if (pad_token_id := self.hparams.get("pad_token_id")) is not None: self._position_offset = 1 + pad_token_id if "max_position_embeddings" in self.hparams: @@ -3557,79 +4290,9 @@ def __init__(self, 
*args, **kwargs): else: self._position_offset = None - def set_vocab(self): - """Support BPE tokenizers for roberta models""" - bpe_tok_path = self.dir_model / "tokenizer.json" - if bpe_tok_path.exists(): - self._set_vocab_gpt2() - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - - self.gguf_writer.add_token_type_count( - self.hparams.get("type_vocab_size", 1) - ) - - else: - return super().set_vocab() - - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: - - if name.startswith("roberta."): - name = name[8:] - - if name == "embeddings.position_embeddings.weight": - if self._position_offset is not None: - data_torch = data_torch[self._position_offset :, :] - - return super().modify_tensors(data_torch, name, bid) - - -@Model.register("NomicBertModel") -class NomicBertModel(BertModel): - model_arch = gguf.MODEL_ARCH.NOMIC_BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - self.hparams["n_ctx"] = 2048 - - assert self.hparams["activation_function"] == "swiglu" - - assert self.hparams["causal"] is False - - assert self.hparams["qkv_proj_bias"] is False - assert self.hparams["mlp_fc1_bias"] is False - assert self.hparams["mlp_fc2_bias"] is False - - assert self.hparams["prenorm"] is False - - assert self.hparams["rotary_emb_fraction"] == 1.0 - assert self.hparams["rotary_emb_interleaved"] is False - assert self.hparams["rotary_emb_scale_base"] is None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - - -@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") -class XLMRobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - if (pad_token_id := self.hparams.get("pad_token_id")) is not None: - self._position_offset = 1 + pad_token_id - if "max_position_embeddings" in self.hparams: - self.hparams["max_position_embeddings"] -= self._position_offset - else: - self._position_offset = None - - def set_vocab(self): - + def _xlmroberta_set_vocab(self) -> None: + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model @@ -3638,9 +4301,11 @@ def set_vocab(self): if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() + sentencepiece_model = ( + model.ModelProto() + ) # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - assert sentencepiece_model.trainer_spec.model_type == 1 + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix remove_whitespaces = ( @@ -3686,6 +4351,7 @@ def set_vocab(self): scores.append(-1000.0) toktypes.append(SentencePieceTokenTypes.UNUSED) + # realign tokens (see HF tokenizer code) tokens = [b"", b"", b"", b""] + tokens[3:-1] scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1] toktypes = [ @@ -3712,13 +4378,49 @@ def set_vocab(self): self.gguf_writer.add_add_bos_token(True) self.gguf_writer.add_add_eos_token(True) + +@ModelBase.register("RobertaModel") +class RobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + 
def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def set_vocab(self): + """Support BPE tokenizers for roberta models""" + bpe_tok_path = self.dir_model / "tokenizer.json" + if bpe_tok_path.exists(): + self._set_vocab_gpt2() + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count( + self.hparams.get("type_vocab_size", 1) + ) + + else: + return super().set_vocab() + def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # if name starts with "roberta.", remove the prefix + # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main if name.startswith("roberta."): name = name[8:] + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor if name == "embeddings.position_embeddings.weight": if self._position_offset is not None: data_torch = data_torch[self._position_offset :, :] @@ -3726,13 +4428,149 @@ def modify_tensors( return super().modify_tensors(data_torch, name, bid) -@Model.register("GemmaForCausalLM") -class GemmaModel(Model): +@ModelBase.register("NomicBertModel") +class NomicBertModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__( + self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any + ): + hparams = kwargs.pop("hparams", None) + if hparams is None: + hparams = ModelBase.load_hparams(dir_model) + + self.is_moe = bool(hparams.get("moe_every_n_layers")) + self.model_arch = ( + gguf.MODEL_ARCH.NOMIC_BERT_MOE + if self.is_moe + else gguf.MODEL_ARCH.NOMIC_BERT + ) + + super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) + + self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta() + if self._tokenizer_is_xlmroberta: + self._xlmroberta_tokenizer_init() + + npos, mtp = self.hparams["n_positions"], self.hparams.get( + "max_trained_positions", 2048 + ) + if npos == 8192 and mtp == 2048: + self.hparams["n_positions"] = ( + 2048 # nomic-embed-text v1 and v1.5 are trained for 2048 tokens. + ) + elif npos == 2048 and mtp == 2048: + self.hparams["n_positions"] = ( + 512 # nomic-embed-text-v2-moe is trained for 512 tokens. 
+ ) + else: + raise ValueError( + f"unrecognized parameters: n_positions={npos}, max_trained_positions={mtp}" + ) + + assert ( + self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu" + ) + + # this doesn't do anything in the HF version + assert self.hparams["causal"] is False + # no bias tensors unless MoE + assert self.hparams["qkv_proj_bias"] == self.is_moe + assert self.hparams["mlp_fc1_bias"] == self.is_moe + assert self.hparams["mlp_fc2_bias"] == self.is_moe + + # norm at end of layer + assert self.hparams["prenorm"] is False + # standard RoPE + assert self.hparams["rotary_emb_fraction"] == 1.0 + assert self.hparams["rotary_emb_interleaved"] is False + assert self.hparams["rotary_emb_scale_base"] is None + + def set_vocab(self) -> None: + if self._tokenizer_is_xlmroberta: + return self._xlmroberta_set_vocab() + return super().set_vocab() + + def modify_tensors( + self, data_torch: torch.Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, torch.Tensor]]: + # If the tensor is an experts bias tensor, skip it by returning an empty list. + if "mlp.experts.bias" in name: + return [] # Explicitly return an empty list. + + if "mlp.experts.mlp.w1" in name: + data_torch = data_torch.view( + self.hparams["num_experts"], + self.hparams["n_inner"], + self.hparams["n_embd"], + ) + name += ".weight" + + if "mlp.experts.mlp.w2" in name: + data_torch = data_torch.view( + self.hparams["num_experts"], + self.hparams["n_inner"], + self.hparams["n_embd"], + ) + data_torch = data_torch.transpose(1, 2) + name += ".weight" + + return [(self.map_tensor_name(name), data_torch)] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + if self.is_moe: + self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"]) + self.gguf_writer.add_expert_count(self.hparams["num_experts"]) + self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"]) + + def _is_tokenizer_xlmroberta(self) -> bool: + with open(self.dir_model / "tokenizer.json") as f: + tokenizer_json = json.load(f) + toktyp = tokenizer_json["model"]["type"] + if toktyp == "Unigram": + return True + if toktyp == "WordPiece": + return False + raise ValueError(f"unknown tokenizer: {toktyp}") + + +@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") +class XLMRobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._xlmroberta_tokenizer_init() + + def set_vocab(self): + self._xlmroberta_set_vocab() + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + # if name starts with "roberta.", remove the prefix + # e.g. 
https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset :, :] + + return super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("GemmaForCausalLM") +class GemmaModel(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA def set_vocab(self): self._set_vocab_sentencepiece() + # TODO: these special tokens should be exported only for the CodeGemma family special_vocab = gguf.SpecialVocab( self.dir_model, load_merges=False, @@ -3743,7 +4581,7 @@ def set_vocab(self): special_vocab._set_special_token("middle", 68) special_vocab._set_special_token("fsep", 70) special_vocab._set_special_token("eot", 107) - special_vocab.chat_template = None + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) self.gguf_writer.add_add_space_prefix(False) @@ -3770,22 +4608,25 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": logger.debug( f"Skipping get tensor {name!r} in safetensors so that convert can end normally." ) return [] + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 if name.endswith("norm.weight"): data_torch = data_torch + 1 return [(self.map_tensor_name(name), data_torch)] -@Model.register("Gemma2ForCausalLM") -class Gemma2Model(Model): +@ModelBase.register("Gemma2ForCausalLM") +class Gemma2Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA2 def set_vocab(self): @@ -3822,42 +4663,26 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": logger.debug( f"Skipping get tensor {name!r} in safetensors so that convert can end normally." 
) return [] + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 if name.endswith("norm.weight"): data_torch = data_torch + 1 return [(self.map_tensor_name(name), data_torch)] -@Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") -class Gemma3Model(Model): +@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") +class Gemma3Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA3 - has_vision: bool = False - - def __init__(self, *args, **kwargs): - hparams = Model.load_hparams(kwargs["dir_model"]) - if "text_config" in hparams: - hparams = {**hparams, **hparams["text_config"]} - kwargs["hparams"] = hparams - super().__init__(*args, **kwargs) - if "vision_config" in hparams: - logger.info("Has vision encoder, but it will be ignored") - self.has_vision = True - - def write(self): - super().write() - if self.has_vision: - logger.info("NOTE: this script only convert the language model to GGUF") - logger.info( - " for the vision model, please use gemma3_convert_encoder_to_gguf.py" - ) def set_vocab(self): self._set_vocab_sentencepiece() @@ -3868,6 +4693,7 @@ def set_gguf_parameters(self): hparams = self.hparams block_count = hparams["num_hidden_layers"] + # some default values are not specified in the hparams self.gguf_writer.add_context_length( hparams.get("max_position_embeddings", 131072) ) @@ -3879,52 +4705,117 @@ def set_gguf_parameters(self): self.gguf_writer.add_key_length(hparams.get("head_dim", 256)) self.gguf_writer.add_value_length(hparams.get("head_dim", 256)) self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) - + self.gguf_writer.add_rope_freq_base( + hparams.get("rope_theta", 1_000_000.0) + ) # for global layers + # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3 assert hparams.get("attn_logit_softcapping") is None assert hparams.get("final_logit_softcapping") is None self.gguf_writer.add_sliding_window(hparams["sliding_window"]) self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) if hparams.get("rope_scaling") is not None: assert hparams["rope_scaling"]["rope_type"] == "linear" - + # important: this rope_scaling is only applied for global layers, and not used by 1B model self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused if name.startswith("language_model."): name = name.replace("language_model.", "") + elif ( name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") or name.startswith("multimodal_projector.") or name.startswith("vision_model.") ): + return [] # skip vision tensors - return [] - + # remove OOV (out-of-vocabulary) rows in token_embd if "embed_tokens.weight" in name: vocab = self._create_vocab_sentencepiece() tokens = vocab[0] data_torch = data_torch[: len(tokens)] + # ref code in Gemma3RMSNorm + # output = output * (1.0 + self.weight.float()) if name.endswith("norm.weight"): data_torch = data_torch + 1 return [(self.map_tensor_name(name), data_torch)] -@Model.register("Starcoder2ForCausalLM") -class StarCoder2Model(Model): +@ModelBase.register("Gemma3ForConditionalGeneration") +class Gemma3VisionModel(VisionModel): + def set_gguf_parameters(self): + 
super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3) + # default values below are taken from HF tranformers code + self.gguf_writer.add_vision_attention_layernorm_eps( + hparams.get("layer_norm_eps", 1e-6) + ) + self.gguf_writer.add_vision_use_gelu(True) + # calculate proj_scale_factor (used by tinygemma3 test model) + image_seq_length = self.preprocessor_config.get("image_seq_length", 256) + n_per_side = int(image_seq_length**0.5) + image_size = self.hparams["image_size"] + patch_size = self.hparams["patch_size"] + proj_scale_factor = (image_size // patch_size) // n_per_side + if proj_scale_factor > 0 and proj_scale_factor != 4: + # we only need to write this if it's not the default value + # in this case, we are converting a test model + self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, new_name, n_dims # unused + # related to https://github.com/ggml-org/llama.cpp/issues/13025 + if "input_projection" in name: + return gguf.GGMLQuantizationType.F16 + if ".embeddings." in name: + return gguf.GGMLQuantizationType.F32 + return False + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if "vision_model.head." in name: + return [] # skip redundant tensors for tinygemma3 + + if ( + name.startswith("multi_modal_projector.") + or name.startswith("vision_tower.") + or name.startswith("multimodal_projector.") + or name.startswith("vision_model.") + ): + # process vision tensors + name = name.replace("_weight", ".weight") + + # correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector + # the other norm values are part of SigLIP model, and they are already correct + # ref code: Gemma3RMSNorm + if "soft_emb_norm.weight" in name: + logger.info(f"Correcting norm value for '{name}'") + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + return [] # skip other tensors + + +@ModelBase.register("Starcoder2ForCausalLM") +class StarCoder2Model(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER2 -@Model.register("Rwkv6ForCausalLM") -class Rwkv6Model(Model): +@ModelBase.register("Rwkv6ForCausalLM") +class Rwkv6Model(TextModel): model_arch = gguf.MODEL_ARCH.RWKV6 def set_vocab(self): @@ -3944,6 +4835,7 @@ def set_gguf_parameters(self): time_mix_extra_dim = 64 if hidden_size == 4096 else 32 time_decay_extra_dim = 128 if hidden_size == 4096 else 64 + # RWKV isn't context limited self.gguf_writer.add_context_length(1048576) self.gguf_writer.add_embedding_length(hidden_size) self.gguf_writer.add_block_count(block_count) @@ -3955,6 +4847,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(intermediate_size) self.gguf_writer.add_file_type(self.ftype) + # required by llama.cpp, unused self.gguf_writer.add_head_count(0) lerp_weights: dict[int, dict[str, Tensor]] = {} @@ -3992,6 +4885,8 @@ def modify_tensors( except KeyError: pass + # concat time_mix_lerp weights to reduce some cpu overhead + # also reduces the number of tensors in the model if ( bid is not None and "time_mix_lerp" in new_name @@ -4021,7 +4916,7 @@ def modify_tensors( yield (new_name, data_torch) -@Model.register("RWKV6Qwen2ForCausalLM") +@ModelBase.register("RWKV6Qwen2ForCausalLM") class RWKV6Qwen2Model(Rwkv6Model): model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 @@ -4039,9 +4934,14 @@ def 
set_gguf_parameters(self): head_size = hidden_size // num_attention_heads rms_norm_eps = self.hparams["rms_norm_eps"] intermediate_size = self.hparams["intermediate_size"] - time_mix_extra_dim = 64 if hidden_size >= 4096 else 32 - time_decay_extra_dim = 128 if hidden_size >= 4096 else 64 + time_mix_extra_dim = self.hparams.get( + "lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32 + ) + time_decay_extra_dim = self.hparams.get( + "lora_rank_decay", 128 if hidden_size >= 4096 else 64 + ) + # RWKV isn't context limited self.gguf_writer.add_context_length(1048576) self.gguf_writer.add_embedding_length(hidden_size) self.gguf_writer.add_block_count(block_count) @@ -4051,11 +4951,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(intermediate_size) self.gguf_writer.add_file_type(self.ftype) + # special parameters for time_mixing in RWKV6QWEN2 self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) self.gguf_writer.add_token_shift_count(1) - + # RWKV6QWEN2 use grouped key/value like GQA self.gguf_writer.add_head_count_kv(num_key_value_heads) + # required by llama.cpp, unused self.gguf_writer.add_head_count(0) def modify_tensors( @@ -4064,7 +4966,8 @@ def modify_tensors( for new_name, data in super().modify_tensors(data_torch, name, bid): if "time_mix_w1" in new_name or "time_mix_w2" in new_name: data = data.view(5, -1, data.shape[-1]) - + # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg + # permute them here to avoid code changes data = torch.stack( [data[3], data[1], data[2], data[0], data[4]], dim=0 ).view(-1, data.shape[-1]) @@ -4075,8 +4978,8 @@ def modify_tensors( yield (new_name, data) -@Model.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") -class Rwkv7Model(Model): +@ModelBase.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") +class Rwkv7Model(TextModel): model_arch = gguf.MODEL_ARCH.RWKV7 def set_vocab(self): @@ -4100,6 +5003,7 @@ def set_gguf_parameters(self): else (hidden_size * 4) ) + # ICLR: In-Context-Learning-Rate try: lora_rank_decay = ( self.hparams["lora_rank_decay"] @@ -4143,6 +5047,7 @@ def set_gguf_parameters(self): else self.calc_lora_rank(hidden_size, 0.8, 0.6) ) + # RWKV isn't context limited self.gguf_writer.add_context_length(1048576) self.gguf_writer.add_embedding_length(hidden_size) self.gguf_writer.add_block_count(block_count) @@ -4155,6 +5060,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(intermediate_size) self.gguf_writer.add_file_type(self.ftype) + # required by llama.cpp, unused self.gguf_writer.add_head_count(0) lerp_weights: dict[int, dict[str, Tensor]] = {} @@ -4163,11 +5069,11 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # unify tensor names here to make life easier name = name.replace("blocks", "layers").replace("ffn", "feed_forward") name = name.replace("self_attn", "attention").replace("attn", "attention") name = name.replace("time_mixer.", "") - + # lora layer names in fla-hub's impl if "_lora.lora" in name: self.lora_needs_transpose = False name = name.replace("_lora.lora.0.weight", "1.weight") @@ -4182,7 +5088,8 @@ def modify_tensors( and "value" not in self.map_tensor_name(name) and bid == 0 ): - + # some models have dummy v0/v1/v2 on first layer while others don't + # ignore them all since they are not used return wkv_has_gate = self.hparams.get("wkv_has_gate", True) @@ -4194,7 +5101,7 @@ def modify_tensors( if bid is not None and "attention.x_" in name: if 
"attention.x_x" in name: - + # already concatenated new_name = f"blk.{bid}.time_mix_lerp_fused.weight" data = data_torch.reshape(len(lerp_list), 1, 1, -1) yield (new_name, data) @@ -4246,13 +5153,14 @@ def modify_tensors( data_torch = data_torch.flatten() if bid == 0 and "time_mix_a" in new_name: - + # dummy v0/v1/v2 on first layer + # easist way to make llama happy yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch) yield (new_name, data_torch) -@Model.register("RwkvHybridForCausalLM") +@ModelBase.register("RwkvHybridForCausalLM") class ARwkv7Model(Rwkv7Model): model_arch = gguf.MODEL_ARCH.ARWKV7 @@ -4271,11 +5179,13 @@ def set_gguf_parameters(self): wkv_has_gate = self.hparams["wkv_has_gate"] assert self.hparams["wkv_version"] == 7 + # ICLR: In-Context-Learning-Rate lora_rank_decay = 64 lora_rank_iclr = 64 lora_rank_value_residual_mix = 32 lora_rank_gate = 128 if wkv_has_gate else 0 + # RWKV isn't context limited self.gguf_writer.add_context_length(1048576) self.gguf_writer.add_embedding_length(hidden_size) self.gguf_writer.add_block_count(block_count) @@ -4289,18 +5199,20 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_token_shift_count(1) + # required by llama.cpp, unused self.gguf_writer.add_head_count(0) -@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") -class MambaModel(Model): +@ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") +class MambaModel(TextModel): model_arch = gguf.MODEL_ARCH.MAMBA def set_vocab(self): vocab_size = self.hparams["vocab_size"] - + # Round vocab size to next multiple of 8 pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) - + # pad using ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 vocab_size = -(vocab_size // -pad_vocab) * pad_vocab self.hparams["vocab_size"] = vocab_size @@ -4309,7 +5221,7 @@ def set_vocab(self): elif (self.dir_model / "tokenizer.model").is_file(): self._set_vocab_sentencepiece() else: - + # Use the GPT-NeoX tokenizer when no tokenizer files are present self._set_vocab_builtin("gpt-neox", vocab_size) def set_gguf_parameters(self): @@ -4320,7 +5232,9 @@ def set_gguf_parameters(self): or 2 * d_model ) d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 - + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -( d_model // -16 ) @@ -4329,23 +5243,31 @@ def set_gguf_parameters(self): or 1e-5 ) use_dt_b_c_norm = False - + # For falconmamba we do apply RMS norm on B / DT and C layers if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",): use_dt_b_c_norm = True - + # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model - self.gguf_writer.add_context_length(2**20) + self.gguf_writer.add_context_length( + 2**20 + ) # arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length(0) - self.gguf_writer.add_head_count(0) + self.gguf_writer.add_feed_forward_length( + 0 + ) # unused, but seemingly required when loading + self.gguf_writer.add_head_count( + 0 + ) # unused, but seemingly required when loading self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_ssm_conv_kernel(d_conv) 
self.gguf_writer.add_ssm_inner_size(d_inner) self.gguf_writer.add_ssm_state_size(d_state) self.gguf_writer.add_ssm_time_step_rank(dt_rank) self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) + self.gguf_writer.add_ssm_dt_b_c_rms( + use_dt_b_c_norm + ) # For classic Mamba we don't apply rms norm on B / DT layers self.gguf_writer.add_file_type(self.ftype) _tok_embd = None @@ -4353,8 +5275,6 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid - output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) @@ -4364,6 +5284,11 @@ def modify_tensors( logger.debug("A_log --> A ==> " + new_name) data_torch = -torch.exp(data_torch) + # [4 1 8192 1] -> [4 8192 1 1] + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid): + data_torch = data_torch.squeeze() + + # assuming token_embd.weight is seen before output.weight if self._tok_embd is not None and new_name == output_name: if torch.equal(self._tok_embd, data_torch): logger.debug( @@ -4376,13 +5301,16 @@ def modify_tensors( return [(new_name, data_torch)] -@Model.register("CohereForCausalLM") -class CommandR2Model(Model): +@ModelBase.register("CohereForCausalLM") +class CommandR2Model(TextModel): model_arch = gguf.MODEL_ARCH.COMMAND_R def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # max_position_embeddings = 8192 in config.json but model was actually + # trained on 128k context length + # aya-23 models don't have model_max_length specified self.hparams["max_position_embeddings"] = self.find_hparam( ["model_max_length", "max_position_embeddings"] ) @@ -4393,8 +5321,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) -@Model.register("Cohere2ForCausalLM") -class Cohere2Model(Model): +@ModelBase.register("Cohere2ForCausalLM") +class Cohere2Model(TextModel): model_arch = gguf.MODEL_ARCH.COHERE2 def set_gguf_parameters(self): @@ -4413,9 +5341,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) -@Model.register("OlmoForCausalLM") -@Model.register("OLMoForCausalLM") -class OlmoModel(Model): +@ModelBase.register("OlmoForCausalLM") +@ModelBase.register("OLMoForCausalLM") +class OlmoModel(TextModel): model_arch = gguf.MODEL_ARCH.OLMO def set_gguf_parameters(self): @@ -4425,10 +5353,12 @@ def set_gguf_parameters(self): if clip_qkv is not None: self.gguf_writer.add_clamp_kqv(clip_qkv) + # Same as super class, but permuting q_proj, k_proj + # Copied from: LlamaModel def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") @@ -4441,13 +5371,13 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("Olmo2ForCausalLM") -class Olmo2Model(Model): +@ModelBase.register("Olmo2ForCausalLM") +class Olmo2Model(TextModel): model_arch = gguf.MODEL_ARCH.OLMO2 -@Model.register("OlmoeForCausalLM") -class OlmoeModel(Model): +@ModelBase.register("OlmoeForCausalLM") +class OlmoeModel(TextModel): model_arch = gguf.MODEL_ARCH.OLMOE def set_gguf_parameters(self): @@ -4458,10 +5388,11 @@ def set_gguf_parameters(self): _experts: list[dict[str, Tensor]] | None = None + # Copied from: Qwen2MoeModel def 
modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # process the experts separately if name.find("experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None @@ -4474,6 +5405,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -4495,17 +5427,18 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] + # Copied from: Qwen2MoeModel def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("JinaBertModel", "JinaBertForMaskedLM") +@ModelBase.register("JinaBertModel", "JinaBertForMaskedLM") class JinaBertV2Model(BertModel): model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 @@ -4548,22 +5481,23 @@ def set_vocab(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # if name starts with "bert.", remove the prefix + # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en if name.startswith("bert."): name = name[5:] return super().modify_tensors(data_torch, name, bid) -@Model.register("OpenELMForCausalLM") -class OpenELMModel(Model): +@ModelBase.register("OpenELMForCausalLM") +class OpenELMModel(TextModel): model_arch = gguf.MODEL_ARCH.OPENELM @staticmethod def _make_divisible(v: float | int, divisor: int) -> int: - + # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38 new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) - + # Make sure that round down does not go down by more than 10%. if new_v < 0.9 * v: new_v += divisor return new_v @@ -4587,6 +5521,7 @@ def __init__(self, *args, **kwargs): self._num_query_heads[0], int ) + # Uses the tokenizer from meta-llama/Llama-2-7b-hf def set_vocab(self): try: self._set_vocab_sentencepiece() @@ -4608,7 +5543,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count(self._num_query_heads) self.gguf_writer.add_head_count_kv(self._num_kv_heads) self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"]) - + # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30 self.gguf_writer.add_layer_norm_rms_eps(1e-6) self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim)) self.gguf_writer.add_key_length(head_dim) @@ -4625,6 +5560,7 @@ def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: + # split ff if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight": ff_dim = self._ffn_dims[bid] yield ( @@ -4640,12 +5576,14 @@ def modify_tensors( yield (self.map_tensor_name(name), data_torch) -@Model.register("ArcticForCausalLM") -class ArcticModel(Model): +@ModelBase.register("ArcticForCausalLM") +class ArcticModel(TextModel): model_arch = gguf.MODEL_ARCH.ARCTIC def set_vocab(self): - + # The reason for using a custom implementation here is that the + # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from + # tokenizer.model and used them as BOS and EOS instead of adding new tokens. 
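# --- editorial note -------------------------------------------------------
# Toy sketch (not part of the patch) of the idea the comment above describes:
# entries from added_tokens_decoder whose ids already exist in the SentencePiece
# vocab overwrite those slots in place instead of growing the vocab. The token
# strings below are hypothetical placeholders, not the real Arctic contents.
vocab = {31998: "<unused_0>", 31999: "<unused_1>"}               # existing slots
added_tokens_decoder = {31998: "<new_bos>", 31999: "<new_eos>"}  # redefinitions

for token_id, content in added_tokens_decoder.items():
    assert token_id in vocab     # ids stay inside the original vocab range
    vocab[token_id] = content    # redefine in place; vocab size is unchanged

assert len(vocab) == 2
# --------------------------------------------------------------------------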
from sentencepiece import SentencePieceProcessor tokenizer_path = self.dir_model / "tokenizer.model" @@ -4654,6 +5592,7 @@ def set_vocab(self): logger.error(f"Error: Missing {tokenizer_path}") sys.exit(1) + # Read the whole vocabulary from the tokenizer.model file tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) @@ -4683,6 +5622,8 @@ def set_vocab(self): scores[token_id] = score toktypes[token_id] = toktype + # Use the added_tokens_decoder field from tokeniser_config.json as the source + # of information about added/redefined tokens and modify them accordingly. tokenizer_config_file = self.dir_model / "tokenizer_config.json" if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: @@ -4702,6 +5643,8 @@ def set_vocab(self): token_type = SentencePieceTokenTypes.USER_DEFINED token_score = -10000.0 + # Map unk_token to UNKNOWN, other special tokens to CONTROL + # Set the score to 0.0 as in the original tokenizer.model if ("special" in token_json) and token_json["special"]: if token_content == tokenizer_config_json["unk_token"]: token_type = SentencePieceTokenTypes.UNKNOWN @@ -4746,6 +5689,7 @@ def modify_tensors( if name.endswith("k_proj.weight"): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + # process the experts separately if name.find("block_sparse_moe.experts") != -1: n_experts = self.hparams["num_local_experts"] @@ -4759,6 +5703,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for wid in ["w1", "w2", "w3"]: datas: list[Tensor] = [] @@ -4784,14 +5729,14 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeepseekForCausalLM") -class DeepseekModel(Model): +@ModelBase.register("DeepseekForCausalLM") +class DeepseekModel(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK def set_vocab(self): @@ -4844,6 +5789,7 @@ def modify_tensors( if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head) + # process the experts separately if name.find("mlp.experts") != -1: n_experts = self.hparams["n_routed_experts"] assert bid is not None @@ -4856,6 +5802,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -4881,21 +5828,25 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeepseekV2ForCausalLM") -@Model.register("DeepseekV3ForCausalLM") -class DeepseekV2Model(Model): +@ModelBase.register("DeepseekV2ForCausalLM") +@ModelBase.register("DeepseekV3ForCausalLM") +class DeepseekV2Model(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 def set_vocab(self): self._set_vocab_gpt2() def set_gguf_parameters(self): + + # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group) + self.hparams["num_key_value_heads"] = 1 + super().set_gguf_parameters() hparams = self.hparams @@ -4904,10 +5855,17 @@ def 
set_gguf_parameters(self): if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + + # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA self.gguf_writer.add_key_length( + hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"] + ) + self.gguf_writer.add_value_length(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length_mla( hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"] ) - self.gguf_writer.add_value_length(hparams["v_head_dim"]) + self.gguf_writer.add_value_length_mla(hparams["v_head_dim"]) + self.gguf_writer.add_expert_feed_forward_length( hparams["moe_intermediate_size"] ) @@ -4927,36 +5885,36 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + rope_scaling = self.hparams.get("rope_scaling") or {} if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] + rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" + and "factor" in rope_scaling ): - if self.hparams["rope_scaling"].get("type") == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) - self.gguf_writer.add_rope_scaling_orig_ctx_len( - self.hparams["rope_scaling"]["original_max_position_embeddings"] - ) - self.gguf_writer.add_rope_scaling_yarn_log_mul( - 0.1 * hparams["rope_scaling"]["mscale_all_dim"] - ) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len( + rope_scaling["original_max_position_embeddings"] + ) + self.gguf_writer.add_rope_scaling_yarn_log_mul( + 0.1 * rope_scaling["mscale_all_dim"] + ) _experts: list[dict[str, Tensor]] | None = None def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # rename e_score_correction_bias tensors if name.endswith("e_score_correction_bias"): name = name.replace("e_score_correction_bias", "e_score_correction.bias") + # skip Multi-Token Prediction (MTP) layers block_count = self.hparams["num_hidden_layers"] match = re.match(r"model.layers.(\d+)", name) if match and int(match.group(1)) >= block_count: return [] + # process the experts separately if name.find("mlp.experts") != -1: n_experts = self.hparams["n_routed_experts"] assert bid is not None @@ -4969,6 +5927,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -4988,23 +5947,72 @@ def modify_tensors( else: return [] + # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed + if name.endswith("kv_b_proj.weight"): + name_kb = name.replace("kv_b_proj", "k_b_proj") + name_vb = name.replace("kv_b_proj", "v_b_proj") + + n_head_kv = self.hparams["num_key_value_heads"] + v_head_dim = self.hparams["v_head_dim"] + qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + + assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) + + kv_b = data_torch.view( + n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1] + ) + k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) + k_b = k_b.transpose(1, 2) + + return [ 
+ (self.map_tensor_name(name_kb), k_b), + (self.map_tensor_name(name_vb), v_b), + ] + return [(self.map_tensor_name(name), data_torch)] def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("T5WithLMHeadModel") -@Model.register("T5ForConditionalGeneration") -@Model.register("MT5ForConditionalGeneration") -@Model.register("UMT5ForConditionalGeneration") -class T5Model(Model): +@ModelBase.register("PLMForCausalLM") +class PLMModel(TextModel): + model_arch = gguf.MODEL_ARCH.PLM + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length( + hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"] + ) + self.gguf_writer.add_value_length(hparams["v_head_dim"]) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + +@ModelBase.register("T5WithLMHeadModel") +@ModelBase.register("T5ForConditionalGeneration") +@ModelBase.register("MT5ForConditionalGeneration") +@ModelBase.register("UMT5ForConditionalGeneration") +class T5Model(TextModel): model_arch = gguf.MODEL_ARCH.T5 def __init__(self, *args, **kwargs): @@ -5012,28 +6020,33 @@ def __init__(self, *args, **kwargs): self.shared_token_embeddings_found = False def set_vocab(self): - + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model tokenizer_path = self.dir_model / "tokenizer.model" + # many older models use spiece.model tokenizer model filename if not tokenizer_path.is_file(): tokenizer_path = self.dir_model / "spiece.model" if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() + sentencepiece_model = ( + model.ModelProto() + ) # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - if sentencepiece_model.trainer_spec.model_type == 2: - + # some models like Pile-T5 family use BPE tokenizer instead of Unigram + if sentencepiece_model.trainer_spec.model_type == 2: # BPE + # assure the tokenizer model file name is correct assert tokenizer_path.name == "tokenizer.model" return self._set_vocab_sentencepiece() else: - assert sentencepiece_model.trainer_spec.model_type == 1 + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix remove_whitespaces = ( @@ -5137,8 +6150,12 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or 
"shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. if name in [ "decoder.embed_tokens.weight", "encoder.embed_tokens.weight", @@ -5156,8 +6173,8 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("T5EncoderModel") -class T5EncoderModel(Model): +@ModelBase.register("T5EncoderModel") +class T5EncoderModel(TextModel): model_arch = gguf.MODEL_ARCH.T5ENCODER def __init__(self, *args, **kwargs): @@ -5165,28 +6182,33 @@ def __init__(self, *args, **kwargs): self.shared_token_embeddings_found = False def set_vocab(self): - + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model tokenizer_path = self.dir_model / "tokenizer.model" + # many older models use spiece.model tokenizer model filename if not tokenizer_path.is_file(): tokenizer_path = self.dir_model / "spiece.model" if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() + sentencepiece_model = ( + model.ModelProto() + ) # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - if sentencepiece_model.trainer_spec.model_type == 2: - + # some models like Pile-T5 family use BPE tokenizer instead of Unigram + if sentencepiece_model.trainer_spec.model_type == 2: # BPE + # assure the tokenizer model file name is correct assert tokenizer_path.name == "tokenizer.model" return self._set_vocab_sentencepiece() else: - assert sentencepiece_model.trainer_spec.model_type == 1 + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix remove_whitespaces = ( @@ -5287,8 +6309,12 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. 
if name in [ "decoder.embed_tokens.weight", "encoder.embed_tokens.weight", @@ -5306,17 +6332,19 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("JAISLMHeadModel") -class JaisModel(Model): +@ModelBase.register("JAISLMHeadModel") +class JaisModel(TextModel): model_arch = gguf.MODEL_ARCH.JAIS def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # SwigLU activation assert self.hparams["activation_function"] == "swiglu" - + # ALiBi position embedding assert self.hparams["position_embedding_type"] == "alibi" + # Embeddings scale self.embeddings_scale = 1.0 if "mup_embeddings_scale" in self.hparams: self.embeddings_scale = self.hparams["mup_embeddings_scale"] @@ -5353,15 +6381,19 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused tensors: list[tuple[str, Tensor]] = [] + # we don't need these if name.endswith((".attn.bias")): return tensors if name.endswith(("relative_pe.slopes")): - + # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) + # Some other models has max_alibi_bias spelled out explicitly in the hyperparams, + # but Jais's PyTorch model simply precalculates the slope values and places them + # in relative_pes.slopes n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) first_val = float(data_torch[0].item()) self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) @@ -5389,8 +6421,55 @@ def prepare_tensors(self): self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) -@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") -class ChatGLMModel(Model): +@ModelBase.register("Glm4ForCausalLM") +class Glm4Model(TextModel): + model_arch = gguf.MODEL_ARCH.GLM4 + + def set_vocab(self): + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained( + self.dir_model, trust_remote_code=True + ) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab._set_special_token( + "eos", tokenizer.get_added_vocab()["<|endoftext|>"] + ) + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) + special_vocab._set_special_token( + "unk", tokenizer.get_added_vocab()["<|endoftext|>"] + ) + special_vocab._set_special_token( + "bos", tokenizer.get_added_vocab()["<|endoftext|>"] + ) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + rope_dim = self.hparams["head_dim"] + self.gguf_writer.add_rope_dimension_count( + int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)) + ) + rope_scaling = self.hparams.get("rope_scaling") or {} + if ( + rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" + and "factor" in rope_scaling + ): + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len( + rope_scaling["original_max_position_embeddings"] + ) + + +@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") +class ChatGLMModel(TextModel): model_arch 
= gguf.MODEL_ARCH.CHATGLM def set_vocab_chatglm3(self): @@ -5429,7 +6508,8 @@ def set_vocab_chatglm3(self): text = piece.encode("utf-8") score = 0.0 - + # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py), + # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size() if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): score = tokenizer.tokenizer.sp_model.get_score(token_id) @@ -5461,7 +6541,8 @@ def set_vocab_chatglm3(self): toktypes.append(toktype) self.gguf_writer.add_tokenizer_model("llama") - + # glm3 needs prefix and suffix formatted as: + # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>" self.gguf_writer.add_tokenizer_pre("chatglm-spm") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) @@ -5522,12 +6603,12 @@ def set_vocab(self): self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - + # only add special tokens when they were not already loaded from config.json special_vocab._set_special_token( "eos", tokenizer.get_added_vocab()["<|endoftext|>"] ) special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) - + # this one is usually not in config.json anyway special_vocab._set_special_token( "unk", tokenizer.get_added_vocab()["<|endoftext|>"] ) @@ -5573,7 +6654,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith( "model.vision." @@ -5584,8 +6665,8 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("NemotronForCausalLM") -class NemotronModel(Model): +@ModelBase.register("NemotronForCausalLM") +class NemotronModel(TextModel): model_arch = gguf.MODEL_ARCH.NEMOTRON def set_vocab(self): @@ -5603,6 +6684,7 @@ def set_gguf_parameters(self): ) self.gguf_writer.add_layer_norm_eps(f_norm_eps) + # * Partial RoPE rot_pct = self.find_hparam( ["partial_rotary_factor", "rope_pct", "rope_percent"] ) @@ -5610,6 +6692,7 @@ def set_gguf_parameters(self): n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + # * RopeScaling for Nemotron if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) else: @@ -5619,15 +6702,18 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side + # model.layers.{l}.input_layernorm.weight + # model.layers.{l}.post_attention_layernorm.weight + # model.norm.weight if name.endswith("norm.weight"): data_torch = data_torch + 1 return [(self.map_tensor_name(name), data_torch)] -@Model.register("ExaoneForCausalLM") -class ExaoneModel(Model): +@ModelBase.register("ExaoneForCausalLM") +class ExaoneModel(TextModel): model_arch = gguf.MODEL_ARCH.EXAONE def set_gguf_parameters(self): @@ -5646,7 +6732,10 @@ def set_gguf_parameters(self): else 4 * embed_dim ) num_layers = hparams["num_layers"] - + # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0 + # attention_dropout_rate = hparams["attention_dropout"] + # 
ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0 + # embed_dropout_rate = hparams["embed_dropout"] self.gguf_writer.add_embedding_length(embed_dim) self.gguf_writer.add_head_count(num_heads) self.gguf_writer.add_head_count_kv(num_kv_heads) @@ -5668,15 +6757,13 @@ def set_gguf_parameters(self): * (hparams["hidden_size"] // hparams["num_attention_heads"]) ) ) + rope_scaling = self.hparams.get("rope_scaling") or {} if ( - hparams.get("rope_scaling") is not None - and "factor" in hparams["rope_scaling"] + rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" + and "factor" in rope_scaling ): - if hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): @@ -5720,7 +6807,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: ) -@Model.register("GraniteForCausalLM") +@ModelBase.register("GraniteForCausalLM") class GraniteModel(LlamaModel): """Conversion for IBM's GraniteForCausalLM""" @@ -5739,7 +6826,8 @@ def set_gguf_parameters(self): if head_dim := self.hparams.pop("head_dim", None): logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim) super().set_gguf_parameters() - + # NOTE: Convert _multiplier params to _scale params for naming + # consistency if attention_scale := self.hparams.get("attention_multiplier"): self.gguf_writer.add_attention_scale(attention_scale) logger.info("gguf: (granite) attention_scale = %s", attention_scale) @@ -5754,12 +6842,26 @@ def set_gguf_parameters(self): logger.info("gguf: (granite) logits_scale = %s", logits_scale) -@Model.register("GraniteMoeForCausalLM") +@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM") class GraniteMoeModel(GraniteModel): """Conversion for IBM's GraniteMoeForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE_MOE + def set_gguf_parameters(self): + """GraniteMoeShared uses GraniteMoe parameters plus the following: + - shared_intermediate_size + """ + super().set_gguf_parameters() + if shared_feed_forward_length := self.hparams.get("shared_intermediate_size"): + self.gguf_writer.add_expert_shared_feed_forward_length( + shared_feed_forward_length + ) + logger.info( + "gguf: (granitemoeshared) shared_feed_forward_length = %s", + shared_feed_forward_length, + ) + def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: @@ -5774,18 +6876,160 @@ def modify_tensors( assert ( data_torch.shape[-2] == 2 * ffn_dim ), "Merged FFN tensor size must be 2 * intermediate_size" - gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :] + gate, up = data_torch.split(ffn_dim, dim=-2) return [ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate), (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up), ] + if name.endswith("shared_mlp.input_linear.weight"): + ffn_dim = self.hparams["shared_intermediate_size"] + assert ( + data_torch.shape[-2] == 2 * ffn_dim + ), "Merged FFN tensor size must be 2 * shared_intermediate_size" + gate, up = data_torch.split(ffn_dim, dim=-2) + return [ + (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate), + 
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up), + ] + return super().modify_tensors(data_torch, name, bid) -@Model.register("ChameleonForConditionalGeneration") -@Model.register("ChameleonForCausalLM") -class ChameleonModel(Model): +@ModelBase.register("BailingMoeForCausalLM") +class BailingMoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.BAILINGMOE + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + rope_dim = ( + hparams.get("head_dim") + or hparams["hidden_size"] // hparams["num_attention_heads"] + ) + + self.gguf_writer.add_rope_dimension_count(rope_dim) + rope_scaling = self.hparams.get("rope_scaling") or {} + if ( + rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" + and "factor" in rope_scaling + ): + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len( + rope_scaling["original_max_position_embeddings"] + ) + else: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_expert_feed_forward_length( + hparams["moe_intermediate_size"] + ) + self.gguf_writer.add_expert_weights_scale(1.0) + self.gguf_writer.add_expert_count(hparams["num_experts"]) + self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"]) + self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) + + _experts: list[dict[str, Tensor]] | None = None + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return ( + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + n_embd = self.hparams["hidden_size"] + head_dim = self.hparams.get("head_dim") or n_embd // n_head + + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + + if name.endswith("attention.dense.weight"): + return [ + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch) + ] + elif name.endswith("query_key_value.weight"): + q, k, v = data_torch.split( + [n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2 + ) + + return [ + ( + self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), + BailingMoeModel.permute(q, n_head, n_head), + ), + ( + self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), + BailingMoeModel.permute(k, n_head, n_kv_head), + ), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v), + ] + elif name.find("mlp.experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + tensors: list[tuple[str, Tensor]] = [] + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + 
datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + + return tensors + + new_name = self.map_tensor_name(name) + + if new_name == output_name and self.hparams.get("norm_head"): + data_torch = data_torch.float() + data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7 + + return [(new_name, data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("ChameleonForConditionalGeneration") +@ModelBase.register("ChameleonForCausalLM") # obsolete +class ChameleonModel(TextModel): model_arch = gguf.MODEL_ARCH.CHAMELEON def set_gguf_parameters(self): @@ -5798,7 +7042,8 @@ def set_vocab(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # ignore image tokenizer for now + # TODO: remove this once image support is implemented for Chameleon if name.startswith("model.vqmodel"): return [] @@ -5821,6 +7066,7 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] + # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 @staticmethod def _reverse_hf_permute(data_torch, n_heads, hidden_dim): head_dim = hidden_dim // n_heads @@ -5829,24 +7075,35 @@ def _reverse_hf_permute(data_torch, n_heads, hidden_dim): return data_torch +###### CONVERSION LOGIC ###### + + +# tree of lazy tensors class LazyTorchTensor(gguf.LazyBase): _tensor_type = torch.Tensor - + # to keep the type-checker happy dtype: torch.dtype shape: torch.Size + # only used when converting a torch.Tensor to a np.ndarray _dtype_map: dict[torch.dtype, type] = { torch.float16: np.float16, torch.float32: np.float32, } + # used for safetensors slices + # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 + # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 _dtype_str_map: dict[str, torch.dtype] = { "F64": torch.float64, "F32": torch.float32, "BF16": torch.bfloat16, "F16": torch.float16, + # "U64": torch.uint64, "I64": torch.int64, + # "U32": torch.uint32, "I32": torch.int32, + # "U16": torch.uint16, "I16": torch.int16, "U8": torch.uint8, "I8": torch.int8, @@ -5880,9 +7137,21 @@ def from_safetensors_slice(cls, st_slice: Any) -> Tensor: ) return cast(torch.Tensor, lazy) + @classmethod + def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor): + dtype = cls._dtype_str_map[remote_tensor.dtype] + shape = remote_tensor.shape + meta = cls.meta_with_dtype_and_shape(dtype, shape) + lazy = cls( + meta=meta, + args=(remote_tensor,), + func=lambda r: torch.frombuffer(r.data(), dtype=dtype).reshape(shape), + ) + return cast(torch.Tensor, lazy) + @classmethod def __torch_function__(cls, func, types, args=(), kwargs=None): - del types + del types # unused if kwargs is None: kwargs = {} @@ -5978,6 +7247,16 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Print the supported models", ) + parser.add_argument( + "--remote", + 
action="store_true", + help="(Experimental) Read safetensors file remotely without downloading to disk. Config and tokenizer files will still be downloaded. To use this feature, you need to specify Hugging Face model repo name instead of a local directory. For example: 'HuggingFaceTB/SmolLM2-1.7B-Instruct'. Note: To access gated repo, set HF_TOKEN environment variable to your Hugging Face token.", + ) + parser.add_argument( + "--mmproj", + action="store_true", + help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.", + ) args = parser.parse_args() if not args.print_supported_models and args.model is None: @@ -6005,12 +7284,27 @@ def split_str_to_n_bytes(split_str: str) -> int: return n +def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str: + text_config = hparams.get("text_config", {}) + vision_config = hparams.get("vision_config", {}) + arch = hparams["architectures"][0] + # if "architectures" is found in the sub-config, use that instead + if model_type == ModelType.TEXT and text_config.get("architectures") is not None: + arch = text_config["architectures"][0] + elif ( + model_type == ModelType.VISION + and vision_config.get("architectures") is not None + ): + arch = vision_config["architectures"][0] + return arch + + def main() -> None: args = parse_args() if args.print_supported_models: logger.error("Supported models:") - Model.print_registered_models() + ModelBase.print_registered_models() sys.exit(0) if args.verbose: @@ -6020,6 +7314,16 @@ def main() -> None: dir_model = args.model + if args.remote: + from huggingface_hub import snapshot_download + + local_dir = snapshot_download( + repo_id=str(dir_model), + allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"], + ) + dir_model = Path(local_dir) + logger.info(f"Downloaded config and tokenizer to {local_dir}") + if not dir_model.is_dir(): logger.error(f"Error: {args.model} is not a directory") sys.exit(1) @@ -6041,27 +7345,36 @@ def main() -> None: if args.outfile is not None: fname_out = args.outfile + elif args.remote: + # if remote, use the model ID as the output file name + fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf") else: fname_out = dir_model logger.info(f"Loading model: {dir_model.name}") - hparams = Model.load_hparams(dir_model) + if args.mmproj: + if "mmproj" not in fname_out.name: + fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-") with torch.inference_mode(): output_type = ftype_map[args.outtype] - model_architecture = hparams["architectures"][0] - + model_type = ModelType.VISION if args.mmproj else ModelType.TEXT + hparams = ModelBase.load_hparams(dir_model) + model_architecture = get_model_architecture(hparams, model_type) + logger.info(f"Model architecture: {model_architecture}") try: - model_class = Model.from_model_architecture(model_architecture) + model_class = ModelBase.from_model_architecture( + model_architecture, model_type=model_type + ) except NotImplementedError: logger.error(f"Model {model_architecture} is not supported") sys.exit(1) model_instance = model_class( - dir_model=dir_model, - ftype=output_type, - fname_out=fname_out, + dir_model, + output_type, + fname_out, is_big_endian=args.bigendian, use_temp_file=args.use_temp_file, eager=args.no_lazy, @@ -6071,6 +7384,7 @@ def main() -> None: split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, 
small_first_shard=args.no_tensor_first_split, + remote_hf_model_id=str(args.model) if args.remote else None, ) if args.vocab_only: diff --git a/src/convert_lora_to_gguf.py b/src/convert_lora_to_gguf.py index 7abff09..d51932f 100644 --- a/src/convert_lora_to_gguf.py +++ b/src/convert_lora_to_gguf.py @@ -24,10 +24,10 @@ if TYPE_CHECKING: from torch import Tensor - import gguf -from convert_hf_to_gguf import LazyTorchTensor, Model +# reuse model definitions from convert_hf_to_gguf.py +from convert_hf_to_gguf import LazyTorchTensor, ModelBase logger = logging.getLogger("lora-to-gguf") @@ -38,9 +38,10 @@ class PartialLoraTensor: B: Tensor | None = None +# magic to support tensor shape modifications and splitting class LoraTorchTensor: - _lora_A: Tensor - _lora_B: Tensor + _lora_A: Tensor # (n_rank, row_size) + _lora_B: Tensor # (col_size, n_rank) _rank: int def __init__(self, A: Tensor, B: Tensor): @@ -58,14 +59,20 @@ def get_lora_A_B(self) -> tuple[Tensor, Tensor]: def __getitem__( self, - indices: SupportsIndex | slice | tuple[SupportsIndex | slice | Tensor, ...], + indices: ( + SupportsIndex + | slice + | tuple[ + SupportsIndex | slice | Tensor, ... + ] # TODO: add ellipsis in the type signature + ), ) -> LoraTorchTensor: shape = self.shape if isinstance(indices, SupportsIndex): if len(shape) > 2: return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) else: - raise NotImplementedError + raise NotImplementedError # can't return a vector elif isinstance(indices, slice): if len(shape) > 2: return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) @@ -75,7 +82,7 @@ def __getitem__( assert len(indices) > 0 if indices[-1] is Ellipsis: return self[indices[:-1]] - + # expand ellipsis indices = tuple( u for v in ( @@ -95,6 +102,7 @@ def __getitem__( *(slice(None, None) for _ in range(len(indices), len(shape))), ) + # TODO: make sure this is correct indices_A = ( *( ( @@ -110,7 +118,7 @@ def __getitem__( indices_B = indices[:-1] return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B]) else: - raise NotImplementedError + raise NotImplementedError # unknown indice type @property def dtype(self) -> torch.dtype: @@ -133,8 +141,9 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor: new_shape = cast(tuple[int, ...], shape) orig_shape = self.shape if len(new_shape) < 2: - raise NotImplementedError + raise NotImplementedError # can't become a vector + # expand -1 in the shape if any(dim == -1 for dim in new_shape): n_elems = prod(orig_shape) n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape) @@ -144,7 +153,7 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor: ) if new_shape[-1] != orig_shape[-1]: - raise NotImplementedError + raise NotImplementedError # can't reshape the row size trivially shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1]) shape_B = (*new_shape[:-1], self._rank) @@ -163,7 +172,7 @@ def permute(self, *dims: int) -> LoraTorchTensor: shape = self.shape dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims) if dims[-1] == -1: - + # TODO: support higher dimensional A shapes bigger than 1 assert all(dim == 1 for dim in self._lora_A.shape[:-2]) return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims)) if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1: @@ -171,7 +180,7 @@ def permute(self, *dims: int) -> LoraTorchTensor: self._lora_B.permute(*dims), self._lora_A.permute(*dims) ) else: - + # TODO: compose the above two raise NotImplementedError def transpose(self, 
dim0: int, dim1: int) -> LoraTorchTensor: @@ -190,7 +199,7 @@ def to(self, *args, **kwargs): @classmethod def __torch_function__(cls, func: Callable, types, args=(), kwargs=None): - del types + del types # unused if kwargs is None: kwargs = {} @@ -231,7 +240,7 @@ def get_base_tensor_name(lora_tensor_name: str) -> str: base_name = lora_tensor_name.replace("base_model.model.", "") base_name = base_name.replace(".lora_A.weight", ".weight") base_name = base_name.replace(".lora_B.weight", ".weight") - + # models produced by mergekit-extract-lora have token embeddings in the adapter base_name = base_name.replace(".lora_embedding_A", ".weight") base_name = base_name.replace(".lora_embedding_B", ".weight") return base_name @@ -293,7 +302,7 @@ def parse_args() -> argparse.Namespace: def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: - + # normally, adapter does not come with base model config, we need to load it from AutoConfig config = AutoConfig.from_pretrained(hf_model_id) return config.to_dict() @@ -321,11 +330,11 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: if args.outfile is not None: fname_out = args.outfile else: - + # output in the same directory as the model by default fname_out = dir_lora if os.path.exists(input_model): - + # lazy import load_file only if lora is in safetensors format. from safetensors.torch import load_file lora_model = load_file(input_model, device="cpu") @@ -333,9 +342,11 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: input_model = os.path.join(dir_lora, "adapter_model.bin") lora_model = torch.load(input_model, map_location="cpu", weights_only=True) + # load LoRA config with open(lora_config, "r") as f: lparams: dict[str, Any] = json.load(f) + # load base model if base_model_id is not None: logger.info(f"Loading base model from Hugging Face: {base_model_id}") hparams = load_hparams_from_hf(base_model_id) @@ -361,11 +372,11 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: sys.exit(1) else: logger.info(f"Loading base model: {dir_base_model.name}") - hparams = Model.load_hparams(dir_base_model) + hparams = ModelBase.load_hparams(dir_base_model) with torch.inference_mode(): try: - model_class = Model.from_model_architecture(hparams["architectures"][0]) + model_class = ModelBase.from_model_architecture(hparams["architectures"][0]) except NotImplementedError: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) @@ -397,7 +408,7 @@ def set_gguf_parameters(self): ) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - + # Never add extra tensors (e.g. 
rope_freqs) for LoRA adapters return () def get_tensors(self) -> Iterator[tuple[str, Tensor]]: @@ -407,13 +418,13 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: if self.lazy: tensor = LazyTorchTensor.from_eager(tensor) base_name = get_base_tensor_name(name) - + # note: mergekit-extract-lora also adds token embeddings to the adapter is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name if not is_lora_a and not is_lora_b: if ".base_layer.weight" in name: continue - + # mergekit-extract-lora add these layernorm to the adapter, we need to keep them if "_layernorm" in name or ".norm" in name: yield (base_name, tensor) continue @@ -452,21 +463,27 @@ def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: dest = list(super().modify_tensors(data_torch, name, bid)) - + # some archs may have the same tensor for lm_head and output (tie word embeddings) + # in this case, adapters targeting lm_head will fail when using llama-export-lora + # therefore, we ignore them for now + # see: https://github.com/ggml-org/llama.cpp/issues/9065 if name == "lm_head.weight" and len(dest) == 0: raise ValueError( "lm_head is present in adapter, but is ignored in base model" ) for dest_name, dest_data in dest: - + # mergekit-extract-lora add these layernorm to the adapter if "_norm" in dest_name: assert dest_data.dim() == 1 yield (dest_name, dest_data) continue + # otherwise, we must get the lora_A and lora_B tensors assert isinstance(dest_data, LoraTorchTensor) lora_a, lora_b = dest_data.get_lora_A_B() + # note: mergekit-extract-lora flip and transpose A and B + # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd() if "token_embd.weight" in dest_name: lora_a = lora_a.T diff --git a/src/gguf/constants.py b/src/gguf/constants.py index b4b4cca..8a9c282 100644 --- a/src/gguf/constants.py +++ b/src/gguf/constants.py @@ -108,6 +108,7 @@ class LLM: EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale" EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm" EXPERT_GATING_FUNC = "{arch}.expert_gating_func" + MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers" POOLING_TYPE = "{arch}.pooling_type" LOGIT_SCALE = "{arch}.logit_scale" DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" @@ -120,6 +121,7 @@ class LLM: RESIDUAL_SCALE = "{arch}.residual_scale" EMBEDDING_SCALE = "{arch}.embedding_scale" TOKEN_SHIFT_COUNT = "{arch}.token_shift_count" + INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -142,6 +144,8 @@ class Attention: REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" SLIDING_WINDOW = "{arch}.attention.sliding_window" SCALE = "{arch}.attention.scale" + KEY_LENGTH_MLA = "{arch}.attention.key_length_mla" + VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla" class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" @@ -221,6 +225,30 @@ class Adapter: TYPE = "adapter.type" LORA_ALPHA = "adapter.lora.alpha" + class ClipVision: + PROJECTOR_TYPE = "clip.projector_type" + HAS_VISION_ENCODER = "clip.has_vision_encoder" + HAS_LLAVA_PROJECTOR = "clip.has_llava_projector" + IMAGE_SIZE = "clip.vision.image_size" + PATCH_SIZE = "clip.vision.patch_size" + EMBEDDING_LENGTH = "clip.vision.embedding_length" + FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length" + PROJECTION_DIM = "clip.vision.projection_dim" + BLOCK_COUNT = "clip.vision.block_count" + IMAGE_MEAN = 
"clip.vision.image_mean" + IMAGE_STD = "clip.vision.image_std" + SPATIAL_MERGE_SIZE = "clip.vision.spatial_merge_size" + USE_GELU = "clip.use_gelu" + USE_SILU = "clip.use_silu" + N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl + + class Attention: + HEAD_COUNT = "clip.vision.attention.head_count" + LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon" + + class Projector: + SCALE_FACTOR = "clip.vision.projector.scale_factor" + # # recommended mapping of model tensor names for storage in gguf @@ -230,10 +258,13 @@ class Adapter: class GGUFType: MODEL = "model" ADAPTER = "adapter" + CLIP_VISION = "clip-vision" class MODEL_ARCH(IntEnum): + CLIP_VISION = auto() # dummy arch for clip.cpp LLAMA = auto() + LLAMA4 = auto() DECI = auto() FALCON = auto() BAICHUAN = auto() @@ -246,6 +277,7 @@ class MODEL_ARCH(IntEnum): REFACT = auto() BERT = auto() NOMIC_BERT = auto() + NOMIC_BERT_MOE = auto() JINA_BERT_V2 = auto() BLOOM = auto() STABLELM = auto() @@ -253,6 +285,8 @@ class MODEL_ARCH(IntEnum): QWEN2 = auto() QWEN2MOE = auto() QWEN2VL = auto() + QWEN3 = auto() + QWEN3MOE = auto() PHI2 = auto() PHI3 = auto() PHIMOE = auto() @@ -283,6 +317,7 @@ class MODEL_ARCH(IntEnum): DEEPSEEK = auto() DEEPSEEK2 = auto() CHATGLM = auto() + GLM4 = auto() BITNET = auto() T5 = auto() T5ENCODER = auto() @@ -293,6 +328,18 @@ class MODEL_ARCH(IntEnum): GRANITE_MOE = auto() CHAMELEON = auto() WAVTOKENIZER_DEC = auto() + PLM = auto() + BAILINGMOE = auto() + + +class VISION_PROJECTOR_TYPE(IntEnum): + MLP = auto() + LDP = auto() + LDPV2 = auto() + RESAMPLER = auto() + GLM_EDGE = auto() + MERGER = auto() + GEMMA3 = auto() class MODEL_TENSOR(IntEnum): @@ -382,6 +429,8 @@ class MODEL_TENSOR(IntEnum): ATTN_Q_B = auto() ATTN_KV_A_MQA = auto() ATTN_KV_B = auto() + ATTN_K_B = auto() + ATTN_V_B = auto() ATTN_Q_A_NORM = auto() ATTN_KV_A_NORM = auto() FFN_SUB_NORM = auto() @@ -432,10 +481,51 @@ class MODEL_TENSOR(IntEnum): POSNET_ATTN_K = auto() POSNET_ATTN_V = auto() POSNET_ATTN_OUT = auto() + # vision + V_MMPROJ = auto() + V_MMPROJ_FC = auto() + V_MMPROJ_MLP = auto() + V_MMPROJ_PEG = auto() + V_ENC_EMBD_CLS = auto() + V_ENC_EMBD_PATCH = auto() + V_ENC_EMBD_POS = auto() + V_ENC_ATTN_Q = auto() + V_ENC_ATTN_Q_NORM = auto() + V_ENC_ATTN_K = auto() + V_ENC_ATTN_K_NORM = auto() + V_ENC_ATTN_V = auto() + V_ENC_INPUT_NORM = auto() + V_ENC_OUTPUT = auto() + V_ENC_OUTPUT_NORM = auto() + V_ENC_FFN_UP = auto() + V_ENC_FFN_GATE = auto() + V_ENC_FFN_DOWN = auto() + V_LAYER_SCALE_1 = auto() + V_LAYER_SCALE_2 = auto() + V_PRE_NORM = auto() + V_POST_NORM = auto() + V_MM_INP_NORM = auto() + V_MM_INP_PROJ = auto() # gemma3 + V_MM_SOFT_EMB_NORM = auto() # gemma3 + V_RESMPL_POS_EMBD_K = auto() # minicpmv + V_RESMPL_ATTN_Q = auto() # minicpmv + V_RESMPL_ATTN_K = auto() # minicpmv + V_RESMPL_ATTN_V = auto() # minicpmv + V_RESMPL_ATTN_OUT = auto() # minicpmv + V_RESMPL_KV = auto() # minicpmv + V_RESMPL_KV_NORM = auto() # minicpmv + V_RESMPL_POST_NORM = auto() # minicpmv + V_RESMPL_Q_NORM = auto() # minicpmv + V_RESMPL_PROJ = auto() # minicpmv + V_RESMPL_QUERY = auto() # minicpmv + V_TOK_EMBD_IMG_BREAK = auto() # pixtral + V_MM_PATCH_MERGER = auto() # mistral small 3.1 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { + MODEL_ARCH.CLIP_VISION: "clip", # dummy arch for clip.cpp MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.LLAMA4: "llama4", MODEL_ARCH.DECI: "deci", MODEL_ARCH.FALCON: "falcon", MODEL_ARCH.BAICHUAN: "baichuan", @@ -448,6 +538,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.REFACT: "refact", MODEL_ARCH.BERT: "bert", MODEL_ARCH.NOMIC_BERT: 
"nomic-bert", + MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe", MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.STABLELM: "stablelm", @@ -455,6 +546,8 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.QWEN2: "qwen2", MODEL_ARCH.QWEN2MOE: "qwen2moe", MODEL_ARCH.QWEN2VL: "qwen2vl", + MODEL_ARCH.QWEN3: "qwen3", + MODEL_ARCH.QWEN3MOE: "qwen3moe", MODEL_ARCH.PHI2: "phi2", MODEL_ARCH.PHI3: "phi3", MODEL_ARCH.PHIMOE: "phimoe", @@ -485,6 +578,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.DEEPSEEK: "deepseek", MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.CHATGLM: "chatglm", + MODEL_ARCH.GLM4: "glm4", MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.T5: "t5", MODEL_ARCH.T5ENCODER: "t5encoder", @@ -495,6 +589,18 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GRANITE_MOE: "granitemoe", MODEL_ARCH.CHAMELEON: "chameleon", MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec", + MODEL_ARCH.PLM: "plm", + MODEL_ARCH.BAILINGMOE: "bailingmoe", +} + +VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { + VISION_PROJECTOR_TYPE.MLP: "mlp", + VISION_PROJECTOR_TYPE.LDP: "ldp", + VISION_PROJECTOR_TYPE.LDPV2: "ldpv2", + VISION_PROJECTOR_TYPE.RESAMPLER: "resampler", + VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter", + VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger", + VISION_PROJECTOR_TYPE.GEMMA3: "gemma3", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -584,6 +690,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", + MODEL_TENSOR.ATTN_K_B: "blk.{bid}.attn_k_b", + MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b", MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", @@ -634,9 +742,88 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", + # vision + MODEL_TENSOR.V_MMPROJ: "mm.{bid}", + MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", + MODEL_TENSOR.V_MMPROJ_MLP: "mm.model.mlp.{bid}", + MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}", + MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd", + MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd", + MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd", + MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q", + MODEL_TENSOR.V_ENC_ATTN_Q_NORM: "v.blk.{bid}.attn_q_norm", + MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k", + MODEL_TENSOR.V_ENC_ATTN_K_NORM: "v.blk.{bid}.attn_k_norm", + MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v", + MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1", + MODEL_TENSOR.V_ENC_OUTPUT: "v.blk.{bid}.attn_out", + MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.blk.{bid}.ln2", + MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up", + MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate", + MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down", + MODEL_TENSOR.V_LAYER_SCALE_1: "v.blk.{bid}.ls1", + MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2", + MODEL_TENSOR.V_PRE_NORM: "v.pre_ln", + MODEL_TENSOR.V_POST_NORM: "v.post_ln", + MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", + MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm", + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k", + MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q", + MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k", + MODEL_TENSOR.V_RESMPL_ATTN_V: "resampler.attn.v", + MODEL_TENSOR.V_RESMPL_ATTN_OUT: 
"resampler.attn.out", + MODEL_TENSOR.V_RESMPL_KV: "resampler.kv", + MODEL_TENSOR.V_RESMPL_KV_NORM: "resampler.ln_kv", + MODEL_TENSOR.V_RESMPL_POST_NORM: "resampler.ln_post", + MODEL_TENSOR.V_RESMPL_Q_NORM: "resampler.ln_q", + MODEL_TENSOR.V_RESMPL_PROJ: "resampler.proj", + MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query", + MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral + MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1 } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { + MODEL_ARCH.CLIP_VISION: [ + MODEL_TENSOR.V_MMPROJ, + MODEL_TENSOR.V_MMPROJ_FC, + MODEL_TENSOR.V_MMPROJ_MLP, + MODEL_TENSOR.V_MMPROJ_PEG, + MODEL_TENSOR.V_ENC_EMBD_CLS, + MODEL_TENSOR.V_ENC_EMBD_PATCH, + MODEL_TENSOR.V_ENC_EMBD_POS, + MODEL_TENSOR.V_ENC_ATTN_Q, + MODEL_TENSOR.V_ENC_ATTN_Q_NORM, + MODEL_TENSOR.V_ENC_ATTN_K, + MODEL_TENSOR.V_ENC_ATTN_K_NORM, + MODEL_TENSOR.V_ENC_ATTN_V, + MODEL_TENSOR.V_ENC_INPUT_NORM, + MODEL_TENSOR.V_ENC_OUTPUT, + MODEL_TENSOR.V_ENC_OUTPUT_NORM, + MODEL_TENSOR.V_ENC_FFN_UP, + MODEL_TENSOR.V_ENC_FFN_GATE, + MODEL_TENSOR.V_ENC_FFN_DOWN, + MODEL_TENSOR.V_LAYER_SCALE_1, + MODEL_TENSOR.V_LAYER_SCALE_2, + MODEL_TENSOR.V_PRE_NORM, + MODEL_TENSOR.V_POST_NORM, + MODEL_TENSOR.V_MM_INP_PROJ, + MODEL_TENSOR.V_MM_INP_NORM, + MODEL_TENSOR.V_MM_SOFT_EMB_NORM, + MODEL_TENSOR.V_RESMPL_POS_EMBD_K, + MODEL_TENSOR.V_RESMPL_ATTN_Q, + MODEL_TENSOR.V_RESMPL_ATTN_K, + MODEL_TENSOR.V_RESMPL_ATTN_V, + MODEL_TENSOR.V_RESMPL_ATTN_OUT, + MODEL_TENSOR.V_RESMPL_KV, + MODEL_TENSOR.V_RESMPL_KV_NORM, + MODEL_TENSOR.V_RESMPL_POST_NORM, + MODEL_TENSOR.V_RESMPL_Q_NORM, + MODEL_TENSOR.V_RESMPL_PROJ, + MODEL_TENSOR.V_RESMPL_QUERY, + MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK, + MODEL_TENSOR.V_MM_PATCH_MERGER, + ], MODEL_ARCH.LLAMA: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -657,6 +844,29 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.LLAMA4: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + ], MODEL_ARCH.DECI: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -780,6 +990,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP, MODEL_TENSOR.LAYER_OUT_NORM, ], + MODEL_ARCH.NOMIC_BERT_MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.TOKEN_TYPES, + MODEL_TENSOR.POS_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_OUT_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.LAYER_OUT_NORM, + ], MODEL_ARCH.JINA_BERT_V2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, @@ -930,6 +1156,40 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, ], + MODEL_ARCH.QWEN3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + 
MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.QWEN3MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], MODEL_ARCH.PLAMO: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1120,6 +1380,7 @@ class MODEL_TENSOR(IntEnum): ], MODEL_ARCH.GEMMA3: [ MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_Q_NORM, @@ -1453,6 +1714,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_Q_B, MODEL_TENSOR.ATTN_KV_A_MQA, MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_K_B, + MODEL_TENSOR.ATTN_V_B, MODEL_TENSOR.ATTN_Q_A_NORM, MODEL_TENSOR.ATTN_KV_A_NORM, MODEL_TENSOR.ATTN_OUT, @@ -1470,6 +1733,20 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP_SHEXP, MODEL_TENSOR.FFN_EXP_PROBS_B, ], + MODEL_ARCH.PLM: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_KV_A_MQA, + MODEL_TENSOR.ATTN_KV_A_NORM, + MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_DOWN, + ], MODEL_ARCH.CHATGLM: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.ROPE_FREQS, @@ -1485,6 +1762,23 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.GLM4: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_POST_NORM, + ], MODEL_ARCH.BITNET: [ MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, @@ -1618,6 +1912,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE_EXP, MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, ], MODEL_ARCH.CHAMELEON: [ MODEL_TENSOR.TOKEN_EMBD, @@ -1657,6 +1954,25 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POSNET_ATTN_V, MODEL_TENSOR.POSNET_ATTN_OUT, ], + MODEL_ARCH.BAILINGMOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + ], # TODO } @@ -1709,6 +2025,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.BAILINGMOE: [ + MODEL_TENSOR.ROPE_FREQS, + ], } # @@ -1736,6 +2055,8 @@ class PoolingType(IntEnum): NONE = 0 MEAN = 1 CLS = 2 + LAST = 3 + RANK = 4 class GGMLQuantizationType(IntEnum): @@ -1862,6 +2183,15 @@ def get_type(val: Any) -> GGUFValueType: raise ValueError(f"Unknown type: {type(val)}") +class VisionProjectorType: + GEMMA3 = "gemma3" + IDEFICS3 = 
"idefics3" + PIXTRAL = "pixtral" + QWEN2VL = "qwen2vl_merger" + QWEN25VL = "qwen2.5vl_merger" + INTERNVL = "internvl" + + # Items here are (block size, type size) QK_K = 256 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = { diff --git a/src/gguf/gguf.py b/src/gguf/gguf.py index 651a81e..f1b4849 100644 --- a/src/gguf/gguf.py +++ b/src/gguf/gguf.py @@ -1,7 +1,3 @@ -# This file left for compatibility. If you want to use the GGUF API from Python -# then don't import gguf/gguf.py directly. If you're looking for examples, see the -# examples/ directory for gguf-py - import importlib import sys from pathlib import Path diff --git a/src/gguf/gguf_reader.py b/src/gguf/gguf_reader.py index d1d1931..19ad7d8 100644 --- a/src/gguf/gguf_reader.py +++ b/src/gguf/gguf_reader.py @@ -1,7 +1,3 @@ -# -# GGUF file reading/modification support. For API usage information, -# please see the files scripts/ for some fairly simple examples. -# from __future__ import annotations import logging diff --git a/src/gguf/gguf_writer.py b/src/gguf/gguf_writer.py index a279b74..62f07e0 100644 --- a/src/gguf/gguf_writer.py +++ b/src/gguf/gguf_writer.py @@ -774,6 +774,12 @@ def add_key_length(self, length: int) -> None: def add_value_length(self, length: int) -> None: self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length) + def add_key_length_mla(self, length: int) -> None: + self.add_uint32(Keys.Attention.KEY_LENGTH_MLA.format(arch=self.arch), length) + + def add_value_length_mla(self, length: int) -> None: + self.add_uint32(Keys.Attention.VALUE_LENGTH_MLA.format(arch=self.arch), length) + def add_max_alibi_bias(self, bias: float) -> None: self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias) @@ -807,6 +813,9 @@ def add_expert_weights_norm(self, value: bool) -> None: def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None: self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value) + def add_moe_every_n_layers(self, value: int) -> None: + self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value) + def add_swin_norm(self, value: bool) -> None: self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value) @@ -831,6 +840,11 @@ def add_wkv_head_size(self, size: int) -> None: def add_token_shift_count(self, count: int) -> None: self.add_uint32(Keys.LLM.TOKEN_SHIFT_COUNT.format(arch=self.arch), count) + def add_interleave_moe_layer_step(self, value: int) -> None: + self.add_uint32( + Keys.LLM.INTERLEAVE_MOE_LAYER_STEP.format(arch=self.arch), value + ) + def add_layer_norm_eps(self, value: float) -> None: self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) @@ -1017,6 +1031,59 @@ def add_eot_token_id(self, id: int) -> None: def add_eom_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOM_ID, id) + # for vision models + + def add_vision_projection_dim(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value) + + def add_vision_has_vision_encoder(self, value: bool) -> None: + self.add_bool(Keys.ClipVision.HAS_VISION_ENCODER, value) + + def add_vision_patch_size(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.PATCH_SIZE, value) + + def add_vision_embedding_length(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.EMBEDDING_LENGTH, value) + + def add_vision_feed_forward_length(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.FEED_FORWARD_LENGTH, value) + + def add_vision_block_count(self, value: int) -> None: + 
self.add_uint32(Keys.ClipVision.BLOCK_COUNT, value) + + def add_vision_head_count(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT, value) + + def add_vision_projector_type(self, value: str) -> None: + self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value) + + def add_vision_attention_layernorm_eps(self, value: float) -> None: + self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value) + + def add_vision_image_size(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value) + + def add_vision_image_mean(self, values: Sequence[float]) -> None: + self.add_array(Keys.ClipVision.IMAGE_MEAN, values) + + def add_vision_image_std(self, values: Sequence[float]) -> None: + self.add_array(Keys.ClipVision.IMAGE_STD, values) + + def add_vision_spatial_merge_size(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.SPATIAL_MERGE_SIZE, value) + + def add_vision_use_gelu(self, value: bool) -> None: + self.add_bool(Keys.ClipVision.USE_GELU, value) + + def add_vision_use_silu(self, value: bool) -> None: + self.add_bool(Keys.ClipVision.USE_SILU, value) + + def add_vision_projector_scale_factor(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value) + + def add_vision_n_wa_pattern(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value) + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: pack_prefix = "" if not skip_pack_prefix: diff --git a/src/gguf/lazy.py b/src/gguf/lazy.py index f3273f5..0d6c24b 100644 --- a/src/gguf/lazy.py +++ b/src/gguf/lazy.py @@ -201,6 +201,27 @@ def wrapped_fn(*args, **kwargs): return cls( meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn ) + elif isinstance(res, tuple) and all( + isinstance(t, cls._tensor_type) for t in res + ): + # share the evaluation between lazy tuple elements + shared_args: list = [args, None] + + def eager_tuple_element(a: list[Any], i: int = 0, /, **kw) -> LazyBase: + assert len(a) == 2 + if a[1] is None: + a[1] = fn(*a[0], **kw) + return a[1][i] + + return tuple( + cls( + meta=cls.eager_to_meta(res[i]), + args=(shared_args, i), + kwargs=kwargs, + func=eager_tuple_element, + ) + for i in range(len(res)) + ) else: del res # not needed # non-tensor return likely relies on the contents of the args diff --git a/src/gguf/tensor_mapping.py b/src/gguf/tensor_mapping.py index 5058eba..7140e15 100644 --- a/src/gguf/tensor_mapping.py +++ b/src/gguf/tensor_mapping.py @@ -13,7 +13,7 @@ class TensorNameMap: "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.word_embeddings", # falcon "word_embeddings", # bloom - "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 + "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon @@ -29,6 +29,8 @@ class TensorNameMap: "shared", # t5 "rwkv.embeddings", # rwkv6 "model.embeddings", # rwkv7 + "model.word_embeddings", # bailingmoe + "language_model.model.embed_tokens", # llama4 ), # Token type embeddings MODEL_TENSOR.TOKEN_TYPES: ( @@ -62,6 +64,7 @@ class TensorNameMap: "output_layer", # chatglm "head", # rwkv "head.out", # wavtokenizer + "lm_head", # llama4 ), # Output norm MODEL_TENSOR.OUTPUT_NORM: ( @@ -83,6 +86,7 @@ class TensorNameMap: "rwkv.ln_out", # rwkv6 "model.ln_out", # rwkv7 "backbone.final_layer_norm", # wavtokenizer + "model.norm", # llama4 ), # Rope frequencies 
MODEL_TENSOR.ROPE_FREQS: ( @@ -119,6 +123,7 @@ class TensorNameMap: "transformer.layers.{bid}.attn_norm", # openelm "rwkv.blocks.{bid}.ln1", # rwkv6 "model.layers.{bid}.ln1", # rwkv7 + "model.layers.{bid}.input_layernorm", # llama4 ), # Attention norm 2 MODEL_TENSOR.ATTN_NORM_2: ( @@ -155,6 +160,7 @@ class TensorNameMap: "model.layers.{bid}.attention.wq", # internlm2 "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok "transformer.h.{bid}.attn.attention.q_proj", # exaone + "model.layers.{bid}.self_attn.q_proj", # llama4 ), # Attention key MODEL_TENSOR.ATTN_K: ( @@ -168,6 +174,7 @@ class TensorNameMap: "model.layers.{bid}.attention.wk", # internlm2 "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok "transformer.h.{bid}.attn.attention.k_proj", # exaone + "model.layers.{bid}.self_attn.k_proj", # llama4 ), # Attention value MODEL_TENSOR.ATTN_V: ( @@ -180,6 +187,7 @@ class TensorNameMap: "model.layers.{bid}.attention.wv", # internlm2 "transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok "transformer.h.{bid}.attn.attention.v_proj", # exaone + "model.layers.{bid}.self_attn.v_proj", # llama4 ), # Attention output MODEL_TENSOR.ATTN_OUT: ( @@ -205,6 +213,7 @@ class TensorNameMap: "encoder.layers.{bid}.self_attention.dense", # chatglm "transformer.layers.{bid}.attn.out_proj", # openelm "transformer.h.{bid}.attn.attention.out_proj", # exaone + "model.layers.{bid}.self_attn.o_proj", # llama4 ), # Attention output norm MODEL_TENSOR.ATTN_OUT_NORM: ( @@ -214,7 +223,8 @@ class TensorNameMap: "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx ), MODEL_TENSOR.ATTN_POST_NORM: ( - "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 + "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge + "model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414 ), # Rotary embeddings MODEL_TENSOR.ATTN_ROT_EMBD: ( @@ -238,6 +248,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.rms_norm_2", # Grok "encoder.layers.{bid}.post_attention_layernorm", # chatglm "transformer.layers.{bid}.ffn_norm", # openelm + "model.layers.{bid}.post_attention_layernorm", # llama4 ), # Post feed-forward norm MODEL_TENSOR.FFN_PRE_NORM: ( @@ -246,6 +257,7 @@ class TensorNameMap: # Post feed-forward norm MODEL_TENSOR.FFN_POST_NORM: ( "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2 + "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414 ), MODEL_TENSOR.FFN_GATE_INP: ( "layers.{bid}.feed_forward.gate", # mixtral @@ -254,6 +266,8 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.router", # Grok "transformer.blocks.{bid}.ffn.router.layer", # dbrx "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe + "model.layers.{bid}.feed_forward.router", # llama4 + "encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe ), MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( "model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe @@ -279,15 +293,17 @@ class TensorNameMap: "h.{bid}.mlp.c_fc", # gpt2 "transformer.h.{bid}.mlp.fc1", # phi2 "model.layers.{bid}.mlp.fc1", # phi2 - "model.layers.{bid}.mlp.gate_up_proj", # phi3 + "model.layers.{bid}.mlp.gate_up_proj", # phi3 glm-4-0414 "model.layers.layers.{bid}.mlp.up_proj", # plamo "model.layers.{bid}.feed_forward.w3", # internlm2 "encoder.layers.{bid}.mlp.fc11", # nomic-bert + "encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe "model.layers.{bid}.mlp.c_fc", # starcoder2 "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 "model.layers.{bid}.residual_mlp.w3", # arctic "encoder.layers.{bid}.mlp.dense_h_to_4h", # 
chatglm "transformer.h.{bid}.mlp.c_fc_1", # exaone + "model.layers.{bid}.feed_forward.up_proj", # llama4 ), MODEL_TENSOR.FFN_UP_EXP: ( "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) @@ -295,10 +311,13 @@ class TensorNameMap: "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged) + "model.layers.{bid}.feed_forward.experts.up_proj", # llama4 + "encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe ), MODEL_TENSOR.FFN_UP_SHEXP: ( "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2 + "model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4 ), # AWQ-activation gate MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), # mpt @@ -315,6 +334,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.linear_1", # refact "model.layers.{bid}.residual_mlp.w1", # arctic "transformer.h.{bid}.mlp.c_fc_0", # exaone + "model.layers.{bid}.feed_forward.gate_proj", # llama4 ), MODEL_TENSOR.FFN_GATE_EXP: ( "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) @@ -322,10 +342,12 @@ class TensorNameMap: "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged) + "model.layers.{bid}.feed_forward.experts.gate_proj", # llama4 ), MODEL_TENSOR.FFN_GATE_SHEXP: ( "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2 + "model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4 ), # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( @@ -353,6 +375,7 @@ class TensorNameMap: "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2 "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm "model.layers.h.{bid}.mlp.c_proj", # exaone + "model.layers.{bid}.feed_forward.down_proj", # llama4 ), MODEL_TENSOR.FFN_DOWN_EXP: ( "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) @@ -361,10 +384,14 @@ class TensorNameMap: "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged) + "model.layers.{bid}.feed_forward.experts.down_proj", # llama4 + "encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe ), MODEL_TENSOR.FFN_DOWN_SHEXP: ( "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2 + "model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4 + "model.layers.{bid}.shared_mlp.output_linear", # granitemoe ), MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", @@ -539,6 +566,8 @@ class TensorNameMap: MODEL_TENSOR.ATTN_KV_B: ( "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2 ), + MODEL_TENSOR.ATTN_K_B: ("model.layers.{bid}.self_attn.k_b_proj",), # deepseek2 + MODEL_TENSOR.ATTN_V_B: ("model.layers.{bid}.self_attn.v_b_proj",), # deepseek2 MODEL_TENSOR.ATTN_Q_A_NORM: ( "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2 ), @@ -636,6 +665,147 @@ class TensorNameMap: MODEL_TENSOR.POSNET_ATTN_OUT: ( "backbone.posnet.{bid}.proj_out", # wavtokenizer ), + ############################################################################# + ## Vision encoder + MODEL_TENSOR.V_MMPROJ: ( + 
"multi_modal_projector.linear_{bid}", + "visual.merger.mlp.{bid}", # qwen2vl + ), + MODEL_TENSOR.V_MMPROJ_FC: ( + "model.connector.modality_projection.proj", # SmolVLM + ), + MODEL_TENSOR.V_MMPROJ_MLP: ( + "model.mm_projector.mlp.mlp.{bid}", + "mlp1.{bid}", # InternVL + ), + MODEL_TENSOR.V_MMPROJ_PEG: ("model.mm_projector.peg.peg.{bid}",), + MODEL_TENSOR.V_ENC_EMBD_CLS: ( + "vision_tower.vision_model.embeddings.class_embedding", + ), + MODEL_TENSOR.V_ENC_EMBD_PATCH: ( + "vision_tower.vision_model.embeddings.patch_embedding", + "vpm.embeddings.patch_embedding", + "model.vision_model.embeddings.patch_embedding", # SmolVLM + "vision_tower.patch_conv", # pixtral + "visual.patch_embed.proj", # qwen2vl + ), + MODEL_TENSOR.V_ENC_EMBD_POS: ( + "vision_tower.vision_model.embeddings.position_embedding", + "vpm.embeddings.position_embedding", + "model.vision_model.embeddings.position_embedding", # SmolVLM + ), + MODEL_TENSOR.V_ENC_ATTN_Q: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", + "vpm.encoder.layers.{bid}.self_attn.q_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM + "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral + "visual.blocks.{bid}.attn.q", # qwen2vl, generated + ), + MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL + ), + MODEL_TENSOR.V_ENC_ATTN_K: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", + "vpm.encoder.layers.{bid}.self_attn.k_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM + "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral + "visual.blocks.{bid}.attn.k", # qwen2vl, generated + ), + MODEL_TENSOR.V_ENC_ATTN_K_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL + ), + MODEL_TENSOR.V_ENC_ATTN_V: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", + "vpm.encoder.layers.{bid}.self_attn.v_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM + "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral + "visual.blocks.{bid}.attn.v", # qwen2vl, generated + ), + MODEL_TENSOR.V_ENC_INPUT_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", + "vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL + "vpm.encoder.layers.{bid}.layer_norm1", + "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM + "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral + "visual.blocks.{bid}.norm1", # qwen2vl + ), + MODEL_TENSOR.V_ENC_OUTPUT: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", + "vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL + "vpm.encoder.layers.{bid}.self_attn.out_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM + "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral + "visual.blocks.{bid}.attn.proj", # qwen2vl + ), + MODEL_TENSOR.V_ENC_OUTPUT_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", + "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL + "vpm.encoder.layers.{bid}.layer_norm2", + "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM + "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral + "visual.blocks.{bid}.norm2", # qwen2vl + ), + MODEL_TENSOR.V_ENC_FFN_UP: ( + "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", + "vpm.encoder.layers.{bid}.mlp.fc1", + 
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 + "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral + "visual.blocks.{bid}.mlp.fc1", # qwen2vl + "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl + ), + MODEL_TENSOR.V_ENC_FFN_GATE: ( + "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral + "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl + ), + MODEL_TENSOR.V_ENC_FFN_DOWN: ( + "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", + "vpm.encoder.layers.{bid}.mlp.fc2", + "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 + "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral + "visual.blocks.{bid}.mlp.fc2", # qwen2vl + "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl + ), + MODEL_TENSOR.V_LAYER_SCALE_1: ( + "vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL + ), + MODEL_TENSOR.V_LAYER_SCALE_2: ( + "vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL + ), + MODEL_TENSOR.V_PRE_NORM: ( + "vision_tower.vision_model.pre_layrnorm", + "vision_tower.ln_pre", # pixtral + ), + MODEL_TENSOR.V_POST_NORM: ( + "vision_tower.vision_model.post_layernorm", + "model.vision_model.post_layernorm", # SmolVLM + "visual.merger.ln_q", # qwen2vl + ), + MODEL_TENSOR.V_MM_INP_PROJ: ("multi_modal_projector.mm_input_projection",), + MODEL_TENSOR.V_MM_INP_NORM: ("multi_modal_projector.norm",), + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ("multi_modal_projector.mm_soft_emb_norm",), + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: ("resampler.pos_embed_k",), + MODEL_TENSOR.V_RESMPL_ATTN_Q: ( + "resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj + ), + MODEL_TENSOR.V_RESMPL_ATTN_K: ( + "resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj + ), + MODEL_TENSOR.V_RESMPL_ATTN_V: ( + "resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj + ), + MODEL_TENSOR.V_RESMPL_ATTN_OUT: ("resampler.attn.out_proj",), + MODEL_TENSOR.V_RESMPL_KV: ("resampler.kv_proj",), + MODEL_TENSOR.V_RESMPL_POST_NORM: ("resampler.ln_post",), + MODEL_TENSOR.V_RESMPL_KV_NORM: ("resampler.ln_kv",), + MODEL_TENSOR.V_RESMPL_Q_NORM: ("resampler.ln_q",), + MODEL_TENSOR.V_RESMPL_PROJ: ("resampler.proj",), + MODEL_TENSOR.V_RESMPL_QUERY: ("resampler.query",), + MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: ( + "v.token_embd.img_break", # for pixtral, this is a generated vector + ), + MODEL_TENSOR.V_MM_PATCH_MERGER: ( + "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 + ), } # architecture-specific block mappings diff --git a/src/gguf/utility.py b/src/gguf/utility.py index c514251..98c1ee7 100644 --- a/src/gguf/utility.py +++ b/src/gguf/utility.py @@ -1,7 +1,11 @@ from __future__ import annotations +from dataclasses import dataclass from typing import Literal +import os +import json + def fill_templated_filename(filename: str, output_type: str | None) -> str: # Given a file name fill in any type templates e.g. 'some-model-name.{ftype}.gguf' @@ -99,3 +103,214 @@ def naming_convention( kind = f"-{model_type.strip().replace(' ', '-')}" if model_type is not None else "" return f"{name}{parameters}{finetune}{version}{encoding}{kind}" + + +@dataclass +class RemoteTensor: + dtype: str + shape: tuple[int, ...] + offset_start: int + size: int + url: str + + def data(self) -> bytearray: + # TODO: handle request errors (maybe with limited retries?) 
+ # NOTE: using a bytearray, otherwise PyTorch complains the buffer is not writeable + data = bytearray( + SafetensorRemote.get_data_by_range( + url=self.url, start=self.offset_start, size=self.size + ) + ) + return data + + +class SafetensorRemote: + """ + Uility class to handle remote safetensor files. + This class is designed to work with Hugging Face model repositories. + + Example (one model has single safetensor file, the other has multiple): + for model_id in ["ngxson/TEST-Tiny-Llama4", "Qwen/Qwen2.5-7B-Instruct"]: + tensors = SafetensorRemote.get_list_tensors_hf_model(model_id) + print(tensors) + + Example reading tensor data: + tensors = SafetensorRemote.get_list_tensors_hf_model(model_id) + for name, meta in tensors.items(): + dtype, shape, offset_start, size, remote_safetensor_url = meta + # read the tensor data + data = SafetensorRemote.get_data_by_range(remote_safetensor_url, offset_start, size) + print(data) + """ + + BASE_DOMAIN = "https://huggingface.co" + ALIGNMENT = 8 # bytes + + @classmethod + def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]: + """ + Get list of tensors from a Hugging Face model repository. + + Returns a dictionary of tensor names and their metadata. + Each tensor is represented as a tuple of (dtype, shape, offset_start, size, remote_safetensor_url) + """ + # case 1: model has only one single model.safetensor file + is_single_file = cls.check_file_exist( + f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors" + ) + if is_single_file: + url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors" + return cls.get_list_tensors(url) + + # case 2: model has multiple files + index_url = ( + f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors.index.json" + ) + is_multiple_files = cls.check_file_exist(index_url) + if is_multiple_files: + # read the index file + index_data = cls.get_data_by_range(index_url, 0) + index_str = index_data.decode("utf-8") + index_json = json.loads(index_str) + assert ( + index_json.get("weight_map") is not None + ), "weight_map not found in index file" + weight_map = index_json["weight_map"] + # get the list of files + all_files = list(set(weight_map.values())) + all_files.sort() # make sure we load shard files in order + # get the list of tensors + tensors: dict[str, RemoteTensor] = {} + for file in all_files: + url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/{file}" + for key, val in cls.get_list_tensors(url).items(): + tensors[key] = val + return tensors + + raise ValueError(f"Model {model_id} does not have any safetensor files") + + @classmethod + def get_list_tensors(cls, url: str) -> dict[str, RemoteTensor]: + """ + Get list of tensors from a remote safetensor file. + + Returns a dictionary of tensor names and their metadata. 
+ Each tensor is represented as a tuple of (dtype, shape, offset_start, size) + """ + metadata, data_start_offset = cls.get_metadata(url) + res: dict[str, RemoteTensor] = {} + + for name, meta in metadata.items(): + if name == "__metadata__": + continue + if not isinstance(meta, dict): + raise ValueError(f"Invalid metadata for tensor '{name}': {meta}") + try: + dtype = meta["dtype"] + shape = meta["shape"] + offset_start_relative, offset_end_relative = meta["data_offsets"] + size = offset_end_relative - offset_start_relative + offset_start = data_start_offset + offset_start_relative + res[name] = RemoteTensor( + dtype=dtype, + shape=tuple(shape), + offset_start=offset_start, + size=size, + url=url, + ) + except KeyError as e: + raise ValueError( + f"Missing key in metadata for tensor '{name}': {e}, meta = {meta}" + ) + + return res + + @classmethod + def get_metadata(cls, url: str) -> tuple[dict, int]: + """ + Get JSON metadata from a remote safetensor file. + + Returns tuple of (metadata, data_start_offset) + """ + # Request first 5MB of the file (hopefully enough for metadata) + read_size = 5 * 1024 * 1024 + raw_data = cls.get_data_by_range(url, 0, read_size) + + # Parse header + # First 8 bytes contain the metadata length as u64 little-endian + if len(raw_data) < 8: + raise ValueError("Not enough data to read metadata size") + metadata_length = int.from_bytes(raw_data[:8], byteorder="little") + + # Calculate the data start offset + data_start_offset = 8 + metadata_length + alignment = SafetensorRemote.ALIGNMENT + if data_start_offset % alignment != 0: + data_start_offset += alignment - (data_start_offset % alignment) + + # Check if we have enough data to read the metadata + if len(raw_data) < 8 + metadata_length: + raise ValueError( + f"Could not read complete metadata. Need {8 + metadata_length} bytes, got {len(raw_data)}" + ) + + # Extract metadata bytes and parse as JSON + metadata_bytes = raw_data[8 : 8 + metadata_length] + metadata_str = metadata_bytes.decode("utf-8") + try: + metadata = json.loads(metadata_str) + return metadata, data_start_offset + except json.JSONDecodeError as e: + raise ValueError(f"Failed to parse safetensor metadata as JSON: {e}") + + @classmethod + def get_data_by_range(cls, url: str, start: int, size: int = -1) -> bytes: + """ + Get raw byte data from a remote file by range. + If size is not specified, it will read the entire file. + """ + import requests + from urllib.parse import urlparse + + parsed_url = urlparse(url) + if not parsed_url.scheme or not parsed_url.netloc: + raise ValueError(f"Invalid URL: {url}") + + headers = cls._get_request_headers() + if size > -1: + headers["Range"] = f"bytes={start}-{start + size}" + response = requests.get(url, allow_redirects=True, headers=headers) + response.raise_for_status() + + # Get raw byte data + return response.content[:size] + + @classmethod + def check_file_exist(cls, url: str) -> bool: + """ + Check if a file exists at the given URL. + Returns True if the file exists, False otherwise. 
+ """ + import requests + from urllib.parse import urlparse + + parsed_url = urlparse(url) + if not parsed_url.scheme or not parsed_url.netloc: + raise ValueError(f"Invalid URL: {url}") + + try: + headers = cls._get_request_headers() + headers["Range"] = "bytes=0-0" + response = requests.head(url, allow_redirects=True, headers=headers) + # Success (2xx) or redirect (3xx) + return 200 <= response.status_code < 400 + except requests.RequestException: + return False + + @classmethod + def _get_request_headers(cls) -> dict[str, str]: + """Prepare common headers for requests.""" + headers = {"User-Agent": "convert_hf_to_gguf"} + if os.environ.get("HF_TOKEN"): + headers["Authorization"] = f"Bearer {os.environ['HF_TOKEN']}" + return headers