chore: update for new year and improve compliance

- updated copyright year in LICENSE file to 2025
- bundled llama.cpp licensing text in About menu to maintain MIT compliance
- updated the bundled llama.cpp conversion scripts and gguf Python library
- adjusted monitoring intervals from 0.2s to 0.5s
- updated Python requirements to latest compatible versions
- added new HF to GGUF conversion types: `tq1_0` and `tq2_0`

Happy New Year 🎉!
BuildTools 2025-01-08 15:11:47 -08:00
parent ddbf96c8e9
commit 102e3a14fd
15 changed files with 4021 additions and 1102 deletions


@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.

-   Copyright 2024 leafspark
+   Copyright (c) 2024-2025 leafspark

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.


@@ -1,13 +1,13 @@
 PyYAML~=6.0.2
-psutil~=6.1.0
+psutil~=6.1.1
 pynvml~=12.0.0
 PySide6~=6.8.1
-safetensors~=0.4.5
+safetensors~=0.5.0
 numpy<2.0.0
 torch~=2.5.1
 sentencepiece~=0.2.0
-setuptools~=75.5.0
+setuptools~=75.6.0
-huggingface-hub~=0.26.5
+huggingface-hub~=0.27.0
-transformers~=4.47.0
+transformers~=4.47.1
 fastapi~=0.115.6
 uvicorn~=0.34.0


@@ -500,7 +500,7 @@ def __init__(self, args: List[str]) -> None:
         # Timer for updating system info
         self.timer = QTimer()
         self.timer.timeout.connect(self.update_system_info)
-        self.timer.start(200)
+        self.timer.start(500)

         # Backend selection
         backend_layout = QHBoxLayout()
@@ -1023,7 +1023,9 @@ def __init__(self, args: List[str]) -> None:
         hf_to_gguf_layout.addRow(OUTPUT_FILE, hf_outfile_layout)

         self.hf_outtype = QComboBox()
-        self.hf_outtype.addItems(["f32", "f16", "bf16", "q8_0", "auto"])
+        self.hf_outtype.addItems(
+            ["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"]
+        )
         hf_to_gguf_layout.addRow(OUTPUT_TYPE, self.hf_outtype)

         self.hf_vocab_only = QCheckBox(VOCAB_ONLY)
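For context, a rough sketch of how the selected output type could end up on the conversion command line. This is illustrative only: the helper name and script path are hypothetical, and AutoGGUF's real command builder is not shown in this diff.

# Hypothetical helper, for illustration only.
def build_hf_to_gguf_command(model_dir: str, outfile: str, outtype: str) -> list[str]:
    # outtype would come from self.hf_outtype.currentText(), e.g. "q8_0", "tq1_0" or "tq2_0"
    return [
        "python",
        "convert_hf_to_gguf.py",  # assumed script location
        model_dir,
        "--outfile",
        outfile,
        "--outtype",
        outtype,
    ]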


@@ -95,7 +95,7 @@ def __init__(self, parent=None) -> None:
         self.timer = QTimer(self)
         self.timer.timeout.connect(self.update_gpu_info)
-        self.timer.start(200)  # Update every 0.2 seconds
+        self.timer.start(500)  # Update every 0.5 seconds

         self.gpu_data = []
         self.vram_data = []
@@ -192,7 +192,7 @@ def update_graph_data() -> None:
         timer = QTimer(dialog)
         timer.timeout.connect(update_graph_data)
-        timer.start(200)  # Update every 0.2 seconds
+        timer.start(500)  # Update every 0.5 seconds

         dialog.exec()
@@ -227,7 +227,7 @@ def update_graph_data() -> None:
         timer = QTimer(dialog)
         timer.timeout.connect(update_graph_data)
-        timer.start(200)  # Update every 0.2 seconds
+        timer.start(500)  # Update every 0.5 seconds

         tab_widget.addTab(gpu_graph, GPU_USAGE_OVER_TIME)
         tab_widget.addTab(vram_graph, VRAM_USAGE_OVER_TIME)
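The hunks above all make the same change: every monitoring QTimer now fires every 500 ms instead of 200 ms. A minimal, self-contained sketch of the pattern (generic names, not AutoGGUF's actual classes):

import sys

import psutil
from PySide6.QtCore import QTimer
from PySide6.QtWidgets import QApplication, QLabel


class CpuMonitor(QLabel):
    def __init__(self) -> None:
        super().__init__("cpu: -")
        self.timer = QTimer(self)
        self.timer.timeout.connect(self.refresh)
        self.timer.start(500)  # poll every 0.5 s, matching the new interval

    def refresh(self) -> None:
        # interval=None returns usage since the previous call, so polling stays cheap
        self.setText(f"cpu: {psutil.cpu_percent(interval=None):.0f}%")


if __name__ == "__main__":
    app = QApplication(sys.argv)
    w = CpuMonitor()
    w.show()
    sys.exit(app.exec())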

File diff suppressed because it is too large.


@@ -18,14 +18,16 @@
     SupportsIndex,
     cast,
 )

+from transformers import AutoConfig
 import torch

 if TYPE_CHECKING:
     from torch import Tensor

-from gguf.constants import *
+import gguf

+# reuse model definitions from convert_hf_to_gguf.py
 from convert_hf_to_gguf import LazyTorchTensor, Model

 logger = logging.getLogger("lora-to-gguf")
@ -37,9 +39,10 @@ class PartialLoraTensor:
B: Tensor | None = None B: Tensor | None = None
# magic to support tensor shape modifications and splitting
class LoraTorchTensor: class LoraTorchTensor:
_lora_A: Tensor _lora_A: Tensor # (n_rank, row_size)
_lora_B: Tensor _lora_B: Tensor # (col_size, n_rank)
_rank: int _rank: int
def __init__(self, A: Tensor, B: Tensor): def __init__(self, A: Tensor, B: Tensor):
@ -57,14 +60,20 @@ def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
def __getitem__( def __getitem__(
self, self,
indices: SupportsIndex | slice | tuple[SupportsIndex | slice | Tensor, ...], indices: (
SupportsIndex
| slice
| tuple[
SupportsIndex | slice | Tensor, ...
] # TODO: add ellipsis in the type signature
),
) -> LoraTorchTensor: ) -> LoraTorchTensor:
shape = self.shape shape = self.shape
if isinstance(indices, SupportsIndex): if isinstance(indices, SupportsIndex):
if len(shape) > 2: if len(shape) > 2:
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
else: else:
raise NotImplementedError raise NotImplementedError # can't return a vector
elif isinstance(indices, slice): elif isinstance(indices, slice):
if len(shape) > 2: if len(shape) > 2:
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
@ -74,7 +83,7 @@ def __getitem__(
assert len(indices) > 0 assert len(indices) > 0
if indices[-1] is Ellipsis: if indices[-1] is Ellipsis:
return self[indices[:-1]] return self[indices[:-1]]
# expand ellipsis
indices = tuple( indices = tuple(
u u
for v in ( for v in (
@ -94,6 +103,7 @@ def __getitem__(
*(slice(None, None) for _ in range(len(indices), len(shape))), *(slice(None, None) for _ in range(len(indices), len(shape))),
) )
# TODO: make sure this is correct
indices_A = ( indices_A = (
*( *(
( (
@ -109,7 +119,7 @@ def __getitem__(
indices_B = indices[:-1] indices_B = indices[:-1]
return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B]) return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
else: else:
raise NotImplementedError raise NotImplementedError # unknown indice type
@property @property
def dtype(self) -> torch.dtype: def dtype(self) -> torch.dtype:
@ -132,8 +142,9 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
new_shape = cast(tuple[int, ...], shape) new_shape = cast(tuple[int, ...], shape)
orig_shape = self.shape orig_shape = self.shape
if len(new_shape) < 2: if len(new_shape) < 2:
raise NotImplementedError raise NotImplementedError # can't become a vector
# expand -1 in the shape
if any(dim == -1 for dim in new_shape): if any(dim == -1 for dim in new_shape):
n_elems = prod(orig_shape) n_elems = prod(orig_shape)
n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape) n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
@ -143,7 +154,7 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
) )
if new_shape[-1] != orig_shape[-1]: if new_shape[-1] != orig_shape[-1]:
raise NotImplementedError raise NotImplementedError # can't reshape the row size trivially
shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1]) shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
shape_B = (*new_shape[:-1], self._rank) shape_B = (*new_shape[:-1], self._rank)
@ -162,7 +173,7 @@ def permute(self, *dims: int) -> LoraTorchTensor:
shape = self.shape shape = self.shape
dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims) dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
if dims[-1] == -1: if dims[-1] == -1:
# TODO: support higher dimensional A shapes bigger than 1
assert all(dim == 1 for dim in self._lora_A.shape[:-2]) assert all(dim == 1 for dim in self._lora_A.shape[:-2])
return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims)) return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1: if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
@ -170,7 +181,7 @@ def permute(self, *dims: int) -> LoraTorchTensor:
self._lora_B.permute(*dims), self._lora_A.permute(*dims) self._lora_B.permute(*dims), self._lora_A.permute(*dims)
) )
else: else:
# TODO: compose the above two
raise NotImplementedError raise NotImplementedError
def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor: def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
@ -189,7 +200,7 @@ def to(self, *args, **kwargs):
@classmethod @classmethod
def __torch_function__(cls, func: Callable, types, args=(), kwargs=None): def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
del types del types # unused
if kwargs is None: if kwargs is None:
kwargs = {} kwargs = {}
@ -230,28 +241,73 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
base_name = lora_tensor_name.replace("base_model.model.", "") base_name = lora_tensor_name.replace("base_model.model.", "")
base_name = base_name.replace(".lora_A.weight", ".weight") base_name = base_name.replace(".lora_A.weight", ".weight")
base_name = base_name.replace(".lora_B.weight", ".weight") base_name = base_name.replace(".lora_B.weight", ".weight")
# models produced by mergekit-extract-lora have token embeddings in the adapter
base_name = base_name.replace(".lora_embedding_A", ".weight")
base_name = base_name.replace(".lora_embedding_B", ".weight")
return base_name return base_name
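The two new replacements extend get_base_tensor_name() to adapters produced by mergekit-extract-lora, whose token-embedding tensors use .lora_embedding_A/B instead of .lora_A/B.weight. A small self-contained illustration of the mapping (the tensor names are typical PEFT names, shown only as examples):

def _demo_base_name(lora_tensor_name: str) -> str:
    # mirrors get_base_tensor_name() above, for illustration only
    base_name = lora_tensor_name.replace("base_model.model.", "")
    base_name = base_name.replace(".lora_A.weight", ".weight")
    base_name = base_name.replace(".lora_B.weight", ".weight")
    base_name = base_name.replace(".lora_embedding_A", ".weight")
    base_name = base_name.replace(".lora_embedding_B", ".weight")
    return base_name


print(_demo_base_name("base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"))
# -> model.layers.0.self_attn.q_proj.weight
print(_demo_base_name("base_model.model.model.embed_tokens.lora_embedding_A"))
# -> model.embed_tokens.weight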
 def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--outfile", type=Path)
+    parser = argparse.ArgumentParser(
+        description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file"
+    )
+    parser.add_argument(
+        "--outfile",
+        type=Path,
+        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
+    )
     parser.add_argument(
         "--outtype",
         type=str,
         choices=["f32", "f16", "bf16", "q8_0", "auto"],
         default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+    )
+    parser.add_argument(
+        "--bigendian",
+        action="store_true",
+        help="model is executed on big endian machine",
+    )
+    parser.add_argument(
+        "--no-lazy",
+        action="store_true",
+        help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="increase output verbosity",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="only print out what will be done, without writing any new files",
+    )
+    parser.add_argument(
+        "--base",
+        type=Path,
+        help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
+    )
+    parser.add_argument(
+        "--base-model-id",
+        type=str,
+        help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
+    )
+    parser.add_argument(
+        "lora_path",
+        type=Path,
+        help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
     )
-    parser.add_argument("--bigendian", action="store_true")
-    parser.add_argument("--no-lazy", action="store_true")
-    parser.add_argument("--verbose", action="store_true")
-    parser.add_argument("--dry-run", action="store_true")
-    parser.add_argument("--base", type=Path, required=True)
-    parser.add_argument("lora_path", type=Path)

     return parser.parse_args()
def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
# normally, adapter does not come with base model config, we need to load it from AutoConfig
config = AutoConfig.from_pretrained(hf_model_id)
return config.to_dict()
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() args = parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
@ -266,19 +322,20 @@ def parse_args() -> argparse.Namespace:
ftype = ftype_map[args.outtype] ftype = ftype_map[args.outtype]
dir_base_model: Path = args.base dir_base_model: Path | None = args.base
dir_lora: Path = args.lora_path dir_lora: Path = args.lora_path
base_model_id: str | None = args.base_model_id
lora_config = dir_lora / "adapter_config.json" lora_config = dir_lora / "adapter_config.json"
input_model = dir_lora / "adapter_model.safetensors" input_model = dir_lora / "adapter_model.safetensors"
if args.outfile is not None: if args.outfile is not None:
fname_out = args.outfile fname_out = args.outfile
else: else:
# output in the same directory as the model by default
fname_out = dir_lora fname_out = dir_lora
if os.path.exists(input_model): if os.path.exists(input_model):
# lazy import load_file only if lora is in safetensors format.
from safetensors.torch import load_file from safetensors.torch import load_file
lora_model = load_file(input_model, device="cpu") lora_model = load_file(input_model, device="cpu")
@ -286,8 +343,38 @@ def parse_args() -> argparse.Namespace:
input_model = os.path.join(dir_lora, "adapter_model.bin") input_model = os.path.join(dir_lora, "adapter_model.bin")
lora_model = torch.load(input_model, map_location="cpu", weights_only=True) lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
# load LoRA config
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)
# load base model
if base_model_id is not None:
logger.info(f"Loading base model from Hugging Face: {base_model_id}")
hparams = load_hparams_from_hf(base_model_id)
elif dir_base_model is None:
if "base_model_name_or_path" in lparams:
model_id = lparams["base_model_name_or_path"]
logger.info(f"Loading base model from Hugging Face: {model_id}")
try:
hparams = load_hparams_from_hf(model_id)
except OSError as e:
logger.error(f"Failed to load base model config: {e}")
logger.error(
"Please try downloading the base model and add its path to --base"
)
sys.exit(1)
else:
logger.error(
"'base_model_name_or_path' is not found in adapter_config.json"
)
logger.error(
"Base model config is required. Please download the base model and add its path to --base"
)
sys.exit(1)
else:
logger.info(f"Loading base model: {dir_base_model.name}") logger.info(f"Loading base model: {dir_base_model.name}")
hparams = Model.load_hparams(dir_base_model) hparams = Model.load_hparams(dir_base_model)
with torch.inference_mode(): with torch.inference_mode():
try: try:
model_class = Model.from_model_architecture(hparams["architectures"][0]) model_class = Model.from_model_architecture(hparams["architectures"][0])
@ -309,6 +396,9 @@ def __init__(
self.dir_model_card = dir_lora_model self.dir_model_card = dir_lora_model
self.lora_alpha = float(lora_alpha) self.lora_alpha = float(lora_alpha)
def set_vocab(self):
pass
def set_type(self): def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER) self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
@ -317,7 +407,10 @@ def set_gguf_parameters(self):
self.gguf_writer.add_float32( self.gguf_writer.add_float32(
gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha
) )
super().set_gguf_parameters()
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters
return ()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]: def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
tensor_map: dict[str, PartialLoraTensor] = {} tensor_map: dict[str, PartialLoraTensor] = {}
@ -326,14 +419,26 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
if self.lazy: if self.lazy:
tensor = LazyTorchTensor.from_eager(tensor) tensor = LazyTorchTensor.from_eager(tensor)
base_name = get_base_tensor_name(name) base_name = get_base_tensor_name(name)
is_lora_a = ".lora_A.weight" in name # note: mergekit-extract-lora also adds token embeddings to the adapter
is_lora_b = ".lora_B.weight" in name is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
if not is_lora_a and not is_lora_b: if not is_lora_a and not is_lora_b:
if ".base_layer.weight" in name: if ".base_layer.weight" in name:
continue continue
# mergekit-extract-lora add these layernorm to the adapter, we need to keep them
if "_layernorm" in name or ".norm" in name:
yield (base_name, tensor)
continue
logger.error( logger.error(
f"Unexpected name '{name}': Not a lora_A or lora_B tensor" f"Unexpected name '{name}': Not a lora_A or lora_B tensor"
) )
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
logger.error(
"Embeddings is present in the adapter. This can be due to new tokens added during fine tuning"
)
logger.error(
"Please refer to https://github.com/ggerganov/llama.cpp/pull/9948"
)
sys.exit(1) sys.exit(1)
if base_name in tensor_map: if base_name in tensor_map:
@ -358,17 +463,34 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
def modify_tensors( def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None self, data_torch: Tensor, name: str, bid: int | None
) -> Iterable[tuple[str, Tensor]]: ) -> Iterable[tuple[str, Tensor]]:
dest = super().modify_tensors(data_torch, name, bid) dest = list(super().modify_tensors(data_torch, name, bid))
# some archs may have the same tensor for lm_head and output (tie word embeddings)
# in this case, adapters targeting lm_head will fail when using llama-export-lora
# therefore, we ignore them for now
# see: https://github.com/ggerganov/llama.cpp/issues/9065
if name == "lm_head.weight" and len(dest) == 0:
raise ValueError(
"lm_head is present in adapter, but is ignored in base model"
)
for dest_name, dest_data in dest: for dest_name, dest_data in dest:
# mergekit-extract-lora add these layernorm to the adapter
if "_norm" in dest_name:
assert dest_data.dim() == 1
yield (dest_name, dest_data)
continue
# otherwise, we must get the lora_A and lora_B tensors
assert isinstance(dest_data, LoraTorchTensor) assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B() lora_a, lora_b = dest_data.get_lora_A_B()
# note: mergekit-extract-lora flip and transpose A and B
# here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
if "token_embd.weight" in dest_name:
lora_a = lora_a.T
yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b) yield (dest_name + ".lora_b", lora_b)
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)
alpha: float = lparams["lora_alpha"] alpha: float = lparams["lora_alpha"]
model_instance = LoraModel( model_instance = LoraModel(
@ -381,7 +503,7 @@ def modify_tensors(
dry_run=args.dry_run, dry_run=args.dry_run,
dir_lora_model=dir_lora, dir_lora_model=dir_lora,
lora_alpha=alpha, lora_alpha=alpha,
is_lora=True, hparams=hparams,
) )
logger.info("Exporting model...") logger.info("Exporting model...")
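For reference, a minimal NumPy sketch (not part of the converter) of how an exported lora_a/lora_b pair and adapter.lora.alpha are meant to be combined with a base weight, assuming the usual LoRA convention W' = W + (alpha / rank) * (B @ A); the shapes follow the (n_rank, row_size) and (col_size, n_rank) comments above.

import numpy as np

rank, n_in, n_out = 8, 64, 64
rng = np.random.default_rng(0)

W = rng.standard_normal((n_out, n_in), dtype=np.float32)       # base weight
lora_a = rng.standard_normal((rank, n_in), dtype=np.float32)   # (n_rank, row_size)
lora_b = rng.standard_normal((n_out, rank), dtype=np.float32)  # (col_size, n_rank)
alpha = 16.0

W_adapted = W + (alpha / rank) * (lora_b @ lora_a)
print(W_adapted.shape)  # (64, 64)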


@@ -3,10 +3,18 @@
 from enum import Enum, IntEnum, auto
 from typing import Any

+#
+# constants
+#
+
-GGUF_MAGIC = 0x46554747
+GGUF_MAGIC = 0x46554747  # "GGUF"
 GGUF_VERSION = 3
 GGUF_DEFAULT_ALIGNMENT = 32
-GGML_QUANT_VERSION = 2
+GGML_QUANT_VERSION = 2  # GGML_QNT_VERSION from ggml.h
+
+#
+# metadata keys
+#

 class Keys:
@ -17,6 +25,7 @@ class General:
ALIGNMENT = "general.alignment" ALIGNMENT = "general.alignment"
FILE_TYPE = "general.file_type" FILE_TYPE = "general.file_type"
# Authorship Metadata
NAME = "general.name" NAME = "general.name"
AUTHOR = "general.author" AUTHOR = "general.author"
VERSION = "general.version" VERSION = "general.version"
@ -30,38 +39,62 @@ class General:
SIZE_LABEL = "general.size_label" SIZE_LABEL = "general.size_label"
# Licensing details
LICENSE = "general.license" LICENSE = "general.license"
LICENSE_NAME = "general.license.name" LICENSE_NAME = "general.license.name"
LICENSE_LINK = "general.license.link" LICENSE_LINK = "general.license.link"
URL = "general.url" # Typically represents the converted GGUF repo (Unless native)
URL = "general.url" # Model Website/Paper
DOI = "general.doi" DOI = "general.doi"
UUID = "general.uuid" UUID = "general.uuid"
REPO_URL = "general.repo_url" REPO_URL = "general.repo_url" # Model Source Repository (git/svn/etc...)
SOURCE_URL = "general.source.url" # Model Source during conversion
SOURCE_URL = "general.source.url" # Model Website/Paper
SOURCE_DOI = "general.source.doi" SOURCE_DOI = "general.source.doi"
SOURCE_UUID = "general.source.uuid" SOURCE_UUID = "general.source.uuid"
SOURCE_REPO_URL = "general.source.repo_url" SOURCE_REPO_URL = (
"general.source.repo_url" # Model Source Repository (git/svn/etc...)
)
# Base Model Source. There can be more than one source if it's a merged
# model like with 'Mistral-7B-Merge-14-v0.1'. This will assist in
# tracing linage of models as it is finetuned or merged over time.
BASE_MODEL_COUNT = "general.base_model.count" BASE_MODEL_COUNT = "general.base_model.count"
BASE_MODEL_NAME = "general.base_model.{id}.name" BASE_MODEL_NAME = "general.base_model.{id}.name"
BASE_MODEL_AUTHOR = "general.base_model.{id}.author" BASE_MODEL_AUTHOR = "general.base_model.{id}.author"
BASE_MODEL_VERSION = "general.base_model.{id}.version" BASE_MODEL_VERSION = "general.base_model.{id}.version"
BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization" BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization"
BASE_MODEL_URL = "general.base_model.{id}.url" BASE_MODEL_DESCRIPTION = "general.base_model.{id}.description"
BASE_MODEL_URL = "general.base_model.{id}.url" # Model Website/Paper
BASE_MODEL_DOI = "general.base_model.{id}.doi" BASE_MODEL_DOI = "general.base_model.{id}.doi"
BASE_MODEL_UUID = "general.base_model.{id}.uuid" BASE_MODEL_UUID = "general.base_model.{id}.uuid"
BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" # Model Source Repository (git/svn/etc...)
# Dataset Source
DATASET_COUNT = "general.dataset.count"
DATASET_NAME = "general.dataset.{id}.name"
DATASET_AUTHOR = "general.dataset.{id}.author"
DATASET_VERSION = "general.dataset.{id}.version"
DATASET_ORGANIZATION = "general.dataset.{id}.organization"
DATASET_DESCRIPTION = "general.dataset.{id}.description"
DATASET_URL = "general.dataset.{id}.url" # Model Website/Paper
DATASET_DOI = "general.dataset.{id}.doi"
DATASET_UUID = "general.dataset.{id}.uuid"
DATASET_REPO_URL = (
"general.dataset.{id}.repo_url" # Model Source Repository (git/svn/etc...)
)
# Array based KV stores
TAGS = "general.tags" TAGS = "general.tags"
LANGUAGES = "general.languages" LANGUAGES = "general.languages"
DATASETS = "general.datasets"
class LLM: class LLM:
VOCAB_SIZE = "{arch}.vocab_size" VOCAB_SIZE = "{arch}.vocab_size"
CONTEXT_LENGTH = "{arch}.context_length" CONTEXT_LENGTH = "{arch}.context_length"
EMBEDDING_LENGTH = "{arch}.embedding_length" EMBEDDING_LENGTH = "{arch}.embedding_length"
FEATURES_LENGTH = "{arch}.features_length"
BLOCK_COUNT = "{arch}.block_count" BLOCK_COUNT = "{arch}.block_count"
LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count"
FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
@ -73,11 +106,14 @@ class LLM:
EXPERT_USED_COUNT = "{arch}.expert_used_count" EXPERT_USED_COUNT = "{arch}.expert_used_count"
EXPERT_SHARED_COUNT = "{arch}.expert_shared_count" EXPERT_SHARED_COUNT = "{arch}.expert_shared_count"
EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale" EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm"
EXPERT_GATING_FUNC = "{arch}.expert_gating_func"
POOLING_TYPE = "{arch}.pooling_type" POOLING_TYPE = "{arch}.pooling_type"
LOGIT_SCALE = "{arch}.logit_scale" LOGIT_SCALE = "{arch}.logit_scale"
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
SWIN_NORM = "{arch}.swin_norm"
RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers" RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers"
TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim" TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim"
TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
@ -93,6 +129,8 @@ class Attention:
VALUE_LENGTH = "{arch}.attention.value_length" VALUE_LENGTH = "{arch}.attention.value_length"
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
GROUPNORM_EPS = "{arch}.attention.group_norm_epsilon"
GROUPNORM_GROUPS = "{arch}.attention.group_norm_groups"
CAUSAL = "{arch}.attention.causal" CAUSAL = "{arch}.attention.causal"
Q_LORA_RANK = "{arch}.attention.q_lora_rank" Q_LORA_RANK = "{arch}.attention.q_lora_rank"
KV_LORA_RANK = "{arch}.attention.kv_lora_rank" KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
@ -102,6 +140,7 @@ class Attention:
class Rope: class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count" DIMENSION_COUNT = "{arch}.rope.dimension_count"
DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
FREQ_BASE = "{arch}.rope.freq_base" FREQ_BASE = "{arch}.rope.freq_base"
SCALING_TYPE = "{arch}.rope.scaling.type" SCALING_TYPE = "{arch}.rope.scaling.type"
SCALING_FACTOR = "{arch}.rope.scaling.factor" SCALING_FACTOR = "{arch}.rope.scaling.factor"
@ -125,16 +164,28 @@ class SSM:
class WKV: class WKV:
HEAD_SIZE = "{arch}.wkv.head_size" HEAD_SIZE = "{arch}.wkv.head_size"
class PosNet:
EMBEDDING_LENGTH = "{arch}.posnet.embedding_length"
BLOCK_COUNT = "{arch}.posnet.block_count"
class ConvNext:
EMBEDDING_LENGTH = "{arch}.convnext.embedding_length"
BLOCK_COUNT = "{arch}.convnext.block_count"
class Tokenizer: class Tokenizer:
MODEL = "tokenizer.ggml.model" MODEL = "tokenizer.ggml.model"
PRE = "tokenizer.ggml.pre" PRE = "tokenizer.ggml.pre"
LIST = "tokenizer.ggml.tokens" LIST = "tokenizer.ggml.tokens"
TOKEN_TYPE = "tokenizer.ggml.token_type" TOKEN_TYPE = "tokenizer.ggml.token_type"
TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" TOKEN_TYPE_COUNT = (
"tokenizer.ggml.token_type_count" # for BERT-style token types
)
SCORES = "tokenizer.ggml.scores" SCORES = "tokenizer.ggml.scores"
MERGES = "tokenizer.ggml.merges" MERGES = "tokenizer.ggml.merges"
BOS_ID = "tokenizer.ggml.bos_token_id" BOS_ID = "tokenizer.ggml.bos_token_id"
EOS_ID = "tokenizer.ggml.eos_token_id" EOS_ID = "tokenizer.ggml.eos_token_id"
EOT_ID = "tokenizer.ggml.eot_token_id"
EOM_ID = "tokenizer.ggml.eom_token_id"
UNK_ID = "tokenizer.ggml.unknown_token_id" UNK_ID = "tokenizer.ggml.unknown_token_id"
SEP_ID = "tokenizer.ggml.seperator_token_id" SEP_ID = "tokenizer.ggml.seperator_token_id"
PAD_ID = "tokenizer.ggml.padding_token_id" PAD_ID = "tokenizer.ggml.padding_token_id"
@ -150,18 +201,28 @@ class Tokenizer:
CHAT_TEMPLATE = "tokenizer.chat_template" CHAT_TEMPLATE = "tokenizer.chat_template"
CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
CHAT_TEMPLATES = "tokenizer.chat_templates" CHAT_TEMPLATES = "tokenizer.chat_templates"
# FIM/Infill special tokens constants
FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"
FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id"
FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id"
FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id"
FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id"
# deprecated:
PREFIX_ID = "tokenizer.ggml.prefix_token_id" PREFIX_ID = "tokenizer.ggml.prefix_token_id"
SUFFIX_ID = "tokenizer.ggml.suffix_token_id" SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
MIDDLE_ID = "tokenizer.ggml.middle_token_id" MIDDLE_ID = "tokenizer.ggml.middle_token_id"
EOT_ID = "tokenizer.ggml.eot_token_id"
EOM_ID = "tokenizer.ggml.eom_token_id"
class Adapter: class Adapter:
TYPE = "adapter.type" TYPE = "adapter.type"
LORA_ALPHA = "adapter.lora.alpha" LORA_ALPHA = "adapter.lora.alpha"
#
# recommended mapping of model tensor names for storage in gguf
#
class GGUFType: class GGUFType:
MODEL = "model" MODEL = "model"
ADAPTER = "adapter" ADAPTER = "adapter"
@ -169,6 +230,7 @@ class GGUFType:
class MODEL_ARCH(IntEnum): class MODEL_ARCH(IntEnum):
LLAMA = auto() LLAMA = auto()
DECI = auto()
FALCON = auto() FALCON = auto()
BAICHUAN = auto() BAICHUAN = auto()
GROK = auto() GROK = auto()
@ -186,6 +248,7 @@ class MODEL_ARCH(IntEnum):
QWEN = auto() QWEN = auto()
QWEN2 = auto() QWEN2 = auto()
QWEN2MOE = auto() QWEN2MOE = auto()
QWEN2VL = auto()
PHI2 = auto() PHI2 = auto()
PHI3 = auto() PHI3 = auto()
PLAMO = auto() PLAMO = auto()
@ -199,14 +262,16 @@ class MODEL_ARCH(IntEnum):
STARCODER2 = auto() STARCODER2 = auto()
RWKV6 = auto() RWKV6 = auto()
MAMBA = auto() MAMBA = auto()
JAMBA = auto()
XVERSE = auto() XVERSE = auto()
COMMAND_R = auto() COMMAND_R = auto()
COHERE2 = auto()
DBRX = auto() DBRX = auto()
OLMO = auto() OLMO = auto()
OLMO2 = auto()
OLMOE = auto() OLMOE = auto()
OPENELM = auto() OPENELM = auto()
ARCTIC = auto() ARCTIC = auto()
DEEPSEEK = auto()
DEEPSEEK2 = auto() DEEPSEEK2 = auto()
CHATGLM = auto() CHATGLM = auto()
BITNET = auto() BITNET = auto()
@ -216,6 +281,9 @@ class MODEL_ARCH(IntEnum):
NEMOTRON = auto() NEMOTRON = auto()
EXAONE = auto() EXAONE = auto()
GRANITE = auto() GRANITE = auto()
GRANITE_MOE = auto()
CHAMELEON = auto()
WAVTOKENIZER_DEC = auto()
class MODEL_TENSOR(IntEnum): class MODEL_TENSOR(IntEnum):
@ -254,6 +322,7 @@ class MODEL_TENSOR(IntEnum):
FFN_GATE_SHEXP = auto() FFN_GATE_SHEXP = auto()
FFN_DOWN_SHEXP = auto() FFN_DOWN_SHEXP = auto()
FFN_UP_SHEXP = auto() FFN_UP_SHEXP = auto()
FFN_EXP_PROBS_B = auto()
ATTN_Q_NORM = auto() ATTN_Q_NORM = auto()
ATTN_K_NORM = auto() ATTN_K_NORM = auto()
LAYER_OUT_NORM = auto() LAYER_OUT_NORM = auto()
@ -261,10 +330,7 @@ class MODEL_TENSOR(IntEnum):
SSM_CONV1D = auto() SSM_CONV1D = auto()
SSM_X = auto() SSM_X = auto()
SSM_DT = auto() SSM_DT = auto()
SSM_DT_NORM = auto()
SSM_A = auto() SSM_A = auto()
SSM_B_NORM = auto()
SSM_C_NORM = auto()
SSM_D = auto() SSM_D = auto()
SSM_OUT = auto() SSM_OUT = auto()
TIME_MIX_W1 = auto() TIME_MIX_W1 = auto()
@ -326,10 +392,29 @@ class MODEL_TENSOR(IntEnum):
ENC_FFN_DOWN = auto() ENC_FFN_DOWN = auto()
ENC_FFN_UP = auto() ENC_FFN_UP = auto()
ENC_OUTPUT_NORM = auto() ENC_OUTPUT_NORM = auto()
CLS = auto() # classifier
CLS_OUT = auto() # classifier output projection
CONV1D = auto()
CONVNEXT_DW = auto()
CONVNEXT_NORM = auto()
CONVNEXT_PW1 = auto()
CONVNEXT_PW2 = auto()
CONVNEXT_GAMMA = auto()
POSNET_CONV1 = auto()
POSNET_CONV2 = auto()
POSNET_NORM = auto()
POSNET_NORM1 = auto()
POSNET_NORM2 = auto()
POSNET_ATTN_NORM = auto()
POSNET_ATTN_Q = auto()
POSNET_ATTN_K = auto()
POSNET_ATTN_V = auto()
POSNET_ATTN_OUT = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.DECI: "deci",
MODEL_ARCH.FALCON: "falcon", MODEL_ARCH.FALCON: "falcon",
MODEL_ARCH.BAICHUAN: "baichuan", MODEL_ARCH.BAICHUAN: "baichuan",
MODEL_ARCH.GROK: "grok", MODEL_ARCH.GROK: "grok",
@ -347,6 +432,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.QWEN: "qwen", MODEL_ARCH.QWEN: "qwen",
MODEL_ARCH.QWEN2: "qwen2", MODEL_ARCH.QWEN2: "qwen2",
MODEL_ARCH.QWEN2MOE: "qwen2moe", MODEL_ARCH.QWEN2MOE: "qwen2moe",
MODEL_ARCH.QWEN2VL: "qwen2vl",
MODEL_ARCH.PHI2: "phi2", MODEL_ARCH.PHI2: "phi2",
MODEL_ARCH.PHI3: "phi3", MODEL_ARCH.PHI3: "phi3",
MODEL_ARCH.PLAMO: "plamo", MODEL_ARCH.PLAMO: "plamo",
@ -360,14 +446,16 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.STARCODER2: "starcoder2",
MODEL_ARCH.RWKV6: "rwkv6", MODEL_ARCH.RWKV6: "rwkv6",
MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.JAMBA: "jamba",
MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.COHERE2: "cohere2",
MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.DBRX: "dbrx",
MODEL_ARCH.OLMO: "olmo", MODEL_ARCH.OLMO: "olmo",
MODEL_ARCH.OLMO2: "olmo2",
MODEL_ARCH.OLMOE: "olmoe", MODEL_ARCH.OLMOE: "olmoe",
MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.OPENELM: "openelm",
MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.ARCTIC: "arctic",
MODEL_ARCH.DEEPSEEK: "deepseek",
MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.BITNET: "bitnet",
@ -377,6 +465,9 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.NEMOTRON: "nemotron", MODEL_ARCH.NEMOTRON: "nemotron",
MODEL_ARCH.EXAONE: "exaone", MODEL_ARCH.EXAONE: "exaone",
MODEL_ARCH.GRANITE: "granite", MODEL_ARCH.GRANITE: "granite",
MODEL_ARCH.GRANITE_MOE: "granitemoe",
MODEL_ARCH.CHAMELEON: "chameleon",
MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
} }
TENSOR_NAMES: dict[MODEL_TENSOR, str] = { TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -417,15 +508,13 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
MODEL_TENSOR.SSM_DT_NORM: "blk.{bid}.ssm_dt_norm",
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
MODEL_TENSOR.SSM_B_NORM: "blk.{bid}.ssm_b_norm",
MODEL_TENSOR.SSM_C_NORM: "blk.{bid}.ssm_c_norm",
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
@ -487,6 +576,24 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down", MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up", MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
MODEL_TENSOR.CLS: "cls",
MODEL_TENSOR.CLS_OUT: "cls.output",
MODEL_TENSOR.CONV1D: "conv1d",
MODEL_TENSOR.CONVNEXT_DW: "convnext.{bid}.dw",
MODEL_TENSOR.CONVNEXT_NORM: "convnext.{bid}.norm",
MODEL_TENSOR.CONVNEXT_PW1: "convnext.{bid}.pw1",
MODEL_TENSOR.CONVNEXT_PW2: "convnext.{bid}.pw2",
MODEL_TENSOR.CONVNEXT_GAMMA: "convnext.{bid}.gamma",
MODEL_TENSOR.POSNET_CONV1: "posnet.{bid}.conv1",
MODEL_TENSOR.POSNET_CONV2: "posnet.{bid}.conv2",
MODEL_TENSOR.POSNET_NORM: "posnet.{bid}.norm",
MODEL_TENSOR.POSNET_NORM1: "posnet.{bid}.norm1",
MODEL_TENSOR.POSNET_NORM2: "posnet.{bid}.norm2",
MODEL_TENSOR.POSNET_ATTN_NORM: "posnet.{bid}.attn_norm",
MODEL_TENSOR.POSNET_ATTN_Q: "posnet.{bid}.attn_q",
MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k",
MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v",
MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
} }
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@ -510,6 +617,26 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP, MODEL_TENSOR.FFN_UP_EXP,
], ],
MODEL_ARCH.DECI: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.GROK: [ MODEL_ARCH.GROK: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
@ -596,6 +723,8 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.LAYER_OUT_NORM, MODEL_TENSOR.LAYER_OUT_NORM,
MODEL_TENSOR.CLS,
MODEL_TENSOR.CLS_OUT,
], ],
MODEL_ARCH.NOMIC_BERT: [ MODEL_ARCH.NOMIC_BERT: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
@ -627,6 +756,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.LAYER_OUT_NORM, MODEL_TENSOR.LAYER_OUT_NORM,
MODEL_TENSOR.CLS,
], ],
MODEL_ARCH.MPT: [ MODEL_ARCH.MPT: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
@ -713,6 +843,21 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.QWEN2: [ MODEL_ARCH.QWEN2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.QWEN2VL: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT,
@ -790,6 +935,8 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FACTORS_LONG,
MODEL_TENSOR.ROPE_FACTORS_SHORT,
MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_Q,
@ -849,6 +996,8 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ROPE_FACTORS_LONG,
MODEL_TENSOR.ROPE_FACTORS_SHORT,
MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_K,
@ -868,6 +1017,8 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FACTORS_LONG,
MODEL_TENSOR.ROPE_FACTORS_SHORT,
MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q_A, MODEL_TENSOR.ATTN_Q_A,
MODEL_TENSOR.ATTN_Q_B, MODEL_TENSOR.ATTN_Q_B,
@ -968,34 +1119,6 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.SSM_D, MODEL_TENSOR.SSM_D,
MODEL_TENSOR.SSM_OUT, MODEL_TENSOR.SSM_OUT,
], ],
MODEL_ARCH.JAMBA: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.SSM_IN,
MODEL_TENSOR.SSM_CONV1D,
MODEL_TENSOR.SSM_X,
MODEL_TENSOR.SSM_DT,
MODEL_TENSOR.SSM_DT_NORM,
MODEL_TENSOR.SSM_A,
MODEL_TENSOR.SSM_B_NORM,
MODEL_TENSOR.SSM_C_NORM,
MODEL_TENSOR.SSM_D,
MODEL_TENSOR.SSM_OUT,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.XVERSE: [ MODEL_ARCH.XVERSE: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
@ -1026,6 +1149,18 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.ATTN_K_NORM, MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_Q_NORM, MODEL_TENSOR.ATTN_Q_NORM,
], ],
MODEL_ARCH.COHERE2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.DBRX: [ MODEL_ARCH.DBRX: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
@ -1050,6 +1185,22 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.OLMO2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.FFN_POST_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.OLMOE: [ MODEL_ARCH.OLMOE: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
@ -1101,6 +1252,29 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP, MODEL_TENSOR.FFN_UP_EXP,
], ],
MODEL_ARCH.DEEPSEEK: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
],
MODEL_ARCH.DEEPSEEK2: [ MODEL_ARCH.DEEPSEEK2: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
@ -1127,6 +1301,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_GATE_SHEXP, MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
], ],
MODEL_ARCH.CHATGLM: [ MODEL_ARCH.CHATGLM: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
@ -1248,6 +1423,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.GRANITE: [ MODEL_ARCH.GRANITE: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_K,
@ -1258,13 +1434,72 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.GRANITE_MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.CHAMELEON: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.WAVTOKENIZER_DEC: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.CONV1D,
MODEL_TENSOR.CONVNEXT_DW,
MODEL_TENSOR.CONVNEXT_NORM,
MODEL_TENSOR.CONVNEXT_PW1,
MODEL_TENSOR.CONVNEXT_PW2,
MODEL_TENSOR.CONVNEXT_GAMMA,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.POSNET_CONV1,
MODEL_TENSOR.POSNET_CONV2,
MODEL_TENSOR.POSNET_NORM,
MODEL_TENSOR.POSNET_NORM1,
MODEL_TENSOR.POSNET_NORM2,
MODEL_TENSOR.POSNET_ATTN_NORM,
MODEL_TENSOR.POSNET_ATTN_Q,
MODEL_TENSOR.POSNET_ATTN_K,
MODEL_TENSOR.POSNET_ATTN_V,
MODEL_TENSOR.POSNET_ATTN_OUT,
],
# TODO
} }
# tensors that will not be serialized
MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_ARCH.LLAMA: [ MODEL_ARCH.LLAMA: [
MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD, MODEL_TENSOR.ATTN_ROT_EMBD,
], ],
MODEL_ARCH.DECI: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
MODEL_ARCH.BAICHUAN: [ MODEL_ARCH.BAICHUAN: [
MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD, MODEL_TENSOR.ATTN_ROT_EMBD,
@ -1289,6 +1524,10 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD, MODEL_TENSOR.ATTN_ROT_EMBD,
], ],
MODEL_ARCH.DEEPSEEK: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
MODEL_ARCH.DEEPSEEK2: [ MODEL_ARCH.DEEPSEEK2: [
MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD, MODEL_TENSOR.ATTN_ROT_EMBD,
@ -1302,6 +1541,10 @@ class MODEL_TENSOR(IntEnum):
], ],
} }
#
# types
#
class TokenType(IntEnum): class TokenType(IntEnum):
NORMAL = 1 NORMAL = 1
@ -1316,6 +1559,7 @@ class RopeScalingType(Enum):
NONE = "none" NONE = "none"
LINEAR = "linear" LINEAR = "linear"
YARN = "yarn" YARN = "yarn"
LONGROPE = "longrope"
class PoolingType(IntEnum): class PoolingType(IntEnum):
@@ -1354,52 +1598,61 @@ class GGMLQuantizationType(IntEnum):
     F64 = 28
     IQ1_M = 29
     BF16 = 30
-    Q4_0_4_4 = 31
-    Q4_0_4_8 = 32
-    Q4_0_8_8 = 33
     TQ1_0 = 34
     TQ2_0 = 35


+class ExpertGatingFuncType(IntEnum):
+    SOFTMAX = 1
+    SIGMOID = 2
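ExpertGatingFuncType records which scoring function the MoE router uses (stored under the new expert_gating_func key above). A small NumPy illustration of the difference between the two, not taken from llama.cpp itself:

import numpy as np

logits = np.array([1.2, -0.3, 0.8, 0.1], dtype=np.float32)  # one token, 4 experts

# SOFTMAX gating: scores are normalized jointly across all experts
softmax_scores = np.exp(logits - logits.max())
softmax_scores /= softmax_scores.sum()

# SIGMOID gating: each expert is scored independently (the runtime typically
# renormalizes the selected top-k weights afterwards)
sigmoid_scores = 1.0 / (1.0 + np.exp(-logits))

print(softmax_scores.round(3), sigmoid_scores.round(3))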
+# TODO: add GGMLFileType from ggml_ftype in ggml.h


+# from llama_ftype in llama.h
+# ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE.
 class LlamaFileType(IntEnum):
     ALL_F32 = 0
-    MOSTLY_F16 = 1
-    MOSTLY_Q4_0 = 2
-    MOSTLY_Q4_1 = 3
-    MOSTLY_Q8_0 = 7
-    MOSTLY_Q5_0 = 8
-    MOSTLY_Q5_1 = 9
-    MOSTLY_Q2_K = 10
-    MOSTLY_Q3_K_S = 11
-    MOSTLY_Q3_K_M = 12
-    MOSTLY_Q3_K_L = 13
-    MOSTLY_Q4_K_S = 14
-    MOSTLY_Q4_K_M = 15
-    MOSTLY_Q5_K_S = 16
-    MOSTLY_Q5_K_M = 17
-    MOSTLY_Q6_K = 18
-    MOSTLY_IQ2_XXS = 19
-    MOSTLY_IQ2_XS = 20
-    MOSTLY_Q2_K_S = 21
-    MOSTLY_IQ3_XS = 22
-    MOSTLY_IQ3_XXS = 23
-    MOSTLY_IQ1_S = 24
-    MOSTLY_IQ4_NL = 25
-    MOSTLY_IQ3_S = 26
-    MOSTLY_IQ3_M = 27
-    MOSTLY_IQ2_S = 28
-    MOSTLY_IQ2_M = 29
-    MOSTLY_IQ4_XS = 30
-    MOSTLY_IQ1_M = 31
-    MOSTLY_BF16 = 32
-    MOSTLY_Q4_0_4_4 = 33
-    MOSTLY_Q4_0_4_8 = 34
-    MOSTLY_Q4_0_8_8 = 35
-    MOSTLY_TQ1_0 = 36
-    MOSTLY_TQ2_0 = 37
-    GUESSED = 1024
+    MOSTLY_F16 = 1  # except 1d tensors
+    MOSTLY_Q4_0 = 2  # except 1d tensors
+    MOSTLY_Q4_1 = 3  # except 1d tensors
+    # MOSTLY_Q4_1_SOME_F16 = 4  # tok_embeddings.weight and output.weight are F16
+    # MOSTLY_Q4_2 = 5  # support has been removed
+    # MOSTLY_Q4_3 = 6  # support has been removed
+    MOSTLY_Q8_0 = 7  # except 1d tensors
+    MOSTLY_Q5_0 = 8  # except 1d tensors
+    MOSTLY_Q5_1 = 9  # except 1d tensors
+    MOSTLY_Q2_K = 10  # except 1d tensors
+    MOSTLY_Q3_K_S = 11  # except 1d tensors
+    MOSTLY_Q3_K_M = 12  # except 1d tensors
+    MOSTLY_Q3_K_L = 13  # except 1d tensors
+    MOSTLY_Q4_K_S = 14  # except 1d tensors
+    MOSTLY_Q4_K_M = 15  # except 1d tensors
+    MOSTLY_Q5_K_S = 16  # except 1d tensors
+    MOSTLY_Q5_K_M = 17  # except 1d tensors
+    MOSTLY_Q6_K = 18  # except 1d tensors
+    MOSTLY_IQ2_XXS = 19  # except 1d tensors
+    MOSTLY_IQ2_XS = 20  # except 1d tensors
+    MOSTLY_Q2_K_S = 21  # except 1d tensors
+    MOSTLY_IQ3_XS = 22  # except 1d tensors
+    MOSTLY_IQ3_XXS = 23  # except 1d tensors
+    MOSTLY_IQ1_S = 24  # except 1d tensors
+    MOSTLY_IQ4_NL = 25  # except 1d tensors
+    MOSTLY_IQ3_S = 26  # except 1d tensors
+    MOSTLY_IQ3_M = 27  # except 1d tensors
+    MOSTLY_IQ2_S = 28  # except 1d tensors
+    MOSTLY_IQ2_M = 29  # except 1d tensors
+    MOSTLY_IQ4_XS = 30  # except 1d tensors
+    MOSTLY_IQ1_M = 31  # except 1d tensors
+    MOSTLY_BF16 = 32  # except 1d tensors
+    # MOSTLY_Q4_0_4_4 = 33  # removed from gguf files, use Q4_0 and runtime repack
+    # MOSTLY_Q4_0_4_8 = 34  # removed from gguf files, use Q4_0 and runtime repack
+    # MOSTLY_Q4_0_8_8 = 35  # removed from gguf files, use Q4_0 and runtime repack
+    MOSTLY_TQ1_0 = 36  # except 1d tensors
+    MOSTLY_TQ2_0 = 37  # except 1d tensors
+    GUESSED = 1024  # not specified in the model file


 class GGUFEndian(IntEnum):
@@ -1434,11 +1687,12 @@ def get_type(val: Any) -> GGUFValueType:
             return GGUFValueType.BOOL
         elif isinstance(val, int):
             return GGUFValueType.INT32
+        # TODO: need help with 64-bit types in Python
         else:
             raise ValueError(f"Unknown type: {type(val)}")


+# Items here are (block size, type size)
 QK_K = 256
 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.F32: (1, 4),
@@ -1470,13 +1724,14 @@ def get_type(val: Any) -> GGUFValueType:
     GGMLQuantizationType.F64: (1, 8),
     GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
     GGMLQuantizationType.BF16: (1, 2),
-    GGMLQuantizationType.Q4_0_4_4: (32, 2 + 16),
-    GGMLQuantizationType.Q4_0_4_8: (32, 2 + 16),
-    GGMLQuantizationType.Q4_0_8_8: (32, 2 + 16),
     GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
     GGMLQuantizationType.TQ2_0: (256, 2 + 64),
 }
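Each entry in GGML_QUANT_SIZES is a (block size, type size) pair: a block of block_size weights is stored in type_size bytes. A quick sketch, using values from the table above, of how that translates into tensor sizes (and why the new tq1_0/tq2_0 output types are so small):

# (block size, type size) pairs taken from the table above
demo_sizes = {
    "F32": (1, 4),
    "TQ1_0": (256, 2 + 4 * 13),
    "TQ2_0": (256, 2 + 64),
}


def quant_nbytes(n_elements: int, quant: str) -> int:
    # each block of block_size weights is stored in type_size bytes
    block_size, type_size = demo_sizes[quant]
    assert n_elements % block_size == 0
    return (n_elements // block_size) * type_size


n = 4096 * 4096  # one 4096 x 4096 weight matrix
for q in demo_sizes:
    print(q, quant_nbytes(n, q), "bytes")
# F32   67108864 bytes (32 bits per weight)
# TQ1_0  3538944 bytes (~1.69 bits per weight)
# TQ2_0  4325376 bytes (~2.06 bits per weight)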
# Aliases for backward compatibility.
# general
KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE
KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION
KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT
@ -1488,6 +1743,7 @@ def get_type(val: Any) -> GGUFValueType:
KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL
KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE
# LLM
KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE
KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH
KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH
@ -1496,6 +1752,7 @@ def get_type(val: Any) -> GGUFValueType:
KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT
# attention
KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT
KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV
KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS
@ -1503,6 +1760,7 @@ def get_type(val: Any) -> GGUFValueType:
KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS
KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS
# RoPE
KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT
KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE
KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE
@ -1510,12 +1768,14 @@ def get_type(val: Any) -> GGUFValueType:
KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN
KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED
# SSM
KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL
KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS
# tokenization
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE
KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST
@ -1524,6 +1784,8 @@ def get_type(val: Any) -> GGUFValueType:
KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES
KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID
KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
@ -1531,8 +1793,15 @@ def get_type(val: Any) -> GGUFValueType:
KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID
KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID
KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID
KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID
KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID
KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID
# deprecated
KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID
KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID


@@ -169,11 +169,10 @@ def _get(
         count = int(count)
         itemsize = int(np.empty([], dtype=dtype).itemsize)
         end_offs = offset + itemsize * count
-        return (
-            self.data[offset:end_offs]
-            .view(dtype=dtype)[:count]
-            .newbyteorder(override_order or self.byte_order)
-        )
+        arr = self.data[offset:end_offs].view(dtype=dtype)[:count]
+        if override_order is None:
+            return arr
+        return arr.view(arr.dtype.newbyteorder(override_order))

     def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
         if field.name in self.fields:
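The rewritten _get() no longer calls .newbyteorder() on the array itself (that method was removed from ndarray in NumPy 2.0) and instead reinterprets the buffer through a byte-swapped dtype, and only when an override is requested. A standalone sketch of the same pattern:

import numpy as np

raw = np.arange(4, dtype="<u4")               # little-endian values on disk
big = raw.view(raw.dtype.newbyteorder(">"))   # reinterpret the same bytes as big-endian
print(raw.tobytes() == big.tobytes())         # True: only the interpretation changes
print(raw[1], big[1])                         # 1 vs 16777216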


@ -26,12 +26,14 @@
RopeScalingType, RopeScalingType,
PoolingType, PoolingType,
TokenType, TokenType,
ExpertGatingFuncType,
) )
from .quants import quant_shape_from_byte_shape from .quants import quant_shape_from_byte_shape
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
@ -135,7 +137,7 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]:
continue continue
elif name.endswith(".lora_b"): elif name.endswith(".lora_b"):
if last_lora_a is None or last_lora_a[0] != name[:-1] + "a": if last_lora_a is None or last_lora_a[0] != name[:-1] + "a":
# Bail when the LoRA pair can't be found trivially
logger.warning( logger.warning(
"can't measure LoRA size correctly, tensor order is unusual" "can't measure LoRA size correctly, tensor order is unusual"
) )
@ -154,11 +156,14 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]:
total_params += size total_params += size
# Hopefully this should work even for variable-expert-count models
expert_count = (expert_sum // n_expert_tensors) if n_expert_tensors > 0 else 0 expert_count = (expert_sum // n_expert_tensors) if n_expert_tensors > 0 else 0
# Negate the total to signal it's likely not exact
if last_lora_a is not None: if last_lora_a is not None:
total_params = -total_params total_params = -total_params
# NOTE: keep the output in the same order as accepted by 'size_label' in gguf-py/gguf/utility.py
return total_params, shared_params, expert_params, expert_count return total_params, shared_params, expert_params, expert_count
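As the added comments note, the total is negated to signal an approximate count (a LoRA pair was detected), and the tuple order matches what size_label in gguf-py/gguf/utility.py consumes. A hedged sketch of how a caller might interpret that return value (helper name invented):

def describe_param_count(total_params: int, expert_count: int) -> str:
    approx = total_params < 0          # negative means "likely not exact"
    total = abs(total_params)
    label = f"{total / 1e9:.1f}B params"
    if expert_count > 0:
        label += f" x{expert_count} experts"
    return ("~" if approx else "") + label

print(describe_param_count(-7_241_000_000, 0))  # ~7.2B params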
def format_shard_names(self, path: Path) -> list[Path]: def format_shard_names(self, path: Path) -> list[Path]:
@ -177,7 +182,7 @@ def open_output_file(self, path: Path | None = None) -> None:
and self.fout is not None and self.fout is not None
and (path is None or path == self.path) and (path is None or path == self.path)
): ):
# allow calling this multiple times as long as the path is the same
return return
if self.state is not WriterState.NO_FILE: if self.state is not WriterState.NO_FILE:
@ -206,7 +211,7 @@ def print_plan(self) -> list[Path]:
if self.dry_run: if self.dry_run:
logger.info("Dry run, not writing files") logger.info("Dry run, not writing files")
for name in filenames: for name in filenames:
print(name) print(name) # noqa: NP100
exit() exit()
return filenames return filenames
@ -390,11 +395,12 @@ def add_tensor_info(
if tensor_dtype == np.uint8: if tensor_dtype == np.uint8:
tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype) tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
# make sure there is at least one tensor before splitting
if len(self.tensors[-1]) > 0: if len(self.tensors[-1]) > 0:
if ( if ( # split when over tensor limit
self.split_max_tensors != 0 self.split_max_tensors != 0
and len(self.tensors[-1]) >= self.split_max_tensors and len(self.tensors[-1]) >= self.split_max_tensors
) or ( ) or ( # split when over size limit
self.split_max_size != 0 self.split_max_size != 0
and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes
> self.split_max_size > self.split_max_size
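The commented condition above opens a new shard either when the current shard already holds split_max_tensors tensors or when adding the next tensor would exceed split_max_size bytes. A standalone restatement of that predicate (names illustrative, not the gguf-py API):

def should_start_new_shard(n_tensors: int, shard_nbytes: int, next_nbytes: int,
                           split_max_tensors: int, split_max_size: int) -> bool:
    if n_tensors == 0:
        return False  # make sure there is at least one tensor before splitting
    over_tensor_limit = split_max_tensors != 0 and n_tensors >= split_max_tensors
    over_size_limit = split_max_size != 0 and shard_nbytes + next_nbytes > split_max_size
    return over_tensor_limit or over_size_limit

print(should_start_new_shard(256, 0, 0, 256, 0))                                  # True, tensor limit hit
print(should_start_new_shard(10, 4_900_000_000, 200_000_000, 0, 5_000_000_000))   # True, size limit hit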
@ -460,6 +466,8 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
fout = self.fout[file_id] fout = self.fout[file_id]
# pop the first tensor info
# TODO: cleaner way to get the first key
first_tensor_name = [ first_tensor_name = [
name for name, _ in zip(self.tensors[file_id].keys(), range(1)) name for name, _ in zip(self.tensors[file_id].keys(), range(1))
][0] ][0]
@ -506,8 +514,11 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None:
total = sum(ti.nbytes for ti in tensors.values()) total = sum(ti.nbytes for ti in tensors.values())
shard_bar.reset(total=(total if total > 0 else None)) shard_bar.reset(total=(total if total > 0 else None))
# relying on the fact that Python dicts preserve insertion order (since 3.7)
for ti in tensors.values(): for ti in tensors.values():
assert ti.tensor is not None
assert (
ti.tensor is not None
) # can only iterate once over the tensors
assert ti.tensor.nbytes == ti.nbytes assert ti.tensor.nbytes == ti.nbytes
ti.tensor.tofile(fout) ti.tensor.tofile(fout)
if shard_bar is not None: if shard_bar is not None:
@ -631,6 +642,11 @@ def add_base_model_organization(self, source_id: int, organization: str) -> None
Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization
) )
def add_base_model_description(self, source_id: int, description: str) -> None:
self.add_string(
Keys.General.BASE_MODEL_DESCRIPTION.format(id=source_id), description
)
def add_base_model_url(self, source_id: int, url: str) -> None: def add_base_model_url(self, source_id: int, url: str) -> None:
self.add_string(Keys.General.BASE_MODEL_URL.format(id=source_id), url) self.add_string(Keys.General.BASE_MODEL_URL.format(id=source_id), url)
@ -643,15 +659,46 @@ def add_base_model_uuid(self, source_id: int, uuid: str) -> None:
def add_base_model_repo_url(self, source_id: int, repo_url: str) -> None: def add_base_model_repo_url(self, source_id: int, repo_url: str) -> None:
self.add_string(Keys.General.BASE_MODEL_REPO_URL.format(id=source_id), repo_url) self.add_string(Keys.General.BASE_MODEL_REPO_URL.format(id=source_id), repo_url)
def add_dataset_count(self, source_count: int) -> None:
self.add_uint32(Keys.General.DATASET_COUNT, source_count)
def add_dataset_name(self, source_id: int, name: str) -> None:
self.add_string(Keys.General.DATASET_NAME.format(id=source_id), name)
def add_dataset_author(self, source_id: int, author: str) -> None:
self.add_string(Keys.General.DATASET_AUTHOR.format(id=source_id), author)
def add_dataset_version(self, source_id: int, version: str) -> None:
self.add_string(Keys.General.DATASET_VERSION.format(id=source_id), version)
def add_dataset_organization(self, source_id: int, organization: str) -> None:
self.add_string(
Keys.General.DATASET_ORGANIZATION.format(id=source_id), organization
)
def add_dataset_description(self, source_id: int, description: str) -> None:
self.add_string(
Keys.General.DATASET_DESCRIPTION.format(id=source_id), description
)
def add_dataset_url(self, source_id: int, url: str) -> None:
self.add_string(Keys.General.DATASET_URL.format(id=source_id), url)
def add_dataset_doi(self, source_id: int, doi: str) -> None:
self.add_string(Keys.General.DATASET_DOI.format(id=source_id), doi)
def add_dataset_uuid(self, source_id: int, uuid: str) -> None:
self.add_string(Keys.General.DATASET_UUID.format(id=source_id), uuid)
def add_dataset_repo_url(self, source_id: int, repo_url: str) -> None:
self.add_string(Keys.General.DATASET_REPO_URL.format(id=source_id), repo_url)
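The new add_dataset_* setters mirror the existing base_model ones: a count plus per-index fields. A rough usage sketch, assuming a gguf.GGUFWriter as used elsewhere in this diff (file name and metadata values invented):

from gguf import GGUFWriter

writer = GGUFWriter("model.gguf", arch="llama")
datasets = [
    {"name": "Example Corpus", "organization": "Example Org",
     "repo_url": "https://huggingface.co/datasets/example/corpus"},
]
writer.add_dataset_count(len(datasets))
for i, entry in enumerate(datasets):
    writer.add_dataset_name(i, entry["name"])
    writer.add_dataset_organization(i, entry["organization"])
    writer.add_dataset_repo_url(i, entry["repo_url"])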
def add_tags(self, tags: Sequence[str]) -> None: def add_tags(self, tags: Sequence[str]) -> None:
self.add_array(Keys.General.TAGS, tags) self.add_array(Keys.General.TAGS, tags)
def add_languages(self, languages: Sequence[str]) -> None: def add_languages(self, languages: Sequence[str]) -> None:
self.add_array(Keys.General.LANGUAGES, languages) self.add_array(Keys.General.LANGUAGES, languages)
def add_datasets(self, datasets: Sequence[str]) -> None:
self.add_array(Keys.General.DATASETS, datasets)
def add_tensor_data_layout(self, layout: str) -> None: def add_tensor_data_layout(self, layout: str) -> None:
self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout) self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
@ -664,6 +711,21 @@ def add_context_length(self, length: int) -> None:
def add_embedding_length(self, length: int) -> None: def add_embedding_length(self, length: int) -> None:
self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length)
def add_features_length(self, length: int) -> None:
self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length)
def add_posnet_embedding_length(self, length: int) -> None:
self.add_uint32(Keys.PosNet.EMBEDDING_LENGTH.format(arch=self.arch), length)
def add_posnet_block_count(self, length: int) -> None:
self.add_uint32(Keys.PosNet.BLOCK_COUNT.format(arch=self.arch), length)
def add_convnext_embedding_length(self, length: int) -> None:
self.add_uint32(Keys.ConvNext.EMBEDDING_LENGTH.format(arch=self.arch), length)
def add_convnext_block_count(self, length: int) -> None:
self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length)
def add_block_count(self, length: int) -> None: def add_block_count(self, length: int) -> None:
self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
@ -739,6 +801,15 @@ def add_expert_shared_count(self, count: int) -> None:
def add_expert_weights_scale(self, value: float) -> None: def add_expert_weights_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value) self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
def add_expert_weights_norm(self, value: bool) -> None:
self.add_bool(Keys.LLM.EXPERT_WEIGHTS_NORM.format(arch=self.arch), value)
def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)
def add_swin_norm(self, value: bool) -> None:
self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
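add_expert_gating_func() stores the enum's integer value, so converters hand it a member of the newly imported ExpertGatingFuncType. A hedged sketch, assuming the enum is importable like the other constants and exposes members such as SOFTMAX and SIGMOID:

from gguf.constants import ExpertGatingFuncType  # assumption: importable like other enums

gating = ExpertGatingFuncType.SIGMOID   # member name assumed from upstream constants
print(int(gating))                      # the uint32 that add_expert_gating_func() writes
# writer.add_expert_gating_func(gating) # stored via Keys.LLM.EXPERT_GATING_FUNC for the arch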
def add_rescale_every_n_layers(self, count: int) -> None: def add_rescale_every_n_layers(self, count: int) -> None:
self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count) self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
@ -763,6 +834,12 @@ def add_layer_norm_eps(self, value: float) -> None:
def add_layer_norm_rms_eps(self, value: float) -> None: def add_layer_norm_rms_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value) self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)
def add_group_norm_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.GROUPNORM_EPS.format(arch=self.arch), value)
def add_group_norm_groups(self, value: int) -> None:
self.add_uint32(Keys.Attention.GROUPNORM_GROUPS.format(arch=self.arch), value)
def add_causal_attention(self, value: bool) -> None: def add_causal_attention(self, value: bool) -> None:
self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
@ -787,6 +864,9 @@ def add_pooling_type(self, value: PoolingType) -> None:
def add_rope_dimension_count(self, count: int) -> None: def add_rope_dimension_count(self, count: int) -> None:
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
def add_rope_dimension_sections(self, dims: Sequence[int]) -> None:
self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims)
def add_rope_freq_base(self, value: float) -> None: def add_rope_freq_base(self, value: float) -> None:
self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value) self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value)
@ -893,6 +973,7 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
name = choice.get("name", "") name = choice.get("name", "")
template = choice.get("template") template = choice.get("template")
# Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it
name = "".join( name = "".join(
(c if c in ascii_letters + digits else "_" for c in name) (c if c in ascii_letters + digits else "_" for c in name)
) )
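The filter above keeps only ASCII letters and digits in a named chat template and replaces everything else with underscores before the template is written under its per-name key. A minimal reproduction of that sanitization:

from string import ascii_letters, digits

def sanitize_template_name(name: str) -> str:
    return "".join(c if c in ascii_letters + digits else "_" for c in name)

print(sanitize_template_name("rag/v2 (draft)"))  # rag_v2__draft_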
@ -916,15 +997,6 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
def add_prefix_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.PREFIX_ID, id)
def add_suffix_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id)
def add_middle_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id)
def add_eot_token_id(self, id: int) -> None: def add_eot_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.EOT_ID, id) self.add_uint32(Keys.Tokenizer.EOT_ID, id)

View File

@ -12,6 +12,7 @@
class LazyMeta(ABCMeta): class LazyMeta(ABCMeta):
def __new__( def __new__(
cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs
): ):
@ -34,7 +35,7 @@ def __getattr__(self, name: str) -> Any:
# need to make a builder for the wrapped wrapper to copy the name, # need to make a builder for the wrapped wrapper to copy the name,
# or else it fails with very cryptic error messages, # or else it fails with very cryptic error messages,
# because somehow the same string would end up in every closure # because somehow the same string would end up in every closures
def mk_wrap(op_name: str, *, meta_noop: bool = False): def mk_wrap(op_name: str, *, meta_noop: bool = False):
# need to wrap the wrapper to get self # need to wrap the wrapper to get self
def wrapped_special_op(self, *args, **kwargs): def wrapped_special_op(self, *args, **kwargs):
@ -254,6 +255,8 @@ def from_eager(cls, t: Any) -> Any:
class LazyNumpyTensor(LazyBase): class LazyNumpyTensor(LazyBase):
_tensor_type = np.ndarray _tensor_type = np.ndarray
shape: tuple[int, ...] # Makes the type checker happy in quants.py
@classmethod @classmethod
def meta_with_dtype_and_shape( def meta_with_dtype_and_shape(
cls, dtype: DTypeLike, shape: tuple[int, ...] cls, dtype: DTypeLike, shape: tuple[int, ...]

View File

@ -41,7 +41,7 @@ class Metadata:
base_models: Optional[list[dict]] = None base_models: Optional[list[dict]] = None
tags: Optional[list[str]] = None tags: Optional[list[str]] = None
languages: Optional[list[str]] = None languages: Optional[list[str]] = None
datasets: Optional[list[str]] = None
datasets: Optional[list[dict]] = None
@staticmethod @staticmethod
def load( def load(
@ -50,7 +50,7 @@ def load(
model_name: Optional[str] = None, model_name: Optional[str] = None,
total_params: int = 0, total_params: int = 0,
) -> Metadata: ) -> Metadata:
# This grabs as much contextual authorship metadata as possible from the model repository # This grabs as many contextual authorship metadata as possible from the model repository
# making any conversion as required to match the gguf kv store metadata format # making any conversion as required to match the gguf kv store metadata format
# as well as giving users the ability to override any authorship metadata that may be incorrect # as well as giving users the ability to override any authorship metadata that may be incorrect
@ -126,13 +126,13 @@ def load(
"general.base_models", metadata.base_models "general.base_models", metadata.base_models
) )
# Datasets is received here as an array of datasets
metadata.datasets = metadata_override.get("general.datasets", metadata.datasets)
metadata.tags = metadata_override.get(Keys.General.TAGS, metadata.tags) metadata.tags = metadata_override.get(Keys.General.TAGS, metadata.tags)
metadata.languages = metadata_override.get( metadata.languages = metadata_override.get(
Keys.General.LANGUAGES, metadata.languages Keys.General.LANGUAGES, metadata.languages
) )
metadata.datasets = metadata_override.get(
Keys.General.DATASETS, metadata.datasets
)
# Direct Metadata Override (via direct cli argument) # Direct Metadata Override (via direct cli argument)
if model_name is not None: if model_name is not None:
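With the override hook above, datasets can now be injected wholesale from the converter's metadata override file (commonly passed via --metadata) under the literal key "general.datasets", and each entry is expected to be a dict rather than a plain string. An illustrative override structure (all values invented):

metadata_override = {
    "general.datasets": [
        {"name": "Example Corpus", "organization": "Example Org", "version": "1.0"},
        {"repo_url": "https://huggingface.co/datasets/example/corpus"},
    ],
}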
@ -228,7 +228,11 @@ def get_model_id_components(
org_component, model_full_name_component = None, model_id org_component, model_full_name_component = None, model_id
# Check if we erroneously matched against './' or '../' etc... # Check if we erroneously matched against './' or '../' etc...
if org_component is not None and org_component[0] == ".":
if (
org_component is not None
and len(org_component) > 0
and org_component[0] == "."
):
org_component = None org_component = None
name_parts: list[str] = model_full_name_component.split("-") name_parts: list[str] = model_full_name_component.split("-")
@ -387,27 +391,86 @@ def apply_metadata_heuristic(
######################## ########################
if model_card is not None: if model_card is not None:
if "model_name" in model_card and metadata.name is None: def use_model_card_metadata(metadata_key: str, model_card_key: str):
if (
model_card_key in model_card
and getattr(metadata, metadata_key, None) is None
):
setattr(metadata, metadata_key, model_card.get(model_card_key))
def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
# Note: Will append rather than replace if already exist
tags_value = model_card.get(model_card_key, None)
if tags_value is None:
return
current_value = getattr(metadata, metadata_key, None)
if current_value is None:
current_value = []
if isinstance(tags_value, str):
current_value.append(tags_value)
elif isinstance(tags_value, list):
current_value.extend(tags_value)
setattr(metadata, metadata_key, current_value)
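The two closures above replace a long run of near-identical if-blocks: scalar fields are copied only while still unset, and list-like fields are appended to rather than overwritten. A self-contained sketch of the same idea against a plain object (names illustrative):

from types import SimpleNamespace

metadata = SimpleNamespace(name=None, tags=None)
model_card = {"model_name": "Example-7B", "tags": ["text-generation"]}

def use_scalar(metadata_key: str, model_card_key: str) -> None:
    if model_card_key in model_card and getattr(metadata, metadata_key, None) is None:
        setattr(metadata, metadata_key, model_card.get(model_card_key))

def use_array(metadata_key: str, model_card_key: str) -> None:
    value = model_card.get(model_card_key)
    if value is None:
        return
    current = getattr(metadata, metadata_key, None) or []
    current.extend([value] if isinstance(value, str) else value)
    setattr(metadata, metadata_key, current)

use_scalar("name", "model_name")
use_array("tags", "tags")
print(metadata.name, metadata.tags)  # Example-7B ['text-generation']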
# LLAMA.cpp's direct internal convention
# (Definitely not part of hugging face formal/informal standard)
#########################################
use_model_card_metadata("name", "name")
use_model_card_metadata("author", "author")
use_model_card_metadata("version", "version")
use_model_card_metadata("organization", "organization")
use_model_card_metadata("description", "description")
use_model_card_metadata("finetune", "finetune")
use_model_card_metadata("basename", "basename")
use_model_card_metadata("size_label", "size_label")
use_model_card_metadata("source_url", "url")
use_model_card_metadata("source_doi", "doi")
use_model_card_metadata("source_uuid", "uuid")
use_model_card_metadata("source_repo_url", "repo_url")
# LLAMA.cpp's huggingface style convention
# (Definitely not part of hugging face formal/informal standard... but with model_ appended to match their style)
###########################################
use_model_card_metadata("name", "model_name")
use_model_card_metadata("author", "model_author")
use_model_card_metadata("version", "model_version")
use_model_card_metadata("organization", "model_organization")
use_model_card_metadata("description", "model_description")
use_model_card_metadata("finetune", "model_finetune")
use_model_card_metadata("basename", "model_basename")
use_model_card_metadata("size_label", "model_size_label")
use_model_card_metadata("source_url", "model_url")
use_model_card_metadata("source_doi", "model_doi")
use_model_card_metadata("source_uuid", "model_uuid")
use_model_card_metadata("source_repo_url", "model_repo_url")
# Hugging Face Direct Convention
#################################
# Not part of huggingface model card standard but notice some model creator using it # Not part of huggingface model card standard but notice some model creator using it
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF' # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
metadata.name = model_card.get("model_name")
if "model_creator" in model_card and metadata.author is None:
# Not part of huggingface model card standard but notice some model creator using it
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
metadata.author = model_card.get("model_creator")
if "model_type" in model_card and metadata.basename is None:
# Not part of huggingface model card standard but notice some model creator using it
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
metadata.basename = model_card.get("model_type")
if "base_model" in model_card:
use_model_card_metadata("name", "model_name")
use_model_card_metadata("author", "model_creator")
use_model_card_metadata("basename", "model_type")
if (
"base_model" in model_card
or "base_models" in model_card
or "base_model_sources" in model_card
):
# This represents the parent models that this is based on # This represents the parent models that this is based on
# Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges) # Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges)
# Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md # Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md
metadata_base_models = [] metadata_base_models = []
base_model_value = model_card.get("base_model", None)
base_model_value = model_card.get(
"base_model",
model_card.get(
"base_models", model_card.get("base_model_sources", None)
),
)
if base_model_value is not None: if base_model_value is not None:
if isinstance(base_model_value, str): if isinstance(base_model_value, str):
@ -420,6 +483,47 @@ def apply_metadata_heuristic(
for model_id in metadata_base_models: for model_id in metadata_base_models:
# NOTE: model size of base model is assumed to be similar to the size of the current model # NOTE: model size of base model is assumed to be similar to the size of the current model
base_model = {}
if isinstance(model_id, str):
if (
model_id.startswith("http://")
or model_id.startswith("https://")
or model_id.startswith("ssh://")
):
base_model["repo_url"] = model_id
# Check if Hugging Face ID is present in URL
if "huggingface.co" in model_id:
match = re.match(
r"https?://huggingface.co/([^/]+/[^/]+)$", model_id
)
if match:
model_id_component = match.group(1)
(
model_full_name_component,
org_component,
basename,
finetune,
version,
size_label,
) = Metadata.get_model_id_components(
model_id_component, total_params
)
# Populate model dictionary with extracted components
if model_full_name_component is not None:
base_model["name"] = Metadata.id_to_title(
model_full_name_component
)
if org_component is not None:
base_model["organization"] = (
Metadata.id_to_title(org_component)
)
if version is not None:
base_model["version"] = version
else:
# Likely a Hugging Face ID
( (
model_full_name_component, model_full_name_component,
org_component, org_component,
@ -428,13 +532,16 @@ def apply_metadata_heuristic(
version, version,
size_label, size_label,
) = Metadata.get_model_id_components(model_id, total_params) ) = Metadata.get_model_id_components(model_id, total_params)
base_model = {}
# Populate model dictionary with extracted components
if model_full_name_component is not None: if model_full_name_component is not None:
base_model["name"] = Metadata.id_to_title( base_model["name"] = Metadata.id_to_title(
model_full_name_component model_full_name_component
) )
if org_component is not None: if org_component is not None:
base_model["organization"] = Metadata.id_to_title(org_component) base_model["organization"] = Metadata.id_to_title(
org_component
)
if version is not None: if version is not None:
base_model["version"] = version base_model["version"] = version
if ( if (
@ -444,62 +551,127 @@ def apply_metadata_heuristic(
base_model["repo_url"] = ( base_model["repo_url"] = (
f"https://huggingface.co/{org_component}/{model_full_name_component}" f"https://huggingface.co/{org_component}/{model_full_name_component}"
) )
elif isinstance(model_id, dict):
base_model = model_id
else:
logger.error(
f"base model entry '{str(model_id)}' not in a known format"
)
metadata.base_models.append(base_model) metadata.base_models.append(base_model)
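The new branch above lets base_model entries be URLs or ready-made dicts, not just bare Hugging Face IDs; huggingface.co URLs are reduced to an org/model ID and pushed through get_model_id_components(). A small sketch of that URL handling (regex taken from the code above, sample URL invented):

import re

def hf_id_from_url(url: str):
    if not url.startswith(("http://", "https://", "ssh://")):
        return None
    match = re.match(r"https?://huggingface.co/([^/]+/[^/]+)$", url)
    return match.group(1) if match else None

print(hf_id_from_url("https://huggingface.co/example-org/Example-7B-v0.1"))
# -> example-org/Example-7B-v0.1, which then feeds Metadata.get_model_id_components()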
if "license" in model_card and metadata.license is None: if (
metadata.license = model_card.get("license") "datasets" in model_card
or "dataset" in model_card
if "license_name" in model_card and metadata.license_name is None: or "dataset_sources" in model_card
metadata.license_name = model_card.get("license_name") ):
# This represents the datasets that this was trained from
if "license_link" in model_card and metadata.license_link is None: metadata_datasets = []
metadata.license_link = model_card.get("license_link") dataset_value = model_card.get(
"datasets",
tags_value = model_card.get("tags", None) model_card.get("dataset", model_card.get("dataset_sources", None)),
if tags_value is not None:
if metadata.tags is None:
metadata.tags = []
if isinstance(tags_value, str):
metadata.tags.append(tags_value)
elif isinstance(tags_value, list):
metadata.tags.extend(tags_value)
pipeline_tags_value = model_card.get("pipeline_tag", None)
if pipeline_tags_value is not None:
if metadata.tags is None:
metadata.tags = []
if isinstance(pipeline_tags_value, str):
metadata.tags.append(pipeline_tags_value)
elif isinstance(pipeline_tags_value, list):
metadata.tags.extend(pipeline_tags_value)
language_value = model_card.get(
"languages", model_card.get("language", None)
) )
if language_value is not None:
if metadata.languages is None:
metadata.languages = []
if isinstance(language_value, str):
metadata.languages.append(language_value)
elif isinstance(language_value, list):
metadata.languages.extend(language_value)
dataset_value = model_card.get("datasets", model_card.get("dataset", None))
if dataset_value is not None: if dataset_value is not None:
if isinstance(dataset_value, str):
metadata_datasets.append(dataset_value)
elif isinstance(dataset_value, list):
metadata_datasets.extend(dataset_value)
if metadata.datasets is None: if metadata.datasets is None:
metadata.datasets = [] metadata.datasets = []
if isinstance(dataset_value, str): for dataset_id in metadata_datasets:
metadata.datasets.append(dataset_value) # NOTE: model size of base model is assumed to be similar to the size of the current model
elif isinstance(dataset_value, list): dataset = {}
metadata.datasets.extend(dataset_value) if isinstance(dataset_id, str):
if dataset_id.startswith(("http://", "https://", "ssh://")):
dataset["repo_url"] = dataset_id
# Check if Hugging Face ID is present in URL
if "huggingface.co" in dataset_id:
match = re.match(
r"https?://huggingface.co/([^/]+/[^/]+)$",
dataset_id,
)
if match:
dataset_id_component = match.group(1)
(
dataset_name_component,
org_component,
basename,
finetune,
version,
size_label,
) = Metadata.get_model_id_components(
dataset_id_component, total_params
)
# Populate dataset dictionary with extracted components
if dataset_name_component is not None:
dataset["name"] = Metadata.id_to_title(
dataset_name_component
)
if org_component is not None:
dataset["organization"] = Metadata.id_to_title(
org_component
)
if version is not None:
dataset["version"] = version
else:
# Likely a Hugging Face ID
(
dataset_name_component,
org_component,
basename,
finetune,
version,
size_label,
) = Metadata.get_model_id_components(
dataset_id, total_params
)
# Populate dataset dictionary with extracted components
if dataset_name_component is not None:
dataset["name"] = Metadata.id_to_title(
dataset_name_component
)
if org_component is not None:
dataset["organization"] = Metadata.id_to_title(
org_component
)
if version is not None:
dataset["version"] = version
if (
org_component is not None
and dataset_name_component is not None
):
dataset["repo_url"] = (
f"https://huggingface.co/{org_component}/{dataset_name_component}"
)
elif isinstance(dataset_id, dict):
dataset = dataset_id
else:
logger.error(
f"dataset entry '{str(dataset_id)}' not in a known format"
)
metadata.datasets.append(dataset)
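Dataset entries now get the same treatment as base models: a bare Hugging Face ID, a URL, or an explicit dict are all accepted, and anything else is logged as an error. Roughly, the accepted model-card shapes look like this (values invented):

datasets_from_model_card = [
    "example-org/example-corpus",                                  # bare HF ID
    "https://huggingface.co/datasets/example-org/example-corpus",  # URL; kept as repo_url (name/org only if the org/model regex matches)
    {"name": "Example Corpus", "version": "2.1"},                  # dict, used as-is
]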
use_model_card_metadata("license", "license")
use_model_card_metadata("license_name", "license_name")
use_model_card_metadata("license_link", "license_link")
use_array_model_card_metadata("tags", "tags")
use_array_model_card_metadata("tags", "pipeline_tag")
use_array_model_card_metadata("languages", "languages")
use_array_model_card_metadata("languages", "language")
# Hugging Face Parameter Heuristics # Hugging Face Parameter Heuristics
#################################### ####################################
@ -508,7 +680,7 @@ def apply_metadata_heuristic(
hf_name_or_path = hf_params.get("_name_or_path") hf_name_or_path = hf_params.get("_name_or_path")
if hf_name_or_path is not None and hf_name_or_path.count("/") <= 1: if hf_name_or_path is not None and hf_name_or_path.count("/") <= 1:
# Use _name_or_path only if it's actually a model name and not some computer path # Use _name_or_path only if its actually a model name and not some computer path
# e.g. 'meta-llama/Llama-2-7b-hf' # e.g. 'meta-llama/Llama-2-7b-hf'
model_id = hf_name_or_path model_id = hf_name_or_path
( (
@ -584,6 +756,9 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
gguf_writer.add_size_label(self.size_label) gguf_writer.add_size_label(self.size_label)
if self.license is not None: if self.license is not None:
if isinstance(self.license, list):
gguf_writer.add_license(",".join(self.license))
else:
gguf_writer.add_license(self.license) gguf_writer.add_license(self.license)
if self.license_name is not None: if self.license_name is not None:
gguf_writer.add_license_name(self.license_name) gguf_writer.add_license_name(self.license_name)
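The added guard tolerates model cards that declare license as a YAML list; multiple identifiers are joined with commas before a single license value is written. Roughly:

license_field = ["apache-2.0", "llama3.1"]  # as parsed from a model card
value = ",".join(license_field) if isinstance(license_field, list) else license_field
# gguf_writer.add_license(value)  -> "apache-2.0,llama3.1"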
@ -621,6 +796,10 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
gguf_writer.add_base_model_organization( gguf_writer.add_base_model_organization(
key, base_model_entry["organization"] key, base_model_entry["organization"]
) )
if "description" in base_model_entry:
gguf_writer.add_base_model_description(
key, base_model_entry["description"]
)
if "url" in base_model_entry: if "url" in base_model_entry:
gguf_writer.add_base_model_url(key, base_model_entry["url"]) gguf_writer.add_base_model_url(key, base_model_entry["url"])
if "doi" in base_model_entry: if "doi" in base_model_entry:
@ -632,9 +811,33 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
key, base_model_entry["repo_url"] key, base_model_entry["repo_url"]
) )
if self.datasets is not None:
gguf_writer.add_dataset_count(len(self.datasets))
for key, dataset_entry in enumerate(self.datasets):
if "name" in dataset_entry:
gguf_writer.add_dataset_name(key, dataset_entry["name"])
if "author" in dataset_entry:
gguf_writer.add_dataset_author(key, dataset_entry["author"])
if "version" in dataset_entry:
gguf_writer.add_dataset_version(key, dataset_entry["version"])
if "organization" in dataset_entry:
gguf_writer.add_dataset_organization(
key, dataset_entry["organization"]
)
if "description" in dataset_entry:
gguf_writer.add_dataset_description(
key, dataset_entry["description"]
)
if "url" in dataset_entry:
gguf_writer.add_dataset_url(key, dataset_entry["url"])
if "doi" in dataset_entry:
gguf_writer.add_dataset_doi(key, dataset_entry["doi"])
if "uuid" in dataset_entry:
gguf_writer.add_dataset_uuid(key, dataset_entry["uuid"])
if "repo_url" in dataset_entry:
gguf_writer.add_dataset_repo_url(key, dataset_entry["repo_url"])
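The per-dataset loop writes one KV pair per supplied field, indexed by position, next to the count written earlier. Assuming the key layout follows the general.dataset.{id}.* pattern of the constants (key names hedged, values invented), the result is roughly:

expected_kv = {
    "general.dataset.count": 2,
    "general.dataset.0.name": "Example Corpus",
    "general.dataset.0.organization": "Example Org",
    "general.dataset.1.repo_url": "https://huggingface.co/datasets/example/corpus",
}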
if self.tags is not None: if self.tags is not None:
gguf_writer.add_tags(self.tags) gguf_writer.add_tags(self.tags)
if self.languages is not None: if self.languages is not None:
gguf_writer.add_languages(self.languages) gguf_writer.add_languages(self.languages)
if self.datasets is not None:
gguf_writer.add_datasets(self.datasets)

File diff suppressed because it is too large

View File

@ -7,463 +7,574 @@
class TensorNameMap: class TensorNameMap:
mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Token embeddings
MODEL_TENSOR.TOKEN_EMBD: ( MODEL_TENSOR.TOKEN_EMBD: (
"gpt_neox.embed_in", "gpt_neox.embed_in", # gptneox
"transformer.wte", "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
"transformer.word_embeddings", "transformer.word_embeddings", # falcon
"word_embeddings", "word_embeddings", # bloom
"model.embed_tokens", "model.embed_tokens", # llama-hf nemotron olmoe olmo2
"tok_embeddings", "tok_embeddings", # llama-pth
"embeddings.word_embeddings", "embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", "language_model.embedding.word_embeddings", # persimmon
"wte", "wte", # gpt2
"transformer.embd.wte", "transformer.embd.wte", # phi2
"model.tok_embeddings", "model.tok_embeddings", # internlm2
"model.embedding", "model.embedding", # mamba-qbert
"backbone.embedding", "backbone.embedding", # mamba
"backbone.embeddings", "backbone.embeddings", # mamba-hf
"transformer.in_out_embed", "transformer.in_out_embed", # Grok
"embedding.word_embeddings", "embedding.word_embeddings", # chatglm
"transformer.token_embeddings", "transformer.token_embeddings", # openelm
"shared", "shared", # t5
"rwkv.embeddings", "rwkv.embeddings", # rwkv
), ),
MODEL_TENSOR.TOKEN_TYPES: ("embeddings.token_type_embeddings",), # Token type embeddings
MODEL_TENSOR.TOKEN_TYPES: (
"embeddings.token_type_embeddings", # bert nomic-bert
),
# Normalization of token embeddings
MODEL_TENSOR.TOKEN_EMBD_NORM: ( MODEL_TENSOR.TOKEN_EMBD_NORM: (
"word_embeddings_layernorm", "word_embeddings_layernorm", # bloom
"embeddings.LayerNorm", "embeddings.LayerNorm", # bert
"emb_ln", "emb_ln", # nomic-bert
"transformer.norm", "transformer.norm", # openelm
"rwkv.blocks.0.pre_ln", "rwkv.blocks.0.pre_ln", # rwkv
"backbone.norm", # wavtokenizer
), ),
# Position embeddings
MODEL_TENSOR.POS_EMBD: ( MODEL_TENSOR.POS_EMBD: (
"transformer.wpe", "transformer.wpe", # gpt2
"embeddings.position_embeddings", "embeddings.position_embeddings", # bert
"wpe", "wpe", # gpt2
), ),
# Output
MODEL_TENSOR.OUTPUT: ( MODEL_TENSOR.OUTPUT: (
"embed_out", "embed_out", # gptneox
"lm_head", "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2
"output", "output", # llama-pth bloom internlm2
"word_embeddings_for_head", "word_embeddings_for_head", # persimmon
"lm_head.linear", "lm_head.linear", # phi2
"output_layer", "output_layer", # chatglm
"head", "head", # rwkv
"head.out", # wavtokenizer
), ),
# Output norm
MODEL_TENSOR.OUTPUT_NORM: ( MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", "gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", "transformer.ln_f", # gpt2 gpt-j falcon jais exaone
"model.norm", "model.norm", # llama-hf baichuan internlm2 olmoe olmo2
"norm", "norm", # llama-pth
"transformer.norm_f", "transformer.norm_f", # mpt dbrx
"ln_f", "ln_f", # refact bloom qwen gpt2
"language_model.encoder.final_layernorm", "language_model.encoder.final_layernorm", # persimmon
"model.final_layernorm", "model.final_layernorm", # persimmon
"lm_head.ln", "lm_head.ln", # phi2
"model.norm_f", "model.norm_f", # mamba-qbert
"backbone.norm_f", "backbone.norm_f", # mamba
"transformer.rms_norm", "transformer.rms_norm", # Grok
"encoder.final_layernorm", "encoder.final_layernorm", # chatglm
"transformer.norm", "transformer.norm", # openelm
"model.norm", "model.norm", # nemotron
"rwkv.ln_out", "rwkv.ln_out", # rwkv
"backbone.final_layer_norm", # wavtokenizer
), ),
# Rope frequencies
MODEL_TENSOR.ROPE_FREQS: ( MODEL_TENSOR.ROPE_FREQS: (
"rope.freqs", "rope.freqs", # llama-pth
"rotary_pos_emb.inv_freq", "rotary_pos_emb.inv_freq", # chatglm
), ),
MODEL_TENSOR.ROPE_FACTORS_LONG: (),
MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
MODEL_TENSOR.CONV1D: ("backbone.embed",), # roberta
} }
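Entries in block_mappings_cfg below are format strings with a {bid} placeholder; TensorNameMap expands them per block index so checkpoint tensor names from many architectures resolve to a single GGUF name (the mappings_cfg above works the same way, just without {bid}). A trimmed-down sketch of that expansion (the mapping content here is invented for illustration):

block_mappings = {
    "blk.{bid}.attn_q": ("model.layers.{bid}.self_attn.q_proj", "layers.{bid}.attention.wq"),
}

def build_lookup(n_blocks: int) -> dict:
    lookup = {}
    for gguf_name, hf_names in block_mappings.items():
        for bid in range(n_blocks):
            for hf_name in hf_names:
                lookup[hf_name.format(bid=bid)] = gguf_name.format(bid=bid)
    return lookup

lookup = build_lookup(n_blocks=2)
print(lookup["model.layers.1.self_attn.q_proj"])  # blk.1.attn_q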
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Attention norm
MODEL_TENSOR.ATTN_NORM: ( MODEL_TENSOR.ATTN_NORM: (
"gpt_neox.layers.{bid}.input_layernorm", "gpt_neox.layers.{bid}.input_layernorm", # gptneox
"transformer.h.{bid}.ln_1", "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais exaone
"transformer.blocks.{bid}.norm_1", "transformer.blocks.{bid}.norm_1", # mpt
"transformer.h.{bid}.input_layernorm", "transformer.h.{bid}.input_layernorm", # falcon7b
"h.{bid}.input_layernorm", "h.{bid}.input_layernorm", # bloom
"transformer.h.{bid}.ln_mlp", "transformer.h.{bid}.ln_mlp", # falcon40b
"model.layers.{bid}.input_layernorm", "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe
"layers.{bid}.attention_norm", "layers.{bid}.attention_norm", # llama-pth
"language_model.encoder.layers.{bid}.input_layernorm", "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
"model.layers.{bid}.ln1", "model.layers.{bid}.ln1", # yi
"h.{bid}.ln_1", "h.{bid}.ln_1", # gpt2
"transformer.h.{bid}.ln", "transformer.h.{bid}.ln", # phi2
"model.layers.layers.{bid}.norm", "model.layers.layers.{bid}.norm", # plamo
"model.layers.{bid}.attention_norm", "model.layers.{bid}.attention_norm", # internlm2
"model.layers.{bid}.norm", "model.layers.{bid}.norm", # mamba-qbert
"backbone.layers.{bid}.norm", "backbone.layers.{bid}.norm", # mamba
"transformer.decoder_layer.{bid}.rms_norm", "transformer.decoder_layer.{bid}.rms_norm", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_1", "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
"encoder.layers.{bid}.input_layernorm", "encoder.layers.{bid}.input_layernorm", # chatglm
"transformer.layers.{bid}.attn_norm", "transformer.layers.{bid}.attn_norm", # openelm
"rwkv.blocks.{bid}.ln1", "rwkv.blocks.{bid}.ln1", # rwkv
), ),
# Attention norm 2
MODEL_TENSOR.ATTN_NORM_2: ( MODEL_TENSOR.ATTN_NORM_2: (
"transformer.h.{bid}.ln_attn", "transformer.h.{bid}.ln_attn", # falcon40b
"encoder.layer.{bid}.layer_norm_1", "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
"rwkv.blocks.{bid}.ln2", "rwkv.blocks.{bid}.ln2", # rwkv
), ),
# Attention query-key-value
MODEL_TENSOR.ATTN_QKV: ( MODEL_TENSOR.ATTN_QKV: (
"gpt_neox.layers.{bid}.attention.query_key_value", "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
"transformer.h.{bid}.attn.c_attn", "transformer.h.{bid}.attn.c_attn", # gpt2 qwen jais
"transformer.blocks.{bid}.attn.Wqkv", "transformer.blocks.{bid}.attn.Wqkv", # mpt
"transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx
"transformer.h.{bid}.self_attention.query_key_value", "transformer.h.{bid}.self_attention.query_key_value", # falcon
"h.{bid}.self_attention.query_key_value", "h.{bid}.self_attention.query_key_value", # bloom
"language_model.encoder.layers.{bid}.self_attention.query_key_value", "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
"model.layers.{bid}.self_attn.query_key_value", "model.layers.{bid}.self_attn.query_key_value", # persimmon
"h.{bid}.attn.c_attn", "h.{bid}.attn.c_attn", # gpt2
"transformer.h.{bid}.mixer.Wqkv", "transformer.h.{bid}.mixer.Wqkv", # phi2
"encoder.layers.{bid}.attn.Wqkv", "encoder.layers.{bid}.attn.Wqkv", # nomic-bert
"model.layers.{bid}.self_attn.qkv_proj", "model.layers.{bid}.self_attn.qkv_proj", # phi3
"encoder.layers.{bid}.self_attention.query_key_value", "encoder.layers.{bid}.self_attention.query_key_value", # chatglm
"transformer.layers.{bid}.attn.qkv_proj", "transformer.layers.{bid}.attn.qkv_proj", # openelm
), ),
# Attention query
MODEL_TENSOR.ATTN_Q: ( MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wq", "model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom
"encoder.layer.{bid}.attention.self.query", "layers.{bid}.attention.wq", # llama-pth
"transformer.h.{bid}.attn.q_proj", "encoder.layer.{bid}.attention.self.query", # bert
"model.layers.layers.{bid}.self_attn.q_proj", "transformer.h.{bid}.attn.q_proj", # gpt-j
"model.layers.{bid}.attention.wq", "model.layers.layers.{bid}.self_attn.q_proj", # plamo
"transformer.decoder_layer.{bid}.multi_head_attention.query", "model.layers.{bid}.attention.wq", # internlm2
"transformer.h.{bid}.attn.attention.q_proj", "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok
"transformer.h.{bid}.attn.attention.q_proj", # exaone
), ),
# Attention key
MODEL_TENSOR.ATTN_K: ( MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wk", "model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom
"encoder.layer.{bid}.attention.self.key", "layers.{bid}.attention.wk", # llama-pth
"transformer.h.{bid}.attn.k_proj", "encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k", "transformer.h.{bid}.attn.k_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.k_proj", "transformer.h.{bid}.attn.k", # refact
"model.layers.{bid}.attention.wk", "model.layers.layers.{bid}.self_attn.k_proj", # plamo
"transformer.decoder_layer.{bid}.multi_head_attention.key", "model.layers.{bid}.attention.wk", # internlm2
"transformer.h.{bid}.attn.attention.k_proj", "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok
"transformer.h.{bid}.attn.attention.k_proj", # exaone
), ),
# Attention value
MODEL_TENSOR.ATTN_V: ( MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wv", "layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", "encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", "transformer.h.{bid}.attn.v_proj", # gpt-j
"transformer.h.{bid}.attn.v", "transformer.h.{bid}.attn.v", # refact
"model.layers.layers.{bid}.self_attn.v_proj", "model.layers.layers.{bid}.self_attn.v_proj", # plamo
"model.layers.{bid}.attention.wv", "model.layers.{bid}.attention.wv", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.value", "transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok
"transformer.h.{bid}.attn.attention.v_proj", "transformer.h.{bid}.attn.attention.v_proj", # exaone
), ),
# Attention output
MODEL_TENSOR.ATTN_OUT: ( MODEL_TENSOR.ATTN_OUT: (
"gpt_neox.layers.{bid}.attention.dense", "gpt_neox.layers.{bid}.attention.dense", # gptneox
"transformer.h.{bid}.attn.c_proj", "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen jais
"transformer.blocks.{bid}.attn.out_proj", "transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", "transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", "h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wo", "model.layers.{bid}.self_attn.linear_attn", # deci
"encoder.layer.{bid}.attention.output.dense", "layers.{bid}.attention.wo", # llama-pth
"transformer.h.{bid}.attn.out_proj", "encoder.layer.{bid}.attention.output.dense", # bert
"language_model.encoder.layers.{bid}.self_attention.dense", "transformer.h.{bid}.attn.out_proj", # gpt-j
"model.layers.{bid}.self_attn.dense", "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
"h.{bid}.attn.c_proj", "model.layers.{bid}.self_attn.dense", # persimmon
"transformer.h.{bid}.mixer.out_proj", "h.{bid}.attn.c_proj", # gpt2
"model.layers.layers.{bid}.self_attn.o_proj", "transformer.h.{bid}.mixer.out_proj", # phi2
"model.layers.{bid}.attention.wo", "model.layers.layers.{bid}.self_attn.o_proj", # plamo
"encoder.layers.{bid}.attn.out_proj", "model.layers.{bid}.attention.wo", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.linear", "encoder.layers.{bid}.attn.out_proj", # nomic-bert
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
"encoder.layers.{bid}.self_attention.dense", "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
"transformer.layers.{bid}.attn.out_proj", "encoder.layers.{bid}.self_attention.dense", # chatglm
"transformer.h.{bid}.attn.attention.out_proj", "transformer.layers.{bid}.attn.out_proj", # openelm
"transformer.h.{bid}.attn.attention.out_proj", # exaone
), ),
# Attention output norm
MODEL_TENSOR.ATTN_OUT_NORM: ( MODEL_TENSOR.ATTN_OUT_NORM: (
"encoder.layer.{bid}.attention.output.LayerNorm", "encoder.layer.{bid}.attention.output.LayerNorm", # bert
"encoder.layers.{bid}.norm1", "encoder.layers.{bid}.norm1", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_1", "transformer.decoder_layer.{bid}.rms_norm_1", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_2", "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
), ),
MODEL_TENSOR.ATTN_POST_NORM: ("model.layers.{bid}.post_attention_layernorm",),
MODEL_TENSOR.ATTN_POST_NORM: (
"model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2
),
# Rotary embeddings
MODEL_TENSOR.ATTN_ROT_EMBD: ( MODEL_TENSOR.ATTN_ROT_EMBD: (
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
"layers.{bid}.attention.inner_attention.rope.freqs", "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
"model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
"transformer.h.{bid}.attn.rotary_emb.inv_freq", "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell
), ),
# Feed-forward norm
MODEL_TENSOR.FFN_NORM: ( MODEL_TENSOR.FFN_NORM: (
"gpt_neox.layers.{bid}.post_attention_layernorm", "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
"transformer.h.{bid}.ln_2", "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone
"h.{bid}.post_attention_layernorm", "h.{bid}.post_attention_layernorm", # bloom
"transformer.blocks.{bid}.norm_2", "transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe
"layers.{bid}.ffn_norm", "layers.{bid}.ffn_norm", # llama-pth
"language_model.encoder.layers.{bid}.post_attention_layernorm", "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
"model.layers.{bid}.ln2", "model.layers.{bid}.ln2", # yi
"h.{bid}.ln_2", "h.{bid}.ln_2", # gpt2
"model.layers.{bid}.ffn_norm", "model.layers.{bid}.ffn_norm", # internlm2
"transformer.decoder_layer.{bid}.rms_norm_2", "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
"encoder.layers.{bid}.post_attention_layernorm", "encoder.layers.{bid}.post_attention_layernorm", # chatglm
"transformer.layers.{bid}.ffn_norm", "transformer.layers.{bid}.ffn_norm", # openelm
),
# Post feed-forward norm
MODEL_TENSOR.FFN_PRE_NORM: (
"model.layers.{bid}.pre_feedforward_layernorm", # gemma2
),
# Post feed-forward norm
MODEL_TENSOR.FFN_POST_NORM: (
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
), ),
MODEL_TENSOR.FFN_PRE_NORM: ("model.layers.{bid}.pre_feedforward_layernorm",),
MODEL_TENSOR.FFN_POST_NORM: ("model.layers.{bid}.post_feedforward_layernorm",),
MODEL_TENSOR.FFN_GATE_INP: ( MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", "layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", "model.layers.{bid}.block_sparse_moe.gate", # mixtral
"model.layers.{bid}.mlp.gate", "model.layers.{bid}.mlp.gate", # qwen2moe olmoe
"transformer.decoder_layer.{bid}.router", "transformer.decoder_layer.{bid}.router", # Grok
"transformer.blocks.{bid}.ffn.router.layer", "transformer.blocks.{bid}.ffn.router.layer", # dbrx
"model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
), ),
MODEL_TENSOR.FFN_GATE_INP_SHEXP: ("model.layers.{bid}.mlp.shared_expert_gate",),
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
"model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
),
MODEL_TENSOR.FFN_EXP_PROBS_B: (
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
),
# Feed-forward up
MODEL_TENSOR.FFN_UP: ( MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
"transformer.h.{bid}.mlp.c_fc", "transformer.h.{bid}.mlp.c_fc", # gpt2 jais
"transformer.blocks.{bid}.ffn.up_proj", "transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"h.{bid}.mlp.dense_h_to_4h", "h.{bid}.mlp.dense_h_to_4h", # bloom
"model.layers.{bid}.mlp.up_proj", "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
"layers.{bid}.feed_forward.w3", "layers.{bid}.feed_forward.w3", # llama-pth
"encoder.layer.{bid}.intermediate.dense", "encoder.layer.{bid}.intermediate.dense", # bert
"transformer.h.{bid}.mlp.fc_in", "transformer.h.{bid}.mlp.fc_in", # gpt-j
"transformer.h.{bid}.mlp.linear_3", "transformer.h.{bid}.mlp.linear_3", # refact
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
"model.layers.{bid}.mlp.dense_h_to_4h", "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon
"transformer.h.{bid}.mlp.w1", "transformer.h.{bid}.mlp.w1", # qwen
"h.{bid}.mlp.c_fc", "h.{bid}.mlp.c_fc", # gpt2
"transformer.h.{bid}.mlp.fc1", "transformer.h.{bid}.mlp.fc1", # phi2
"model.layers.{bid}.mlp.fc1", "model.layers.{bid}.mlp.fc1", # phi2
"model.layers.{bid}.mlp.gate_up_proj", "model.layers.{bid}.mlp.gate_up_proj", # phi3
"model.layers.layers.{bid}.mlp.up_proj", "model.layers.layers.{bid}.mlp.up_proj", # plamo
"model.layers.{bid}.feed_forward.w3", "model.layers.{bid}.feed_forward.w3", # internlm2
"encoder.layers.{bid}.mlp.fc11", "encoder.layers.{bid}.mlp.fc11", # nomic-bert
"model.layers.{bid}.mlp.c_fc", "model.layers.{bid}.mlp.c_fc", # starcoder2
"encoder.layer.{bid}.mlp.gated_layers_v", "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
"model.layers.{bid}.residual_mlp.w3", "model.layers.{bid}.residual_mlp.w3", # arctic
"encoder.layers.{bid}.mlp.dense_h_to_4h", "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
"transformer.h.{bid}.mlp.c_fc_1", "transformer.h.{bid}.mlp.c_fc_1", # exaone
), ),
MODEL_TENSOR.FFN_UP_EXP: ( MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.w3", "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_v", "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
"model.layers.{bid}.mlp.experts.up_proj", "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
), ),
MODEL_TENSOR.FFN_UP_SHEXP: ( MODEL_TENSOR.FFN_UP_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.up_proj", "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.up_proj", "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
), ),
MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), # AWQ-activation gate
MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), # mpt
# Feed-forward gate
MODEL_TENSOR.FFN_GATE: ( MODEL_TENSOR.FFN_GATE: (
"model.layers.{bid}.mlp.gate_proj", "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2
"layers.{bid}.feed_forward.w1", "layers.{bid}.feed_forward.w1", # llama-pth
"transformer.h.{bid}.mlp.w2", "transformer.h.{bid}.mlp.w2", # qwen
"transformer.h.{bid}.mlp.c_fc2", "transformer.h.{bid}.mlp.c_fc2", # jais
"model.layers.layers.{bid}.mlp.gate_proj", "model.layers.layers.{bid}.mlp.gate_proj", # plamo
"model.layers.{bid}.feed_forward.w1", "model.layers.{bid}.feed_forward.w1", # internlm2
"encoder.layers.{bid}.mlp.fc12", "encoder.layers.{bid}.mlp.fc12", # nomic-bert
"encoder.layer.{bid}.mlp.gated_layers_w", "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
"transformer.h.{bid}.mlp.linear_1", "transformer.h.{bid}.mlp.linear_1", # refact
"model.layers.{bid}.residual_mlp.w1", "model.layers.{bid}.residual_mlp.w1", # arctic
"transformer.h.{bid}.mlp.c_fc_0", "transformer.h.{bid}.mlp.c_fc_0", # exaone
), ),
MODEL_TENSOR.FFN_GATE_EXP: ( MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.w1", "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear", "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
"model.layers.{bid}.mlp.experts.gate_proj", "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
), ),
MODEL_TENSOR.FFN_GATE_SHEXP: ( MODEL_TENSOR.FFN_GATE_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.gate_proj", "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.gate_proj", "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
), ),
# Feed-forward down
MODEL_TENSOR.FFN_DOWN: ( MODEL_TENSOR.FFN_DOWN: (
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
"transformer.h.{bid}.mlp.c_proj", "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen jais
"transformer.blocks.{bid}.ffn.down_proj", "transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"h.{bid}.mlp.dense_4h_to_h", "h.{bid}.mlp.dense_4h_to_h", # bloom
"model.layers.{bid}.mlp.down_proj", "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
"layers.{bid}.feed_forward.w2", "layers.{bid}.feed_forward.w2", # llama-pth
"encoder.layer.{bid}.output.dense", "encoder.layer.{bid}.output.dense", # bert
"transformer.h.{bid}.mlp.fc_out", "transformer.h.{bid}.mlp.fc_out", # gpt-j
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
"model.layers.{bid}.mlp.dense_4h_to_h", "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon
"h.{bid}.mlp.c_proj", "h.{bid}.mlp.c_proj", # gpt2
"transformer.h.{bid}.mlp.fc2", "transformer.h.{bid}.mlp.fc2", # phi2
"model.layers.{bid}.mlp.fc2", "model.layers.{bid}.mlp.fc2", # phi2
"model.layers.layers.{bid}.mlp.down_proj", "model.layers.layers.{bid}.mlp.down_proj", # plamo
"model.layers.{bid}.feed_forward.w2", "model.layers.{bid}.feed_forward.w2", # internlm2
"encoder.layers.{bid}.mlp.fc2", "encoder.layers.{bid}.mlp.fc2", # nomic-bert
"model.layers.{bid}.mlp.c_proj", "model.layers.{bid}.mlp.c_proj", # starcoder2
"encoder.layer.{bid}.mlp.wo", "encoder.layer.{bid}.mlp.wo", # jina-bert-v2
"transformer.layers.{bid}.ffn.proj_2", "transformer.layers.{bid}.ffn.proj_2", # openelm
"model.layers.{bid}.residual_mlp.w2", "model.layers.{bid}.residual_mlp.w2", # arctic
"encoder.layer.{bid}.mlp.down_layer", "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
"encoder.layers.{bid}.mlp.dense_4h_to_h", "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
"model.layers.h.{bid}.mlp.c_proj", "model.layers.h.{bid}.mlp.c_proj", # exaone
), ),
MODEL_TENSOR.FFN_DOWN_EXP: ( MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.w2", "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_1", "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w2", "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
"model.layers.{bid}.mlp.experts.down_proj", "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
"model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
), ),
MODEL_TENSOR.FFN_DOWN_SHEXP: ( MODEL_TENSOR.FFN_DOWN_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.down_proj", "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.down_proj", "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
), ),
MODEL_TENSOR.ATTN_Q_NORM: ( MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm", "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
"model.layers.{bid}.self_attn.q_layernorm", "model.layers.{bid}.self_attn.q_layernorm", # persimmon
"model.layers.{bid}.self_attn.q_norm", "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2
"transformer.blocks.{bid}.attn.q_ln", "transformer.blocks.{bid}.attn.q_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_q", "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
"transformer.layers.{bid}.attn.q_norm", "transformer.layers.{bid}.attn.q_norm", # openelm
), ),
MODEL_TENSOR.ATTN_K_NORM: ( MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm", "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", "model.layers.{bid}.self_attn.k_layernorm", # persimmon
"model.layers.{bid}.self_attn.k_norm", "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2
"transformer.blocks.{bid}.attn.k_ln", "transformer.blocks.{bid}.attn.k_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_k", "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
"transformer.layers.{bid}.attn.k_norm", "transformer.layers.{bid}.attn.k_norm", # openelm
), ),
MODEL_TENSOR.ROPE_FREQS: ( MODEL_TENSOR.ROPE_FREQS: (
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
), ),
MODEL_TENSOR.LAYER_OUT_NORM: ( MODEL_TENSOR.LAYER_OUT_NORM: (
"encoder.layer.{bid}.output.LayerNorm", "encoder.layer.{bid}.output.LayerNorm", # bert
"encoder.layers.{bid}.norm2", "encoder.layers.{bid}.norm2", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_3", "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
"encoder.layer.{bid}.mlp.layernorm", "encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2
"encoder.layer.{bid}.layer_norm_2", "encoder.layer.{bid}.layer_norm_2", # jina-v2-code
), ),
MODEL_TENSOR.SSM_IN: ( MODEL_TENSOR.SSM_IN: (
"model.layers.{bid}.in_proj", "model.layers.{bid}.in_proj",
"backbone.layers.{bid}.mixer.in_proj", "backbone.layers.{bid}.mixer.in_proj",
"model.layers.{bid}.mamba.in_proj",
), ),
MODEL_TENSOR.SSM_CONV1D: ( MODEL_TENSOR.SSM_CONV1D: (
"model.layers.{bid}.conv1d", "model.layers.{bid}.conv1d",
"backbone.layers.{bid}.mixer.conv1d", "backbone.layers.{bid}.mixer.conv1d",
"model.layers.{bid}.mamba.conv1d",
), ),
MODEL_TENSOR.SSM_X: ( MODEL_TENSOR.SSM_X: (
"model.layers.{bid}.x_proj", "model.layers.{bid}.x_proj",
"backbone.layers.{bid}.mixer.x_proj", "backbone.layers.{bid}.mixer.x_proj",
"model.layers.{bid}.mamba.x_proj",
), ),
MODEL_TENSOR.SSM_DT: ( MODEL_TENSOR.SSM_DT: (
"model.layers.{bid}.dt_proj", "model.layers.{bid}.dt_proj",
"backbone.layers.{bid}.mixer.dt_proj", "backbone.layers.{bid}.mixer.dt_proj",
"model.layers.{bid}.mamba.dt_proj",
), ),
MODEL_TENSOR.SSM_DT_NORM: ("model.layers.{bid}.mamba.dt_layernorm",),
MODEL_TENSOR.SSM_A: ( MODEL_TENSOR.SSM_A: (
"model.layers.{bid}.A_log", "model.layers.{bid}.A_log",
"backbone.layers.{bid}.mixer.A_log", "backbone.layers.{bid}.mixer.A_log",
"model.layers.{bid}.mamba.A_log",
),
MODEL_TENSOR.SSM_B_NORM: (
"model.layers.{bid}.mamba.b_layernorm",
"model.layers.{bid}.mamba.B_layernorm",
),
MODEL_TENSOR.SSM_C_NORM: (
"model.layers.{bid}.mamba.c_layernorm",
"model.layers.{bid}.mamba.C_layernorm",
), ),
MODEL_TENSOR.SSM_D: ( MODEL_TENSOR.SSM_D: (
"model.layers.{bid}.D", "model.layers.{bid}.D",
"backbone.layers.{bid}.mixer.D", "backbone.layers.{bid}.mixer.D",
"model.layers.{bid}.mamba.D",
), ),
MODEL_TENSOR.SSM_OUT: ( MODEL_TENSOR.SSM_OUT: (
"model.layers.{bid}.out_proj", "model.layers.{bid}.out_proj",
"backbone.layers.{bid}.mixer.out_proj", "backbone.layers.{bid}.mixer.out_proj",
"model.layers.{bid}.mamba.out_proj",
), ),
MODEL_TENSOR.TIME_MIX_W1: ("rwkv.blocks.{bid}.attention.time_maa_w1",), MODEL_TENSOR.TIME_MIX_W1: (
MODEL_TENSOR.TIME_MIX_W2: ("rwkv.blocks.{bid}.attention.time_maa_w2",), "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv v6
MODEL_TENSOR.TIME_MIX_LERP_X: ("rwkv.blocks.{bid}.attention.time_maa_x",), ),
MODEL_TENSOR.TIME_MIX_LERP_K: ("rwkv.blocks.{bid}.attention.time_maa_k",), MODEL_TENSOR.TIME_MIX_W2: (
MODEL_TENSOR.TIME_MIX_LERP_V: ("rwkv.blocks.{bid}.attention.time_maa_v",), "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv v6
MODEL_TENSOR.TIME_MIX_LERP_R: ("rwkv.blocks.{bid}.attention.time_maa_r",), ),
MODEL_TENSOR.TIME_MIX_LERP_G: ("rwkv.blocks.{bid}.attention.time_maa_g",), MODEL_TENSOR.TIME_MIX_LERP_X: (
MODEL_TENSOR.TIME_MIX_LERP_W: ("rwkv.blocks.{bid}.attention.time_maa_w",), "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv v6
MODEL_TENSOR.TIME_MIX_FIRST: ("rwkv.blocks.{bid}.attention.time_faaaa",), ),
MODEL_TENSOR.TIME_MIX_DECAY: ("rwkv.blocks.{bid}.attention.time_decay",), MODEL_TENSOR.TIME_MIX_LERP_K: (
MODEL_TENSOR.TIME_MIX_DECAY_W1: ("rwkv.blocks.{bid}.attention.time_decay_w1",), "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv v6
MODEL_TENSOR.TIME_MIX_DECAY_W2: ("rwkv.blocks.{bid}.attention.time_decay_w2",), ),
MODEL_TENSOR.TIME_MIX_KEY: ("rwkv.blocks.{bid}.attention.key",), MODEL_TENSOR.TIME_MIX_LERP_V: (
MODEL_TENSOR.TIME_MIX_VALUE: ("rwkv.blocks.{bid}.attention.value",), "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv v6
MODEL_TENSOR.TIME_MIX_RECEPTANCE: ("rwkv.blocks.{bid}.attention.receptance",), ),
MODEL_TENSOR.TIME_MIX_GATE: ("rwkv.blocks.{bid}.attention.gate",), MODEL_TENSOR.TIME_MIX_LERP_R: (
MODEL_TENSOR.TIME_MIX_LN: ("rwkv.blocks.{bid}.attention.ln_x",), "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv v6
MODEL_TENSOR.TIME_MIX_OUTPUT: ("rwkv.blocks.{bid}.attention.output",), ),
MODEL_TENSOR.CHANNEL_MIX_LERP_K: ("rwkv.blocks.{bid}.feed_forward.time_maa_k",), MODEL_TENSOR.TIME_MIX_LERP_G: (
MODEL_TENSOR.CHANNEL_MIX_LERP_R: ("rwkv.blocks.{bid}.feed_forward.time_maa_r",), "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv v6
MODEL_TENSOR.CHANNEL_MIX_KEY: ("rwkv.blocks.{bid}.feed_forward.key",), ),
MODEL_TENSOR.TIME_MIX_LERP_W: (
"rwkv.blocks.{bid}.attention.time_maa_w", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_FIRST: (
"rwkv.blocks.{bid}.attention.time_faaaa", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_DECAY: (
"rwkv.blocks.{bid}.attention.time_decay", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_DECAY_W1: (
"rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_DECAY_W2: (
"rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_KEY: ("rwkv.blocks.{bid}.attention.key",), # rwkv
MODEL_TENSOR.TIME_MIX_VALUE: ("rwkv.blocks.{bid}.attention.value",), # rwkv
MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
"rwkv.blocks.{bid}.attention.receptance", # rwkv
),
MODEL_TENSOR.TIME_MIX_GATE: ("rwkv.blocks.{bid}.attention.gate",), # rwkv
MODEL_TENSOR.TIME_MIX_LN: ("rwkv.blocks.{bid}.attention.ln_x",), # rwkv
MODEL_TENSOR.TIME_MIX_OUTPUT: ("rwkv.blocks.{bid}.attention.output",), # rwkv
MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
"rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv v6
),
MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
"rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6
),
MODEL_TENSOR.CHANNEL_MIX_KEY: ("rwkv.blocks.{bid}.feed_forward.key",), # rwkv
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: ( MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
"rwkv.blocks.{bid}.feed_forward.receptance", "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv
), ),
MODEL_TENSOR.CHANNEL_MIX_VALUE: ("rwkv.blocks.{bid}.feed_forward.value",), MODEL_TENSOR.CHANNEL_MIX_VALUE: (
MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), "rwkv.blocks.{bid}.feed_forward.value", # rwkv
MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), ),
MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), # deepseek2
MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), # deepseek2
MODEL_TENSOR.ATTN_KV_A_MQA: ( MODEL_TENSOR.ATTN_KV_A_MQA: (
"model.layers.{bid}.self_attn.kv_a_proj_with_mqa", "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2
),
MODEL_TENSOR.ATTN_KV_B: (
"model.layers.{bid}.self_attn.kv_b_proj", # deepseek2
),
MODEL_TENSOR.ATTN_Q_A_NORM: (
"model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
),
MODEL_TENSOR.ATTN_KV_A_NORM: (
"model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
),
MODEL_TENSOR.ATTN_SUB_NORM: (
"model.layers.{bid}.self_attn.inner_attn_ln", # bitnet
),
MODEL_TENSOR.FFN_SUB_NORM: ("model.layers.{bid}.mlp.ffn_layernorm",), # bitnet
MODEL_TENSOR.DEC_ATTN_NORM: ("decoder.block.{bid}.layer.0.layer_norm",), # t5
MODEL_TENSOR.DEC_ATTN_Q: ("decoder.block.{bid}.layer.0.SelfAttention.q",), # t5
MODEL_TENSOR.DEC_ATTN_K: ("decoder.block.{bid}.layer.0.SelfAttention.k",), # t5
MODEL_TENSOR.DEC_ATTN_V: ("decoder.block.{bid}.layer.0.SelfAttention.v",), # t5
MODEL_TENSOR.DEC_ATTN_OUT: (
"decoder.block.{bid}.layer.0.SelfAttention.o", # t5
), ),
MODEL_TENSOR.ATTN_KV_B: ("model.layers.{bid}.self_attn.kv_b_proj",),
MODEL_TENSOR.ATTN_Q_A_NORM: ("model.layers.{bid}.self_attn.q_a_layernorm",),
MODEL_TENSOR.ATTN_KV_A_NORM: ("model.layers.{bid}.self_attn.kv_a_layernorm",),
MODEL_TENSOR.ATTN_SUB_NORM: ("model.layers.{bid}.self_attn.inner_attn_ln",),
MODEL_TENSOR.FFN_SUB_NORM: ("model.layers.{bid}.mlp.ffn_layernorm",),
MODEL_TENSOR.DEC_ATTN_NORM: ("decoder.block.{bid}.layer.0.layer_norm",),
MODEL_TENSOR.DEC_ATTN_Q: ("decoder.block.{bid}.layer.0.SelfAttention.q",),
MODEL_TENSOR.DEC_ATTN_K: ("decoder.block.{bid}.layer.0.SelfAttention.k",),
MODEL_TENSOR.DEC_ATTN_V: ("decoder.block.{bid}.layer.0.SelfAttention.v",),
MODEL_TENSOR.DEC_ATTN_OUT: ("decoder.block.{bid}.layer.0.SelfAttention.o",),
MODEL_TENSOR.DEC_ATTN_REL_B: ( MODEL_TENSOR.DEC_ATTN_REL_B: (
"decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
"decoder.block.{bid}.layer.1.layer_norm", # t5
), ),
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: ("decoder.block.{bid}.layer.1.layer_norm",),
MODEL_TENSOR.DEC_CROSS_ATTN_Q: ( MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
"decoder.block.{bid}.layer.1.EncDecAttention.q", "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5
), ),
MODEL_TENSOR.DEC_CROSS_ATTN_K: ( MODEL_TENSOR.DEC_CROSS_ATTN_K: (
"decoder.block.{bid}.layer.1.EncDecAttention.k", "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5
), ),
MODEL_TENSOR.DEC_CROSS_ATTN_V: ( MODEL_TENSOR.DEC_CROSS_ATTN_V: (
"decoder.block.{bid}.layer.1.EncDecAttention.v", "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5
), ),
MODEL_TENSOR.DEC_CROSS_ATTN_OUT: ( MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
"decoder.block.{bid}.layer.1.EncDecAttention.o", "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5
), ),
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: ( MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
"decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5
),
MODEL_TENSOR.DEC_FFN_NORM: ("decoder.block.{bid}.layer.2.layer_norm",), # t5
MODEL_TENSOR.DEC_FFN_GATE: (
"decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
), ),
MODEL_TENSOR.DEC_FFN_NORM: ("decoder.block.{bid}.layer.2.layer_norm",),
MODEL_TENSOR.DEC_FFN_GATE: ("decoder.block.{bid}.layer.2.DenseReluDense.wi_0",),
MODEL_TENSOR.DEC_FFN_UP: ( MODEL_TENSOR.DEC_FFN_UP: (
"decoder.block.{bid}.layer.2.DenseReluDense.wi", "decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5
"decoder.block.{bid}.layer.2.DenseReluDense.wi_1", "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
),
MODEL_TENSOR.DEC_FFN_DOWN: (
"decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5
),
MODEL_TENSOR.DEC_OUTPUT_NORM: ("decoder.final_layer_norm",), # t5
MODEL_TENSOR.ENC_ATTN_NORM: ("encoder.block.{bid}.layer.0.layer_norm",), # t5
MODEL_TENSOR.ENC_ATTN_Q: ("encoder.block.{bid}.layer.0.SelfAttention.q",), # t5
MODEL_TENSOR.ENC_ATTN_K: ("encoder.block.{bid}.layer.0.SelfAttention.k",), # t5
MODEL_TENSOR.ENC_ATTN_V: ("encoder.block.{bid}.layer.0.SelfAttention.v",), # t5
MODEL_TENSOR.ENC_ATTN_OUT: (
"encoder.block.{bid}.layer.0.SelfAttention.o", # t5
), ),
MODEL_TENSOR.DEC_FFN_DOWN: ("decoder.block.{bid}.layer.2.DenseReluDense.wo",),
MODEL_TENSOR.DEC_OUTPUT_NORM: ("decoder.final_layer_norm",),
MODEL_TENSOR.ENC_ATTN_NORM: ("encoder.block.{bid}.layer.0.layer_norm",),
MODEL_TENSOR.ENC_ATTN_Q: ("encoder.block.{bid}.layer.0.SelfAttention.q",),
MODEL_TENSOR.ENC_ATTN_K: ("encoder.block.{bid}.layer.0.SelfAttention.k",),
MODEL_TENSOR.ENC_ATTN_V: ("encoder.block.{bid}.layer.0.SelfAttention.v",),
MODEL_TENSOR.ENC_ATTN_OUT: ("encoder.block.{bid}.layer.0.SelfAttention.o",),
MODEL_TENSOR.ENC_ATTN_REL_B: ( MODEL_TENSOR.ENC_ATTN_REL_B: (
"encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
),
MODEL_TENSOR.ENC_FFN_NORM: ("encoder.block.{bid}.layer.1.layer_norm",), # t5
MODEL_TENSOR.ENC_FFN_GATE: (
"encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
), ),
MODEL_TENSOR.ENC_FFN_NORM: ("encoder.block.{bid}.layer.1.layer_norm",),
MODEL_TENSOR.ENC_FFN_GATE: ("encoder.block.{bid}.layer.1.DenseReluDense.wi_0",),
MODEL_TENSOR.ENC_FFN_UP: ( MODEL_TENSOR.ENC_FFN_UP: (
"encoder.block.{bid}.layer.1.DenseReluDense.wi", "encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5
"encoder.block.{bid}.layer.1.DenseReluDense.wi_1", "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
),
MODEL_TENSOR.ENC_FFN_DOWN: (
"encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
),
############################################################################
# TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg
MODEL_TENSOR.ENC_OUTPUT_NORM: ("encoder.final_layer_norm",), # t5
MODEL_TENSOR.CLS: (
"classifier", # jina
"classifier.dense", # roberta
),
MODEL_TENSOR.CLS_OUT: ("classifier.out_proj",), # roberta
#############################################################################
MODEL_TENSOR.CONVNEXT_DW: ("backbone.convnext.{bid}.dwconv",), # wavtokenizer
MODEL_TENSOR.CONVNEXT_NORM: ("backbone.convnext.{bid}.norm",), # wavtokenizer
MODEL_TENSOR.CONVNEXT_PW1: ("backbone.convnext.{bid}.pwconv1",), # wavtokenizer
MODEL_TENSOR.CONVNEXT_PW2: ("backbone.convnext.{bid}.pwconv2",), # wavtokenizer
MODEL_TENSOR.CONVNEXT_GAMMA: ("backbone.convnext.{bid}.gamma",), # wavtokenizer
MODEL_TENSOR.POSNET_CONV1: ("backbone.posnet.{bid}.conv1",), # wavtokenizer
MODEL_TENSOR.POSNET_CONV2: ("backbone.posnet.{bid}.conv2",), # wavtokenizer
MODEL_TENSOR.POSNET_NORM: ("backbone.posnet.{bid}.norm",), # wavtokenizer
MODEL_TENSOR.POSNET_NORM1: ("backbone.posnet.{bid}.norm1",), # wavtokenizer
MODEL_TENSOR.POSNET_NORM2: ("backbone.posnet.{bid}.norm2",), # wavtokenizer
MODEL_TENSOR.POSNET_ATTN_NORM: ("backbone.posnet.{bid}.norm",), # wavtokenizer
MODEL_TENSOR.POSNET_ATTN_Q: ("backbone.posnet.{bid}.q",), # wavtokenizer
MODEL_TENSOR.POSNET_ATTN_K: ("backbone.posnet.{bid}.k",), # wavtokenizer
MODEL_TENSOR.POSNET_ATTN_V: ("backbone.posnet.{bid}.v",), # wavtokenizer
MODEL_TENSOR.POSNET_ATTN_OUT: (
"backbone.posnet.{bid}.proj_out", # wavtokenizer
), ),
MODEL_TENSOR.ENC_FFN_DOWN: ("encoder.block.{bid}.layer.1.DenseReluDense.wo",),
MODEL_TENSOR.ENC_OUTPUT_NORM: ("encoder.final_layer_norm",),
} }
# architecture-specific block mappings
arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
    MODEL_ARCH.ARCTIC: {
        MODEL_TENSOR.FFN_NORM: ("model.layers.{bid}.residual_layernorm",),
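Note on the mappings above: the `{bid}` placeholder is the block (layer) index; at conversion time each template is expanded with the concrete block number so checkpoint tensor names can be matched to their canonical GGUF tensors. A minimal standalone sketch of that expansion follows; the names in it are illustrative only, while the real lookup in gguf-py (`TensorNameMap`) additionally handles weight/bias suffixes and the architecture-specific overrides shown above.

# Illustrative sketch only - not the actual gguf API.
block_templates = {
    "FFN_DOWN_EXP": (
        "layers.{bid}.feed_forward.experts.w2",
        "model.layers.{bid}.mlp.experts.down_proj",
    ),
}

def build_lookup(n_blocks: int) -> dict[str, str]:
    # expand every template for every block index into a flat
    # "checkpoint tensor name -> canonical tensor name" table
    lookup: dict[str, str] = {}
    for canonical, templates in block_templates.items():
        for bid in range(n_blocks):
            for template in templates:
                lookup[template.format(bid=bid)] = f"{canonical}.{bid}"
    return lookup

print(build_lookup(2)["model.layers.1.mlp.experts.down_proj"])  # FFN_DOWN_EXP.1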

View File

@ -157,8 +157,36 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
        tokenizer = json.load(f)
    if self.load_merges:
        merges = tokenizer.get("model", {}).get("merges")
        if isinstance(merges, list) and merges:
            if isinstance(merges[0], str):
                self.merges = merges
            elif (
                isinstance(merges[0], list)
                and len(merges[0]) == 2
                and isinstance(merges[0][0], str)
            ):
                # New format since transformers 4.45 to support spaces in merges
                # ref: https://github.com/ggerganov/llama.cpp/issues/9692
                # TODO: internally store as the new format instead of converting to old
                if any(" " in s for pair in merges for s in pair):
                    logger.warning(
                        f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}'
                    )
                self.merges = [
                    " ".join(
                        [
                            # ensure the spaces are properly encoded
                            "".join(
                                chr(ord(c) + 256) if c == " " else c
                                for c in part
                            )
                            for part in pair
                        ]
                    )
                    for pair in merges
                ]
            else:
                raise ValueError("Unknown tokenizer merges format")
        added_tokens = tokenizer.get("added_tokens", {})
    else:
        added_tokens = {}
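For context on the new branch above: transformers >= 4.45 stores merges as [left, right] pairs so that merge parts may contain literal spaces, while the legacy format is a single "left right" string. The conversion therefore remaps any space inside a part to chr(ord(" ") + 256) ("Ġ") before joining, keeping the separator space unambiguous. A standalone sketch of that conversion with a toy merges list (not part of the diff):

# Toy illustration of the pair-to-string merge conversion used above.
pair_merges = [["t", "h"], ["th", "e"], [" ", "a"]]  # hypothetical input

def encode_part(part: str) -> str:
    # shift literal spaces by 256 code points so the joining space stays unambiguous
    return "".join(chr(ord(c) + 256) if c == " " else c for c in part)

legacy_merges = [" ".join(encode_part(part) for part in pair) for pair in pair_merges]
print(legacy_merges)  # ['t h', 'th e', 'Ġ a']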
@ -225,7 +253,6 @@ class Vocab(BaseVocab, Protocol):
    fname_tokenizer: Path
    def __init__(self, base_path: Path): ...
    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
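For reference, these two members are what downstream conversion code relies on; a hypothetical stub satisfying this slice of the Protocol (the full class in the file may require further attributes) might look like:

# Hypothetical stub, for illustration only.
from pathlib import Path
from typing import Iterable

import gguf

class DummyVocab:
    def __init__(self, base_path: Path):
        self.fname_tokenizer = base_path
        self._tokens = [b"<pad>", b"hello", b"world"]

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        # every token gets a placeholder score and is marked as a normal token
        for text in self._tokens:
            yield text, 0.0, gguf.TokenType.NORMAL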

View File

@ -80,11 +80,15 @@ def load_dotenv(self=Any) -> None:
def show_about(self) -> None:
    about_text = f"""AutoGGUF

Version: {AUTOGGUF_VERSION}

A tool for managing and converting GGUF models.
This application is licensed under the Apache License 2.0.
Copyright (c) 2025 leafspark.
It also utilizes llama.cpp, licensed under the MIT License.
Copyright (c) 2023-2024 The ggml authors."""
    QMessageBox.about(self, "About AutoGGUF", about_text)