refactor(ggml): update safetensor conversion scripts

Author: BuildTools
Date:   2025-03-22 09:41:54 -07:00
Commit: b4817eee06 (parent c9c2b04534)
Signature: GPG Key ID 3270C066C15D530B (no known key found for this signature in database)

10 changed files with 1295 additions and 644 deletions

(File diff suppressed because it is too large.)


@ -27,7 +27,6 @@
import gguf import gguf
# reuse model definitions from convert_hf_to_gguf.py
from convert_hf_to_gguf import LazyTorchTensor, Model from convert_hf_to_gguf import LazyTorchTensor, Model
logger = logging.getLogger("lora-to-gguf") logger = logging.getLogger("lora-to-gguf")
@ -39,10 +38,9 @@ class PartialLoraTensor:
B: Tensor | None = None B: Tensor | None = None
# magic to support tensor shape modifications and splitting
class LoraTorchTensor: class LoraTorchTensor:
_lora_A: Tensor # (n_rank, row_size) _lora_A: Tensor
_lora_B: Tensor # (col_size, n_rank) _lora_B: Tensor
_rank: int _rank: int
def __init__(self, A: Tensor, B: Tensor): def __init__(self, A: Tensor, B: Tensor):
@ -60,20 +58,14 @@ def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
def __getitem__( def __getitem__(
self, self,
indices: ( indices: SupportsIndex | slice | tuple[SupportsIndex | slice | Tensor, ...],
SupportsIndex
| slice
| tuple[
SupportsIndex | slice | Tensor, ...
] # TODO: add ellipsis in the type signature
),
) -> LoraTorchTensor: ) -> LoraTorchTensor:
shape = self.shape shape = self.shape
if isinstance(indices, SupportsIndex): if isinstance(indices, SupportsIndex):
if len(shape) > 2: if len(shape) > 2:
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
else: else:
raise NotImplementedError # can't return a vector raise NotImplementedError
elif isinstance(indices, slice): elif isinstance(indices, slice):
if len(shape) > 2: if len(shape) > 2:
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
@ -83,7 +75,7 @@ def __getitem__(
assert len(indices) > 0 assert len(indices) > 0
if indices[-1] is Ellipsis: if indices[-1] is Ellipsis:
return self[indices[:-1]] return self[indices[:-1]]
# expand ellipsis
indices = tuple( indices = tuple(
u u
for v in ( for v in (
@ -103,7 +95,6 @@ def __getitem__(
*(slice(None, None) for _ in range(len(indices), len(shape))), *(slice(None, None) for _ in range(len(indices), len(shape))),
) )
# TODO: make sure this is correct
indices_A = ( indices_A = (
*( *(
( (
@ -119,7 +110,7 @@ def __getitem__(
indices_B = indices[:-1] indices_B = indices[:-1]
return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B]) return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
else: else:
raise NotImplementedError # unknown indice type raise NotImplementedError
@property @property
def dtype(self) -> torch.dtype: def dtype(self) -> torch.dtype:
@ -142,9 +133,8 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
new_shape = cast(tuple[int, ...], shape) new_shape = cast(tuple[int, ...], shape)
orig_shape = self.shape orig_shape = self.shape
if len(new_shape) < 2: if len(new_shape) < 2:
raise NotImplementedError # can't become a vector raise NotImplementedError
# expand -1 in the shape
if any(dim == -1 for dim in new_shape): if any(dim == -1 for dim in new_shape):
n_elems = prod(orig_shape) n_elems = prod(orig_shape)
n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape) n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
@ -154,7 +144,7 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
) )
if new_shape[-1] != orig_shape[-1]: if new_shape[-1] != orig_shape[-1]:
raise NotImplementedError # can't reshape the row size trivially raise NotImplementedError
shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1]) shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
shape_B = (*new_shape[:-1], self._rank) shape_B = (*new_shape[:-1], self._rank)
@ -173,7 +163,7 @@ def permute(self, *dims: int) -> LoraTorchTensor:
shape = self.shape shape = self.shape
dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims) dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
if dims[-1] == -1: if dims[-1] == -1:
# TODO: support higher dimensional A shapes bigger than 1
assert all(dim == 1 for dim in self._lora_A.shape[:-2]) assert all(dim == 1 for dim in self._lora_A.shape[:-2])
return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims)) return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1: if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
@ -181,7 +171,7 @@ def permute(self, *dims: int) -> LoraTorchTensor:
self._lora_B.permute(*dims), self._lora_A.permute(*dims) self._lora_B.permute(*dims), self._lora_A.permute(*dims)
) )
else: else:
# TODO: compose the above two
raise NotImplementedError raise NotImplementedError
def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor: def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
@ -200,7 +190,7 @@ def to(self, *args, **kwargs):
@classmethod @classmethod
def __torch_function__(cls, func: Callable, types, args=(), kwargs=None): def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
del types # unused del types
if kwargs is None: if kwargs is None:
kwargs = {} kwargs = {}
@ -241,7 +231,7 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
base_name = lora_tensor_name.replace("base_model.model.", "") base_name = lora_tensor_name.replace("base_model.model.", "")
base_name = base_name.replace(".lora_A.weight", ".weight") base_name = base_name.replace(".lora_A.weight", ".weight")
base_name = base_name.replace(".lora_B.weight", ".weight") base_name = base_name.replace(".lora_B.weight", ".weight")
# models produced by mergekit-extract-lora have token embeddings in the adapter
base_name = base_name.replace(".lora_embedding_A", ".weight") base_name = base_name.replace(".lora_embedding_A", ".weight")
base_name = base_name.replace(".lora_embedding_B", ".weight") base_name = base_name.replace(".lora_embedding_B", ".weight")
return base_name return base_name
@ -303,7 +293,7 @@ def parse_args() -> argparse.Namespace:
def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
# normally, adapter does not come with base model config, we need to load it from AutoConfig
config = AutoConfig.from_pretrained(hf_model_id) config = AutoConfig.from_pretrained(hf_model_id)
return config.to_dict() return config.to_dict()
@ -331,11 +321,11 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
if args.outfile is not None: if args.outfile is not None:
fname_out = args.outfile fname_out = args.outfile
else: else:
# output in the same directory as the model by default
fname_out = dir_lora fname_out = dir_lora
if os.path.exists(input_model): if os.path.exists(input_model):
# lazy import load_file only if lora is in safetensors format.
from safetensors.torch import load_file from safetensors.torch import load_file
lora_model = load_file(input_model, device="cpu") lora_model = load_file(input_model, device="cpu")
@ -343,11 +333,9 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
input_model = os.path.join(dir_lora, "adapter_model.bin") input_model = os.path.join(dir_lora, "adapter_model.bin")
lora_model = torch.load(input_model, map_location="cpu", weights_only=True) lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
# load LoRA config
with open(lora_config, "r") as f: with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f) lparams: dict[str, Any] = json.load(f)
# load base model
if base_model_id is not None: if base_model_id is not None:
logger.info(f"Loading base model from Hugging Face: {base_model_id}") logger.info(f"Loading base model from Hugging Face: {base_model_id}")
hparams = load_hparams_from_hf(base_model_id) hparams = load_hparams_from_hf(base_model_id)
@ -409,7 +397,7 @@ def set_gguf_parameters(self):
) )
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters
return () return ()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]: def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
@ -419,13 +407,13 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
if self.lazy: if self.lazy:
tensor = LazyTorchTensor.from_eager(tensor) tensor = LazyTorchTensor.from_eager(tensor)
base_name = get_base_tensor_name(name) base_name = get_base_tensor_name(name)
# note: mergekit-extract-lora also adds token embeddings to the adapter
is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
if not is_lora_a and not is_lora_b: if not is_lora_a and not is_lora_b:
if ".base_layer.weight" in name: if ".base_layer.weight" in name:
continue continue
# mergekit-extract-lora add these layernorm to the adapter, we need to keep them
if "_layernorm" in name or ".norm" in name: if "_layernorm" in name or ".norm" in name:
yield (base_name, tensor) yield (base_name, tensor)
continue continue
@ -437,7 +425,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
"Embeddings is present in the adapter. This can be due to new tokens added during fine tuning" "Embeddings is present in the adapter. This can be due to new tokens added during fine tuning"
) )
logger.error( logger.error(
"Please refer to https://github.com/ggerganov/llama.cpp/pull/9948" "Please refer to https://github.com/ggml-org/llama.cpp/pull/9948"
) )
sys.exit(1) sys.exit(1)
@ -464,27 +452,21 @@ def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None self, data_torch: Tensor, name: str, bid: int | None
) -> Iterable[tuple[str, Tensor]]: ) -> Iterable[tuple[str, Tensor]]:
dest = list(super().modify_tensors(data_torch, name, bid)) dest = list(super().modify_tensors(data_torch, name, bid))
# some archs may have the same tensor for lm_head and output (tie word embeddings)
# in this case, adapters targeting lm_head will fail when using llama-export-lora
# therefore, we ignore them for now
# see: https://github.com/ggerganov/llama.cpp/issues/9065
if name == "lm_head.weight" and len(dest) == 0: if name == "lm_head.weight" and len(dest) == 0:
raise ValueError( raise ValueError(
"lm_head is present in adapter, but is ignored in base model" "lm_head is present in adapter, but is ignored in base model"
) )
for dest_name, dest_data in dest: for dest_name, dest_data in dest:
# mergekit-extract-lora add these layernorm to the adapter
if "_norm" in dest_name: if "_norm" in dest_name:
assert dest_data.dim() == 1 assert dest_data.dim() == 1
yield (dest_name, dest_data) yield (dest_name, dest_data)
continue continue
# otherwise, we must get the lora_A and lora_B tensors
assert isinstance(dest_data, LoraTorchTensor) assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B() lora_a, lora_b = dest_data.get_lora_A_B()
# note: mergekit-extract-lora flip and transpose A and B
# here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
if "token_embd.weight" in dest_name: if "token_embd.weight" in dest_name:
lora_a = lora_a.T lora_a = lora_a.T
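For reference, a minimal sketch (not part of this diff) of how the two LoRA factors combine, using the shapes from the comments removed above (lora_A as (n_rank, row_size), lora_B as (col_size, n_rank)) and why token_embd.lora_a alone gets transposed:

import torch

n_rank, row_size, col_size = 8, 64, 32      # illustrative sizes, not from the commit
lora_a = torch.randn(n_rank, row_size)      # lora_A: (n_rank, row_size)
lora_b = torch.randn(col_size, n_rank)      # lora_B: (col_size, n_rank)

# The adapter's contribution to the base weight is the low-rank product B @ A
delta_w = lora_b @ lora_a                   # (col_size, row_size)
assert delta_w.shape == (col_size, row_size)

# mergekit-extract-lora stores the token-embedding factors flipped/transposed,
# which is why the conversion above transposes lora_a for token_embd.weight only.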


@ -119,6 +119,7 @@ class LLM:
TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
RESIDUAL_SCALE = "{arch}.residual_scale" RESIDUAL_SCALE = "{arch}.residual_scale"
EMBEDDING_SCALE = "{arch}.embedding_scale" EMBEDDING_SCALE = "{arch}.embedding_scale"
TOKEN_SHIFT_COUNT = "{arch}.token_shift_count"
class Attention: class Attention:
HEAD_COUNT = "{arch}.attention.head_count" HEAD_COUNT = "{arch}.attention.head_count"
@ -134,6 +135,10 @@ class Attention:
CAUSAL = "{arch}.attention.causal" CAUSAL = "{arch}.attention.causal"
Q_LORA_RANK = "{arch}.attention.q_lora_rank" Q_LORA_RANK = "{arch}.attention.q_lora_rank"
KV_LORA_RANK = "{arch}.attention.kv_lora_rank" KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
DECAY_LORA_RANK = "{arch}.attention.decay_lora_rank"
ICLR_LORA_RANK = "{arch}.attention.iclr_lora_rank"
VALUE_RESIDUAL_MIX_LORA_RANK = "{arch}.attention.value_residual_mix_lora_rank"
GATE_LORA_RANK = "{arch}.attention.gate_lora_rank"
REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
SLIDING_WINDOW = "{arch}.attention.sliding_window" SLIDING_WINDOW = "{arch}.attention.sliding_window"
SCALE = "{arch}.attention.scale" SCALE = "{arch}.attention.scale"
@ -189,7 +194,6 @@ class Tokenizer:
UNK_ID = "tokenizer.ggml.unknown_token_id" UNK_ID = "tokenizer.ggml.unknown_token_id"
SEP_ID = "tokenizer.ggml.seperator_token_id" SEP_ID = "tokenizer.ggml.seperator_token_id"
PAD_ID = "tokenizer.ggml.padding_token_id" PAD_ID = "tokenizer.ggml.padding_token_id"
CLS_ID = "tokenizer.ggml.cls_token_id"
MASK_ID = "tokenizer.ggml.mask_token_id" MASK_ID = "tokenizer.ggml.mask_token_id"
ADD_BOS = "tokenizer.ggml.add_bos_token" ADD_BOS = "tokenizer.ggml.add_bos_token"
ADD_EOS = "tokenizer.ggml.add_eos_token" ADD_EOS = "tokenizer.ggml.add_eos_token"
@ -251,6 +255,7 @@ class MODEL_ARCH(IntEnum):
QWEN2VL = auto() QWEN2VL = auto()
PHI2 = auto() PHI2 = auto()
PHI3 = auto() PHI3 = auto()
PHIMOE = auto()
PLAMO = auto() PLAMO = auto()
CODESHELL = auto() CODESHELL = auto()
ORION = auto() ORION = auto()
@ -259,8 +264,12 @@ class MODEL_ARCH(IntEnum):
MINICPM3 = auto() MINICPM3 = auto()
GEMMA = auto() GEMMA = auto()
GEMMA2 = auto() GEMMA2 = auto()
GEMMA3 = auto()
STARCODER2 = auto() STARCODER2 = auto()
RWKV6 = auto() RWKV6 = auto()
RWKV6QWEN2 = auto()
RWKV7 = auto()
ARWKV7 = auto()
MAMBA = auto() MAMBA = auto()
XVERSE = auto() XVERSE = auto()
COMMAND_R = auto() COMMAND_R = auto()
@ -333,13 +342,26 @@ class MODEL_TENSOR(IntEnum):
SSM_A = auto() SSM_A = auto()
SSM_D = auto() SSM_D = auto()
SSM_OUT = auto() SSM_OUT = auto()
TIME_MIX_W0 = auto()
TIME_MIX_W1 = auto() TIME_MIX_W1 = auto()
TIME_MIX_W2 = auto() TIME_MIX_W2 = auto()
TIME_MIX_A0 = auto()
TIME_MIX_A1 = auto()
TIME_MIX_A2 = auto()
TIME_MIX_V0 = auto()
TIME_MIX_V1 = auto()
TIME_MIX_V2 = auto()
TIME_MIX_G1 = auto()
TIME_MIX_G2 = auto()
TIME_MIX_K_K = auto()
TIME_MIX_K_A = auto()
TIME_MIX_R_K = auto()
TIME_MIX_LERP_X = auto() TIME_MIX_LERP_X = auto()
TIME_MIX_LERP_K = auto() TIME_MIX_LERP_K = auto()
TIME_MIX_LERP_V = auto() TIME_MIX_LERP_V = auto()
TIME_MIX_LERP_R = auto() TIME_MIX_LERP_R = auto()
TIME_MIX_LERP_G = auto() TIME_MIX_LERP_G = auto()
TIME_MIX_LERP_FUSED = auto()
TIME_MIX_LERP_W = auto() TIME_MIX_LERP_W = auto()
TIME_MIX_FIRST = auto() TIME_MIX_FIRST = auto()
TIME_MIX_DECAY = auto() TIME_MIX_DECAY = auto()
@ -435,6 +457,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.QWEN2VL: "qwen2vl", MODEL_ARCH.QWEN2VL: "qwen2vl",
MODEL_ARCH.PHI2: "phi2", MODEL_ARCH.PHI2: "phi2",
MODEL_ARCH.PHI3: "phi3", MODEL_ARCH.PHI3: "phi3",
MODEL_ARCH.PHIMOE: "phimoe",
MODEL_ARCH.PLAMO: "plamo", MODEL_ARCH.PLAMO: "plamo",
MODEL_ARCH.CODESHELL: "codeshell", MODEL_ARCH.CODESHELL: "codeshell",
MODEL_ARCH.ORION: "orion", MODEL_ARCH.ORION: "orion",
@ -443,8 +466,12 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.MINICPM3: "minicpm3", MODEL_ARCH.MINICPM3: "minicpm3",
MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA: "gemma",
MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.GEMMA2: "gemma2",
MODEL_ARCH.GEMMA3: "gemma3",
MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.STARCODER2: "starcoder2",
MODEL_ARCH.RWKV6: "rwkv6", MODEL_ARCH.RWKV6: "rwkv6",
MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2",
MODEL_ARCH.RWKV7: "rwkv7",
MODEL_ARCH.ARWKV7: "arwkv7",
MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.COMMAND_R: "command-r",
@ -517,13 +544,26 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0",
MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2",
MODEL_TENSOR.TIME_MIX_A0: "blk.{bid}.time_mix_a0",
MODEL_TENSOR.TIME_MIX_A1: "blk.{bid}.time_mix_a1",
MODEL_TENSOR.TIME_MIX_A2: "blk.{bid}.time_mix_a2",
MODEL_TENSOR.TIME_MIX_V0: "blk.{bid}.time_mix_v0",
MODEL_TENSOR.TIME_MIX_V1: "blk.{bid}.time_mix_v1",
MODEL_TENSOR.TIME_MIX_V2: "blk.{bid}.time_mix_v2",
MODEL_TENSOR.TIME_MIX_G1: "blk.{bid}.time_mix_g1",
MODEL_TENSOR.TIME_MIX_G2: "blk.{bid}.time_mix_g2",
MODEL_TENSOR.TIME_MIX_K_K: "blk.{bid}.time_mix_k_k",
MODEL_TENSOR.TIME_MIX_K_A: "blk.{bid}.time_mix_k_a",
MODEL_TENSOR.TIME_MIX_R_K: "blk.{bid}.time_mix_r_k",
MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x", MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x",
MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k", MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k",
MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v", MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v",
MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r", MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r",
MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g", MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g",
MODEL_TENSOR.TIME_MIX_LERP_FUSED: "blk.{bid}.time_mix_lerp_fused",
MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w", MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w",
MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first", MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first",
MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay", MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay",
@ -947,6 +987,24 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.PHIMOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FACTORS_LONG,
MODEL_TENSOR.ROPE_FACTORS_SHORT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.CODESHELL: [ MODEL_ARCH.CODESHELL: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.POS_EMBD, MODEL_TENSOR.POS_EMBD,
@ -1060,6 +1118,23 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_PRE_NORM, MODEL_TENSOR.FFN_PRE_NORM,
MODEL_TENSOR.FFN_POST_NORM, MODEL_TENSOR.FFN_POST_NORM,
], ],
MODEL_ARCH.GEMMA3: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.FFN_PRE_NORM,
MODEL_TENSOR.FFN_POST_NORM,
],
MODEL_ARCH.STARCODER2: [ MODEL_ARCH.STARCODER2: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
@ -1090,6 +1165,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.TIME_MIX_LERP_R, MODEL_TENSOR.TIME_MIX_LERP_R,
MODEL_TENSOR.TIME_MIX_LERP_G, MODEL_TENSOR.TIME_MIX_LERP_G,
MODEL_TENSOR.TIME_MIX_LERP_W, MODEL_TENSOR.TIME_MIX_LERP_W,
MODEL_TENSOR.TIME_MIX_LERP_FUSED,
MODEL_TENSOR.TIME_MIX_FIRST, MODEL_TENSOR.TIME_MIX_FIRST,
MODEL_TENSOR.TIME_MIX_DECAY, MODEL_TENSOR.TIME_MIX_DECAY,
MODEL_TENSOR.TIME_MIX_DECAY_W1, MODEL_TENSOR.TIME_MIX_DECAY_W1,
@ -1106,6 +1182,97 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE, MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE,
MODEL_TENSOR.CHANNEL_MIX_VALUE, MODEL_TENSOR.CHANNEL_MIX_VALUE,
], ],
MODEL_ARCH.RWKV6QWEN2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.TIME_MIX_W1,
MODEL_TENSOR.TIME_MIX_W2,
MODEL_TENSOR.TIME_MIX_LERP_X,
MODEL_TENSOR.TIME_MIX_LERP_K,
MODEL_TENSOR.TIME_MIX_LERP_V,
MODEL_TENSOR.TIME_MIX_LERP_R,
MODEL_TENSOR.TIME_MIX_LERP_G,
MODEL_TENSOR.TIME_MIX_LERP_W,
MODEL_TENSOR.TIME_MIX_LERP_FUSED,
MODEL_TENSOR.TIME_MIX_FIRST,
MODEL_TENSOR.TIME_MIX_DECAY,
MODEL_TENSOR.TIME_MIX_DECAY_W1,
MODEL_TENSOR.TIME_MIX_DECAY_W2,
MODEL_TENSOR.TIME_MIX_KEY,
MODEL_TENSOR.TIME_MIX_VALUE,
MODEL_TENSOR.TIME_MIX_RECEPTANCE,
MODEL_TENSOR.TIME_MIX_GATE,
MODEL_TENSOR.TIME_MIX_LN,
MODEL_TENSOR.TIME_MIX_OUTPUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.RWKV7: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_NORM_2,
MODEL_TENSOR.TIME_MIX_LERP_FUSED,
MODEL_TENSOR.TIME_MIX_W0,
MODEL_TENSOR.TIME_MIX_W1,
MODEL_TENSOR.TIME_MIX_W2,
MODEL_TENSOR.TIME_MIX_A0,
MODEL_TENSOR.TIME_MIX_A1,
MODEL_TENSOR.TIME_MIX_A2,
MODEL_TENSOR.TIME_MIX_V0,
MODEL_TENSOR.TIME_MIX_V1,
MODEL_TENSOR.TIME_MIX_V2,
MODEL_TENSOR.TIME_MIX_G1,
MODEL_TENSOR.TIME_MIX_G2,
MODEL_TENSOR.TIME_MIX_K_K,
MODEL_TENSOR.TIME_MIX_K_A,
MODEL_TENSOR.TIME_MIX_R_K,
MODEL_TENSOR.TIME_MIX_KEY,
MODEL_TENSOR.TIME_MIX_VALUE,
MODEL_TENSOR.TIME_MIX_RECEPTANCE,
MODEL_TENSOR.TIME_MIX_LN,
MODEL_TENSOR.TIME_MIX_OUTPUT,
MODEL_TENSOR.CHANNEL_MIX_LERP_K,
MODEL_TENSOR.CHANNEL_MIX_KEY,
MODEL_TENSOR.CHANNEL_MIX_VALUE,
],
MODEL_ARCH.ARWKV7: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.TIME_MIX_LERP_FUSED,
MODEL_TENSOR.TIME_MIX_W0,
MODEL_TENSOR.TIME_MIX_W1,
MODEL_TENSOR.TIME_MIX_W2,
MODEL_TENSOR.TIME_MIX_A0,
MODEL_TENSOR.TIME_MIX_A1,
MODEL_TENSOR.TIME_MIX_A2,
MODEL_TENSOR.TIME_MIX_V0,
MODEL_TENSOR.TIME_MIX_V1,
MODEL_TENSOR.TIME_MIX_V2,
MODEL_TENSOR.TIME_MIX_G1,
MODEL_TENSOR.TIME_MIX_G2,
MODEL_TENSOR.TIME_MIX_K_K,
MODEL_TENSOR.TIME_MIX_K_A,
MODEL_TENSOR.TIME_MIX_R_K,
MODEL_TENSOR.TIME_MIX_KEY,
MODEL_TENSOR.TIME_MIX_VALUE,
MODEL_TENSOR.TIME_MIX_RECEPTANCE,
MODEL_TENSOR.TIME_MIX_LN,
MODEL_TENSOR.TIME_MIX_OUTPUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.MAMBA: [ MODEL_ARCH.MAMBA: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
@ -1310,6 +1477,9 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
@ -1789,7 +1959,6 @@ def get_type(val: Any) -> GGUFValueType:
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV

src/gguf/gguf.py (new file, 15 lines)

@ -0,0 +1,15 @@
# This file left for compatibility. If you want to use the GGUF API from Python
# then don't import gguf/gguf.py directly. If you're looking for examples, see the
# examples/ directory for gguf-py
import importlib
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
# Compatibility for people trying to import gguf/gguf.py directly instead of as a package.
importlib.invalidate_caches()
import gguf # noqa: E402
importlib.reload(gguf)
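The shim above only keeps legacy imports of gguf/gguf.py working; a short sketch of the intended usage, importing the package itself (the GGUFReader name and the model path are assumed from the gguf-py public API, not from this commit):

import gguf

print(gguf.GGUF_VERSION)                      # constant re-exported at package level
reader = gguf.GGUFReader("model.gguf")        # hypothetical file path
print(len(reader.tensors), "tensors")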


@ -6,6 +6,7 @@
import logging import logging
import os import os
import sys
from collections import OrderedDict from collections import OrderedDict
from typing import Any, Literal, NamedTuple, TypeVar, Union from typing import Any, Literal, NamedTuple, TypeVar, Union
@ -15,7 +16,6 @@
from .quants import quant_shape_to_byte_shape from .quants import quant_shape_to_byte_shape
if __name__ == "__main__": if __name__ == "__main__":
import sys
from pathlib import Path from pathlib import Path
# Allow running file in package as a script. # Allow running file in package as a script.
@ -28,6 +28,7 @@
GGUF_VERSION, GGUF_VERSION,
GGMLQuantizationType, GGMLQuantizationType,
GGUFValueType, GGUFValueType,
GGUFEndian,
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -53,6 +54,52 @@ class ReaderField(NamedTuple):
types: list[GGUFValueType] = [] types: list[GGUFValueType] = []
def contents(self, index_or_slice: int | slice = slice(None)) -> Any:
if self.types:
to_string = lambda x: str(x.tobytes(), encoding="utf-8") # noqa: E731
main_type = self.types[0]
if main_type == GGUFValueType.ARRAY:
sub_type = self.types[-1]
if sub_type == GGUFValueType.STRING:
indices = self.data[index_or_slice]
if isinstance(index_or_slice, int):
return to_string(self.parts[indices]) # type: ignore
else:
return [to_string(self.parts[idx]) for idx in indices] # type: ignore
else:
# FIXME: When/if _get_field_parts() support multi-dimensional arrays, this must do so too
# Check if it's unsafe to perform slice optimization on data
# if any(True for idx in self.data if len(self.parts[idx]) != 1):
# optim_slice = slice(None)
# else:
# optim_slice = index_or_slice
# index_or_slice = slice(None)
# if isinstance(optim_slice, int):
# return self.parts[self.data[optim_slice]].tolist()[0]
# else:
# return [pv for idx in self.data[optim_slice] for pv in self.parts[idx].tolist()][index_or_slice]
if isinstance(index_or_slice, int):
return self.parts[self.data[index_or_slice]].tolist()[0]
else:
return [
pv
for idx in self.data[index_or_slice]
for pv in self.parts[idx].tolist()
]
if main_type == GGUFValueType.STRING:
return to_string(self.parts[-1])
else:
return self.parts[-1].tolist()[0]
return None
class ReaderTensor(NamedTuple): class ReaderTensor(NamedTuple):
name: str name: str
@ -103,12 +150,23 @@ def __init__(
# If we get 0 here that means it's (probably) a GGUF file created for # If we get 0 here that means it's (probably) a GGUF file created for
# the opposite byte order of the machine this script is running on. # the opposite byte order of the machine this script is running on.
self.byte_order = "S" self.byte_order = "S"
temp_version = temp_version.newbyteorder(self.byte_order) temp_version = temp_version.view(
temp_version.dtype.newbyteorder(self.byte_order)
)
version = temp_version[0] version = temp_version[0]
if version not in READER_SUPPORTED_VERSIONS: if version not in READER_SUPPORTED_VERSIONS:
raise ValueError( raise ValueError(
f"Sorry, file appears to be version {version} which we cannot handle" f"Sorry, file appears to be version {version} which we cannot handle"
) )
if sys.byteorder == "little":
# Host is little endian
host_endian = GGUFEndian.LITTLE
swapped_endian = GGUFEndian.BIG
else:
# Sorry PDP or other weird systems that don't use BE or LE.
host_endian = GGUFEndian.BIG
swapped_endian = GGUFEndian.LITTLE
self.endianess = swapped_endian if self.byte_order == "S" else host_endian
self.fields: OrderedDict[str, ReaderField] = OrderedDict() self.fields: OrderedDict[str, ReaderField] = OrderedDict()
self.tensors: list[ReaderTensor] = [] self.tensors: list[ReaderTensor] = []
offs += self._push_field( offs += self._push_field(
@ -170,9 +228,11 @@ def _get(
itemsize = int(np.empty([], dtype=dtype).itemsize) itemsize = int(np.empty([], dtype=dtype).itemsize)
end_offs = offset + itemsize * count end_offs = offset + itemsize * count
arr = self.data[offset:end_offs].view(dtype=dtype)[:count] arr = self.data[offset:end_offs].view(dtype=dtype)[:count]
if override_order is None: return arr.view(
return arr arr.dtype.newbyteorder(
return arr.view(arr.dtype.newbyteorder(override_order)) self.byte_order if override_order is None else override_order
)
)
def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int: def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
if field.name in self.fields: if field.name in self.fields:
@ -218,6 +278,7 @@ def _get_field_parts(
offs += int(alen.nbytes) offs += int(alen.nbytes)
aparts: list[npt.NDArray[Any]] = [raw_itype, alen] aparts: list[npt.NDArray[Any]] = [raw_itype, alen]
data_idxs: list[int] = [] data_idxs: list[int] = []
# FIXME: Handle multi-dimensional arrays properly instead of flattening
for idx in range(alen[0]): for idx in range(alen[0]):
curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts( curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(
offs, raw_itype[0] offs, raw_itype[0]
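The new ReaderField.contents() helper above gives typed access to decoded field values; a minimal sketch of how it might be used with GGUFReader (the file path is hypothetical, the field keys are standard GGUF metadata keys):

from gguf import GGUFReader

reader = GGUFReader("model.gguf")                  # hypothetical file
arch_field = reader.fields["general.architecture"]
print(arch_field.contents())                       # decoded string value

tokens_field = reader.fields.get("tokenizer.ggml.tokens")
if tokens_field is not None:
    print(tokens_field.contents(slice(0, 5)))      # first few entries of a string array

print(reader.endianess)                            # GGUFEndian of the file, set by this change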


@ -828,6 +828,9 @@ def add_embedding_scale(self, value: float) -> None:
def add_wkv_head_size(self, size: int) -> None: def add_wkv_head_size(self, size: int) -> None:
self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size) self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
def add_token_shift_count(self, count: int) -> None:
self.add_uint32(Keys.LLM.TOKEN_SHIFT_COUNT.format(arch=self.arch), count)
def add_layer_norm_eps(self, value: float) -> None: def add_layer_norm_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
@ -849,6 +852,20 @@ def add_q_lora_rank(self, length: int) -> None:
def add_kv_lora_rank(self, length: int) -> None: def add_kv_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length) self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)
def add_decay_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.DECAY_LORA_RANK.format(arch=self.arch), length)
def add_iclr_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.ICLR_LORA_RANK.format(arch=self.arch), length)
def add_value_residual_mix_lora_rank(self, length: int) -> None:
self.add_uint32(
Keys.Attention.VALUE_RESIDUAL_MIX_LORA_RANK.format(arch=self.arch), length
)
def add_gate_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.GATE_LORA_RANK.format(arch=self.arch), length)
def add_relative_attn_buckets_count(self, value: int) -> None: def add_relative_attn_buckets_count(self, value: int) -> None:
self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value) self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)
@ -943,9 +960,6 @@ def add_sep_token_id(self, id: int) -> None:
def add_pad_token_id(self, id: int) -> None: def add_pad_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.PAD_ID, id) self.add_uint32(Keys.Tokenizer.PAD_ID, id)
def add_cls_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.CLS_ID, id)
def add_mask_token_id(self, id: int) -> None: def add_mask_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.MASK_ID, id) self.add_uint32(Keys.Tokenizer.MASK_ID, id)
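A small sketch (all values invented) of how the new writer helpers defined above could be called when converting an RWKV-style model; the GGUFWriter constructor and write/close calls are assumed from the existing gguf-py API rather than shown in this diff:

from gguf import GGUFWriter

writer = GGUFWriter("out.gguf", "rwkv7")      # hypothetical output path and arch name
writer.add_token_shift_count(2)               # {arch}.token_shift_count
writer.add_decay_lora_rank(64)                # {arch}.attention.decay_lora_rank
writer.add_iclr_lora_rank(64)                 # {arch}.attention.iclr_lora_rank
writer.add_value_residual_mix_lora_rank(32)   # {arch}.attention.value_residual_mix_lora_rank
writer.add_gate_lora_rank(128)                # {arch}.attention.gate_lora_rank
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()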


@ -160,21 +160,41 @@ def load_model_card(model_path: Optional[Path] = None) -> dict[str, Any]:
if not model_card_path.is_file(): if not model_card_path.is_file():
return {} return {}
# The model card metadata is assumed to always be in YAML # The model card metadata is assumed to always be in YAML (frontmatter)
# ref: https://github.com/huggingface/transformers/blob/a5c642fe7a1f25d3bdcd76991443ba6ff7ee34b2/src/transformers/modelcard.py#L468-L473 # ref: https://github.com/huggingface/transformers/blob/a5c642fe7a1f25d3bdcd76991443ba6ff7ee34b2/src/transformers/modelcard.py#L468-L473
yaml_content: str = ""
with open(model_card_path, "r", encoding="utf-8") as f: with open(model_card_path, "r", encoding="utf-8") as f:
if f.readline() == "---\n": content = f.read()
raw = f.read().partition("---\n")[0] lines = content.splitlines()
data = yaml.safe_load(raw) lines_yaml = []
if isinstance(data, dict): if len(lines) == 0:
return data # Empty file
else:
logger.error(
f"while reading YAML model card frontmatter, data is {type(data)} instead of dict"
)
return {}
else:
return {} return {}
if len(lines) > 0 and lines[0] != "---":
# No frontmatter
return {}
for line in lines[1:]:
if line == "---":
break # End of frontmatter
else:
lines_yaml.append(line)
yaml_content = "\n".join(lines_yaml) + "\n"
# Quick hack to fix the Norway problem
# https://hitchdev.com/strictyaml/why/implicit-typing-removed/
yaml_content = yaml_content.replace("- no\n", '- "no"\n')
if yaml_content:
data = yaml.safe_load(yaml_content)
if isinstance(data, dict):
return data
else:
logger.error(
f"while reading YAML model card frontmatter, data is {type(data)} instead of dict"
)
return {}
else:
return {}
@staticmethod @staticmethod
def load_hf_parameters(model_path: Optional[Path] = None) -> dict[str, Any]: def load_hf_parameters(model_path: Optional[Path] = None) -> dict[str, Any]:
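The "Norway problem" hack above exists because PyYAML follows YAML 1.1 implicit typing, where a bare no parses as a boolean; a quick illustration of what the quoting fix preserves:

import yaml

# Without the replacement, a language list like this loses information:
print(yaml.safe_load("language:\n- no\n"))      # {'language': [False]}
# Quoting keeps the ISO code for Norwegian intact:
print(yaml.safe_load('language:\n- "no"\n'))    # {'language': ['no']}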


@ -13,7 +13,7 @@ class TensorNameMap:
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
"transformer.word_embeddings", # falcon "transformer.word_embeddings", # falcon
"word_embeddings", # bloom "word_embeddings", # bloom
"model.embed_tokens", # llama-hf nemotron olmoe olmo2 "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2
"tok_embeddings", # llama-pth "tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert nomic-bert "embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", # persimmon "language_model.embedding.word_embeddings", # persimmon
@ -27,7 +27,8 @@ class TensorNameMap:
"embedding.word_embeddings", # chatglm "embedding.word_embeddings", # chatglm
"transformer.token_embeddings", # openelm "transformer.token_embeddings", # openelm
"shared", # t5 "shared", # t5
"rwkv.embeddings", # rwkv "rwkv.embeddings", # rwkv6
"model.embeddings", # rwkv7
), ),
# Token type embeddings # Token type embeddings
MODEL_TENSOR.TOKEN_TYPES: ( MODEL_TENSOR.TOKEN_TYPES: (
@ -40,6 +41,9 @@ class TensorNameMap:
"emb_ln", # nomic-bert "emb_ln", # nomic-bert
"transformer.norm", # openelm "transformer.norm", # openelm
"rwkv.blocks.0.pre_ln", # rwkv "rwkv.blocks.0.pre_ln", # rwkv
"rwkv.blocks.0.pre_ln", # rwkv6
"model.pre_ln", # rwkv7
"model.layers.0.pre_norm", # rwkv7
"backbone.norm", # wavtokenizer "backbone.norm", # wavtokenizer
), ),
# Position embeddings # Position embeddings
@ -51,7 +55,7 @@ class TensorNameMap:
# Output # Output
MODEL_TENSOR.OUTPUT: ( MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox "embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe
"output", # llama-pth bloom internlm2 "output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon "word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2 "lm_head.linear", # phi2
@ -63,7 +67,7 @@ class TensorNameMap:
MODEL_TENSOR.OUTPUT_NORM: ( MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox "gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 gpt-j falcon jais exaone "transformer.ln_f", # gpt2 gpt-j falcon jais exaone
"model.norm", # llama-hf baichuan internlm2 olmoe olmo2 "model.norm", # llama-hf baichuan internlm2 olmoe olmo2 phimoe
"norm", # llama-pth "norm", # llama-pth
"transformer.norm_f", # mpt dbrx "transformer.norm_f", # mpt dbrx
"ln_f", # refact bloom qwen gpt2 "ln_f", # refact bloom qwen gpt2
@ -76,7 +80,8 @@ class TensorNameMap:
"encoder.final_layernorm", # chatglm "encoder.final_layernorm", # chatglm
"transformer.norm", # openelm "transformer.norm", # openelm
"model.norm", # nemotron "model.norm", # nemotron
"rwkv.ln_out", # rwkv "rwkv.ln_out", # rwkv6
"model.ln_out", # rwkv7
"backbone.final_layer_norm", # wavtokenizer "backbone.final_layer_norm", # wavtokenizer
), ),
# Rope frequencies # Rope frequencies
@ -98,7 +103,7 @@ class TensorNameMap:
"transformer.h.{bid}.input_layernorm", # falcon7b "transformer.h.{bid}.input_layernorm", # falcon7b
"h.{bid}.input_layernorm", # bloom "h.{bid}.input_layernorm", # bloom
"transformer.h.{bid}.ln_mlp", # falcon40b "transformer.h.{bid}.ln_mlp", # falcon40b
"model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe phimoe
"layers.{bid}.attention_norm", # llama-pth "layers.{bid}.attention_norm", # llama-pth
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
"model.layers.{bid}.ln1", # yi "model.layers.{bid}.ln1", # yi
@ -112,13 +117,15 @@ class TensorNameMap:
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
"encoder.layers.{bid}.input_layernorm", # chatglm "encoder.layers.{bid}.input_layernorm", # chatglm
"transformer.layers.{bid}.attn_norm", # openelm "transformer.layers.{bid}.attn_norm", # openelm
"rwkv.blocks.{bid}.ln1", # rwkv "rwkv.blocks.{bid}.ln1", # rwkv6
"model.layers.{bid}.ln1", # rwkv7
), ),
# Attention norm 2 # Attention norm 2
MODEL_TENSOR.ATTN_NORM_2: ( MODEL_TENSOR.ATTN_NORM_2: (
"transformer.h.{bid}.ln_attn", # falcon40b "transformer.h.{bid}.ln_attn", # falcon40b
"encoder.layer.{bid}.layer_norm_1", # jina-v2-code "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
"rwkv.blocks.{bid}.ln2", # rwkv "rwkv.blocks.{bid}.ln2", # rwkv6
"model.layers.{bid}.ln2", # rwkv7
), ),
# Attention query-key-value # Attention query-key-value
MODEL_TENSOR.ATTN_QKV: ( MODEL_TENSOR.ATTN_QKV: (
@ -139,7 +146,7 @@ class TensorNameMap:
), ),
# Attention query # Attention query
MODEL_TENSOR.ATTN_Q: ( MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 phimoe
"model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom "model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom
"layers.{bid}.attention.wq", # llama-pth "layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert "encoder.layer.{bid}.attention.self.query", # bert
@ -151,7 +158,7 @@ class TensorNameMap:
), ),
# Attention key # Attention key
MODEL_TENSOR.ATTN_K: ( MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 phimoe
"model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom "model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom
"layers.{bid}.attention.wk", # llama-pth "layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert "encoder.layer.{bid}.attention.self.key", # bert
@ -164,7 +171,7 @@ class TensorNameMap:
), ),
# Attention value # Attention value
MODEL_TENSOR.ATTN_V: ( MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 phimoe
"layers.{bid}.attention.wv", # llama-pth "layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert "encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", # gpt-j "transformer.h.{bid}.attn.v_proj", # gpt-j
@ -181,7 +188,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon "transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom "h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 phimoe
"model.layers.{bid}.self_attn.linear_attn", # deci "model.layers.{bid}.self_attn.linear_attn", # deci
"layers.{bid}.attention.wo", # llama-pth "layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert "encoder.layer.{bid}.attention.output.dense", # bert
@ -222,7 +229,7 @@ class TensorNameMap:
"transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone
"h.{bid}.post_attention_layernorm", # bloom "h.{bid}.post_attention_layernorm", # bloom
"transformer.blocks.{bid}.norm_2", # mpt "transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe phimoe
"layers.{bid}.ffn_norm", # llama-pth "layers.{bid}.ffn_norm", # llama-pth
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
"model.layers.{bid}.ln2", # yi "model.layers.{bid}.ln2", # yi
@ -242,7 +249,7 @@ class TensorNameMap:
), ),
MODEL_TENSOR.FFN_GATE_INP: ( MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral "layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral phimoe
"model.layers.{bid}.mlp.gate", # qwen2moe olmoe "model.layers.{bid}.mlp.gate", # qwen2moe olmoe
"transformer.decoder_layer.{bid}.router", # Grok "transformer.decoder_layer.{bid}.router", # Grok
"transformer.blocks.{bid}.ffn.router.layer", # dbrx "transformer.blocks.{bid}.ffn.router.layer", # dbrx
@ -287,6 +294,7 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
"model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
), ),
MODEL_TENSOR.FFN_UP_SHEXP: ( MODEL_TENSOR.FFN_UP_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
@ -313,6 +321,7 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
"model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
), ),
MODEL_TENSOR.FFN_GATE_SHEXP: ( MODEL_TENSOR.FFN_GATE_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
@ -351,6 +360,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
"model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
"model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
), ),
MODEL_TENSOR.FFN_DOWN_SHEXP: ( MODEL_TENSOR.FFN_DOWN_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
@ -410,62 +420,116 @@ class TensorNameMap:
"model.layers.{bid}.out_proj", "model.layers.{bid}.out_proj",
"backbone.layers.{bid}.mixer.out_proj", "backbone.layers.{bid}.mixer.out_proj",
), ),
MODEL_TENSOR.TIME_MIX_W0: ("model.layers.{bid}.attention.w0",), # rwkv7
MODEL_TENSOR.TIME_MIX_W1: ( MODEL_TENSOR.TIME_MIX_W1: (
"rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv v6 "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv6
"model.layers.{bid}.self_attn.time_maa_w1", # rwkv6qwen2
"model.layers.{bid}.attention.w1", # rwkv7
), ),
MODEL_TENSOR.TIME_MIX_W2: ( MODEL_TENSOR.TIME_MIX_W2: (
"rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv v6 "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv6
"model.layers.{bid}.self_attn.time_maa_w2", # rwkv6qwen2
"model.layers.{bid}.attention.w2", # rwkv7
), ),
MODEL_TENSOR.TIME_MIX_A0: ("model.layers.{bid}.attention.a0",), # rwkv7
MODEL_TENSOR.TIME_MIX_A1: ("model.layers.{bid}.attention.a1",), # rwkv7
MODEL_TENSOR.TIME_MIX_A2: ("model.layers.{bid}.attention.a2",), # rwkv7
MODEL_TENSOR.TIME_MIX_V0: ("model.layers.{bid}.attention.v0",), # rwkv7
MODEL_TENSOR.TIME_MIX_V1: ("model.layers.{bid}.attention.v1",), # rwkv7
MODEL_TENSOR.TIME_MIX_V2: ("model.layers.{bid}.attention.v2",), # rwkv7
MODEL_TENSOR.TIME_MIX_G1: ("model.layers.{bid}.attention.g1",), # rwkv7
MODEL_TENSOR.TIME_MIX_G2: ("model.layers.{bid}.attention.g2",), # rwkv7
MODEL_TENSOR.TIME_MIX_K_K: ("model.layers.{bid}.attention.k_k",), # rwkv7
MODEL_TENSOR.TIME_MIX_K_A: ("model.layers.{bid}.attention.k_a",), # rwkv7
MODEL_TENSOR.TIME_MIX_R_K: ("model.layers.{bid}.attention.r_k",), # rwkv7
MODEL_TENSOR.TIME_MIX_LERP_X: ( MODEL_TENSOR.TIME_MIX_LERP_X: (
"rwkv.blocks.{bid}.attention.time_maa_x", # rwkv v6 "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv6
"model.layers.{bid}.self_attn.time_maa_x", # rwkv6qwen2
), ),
MODEL_TENSOR.TIME_MIX_LERP_K: ( MODEL_TENSOR.TIME_MIX_LERP_K: (
"rwkv.blocks.{bid}.attention.time_maa_k", # rwkv v6 "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv6
"model.layers.{bid}.self_attn.time_maa_k", # rwkv6qwen2
), ),
MODEL_TENSOR.TIME_MIX_LERP_V: ( MODEL_TENSOR.TIME_MIX_LERP_V: (
"rwkv.blocks.{bid}.attention.time_maa_v", # rwkv v6 "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv6
"model.layers.{bid}.self_attn.time_maa_v", # rwkv6qwen2
), ),
MODEL_TENSOR.TIME_MIX_LERP_R: ( MODEL_TENSOR.TIME_MIX_LERP_R: (
"rwkv.blocks.{bid}.attention.time_maa_r", # rwkv v6 "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv6
"model.layers.{bid}.self_attn.time_maa_r", # rwkv6qwen2
), ),
MODEL_TENSOR.TIME_MIX_LERP_G: ( MODEL_TENSOR.TIME_MIX_LERP_G: (
"rwkv.blocks.{bid}.attention.time_maa_g", # rwkv v6 "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv6
"model.layers.{bid}.self_attn.time_maa_g", # rwkv6qwen2
), ),
MODEL_TENSOR.TIME_MIX_LERP_W: ( MODEL_TENSOR.TIME_MIX_LERP_W: (
"rwkv.blocks.{bid}.attention.time_maa_w", # rwkv v6 "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv6
"model.layers.{bid}.self_attn.time_maa_w", # rwkv6qwen2
), ),
MODEL_TENSOR.TIME_MIX_FIRST: ( MODEL_TENSOR.TIME_MIX_FIRST: (
"rwkv.blocks.{bid}.attention.time_faaaa", # rwkv v6 "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv6
), ),
MODEL_TENSOR.TIME_MIX_DECAY: ( MODEL_TENSOR.TIME_MIX_DECAY: (
"rwkv.blocks.{bid}.attention.time_decay", # rwkv v6 "rwkv.blocks.{bid}.attention.time_decay", # rwkv6
"model.layers.{bid}.self_attn.time_decay", # rwkv6qwen2
), ),
MODEL_TENSOR.TIME_MIX_DECAY_W1: ( MODEL_TENSOR.TIME_MIX_DECAY_W1: (
"rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv v6 "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv6
"model.layers.{bid}.self_attn.time_decay_w1", # rwkv6qwen2
), ),
MODEL_TENSOR.TIME_MIX_DECAY_W2: ( MODEL_TENSOR.TIME_MIX_DECAY_W2: (
"rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv v6 "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv6
"model.layers.{bid}.self_attn.time_decay_w2", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_KEY: (
"rwkv.blocks.{bid}.attention.key", # rwkv6
"model.layers.{bid}.self_attn.k_proj", # rwkv6qwen2
"model.layers.{bid}.attention.key", # rwkv7
"model.layers.{bid}.attention.k_proj", # rwkv7
),
MODEL_TENSOR.TIME_MIX_VALUE: (
"rwkv.blocks.{bid}.attention.value", # rwkv6
"model.layers.{bid}.self_attn.v_proj", # rwkv6qwen2
"model.layers.{bid}.attention.value", # rwkv7
"model.layers.{bid}.attention.v_proj", # rwkv7
), ),
MODEL_TENSOR.TIME_MIX_KEY: ("rwkv.blocks.{bid}.attention.key",), # rwkv
MODEL_TENSOR.TIME_MIX_VALUE: ("rwkv.blocks.{bid}.attention.value",), # rwkv
MODEL_TENSOR.TIME_MIX_RECEPTANCE: ( MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
"rwkv.blocks.{bid}.attention.receptance", # rwkv "rwkv.blocks.{bid}.attention.receptance", # rwkv6
"model.layers.{bid}.self_attn.q_proj", # rwkv6qwen2
"model.layers.{bid}.attention.receptance", # rwkv7
"model.layers.{bid}.attention.r_proj", # rwkv7
),
MODEL_TENSOR.TIME_MIX_GATE: (
"rwkv.blocks.{bid}.attention.gate", # rwkv6
"model.layers.{bid}.self_attn.gate", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_LN: (
"rwkv.blocks.{bid}.attention.ln_x", # rwkv6
"model.layers.{bid}.attention.ln_x", # rwkv7
),
MODEL_TENSOR.TIME_MIX_OUTPUT: (
"rwkv.blocks.{bid}.attention.output", # rwkv6
"model.layers.{bid}.self_attn.o_proj", # rwkv6qwen2
"model.layers.{bid}.attention.output", # rwkv7
"model.layers.{bid}.attention.o_proj", # rwkv7
), ),
MODEL_TENSOR.TIME_MIX_GATE: ("rwkv.blocks.{bid}.attention.gate",), # rwkv
MODEL_TENSOR.TIME_MIX_LN: ("rwkv.blocks.{bid}.attention.ln_x",), # rwkv
MODEL_TENSOR.TIME_MIX_OUTPUT: ("rwkv.blocks.{bid}.attention.output",), # rwkv
MODEL_TENSOR.CHANNEL_MIX_LERP_K: ( MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
"rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv v6 "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv6
"model.layers.{bid}.feed_forward.x_k", # rwkv7
), ),
MODEL_TENSOR.CHANNEL_MIX_LERP_R: ( MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
"rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6 "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv6
),
MODEL_TENSOR.CHANNEL_MIX_KEY: (
"rwkv.blocks.{bid}.feed_forward.key", # rwkv6
"model.layers.{bid}.feed_forward.key", # rwkv7
), ),
MODEL_TENSOR.CHANNEL_MIX_KEY: ("rwkv.blocks.{bid}.feed_forward.key",), # rwkv
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: ( MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
"rwkv.blocks.{bid}.feed_forward.receptance", # rwkv "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv6
), ),
MODEL_TENSOR.CHANNEL_MIX_VALUE: ( MODEL_TENSOR.CHANNEL_MIX_VALUE: (
"rwkv.blocks.{bid}.feed_forward.value", # rwkv "rwkv.blocks.{bid}.feed_forward.value", # rwkv6
"model.layers.{bid}.feed_forward.value", # rwkv7
), ),
MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), # deepseek2 MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), # deepseek2
MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), # deepseek2 MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), # deepseek2


@ -67,7 +67,7 @@ def naming_convention(
output_type: str | None, output_type: str | None,
model_type: Literal["vocab", "LoRA"] | None = None, model_type: Literal["vocab", "LoRA"] | None = None,
) -> str: ) -> str:
# Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention # Reference: https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#gguf-naming-convention
if base_name is not None: if base_name is not None:
name = base_name.strip().replace(" ", "-").replace("/", "-") name = base_name.strip().replace(" ", "-").replace("/", "-")


@ -166,7 +166,7 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
and isinstance(merges[0][0], str) and isinstance(merges[0][0], str)
): ):
# New format since transformers 4.45 to support spaces in merges # New format since transformers 4.45 to support spaces in merges
# ref: https://github.com/ggerganov/llama.cpp/issues/9692 # ref: https://github.com/ggml-org/llama.cpp/issues/9692
# TODO: internally store as the new format instead of converting to old # TODO: internally store as the new format instead of converting to old
if any(" " in s for pair in merges for s in pair): if any(" " in s for pair in merges for s in pair):
logger.warning( logger.warning(
@ -195,7 +195,12 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
return True return True
with open(tokenizer_config_file, encoding="utf-8") as f: with open(tokenizer_config_file, encoding="utf-8") as f:
tokenizer_config = json.load(f) tokenizer_config = json.load(f)
chat_template = tokenizer_config.get("chat_template") chat_template_alt = None
chat_template_file = path / "chat_template.json"
if chat_template_file.is_file():
with open(chat_template_file, encoding="utf-8") as f:
chat_template_alt = json.load(f).get("chat_template")
chat_template = tokenizer_config.get("chat_template", chat_template_alt)
if chat_template is None or isinstance(chat_template, (str, list)): if chat_template is None or isinstance(chat_template, (str, list)):
self.chat_template = chat_template self.chat_template = chat_template
else: else: