Mirror of https://github.com/leafspark/AutoGGUF
feat(convert): update llama.cpp convert scripts
- added support for MiniCPM3, RWKV v6, OLMoE, IBM Granite, and Jamba (conversion only: https://github.com/ggerganov/llama.cpp/pull/7531)
- updated the gguf library from upstream
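After pulling in the updated scripts, the converter is driven the same way as upstream llama.cpp; this diff adds "tq1_0" and "tq2_0" to the --outtype choices, so an invocation along the lines of `python convert_hf_to_gguf.py <model dir> --outfile out.gguf --outtype tq2_0` should now be accepted (script name assumed from upstream; the --outfile/--outtype/model arguments appear in the parse_args hunk below). As a quick smoke test of the updated gguf library, a minimal sketch (it only checks names that this diff adds and assumes the bundled gguf-py package is importable):

    # Minimal sketch: confirm the architectures and RWKV v6 tensor names added in
    # this commit are registered in the updated gguf-py constants.
    import gguf

    for arch_name in ("minicpm3", "rwkv6", "olmoe", "granite", "jamba"):
        assert arch_name in gguf.MODEL_ARCH_NAMES.values(), f"missing arch: {arch_name}"

    # RWKV v6 tensor names are templated on the block id, e.g. "blk.0.time_mix_w1".
    print(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TIME_MIX_W1].format(bid=0))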
This commit is contained in:
parent
4e51ed2f56
commit
39441e503f
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import ast
 import logging
 import argparse
 import contextlib

@ -165,14 +166,14 @@ def set_vocab(self):
|
||||||
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
|
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
|
||||||
tensor_names_from_parts: set[str] = set()
|
tensor_names_from_parts: set[str] = set()
|
||||||
|
|
||||||
if len(self.part_names) > 1:
|
index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
|
||||||
self.tensor_names = set()
|
|
||||||
index_name = (
|
|
||||||
"model.safetensors" if self.is_safetensors else "pytorch_model.bin"
|
|
||||||
)
|
|
||||||
index_name += ".index.json"
|
index_name += ".index.json"
|
||||||
|
index_file = self.dir_model / index_name
|
||||||
|
|
||||||
|
if index_file.is_file():
|
||||||
|
self.tensor_names = set()
|
||||||
logger.info(f"gguf: loading model weight map from '{index_name}'")
|
logger.info(f"gguf: loading model weight map from '{index_name}'")
|
||||||
with open(self.dir_model / index_name, "r", encoding="utf-8") as f:
|
with open(index_file, "r", encoding="utf-8") as f:
|
||||||
index: dict[str, Any] = json.load(f)
|
index: dict[str, Any] = json.load(f)
|
||||||
weight_map = index.get("weight_map")
|
weight_map = index.get("weight_map")
|
||||||
if weight_map is None or not isinstance(weight_map, dict):
|
if weight_map is None or not isinstance(weight_map, dict):
|
||||||
|
@ -180,6 +181,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
|
||||||
self.tensor_names.update(weight_map.keys())
|
self.tensor_names.update(weight_map.keys())
|
||||||
else:
|
else:
|
||||||
self.tensor_names = tensor_names_from_parts
|
self.tensor_names = tensor_names_from_parts
|
||||||
|
weight_map = {}
|
||||||
|
|
||||||
for part_name in self.part_names:
|
for part_name in self.part_names:
|
||||||
logger.info(f"gguf: loading model part '{part_name}'")
|
logger.info(f"gguf: loading model part '{part_name}'")
|
||||||
|
@ -217,16 +219,19 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
|
||||||
data = LazyTorchTensor.from_eager(data)
|
data = LazyTorchTensor.from_eager(data)
|
||||||
yield name, data
|
yield name, data
|
||||||
|
|
||||||
if (
|
if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
|
||||||
len(
|
missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
|
||||||
sym_diff := tensor_names_from_parts.symmetric_difference(
|
extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
|
||||||
self.tensor_names
|
missing_files = sorted(
|
||||||
|
set(weight_map[n] for n in missing if n in weight_map)
|
||||||
)
|
)
|
||||||
)
|
if len(extra) == 0 and len(missing_files) > 0:
|
||||||
> 0
|
raise ValueError(f"Missing or incomplete model files: {missing_files}")
|
||||||
):
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Mismatch between weight map and model parts for tensor names: {sym_diff}"
|
"Mismatch between weight map and model parts for tensor names:\n"
|
||||||
|
f"Missing tensors: {missing}\n"
|
||||||
|
f"Extra tensors: {extra}"
|
||||||
)
|
)
|
||||||
|
|
||||||
def format_tensor_name(
|
def format_tensor_name(
|
||||||
|
@ -383,12 +388,31 @@ def prepare_tensors(self):
|
||||||
gguf.MODEL_TENSOR.POS_EMBD,
|
gguf.MODEL_TENSOR.POS_EMBD,
|
||||||
gguf.MODEL_TENSOR.TOKEN_TYPES,
|
gguf.MODEL_TENSOR.TOKEN_TYPES,
|
||||||
gguf.MODEL_TENSOR.SSM_CONV1D,
|
gguf.MODEL_TENSOR.SSM_CONV1D,
|
||||||
|
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
|
||||||
|
gguf.MODEL_TENSOR.TIME_MIX_W1,
|
||||||
|
gguf.MODEL_TENSOR.TIME_MIX_W2,
|
||||||
|
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
|
||||||
|
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
or not name.endswith(".weight")
|
or not new_name.endswith(".weight")
|
||||||
):
|
):
|
||||||
data_qtype = gguf.GGMLQuantizationType.F32
|
data_qtype = gguf.GGMLQuantizationType.F32
|
||||||
|
|
||||||
|
if data_qtype is False and any(
|
||||||
|
self.match_model_tensor_name(new_name, key, bid)
|
||||||
|
for key in (
|
||||||
|
gguf.MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
gguf.MODEL_TENSOR.OUTPUT,
|
||||||
|
)
|
||||||
|
):
|
||||||
|
if self.ftype in (
|
||||||
|
gguf.LlamaFileType.MOSTLY_TQ1_0,
|
||||||
|
gguf.LlamaFileType.MOSTLY_TQ2_0,
|
||||||
|
):
|
||||||
|
|
||||||
|
data_qtype = gguf.GGMLQuantizationType.F16
|
||||||
|
|
||||||
if isinstance(data_qtype, bool):
|
if isinstance(data_qtype, bool):
|
||||||
if self.ftype == gguf.LlamaFileType.ALL_F32:
|
if self.ftype == gguf.LlamaFileType.ALL_F32:
|
||||||
data_qtype = gguf.GGMLQuantizationType.F32
|
data_qtype = gguf.GGMLQuantizationType.F32
|
||||||
|
@ -398,6 +422,10 @@ def prepare_tensors(self):
|
||||||
data_qtype = gguf.GGMLQuantizationType.BF16
|
data_qtype = gguf.GGMLQuantizationType.BF16
|
||||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
|
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
|
||||||
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
||||||
|
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
|
||||||
|
data_qtype = gguf.GGMLQuantizationType.TQ1_0
|
||||||
|
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
|
||||||
|
data_qtype = gguf.GGMLQuantizationType.TQ2_0
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown file type: {self.ftype.name}")
|
raise ValueError(f"Unknown file type: {self.ftype.name}")
|
||||||
|
|
||||||
|
@ -417,7 +445,7 @@ def prepare_tensors(self):
|
||||||
shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
|
shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"{f'%s-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}"
|
f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
|
self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
|
||||||
|
@ -704,6 +732,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
|
||||||
if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
|
if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
|
||||||
|
|
||||||
res = "exaone"
|
res = "exaone"
|
||||||
|
if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
|
||||||
|
|
||||||
|
res = "phi-2"
|
||||||
|
|
||||||
if res is None:
|
if res is None:
|
||||||
logger.warning("\n")
|
logger.warning("\n")
|
||||||
|
@ -1130,6 +1161,7 @@ def set_vocab(self):
|
||||||
try:
|
try:
|
||||||
self._set_vocab_gpt2()
|
self._set_vocab_gpt2()
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|
||||||
self._set_vocab_sentencepiece()
|
self._set_vocab_sentencepiece()
|
||||||
self.gguf_writer.add_add_bos_token(False)
|
self.gguf_writer.add_add_bos_token(False)
|
||||||
self.gguf_writer.add_pad_token_id(3)
|
self.gguf_writer.add_pad_token_id(3)
|
||||||
|
@ -1710,7 +1742,9 @@ def prepare_tensors(self):
|
||||||
raise ValueError(f"Unprocessed norms: {norms}")
|
raise ValueError(f"Unprocessed norms: {norms}")
|
||||||
|
|
||||||
|
|
||||||
@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
|
@Model.register(
|
||||||
|
"LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM"
|
||||||
|
)
|
||||||
class LlamaModel(Model):
|
class LlamaModel(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||||
|
|
||||||
|
@ -1891,15 +1925,14 @@ def set_gguf_parameters(self):
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||||
self.gguf_writer.add_rope_scaling_factor(1.0)
|
self.gguf_writer.add_rope_scaling_factor(1.0)
|
||||||
|
|
||||||
def weight_quant(self, weight):
|
def weight_quant(self, weight: Tensor) -> Tensor:
|
||||||
dtype = weight.dtype
|
dtype = weight.dtype
|
||||||
weight = weight.float()
|
weight = weight.float()
|
||||||
s = 1 / weight.abs().mean().clamp(min=1e-5)
|
scale = weight.abs().mean().clamp(min=1e-5)
|
||||||
weight = (weight * s).round().clamp(-1, 1) / s
|
iscale = 1 / scale
|
||||||
scale = weight.abs().max().unsqueeze(0)
|
|
||||||
weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
|
result = (weight * iscale).round().clamp(-1, 1) / iscale
|
||||||
weight = torch.sign(weight).type(dtype)
|
return result.type(dtype)
|
||||||
return weight.type(dtype), scale.type(torch.float32)
|
|
||||||
|
|
||||||
def modify_tensors(
|
def modify_tensors(
|
||||||
self, data_torch: Tensor, name: str, bid: int | None
|
self, data_torch: Tensor, name: str, bid: int | None
|
||||||
|
@ -1919,10 +1952,8 @@ def modify_tensors(
|
||||||
]
|
]
|
||||||
):
|
):
|
||||||
|
|
||||||
weight_torch, scale_torch = self.weight_quant(data_torch)
|
data_torch = self.weight_quant(data_torch)
|
||||||
yield (new_name, weight_torch)
|
|
||||||
yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
|
|
||||||
else:
|
|
||||||
yield (new_name, data_torch)
|
yield (new_name, data_torch)
|
||||||
|
|
||||||
|
|
||||||
|
@ -2099,6 +2130,79 @@ def modify_tensors(
|
||||||
return [(self.map_tensor_name(name), data_torch)]
|
return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("MiniCPM3ForCausalLM")
|
||||||
|
class MiniCPM3Model(Model):
|
||||||
|
model_arch = gguf.MODEL_ARCH.MINICPM3
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
hparams = self.hparams
|
||||||
|
|
||||||
|
rope_dims = hparams["qk_rope_head_dim"]
|
||||||
|
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
||||||
|
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||||
|
self.gguf_writer.add_block_count(self.block_count)
|
||||||
|
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||||
|
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||||
|
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
|
||||||
|
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||||
|
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||||
|
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
|
||||||
|
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
|
||||||
|
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
|
||||||
|
self.gguf_writer.add_key_length(
|
||||||
|
hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]
|
||||||
|
)
|
||||||
|
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
||||||
|
|
||||||
|
rope_scaling = self.find_hparam(["rope_scaling"], True)
|
||||||
|
if rope_scaling is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
long_factors = rope_scaling.get("long_factor", None)
|
||||||
|
short_factors = rope_scaling.get("short_factor", None)
|
||||||
|
|
||||||
|
if long_factors is None or short_factors is None:
|
||||||
|
raise KeyError(
|
||||||
|
"Missing the required key rope_scaling.long_factor or rope_scaling_short_factor"
|
||||||
|
)
|
||||||
|
|
||||||
|
if (
|
||||||
|
len(long_factors) != len(short_factors)
|
||||||
|
or len(long_factors) != rope_dims / 2
|
||||||
|
):
|
||||||
|
raise ValueError(
|
||||||
|
f"The length of rope long and short factors must be {rope_dims / 2}"
|
||||||
|
)
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(
|
||||||
|
gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight",
|
||||||
|
np.array(long_factors, dtype=np.float32),
|
||||||
|
)
|
||||||
|
self.gguf_writer.add_tensor(
|
||||||
|
gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight",
|
||||||
|
np.array(short_factors, dtype=np.float32),
|
||||||
|
)
|
||||||
|
|
||||||
|
def set_vocab(self):
|
||||||
|
self._set_vocab_llama_hf()
|
||||||
|
|
||||||
|
def _reverse_hf_permute(
|
||||||
|
self, weights: Tensor, n_head: int, n_kv_head: int | None = None
|
||||||
|
) -> Tensor:
|
||||||
|
if n_kv_head is not None and n_head != n_kv_head:
|
||||||
|
n_head //= n_kv_head
|
||||||
|
|
||||||
|
return (
|
||||||
|
weights.reshape(
|
||||||
|
n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]
|
||||||
|
)
|
||||||
|
.swapaxes(1, 2)
|
||||||
|
.reshape(weights.shape)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@Model.register("QWenLMHeadModel")
|
@Model.register("QWenLMHeadModel")
|
||||||
class QwenModel(Model):
|
class QwenModel(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.QWEN
|
model_arch = gguf.MODEL_ARCH.QWEN
|
||||||
|
@ -3087,6 +3191,100 @@ class StarCoder2Model(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.STARCODER2
|
model_arch = gguf.MODEL_ARCH.STARCODER2
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("Rwkv6ForCausalLM")
|
||||||
|
class Rwkv6Model(Model):
|
||||||
|
model_arch = gguf.MODEL_ARCH.RWKV6
|
||||||
|
|
||||||
|
def set_vocab(self):
|
||||||
|
assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
|
||||||
|
vocab_size = self.hparams.get("vocab_size", 65536)
|
||||||
|
|
||||||
|
tokens: list[bytes] = ["<s>".encode("utf-8")]
|
||||||
|
toktypes: list[int] = [gguf.TokenType.CONTROL]
|
||||||
|
|
||||||
|
with open(
|
||||||
|
self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8"
|
||||||
|
) as f:
|
||||||
|
lines = f.readlines()
|
||||||
|
for line in lines:
|
||||||
|
parts = line.split(" ")
|
||||||
|
assert len(parts) >= 3
|
||||||
|
token, token_len = ast.literal_eval(" ".join(parts[1:-1])), int(
|
||||||
|
parts[-1]
|
||||||
|
)
|
||||||
|
token = token.encode("utf-8") if isinstance(token, str) else token
|
||||||
|
assert isinstance(token, bytes)
|
||||||
|
assert len(token) == token_len
|
||||||
|
token_text: str = repr(token)[2:-1]
|
||||||
|
tokens.append(token_text.encode("utf-8"))
|
||||||
|
toktypes.append(gguf.TokenType.NORMAL)
|
||||||
|
remainder = vocab_size - len(tokens)
|
||||||
|
assert remainder >= 0
|
||||||
|
for i in range(len(tokens), vocab_size):
|
||||||
|
tokens.append(f"[PAD{i}]".encode("utf-8"))
|
||||||
|
toktypes.append(gguf.TokenType.UNUSED)
|
||||||
|
|
||||||
|
self.gguf_writer.add_tokenizer_model("rwkv")
|
||||||
|
self.gguf_writer.add_token_list(tokens)
|
||||||
|
self.gguf_writer.add_token_types(toktypes)
|
||||||
|
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
|
||||||
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
block_count = self.hparams["num_hidden_layers"]
|
||||||
|
head_size = self.hparams["head_size"]
|
||||||
|
hidden_size = self.hparams["hidden_size"]
|
||||||
|
layer_norm_eps = self.hparams["layer_norm_epsilon"]
|
||||||
|
rescale_every_n_layers = self.hparams["rescale_every"]
|
||||||
|
intermediate_size = (
|
||||||
|
self.hparams["intermediate_size"]
|
||||||
|
if self.hparams["intermediate_size"] is not None
|
||||||
|
else int((hidden_size * 3.5) // 32 * 32)
|
||||||
|
)
|
||||||
|
time_mix_extra_dim = 64 if hidden_size == 4096 else 32
|
||||||
|
time_decay_extra_dim = 128 if hidden_size == 4096 else 64
|
||||||
|
|
||||||
|
self.gguf_writer.add_context_length(1048576)
|
||||||
|
self.gguf_writer.add_embedding_length(hidden_size)
|
||||||
|
self.gguf_writer.add_block_count(block_count)
|
||||||
|
self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
|
||||||
|
self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
|
||||||
|
self.gguf_writer.add_wkv_head_size(head_size)
|
||||||
|
self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
|
||||||
|
self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
|
||||||
|
self.gguf_writer.add_feed_forward_length(intermediate_size)
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
|
self.gguf_writer.add_head_count(0)
|
||||||
|
|
||||||
|
def modify_tensors(
|
||||||
|
self, data_torch: Tensor, name: str, bid: int | None
|
||||||
|
) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
new_name = self.map_tensor_name(name)
|
||||||
|
|
||||||
|
if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
|
||||||
|
new_name += ".weight"
|
||||||
|
|
||||||
|
if (
|
||||||
|
new_name.endswith("time_mix_w1.weight")
|
||||||
|
or new_name.endswith("time_mix_decay_w1.weight")
|
||||||
|
or new_name.endswith("time_mix_decay_w2.weight")
|
||||||
|
):
|
||||||
|
data_torch = data_torch.transpose(0, 1)
|
||||||
|
|
||||||
|
if new_name.endswith("time_mix_w2.weight"):
|
||||||
|
data_torch = data_torch.permute(0, 2, 1)
|
||||||
|
|
||||||
|
rescale_every_n_layers = self.hparams["rescale_every"]
|
||||||
|
if rescale_every_n_layers > 0:
|
||||||
|
if new_name.endswith("time_mix_output.weight") or new_name.endswith(
|
||||||
|
"channel_mix_value.weight"
|
||||||
|
):
|
||||||
|
data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
|
||||||
|
|
||||||
|
yield (new_name, data_torch)
|
||||||
|
|
||||||
|
|
||||||
@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
|
@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
|
||||||
class MambaModel(Model):
|
class MambaModel(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.MAMBA
|
model_arch = gguf.MODEL_ARCH.MAMBA
|
||||||
|
@ -3216,6 +3414,65 @@ def modify_tensors(
|
||||||
return [(self.map_tensor_name(name), data_torch)]
|
return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("OlmoeForCausalLM")
|
||||||
|
class OlmoeModel(Model):
|
||||||
|
model_arch = gguf.MODEL_ARCH.OLMOE
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
super().set_gguf_parameters()
|
||||||
|
self.gguf_writer.add_layer_norm_rms_eps(1e-5)
|
||||||
|
if (n_experts := self.hparams.get("num_experts")) is not None:
|
||||||
|
self.gguf_writer.add_expert_count(n_experts)
|
||||||
|
|
||||||
|
_experts: list[dict[str, Tensor]] | None = None
|
||||||
|
|
||||||
|
def modify_tensors(
|
||||||
|
self, data_torch: Tensor, name: str, bid: int | None
|
||||||
|
) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
|
||||||
|
if name.find("experts") != -1:
|
||||||
|
n_experts = self.hparams["num_experts"]
|
||||||
|
assert bid is not None
|
||||||
|
|
||||||
|
if self._experts is None:
|
||||||
|
self._experts = [{} for _ in range(self.block_count)]
|
||||||
|
|
||||||
|
self._experts[bid][name] = data_torch
|
||||||
|
|
||||||
|
if len(self._experts[bid]) >= n_experts * 3:
|
||||||
|
tensors: list[tuple[str, Tensor]] = []
|
||||||
|
|
||||||
|
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
||||||
|
datas: list[Tensor] = []
|
||||||
|
|
||||||
|
for xid in range(n_experts):
|
||||||
|
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
||||||
|
datas.append(self._experts[bid][ename])
|
||||||
|
del self._experts[bid][ename]
|
||||||
|
|
||||||
|
data_torch = torch.stack(datas, dim=0)
|
||||||
|
|
||||||
|
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
||||||
|
|
||||||
|
new_name = self.map_tensor_name(merged_name)
|
||||||
|
|
||||||
|
tensors.append((new_name, data_torch))
|
||||||
|
return tensors
|
||||||
|
else:
|
||||||
|
return []
|
||||||
|
|
||||||
|
return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
|
def prepare_tensors(self):
|
||||||
|
super().prepare_tensors()
|
||||||
|
|
||||||
|
if self._experts is not None:
|
||||||
|
|
||||||
|
experts = [k for d in self._experts for k in d.keys()]
|
||||||
|
if len(experts) > 0:
|
||||||
|
raise ValueError(f"Unprocessed experts: {experts}")
|
||||||
|
|
||||||
|
|
||||||
@Model.register("JinaBertModel", "JinaBertForMaskedLM")
|
@Model.register("JinaBertModel", "JinaBertForMaskedLM")
|
||||||
class JinaBertV2Model(BertModel):
|
class JinaBertV2Model(BertModel):
|
||||||
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
|
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
|
||||||
|
@ -4122,7 +4379,7 @@ def set_vocab(self):
|
||||||
if len(token) == 1:
|
if len(token) == 1:
|
||||||
continue
|
continue
|
||||||
merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
|
merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
|
||||||
assert 2 <= len(merged) <= 7
|
assert len(merged) >= 2 and len(merged) <= 7
|
||||||
merges.append(" ".join(map(ChatGLMModel.token_bytes_to_string, merged)))
|
merges.append(" ".join(map(ChatGLMModel.token_bytes_to_string, merged)))
|
||||||
|
|
||||||
added_vocab = tokenizer.get_added_vocab()
|
added_vocab = tokenizer.get_added_vocab()
|
||||||
|
@ -4334,6 +4591,152 @@ def prepare_tensors(self):
|
||||||
super().prepare_tensors()
|
super().prepare_tensors()
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("GraniteForCausalLM")
|
||||||
|
class GraniteModel(LlamaModel):
|
||||||
|
"""Conversion for IBM's GraniteForCausalLM"""
|
||||||
|
|
||||||
|
model_arch = gguf.MODEL_ARCH.GRANITE
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
"""Granite uses standard llama parameters with the following differences:
|
||||||
|
|
||||||
|
- No head_dim support
|
||||||
|
- New multiplier params:
|
||||||
|
- attention_scale
|
||||||
|
- embedding_scale
|
||||||
|
- residual_scale
|
||||||
|
- logits_scaling
|
||||||
|
"""
|
||||||
|
if head_dim := self.hparams.pop("head_dim", None):
|
||||||
|
logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
|
||||||
|
super().set_gguf_parameters()
|
||||||
|
|
||||||
|
if attention_scale := self.hparams.get("attention_multiplier"):
|
||||||
|
self.gguf_writer.add_attention_scale(attention_scale)
|
||||||
|
if embedding_scale := self.hparams.get("embedding_multiplier"):
|
||||||
|
self.gguf_writer.add_embedding_scale(embedding_scale)
|
||||||
|
if residual_scale := self.hparams.get("residual_multiplier"):
|
||||||
|
self.gguf_writer.add_residual_scale(residual_scale)
|
||||||
|
if logits_scaling := self.hparams.get("logits_scaling"):
|
||||||
|
self.gguf_writer.add_logit_scale(logits_scaling)
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("JambaForCausalLM")
|
||||||
|
class JambaModel(Model):
|
||||||
|
model_arch = gguf.MODEL_ARCH.JAMBA
|
||||||
|
|
||||||
|
def get_vocab_base_pre(self, tokenizer) -> str:
|
||||||
|
del tokenizer
|
||||||
|
|
||||||
|
return "gpt-2"
|
||||||
|
|
||||||
|
def set_vocab(self):
|
||||||
|
if (self.dir_model / "tokenizer.model").is_file():
|
||||||
|
|
||||||
|
self._set_vocab_sentencepiece()
|
||||||
|
else:
|
||||||
|
|
||||||
|
self._set_vocab_gpt2()
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
|
||||||
|
d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4
|
||||||
|
d_inner = self.hparams["mamba_expand"] * d_model
|
||||||
|
d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16
|
||||||
|
|
||||||
|
dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(
|
||||||
|
d_model // -16
|
||||||
|
)
|
||||||
|
rms_norm_eps = (
|
||||||
|
self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True)
|
||||||
|
or 1e-6
|
||||||
|
)
|
||||||
|
n_kv_head = self.hparams["num_key_value_heads"]
|
||||||
|
attn_offset = self.hparams["attn_layer_offset"]
|
||||||
|
attn_period = self.hparams["attn_layer_period"]
|
||||||
|
n_kv_vec = [0 for _ in range(attn_offset)] + [
|
||||||
|
n_kv_head if (i - attn_offset) % attn_period == 0 else 0
|
||||||
|
for i in range(attn_offset, self.block_count)
|
||||||
|
]
|
||||||
|
|
||||||
|
self.gguf_writer.add_block_count(self.block_count)
|
||||||
|
self.gguf_writer.add_context_length(
|
||||||
|
self.find_hparam(["max_position_embeddings", "n_ctx"])
|
||||||
|
)
|
||||||
|
self.gguf_writer.add_embedding_length(d_model)
|
||||||
|
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||||
|
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
||||||
|
self.gguf_writer.add_head_count_kv(n_kv_vec)
|
||||||
|
self.gguf_writer.add_ssm_conv_kernel(d_conv)
|
||||||
|
self.gguf_writer.add_ssm_inner_size(d_inner)
|
||||||
|
self.gguf_writer.add_ssm_state_size(d_state)
|
||||||
|
self.gguf_writer.add_ssm_time_step_rank(dt_rank)
|
||||||
|
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
|
||||||
|
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
|
||||||
|
self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
|
_experts: list[dict[str, Tensor]] | None = None
|
||||||
|
|
||||||
|
def modify_tensors(
|
||||||
|
self, data_torch: Tensor, name: str, bid: int | None
|
||||||
|
) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
|
||||||
|
name = name.replace(".moe.", ".feed_forward.")
|
||||||
|
if bid is not None:
|
||||||
|
moe_offset = self.hparams["expert_layer_offset"]
|
||||||
|
moe_period = self.hparams["expert_layer_period"]
|
||||||
|
|
||||||
|
if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0):
|
||||||
|
name = name.replace(".experts.0.", ".")
|
||||||
|
|
||||||
|
if ".feed_forward.experts." in name:
|
||||||
|
n_experts = self.hparams["num_experts"]
|
||||||
|
|
||||||
|
assert bid is not None
|
||||||
|
|
||||||
|
if self._experts is None:
|
||||||
|
self._experts = [{} for _ in range(self.block_count)]
|
||||||
|
|
||||||
|
self._experts[bid][name] = data_torch
|
||||||
|
|
||||||
|
if len(self._experts[bid]) >= n_experts * 3:
|
||||||
|
|
||||||
|
for wid in ["down_proj", "gate_proj", "up_proj"]:
|
||||||
|
datas: list[Tensor] = []
|
||||||
|
|
||||||
|
for xid in range(n_experts):
|
||||||
|
ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight"
|
||||||
|
datas.append(self._experts[bid][ename])
|
||||||
|
del self._experts[bid][ename]
|
||||||
|
|
||||||
|
data_torch = torch.stack(datas, dim=0)
|
||||||
|
|
||||||
|
merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight"
|
||||||
|
|
||||||
|
new_name = self.map_tensor_name(merged_name)
|
||||||
|
|
||||||
|
yield new_name, data_torch
|
||||||
|
return
|
||||||
|
|
||||||
|
new_name = self.map_tensor_name(name)
|
||||||
|
|
||||||
|
if name.endswith(".A_log"):
|
||||||
|
logger.debug("A_log --> A ==> " + new_name)
|
||||||
|
data_torch = -torch.exp(data_torch)
|
||||||
|
|
||||||
|
yield new_name, data_torch
|
||||||
|
|
||||||
|
def prepare_tensors(self):
|
||||||
|
super().prepare_tensors()
|
||||||
|
|
||||||
|
if self._experts is not None:
|
||||||
|
|
||||||
|
experts = [k for d in self._experts for k in d.keys()]
|
||||||
|
if len(experts) > 0:
|
||||||
|
raise ValueError(f"Unprocessed experts: {experts}")
|
||||||
|
|
||||||
|
|
||||||
class LazyTorchTensor(gguf.LazyBase):
|
class LazyTorchTensor(gguf.LazyBase):
|
||||||
_tensor_type = torch.Tensor
|
_tensor_type = torch.Tensor
|
||||||
|
|
||||||
|
@ -4399,26 +4802,67 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
def parse_args() -> argparse.Namespace:
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser(
|
||||||
parser.add_argument("--vocab-only", action="store_true")
|
description="Convert a huggingface model to a GGML compatible file"
|
||||||
parser.add_argument("--outfile", type=Path)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--vocab-only",
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--outfile",
|
||||||
|
type=Path,
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--outtype",
|
"--outtype",
|
||||||
type=str,
|
type=str,
|
||||||
choices=["f32", "f16", "bf16", "q8_0", "auto"],
|
choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"],
|
||||||
default="f16",
|
default="f16",
|
||||||
)
|
)
|
||||||
parser.add_argument("--bigendian", action="store_true")
|
parser.add_argument(
|
||||||
parser.add_argument("model", type=Path)
|
"--bigendian",
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"model",
|
||||||
|
type=Path,
|
||||||
|
)
|
||||||
parser.add_argument("--use-temp-file", action="store_true")
|
parser.add_argument("--use-temp-file", action="store_true")
|
||||||
parser.add_argument("--no-lazy", action="store_true")
|
parser.add_argument(
|
||||||
parser.add_argument("--model-name", type=str, default=None)
|
"--no-lazy",
|
||||||
parser.add_argument("--verbose", action="store_true")
|
action="store_true",
|
||||||
parser.add_argument("--split-max-tensors", type=int, default=0)
|
)
|
||||||
parser.add_argument("--split-max-size", type=str, default="0")
|
parser.add_argument(
|
||||||
parser.add_argument("--dry-run", action="store_true")
|
"--model-name",
|
||||||
parser.add_argument("--no-tensor-first-split", action="store_true")
|
type=str,
|
||||||
parser.add_argument("--metadata", type=Path)
|
default=None,
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--verbose",
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--split-max-tensors",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--split-max-size",
|
||||||
|
type=str,
|
||||||
|
default="0",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--dry-run",
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--no-tensor-first-split",
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--metadata",
|
||||||
|
type=Path,
|
||||||
|
)
|
||||||
|
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
@ -4462,6 +4906,8 @@ def main() -> None:
|
||||||
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
||||||
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
||||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
||||||
|
"tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
|
||||||
|
"tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
|
||||||
"auto": gguf.LlamaFileType.GUESSED,
|
"auto": gguf.LlamaFileType.GUESSED,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -78,6 +78,11 @@ class LLM:
|
||||||
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
||||||
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
|
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
|
||||||
FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
|
FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
|
||||||
|
RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers"
|
||||||
|
TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim"
|
||||||
|
TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
|
||||||
|
RESIDUAL_SCALE = "{arch}.residual_scale"
|
||||||
|
EMBEDDING_SCALE = "{arch}.embedding_scale"
|
||||||
|
|
||||||
class Attention:
|
class Attention:
|
||||||
HEAD_COUNT = "{arch}.attention.head_count"
|
HEAD_COUNT = "{arch}.attention.head_count"
|
||||||
|
@ -93,6 +98,7 @@ class Attention:
|
||||||
KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
|
KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
|
||||||
REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
|
REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
|
||||||
SLIDING_WINDOW = "{arch}.attention.sliding_window"
|
SLIDING_WINDOW = "{arch}.attention.sliding_window"
|
||||||
|
SCALE = "{arch}.attention.scale"
|
||||||
|
|
||||||
class Rope:
|
class Rope:
|
||||||
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||||
|
@ -114,6 +120,10 @@ class SSM:
|
||||||
INNER_SIZE = "{arch}.ssm.inner_size"
|
INNER_SIZE = "{arch}.ssm.inner_size"
|
||||||
STATE_SIZE = "{arch}.ssm.state_size"
|
STATE_SIZE = "{arch}.ssm.state_size"
|
||||||
TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
|
TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
|
||||||
|
DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"
|
||||||
|
|
||||||
|
class WKV:
|
||||||
|
HEAD_SIZE = "{arch}.wkv.head_size"
|
||||||
|
|
||||||
class Tokenizer:
|
class Tokenizer:
|
||||||
MODEL = "tokenizer.ggml.model"
|
MODEL = "tokenizer.ggml.model"
|
||||||
|
@ -183,14 +193,18 @@ class MODEL_ARCH(IntEnum):
|
||||||
ORION = auto()
|
ORION = auto()
|
||||||
INTERNLM2 = auto()
|
INTERNLM2 = auto()
|
||||||
MINICPM = auto()
|
MINICPM = auto()
|
||||||
|
MINICPM3 = auto()
|
||||||
GEMMA = auto()
|
GEMMA = auto()
|
||||||
GEMMA2 = auto()
|
GEMMA2 = auto()
|
||||||
STARCODER2 = auto()
|
STARCODER2 = auto()
|
||||||
|
RWKV6 = auto()
|
||||||
MAMBA = auto()
|
MAMBA = auto()
|
||||||
|
JAMBA = auto()
|
||||||
XVERSE = auto()
|
XVERSE = auto()
|
||||||
COMMAND_R = auto()
|
COMMAND_R = auto()
|
||||||
DBRX = auto()
|
DBRX = auto()
|
||||||
OLMO = auto()
|
OLMO = auto()
|
||||||
|
OLMOE = auto()
|
||||||
OPENELM = auto()
|
OPENELM = auto()
|
||||||
ARCTIC = auto()
|
ARCTIC = auto()
|
||||||
DEEPSEEK2 = auto()
|
DEEPSEEK2 = auto()
|
||||||
|
@ -201,6 +215,7 @@ class MODEL_ARCH(IntEnum):
|
||||||
JAIS = auto()
|
JAIS = auto()
|
||||||
NEMOTRON = auto()
|
NEMOTRON = auto()
|
||||||
EXAONE = auto()
|
EXAONE = auto()
|
||||||
|
GRANITE = auto()
|
||||||
|
|
||||||
|
|
||||||
class MODEL_TENSOR(IntEnum):
|
class MODEL_TENSOR(IntEnum):
|
||||||
|
@ -246,9 +261,35 @@ class MODEL_TENSOR(IntEnum):
|
||||||
SSM_CONV1D = auto()
|
SSM_CONV1D = auto()
|
||||||
SSM_X = auto()
|
SSM_X = auto()
|
||||||
SSM_DT = auto()
|
SSM_DT = auto()
|
||||||
|
SSM_DT_NORM = auto()
|
||||||
SSM_A = auto()
|
SSM_A = auto()
|
||||||
|
SSM_B_NORM = auto()
|
||||||
|
SSM_C_NORM = auto()
|
||||||
SSM_D = auto()
|
SSM_D = auto()
|
||||||
SSM_OUT = auto()
|
SSM_OUT = auto()
|
||||||
|
TIME_MIX_W1 = auto()
|
||||||
|
TIME_MIX_W2 = auto()
|
||||||
|
TIME_MIX_LERP_X = auto()
|
||||||
|
TIME_MIX_LERP_K = auto()
|
||||||
|
TIME_MIX_LERP_V = auto()
|
||||||
|
TIME_MIX_LERP_R = auto()
|
||||||
|
TIME_MIX_LERP_G = auto()
|
||||||
|
TIME_MIX_LERP_W = auto()
|
||||||
|
TIME_MIX_FIRST = auto()
|
||||||
|
TIME_MIX_DECAY = auto()
|
||||||
|
TIME_MIX_DECAY_W1 = auto()
|
||||||
|
TIME_MIX_DECAY_W2 = auto()
|
||||||
|
TIME_MIX_KEY = auto()
|
||||||
|
TIME_MIX_VALUE = auto()
|
||||||
|
TIME_MIX_RECEPTANCE = auto()
|
||||||
|
TIME_MIX_GATE = auto()
|
||||||
|
TIME_MIX_LN = auto()
|
||||||
|
TIME_MIX_OUTPUT = auto()
|
||||||
|
CHANNEL_MIX_LERP_K = auto()
|
||||||
|
CHANNEL_MIX_LERP_R = auto()
|
||||||
|
CHANNEL_MIX_KEY = auto()
|
||||||
|
CHANNEL_MIX_RECEPTANCE = auto()
|
||||||
|
CHANNEL_MIX_VALUE = auto()
|
||||||
ATTN_Q_A = auto()
|
ATTN_Q_A = auto()
|
||||||
ATTN_Q_B = auto()
|
ATTN_Q_B = auto()
|
||||||
ATTN_KV_A_MQA = auto()
|
ATTN_KV_A_MQA = auto()
|
||||||
|
@ -313,14 +354,18 @@ class MODEL_TENSOR(IntEnum):
|
||||||
MODEL_ARCH.ORION: "orion",
|
MODEL_ARCH.ORION: "orion",
|
||||||
MODEL_ARCH.INTERNLM2: "internlm2",
|
MODEL_ARCH.INTERNLM2: "internlm2",
|
||||||
MODEL_ARCH.MINICPM: "minicpm",
|
MODEL_ARCH.MINICPM: "minicpm",
|
||||||
|
MODEL_ARCH.MINICPM3: "minicpm3",
|
||||||
MODEL_ARCH.GEMMA: "gemma",
|
MODEL_ARCH.GEMMA: "gemma",
|
||||||
MODEL_ARCH.GEMMA2: "gemma2",
|
MODEL_ARCH.GEMMA2: "gemma2",
|
||||||
MODEL_ARCH.STARCODER2: "starcoder2",
|
MODEL_ARCH.STARCODER2: "starcoder2",
|
||||||
|
MODEL_ARCH.RWKV6: "rwkv6",
|
||||||
MODEL_ARCH.MAMBA: "mamba",
|
MODEL_ARCH.MAMBA: "mamba",
|
||||||
|
MODEL_ARCH.JAMBA: "jamba",
|
||||||
MODEL_ARCH.XVERSE: "xverse",
|
MODEL_ARCH.XVERSE: "xverse",
|
||||||
MODEL_ARCH.COMMAND_R: "command-r",
|
MODEL_ARCH.COMMAND_R: "command-r",
|
||||||
MODEL_ARCH.DBRX: "dbrx",
|
MODEL_ARCH.DBRX: "dbrx",
|
||||||
MODEL_ARCH.OLMO: "olmo",
|
MODEL_ARCH.OLMO: "olmo",
|
||||||
|
MODEL_ARCH.OLMOE: "olmoe",
|
||||||
MODEL_ARCH.OPENELM: "openelm",
|
MODEL_ARCH.OPENELM: "openelm",
|
||||||
MODEL_ARCH.ARCTIC: "arctic",
|
MODEL_ARCH.ARCTIC: "arctic",
|
||||||
MODEL_ARCH.DEEPSEEK2: "deepseek2",
|
MODEL_ARCH.DEEPSEEK2: "deepseek2",
|
||||||
|
@ -331,6 +376,7 @@ class MODEL_TENSOR(IntEnum):
|
||||||
MODEL_ARCH.JAIS: "jais",
|
MODEL_ARCH.JAIS: "jais",
|
||||||
MODEL_ARCH.NEMOTRON: "nemotron",
|
MODEL_ARCH.NEMOTRON: "nemotron",
|
||||||
MODEL_ARCH.EXAONE: "exaone",
|
MODEL_ARCH.EXAONE: "exaone",
|
||||||
|
MODEL_ARCH.GRANITE: "granite",
|
||||||
}
|
}
|
||||||
|
|
||||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
|
@ -376,9 +422,35 @@ class MODEL_TENSOR(IntEnum):
|
||||||
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
|
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
|
||||||
MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
|
MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
|
||||||
MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
|
MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
|
||||||
|
MODEL_TENSOR.SSM_DT_NORM: "blk.{bid}.ssm_dt_norm",
|
||||||
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
|
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
|
||||||
|
MODEL_TENSOR.SSM_B_NORM: "blk.{bid}.ssm_b_norm",
|
||||||
|
MODEL_TENSOR.SSM_C_NORM: "blk.{bid}.ssm_c_norm",
|
||||||
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
|
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
|
||||||
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
|
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
|
||||||
|
MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
|
||||||
|
MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2",
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x",
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k",
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v",
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r",
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g",
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w",
|
||||||
|
MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first",
|
||||||
|
MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay",
|
||||||
|
MODEL_TENSOR.TIME_MIX_DECAY_W1: "blk.{bid}.time_mix_decay_w1",
|
||||||
|
MODEL_TENSOR.TIME_MIX_DECAY_W2: "blk.{bid}.time_mix_decay_w2",
|
||||||
|
MODEL_TENSOR.TIME_MIX_KEY: "blk.{bid}.time_mix_key",
|
||||||
|
MODEL_TENSOR.TIME_MIX_VALUE: "blk.{bid}.time_mix_value",
|
||||||
|
MODEL_TENSOR.TIME_MIX_RECEPTANCE: "blk.{bid}.time_mix_receptance",
|
||||||
|
MODEL_TENSOR.TIME_MIX_GATE: "blk.{bid}.time_mix_gate",
|
||||||
|
MODEL_TENSOR.TIME_MIX_LN: "blk.{bid}.time_mix_ln",
|
||||||
|
MODEL_TENSOR.TIME_MIX_OUTPUT: "blk.{bid}.time_mix_output",
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_LERP_K: "blk.{bid}.channel_mix_lerp_k",
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_LERP_R: "blk.{bid}.channel_mix_lerp_r",
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_KEY: "blk.{bid}.channel_mix_key",
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: "blk.{bid}.channel_mix_receptance",
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value",
|
||||||
MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
|
MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
|
||||||
MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
|
MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
|
||||||
MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
|
MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
|
||||||
|
@ -792,6 +864,23 @@ class MODEL_TENSOR(IntEnum):
|
||||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||||
MODEL_TENSOR.FFN_UP_EXP,
|
MODEL_TENSOR.FFN_UP_EXP,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.MINICPM3: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_Q_A,
|
||||||
|
MODEL_TENSOR.ATTN_Q_B,
|
||||||
|
MODEL_TENSOR.ATTN_KV_A_MQA,
|
||||||
|
MODEL_TENSOR.ATTN_KV_B,
|
||||||
|
MODEL_TENSOR.ATTN_Q_A_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_KV_A_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_GATE,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
],
|
||||||
MODEL_ARCH.GEMMA: [
|
MODEL_ARCH.GEMMA: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
MODEL_TENSOR.OUTPUT_NORM,
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
@ -835,6 +924,37 @@ class MODEL_TENSOR(IntEnum):
|
||||||
MODEL_TENSOR.FFN_DOWN,
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.RWKV6: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_NORM_2,
|
||||||
|
MODEL_TENSOR.TIME_MIX_W1,
|
||||||
|
MODEL_TENSOR.TIME_MIX_W2,
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_X,
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_K,
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_V,
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_R,
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_G,
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_W,
|
||||||
|
MODEL_TENSOR.TIME_MIX_FIRST,
|
||||||
|
MODEL_TENSOR.TIME_MIX_DECAY,
|
||||||
|
MODEL_TENSOR.TIME_MIX_DECAY_W1,
|
||||||
|
MODEL_TENSOR.TIME_MIX_DECAY_W2,
|
||||||
|
MODEL_TENSOR.TIME_MIX_KEY,
|
||||||
|
MODEL_TENSOR.TIME_MIX_VALUE,
|
||||||
|
MODEL_TENSOR.TIME_MIX_RECEPTANCE,
|
||||||
|
MODEL_TENSOR.TIME_MIX_GATE,
|
||||||
|
MODEL_TENSOR.TIME_MIX_LN,
|
||||||
|
MODEL_TENSOR.TIME_MIX_OUTPUT,
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_LERP_K,
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_LERP_R,
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_KEY,
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE,
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_VALUE,
|
||||||
|
],
|
||||||
MODEL_ARCH.MAMBA: [
|
MODEL_ARCH.MAMBA: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
MODEL_TENSOR.OUTPUT_NORM,
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
@ -848,6 +968,34 @@ class MODEL_TENSOR(IntEnum):
|
||||||
MODEL_TENSOR.SSM_D,
|
MODEL_TENSOR.SSM_D,
|
||||||
MODEL_TENSOR.SSM_OUT,
|
MODEL_TENSOR.SSM_OUT,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.JAMBA: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_Q,
|
||||||
|
MODEL_TENSOR.ATTN_K,
|
||||||
|
MODEL_TENSOR.ATTN_V,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.SSM_IN,
|
||||||
|
MODEL_TENSOR.SSM_CONV1D,
|
||||||
|
MODEL_TENSOR.SSM_X,
|
||||||
|
MODEL_TENSOR.SSM_DT,
|
||||||
|
MODEL_TENSOR.SSM_DT_NORM,
|
||||||
|
MODEL_TENSOR.SSM_A,
|
||||||
|
MODEL_TENSOR.SSM_B_NORM,
|
||||||
|
MODEL_TENSOR.SSM_C_NORM,
|
||||||
|
MODEL_TENSOR.SSM_D,
|
||||||
|
MODEL_TENSOR.SSM_OUT,
|
||||||
|
MODEL_TENSOR.FFN_GATE_INP,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_GATE,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
MODEL_TENSOR.FFN_GATE_EXP,
|
||||||
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||||
|
MODEL_TENSOR.FFN_UP_EXP,
|
||||||
|
],
|
||||||
MODEL_ARCH.XVERSE: [
|
MODEL_ARCH.XVERSE: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
MODEL_TENSOR.OUTPUT_NORM,
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
@ -902,6 +1050,23 @@ class MODEL_TENSOR(IntEnum):
|
||||||
MODEL_TENSOR.FFN_DOWN,
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.OLMOE: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.ATTN_Q,
|
||||||
|
MODEL_TENSOR.ATTN_K,
|
||||||
|
MODEL_TENSOR.ATTN_V,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_Q_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_K_NORM,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_GATE_INP,
|
||||||
|
MODEL_TENSOR.FFN_GATE_EXP,
|
||||||
|
MODEL_TENSOR.FFN_UP_EXP,
|
||||||
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||||
|
],
|
||||||
MODEL_ARCH.OPENELM: [
|
MODEL_ARCH.OPENELM: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
MODEL_TENSOR.OUTPUT_NORM,
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
@ -1080,6 +1245,19 @@ class MODEL_TENSOR(IntEnum):
|
||||||
MODEL_TENSOR.FFN_DOWN,
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.GRANITE: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_Q,
|
||||||
|
MODEL_TENSOR.ATTN_K,
|
||||||
|
MODEL_TENSOR.ATTN_V,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_GATE,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
|
@ -1179,6 +1357,8 @@ class GGMLQuantizationType(IntEnum):
|
||||||
Q4_0_4_4 = 31
|
Q4_0_4_4 = 31
|
||||||
Q4_0_4_8 = 32
|
Q4_0_4_8 = 32
|
||||||
Q4_0_8_8 = 33
|
Q4_0_8_8 = 33
|
||||||
|
TQ1_0 = 34
|
||||||
|
TQ2_0 = 35
|
||||||
|
|
||||||
|
|
||||||
class LlamaFileType(IntEnum):
|
class LlamaFileType(IntEnum):
|
||||||
|
@ -1216,6 +1396,8 @@ class LlamaFileType(IntEnum):
|
||||||
MOSTLY_Q4_0_4_4 = 33
|
MOSTLY_Q4_0_4_4 = 33
|
||||||
MOSTLY_Q4_0_4_8 = 34
|
MOSTLY_Q4_0_4_8 = 34
|
||||||
MOSTLY_Q4_0_8_8 = 35
|
MOSTLY_Q4_0_8_8 = 35
|
||||||
|
MOSTLY_TQ1_0 = 36
|
||||||
|
MOSTLY_TQ2_0 = 37
|
||||||
|
|
||||||
GUESSED = 1024
|
GUESSED = 1024
|
||||||
|
|
||||||
|
@ -1291,6 +1473,8 @@ def get_type(val: Any) -> GGUFValueType:
|
||||||
GGMLQuantizationType.Q4_0_4_4: (32, 2 + 16),
|
GGMLQuantizationType.Q4_0_4_4: (32, 2 + 16),
|
||||||
GGMLQuantizationType.Q4_0_4_8: (32, 2 + 16),
|
GGMLQuantizationType.Q4_0_4_8: (32, 2 + 16),
|
||||||
GGMLQuantizationType.Q4_0_8_8: (32, 2 + 16),
|
GGMLQuantizationType.Q4_0_8_8: (32, 2 + 16),
|
||||||
|
GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
|
||||||
|
GGMLQuantizationType.TQ2_0: (256, 2 + 64),
|
||||||
}
|
}
|
||||||
|
|
||||||
KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE
|
KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE
|
||||||
|
@ -1330,6 +1514,7 @@ def get_type(val: Any) -> GGUFValueType:
|
||||||
KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
|
KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
|
||||||
KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
|
KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
|
||||||
KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
|
KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
|
||||||
|
KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS
|
||||||
|
|
||||||
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
|
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
|
||||||
KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE
|
KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE
|
||||||
|
|
|
@ -32,7 +32,6 @@
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
|
SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
|
||||||
|
|
||||||
|
|
||||||
|
@ -136,7 +135,7 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]:
|
||||||
continue
|
continue
|
||||||
elif name.endswith(".lora_b"):
|
elif name.endswith(".lora_b"):
|
||||||
if last_lora_a is None or last_lora_a[0] != name[:-1] + "a":
|
if last_lora_a is None or last_lora_a[0] != name[:-1] + "a":
|
||||||
# Bail when the LoRA pair can't be found trivially
|
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"can't measure LoRA size correctly, tensor order is unusual"
|
"can't measure LoRA size correctly, tensor order is unusual"
|
||||||
)
|
)
|
||||||
|
@ -155,14 +154,11 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]:
|
||||||
|
|
||||||
total_params += size
|
total_params += size
|
||||||
|
|
||||||
# Hopefully this should work even for variable-expert-count models
|
|
||||||
expert_count = (expert_sum // n_expert_tensors) if n_expert_tensors > 0 else 0
|
expert_count = (expert_sum // n_expert_tensors) if n_expert_tensors > 0 else 0
|
||||||
|
|
||||||
# Negate the total to signal it's likely not exact
|
|
||||||
if last_lora_a is not None:
|
if last_lora_a is not None:
|
||||||
total_params = -total_params
|
total_params = -total_params
|
||||||
|
|
||||||
# NOTE: keep the output in the same order as accepted by 'size_label' in gguf-py/gguf/utility.py
|
|
||||||
return total_params, shared_params, expert_params, expert_count
|
return total_params, shared_params, expert_params, expert_count
|
||||||
|
|
||||||
def format_shard_names(self, path: Path) -> list[Path]:
|
def format_shard_names(self, path: Path) -> list[Path]:
|
||||||
|
@ -181,7 +177,7 @@ def open_output_file(self, path: Path | None = None) -> None:
|
||||||
and self.fout is not None
|
and self.fout is not None
|
||||||
and (path is None or path == self.path)
|
and (path is None or path == self.path)
|
||||||
):
|
):
|
||||||
# allow calling this multiple times as long as the path is the same
|
|
||||||
return
|
return
|
||||||
|
|
||||||
if self.state is not WriterState.NO_FILE:
|
if self.state is not WriterState.NO_FILE:
|
||||||
|
@ -210,7 +206,7 @@ def print_plan(self) -> list[Path]:
|
||||||
if self.dry_run:
|
if self.dry_run:
|
||||||
logger.info("Dry run, not writing files")
|
logger.info("Dry run, not writing files")
|
||||||
for name in filenames:
|
for name in filenames:
|
||||||
print(name) # noqa: NP100
|
print(name)
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
return filenames
|
return filenames
|
||||||
|
@ -394,12 +390,11 @@ def add_tensor_info(
|
||||||
if tensor_dtype == np.uint8:
|
if tensor_dtype == np.uint8:
|
||||||
tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
|
tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
|
||||||
|
|
||||||
# make sure there is at least one tensor before splitting
|
|
||||||
if len(self.tensors[-1]) > 0:
|
if len(self.tensors[-1]) > 0:
|
||||||
if ( # split when over tensor limit
|
if (
|
||||||
self.split_max_tensors != 0
|
self.split_max_tensors != 0
|
||||||
and len(self.tensors[-1]) >= self.split_max_tensors
|
and len(self.tensors[-1]) >= self.split_max_tensors
|
||||||
) or ( # split when over size limit
|
) or (
|
||||||
self.split_max_size != 0
|
self.split_max_size != 0
|
||||||
and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes
|
and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes
|
||||||
> self.split_max_size
|
> self.split_max_size
|
||||||
|
@ -465,8 +460,6 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
|
||||||
|
|
||||||
fout = self.fout[file_id]
|
fout = self.fout[file_id]
|
||||||
|
|
||||||
# pop the first tensor info
|
|
||||||
# TODO: cleaner way to get the first key
|
|
||||||
first_tensor_name = [
|
first_tensor_name = [
|
||||||
name for name, _ in zip(self.tensors[file_id].keys(), range(1))
|
name for name, _ in zip(self.tensors[file_id].keys(), range(1))
|
||||||
][0]
|
][0]
|
||||||
|
@ -513,11 +506,8 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None:
|
||||||
total = sum(ti.nbytes for ti in tensors.values())
|
total = sum(ti.nbytes for ti in tensors.values())
|
||||||
shard_bar.reset(total=(total if total > 0 else None))
|
shard_bar.reset(total=(total if total > 0 else None))
|
||||||
|
|
||||||
# relying on the fact that Python dicts preserve insertion order (since 3.7)
|
|
||||||
for ti in tensors.values():
|
for ti in tensors.values():
|
||||||
assert (
|
assert ti.tensor is not None
|
||||||
ti.tensor is not None
|
|
||||||
) # can only iterate once over the tensors
|
|
||||||
assert ti.tensor.nbytes == ti.nbytes
|
assert ti.tensor.nbytes == ti.nbytes
|
||||||
ti.tensor.tofile(fout)
|
ti.tensor.tofile(fout)
|
||||||
if shard_bar is not None:
|
if shard_bar is not None:
|
||||||
|
@ -749,6 +739,24 @@ def add_expert_shared_count(self, count: int) -> None:
|
||||||
def add_expert_weights_scale(self, value: float) -> None:
|
def add_expert_weights_scale(self, value: float) -> None:
|
||||||
self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
|
self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
|
||||||
|
|
||||||
|
def add_rescale_every_n_layers(self, count: int) -> None:
|
||||||
|
self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
|
||||||
|
|
||||||
|
def add_time_mix_extra_dim(self, dim: int) -> None:
|
||||||
|
self.add_uint32(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch=self.arch), dim)
|
||||||
|
|
||||||
|
def add_time_decay_extra_dim(self, dim: int) -> None:
|
||||||
|
self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
|
||||||
|
|
||||||
|
def add_residual_scale(self, value: float) -> None:
|
||||||
|
self.add_float32(Keys.LLM.RESIDUAL_SCALE.format(arch=self.arch), value)
|
||||||
|
|
||||||
|
def add_embedding_scale(self, value: float) -> None:
|
||||||
|
self.add_float32(Keys.LLM.EMBEDDING_SCALE.format(arch=self.arch), value)
|
||||||
|
|
||||||
|
def add_wkv_head_size(self, size: int) -> None:
|
||||||
|
self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
|
||||||
|
|
||||||
def add_layer_norm_eps(self, value: float) -> None:
|
def add_layer_norm_eps(self, value: float) -> None:
|
||||||
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
|
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
|
||||||
|
|
||||||
|
@ -770,6 +778,9 @@ def add_relative_attn_buckets_count(self, value: int) -> None:
|
||||||
def add_sliding_window(self, value: int) -> None:
|
def add_sliding_window(self, value: int) -> None:
|
||||||
self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)
|
self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)
|
||||||
|
|
||||||
|
def add_attention_scale(self, value: float) -> None:
|
||||||
|
self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value)
|
||||||
|
|
||||||
def add_pooling_type(self, value: PoolingType) -> None:
|
def add_pooling_type(self, value: PoolingType) -> None:
|
||||||
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
|
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
|
||||||
|
|
||||||
|
@ -809,6 +820,9 @@ def add_ssm_state_size(self, value: int) -> None:
|
||||||
def add_ssm_time_step_rank(self, value: int) -> None:
|
def add_ssm_time_step_rank(self, value: int) -> None:
|
||||||
self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)
|
self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)
|
||||||
|
|
||||||
|
def add_ssm_dt_b_c_rms(self, value: bool) -> None:
|
||||||
|
self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)
|
||||||
|
|
||||||
def add_tokenizer_model(self, model: str) -> None:
|
def add_tokenizer_model(self, model: str) -> None:
|
||||||
self.add_string(Keys.Tokenizer.MODEL, model)
|
self.add_string(Keys.Tokenizer.MODEL, model)
|
||||||
|
|
||||||
|
@ -879,7 +893,6 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
|
||||||
name = choice.get("name", "")
|
name = choice.get("name", "")
|
||||||
template = choice.get("template")
|
template = choice.get("template")
|
||||||
|
|
||||||
# Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it
|
|
||||||
name = "".join(
|
name = "".join(
|
||||||
(c if c in ascii_letters + digits else "_" for c in name)
|
(c if c in ascii_letters + digits else "_" for c in name)
|
||||||
)
|
)
|
||||||
|
@ -915,6 +928,9 @@ def add_middle_token_id(self, id: int) -> None:
|
||||||
def add_eot_token_id(self, id: int) -> None:
|
def add_eot_token_id(self, id: int) -> None:
|
||||||
self.add_uint32(Keys.Tokenizer.EOT_ID, id)
|
self.add_uint32(Keys.Tokenizer.EOT_ID, id)
|
||||||
|
|
||||||
|
def add_eom_token_id(self, id: int) -> None:
|
||||||
|
self.add_uint32(Keys.Tokenizer.EOM_ID, id)
|
||||||
|
|
||||||
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
||||||
pack_prefix = ""
|
pack_prefix = ""
|
||||||
if not skip_pack_prefix:
|
if not skip_pack_prefix:
|
||||||
|
|
|
@ -26,6 +26,7 @@ class TensorNameMap:
|
||||||
"embedding.word_embeddings",
|
"embedding.word_embeddings",
|
||||||
"transformer.token_embeddings",
|
"transformer.token_embeddings",
|
||||||
"shared",
|
"shared",
|
||||||
|
"rwkv.embeddings",
|
||||||
),
|
),
|
||||||
MODEL_TENSOR.TOKEN_TYPES: ("embeddings.token_type_embeddings",),
|
MODEL_TENSOR.TOKEN_TYPES: ("embeddings.token_type_embeddings",),
|
||||||
MODEL_TENSOR.TOKEN_EMBD_NORM: (
|
MODEL_TENSOR.TOKEN_EMBD_NORM: (
|
||||||
|
@ -33,6 +34,7 @@ class TensorNameMap:
|
||||||
"embeddings.LayerNorm",
|
"embeddings.LayerNorm",
|
||||||
"emb_ln",
|
"emb_ln",
|
||||||
"transformer.norm",
|
"transformer.norm",
|
||||||
|
"rwkv.blocks.0.pre_ln",
|
||||||
),
|
),
|
||||||
MODEL_TENSOR.POS_EMBD: (
|
MODEL_TENSOR.POS_EMBD: (
|
||||||
"transformer.wpe",
|
"transformer.wpe",
|
||||||
|
@ -46,6 +48,7 @@ class TensorNameMap:
|
||||||
"word_embeddings_for_head",
|
"word_embeddings_for_head",
|
||||||
"lm_head.linear",
|
"lm_head.linear",
|
||||||
"output_layer",
|
"output_layer",
|
||||||
|
"head",
|
||||||
),
|
),
|
||||||
MODEL_TENSOR.OUTPUT_NORM: (
|
MODEL_TENSOR.OUTPUT_NORM: (
|
||||||
"gpt_neox.final_layer_norm",
|
"gpt_neox.final_layer_norm",
|
||||||
|
@ -63,6 +66,7 @@ class TensorNameMap:
|
||||||
"encoder.final_layernorm",
|
"encoder.final_layernorm",
|
||||||
"transformer.norm",
|
"transformer.norm",
|
||||||
"model.norm",
|
"model.norm",
|
||||||
|
"rwkv.ln_out",
|
||||||
),
|
),
|
||||||
MODEL_TENSOR.ROPE_FREQS: (
|
MODEL_TENSOR.ROPE_FREQS: (
|
||||||
"rope.freqs",
|
"rope.freqs",
|
||||||
|
@ -92,10 +96,12 @@ class TensorNameMap:
|
||||||
"transformer.blocks.{bid}.norm_attn_norm.norm_1",
|
"transformer.blocks.{bid}.norm_attn_norm.norm_1",
|
||||||
"encoder.layers.{bid}.input_layernorm",
|
"encoder.layers.{bid}.input_layernorm",
|
||||||
"transformer.layers.{bid}.attn_norm",
|
"transformer.layers.{bid}.attn_norm",
|
||||||
|
"rwkv.blocks.{bid}.ln1",
|
||||||
),
|
),
|
||||||
MODEL_TENSOR.ATTN_NORM_2: (
|
MODEL_TENSOR.ATTN_NORM_2: (
|
||||||
"transformer.h.{bid}.ln_attn",
|
"transformer.h.{bid}.ln_attn",
|
||||||
"encoder.layer.{bid}.layer_norm_1",
|
"encoder.layer.{bid}.layer_norm_1",
|
||||||
|
"rwkv.blocks.{bid}.ln2",
|
||||||
),
|
),
|
||||||
MODEL_TENSOR.ATTN_QKV: (
|
MODEL_TENSOR.ATTN_QKV: (
|
||||||
"gpt_neox.layers.{bid}.attention.query_key_value",
|
"gpt_neox.layers.{bid}.attention.query_key_value",
|
||||||
|
@ -332,31 +338,72 @@ class TensorNameMap:
|
||||||
MODEL_TENSOR.SSM_IN: (
|
MODEL_TENSOR.SSM_IN: (
|
||||||
"model.layers.{bid}.in_proj",
|
"model.layers.{bid}.in_proj",
|
||||||
"backbone.layers.{bid}.mixer.in_proj",
|
"backbone.layers.{bid}.mixer.in_proj",
|
||||||
|
"model.layers.{bid}.mamba.in_proj",
|
||||||
),
|
),
|
||||||
MODEL_TENSOR.SSM_CONV1D: (
|
MODEL_TENSOR.SSM_CONV1D: (
|
||||||
"model.layers.{bid}.conv1d",
|
"model.layers.{bid}.conv1d",
|
||||||
"backbone.layers.{bid}.mixer.conv1d",
|
"backbone.layers.{bid}.mixer.conv1d",
|
||||||
|
"model.layers.{bid}.mamba.conv1d",
|
||||||
),
|
),
|
||||||
MODEL_TENSOR.SSM_X: (
|
MODEL_TENSOR.SSM_X: (
|
||||||
"model.layers.{bid}.x_proj",
|
"model.layers.{bid}.x_proj",
|
||||||
"backbone.layers.{bid}.mixer.x_proj",
|
"backbone.layers.{bid}.mixer.x_proj",
|
||||||
|
"model.layers.{bid}.mamba.x_proj",
|
||||||
),
|
),
|
||||||
MODEL_TENSOR.SSM_DT: (
|
MODEL_TENSOR.SSM_DT: (
|
||||||
"model.layers.{bid}.dt_proj",
|
"model.layers.{bid}.dt_proj",
|
||||||
"backbone.layers.{bid}.mixer.dt_proj",
|
"backbone.layers.{bid}.mixer.dt_proj",
|
||||||
|
"model.layers.{bid}.mamba.dt_proj",
|
||||||
),
|
),
|
||||||
|
MODEL_TENSOR.SSM_DT_NORM: ("model.layers.{bid}.mamba.dt_layernorm",),
|
||||||
MODEL_TENSOR.SSM_A: (
|
MODEL_TENSOR.SSM_A: (
|
||||||
"model.layers.{bid}.A_log",
|
"model.layers.{bid}.A_log",
|
||||||
"backbone.layers.{bid}.mixer.A_log",
|
"backbone.layers.{bid}.mixer.A_log",
|
||||||
|
"model.layers.{bid}.mamba.A_log",
|
||||||
|
),
|
||||||
|
MODEL_TENSOR.SSM_B_NORM: (
|
||||||
|
"model.layers.{bid}.mamba.b_layernorm",
|
||||||
|
"model.layers.{bid}.mamba.B_layernorm",
|
||||||
|
),
|
||||||
|
MODEL_TENSOR.SSM_C_NORM: (
|
||||||
|
"model.layers.{bid}.mamba.c_layernorm",
|
||||||
|
"model.layers.{bid}.mamba.C_layernorm",
|
||||||
),
|
),
|
||||||
MODEL_TENSOR.SSM_D: (
|
MODEL_TENSOR.SSM_D: (
|
||||||
"model.layers.{bid}.D",
|
"model.layers.{bid}.D",
|
||||||
"backbone.layers.{bid}.mixer.D",
|
"backbone.layers.{bid}.mixer.D",
|
||||||
|
"model.layers.{bid}.mamba.D",
|
||||||
),
|
),
|
||||||
MODEL_TENSOR.SSM_OUT: (
|
MODEL_TENSOR.SSM_OUT: (
|
||||||
"model.layers.{bid}.out_proj",
|
"model.layers.{bid}.out_proj",
|
||||||
"backbone.layers.{bid}.mixer.out_proj",
|
"backbone.layers.{bid}.mixer.out_proj",
|
||||||
|
"model.layers.{bid}.mamba.out_proj",
|
||||||
),
|
),
|
||||||
|
MODEL_TENSOR.TIME_MIX_W1: ("rwkv.blocks.{bid}.attention.time_maa_w1",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_W2: ("rwkv.blocks.{bid}.attention.time_maa_w2",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_X: ("rwkv.blocks.{bid}.attention.time_maa_x",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_K: ("rwkv.blocks.{bid}.attention.time_maa_k",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_V: ("rwkv.blocks.{bid}.attention.time_maa_v",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_R: ("rwkv.blocks.{bid}.attention.time_maa_r",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_G: ("rwkv.blocks.{bid}.attention.time_maa_g",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_LERP_W: ("rwkv.blocks.{bid}.attention.time_maa_w",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_FIRST: ("rwkv.blocks.{bid}.attention.time_faaaa",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_DECAY: ("rwkv.blocks.{bid}.attention.time_decay",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_DECAY_W1: ("rwkv.blocks.{bid}.attention.time_decay_w1",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_DECAY_W2: ("rwkv.blocks.{bid}.attention.time_decay_w2",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_KEY: ("rwkv.blocks.{bid}.attention.key",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_VALUE: ("rwkv.blocks.{bid}.attention.value",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_RECEPTANCE: ("rwkv.blocks.{bid}.attention.receptance",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_GATE: ("rwkv.blocks.{bid}.attention.gate",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_LN: ("rwkv.blocks.{bid}.attention.ln_x",),
|
||||||
|
MODEL_TENSOR.TIME_MIX_OUTPUT: ("rwkv.blocks.{bid}.attention.output",),
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_LERP_K: ("rwkv.blocks.{bid}.feed_forward.time_maa_k",),
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_LERP_R: ("rwkv.blocks.{bid}.feed_forward.time_maa_r",),
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_KEY: ("rwkv.blocks.{bid}.feed_forward.key",),
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
|
||||||
|
"rwkv.blocks.{bid}.feed_forward.receptance",
|
||||||
|
),
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_VALUE: ("rwkv.blocks.{bid}.feed_forward.value",),
|
||||||
MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",),
|
MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",),
|
||||||
MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",),
|
MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",),
|
||||||
MODEL_TENSOR.ATTN_KV_A_MQA: (
|
MODEL_TENSOR.ATTN_KV_A_MQA: (
|
||||||
|
|