feat(convert): update llama.cpp convert scripts

- added support for MiniCPM3, RWKV v6, OLMoE, IBM Granite, and Jamba (conversion only; https://github.com/ggerganov/llama.cpp/pull/7531)
- updated the gguf library from upstream
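
A minimal usage sketch for the updated converter (the script name, model directory, and output filename below are placeholders; the flags are the ones defined by parse_args() in the diff):

import subprocess
import sys

# Hypothetical invocation: convert a local RWKV v6 checkpoint to GGUF.
subprocess.run(
    [
        sys.executable,
        "convert_hf_to_gguf.py",        # assumed script name
        "path/to/rwkv6-model",          # HF model directory (positional "model" argument)
        "--outfile", "rwkv6-f16.gguf",  # output path (placeholder)
        "--outtype", "f16",             # also accepts f32, bf16, q8_0, tq1_0, tq2_0, auto
    ],
    check=True,
)
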
BuildTools 2024-09-20 15:05:52 -07:00
parent 4e51ed2f56
commit 39441e503f
4 changed files with 757 additions and 63 deletions

convert_hf_to_gguf.py

@ -1,5 +1,6 @@
from __future__ import annotations
import ast
import logging
import argparse
import contextlib
@ -165,14 +166,14 @@ def set_vocab(self):
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
tensor_names_from_parts: set[str] = set()
if len(self.part_names) > 1:
self.tensor_names = set()
index_name = (
"model.safetensors" if self.is_safetensors else "pytorch_model.bin"
)
index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
index_name += ".index.json"
index_file = self.dir_model / index_name
if index_file.is_file():
self.tensor_names = set()
logger.info(f"gguf: loading model weight map from '{index_name}'")
with open(self.dir_model / index_name, "r", encoding="utf-8") as f:
with open(index_file, "r", encoding="utf-8") as f:
index: dict[str, Any] = json.load(f)
weight_map = index.get("weight_map")
if weight_map is None or not isinstance(weight_map, dict):
@ -180,6 +181,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
self.tensor_names.update(weight_map.keys())
else:
self.tensor_names = tensor_names_from_parts
weight_map = {}
for part_name in self.part_names:
logger.info(f"gguf: loading model part '{part_name}'")
@ -217,16 +219,19 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
data = LazyTorchTensor.from_eager(data)
yield name, data
if (
len(
sym_diff := tensor_names_from_parts.symmetric_difference(
self.tensor_names
if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
missing_files = sorted(
set(weight_map[n] for n in missing if n in weight_map)
)
)
> 0
):
if len(extra) == 0 and len(missing_files) > 0:
raise ValueError(f"Missing or incomplete model files: {missing_files}")
else:
raise ValueError(
f"Mismatch between weight map and model parts for tensor names: {sym_diff}"
"Mismatch between weight map and model parts for tensor names:\n"
f"Missing tensors: {missing}\n"
f"Extra tensors: {extra}"
)
def format_tensor_name(
@ -383,12 +388,31 @@ def prepare_tensors(self):
gguf.MODEL_TENSOR.POS_EMBD,
gguf.MODEL_TENSOR.TOKEN_TYPES,
gguf.MODEL_TENSOR.SSM_CONV1D,
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
gguf.MODEL_TENSOR.TIME_MIX_W1,
gguf.MODEL_TENSOR.TIME_MIX_W2,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
)
)
or not name.endswith(".weight")
or not new_name.endswith(".weight")
):
data_qtype = gguf.GGMLQuantizationType.F32
if data_qtype is False and any(
self.match_model_tensor_name(new_name, key, bid)
for key in (
gguf.MODEL_TENSOR.TOKEN_EMBD,
gguf.MODEL_TENSOR.OUTPUT,
)
):
if self.ftype in (
gguf.LlamaFileType.MOSTLY_TQ1_0,
gguf.LlamaFileType.MOSTLY_TQ2_0,
):
data_qtype = gguf.GGMLQuantizationType.F16
if isinstance(data_qtype, bool):
if self.ftype == gguf.LlamaFileType.ALL_F32:
data_qtype = gguf.GGMLQuantizationType.F32
@ -398,6 +422,10 @@ def prepare_tensors(self):
data_qtype = gguf.GGMLQuantizationType.BF16
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
data_qtype = gguf.GGMLQuantizationType.Q8_0
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
data_qtype = gguf.GGMLQuantizationType.TQ1_0
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
data_qtype = gguf.GGMLQuantizationType.TQ2_0
else:
raise ValueError(f"Unknown file type: {self.ftype.name}")
@ -417,7 +445,7 @@ def prepare_tensors(self):
shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
logger.info(
f"{f'%s-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}"
f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}"
)
self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
@ -704,6 +732,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
res = "exaone"
if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
res = "phi-2"
if res is None:
logger.warning("\n")
@ -1130,6 +1161,7 @@ def set_vocab(self):
try:
self._set_vocab_gpt2()
except Exception:
self._set_vocab_sentencepiece()
self.gguf_writer.add_add_bos_token(False)
self.gguf_writer.add_pad_token_id(3)
@ -1710,7 +1742,9 @@ def prepare_tensors(self):
raise ValueError(f"Unprocessed norms: {norms}")
@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
@Model.register(
"LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM"
)
class LlamaModel(Model):
model_arch = gguf.MODEL_ARCH.LLAMA
@ -1891,15 +1925,14 @@ def set_gguf_parameters(self):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(1.0)
def weight_quant(self, weight):
def weight_quant(self, weight: Tensor) -> Tensor:
dtype = weight.dtype
weight = weight.float()
s = 1 / weight.abs().mean().clamp(min=1e-5)
weight = (weight * s).round().clamp(-1, 1) / s
scale = weight.abs().max().unsqueeze(0)
weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
weight = torch.sign(weight).type(dtype)
return weight.type(dtype), scale.type(torch.float32)
scale = weight.abs().mean().clamp(min=1e-5)
iscale = 1 / scale
result = (weight * iscale).round().clamp(-1, 1) / iscale
return result.type(dtype)
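# Quick sanity check of the new ternary rounding (illustrative values only):
#   w = torch.tensor([0.40, 0.05, -0.90])
#   s = w.abs().mean()                            # 0.45
#   (w * (1 / s)).round().clamp(-1, 1) / (1 / s)  # tensor([ 0.4500,  0.0000, -0.4500])
# i.e. each element is snapped to {-s, 0, +s}, where s is the mean absolute value.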
def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None
@ -1919,10 +1952,8 @@ def modify_tensors(
]
):
weight_torch, scale_torch = self.weight_quant(data_torch)
yield (new_name, weight_torch)
yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
else:
data_torch = self.weight_quant(data_torch)
yield (new_name, data_torch)
@ -2099,6 +2130,79 @@ def modify_tensors(
return [(self.map_tensor_name(name), data_torch)]
@Model.register("MiniCPM3ForCausalLM")
class MiniCPM3Model(Model):
model_arch = gguf.MODEL_ARCH.MINICPM3
def set_gguf_parameters(self):
hparams = self.hparams
rope_dims = hparams["qk_rope_head_dim"]
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
self.gguf_writer.add_key_length(
hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]
)
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
rope_scaling = self.find_hparam(["rope_scaling"], True)
if rope_scaling is None:
return
long_factors = rope_scaling.get("long_factor", None)
short_factors = rope_scaling.get("short_factor", None)
if long_factors is None or short_factors is None:
raise KeyError(
"Missing the required key rope_scaling.long_factor or rope_scaling_short_factor"
)
if (
len(long_factors) != len(short_factors)
or len(long_factors) != rope_dims / 2
):
raise ValueError(
f"The length of rope long and short factors must be {rope_dims / 2}"
)
self.gguf_writer.add_tensor(
gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight",
np.array(long_factors, dtype=np.float32),
)
self.gguf_writer.add_tensor(
gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight",
np.array(short_factors, dtype=np.float32),
)
def set_vocab(self):
self._set_vocab_llama_hf()
def _reverse_hf_permute(
self, weights: Tensor, n_head: int, n_kv_head: int | None = None
) -> Tensor:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (
weights.reshape(
n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]
)
.swapaxes(1, 2)
.reshape(weights.shape)
)
@Model.register("QWenLMHeadModel")
class QwenModel(Model):
model_arch = gguf.MODEL_ARCH.QWEN
@ -3087,6 +3191,100 @@ class StarCoder2Model(Model):
model_arch = gguf.MODEL_ARCH.STARCODER2
@Model.register("Rwkv6ForCausalLM")
class Rwkv6Model(Model):
model_arch = gguf.MODEL_ARCH.RWKV6
def set_vocab(self):
assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
vocab_size = self.hparams.get("vocab_size", 65536)
tokens: list[bytes] = ["<s>".encode("utf-8")]
toktypes: list[int] = [gguf.TokenType.CONTROL]
with open(
self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8"
) as f:
lines = f.readlines()
for line in lines:
parts = line.split(" ")
assert len(parts) >= 3
token, token_len = ast.literal_eval(" ".join(parts[1:-1])), int(
parts[-1]
)
token = token.encode("utf-8") if isinstance(token, str) else token
assert isinstance(token, bytes)
assert len(token) == token_len
token_text: str = repr(token)[2:-1]
tokens.append(token_text.encode("utf-8"))
toktypes.append(gguf.TokenType.NORMAL)
remainder = vocab_size - len(tokens)
assert remainder >= 0
for i in range(len(tokens), vocab_size):
tokens.append(f"[PAD{i}]".encode("utf-8"))
toktypes.append(gguf.TokenType.UNUSED)
self.gguf_writer.add_tokenizer_model("rwkv")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
head_size = self.hparams["head_size"]
hidden_size = self.hparams["hidden_size"]
layer_norm_eps = self.hparams["layer_norm_epsilon"]
rescale_every_n_layers = self.hparams["rescale_every"]
intermediate_size = (
self.hparams["intermediate_size"]
if self.hparams["intermediate_size"] is not None
else int((hidden_size * 3.5) // 32 * 32)
)
time_mix_extra_dim = 64 if hidden_size == 4096 else 32
time_decay_extra_dim = 128 if hidden_size == 4096 else 64
self.gguf_writer.add_context_length(1048576)
self.gguf_writer.add_embedding_length(hidden_size)
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
self.gguf_writer.add_wkv_head_size(head_size)
self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
self.gguf_writer.add_feed_forward_length(intermediate_size)
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_head_count(0)
def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None
) -> Iterable[tuple[str, Tensor]]:
new_name = self.map_tensor_name(name)
if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
new_name += ".weight"
if (
new_name.endswith("time_mix_w1.weight")
or new_name.endswith("time_mix_decay_w1.weight")
or new_name.endswith("time_mix_decay_w2.weight")
):
data_torch = data_torch.transpose(0, 1)
if new_name.endswith("time_mix_w2.weight"):
data_torch = data_torch.permute(0, 2, 1)
rescale_every_n_layers = self.hparams["rescale_every"]
if rescale_every_n_layers > 0:
if new_name.endswith("time_mix_output.weight") or new_name.endswith(
"channel_mix_value.weight"
):
data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
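# Illustration with hypothetical values: rescale_every = 6 and bid = 13 divide the
# time_mix_output / channel_mix_value weights by 2 ** (13 // 6) = 4.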
yield (new_name, data_torch)
@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
class MambaModel(Model):
model_arch = gguf.MODEL_ARCH.MAMBA
@ -3216,6 +3414,65 @@ def modify_tensors(
return [(self.map_tensor_name(name), data_torch)]
@Model.register("OlmoeForCausalLM")
class OlmoeModel(Model):
model_arch = gguf.MODEL_ARCH.OLMOE
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_layer_norm_rms_eps(1e-5)
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None
) -> Iterable[tuple[str, Tensor]]:
if name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("JinaBertModel", "JinaBertForMaskedLM")
class JinaBertV2Model(BertModel):
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
@ -4122,7 +4379,7 @@ def set_vocab(self):
if len(token) == 1:
continue
merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
assert 2 <= len(merged) <= 7
assert len(merged) >= 2 and len(merged) <= 7
merges.append(" ".join(map(ChatGLMModel.token_bytes_to_string, merged)))
added_vocab = tokenizer.get_added_vocab()
@ -4334,6 +4591,152 @@ def prepare_tensors(self):
super().prepare_tensors()
@Model.register("GraniteForCausalLM")
class GraniteModel(LlamaModel):
"""Conversion for IBM's GraniteForCausalLM"""
model_arch = gguf.MODEL_ARCH.GRANITE
def set_gguf_parameters(self):
"""Granite uses standard llama parameters with the following differences:
- No head_dim support
- New multiplier params:
- attention_scale
- embedding_scale
- residual_scale
- logits_scaling
"""
if head_dim := self.hparams.pop("head_dim", None):
logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
super().set_gguf_parameters()
if attention_scale := self.hparams.get("attention_multiplier"):
self.gguf_writer.add_attention_scale(attention_scale)
if embedding_scale := self.hparams.get("embedding_multiplier"):
self.gguf_writer.add_embedding_scale(embedding_scale)
if residual_scale := self.hparams.get("residual_multiplier"):
self.gguf_writer.add_residual_scale(residual_scale)
if logits_scaling := self.hparams.get("logits_scaling"):
self.gguf_writer.add_logit_scale(logits_scaling)
@Model.register("JambaForCausalLM")
class JambaModel(Model):
model_arch = gguf.MODEL_ARCH.JAMBA
def get_vocab_base_pre(self, tokenizer) -> str:
del tokenizer
return "gpt-2"
def set_vocab(self):
if (self.dir_model / "tokenizer.model").is_file():
self._set_vocab_sentencepiece()
else:
self._set_vocab_gpt2()
def set_gguf_parameters(self):
d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4
d_inner = self.hparams["mamba_expand"] * d_model
d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16
dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(
d_model // -16
)
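# Note: -(d_model // -16) is ceiling division, i.e. ceil(d_model / 16).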
rms_norm_eps = (
self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True)
or 1e-6
)
n_kv_head = self.hparams["num_key_value_heads"]
attn_offset = self.hparams["attn_layer_offset"]
attn_period = self.hparams["attn_layer_period"]
n_kv_vec = [0 for _ in range(attn_offset)] + [
n_kv_head if (i - attn_offset) % attn_period == 0 else 0
for i in range(attn_offset, self.block_count)
]
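# Illustration with hypothetical hparams: attn_offset = 4, attn_period = 8,
# block_count = 16, n_kv_head = 8 gives
#   n_kv_vec = [0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0]
# i.e. only every attn_period-th layer starting at attn_offset is an attention layer.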
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_context_length(
self.find_hparam(["max_position_embeddings", "n_ctx"])
)
self.gguf_writer.add_embedding_length(d_model)
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(n_kv_vec)
self.gguf_writer.add_ssm_conv_kernel(d_conv)
self.gguf_writer.add_ssm_inner_size(d_inner)
self.gguf_writer.add_ssm_state_size(d_state)
self.gguf_writer.add_ssm_time_step_rank(dt_rank)
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
self.gguf_writer.add_file_type(self.ftype)
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None
) -> Iterable[tuple[str, Tensor]]:
name = name.replace(".moe.", ".feed_forward.")
if bid is not None:
moe_offset = self.hparams["expert_layer_offset"]
moe_period = self.hparams["expert_layer_period"]
if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0):
name = name.replace(".experts.0.", ".")
if ".feed_forward.experts." in name:
n_experts = self.hparams["num_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
for wid in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight"
new_name = self.map_tensor_name(merged_name)
yield new_name, data_torch
return
new_name = self.map_tensor_name(name)
if name.endswith(".A_log"):
logger.debug("A_log --> A ==> " + new_name)
data_torch = -torch.exp(data_torch)
yield new_name, data_torch
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
class LazyTorchTensor(gguf.LazyBase):
_tensor_type = torch.Tensor
@ -4399,26 +4802,67 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument("--vocab-only", action="store_true")
parser.add_argument("--outfile", type=Path)
parser = argparse.ArgumentParser(
description="Convert a huggingface model to a GGML compatible file"
)
parser.add_argument(
"--vocab-only",
action="store_true",
)
parser.add_argument(
"--outfile",
type=Path,
)
parser.add_argument(
"--outtype",
type=str,
choices=["f32", "f16", "bf16", "q8_0", "auto"],
choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"],
default="f16",
)
parser.add_argument("--bigendian", action="store_true")
parser.add_argument("model", type=Path)
parser.add_argument(
"--bigendian",
action="store_true",
)
parser.add_argument(
"model",
type=Path,
)
parser.add_argument("--use-temp-file", action="store_true")
parser.add_argument("--no-lazy", action="store_true")
parser.add_argument("--model-name", type=str, default=None)
parser.add_argument("--verbose", action="store_true")
parser.add_argument("--split-max-tensors", type=int, default=0)
parser.add_argument("--split-max-size", type=str, default="0")
parser.add_argument("--dry-run", action="store_true")
parser.add_argument("--no-tensor-first-split", action="store_true")
parser.add_argument("--metadata", type=Path)
parser.add_argument(
"--no-lazy",
action="store_true",
)
parser.add_argument(
"--model-name",
type=str,
default=None,
)
parser.add_argument(
"--verbose",
action="store_true",
)
parser.add_argument(
"--split-max-tensors",
type=int,
default=0,
)
parser.add_argument(
"--split-max-size",
type=str,
default="0",
)
parser.add_argument(
"--dry-run",
action="store_true",
)
parser.add_argument(
"--no-tensor-first-split",
action="store_true",
)
parser.add_argument(
"--metadata",
type=Path,
)
return parser.parse_args()
@ -4462,6 +4906,8 @@ def main() -> None:
"f16": gguf.LlamaFileType.MOSTLY_F16,
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
"tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
"tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
"auto": gguf.LlamaFileType.GUESSED,
}

gguf-py/gguf/constants.py

@ -78,6 +78,11 @@ class LLM:
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers"
TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim"
TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
RESIDUAL_SCALE = "{arch}.residual_scale"
EMBEDDING_SCALE = "{arch}.embedding_scale"
class Attention:
HEAD_COUNT = "{arch}.attention.head_count"
@ -93,6 +98,7 @@ class Attention:
KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
SLIDING_WINDOW = "{arch}.attention.sliding_window"
SCALE = "{arch}.attention.scale"
class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count"
@ -114,6 +120,10 @@ class SSM:
INNER_SIZE = "{arch}.ssm.inner_size"
STATE_SIZE = "{arch}.ssm.state_size"
TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"
class WKV:
HEAD_SIZE = "{arch}.wkv.head_size"
class Tokenizer:
MODEL = "tokenizer.ggml.model"
@ -183,14 +193,18 @@ class MODEL_ARCH(IntEnum):
ORION = auto()
INTERNLM2 = auto()
MINICPM = auto()
MINICPM3 = auto()
GEMMA = auto()
GEMMA2 = auto()
STARCODER2 = auto()
RWKV6 = auto()
MAMBA = auto()
JAMBA = auto()
XVERSE = auto()
COMMAND_R = auto()
DBRX = auto()
OLMO = auto()
OLMOE = auto()
OPENELM = auto()
ARCTIC = auto()
DEEPSEEK2 = auto()
@ -201,6 +215,7 @@ class MODEL_ARCH(IntEnum):
JAIS = auto()
NEMOTRON = auto()
EXAONE = auto()
GRANITE = auto()
class MODEL_TENSOR(IntEnum):
@ -246,9 +261,35 @@ class MODEL_TENSOR(IntEnum):
SSM_CONV1D = auto()
SSM_X = auto()
SSM_DT = auto()
SSM_DT_NORM = auto()
SSM_A = auto()
SSM_B_NORM = auto()
SSM_C_NORM = auto()
SSM_D = auto()
SSM_OUT = auto()
TIME_MIX_W1 = auto()
TIME_MIX_W2 = auto()
TIME_MIX_LERP_X = auto()
TIME_MIX_LERP_K = auto()
TIME_MIX_LERP_V = auto()
TIME_MIX_LERP_R = auto()
TIME_MIX_LERP_G = auto()
TIME_MIX_LERP_W = auto()
TIME_MIX_FIRST = auto()
TIME_MIX_DECAY = auto()
TIME_MIX_DECAY_W1 = auto()
TIME_MIX_DECAY_W2 = auto()
TIME_MIX_KEY = auto()
TIME_MIX_VALUE = auto()
TIME_MIX_RECEPTANCE = auto()
TIME_MIX_GATE = auto()
TIME_MIX_LN = auto()
TIME_MIX_OUTPUT = auto()
CHANNEL_MIX_LERP_K = auto()
CHANNEL_MIX_LERP_R = auto()
CHANNEL_MIX_KEY = auto()
CHANNEL_MIX_RECEPTANCE = auto()
CHANNEL_MIX_VALUE = auto()
ATTN_Q_A = auto()
ATTN_Q_B = auto()
ATTN_KV_A_MQA = auto()
@ -313,14 +354,18 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.ORION: "orion",
MODEL_ARCH.INTERNLM2: "internlm2",
MODEL_ARCH.MINICPM: "minicpm",
MODEL_ARCH.MINICPM3: "minicpm3",
MODEL_ARCH.GEMMA: "gemma",
MODEL_ARCH.GEMMA2: "gemma2",
MODEL_ARCH.STARCODER2: "starcoder2",
MODEL_ARCH.RWKV6: "rwkv6",
MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.JAMBA: "jamba",
MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.DBRX: "dbrx",
MODEL_ARCH.OLMO: "olmo",
MODEL_ARCH.OLMOE: "olmoe",
MODEL_ARCH.OPENELM: "openelm",
MODEL_ARCH.ARCTIC: "arctic",
MODEL_ARCH.DEEPSEEK2: "deepseek2",
@ -331,6 +376,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.JAIS: "jais",
MODEL_ARCH.NEMOTRON: "nemotron",
MODEL_ARCH.EXAONE: "exaone",
MODEL_ARCH.GRANITE: "granite",
}
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -376,9 +422,35 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
MODEL_TENSOR.SSM_DT_NORM: "blk.{bid}.ssm_dt_norm",
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
MODEL_TENSOR.SSM_B_NORM: "blk.{bid}.ssm_b_norm",
MODEL_TENSOR.SSM_C_NORM: "blk.{bid}.ssm_c_norm",
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2",
MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x",
MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k",
MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v",
MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r",
MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g",
MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w",
MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first",
MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay",
MODEL_TENSOR.TIME_MIX_DECAY_W1: "blk.{bid}.time_mix_decay_w1",
MODEL_TENSOR.TIME_MIX_DECAY_W2: "blk.{bid}.time_mix_decay_w2",
MODEL_TENSOR.TIME_MIX_KEY: "blk.{bid}.time_mix_key",
MODEL_TENSOR.TIME_MIX_VALUE: "blk.{bid}.time_mix_value",
MODEL_TENSOR.TIME_MIX_RECEPTANCE: "blk.{bid}.time_mix_receptance",
MODEL_TENSOR.TIME_MIX_GATE: "blk.{bid}.time_mix_gate",
MODEL_TENSOR.TIME_MIX_LN: "blk.{bid}.time_mix_ln",
MODEL_TENSOR.TIME_MIX_OUTPUT: "blk.{bid}.time_mix_output",
MODEL_TENSOR.CHANNEL_MIX_LERP_K: "blk.{bid}.channel_mix_lerp_k",
MODEL_TENSOR.CHANNEL_MIX_LERP_R: "blk.{bid}.channel_mix_lerp_r",
MODEL_TENSOR.CHANNEL_MIX_KEY: "blk.{bid}.channel_mix_key",
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: "blk.{bid}.channel_mix_receptance",
MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value",
MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
@ -792,6 +864,23 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.MINICPM3: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q_A,
MODEL_TENSOR.ATTN_Q_B,
MODEL_TENSOR.ATTN_KV_A_MQA,
MODEL_TENSOR.ATTN_KV_B,
MODEL_TENSOR.ATTN_Q_A_NORM,
MODEL_TENSOR.ATTN_KV_A_NORM,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.GEMMA: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@ -835,6 +924,37 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.RWKV6: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_NORM_2,
MODEL_TENSOR.TIME_MIX_W1,
MODEL_TENSOR.TIME_MIX_W2,
MODEL_TENSOR.TIME_MIX_LERP_X,
MODEL_TENSOR.TIME_MIX_LERP_K,
MODEL_TENSOR.TIME_MIX_LERP_V,
MODEL_TENSOR.TIME_MIX_LERP_R,
MODEL_TENSOR.TIME_MIX_LERP_G,
MODEL_TENSOR.TIME_MIX_LERP_W,
MODEL_TENSOR.TIME_MIX_FIRST,
MODEL_TENSOR.TIME_MIX_DECAY,
MODEL_TENSOR.TIME_MIX_DECAY_W1,
MODEL_TENSOR.TIME_MIX_DECAY_W2,
MODEL_TENSOR.TIME_MIX_KEY,
MODEL_TENSOR.TIME_MIX_VALUE,
MODEL_TENSOR.TIME_MIX_RECEPTANCE,
MODEL_TENSOR.TIME_MIX_GATE,
MODEL_TENSOR.TIME_MIX_LN,
MODEL_TENSOR.TIME_MIX_OUTPUT,
MODEL_TENSOR.CHANNEL_MIX_LERP_K,
MODEL_TENSOR.CHANNEL_MIX_LERP_R,
MODEL_TENSOR.CHANNEL_MIX_KEY,
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE,
MODEL_TENSOR.CHANNEL_MIX_VALUE,
],
MODEL_ARCH.MAMBA: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@ -848,6 +968,34 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.SSM_D,
MODEL_TENSOR.SSM_OUT,
],
MODEL_ARCH.JAMBA: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.SSM_IN,
MODEL_TENSOR.SSM_CONV1D,
MODEL_TENSOR.SSM_X,
MODEL_TENSOR.SSM_DT,
MODEL_TENSOR.SSM_DT_NORM,
MODEL_TENSOR.SSM_A,
MODEL_TENSOR.SSM_B_NORM,
MODEL_TENSOR.SSM_C_NORM,
MODEL_TENSOR.SSM_D,
MODEL_TENSOR.SSM_OUT,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.XVERSE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@ -902,6 +1050,23 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.OLMOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
],
MODEL_ARCH.OPENELM: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@ -1080,6 +1245,19 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.GRANITE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
}
MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@ -1179,6 +1357,8 @@ class GGMLQuantizationType(IntEnum):
Q4_0_4_4 = 31
Q4_0_4_8 = 32
Q4_0_8_8 = 33
TQ1_0 = 34
TQ2_0 = 35
class LlamaFileType(IntEnum):
@ -1216,6 +1396,8 @@ class LlamaFileType(IntEnum):
MOSTLY_Q4_0_4_4 = 33
MOSTLY_Q4_0_4_8 = 34
MOSTLY_Q4_0_8_8 = 35
MOSTLY_TQ1_0 = 36
MOSTLY_TQ2_0 = 37
GUESSED = 1024
@ -1291,6 +1473,8 @@ def get_type(val: Any) -> GGUFValueType:
GGMLQuantizationType.Q4_0_4_4: (32, 2 + 16),
GGMLQuantizationType.Q4_0_4_8: (32, 2 + 16),
GGMLQuantizationType.Q4_0_8_8: (32, 2 + 16),
GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
GGMLQuantizationType.TQ2_0: (256, 2 + 64),
}
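# Derived from the two new entries above: a TQ1_0 block packs 256 weights into
# 2 + 4 * 13 = 54 bytes (~1.69 bits/weight), and a TQ2_0 block packs 256 weights
# into 2 + 64 = 66 bytes (~2.06 bits/weight).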
KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE
@ -1330,6 +1514,7 @@ def get_type(val: Any) -> GGUFValueType:
KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE

gguf-py/gguf/gguf_writer.py

@ -32,7 +32,6 @@
logger = logging.getLogger(__name__)
SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
@ -136,7 +135,7 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]:
continue
elif name.endswith(".lora_b"):
if last_lora_a is None or last_lora_a[0] != name[:-1] + "a":
# Bail when the LoRA pair can't be found trivially
logger.warning(
"can't measure LoRA size correctly, tensor order is unusual"
)
@ -155,14 +154,11 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]:
total_params += size
# Hopefully this should work even for variable-expert-count models
expert_count = (expert_sum // n_expert_tensors) if n_expert_tensors > 0 else 0
# Negate the total to signal it's likely not exact
if last_lora_a is not None:
total_params = -total_params
# NOTE: keep the output in the same order as accepted by 'size_label' in gguf-py/gguf/utility.py
return total_params, shared_params, expert_params, expert_count
def format_shard_names(self, path: Path) -> list[Path]:
@ -181,7 +177,7 @@ def open_output_file(self, path: Path | None = None) -> None:
and self.fout is not None
and (path is None or path == self.path)
):
# allow calling this multiple times as long as the path is the same
return
if self.state is not WriterState.NO_FILE:
@ -210,7 +206,7 @@ def print_plan(self) -> list[Path]:
if self.dry_run:
logger.info("Dry run, not writing files")
for name in filenames:
print(name) # noqa: NP100
print(name)
exit()
return filenames
@ -394,12 +390,11 @@ def add_tensor_info(
if tensor_dtype == np.uint8:
tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
# make sure there is at least one tensor before splitting
if len(self.tensors[-1]) > 0:
if ( # split when over tensor limit
if (
self.split_max_tensors != 0
and len(self.tensors[-1]) >= self.split_max_tensors
) or ( # split when over size limit
) or (
self.split_max_size != 0
and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes
> self.split_max_size
@ -465,8 +460,6 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
fout = self.fout[file_id]
# pop the first tensor info
# TODO: cleaner way to get the first key
first_tensor_name = [
name for name, _ in zip(self.tensors[file_id].keys(), range(1))
][0]
@ -513,11 +506,8 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None:
total = sum(ti.nbytes for ti in tensors.values())
shard_bar.reset(total=(total if total > 0 else None))
# relying on the fact that Python dicts preserve insertion order (since 3.7)
for ti in tensors.values():
assert (
ti.tensor is not None
) # can only iterate once over the tensors
assert ti.tensor is not None
assert ti.tensor.nbytes == ti.nbytes
ti.tensor.tofile(fout)
if shard_bar is not None:
@ -749,6 +739,24 @@ def add_expert_shared_count(self, count: int) -> None:
def add_expert_weights_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
def add_rescale_every_n_layers(self, count: int) -> None:
self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
def add_time_mix_extra_dim(self, dim: int) -> None:
self.add_uint32(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch=self.arch), dim)
def add_time_decay_extra_dim(self, dim: int) -> None:
self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
def add_residual_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.RESIDUAL_SCALE.format(arch=self.arch), value)
def add_embedding_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.EMBEDDING_SCALE.format(arch=self.arch), value)
def add_wkv_head_size(self, size: int) -> None:
self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
def add_layer_norm_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
@ -770,6 +778,9 @@ def add_relative_attn_buckets_count(self, value: int) -> None:
def add_sliding_window(self, value: int) -> None:
self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)
def add_attention_scale(self, value: float) -> None:
self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value)
def add_pooling_type(self, value: PoolingType) -> None:
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
@ -809,6 +820,9 @@ def add_ssm_state_size(self, value: int) -> None:
def add_ssm_time_step_rank(self, value: int) -> None:
self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)
def add_ssm_dt_b_c_rms(self, value: bool) -> None:
self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)
def add_tokenizer_model(self, model: str) -> None:
self.add_string(Keys.Tokenizer.MODEL, model)
@ -879,7 +893,6 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
name = choice.get("name", "")
template = choice.get("template")
# Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it
name = "".join(
(c if c in ascii_letters + digits else "_" for c in name)
)
@ -915,6 +928,9 @@ def add_middle_token_id(self, id: int) -> None:
def add_eot_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.EOT_ID, id)
def add_eom_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.EOM_ID, id)
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
pack_prefix = ""
if not skip_pack_prefix:

gguf-py/gguf/tensor_mapping.py

@ -26,6 +26,7 @@ class TensorNameMap:
"embedding.word_embeddings",
"transformer.token_embeddings",
"shared",
"rwkv.embeddings",
),
MODEL_TENSOR.TOKEN_TYPES: ("embeddings.token_type_embeddings",),
MODEL_TENSOR.TOKEN_EMBD_NORM: (
@ -33,6 +34,7 @@ class TensorNameMap:
"embeddings.LayerNorm",
"emb_ln",
"transformer.norm",
"rwkv.blocks.0.pre_ln",
),
MODEL_TENSOR.POS_EMBD: (
"transformer.wpe",
@ -46,6 +48,7 @@ class TensorNameMap:
"word_embeddings_for_head",
"lm_head.linear",
"output_layer",
"head",
),
MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm",
@ -63,6 +66,7 @@ class TensorNameMap:
"encoder.final_layernorm",
"transformer.norm",
"model.norm",
"rwkv.ln_out",
),
MODEL_TENSOR.ROPE_FREQS: (
"rope.freqs",
@ -92,10 +96,12 @@ class TensorNameMap:
"transformer.blocks.{bid}.norm_attn_norm.norm_1",
"encoder.layers.{bid}.input_layernorm",
"transformer.layers.{bid}.attn_norm",
"rwkv.blocks.{bid}.ln1",
),
MODEL_TENSOR.ATTN_NORM_2: (
"transformer.h.{bid}.ln_attn",
"encoder.layer.{bid}.layer_norm_1",
"rwkv.blocks.{bid}.ln2",
),
MODEL_TENSOR.ATTN_QKV: (
"gpt_neox.layers.{bid}.attention.query_key_value",
@ -332,31 +338,72 @@ class TensorNameMap:
MODEL_TENSOR.SSM_IN: (
"model.layers.{bid}.in_proj",
"backbone.layers.{bid}.mixer.in_proj",
"model.layers.{bid}.mamba.in_proj",
),
MODEL_TENSOR.SSM_CONV1D: (
"model.layers.{bid}.conv1d",
"backbone.layers.{bid}.mixer.conv1d",
"model.layers.{bid}.mamba.conv1d",
),
MODEL_TENSOR.SSM_X: (
"model.layers.{bid}.x_proj",
"backbone.layers.{bid}.mixer.x_proj",
"model.layers.{bid}.mamba.x_proj",
),
MODEL_TENSOR.SSM_DT: (
"model.layers.{bid}.dt_proj",
"backbone.layers.{bid}.mixer.dt_proj",
"model.layers.{bid}.mamba.dt_proj",
),
MODEL_TENSOR.SSM_DT_NORM: ("model.layers.{bid}.mamba.dt_layernorm",),
MODEL_TENSOR.SSM_A: (
"model.layers.{bid}.A_log",
"backbone.layers.{bid}.mixer.A_log",
"model.layers.{bid}.mamba.A_log",
),
MODEL_TENSOR.SSM_B_NORM: (
"model.layers.{bid}.mamba.b_layernorm",
"model.layers.{bid}.mamba.B_layernorm",
),
MODEL_TENSOR.SSM_C_NORM: (
"model.layers.{bid}.mamba.c_layernorm",
"model.layers.{bid}.mamba.C_layernorm",
),
MODEL_TENSOR.SSM_D: (
"model.layers.{bid}.D",
"backbone.layers.{bid}.mixer.D",
"model.layers.{bid}.mamba.D",
),
MODEL_TENSOR.SSM_OUT: (
"model.layers.{bid}.out_proj",
"backbone.layers.{bid}.mixer.out_proj",
"model.layers.{bid}.mamba.out_proj",
),
MODEL_TENSOR.TIME_MIX_W1: ("rwkv.blocks.{bid}.attention.time_maa_w1",),
MODEL_TENSOR.TIME_MIX_W2: ("rwkv.blocks.{bid}.attention.time_maa_w2",),
MODEL_TENSOR.TIME_MIX_LERP_X: ("rwkv.blocks.{bid}.attention.time_maa_x",),
MODEL_TENSOR.TIME_MIX_LERP_K: ("rwkv.blocks.{bid}.attention.time_maa_k",),
MODEL_TENSOR.TIME_MIX_LERP_V: ("rwkv.blocks.{bid}.attention.time_maa_v",),
MODEL_TENSOR.TIME_MIX_LERP_R: ("rwkv.blocks.{bid}.attention.time_maa_r",),
MODEL_TENSOR.TIME_MIX_LERP_G: ("rwkv.blocks.{bid}.attention.time_maa_g",),
MODEL_TENSOR.TIME_MIX_LERP_W: ("rwkv.blocks.{bid}.attention.time_maa_w",),
MODEL_TENSOR.TIME_MIX_FIRST: ("rwkv.blocks.{bid}.attention.time_faaaa",),
MODEL_TENSOR.TIME_MIX_DECAY: ("rwkv.blocks.{bid}.attention.time_decay",),
MODEL_TENSOR.TIME_MIX_DECAY_W1: ("rwkv.blocks.{bid}.attention.time_decay_w1",),
MODEL_TENSOR.TIME_MIX_DECAY_W2: ("rwkv.blocks.{bid}.attention.time_decay_w2",),
MODEL_TENSOR.TIME_MIX_KEY: ("rwkv.blocks.{bid}.attention.key",),
MODEL_TENSOR.TIME_MIX_VALUE: ("rwkv.blocks.{bid}.attention.value",),
MODEL_TENSOR.TIME_MIX_RECEPTANCE: ("rwkv.blocks.{bid}.attention.receptance",),
MODEL_TENSOR.TIME_MIX_GATE: ("rwkv.blocks.{bid}.attention.gate",),
MODEL_TENSOR.TIME_MIX_LN: ("rwkv.blocks.{bid}.attention.ln_x",),
MODEL_TENSOR.TIME_MIX_OUTPUT: ("rwkv.blocks.{bid}.attention.output",),
MODEL_TENSOR.CHANNEL_MIX_LERP_K: ("rwkv.blocks.{bid}.feed_forward.time_maa_k",),
MODEL_TENSOR.CHANNEL_MIX_LERP_R: ("rwkv.blocks.{bid}.feed_forward.time_maa_r",),
MODEL_TENSOR.CHANNEL_MIX_KEY: ("rwkv.blocks.{bid}.feed_forward.key",),
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
"rwkv.blocks.{bid}.feed_forward.receptance",
),
MODEL_TENSOR.CHANNEL_MIX_VALUE: ("rwkv.blocks.{bid}.feed_forward.value",),
MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",),
MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",),
MODEL_TENSOR.ATTN_KV_A_MQA: (