mirror of https://github.com/leafspark/AutoGGUF
refactor(ggml): update safetensor conversion scripts
This commit is contained in:
parent c9c2b04534
commit b4817eee06
File diff suppressed because it is too large
@@ -27,7 +27,6 @@
 
 import gguf
 
-# reuse model definitions from convert_hf_to_gguf.py
 from convert_hf_to_gguf import LazyTorchTensor, Model
 
 logger = logging.getLogger("lora-to-gguf")
@@ -39,10 +38,9 @@ class PartialLoraTensor:
     B: Tensor | None = None
 
 
-# magic to support tensor shape modifications and splitting
 class LoraTorchTensor:
-    _lora_A: Tensor  # (n_rank, row_size)
-    _lora_B: Tensor  # (col_size, n_rank)
+    _lora_A: Tensor
+    _lora_B: Tensor
     _rank: int
 
     def __init__(self, A: Tensor, B: Tensor):
@@ -60,20 +58,14 @@ def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
 
     def __getitem__(
         self,
-        indices: (
-            SupportsIndex
-            | slice
-            | tuple[
-                SupportsIndex | slice | Tensor, ...
-            ]  # TODO: add ellipsis in the type signature
-        ),
+        indices: SupportsIndex | slice | tuple[SupportsIndex | slice | Tensor, ...],
     ) -> LoraTorchTensor:
         shape = self.shape
         if isinstance(indices, SupportsIndex):
             if len(shape) > 2:
                 return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
             else:
-                raise NotImplementedError  # can't return a vector
+                raise NotImplementedError
         elif isinstance(indices, slice):
             if len(shape) > 2:
                 return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
@@ -83,7 +75,7 @@ def __getitem__(
             assert len(indices) > 0
             if indices[-1] is Ellipsis:
                 return self[indices[:-1]]
-            # expand ellipsis
             indices = tuple(
                 u
                 for v in (
@@ -103,7 +95,6 @@ def __getitem__(
                 *(slice(None, None) for _ in range(len(indices), len(shape))),
             )
 
-            # TODO: make sure this is correct
             indices_A = (
                 *(
                     (
@@ -119,7 +110,7 @@ def __getitem__(
             indices_B = indices[:-1]
             return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
         else:
-            raise NotImplementedError  # unknown indice type
+            raise NotImplementedError
 
     @property
     def dtype(self) -> torch.dtype:
@@ -142,9 +133,8 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
             new_shape = cast(tuple[int, ...], shape)
         orig_shape = self.shape
         if len(new_shape) < 2:
-            raise NotImplementedError  # can't become a vector
+            raise NotImplementedError
 
-        # expand -1 in the shape
         if any(dim == -1 for dim in new_shape):
             n_elems = prod(orig_shape)
             n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
@@ -154,7 +144,7 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
             )
 
         if new_shape[-1] != orig_shape[-1]:
-            raise NotImplementedError  # can't reshape the row size trivially
+            raise NotImplementedError
 
         shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
         shape_B = (*new_shape[:-1], self._rank)
@@ -173,7 +163,7 @@ def permute(self, *dims: int) -> LoraTorchTensor:
         shape = self.shape
         dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
         if dims[-1] == -1:
-            # TODO: support higher dimensional A shapes bigger than 1
             assert all(dim == 1 for dim in self._lora_A.shape[:-2])
             return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
         if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
@@ -181,7 +171,7 @@ def permute(self, *dims: int) -> LoraTorchTensor:
                 self._lora_B.permute(*dims), self._lora_A.permute(*dims)
             )
         else:
-            # TODO: compose the above two
             raise NotImplementedError
 
     def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
@@ -200,7 +190,7 @@ def to(self, *args, **kwargs):
 
     @classmethod
     def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
-        del types  # unused
+        del types
 
        if kwargs is None:
            kwargs = {}
@@ -241,7 +231,7 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
     base_name = lora_tensor_name.replace("base_model.model.", "")
     base_name = base_name.replace(".lora_A.weight", ".weight")
     base_name = base_name.replace(".lora_B.weight", ".weight")
-    # models produced by mergekit-extract-lora have token embeddings in the adapter
     base_name = base_name.replace(".lora_embedding_A", ".weight")
     base_name = base_name.replace(".lora_embedding_B", ".weight")
     return base_name
@@ -303,7 +293,7 @@ def parse_args() -> argparse.Namespace:
 
 
 def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
-    # normally, adapter does not come with base model config, we need to load it from AutoConfig
     config = AutoConfig.from_pretrained(hf_model_id)
     return config.to_dict()
 
@@ -331,11 +321,11 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
     if args.outfile is not None:
         fname_out = args.outfile
     else:
-        # output in the same directory as the model by default
         fname_out = dir_lora
 
     if os.path.exists(input_model):
-        # lazy import load_file only if lora is in safetensors format.
         from safetensors.torch import load_file
 
         lora_model = load_file(input_model, device="cpu")
@@ -343,11 +333,9 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
         input_model = os.path.join(dir_lora, "adapter_model.bin")
         lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
 
-    # load LoRA config
    with open(lora_config, "r") as f:
        lparams: dict[str, Any] = json.load(f)
 
-    # load base model
    if base_model_id is not None:
        logger.info(f"Loading base model from Hugging Face: {base_model_id}")
        hparams = load_hparams_from_hf(base_model_id)
@@ -409,7 +397,7 @@ def set_gguf_parameters(self):
            )
 
        def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-            # Never add extra tensors (e.g. rope_freqs) for LoRA adapters
            return ()
 
        def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
@@ -419,13 +407,13 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                if self.lazy:
                    tensor = LazyTorchTensor.from_eager(tensor)
                base_name = get_base_tensor_name(name)
-                # note: mergekit-extract-lora also adds token embeddings to the adapter
                is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
                is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
                if not is_lora_a and not is_lora_b:
                    if ".base_layer.weight" in name:
                        continue
-                    # mergekit-extract-lora add these layernorm to the adapter, we need to keep them
                    if "_layernorm" in name or ".norm" in name:
                        yield (base_name, tensor)
                        continue
@@ -437,7 +425,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                            "Embeddings is present in the adapter. This can be due to new tokens added during fine tuning"
                        )
                        logger.error(
-                            "Please refer to https://github.com/ggerganov/llama.cpp/pull/9948"
+                            "Please refer to https://github.com/ggml-org/llama.cpp/pull/9948"
                        )
                    sys.exit(1)
 
@@ -464,27 +452,21 @@ def modify_tensors(
            self, data_torch: Tensor, name: str, bid: int | None
        ) -> Iterable[tuple[str, Tensor]]:
            dest = list(super().modify_tensors(data_torch, name, bid))
-            # some archs may have the same tensor for lm_head and output (tie word embeddings)
-            # in this case, adapters targeting lm_head will fail when using llama-export-lora
-            # therefore, we ignore them for now
-            # see: https://github.com/ggerganov/llama.cpp/issues/9065
            if name == "lm_head.weight" and len(dest) == 0:
                raise ValueError(
                    "lm_head is present in adapter, but is ignored in base model"
                )
            for dest_name, dest_data in dest:
-                # mergekit-extract-lora add these layernorm to the adapter
                if "_norm" in dest_name:
                    assert dest_data.dim() == 1
                    yield (dest_name, dest_data)
                    continue
 
-                # otherwise, we must get the lora_A and lora_B tensors
                assert isinstance(dest_data, LoraTorchTensor)
                lora_a, lora_b = dest_data.get_lora_A_B()
 
-                # note: mergekit-extract-lora flip and transpose A and B
-                # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
                if "token_embd.weight" in dest_name:
                    lora_a = lora_a.T
 
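For context on the LoraTorchTensor wrapper above: the converter keeps the two LoRA factors separate and writes them out as lora_a / lora_b tensors rather than merging them into the base weight. Below is a minimal sketch of how those factors relate to the base weight, using the shapes from the removed comments (A is (n_rank, row_size), B is (col_size, n_rank)); the helper name, the alpha/rank scaling convention, and the toy sizes are illustrative and not part of this commit.

import torch

def merge_lora(weight: torch.Tensor, lora_a: torch.Tensor, lora_b: torch.Tensor,
               alpha: float, rank: int) -> torch.Tensor:
    # scale = alpha / rank is the usual PEFT convention; the converter itself
    # stores A and B unmerged and records alpha in the GGUF metadata instead.
    scale = alpha / rank
    return weight + scale * (lora_b @ lora_a)

# Toy usage with made-up sizes:
w = torch.zeros(16, 32)   # base weight, (col_size, row_size)
a = torch.randn(4, 32)    # lora_A, (n_rank, row_size)
b = torch.randn(16, 4)    # lora_B, (col_size, n_rank)
print(merge_lora(w, a, b, alpha=8.0, rank=4).shape)  # torch.Size([16, 32])

Keeping A and B unmerged is what allows the adapter to be applied or swapped at load time instead of being baked into the base model.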
@@ -119,6 +119,7 @@ class LLM:
        TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
        RESIDUAL_SCALE = "{arch}.residual_scale"
        EMBEDDING_SCALE = "{arch}.embedding_scale"
+        TOKEN_SHIFT_COUNT = "{arch}.token_shift_count"
 
    class Attention:
        HEAD_COUNT = "{arch}.attention.head_count"
@@ -134,6 +135,10 @@ class Attention:
        CAUSAL = "{arch}.attention.causal"
        Q_LORA_RANK = "{arch}.attention.q_lora_rank"
        KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
+        DECAY_LORA_RANK = "{arch}.attention.decay_lora_rank"
+        ICLR_LORA_RANK = "{arch}.attention.iclr_lora_rank"
+        VALUE_RESIDUAL_MIX_LORA_RANK = "{arch}.attention.value_residual_mix_lora_rank"
+        GATE_LORA_RANK = "{arch}.attention.gate_lora_rank"
        REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
        SLIDING_WINDOW = "{arch}.attention.sliding_window"
        SCALE = "{arch}.attention.scale"
@@ -189,7 +194,6 @@ class Tokenizer:
        UNK_ID = "tokenizer.ggml.unknown_token_id"
        SEP_ID = "tokenizer.ggml.seperator_token_id"
        PAD_ID = "tokenizer.ggml.padding_token_id"
-        CLS_ID = "tokenizer.ggml.cls_token_id"
        MASK_ID = "tokenizer.ggml.mask_token_id"
        ADD_BOS = "tokenizer.ggml.add_bos_token"
        ADD_EOS = "tokenizer.ggml.add_eos_token"
@@ -251,6 +255,7 @@ class MODEL_ARCH(IntEnum):
    QWEN2VL = auto()
    PHI2 = auto()
    PHI3 = auto()
+    PHIMOE = auto()
    PLAMO = auto()
    CODESHELL = auto()
    ORION = auto()
@@ -259,8 +264,12 @@ class MODEL_ARCH(IntEnum):
    MINICPM3 = auto()
    GEMMA = auto()
    GEMMA2 = auto()
+    GEMMA3 = auto()
    STARCODER2 = auto()
    RWKV6 = auto()
+    RWKV6QWEN2 = auto()
+    RWKV7 = auto()
+    ARWKV7 = auto()
    MAMBA = auto()
    XVERSE = auto()
    COMMAND_R = auto()
@@ -333,13 +342,26 @@ class MODEL_TENSOR(IntEnum):
    SSM_A = auto()
    SSM_D = auto()
    SSM_OUT = auto()
+    TIME_MIX_W0 = auto()
    TIME_MIX_W1 = auto()
    TIME_MIX_W2 = auto()
+    TIME_MIX_A0 = auto()
+    TIME_MIX_A1 = auto()
+    TIME_MIX_A2 = auto()
+    TIME_MIX_V0 = auto()
+    TIME_MIX_V1 = auto()
+    TIME_MIX_V2 = auto()
+    TIME_MIX_G1 = auto()
+    TIME_MIX_G2 = auto()
+    TIME_MIX_K_K = auto()
+    TIME_MIX_K_A = auto()
+    TIME_MIX_R_K = auto()
    TIME_MIX_LERP_X = auto()
    TIME_MIX_LERP_K = auto()
    TIME_MIX_LERP_V = auto()
    TIME_MIX_LERP_R = auto()
    TIME_MIX_LERP_G = auto()
+    TIME_MIX_LERP_FUSED = auto()
    TIME_MIX_LERP_W = auto()
    TIME_MIX_FIRST = auto()
    TIME_MIX_DECAY = auto()
@@ -435,6 +457,7 @@ class MODEL_TENSOR(IntEnum):
    MODEL_ARCH.QWEN2VL: "qwen2vl",
    MODEL_ARCH.PHI2: "phi2",
    MODEL_ARCH.PHI3: "phi3",
+    MODEL_ARCH.PHIMOE: "phimoe",
    MODEL_ARCH.PLAMO: "plamo",
    MODEL_ARCH.CODESHELL: "codeshell",
    MODEL_ARCH.ORION: "orion",
@@ -443,8 +466,12 @@ class MODEL_TENSOR(IntEnum):
    MODEL_ARCH.MINICPM3: "minicpm3",
    MODEL_ARCH.GEMMA: "gemma",
    MODEL_ARCH.GEMMA2: "gemma2",
+    MODEL_ARCH.GEMMA3: "gemma3",
    MODEL_ARCH.STARCODER2: "starcoder2",
    MODEL_ARCH.RWKV6: "rwkv6",
+    MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2",
+    MODEL_ARCH.RWKV7: "rwkv7",
+    MODEL_ARCH.ARWKV7: "arwkv7",
    MODEL_ARCH.MAMBA: "mamba",
    MODEL_ARCH.XVERSE: "xverse",
    MODEL_ARCH.COMMAND_R: "command-r",
@@ -517,13 +544,26 @@ class MODEL_TENSOR(IntEnum):
    MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
    MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
    MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
+    MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0",
    MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
    MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2",
+    MODEL_TENSOR.TIME_MIX_A0: "blk.{bid}.time_mix_a0",
+    MODEL_TENSOR.TIME_MIX_A1: "blk.{bid}.time_mix_a1",
+    MODEL_TENSOR.TIME_MIX_A2: "blk.{bid}.time_mix_a2",
+    MODEL_TENSOR.TIME_MIX_V0: "blk.{bid}.time_mix_v0",
+    MODEL_TENSOR.TIME_MIX_V1: "blk.{bid}.time_mix_v1",
+    MODEL_TENSOR.TIME_MIX_V2: "blk.{bid}.time_mix_v2",
+    MODEL_TENSOR.TIME_MIX_G1: "blk.{bid}.time_mix_g1",
+    MODEL_TENSOR.TIME_MIX_G2: "blk.{bid}.time_mix_g2",
+    MODEL_TENSOR.TIME_MIX_K_K: "blk.{bid}.time_mix_k_k",
+    MODEL_TENSOR.TIME_MIX_K_A: "blk.{bid}.time_mix_k_a",
+    MODEL_TENSOR.TIME_MIX_R_K: "blk.{bid}.time_mix_r_k",
    MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x",
    MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k",
    MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v",
    MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r",
    MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g",
+    MODEL_TENSOR.TIME_MIX_LERP_FUSED: "blk.{bid}.time_mix_lerp_fused",
    MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w",
    MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first",
    MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay",
@@ -947,6 +987,24 @@ class MODEL_TENSOR(IntEnum):
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
+    MODEL_ARCH.PHIMOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FACTORS_LONG,
+        MODEL_TENSOR.ROPE_FACTORS_SHORT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
    MODEL_ARCH.CODESHELL: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.POS_EMBD,
@@ -1060,6 +1118,23 @@ class MODEL_TENSOR(IntEnum):
        MODEL_TENSOR.FFN_PRE_NORM,
        MODEL_TENSOR.FFN_POST_NORM,
    ],
+    MODEL_ARCH.GEMMA3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.FFN_PRE_NORM,
+        MODEL_TENSOR.FFN_POST_NORM,
+    ],
    MODEL_ARCH.STARCODER2: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
@@ -1090,6 +1165,7 @@ class MODEL_TENSOR(IntEnum):
        MODEL_TENSOR.TIME_MIX_LERP_R,
        MODEL_TENSOR.TIME_MIX_LERP_G,
        MODEL_TENSOR.TIME_MIX_LERP_W,
+        MODEL_TENSOR.TIME_MIX_LERP_FUSED,
        MODEL_TENSOR.TIME_MIX_FIRST,
        MODEL_TENSOR.TIME_MIX_DECAY,
        MODEL_TENSOR.TIME_MIX_DECAY_W1,
@@ -1106,6 +1182,97 @@ class MODEL_TENSOR(IntEnum):
        MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE,
        MODEL_TENSOR.CHANNEL_MIX_VALUE,
    ],
+    MODEL_ARCH.RWKV6QWEN2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.TIME_MIX_W1,
+        MODEL_TENSOR.TIME_MIX_W2,
+        MODEL_TENSOR.TIME_MIX_LERP_X,
+        MODEL_TENSOR.TIME_MIX_LERP_K,
+        MODEL_TENSOR.TIME_MIX_LERP_V,
+        MODEL_TENSOR.TIME_MIX_LERP_R,
+        MODEL_TENSOR.TIME_MIX_LERP_G,
+        MODEL_TENSOR.TIME_MIX_LERP_W,
+        MODEL_TENSOR.TIME_MIX_LERP_FUSED,
+        MODEL_TENSOR.TIME_MIX_FIRST,
+        MODEL_TENSOR.TIME_MIX_DECAY,
+        MODEL_TENSOR.TIME_MIX_DECAY_W1,
+        MODEL_TENSOR.TIME_MIX_DECAY_W2,
+        MODEL_TENSOR.TIME_MIX_KEY,
+        MODEL_TENSOR.TIME_MIX_VALUE,
+        MODEL_TENSOR.TIME_MIX_RECEPTANCE,
+        MODEL_TENSOR.TIME_MIX_GATE,
+        MODEL_TENSOR.TIME_MIX_LN,
+        MODEL_TENSOR.TIME_MIX_OUTPUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.RWKV7: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.TIME_MIX_LERP_FUSED,
+        MODEL_TENSOR.TIME_MIX_W0,
+        MODEL_TENSOR.TIME_MIX_W1,
+        MODEL_TENSOR.TIME_MIX_W2,
+        MODEL_TENSOR.TIME_MIX_A0,
+        MODEL_TENSOR.TIME_MIX_A1,
+        MODEL_TENSOR.TIME_MIX_A2,
+        MODEL_TENSOR.TIME_MIX_V0,
+        MODEL_TENSOR.TIME_MIX_V1,
+        MODEL_TENSOR.TIME_MIX_V2,
+        MODEL_TENSOR.TIME_MIX_G1,
+        MODEL_TENSOR.TIME_MIX_G2,
+        MODEL_TENSOR.TIME_MIX_K_K,
+        MODEL_TENSOR.TIME_MIX_K_A,
+        MODEL_TENSOR.TIME_MIX_R_K,
+        MODEL_TENSOR.TIME_MIX_KEY,
+        MODEL_TENSOR.TIME_MIX_VALUE,
+        MODEL_TENSOR.TIME_MIX_RECEPTANCE,
+        MODEL_TENSOR.TIME_MIX_LN,
+        MODEL_TENSOR.TIME_MIX_OUTPUT,
+        MODEL_TENSOR.CHANNEL_MIX_LERP_K,
+        MODEL_TENSOR.CHANNEL_MIX_KEY,
+        MODEL_TENSOR.CHANNEL_MIX_VALUE,
+    ],
+    MODEL_ARCH.ARWKV7: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.TIME_MIX_LERP_FUSED,
+        MODEL_TENSOR.TIME_MIX_W0,
+        MODEL_TENSOR.TIME_MIX_W1,
+        MODEL_TENSOR.TIME_MIX_W2,
+        MODEL_TENSOR.TIME_MIX_A0,
+        MODEL_TENSOR.TIME_MIX_A1,
+        MODEL_TENSOR.TIME_MIX_A2,
+        MODEL_TENSOR.TIME_MIX_V0,
+        MODEL_TENSOR.TIME_MIX_V1,
+        MODEL_TENSOR.TIME_MIX_V2,
+        MODEL_TENSOR.TIME_MIX_G1,
+        MODEL_TENSOR.TIME_MIX_G2,
+        MODEL_TENSOR.TIME_MIX_K_K,
+        MODEL_TENSOR.TIME_MIX_K_A,
+        MODEL_TENSOR.TIME_MIX_R_K,
+        MODEL_TENSOR.TIME_MIX_KEY,
+        MODEL_TENSOR.TIME_MIX_VALUE,
+        MODEL_TENSOR.TIME_MIX_RECEPTANCE,
+        MODEL_TENSOR.TIME_MIX_LN,
+        MODEL_TENSOR.TIME_MIX_OUTPUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
    MODEL_ARCH.MAMBA: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
@@ -1310,6 +1477,9 @@ class MODEL_TENSOR(IntEnum):
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
@@ -1789,7 +1959,6 @@ def get_type(val: Any) -> GGUFValueType:
 KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
 KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
 KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
-KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
 KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
 KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
 KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
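The new MODEL_TENSOR members and their "blk.{bid}..." name templates are only meaningful once expanded per block. A small, hedged illustration of that expansion; it assumes the gguf-py package exposes TENSOR_NAMES keyed by MODEL_TENSOR as upstream gguf-py does, so treat the exact import path as an assumption.

from gguf.constants import MODEL_TENSOR, TENSOR_NAMES  # assumed layout, as in upstream gguf-py

# "blk.{bid}.time_mix_w0" -> "blk.3.time_mix_w0" for block 3 of an RWKV7-style model
print(TENSOR_NAMES[MODEL_TENSOR.TIME_MIX_W0].format(bid=3))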
@@ -0,0 +1,15 @@
+# This file left for compatibility. If you want to use the GGUF API from Python
+# then don't import gguf/gguf.py directly. If you're looking for examples, see the
+# examples/ directory for gguf-py
+
+import importlib
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+# Compatibility for people trying to import gguf/gguf.py directly instead of as a package.
+importlib.invalidate_caches()
+import gguf  # noqa: E402
+
+importlib.reload(gguf)
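The shim above only exists so that code importing gguf/gguf.py directly keeps working; new code should import the package itself. A short hedged sketch of the intended usage (the file path is illustrative):

import gguf  # preferred: import the package, not gguf/gguf.py

reader = gguf.GGUFReader("model.gguf")  # illustrative path
print(gguf.GGUF_VERSION)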
@@ -6,6 +6,7 @@
 
 import logging
 import os
+import sys
 from collections import OrderedDict
 from typing import Any, Literal, NamedTuple, TypeVar, Union
 
@@ -15,7 +16,6 @@
 from .quants import quant_shape_to_byte_shape
 
 if __name__ == "__main__":
-    import sys
     from pathlib import Path
 
     # Allow running file in package as a script.
@@ -28,6 +28,7 @@
     GGUF_VERSION,
     GGMLQuantizationType,
     GGUFValueType,
+    GGUFEndian,
 )
 
 logger = logging.getLogger(__name__)
@@ -53,6 +54,52 @@ class ReaderField(NamedTuple):
 
     types: list[GGUFValueType] = []
 
+    def contents(self, index_or_slice: int | slice = slice(None)) -> Any:
+        if self.types:
+            to_string = lambda x: str(x.tobytes(), encoding="utf-8")  # noqa: E731
+            main_type = self.types[0]
+
+            if main_type == GGUFValueType.ARRAY:
+                sub_type = self.types[-1]
+
+                if sub_type == GGUFValueType.STRING:
+                    indices = self.data[index_or_slice]
+
+                    if isinstance(index_or_slice, int):
+                        return to_string(self.parts[indices])  # type: ignore
+                    else:
+                        return [to_string(self.parts[idx]) for idx in indices]  # type: ignore
+                else:
+                    # FIXME: When/if _get_field_parts() support multi-dimensional arrays, this must do so too
+
+                    # Check if it's unsafe to perform slice optimization on data
+                    # if any(True for idx in self.data if len(self.parts[idx]) != 1):
+                    #     optim_slice = slice(None)
+                    # else:
+                    #     optim_slice = index_or_slice
+                    #     index_or_slice = slice(None)
+
+                    # if isinstance(optim_slice, int):
+                    #     return self.parts[self.data[optim_slice]].tolist()[0]
+                    # else:
+                    #     return [pv for idx in self.data[optim_slice] for pv in self.parts[idx].tolist()][index_or_slice]
+
+                    if isinstance(index_or_slice, int):
+                        return self.parts[self.data[index_or_slice]].tolist()[0]
+                    else:
+                        return [
+                            pv
+                            for idx in self.data[index_or_slice]
+                            for pv in self.parts[idx].tolist()
+                        ]
+
+            if main_type == GGUFValueType.STRING:
+                return to_string(self.parts[-1])
+            else:
+                return self.parts[-1].tolist()[0]
+
+        return None
+
+
 class ReaderTensor(NamedTuple):
     name: str
@@ -103,12 +150,23 @@ def __init__(
            # If we get 0 here that means it's (probably) a GGUF file created for
            # the opposite byte order of the machine this script is running on.
            self.byte_order = "S"
-            temp_version = temp_version.newbyteorder(self.byte_order)
+            temp_version = temp_version.view(
+                temp_version.dtype.newbyteorder(self.byte_order)
+            )
        version = temp_version[0]
        if version not in READER_SUPPORTED_VERSIONS:
            raise ValueError(
                f"Sorry, file appears to be version {version} which we cannot handle"
            )
+        if sys.byteorder == "little":
+            # Host is little endian
+            host_endian = GGUFEndian.LITTLE
+            swapped_endian = GGUFEndian.BIG
+        else:
+            # Sorry PDP or other weird systems that don't use BE or LE.
+            host_endian = GGUFEndian.BIG
+            swapped_endian = GGUFEndian.LITTLE
+        self.endianess = swapped_endian if self.byte_order == "S" else host_endian
        self.fields: OrderedDict[str, ReaderField] = OrderedDict()
        self.tensors: list[ReaderTensor] = []
        offs += self._push_field(
@@ -170,9 +228,11 @@ def _get(
        itemsize = int(np.empty([], dtype=dtype).itemsize)
        end_offs = offset + itemsize * count
        arr = self.data[offset:end_offs].view(dtype=dtype)[:count]
-        if override_order is None:
-            return arr
-        return arr.view(arr.dtype.newbyteorder(override_order))
+        return arr.view(
+            arr.dtype.newbyteorder(
+                self.byte_order if override_order is None else override_order
+            )
+        )
 
    def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
        if field.name in self.fields:
@@ -218,6 +278,7 @@ def _get_field_parts(
            offs += int(alen.nbytes)
            aparts: list[npt.NDArray[Any]] = [raw_itype, alen]
            data_idxs: list[int] = []
+            # FIXME: Handle multi-dimensional arrays properly instead of flattening
            for idx in range(alen[0]):
                curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(
                    offs, raw_itype[0]
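The new ReaderField.contents() helper above decodes a field's value (scalar, string, or array) without the caller having to touch .parts and .data directly. A hedged sketch of how it might be used; the file path and the field names picked here are illustrative:

from gguf import GGUFReader

reader = GGUFReader("model.gguf")  # illustrative path
arch = reader.fields["general.architecture"].contents()  # e.g. "llama"
tokens = reader.fields.get("tokenizer.ggml.tokens")
if tokens is not None:
    print(arch, tokens.contents(slice(0, 5)))  # first five strings of an array field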
@@ -828,6 +828,9 @@ def add_embedding_scale(self, value: float) -> None:
    def add_wkv_head_size(self, size: int) -> None:
        self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
 
+    def add_token_shift_count(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.TOKEN_SHIFT_COUNT.format(arch=self.arch), count)
+
    def add_layer_norm_eps(self, value: float) -> None:
        self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
 
@@ -849,6 +852,20 @@ def add_q_lora_rank(self, length: int) -> None:
    def add_kv_lora_rank(self, length: int) -> None:
        self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)
 
+    def add_decay_lora_rank(self, length: int) -> None:
+        self.add_uint32(Keys.Attention.DECAY_LORA_RANK.format(arch=self.arch), length)
+
+    def add_iclr_lora_rank(self, length: int) -> None:
+        self.add_uint32(Keys.Attention.ICLR_LORA_RANK.format(arch=self.arch), length)
+
+    def add_value_residual_mix_lora_rank(self, length: int) -> None:
+        self.add_uint32(
+            Keys.Attention.VALUE_RESIDUAL_MIX_LORA_RANK.format(arch=self.arch), length
+        )
+
+    def add_gate_lora_rank(self, length: int) -> None:
+        self.add_uint32(Keys.Attention.GATE_LORA_RANK.format(arch=self.arch), length)
+
    def add_relative_attn_buckets_count(self, value: int) -> None:
        self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)
 
@@ -943,9 +960,6 @@ def add_sep_token_id(self, id: int) -> None:
    def add_pad_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.PAD_ID, id)
 
-    def add_cls_token_id(self, id: int) -> None:
-        self.add_uint32(Keys.Tokenizer.CLS_ID, id)
-
    def add_mask_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.MASK_ID, id)
 
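The writer gains one add_* helper per new key, mirroring the Keys.LLM and Keys.Attention constants added earlier in this commit. A hedged sketch of a converter calling them; the output path, architecture string, and values are made up:

from gguf import GGUFWriter

writer = GGUFWriter("out.gguf", arch="rwkv7")  # illustrative path and arch
writer.add_token_shift_count(2)                # {arch}.token_shift_count
writer.add_decay_lora_rank(64)                 # {arch}.attention.decay_lora_rank
writer.add_iclr_lora_rank(64)                  # {arch}.attention.iclr_lora_rank
writer.add_value_residual_mix_lora_rank(32)    # {arch}.attention.value_residual_mix_lora_rank
writer.add_gate_lora_rank(128)                 # {arch}.attention.gate_lora_rank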
@@ -160,21 +160,41 @@ def load_model_card(model_path: Optional[Path] = None) -> dict[str, Any]:
        if not model_card_path.is_file():
            return {}
 
-        # The model card metadata is assumed to always be in YAML
+        # The model card metadata is assumed to always be in YAML (frontmatter)
        # ref: https://github.com/huggingface/transformers/blob/a5c642fe7a1f25d3bdcd76991443ba6ff7ee34b2/src/transformers/modelcard.py#L468-L473
+        yaml_content: str = ""
        with open(model_card_path, "r", encoding="utf-8") as f:
-            if f.readline() == "---\n":
-                raw = f.read().partition("---\n")[0]
-                data = yaml.safe_load(raw)
-                if isinstance(data, dict):
-                    return data
-                else:
-                    logger.error(
-                        f"while reading YAML model card frontmatter, data is {type(data)} instead of dict"
-                    )
-                    return {}
-            else:
-                return {}
+            content = f.read()
+            lines = content.splitlines()
+            lines_yaml = []
+            if len(lines) == 0:
+                # Empty file
+                return {}
+            if len(lines) > 0 and lines[0] != "---":
+                # No frontmatter
+                return {}
+            for line in lines[1:]:
+                if line == "---":
+                    break  # End of frontmatter
+                else:
+                    lines_yaml.append(line)
+            yaml_content = "\n".join(lines_yaml) + "\n"
+
+        # Quick hack to fix the Norway problem
+        # https://hitchdev.com/strictyaml/why/implicit-typing-removed/
+        yaml_content = yaml_content.replace("- no\n", '- "no"\n')
+
+        if yaml_content:
+            data = yaml.safe_load(yaml_content)
+            if isinstance(data, dict):
+                return data
+            else:
+                logger.error(
+                    f"while reading YAML model card frontmatter, data is {type(data)} instead of dict"
+                )
+                return {}
+        else:
+            return {}
 
    @staticmethod
    def load_hf_parameters(model_path: Optional[Path] = None) -> dict[str, Any]:
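The `- no` replacement above works around the YAML "Norway problem": YAML 1.1 loaders such as PyYAML resolve the bare scalar no to boolean False, which corrupts language lists in model card frontmatter. A small demonstration of the behavior being patched around:

import yaml

frontmatter = "language:\n- en\n- no\n"
print(yaml.safe_load(frontmatter))  # {'language': ['en', False]} - 'no' became a boolean
print(yaml.safe_load(frontmatter.replace("- no\n", '- "no"\n')))  # {'language': ['en', 'no']}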
@@ -13,7 +13,7 @@ class TensorNameMap:
            "transformer.wte",  # gpt2 gpt-j mpt refact qwen dbrx jais exaone
            "transformer.word_embeddings",  # falcon
            "word_embeddings",  # bloom
-            "model.embed_tokens",  # llama-hf nemotron olmoe olmo2
+            "model.embed_tokens",  # llama-hf nemotron olmoe olmo2 rwkv6qwen2
            "tok_embeddings",  # llama-pth
            "embeddings.word_embeddings",  # bert nomic-bert
            "language_model.embedding.word_embeddings",  # persimmon
@@ -27,7 +27,8 @@ class TensorNameMap:
            "embedding.word_embeddings",  # chatglm
            "transformer.token_embeddings",  # openelm
            "shared",  # t5
-            "rwkv.embeddings",  # rwkv
+            "rwkv.embeddings",  # rwkv6
+            "model.embeddings",  # rwkv7
        ),
        # Token type embeddings
        MODEL_TENSOR.TOKEN_TYPES: (
@@ -40,6 +41,9 @@ class TensorNameMap:
            "emb_ln",  # nomic-bert
            "transformer.norm",  # openelm
            "rwkv.blocks.0.pre_ln",  # rwkv
+            "rwkv.blocks.0.pre_ln",  # rwkv6
+            "model.pre_ln",  # rwkv7
+            "model.layers.0.pre_norm",  # rwkv7
            "backbone.norm",  # wavtokenizer
        ),
        # Position embeddings
@@ -51,7 +55,7 @@ class TensorNameMap:
        # Output
        MODEL_TENSOR.OUTPUT: (
            "embed_out",  # gptneox
-            "lm_head",  # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2
+            "lm_head",  # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe
            "output",  # llama-pth bloom internlm2
            "word_embeddings_for_head",  # persimmon
            "lm_head.linear",  # phi2
@@ -63,7 +67,7 @@ class TensorNameMap:
        MODEL_TENSOR.OUTPUT_NORM: (
            "gpt_neox.final_layer_norm",  # gptneox
            "transformer.ln_f",  # gpt2 gpt-j falcon jais exaone
-            "model.norm",  # llama-hf baichuan internlm2 olmoe olmo2
+            "model.norm",  # llama-hf baichuan internlm2 olmoe olmo2 phimoe
            "norm",  # llama-pth
            "transformer.norm_f",  # mpt dbrx
            "ln_f",  # refact bloom qwen gpt2
@@ -76,7 +80,8 @@ class TensorNameMap:
            "encoder.final_layernorm",  # chatglm
            "transformer.norm",  # openelm
            "model.norm",  # nemotron
-            "rwkv.ln_out",  # rwkv
+            "rwkv.ln_out",  # rwkv6
+            "model.ln_out",  # rwkv7
            "backbone.final_layer_norm",  # wavtokenizer
        ),
        # Rope frequencies
@@ -98,7 +103,7 @@ class TensorNameMap:
            "transformer.h.{bid}.input_layernorm",  # falcon7b
            "h.{bid}.input_layernorm",  # bloom
            "transformer.h.{bid}.ln_mlp",  # falcon40b
-            "model.layers.{bid}.input_layernorm",  # llama-hf nemotron olmoe
+            "model.layers.{bid}.input_layernorm",  # llama-hf nemotron olmoe phimoe
            "layers.{bid}.attention_norm",  # llama-pth
            "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
            "model.layers.{bid}.ln1",  # yi
@@ -112,13 +117,15 @@ class TensorNameMap:
            "transformer.blocks.{bid}.norm_attn_norm.norm_1",  # dbrx
            "encoder.layers.{bid}.input_layernorm",  # chatglm
            "transformer.layers.{bid}.attn_norm",  # openelm
-            "rwkv.blocks.{bid}.ln1",  # rwkv
+            "rwkv.blocks.{bid}.ln1",  # rwkv6
+            "model.layers.{bid}.ln1",  # rwkv7
        ),
        # Attention norm 2
        MODEL_TENSOR.ATTN_NORM_2: (
            "transformer.h.{bid}.ln_attn",  # falcon40b
            "encoder.layer.{bid}.layer_norm_1",  # jina-v2-code
-            "rwkv.blocks.{bid}.ln2",  # rwkv
+            "rwkv.blocks.{bid}.ln2",  # rwkv6
+            "model.layers.{bid}.ln2",  # rwkv7
        ),
        # Attention query-key-value
        MODEL_TENSOR.ATTN_QKV: (
@@ -139,7 +146,7 @@ class TensorNameMap:
        ),
        # Attention query
        MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj",  # llama-hf nemotron olmoe olmo2
+            "model.layers.{bid}.self_attn.q_proj",  # llama-hf nemotron olmoe olmo2 phimoe
            "model.layers.{bid}.self_attn.q_proj_no_perm",  # llama-custom
            "layers.{bid}.attention.wq",  # llama-pth
            "encoder.layer.{bid}.attention.self.query",  # bert
@@ -151,7 +158,7 @@ class TensorNameMap:
        ),
        # Attention key
        MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj",  # llama-hf nemotron olmoe olmo2
+            "model.layers.{bid}.self_attn.k_proj",  # llama-hf nemotron olmoe olmo2 phimoe
            "model.layers.{bid}.self_attn.k_proj_no_perm",  # llama-custom
            "layers.{bid}.attention.wk",  # llama-pth
            "encoder.layer.{bid}.attention.self.key",  # bert
@@ -164,7 +171,7 @@ class TensorNameMap:
        ),
        # Attention value
        MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj",  # llama-hf nemotron olmoe olmo2
+            "model.layers.{bid}.self_attn.v_proj",  # llama-hf nemotron olmoe olmo2 phimoe
            "layers.{bid}.attention.wv",  # llama-pth
            "encoder.layer.{bid}.attention.self.value",  # bert
            "transformer.h.{bid}.attn.v_proj",  # gpt-j
@@ -181,7 +188,7 @@ class TensorNameMap:
            "transformer.blocks.{bid}.attn.out_proj",  # mpt
            "transformer.h.{bid}.self_attention.dense",  # falcon
            "h.{bid}.self_attention.dense",  # bloom
-            "model.layers.{bid}.self_attn.o_proj",  # llama-hf nemotron olmoe olmo2
+            "model.layers.{bid}.self_attn.o_proj",  # llama-hf nemotron olmoe olmo2 phimoe
            "model.layers.{bid}.self_attn.linear_attn",  # deci
            "layers.{bid}.attention.wo",  # llama-pth
            "encoder.layer.{bid}.attention.output.dense",  # bert
@@ -222,7 +229,7 @@ class TensorNameMap:
            "transformer.h.{bid}.ln_2",  # gpt2 refact qwen jais exaone
            "h.{bid}.post_attention_layernorm",  # bloom
            "transformer.blocks.{bid}.norm_2",  # mpt
-            "model.layers.{bid}.post_attention_layernorm",  # llama-hf nemotron olmoe
+            "model.layers.{bid}.post_attention_layernorm",  # llama-hf nemotron olmoe phimoe
            "layers.{bid}.ffn_norm",  # llama-pth
            "language_model.encoder.layers.{bid}.post_attention_layernorm",  # persimmon
            "model.layers.{bid}.ln2",  # yi
@@ -242,7 +249,7 @@ class TensorNameMap:
        ),
        MODEL_TENSOR.FFN_GATE_INP: (
            "layers.{bid}.feed_forward.gate",  # mixtral
-            "model.layers.{bid}.block_sparse_moe.gate",  # mixtral
+            "model.layers.{bid}.block_sparse_moe.gate",  # mixtral phimoe
            "model.layers.{bid}.mlp.gate",  # qwen2moe olmoe
            "transformer.decoder_layer.{bid}.router",  # Grok
            "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
@@ -287,6 +294,7 @@ class TensorNameMap:
            "transformer.decoder_layer.{bid}.moe.linear_v",  # Grok (merged)
            "transformer.blocks.{bid}.ffn.experts.mlp.v1",  # dbrx
            "model.layers.{bid}.mlp.experts.up_proj",  # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.experts.w3",  # phimoe (merged)
        ),
        MODEL_TENSOR.FFN_UP_SHEXP: (
            "model.layers.{bid}.mlp.shared_expert.up_proj",  # qwen2moe
@@ -313,6 +321,7 @@ class TensorNameMap:
            "transformer.decoder_layer.{bid}.moe.linear",  # Grok (merged)
            "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
            "model.layers.{bid}.mlp.experts.gate_proj",  # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.experts.w1",  # phimoe (merged)
        ),
        MODEL_TENSOR.FFN_GATE_SHEXP: (
            "model.layers.{bid}.mlp.shared_expert.gate_proj",  # qwen2moe
@@ -351,6 +360,7 @@ class TensorNameMap:
            "transformer.blocks.{bid}.ffn.experts.mlp.w2",  # dbrx
            "model.layers.{bid}.mlp.experts.down_proj",  # qwen2moe olmoe (merged)
            "model.layers.{bid}.block_sparse_moe.output_linear",  # granitemoe
+            "model.layers.{bid}.block_sparse_moe.experts.w2",  # phimoe (merged)
        ),
        MODEL_TENSOR.FFN_DOWN_SHEXP: (
            "model.layers.{bid}.mlp.shared_expert.down_proj",  # qwen2moe
@@ -410,62 +420,116 @@ class TensorNameMap:
            "model.layers.{bid}.out_proj",
            "backbone.layers.{bid}.mixer.out_proj",
        ),
+        MODEL_TENSOR.TIME_MIX_W0: ("model.layers.{bid}.attention.w0",),  # rwkv7
        MODEL_TENSOR.TIME_MIX_W1: (
-            "rwkv.blocks.{bid}.attention.time_maa_w1",  # rwkv v6
+            "rwkv.blocks.{bid}.attention.time_maa_w1",  # rwkv6
+            "model.layers.{bid}.self_attn.time_maa_w1",  # rwkv6qwen2
+            "model.layers.{bid}.attention.w1",  # rwkv7
        ),
        MODEL_TENSOR.TIME_MIX_W2: (
-            "rwkv.blocks.{bid}.attention.time_maa_w2",  # rwkv v6
+            "rwkv.blocks.{bid}.attention.time_maa_w2",  # rwkv6
+            "model.layers.{bid}.self_attn.time_maa_w2",  # rwkv6qwen2
+            "model.layers.{bid}.attention.w2",  # rwkv7
        ),
+        MODEL_TENSOR.TIME_MIX_A0: ("model.layers.{bid}.attention.a0",),  # rwkv7
+        MODEL_TENSOR.TIME_MIX_A1: ("model.layers.{bid}.attention.a1",),  # rwkv7
+        MODEL_TENSOR.TIME_MIX_A2: ("model.layers.{bid}.attention.a2",),  # rwkv7
+        MODEL_TENSOR.TIME_MIX_V0: ("model.layers.{bid}.attention.v0",),  # rwkv7
+        MODEL_TENSOR.TIME_MIX_V1: ("model.layers.{bid}.attention.v1",),  # rwkv7
+        MODEL_TENSOR.TIME_MIX_V2: ("model.layers.{bid}.attention.v2",),  # rwkv7
+        MODEL_TENSOR.TIME_MIX_G1: ("model.layers.{bid}.attention.g1",),  # rwkv7
+        MODEL_TENSOR.TIME_MIX_G2: ("model.layers.{bid}.attention.g2",),  # rwkv7
+        MODEL_TENSOR.TIME_MIX_K_K: ("model.layers.{bid}.attention.k_k",),  # rwkv7
+        MODEL_TENSOR.TIME_MIX_K_A: ("model.layers.{bid}.attention.k_a",),  # rwkv7
+        MODEL_TENSOR.TIME_MIX_R_K: ("model.layers.{bid}.attention.r_k",),  # rwkv7
        MODEL_TENSOR.TIME_MIX_LERP_X: (
-            "rwkv.blocks.{bid}.attention.time_maa_x",  # rwkv v6
+            "rwkv.blocks.{bid}.attention.time_maa_x",  # rwkv6
+            "model.layers.{bid}.self_attn.time_maa_x",  # rwkv6qwen2
        ),
        MODEL_TENSOR.TIME_MIX_LERP_K: (
-            "rwkv.blocks.{bid}.attention.time_maa_k",  # rwkv v6
+            "rwkv.blocks.{bid}.attention.time_maa_k",  # rwkv6
+            "model.layers.{bid}.self_attn.time_maa_k",  # rwkv6qwen2
        ),
        MODEL_TENSOR.TIME_MIX_LERP_V: (
-            "rwkv.blocks.{bid}.attention.time_maa_v",  # rwkv v6
+            "rwkv.blocks.{bid}.attention.time_maa_v",  # rwkv6
+            "model.layers.{bid}.self_attn.time_maa_v",  # rwkv6qwen2
        ),
        MODEL_TENSOR.TIME_MIX_LERP_R: (
-            "rwkv.blocks.{bid}.attention.time_maa_r",  # rwkv v6
+            "rwkv.blocks.{bid}.attention.time_maa_r",  # rwkv6
+            "model.layers.{bid}.self_attn.time_maa_r",  # rwkv6qwen2
        ),
        MODEL_TENSOR.TIME_MIX_LERP_G: (
-            "rwkv.blocks.{bid}.attention.time_maa_g",  # rwkv v6
+            "rwkv.blocks.{bid}.attention.time_maa_g",  # rwkv6
+            "model.layers.{bid}.self_attn.time_maa_g",  # rwkv6qwen2
        ),
        MODEL_TENSOR.TIME_MIX_LERP_W: (
-            "rwkv.blocks.{bid}.attention.time_maa_w",  # rwkv v6
+            "rwkv.blocks.{bid}.attention.time_maa_w",  # rwkv6
+            "model.layers.{bid}.self_attn.time_maa_w",  # rwkv6qwen2
        ),
        MODEL_TENSOR.TIME_MIX_FIRST: (
-            "rwkv.blocks.{bid}.attention.time_faaaa",  # rwkv v6
+            "rwkv.blocks.{bid}.attention.time_faaaa",  # rwkv6
        ),
        MODEL_TENSOR.TIME_MIX_DECAY: (
-            "rwkv.blocks.{bid}.attention.time_decay",  # rwkv v6
+            "rwkv.blocks.{bid}.attention.time_decay",  # rwkv6
+            "model.layers.{bid}.self_attn.time_decay",  # rwkv6qwen2
        ),
        MODEL_TENSOR.TIME_MIX_DECAY_W1: (
-            "rwkv.blocks.{bid}.attention.time_decay_w1",  # rwkv v6
+            "rwkv.blocks.{bid}.attention.time_decay_w1",  # rwkv6
+            "model.layers.{bid}.self_attn.time_decay_w1",  # rwkv6qwen2
        ),
        MODEL_TENSOR.TIME_MIX_DECAY_W2: (
-            "rwkv.blocks.{bid}.attention.time_decay_w2",  # rwkv v6
+            "rwkv.blocks.{bid}.attention.time_decay_w2",  # rwkv6
+            "model.layers.{bid}.self_attn.time_decay_w2",  # rwkv6qwen2
+        ),
+        MODEL_TENSOR.TIME_MIX_KEY: (
+            "rwkv.blocks.{bid}.attention.key",  # rwkv6
+            "model.layers.{bid}.self_attn.k_proj",  # rwkv6qwen2
+            "model.layers.{bid}.attention.key",  # rwkv7
+            "model.layers.{bid}.attention.k_proj",  # rwkv7
+        ),
+        MODEL_TENSOR.TIME_MIX_VALUE: (
+            "rwkv.blocks.{bid}.attention.value",  # rwkv6
+            "model.layers.{bid}.self_attn.v_proj",  # rwkv6qwen2
+            "model.layers.{bid}.attention.value",  # rwkv7
+            "model.layers.{bid}.attention.v_proj",  # rwkv7
        ),
-        MODEL_TENSOR.TIME_MIX_KEY: ("rwkv.blocks.{bid}.attention.key",),  # rwkv
-        MODEL_TENSOR.TIME_MIX_VALUE: ("rwkv.blocks.{bid}.attention.value",),  # rwkv
        MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
-            "rwkv.blocks.{bid}.attention.receptance",  # rwkv
+            "rwkv.blocks.{bid}.attention.receptance",  # rwkv6
+            "model.layers.{bid}.self_attn.q_proj",  # rwkv6qwen2
+            "model.layers.{bid}.attention.receptance",  # rwkv7
+            "model.layers.{bid}.attention.r_proj",  # rwkv7
+        ),
+        MODEL_TENSOR.TIME_MIX_GATE: (
+            "rwkv.blocks.{bid}.attention.gate",  # rwkv6
+            "model.layers.{bid}.self_attn.gate",  # rwkv6qwen2
+        ),
+        MODEL_TENSOR.TIME_MIX_LN: (
+            "rwkv.blocks.{bid}.attention.ln_x",  # rwkv6
+            "model.layers.{bid}.attention.ln_x",  # rwkv7
+        ),
+        MODEL_TENSOR.TIME_MIX_OUTPUT: (
+            "rwkv.blocks.{bid}.attention.output",  # rwkv6
+            "model.layers.{bid}.self_attn.o_proj",  # rwkv6qwen2
+            "model.layers.{bid}.attention.output",  # rwkv7
+            "model.layers.{bid}.attention.o_proj",  # rwkv7
        ),
-        MODEL_TENSOR.TIME_MIX_GATE: ("rwkv.blocks.{bid}.attention.gate",),  # rwkv
-        MODEL_TENSOR.TIME_MIX_LN: ("rwkv.blocks.{bid}.attention.ln_x",),  # rwkv
-        MODEL_TENSOR.TIME_MIX_OUTPUT: ("rwkv.blocks.{bid}.attention.output",),  # rwkv
        MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
-            "rwkv.blocks.{bid}.feed_forward.time_maa_k",  # rwkv v6
+            "rwkv.blocks.{bid}.feed_forward.time_maa_k",  # rwkv6
+            "model.layers.{bid}.feed_forward.x_k",  # rwkv7
        ),
        MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
|
||||||
"rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6
|
"rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv6
|
||||||
|
),
|
||||||
|
MODEL_TENSOR.CHANNEL_MIX_KEY: (
|
||||||
|
"rwkv.blocks.{bid}.feed_forward.key", # rwkv6
|
||||||
|
"model.layers.{bid}.feed_forward.key", # rwkv7
|
||||||
),
|
),
|
||||||
MODEL_TENSOR.CHANNEL_MIX_KEY: ("rwkv.blocks.{bid}.feed_forward.key",), # rwkv
|
|
||||||
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
|
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
|
||||||
"rwkv.blocks.{bid}.feed_forward.receptance", # rwkv
|
"rwkv.blocks.{bid}.feed_forward.receptance", # rwkv6
|
||||||
),
|
),
|
||||||
MODEL_TENSOR.CHANNEL_MIX_VALUE: (
|
MODEL_TENSOR.CHANNEL_MIX_VALUE: (
|
||||||
"rwkv.blocks.{bid}.feed_forward.value", # rwkv
|
"rwkv.blocks.{bid}.feed_forward.value", # rwkv6
|
||||||
|
"model.layers.{bid}.feed_forward.value", # rwkv7
|
||||||
),
|
),
|
||||||
MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), # deepseek2
|
MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), # deepseek2
|
||||||
MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), # deepseek2
|
MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), # deepseek2
|
||||||
|
|
|
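Note: each entry above pairs one GGUF tensor kind with the checkpoint names it may carry across the supported architectures (rwkv6, rwkv6qwen2, rwkv7, and so on), with {bid} standing for the block index. As a rough standalone sketch of how such a table gets used, assuming a simplified dict and a hypothetical build_mapping helper rather than the real gguf-py TensorNameMap API:

# Illustrative sketch only: a simplified mapping table and a hypothetical helper,
# not the gguf-py TensorNameMap API.
TENSOR_NAMES: dict[str, tuple[str, ...]] = {
    "blk.{bid}.time_mix_w1": (
        "rwkv.blocks.{bid}.attention.time_maa_w1",   # rwkv6
        "model.layers.{bid}.self_attn.time_maa_w1",  # rwkv6qwen2
        "model.layers.{bid}.attention.w1",           # rwkv7
    ),
}

def build_mapping(n_blocks: int) -> dict[str, str]:
    # Expand the per-block templates into a flat checkpoint-name -> GGUF-name lookup.
    mapping: dict[str, str] = {}
    for gguf_name, candidates in TENSOR_NAMES.items():
        for bid in range(n_blocks):
            for candidate in candidates:
                mapping[candidate.format(bid=bid)] = gguf_name.format(bid=bid)
    return mapping

print(build_mapping(2)["model.layers.1.attention.w1"])  # -> blk.1.time_mix_w1

The real class also deals with weight/bias suffixes and per-architecture filtering, so treat this only as an illustration of the lookup direction: checkpoint name in, GGUF name out.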
@@ -67,7 +67,7 @@ def naming_convention(
     output_type: str | None,
     model_type: Literal["vocab", "LoRA"] | None = None,
 ) -> str:
-    # Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention
+    # Reference: https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#gguf-naming-convention

     if base_name is not None:
         name = base_name.strip().replace(" ", "-").replace("/", "-")
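The only change in this hunk is the reference URL (ggerganov moved to the ggml-org organization). For context, the sanitization visible on the last line keeps generated GGUF file names path safe; a small helper in the same spirit, where the dash-joining and the build_filename name are assumptions for illustration rather than the library function:

def sanitize(part: str) -> str:
    # Mirror the replacement above: spaces and slashes become dashes.
    return part.strip().replace(" ", "-").replace("/", "-")

def build_filename(*parts: str | None) -> str:
    # Assumption for illustration: join the non-empty, sanitized parts with dashes.
    return "-".join(sanitize(p) for p in parts if p) + ".gguf"

print(build_filename("my org/My Model", "7B", "F16"))  # my-org-My-Model-7B-F16.gguf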
@@ -166,7 +166,7 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
                         and isinstance(merges[0][0], str)
                     ):
                         # New format since transformers 4.45 to support spaces in merges
-                        # ref: https://github.com/ggerganov/llama.cpp/issues/9692
+                        # ref: https://github.com/ggml-org/llama.cpp/issues/9692
                         # TODO: internally store as the new format instead of converting to old
                         if any(" " in s for pair in merges for s in pair):
                             logger.warning(
@@ -195,7 +195,12 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
             return True
         with open(tokenizer_config_file, encoding="utf-8") as f:
             tokenizer_config = json.load(f)
-        chat_template = tokenizer_config.get("chat_template")
+        chat_template_alt = None
+        chat_template_file = path / "chat_template.json"
+        if chat_template_file.is_file():
+            with open(chat_template_file, encoding="utf-8") as f:
+                chat_template_alt = json.load(f).get("chat_template")
+        chat_template = tokenizer_config.get("chat_template", chat_template_alt)
         if chat_template is None or isinstance(chat_template, (str, list)):
             self.chat_template = chat_template
         else:
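The added lines make the loader look for a sibling chat_template.json and use its chat_template only when tokenizer_config.json does not define one, since dict.get applies the default only for a missing key. A minimal standalone sketch of that precedence, where the load_chat_template helper is illustrative and not part of the library:

import json
from pathlib import Path

def load_chat_template(model_dir: Path):
    # Illustrative only: prefer tokenizer_config.json's chat_template and
    # fall back to a sibling chat_template.json when the key is absent.
    fallback = None
    chat_template_file = model_dir / "chat_template.json"
    if chat_template_file.is_file():
        with open(chat_template_file, encoding="utf-8") as f:
            fallback = json.load(f).get("chat_template")
    with open(model_dir / "tokenizer_config.json", encoding="utf-8") as f:
        tokenizer_config = json.load(f)
    # dict.get only uses the default when the key is missing, so an explicit
    # chat_template in tokenizer_config.json always wins.
    return tokenizer_config.get("chat_template", fallback)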