diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 87986db..ea00807 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -61,8 +61,8 @@ jobs: if: matrix.os == 'windows-latest' run: | $distPath = if ("${{ github.event.inputs.build_type }}" -eq "RELEASE") { "build\release\dist" } else { "build\dev\dist" } - New-Item -ItemType Directory -Force -Path "$distPath\src\gguf-py" - Copy-Item -Path "src\gguf-py\*" -Destination "$distPath\src\gguf-py" -Recurse + New-Item -ItemType Directory -Force -Path "$distPath\src\gguf" + Copy-Item -Path "src\gguf\*" -Destination "$distPath\src\gguf" -Recurse Copy-Item -Path "src\convert_hf_to_gguf.py" -Destination "$distPath\src" Copy-Item -Path "src\convert_lora_to_gguf.py" -Destination "$distPath\src" Copy-Item -Path "src\convert_lora_to_ggml.py" -Destination "$distPath\src" @@ -72,8 +72,8 @@ jobs: if: matrix.os != 'windows-latest' run: | distPath=$(if [ "${{ github.event.inputs.build_type }}" = "RELEASE" ]; then echo "build/release/dist"; else echo "build/dev/dist"; fi) - mkdir -p $distPath/src/gguf-py - cp -R src/gguf-py/* $distPath/src/gguf-py/ + mkdir -p $distPath/src/gguf + cp -R src/gguf/* $distPath/src/gguf/ cp src/convert_hf_to_gguf.py $distPath/src/ cp src/convert_lora_to_gguf.py $distPath/src/ cp src/convert_lora_to_ggml.py $distPath/src/ diff --git a/.gitignore b/.gitignore index bfd4365..910a80b 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,9 @@ __pycache__/ !src/ src/* !src/*.py +!src/gguf +src/gguf/* +!src/gguf/*.py # Allow docs folder and its .py files !docs/ diff --git a/src/AutoGGUF.py b/src/AutoGGUF.py index 417545f..3a901a6 100644 --- a/src/AutoGGUF.py +++ b/src/AutoGGUF.py @@ -1,8 +1,8 @@ import importlib import json import shutil -import urllib.request import urllib.error +import urllib.request from datetime import datetime from functools import partial, wraps from typing import Any, Dict, List, Tuple @@ -24,10 +24,10 @@ from error_handling import handle_error, show_error from imports_and_globals import ( ensure_directory, + load_dotenv, open_file_safe, resource_path, show_about, - load_dotenv, ) @@ -41,21 +41,18 @@ def wrapper(self, *args, **kwargs): # Length check if len(value) > 1024: - show_error(f"{field} exceeds maximum length") + show_error(self.logger, f"{field} exceeds maximum length") # Normalize path normalized_path = os.path.normpath(value) # Check for path traversal attempts if ".." 
in normalized_path: - show_error(f"Invalid path in {field}") + show_error(self.logger, f"Invalid path in {field}") # Disallow control characters and null bytes if re.search(r"[\x00-\x1f\x7f]", value): - show_error(f"Invalid characters in {field}") - - # Update the field with normalized path - getattr(self, field).setText(normalized_path) + show_error(self.logger, f"Invalid characters in {field}") return func(self, *args, **kwargs) diff --git a/src/convert_hf_to_gguf.py b/src/convert_hf_to_gguf.py index 6400e33..0a0215c 100644 --- a/src/convert_hf_to_gguf.py +++ b/src/convert_hf_to_gguf.py @@ -30,8 +30,6 @@ if TYPE_CHECKING: from torch import Tensor -if "NO_LOCAL_GGUF" not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / "gguf-py")) import gguf logger = logging.getLogger("hf-to-gguf") diff --git a/src/convert_lora_to_ggml.py b/src/convert_lora_to_ggml.py index a14f620..6047821 100644 --- a/src/convert_lora_to_ggml.py +++ b/src/convert_lora_to_ggml.py @@ -1,19 +1,17 @@ from __future__ import annotations -import logging import json +import logging import os import struct import sys -from pathlib import Path -from typing import Any, BinaryIO, Sequence +from typing import BinaryIO import numpy as np import torch -if "NO_LOCAL_GGUF" not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf")) -import gguf +from gguf.constants import * +from gguf.tensor_mapping import * logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger("lora-to-gguf") @@ -51,11 +49,6 @@ def write_tensor_header( fout.seek((fout.tell() + 31) & -32) -def pyinstaller_include(): - # PyInstaller import - pass - - if __name__ == "__main__": if len(sys.argv) < 2: logger.info(f"Usage: python {sys.argv[0]} <path> [arch]") @@ -63,7 +56,7 @@ def pyinstaller_include(): "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" ) logger.info( - f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)" + f"Arch must be one of {list(MODEL_ARCH_NAMES.values())} (default: llama)" ) sys.exit(1) @@ -82,14 +75,14 @@ def pyinstaller_include(): arch_name = sys.argv[3] if len(sys.argv) == 4 else "llama" - if arch_name not in gguf.MODEL_ARCH_NAMES.values(): + if arch_name not in MODEL_ARCH_NAMES.values(): logger.error(f"Error: unsupported architecture {arch_name}") sys.exit(1) - arch = list(gguf.MODEL_ARCH_NAMES.keys())[ - list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name) + arch = list(MODEL_ARCH_NAMES.keys())[ + list(MODEL_ARCH_NAMES.values()).index(arch_name) ] - name_map = gguf.TensorNameMap(arch, 500) + name_map = TensorNameMap(arch, 500) with open(input_json, "r") as f: params = json.load(f) diff --git a/src/convert_lora_to_gguf.py b/src/convert_lora_to_gguf.py index 2cac648..d5354d2 100644 --- a/src/convert_lora_to_gguf.py +++ b/src/convert_lora_to_gguf.py @@ -24,9 +24,7 @@ if TYPE_CHECKING: from torch import Tensor -if "NO_LOCAL_GGUF" not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / "gguf-py")) -import gguf +from gguf.constants import * from convert_hf_to_gguf import LazyTorchTensor, Model diff --git a/src/gguf-py/gguf/gguf.py b/src/gguf-py/gguf/gguf.py deleted file mode 100644 index 651a81e..0000000 --- a/src/gguf-py/gguf/gguf.py +++ /dev/null @@ -1,15 +0,0 @@ -# This file left for compatibility. If you want to use the GGUF API from Python -# then don't import gguf/gguf.py directly. 
If you're looking for examples, see the -# examples/ directory for gguf-py - -import importlib -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent)) - -# Compatibility for people trying to import gguf/gguf.py directly instead of as a package. -importlib.invalidate_caches() -import gguf # noqa: E402 - -importlib.reload(gguf) diff --git a/src/gguf-py/gguf/__init__.py b/src/gguf/__init__.py similarity index 100% rename from src/gguf-py/gguf/__init__.py rename to src/gguf/__init__.py diff --git a/src/gguf-py/gguf/constants.py b/src/gguf/constants.py similarity index 50% rename from src/gguf-py/gguf/constants.py rename to src/gguf/constants.py index 35da7d7..51ea4e4 100644 --- a/src/gguf-py/gguf/constants.py +++ b/src/gguf/constants.py @@ -3,413 +3,418 @@ from enum import Enum, IntEnum, auto from typing import Any -GGUF_MAGIC = 0x46554747 -GGUF_VERSION = 3 +GGUF_MAGIC = 0x46554747 +GGUF_VERSION = 3 GGUF_DEFAULT_ALIGNMENT = 32 -GGML_QUANT_VERSION = 2 +GGML_QUANT_VERSION = 2 + class Keys: class General: - TYPE = "general.type" - ARCHITECTURE = "general.architecture" - QUANTIZATION_VERSION = "general.quantization_version" - ALIGNMENT = "general.alignment" - FILE_TYPE = "general.file_type" + TYPE = "general.type" + ARCHITECTURE = "general.architecture" + QUANTIZATION_VERSION = "general.quantization_version" + ALIGNMENT = "general.alignment" + FILE_TYPE = "general.file_type" - NAME = "general.name" - AUTHOR = "general.author" - VERSION = "general.version" - ORGANIZATION = "general.organization" + NAME = "general.name" + AUTHOR = "general.author" + VERSION = "general.version" + ORGANIZATION = "general.organization" - FINETUNE = "general.finetune" - BASENAME = "general.basename" + FINETUNE = "general.finetune" + BASENAME = "general.basename" - DESCRIPTION = "general.description" - QUANTIZED_BY = "general.quantized_by" + DESCRIPTION = "general.description" + QUANTIZED_BY = "general.quantized_by" - SIZE_LABEL = "general.size_label" + SIZE_LABEL = "general.size_label" - LICENSE = "general.license" - LICENSE_NAME = "general.license.name" - LICENSE_LINK = "general.license.link" + LICENSE = "general.license" + LICENSE_NAME = "general.license.name" + LICENSE_LINK = "general.license.link" - URL = "general.url" - DOI = "general.doi" - UUID = "general.uuid" - REPO_URL = "general.repo_url" + URL = "general.url" + DOI = "general.doi" + UUID = "general.uuid" + REPO_URL = "general.repo_url" - SOURCE_URL = "general.source.url" - SOURCE_DOI = "general.source.doi" - SOURCE_UUID = "general.source.uuid" - SOURCE_REPO_URL = "general.source.repo_url" + SOURCE_URL = "general.source.url" + SOURCE_DOI = "general.source.doi" + SOURCE_UUID = "general.source.uuid" + SOURCE_REPO_URL = "general.source.repo_url" - BASE_MODEL_COUNT = "general.base_model.count" - BASE_MODEL_NAME = "general.base_model.{id}.name" - BASE_MODEL_AUTHOR = "general.base_model.{id}.author" - BASE_MODEL_VERSION = "general.base_model.{id}.version" - BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization" - BASE_MODEL_URL = "general.base_model.{id}.url" - BASE_MODEL_DOI = "general.base_model.{id}.doi" - BASE_MODEL_UUID = "general.base_model.{id}.uuid" - BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" + BASE_MODEL_COUNT = "general.base_model.count" + BASE_MODEL_NAME = "general.base_model.{id}.name" + BASE_MODEL_AUTHOR = "general.base_model.{id}.author" + BASE_MODEL_VERSION = "general.base_model.{id}.version" + BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization" + BASE_MODEL_URL = 
"general.base_model.{id}.url" + BASE_MODEL_DOI = "general.base_model.{id}.doi" + BASE_MODEL_UUID = "general.base_model.{id}.uuid" + BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" - TAGS = "general.tags" - LANGUAGES = "general.languages" - DATASETS = "general.datasets" + TAGS = "general.tags" + LANGUAGES = "general.languages" + DATASETS = "general.datasets" class LLM: - VOCAB_SIZE = "{arch}.vocab_size" - CONTEXT_LENGTH = "{arch}.context_length" - EMBEDDING_LENGTH = "{arch}.embedding_length" - BLOCK_COUNT = "{arch}.block_count" - LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" - FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" - EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length" + VOCAB_SIZE = "{arch}.vocab_size" + CONTEXT_LENGTH = "{arch}.context_length" + EMBEDDING_LENGTH = "{arch}.embedding_length" + BLOCK_COUNT = "{arch}.block_count" + LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" + FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" + EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length" EXPERT_SHARED_FEED_FORWARD_LENGTH = "{arch}.expert_shared_feed_forward_length" - USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" - TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" - EXPERT_COUNT = "{arch}.expert_count" - EXPERT_USED_COUNT = "{arch}.expert_used_count" - EXPERT_SHARED_COUNT = "{arch}.expert_shared_count" - EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale" - POOLING_TYPE = "{arch}.pooling_type" - LOGIT_SCALE = "{arch}.logit_scale" - DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" - ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" - FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" + USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" + TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" + EXPERT_COUNT = "{arch}.expert_count" + EXPERT_USED_COUNT = "{arch}.expert_used_count" + EXPERT_SHARED_COUNT = "{arch}.expert_shared_count" + EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale" + POOLING_TYPE = "{arch}.pooling_type" + LOGIT_SCALE = "{arch}.logit_scale" + DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" + ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" + FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" class Attention: - HEAD_COUNT = "{arch}.attention.head_count" - HEAD_COUNT_KV = "{arch}.attention.head_count_kv" - MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" - CLAMP_KQV = "{arch}.attention.clamp_kqv" - KEY_LENGTH = "{arch}.attention.key_length" - VALUE_LENGTH = "{arch}.attention.value_length" - LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" + HEAD_COUNT = "{arch}.attention.head_count" + HEAD_COUNT_KV = "{arch}.attention.head_count_kv" + MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" + CLAMP_KQV = "{arch}.attention.clamp_kqv" + KEY_LENGTH = "{arch}.attention.key_length" + VALUE_LENGTH = "{arch}.attention.value_length" + LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" - CAUSAL = "{arch}.attention.causal" - Q_LORA_RANK = "{arch}.attention.q_lora_rank" - KV_LORA_RANK = "{arch}.attention.kv_lora_rank" + CAUSAL = "{arch}.attention.causal" + Q_LORA_RANK = "{arch}.attention.q_lora_rank" + KV_LORA_RANK = "{arch}.attention.kv_lora_rank" REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" - SLIDING_WINDOW = "{arch}.attention.sliding_window" + SLIDING_WINDOW = "{arch}.attention.sliding_window" class Rope: - DIMENSION_COUNT = "{arch}.rope.dimension_count" - FREQ_BASE = 
"{arch}.rope.freq_base" - SCALING_TYPE = "{arch}.rope.scaling.type" - SCALING_FACTOR = "{arch}.rope.scaling.factor" - SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" - SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" - SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" - SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier" + DIMENSION_COUNT = "{arch}.rope.dimension_count" + FREQ_BASE = "{arch}.rope.freq_base" + SCALING_TYPE = "{arch}.rope.scaling.type" + SCALING_FACTOR = "{arch}.rope.scaling.factor" + SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" + SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" + SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" + SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier" class Split: - LLM_KV_SPLIT_NO = "split.no" - LLM_KV_SPLIT_COUNT = "split.count" + LLM_KV_SPLIT_NO = "split.no" + LLM_KV_SPLIT_COUNT = "split.count" LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count" class SSM: - CONV_KERNEL = "{arch}.ssm.conv_kernel" - INNER_SIZE = "{arch}.ssm.inner_size" - STATE_SIZE = "{arch}.ssm.state_size" + CONV_KERNEL = "{arch}.ssm.conv_kernel" + INNER_SIZE = "{arch}.ssm.inner_size" + STATE_SIZE = "{arch}.ssm.state_size" TIME_STEP_RANK = "{arch}.ssm.time_step_rank" class Tokenizer: - MODEL = "tokenizer.ggml.model" - PRE = "tokenizer.ggml.pre" - LIST = "tokenizer.ggml.tokens" - TOKEN_TYPE = "tokenizer.ggml.token_type" - TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" - SCORES = "tokenizer.ggml.scores" - MERGES = "tokenizer.ggml.merges" - BOS_ID = "tokenizer.ggml.bos_token_id" - EOS_ID = "tokenizer.ggml.eos_token_id" - UNK_ID = "tokenizer.ggml.unknown_token_id" - SEP_ID = "tokenizer.ggml.seperator_token_id" - PAD_ID = "tokenizer.ggml.padding_token_id" - CLS_ID = "tokenizer.ggml.cls_token_id" - MASK_ID = "tokenizer.ggml.mask_token_id" - ADD_BOS = "tokenizer.ggml.add_bos_token" - ADD_EOS = "tokenizer.ggml.add_eos_token" - ADD_PREFIX = "tokenizer.ggml.add_space_prefix" - REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces" + MODEL = "tokenizer.ggml.model" + PRE = "tokenizer.ggml.pre" + LIST = "tokenizer.ggml.tokens" + TOKEN_TYPE = "tokenizer.ggml.token_type" + TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" + SCORES = "tokenizer.ggml.scores" + MERGES = "tokenizer.ggml.merges" + BOS_ID = "tokenizer.ggml.bos_token_id" + EOS_ID = "tokenizer.ggml.eos_token_id" + UNK_ID = "tokenizer.ggml.unknown_token_id" + SEP_ID = "tokenizer.ggml.seperator_token_id" + PAD_ID = "tokenizer.ggml.padding_token_id" + CLS_ID = "tokenizer.ggml.cls_token_id" + MASK_ID = "tokenizer.ggml.mask_token_id" + ADD_BOS = "tokenizer.ggml.add_bos_token" + ADD_EOS = "tokenizer.ggml.add_eos_token" + ADD_PREFIX = "tokenizer.ggml.add_space_prefix" + REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces" PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap" - HF_JSON = "tokenizer.huggingface.json" - RWKV = "tokenizer.rwkv.world" - CHAT_TEMPLATE = "tokenizer.chat_template" - CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" - CHAT_TEMPLATES = "tokenizer.chat_templates" + HF_JSON = "tokenizer.huggingface.json" + RWKV = "tokenizer.rwkv.world" + CHAT_TEMPLATE = "tokenizer.chat_template" + CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" + CHAT_TEMPLATES = "tokenizer.chat_templates" - PREFIX_ID = "tokenizer.ggml.prefix_token_id" - SUFFIX_ID = "tokenizer.ggml.suffix_token_id" - MIDDLE_ID = "tokenizer.ggml.middle_token_id" - EOT_ID = "tokenizer.ggml.eot_token_id" - EOM_ID = "tokenizer.ggml.eom_token_id" + 
PREFIX_ID = "tokenizer.ggml.prefix_token_id" + SUFFIX_ID = "tokenizer.ggml.suffix_token_id" + MIDDLE_ID = "tokenizer.ggml.middle_token_id" + EOT_ID = "tokenizer.ggml.eot_token_id" + EOM_ID = "tokenizer.ggml.eom_token_id" class Adapter: - TYPE = "adapter.type" + TYPE = "adapter.type" LORA_ALPHA = "adapter.lora.alpha" + class GGUFType: - MODEL = "model" + MODEL = "model" ADAPTER = "adapter" + class MODEL_ARCH(IntEnum): - LLAMA = auto() - FALCON = auto() - BAICHUAN = auto() - GROK = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() - STARCODER = auto() - REFACT = auto() - BERT = auto() - NOMIC_BERT = auto() + LLAMA = auto() + FALCON = auto() + BAICHUAN = auto() + GROK = auto() + GPT2 = auto() + GPTJ = auto() + GPTNEOX = auto() + MPT = auto() + STARCODER = auto() + REFACT = auto() + BERT = auto() + NOMIC_BERT = auto() JINA_BERT_V2 = auto() - BLOOM = auto() - STABLELM = auto() - QWEN = auto() - QWEN2 = auto() - QWEN2MOE = auto() - PHI2 = auto() - PHI3 = auto() - PLAMO = auto() - CODESHELL = auto() - ORION = auto() - INTERNLM2 = auto() - MINICPM = auto() - GEMMA = auto() - GEMMA2 = auto() - STARCODER2 = auto() - MAMBA = auto() - XVERSE = auto() - COMMAND_R = auto() - DBRX = auto() - OLMO = auto() - OPENELM = auto() - ARCTIC = auto() - DEEPSEEK2 = auto() - CHATGLM = auto() - BITNET = auto() - T5 = auto() - T5ENCODER = auto() - JAIS = auto() - NEMOTRON = auto() - EXAONE = auto() + BLOOM = auto() + STABLELM = auto() + QWEN = auto() + QWEN2 = auto() + QWEN2MOE = auto() + PHI2 = auto() + PHI3 = auto() + PLAMO = auto() + CODESHELL = auto() + ORION = auto() + INTERNLM2 = auto() + MINICPM = auto() + GEMMA = auto() + GEMMA2 = auto() + STARCODER2 = auto() + MAMBA = auto() + XVERSE = auto() + COMMAND_R = auto() + DBRX = auto() + OLMO = auto() + OPENELM = auto() + ARCTIC = auto() + DEEPSEEK2 = auto() + CHATGLM = auto() + BITNET = auto() + T5 = auto() + T5ENCODER = auto() + JAIS = auto() + NEMOTRON = auto() + EXAONE = auto() + class MODEL_TENSOR(IntEnum): - TOKEN_EMBD = auto() - TOKEN_EMBD_NORM = auto() - TOKEN_TYPES = auto() - POS_EMBD = auto() - OUTPUT = auto() - OUTPUT_NORM = auto() - ROPE_FREQS = auto() - ROPE_FACTORS_LONG = auto() - ROPE_FACTORS_SHORT = auto() - ATTN_Q = auto() - ATTN_K = auto() - ATTN_V = auto() - ATTN_QKV = auto() - ATTN_OUT = auto() - ATTN_NORM = auto() - ATTN_NORM_2 = auto() - ATTN_OUT_NORM = auto() - ATTN_POST_NORM = auto() - ATTN_ROT_EMBD = auto() - FFN_GATE_INP = auto() - FFN_GATE_INP_SHEXP = auto() - FFN_NORM = auto() - FFN_PRE_NORM = auto() - FFN_POST_NORM = auto() - FFN_GATE = auto() - FFN_DOWN = auto() - FFN_UP = auto() - FFN_ACT = auto() - FFN_NORM_EXP = auto() - FFN_GATE_EXP = auto() - FFN_DOWN_EXP = auto() - FFN_UP_EXP = auto() - FFN_GATE_SHEXP = auto() - FFN_DOWN_SHEXP = auto() - FFN_UP_SHEXP = auto() - ATTN_Q_NORM = auto() - ATTN_K_NORM = auto() - LAYER_OUT_NORM = auto() - SSM_IN = auto() - SSM_CONV1D = auto() - SSM_X = auto() - SSM_DT = auto() - SSM_A = auto() - SSM_D = auto() - SSM_OUT = auto() - ATTN_Q_A = auto() - ATTN_Q_B = auto() - ATTN_KV_A_MQA = auto() - ATTN_KV_B = auto() - ATTN_Q_A_NORM = auto() - ATTN_KV_A_NORM = auto() - FFN_SUB_NORM = auto() - ATTN_SUB_NORM = auto() - DEC_ATTN_NORM = auto() - DEC_ATTN_Q = auto() - DEC_ATTN_K = auto() - DEC_ATTN_V = auto() - DEC_ATTN_OUT = auto() - DEC_ATTN_REL_B = auto() - DEC_CROSS_ATTN_NORM = auto() - DEC_CROSS_ATTN_Q = auto() - DEC_CROSS_ATTN_K = auto() - DEC_CROSS_ATTN_V = auto() - DEC_CROSS_ATTN_OUT = auto() + TOKEN_EMBD = auto() + TOKEN_EMBD_NORM = auto() + TOKEN_TYPES = auto() + POS_EMBD = 
auto() + OUTPUT = auto() + OUTPUT_NORM = auto() + ROPE_FREQS = auto() + ROPE_FACTORS_LONG = auto() + ROPE_FACTORS_SHORT = auto() + ATTN_Q = auto() + ATTN_K = auto() + ATTN_V = auto() + ATTN_QKV = auto() + ATTN_OUT = auto() + ATTN_NORM = auto() + ATTN_NORM_2 = auto() + ATTN_OUT_NORM = auto() + ATTN_POST_NORM = auto() + ATTN_ROT_EMBD = auto() + FFN_GATE_INP = auto() + FFN_GATE_INP_SHEXP = auto() + FFN_NORM = auto() + FFN_PRE_NORM = auto() + FFN_POST_NORM = auto() + FFN_GATE = auto() + FFN_DOWN = auto() + FFN_UP = auto() + FFN_ACT = auto() + FFN_NORM_EXP = auto() + FFN_GATE_EXP = auto() + FFN_DOWN_EXP = auto() + FFN_UP_EXP = auto() + FFN_GATE_SHEXP = auto() + FFN_DOWN_SHEXP = auto() + FFN_UP_SHEXP = auto() + ATTN_Q_NORM = auto() + ATTN_K_NORM = auto() + LAYER_OUT_NORM = auto() + SSM_IN = auto() + SSM_CONV1D = auto() + SSM_X = auto() + SSM_DT = auto() + SSM_A = auto() + SSM_D = auto() + SSM_OUT = auto() + ATTN_Q_A = auto() + ATTN_Q_B = auto() + ATTN_KV_A_MQA = auto() + ATTN_KV_B = auto() + ATTN_Q_A_NORM = auto() + ATTN_KV_A_NORM = auto() + FFN_SUB_NORM = auto() + ATTN_SUB_NORM = auto() + DEC_ATTN_NORM = auto() + DEC_ATTN_Q = auto() + DEC_ATTN_K = auto() + DEC_ATTN_V = auto() + DEC_ATTN_OUT = auto() + DEC_ATTN_REL_B = auto() + DEC_CROSS_ATTN_NORM = auto() + DEC_CROSS_ATTN_Q = auto() + DEC_CROSS_ATTN_K = auto() + DEC_CROSS_ATTN_V = auto() + DEC_CROSS_ATTN_OUT = auto() DEC_CROSS_ATTN_REL_B = auto() - DEC_FFN_NORM = auto() - DEC_FFN_GATE = auto() - DEC_FFN_DOWN = auto() - DEC_FFN_UP = auto() - DEC_OUTPUT_NORM = auto() - ENC_ATTN_NORM = auto() - ENC_ATTN_Q = auto() - ENC_ATTN_K = auto() - ENC_ATTN_V = auto() - ENC_ATTN_OUT = auto() - ENC_ATTN_REL_B = auto() - ENC_FFN_NORM = auto() - ENC_FFN_GATE = auto() - ENC_FFN_DOWN = auto() - ENC_FFN_UP = auto() - ENC_OUTPUT_NORM = auto() + DEC_FFN_NORM = auto() + DEC_FFN_GATE = auto() + DEC_FFN_DOWN = auto() + DEC_FFN_UP = auto() + DEC_OUTPUT_NORM = auto() + ENC_ATTN_NORM = auto() + ENC_ATTN_Q = auto() + ENC_ATTN_K = auto() + ENC_ATTN_V = auto() + ENC_ATTN_OUT = auto() + ENC_ATTN_REL_B = auto() + ENC_FFN_NORM = auto() + ENC_FFN_GATE = auto() + ENC_FFN_DOWN = auto() + ENC_FFN_UP = auto() + ENC_OUTPUT_NORM = auto() + MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { - MODEL_ARCH.LLAMA: "llama", - MODEL_ARCH.FALCON: "falcon", - MODEL_ARCH.BAICHUAN: "baichuan", - MODEL_ARCH.GROK: "grok", - MODEL_ARCH.GPT2: "gpt2", - MODEL_ARCH.GPTJ: "gptj", - MODEL_ARCH.GPTNEOX: "gptneox", - MODEL_ARCH.MPT: "mpt", - MODEL_ARCH.STARCODER: "starcoder", - MODEL_ARCH.REFACT: "refact", - MODEL_ARCH.BERT: "bert", - MODEL_ARCH.NOMIC_BERT: "nomic-bert", - MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", - MODEL_ARCH.BLOOM: "bloom", - MODEL_ARCH.STABLELM: "stablelm", - MODEL_ARCH.QWEN: "qwen", - MODEL_ARCH.QWEN2: "qwen2", - MODEL_ARCH.QWEN2MOE: "qwen2moe", - MODEL_ARCH.PHI2: "phi2", - MODEL_ARCH.PHI3: "phi3", - MODEL_ARCH.PLAMO: "plamo", - MODEL_ARCH.CODESHELL: "codeshell", - MODEL_ARCH.ORION: "orion", - MODEL_ARCH.INTERNLM2: "internlm2", - MODEL_ARCH.MINICPM: "minicpm", - MODEL_ARCH.GEMMA: "gemma", - MODEL_ARCH.GEMMA2: "gemma2", - MODEL_ARCH.STARCODER2: "starcoder2", - MODEL_ARCH.MAMBA: "mamba", - MODEL_ARCH.XVERSE: "xverse", - MODEL_ARCH.COMMAND_R: "command-r", - MODEL_ARCH.DBRX: "dbrx", - MODEL_ARCH.OLMO: "olmo", - MODEL_ARCH.OPENELM: "openelm", - MODEL_ARCH.ARCTIC: "arctic", - MODEL_ARCH.DEEPSEEK2: "deepseek2", - MODEL_ARCH.CHATGLM: "chatglm", - MODEL_ARCH.BITNET: "bitnet", - MODEL_ARCH.T5: "t5", - MODEL_ARCH.T5ENCODER: "t5encoder", - MODEL_ARCH.JAIS: "jais", - MODEL_ARCH.NEMOTRON: "nemotron", - 
MODEL_ARCH.EXAONE: "exaone", + MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.FALCON: "falcon", + MODEL_ARCH.BAICHUAN: "baichuan", + MODEL_ARCH.GROK: "grok", + MODEL_ARCH.GPT2: "gpt2", + MODEL_ARCH.GPTJ: "gptj", + MODEL_ARCH.GPTNEOX: "gptneox", + MODEL_ARCH.MPT: "mpt", + MODEL_ARCH.STARCODER: "starcoder", + MODEL_ARCH.REFACT: "refact", + MODEL_ARCH.BERT: "bert", + MODEL_ARCH.NOMIC_BERT: "nomic-bert", + MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", + MODEL_ARCH.BLOOM: "bloom", + MODEL_ARCH.STABLELM: "stablelm", + MODEL_ARCH.QWEN: "qwen", + MODEL_ARCH.QWEN2: "qwen2", + MODEL_ARCH.QWEN2MOE: "qwen2moe", + MODEL_ARCH.PHI2: "phi2", + MODEL_ARCH.PHI3: "phi3", + MODEL_ARCH.PLAMO: "plamo", + MODEL_ARCH.CODESHELL: "codeshell", + MODEL_ARCH.ORION: "orion", + MODEL_ARCH.INTERNLM2: "internlm2", + MODEL_ARCH.MINICPM: "minicpm", + MODEL_ARCH.GEMMA: "gemma", + MODEL_ARCH.GEMMA2: "gemma2", + MODEL_ARCH.STARCODER2: "starcoder2", + MODEL_ARCH.MAMBA: "mamba", + MODEL_ARCH.XVERSE: "xverse", + MODEL_ARCH.COMMAND_R: "command-r", + MODEL_ARCH.DBRX: "dbrx", + MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.OPENELM: "openelm", + MODEL_ARCH.ARCTIC: "arctic", + MODEL_ARCH.DEEPSEEK2: "deepseek2", + MODEL_ARCH.CHATGLM: "chatglm", + MODEL_ARCH.BITNET: "bitnet", + MODEL_ARCH.T5: "t5", + MODEL_ARCH.T5ENCODER: "t5encoder", + MODEL_ARCH.JAIS: "jais", + MODEL_ARCH.NEMOTRON: "nemotron", + MODEL_ARCH.EXAONE: "exaone", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", - MODEL_TENSOR.TOKEN_TYPES: "token_types", - MODEL_TENSOR.POS_EMBD: "position_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ROPE_FREQS: "rope_freqs", - MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long", - MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", - MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", - MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", - MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", - MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", - MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", - MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", - MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", - MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm", - MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", - MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm", - MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp", - MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp", - MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp", - MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", - MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps", - MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", - MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", - MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", - MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", - MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", - MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", - MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", - MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", - 
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", - MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", - MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", - MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a", - MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", - MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", - MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", - MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", - MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", - MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", - MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm", - MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm", - MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q", - MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k", - MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v", - MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o", - MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b", - MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm", - MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q", - MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k", - MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v", - MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o", + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", + MODEL_TENSOR.TOKEN_TYPES: "token_types", + MODEL_TENSOR.POS_EMBD: "position_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long", + MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", + MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", + MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", + MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", + MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", + MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm", + MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", + MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm", + MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp", + MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp", + MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp", + MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", + MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps", + MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", + MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", + MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", + MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", + MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", + MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", + MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", + MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", + MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", + MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", + MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", + MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a", + MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", + 
MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", + MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", + MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", + MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", + MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", + MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm", + MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm", + MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q", + MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k", + MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v", + MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o", + MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b", + MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm", + MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q", + MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k", + MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v", + MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o", MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b", - MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm", - MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate", - MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down", - MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up", - MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm", - MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm", - MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q", - MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k", - MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v", - MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o", - MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b", - MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm", - MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate", - MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down", - MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up", - MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", + MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm", + MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate", + MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down", + MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up", + MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm", + MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm", + MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q", + MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k", + MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v", + MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o", + MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b", + MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm", + MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate", + MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down", + MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up", + MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -958,7 +963,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, ], - MODEL_ARCH.CHATGLM : [ + MODEL_ARCH.CHATGLM: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.OUTPUT_NORM, @@ -1075,7 +1080,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], - } MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -1120,113 +1124,120 @@ class MODEL_TENSOR(IntEnum): ], } + class TokenType(IntEnum): - NORMAL = 1 - UNKNOWN = 2 - CONTROL = 3 + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 + UNUSED = 5 + BYTE = 6 + class RopeScalingType(Enum): - NONE = 'none' - LINEAR = 'linear' - YARN = 'yarn' + NONE = "none" + LINEAR 
= "linear" + YARN = "yarn" + class PoolingType(IntEnum): NONE = 0 MEAN = 1 - CLS = 2 + CLS = 2 + class GGMLQuantizationType(IntEnum): - F32 = 0 - F16 = 1 - Q4_0 = 2 - Q4_1 = 3 - Q5_0 = 6 - Q5_1 = 7 - Q8_0 = 8 - Q8_1 = 9 - Q2_K = 10 - Q3_K = 11 - Q4_K = 12 - Q5_K = 13 - Q6_K = 14 - Q8_K = 15 + F32 = 0 + F16 = 1 + Q4_0 = 2 + Q4_1 = 3 + Q5_0 = 6 + Q5_1 = 7 + Q8_0 = 8 + Q8_1 = 9 + Q2_K = 10 + Q3_K = 11 + Q4_K = 12 + Q5_K = 13 + Q6_K = 14 + Q8_K = 15 IQ2_XXS = 16 - IQ2_XS = 17 + IQ2_XS = 17 IQ3_XXS = 18 - IQ1_S = 19 - IQ4_NL = 20 - IQ3_S = 21 - IQ2_S = 22 - IQ4_XS = 23 - I8 = 24 - I16 = 25 - I32 = 26 - I64 = 27 - F64 = 28 - IQ1_M = 29 - BF16 = 30 + IQ1_S = 19 + IQ4_NL = 20 + IQ3_S = 21 + IQ2_S = 22 + IQ4_XS = 23 + I8 = 24 + I16 = 25 + I32 = 26 + I64 = 27 + F64 = 28 + IQ1_M = 29 + BF16 = 30 Q4_0_4_4 = 31 Q4_0_4_8 = 32 Q4_0_8_8 = 33 + class LlamaFileType(IntEnum): - ALL_F32 = 0 - MOSTLY_F16 = 1 - MOSTLY_Q4_0 = 2 - MOSTLY_Q4_1 = 3 + ALL_F32 = 0 + MOSTLY_F16 = 1 + MOSTLY_Q4_0 = 2 + MOSTLY_Q4_1 = 3 - MOSTLY_Q8_0 = 7 - MOSTLY_Q5_0 = 8 - MOSTLY_Q5_1 = 9 - MOSTLY_Q2_K = 10 - MOSTLY_Q3_K_S = 11 - MOSTLY_Q3_K_M = 12 - MOSTLY_Q3_K_L = 13 - MOSTLY_Q4_K_S = 14 - MOSTLY_Q4_K_M = 15 - MOSTLY_Q5_K_S = 16 - MOSTLY_Q5_K_M = 17 - MOSTLY_Q6_K = 18 - MOSTLY_IQ2_XXS = 19 - MOSTLY_IQ2_XS = 20 - MOSTLY_Q2_K_S = 21 - MOSTLY_IQ3_XS = 22 - MOSTLY_IQ3_XXS = 23 - MOSTLY_IQ1_S = 24 - MOSTLY_IQ4_NL = 25 - MOSTLY_IQ3_S = 26 - MOSTLY_IQ3_M = 27 - MOSTLY_IQ2_S = 28 - MOSTLY_IQ2_M = 29 - MOSTLY_IQ4_XS = 30 - MOSTLY_IQ1_M = 31 - MOSTLY_BF16 = 32 - MOSTLY_Q4_0_4_4 = 33 - MOSTLY_Q4_0_4_8 = 34 - MOSTLY_Q4_0_8_8 = 35 + MOSTLY_Q8_0 = 7 + MOSTLY_Q5_0 = 8 + MOSTLY_Q5_1 = 9 + MOSTLY_Q2_K = 10 + MOSTLY_Q3_K_S = 11 + MOSTLY_Q3_K_M = 12 + MOSTLY_Q3_K_L = 13 + MOSTLY_Q4_K_S = 14 + MOSTLY_Q4_K_M = 15 + MOSTLY_Q5_K_S = 16 + MOSTLY_Q5_K_M = 17 + MOSTLY_Q6_K = 18 + MOSTLY_IQ2_XXS = 19 + MOSTLY_IQ2_XS = 20 + MOSTLY_Q2_K_S = 21 + MOSTLY_IQ3_XS = 22 + MOSTLY_IQ3_XXS = 23 + MOSTLY_IQ1_S = 24 + MOSTLY_IQ4_NL = 25 + MOSTLY_IQ3_S = 26 + MOSTLY_IQ3_M = 27 + MOSTLY_IQ2_S = 28 + MOSTLY_IQ2_M = 29 + MOSTLY_IQ4_XS = 30 + MOSTLY_IQ1_M = 31 + MOSTLY_BF16 = 32 + MOSTLY_Q4_0_4_4 = 33 + MOSTLY_Q4_0_4_8 = 34 + MOSTLY_Q4_0_8_8 = 35 + + GUESSED = 1024 - GUESSED = 1024 class GGUFEndian(IntEnum): LITTLE = 0 BIG = 1 + class GGUFValueType(IntEnum): - UINT8 = 0 - INT8 = 1 - UINT16 = 2 - INT16 = 3 - UINT32 = 4 - INT32 = 5 + UINT8 = 0 + INT8 = 1 + UINT16 = 2 + INT16 = 3 + UINT32 = 4 + INT32 = 5 FLOAT32 = 6 - BOOL = 7 - STRING = 8 - ARRAY = 9 - UINT64 = 10 - INT64 = 11 + BOOL = 7 + STRING = 8 + ARRAY = 9 + UINT64 = 10 + INT64 = 11 FLOAT64 = 12 @staticmethod @@ -1245,97 +1256,98 @@ def get_type(val: Any) -> GGUFValueType: else: raise ValueError(f"Unknown type: {type(val)}") + QK_K = 256 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = { - GGMLQuantizationType.F32: (1, 4), - GGMLQuantizationType.F16: (1, 2), - GGMLQuantizationType.Q4_0: (32, 2 + 16), - GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), - GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), - GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), - GGMLQuantizationType.Q8_0: (32, 2 + 32), - GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), - GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), - GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), - GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), - GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), - GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), - GGMLQuantizationType.Q8_K: (256, 4 + 
QK_K + QK_K // 8), + GGMLQuantizationType.F32: (1, 4), + GGMLQuantizationType.F16: (1, 2), + GGMLQuantizationType.Q4_0: (32, 2 + 16), + GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), + GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), + GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), + GGMLQuantizationType.Q8_0: (32, 2 + 32), + GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), + GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), + GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), + GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), + GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), + GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), + GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4), - GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32), + GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32), GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8), - GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16), - GGMLQuantizationType.IQ4_NL: (32, 2 + 16), - GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), - GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), - GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), - GGMLQuantizationType.I8: (1, 1), - GGMLQuantizationType.I16: (1, 2), - GGMLQuantizationType.I32: (1, 4), - GGMLQuantizationType.I64: (1, 8), - GGMLQuantizationType.F64: (1, 8), - GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), - GGMLQuantizationType.BF16: (1, 2), - GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16), - GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16), - GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16), + GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16), + GGMLQuantizationType.IQ4_NL: (32, 2 + 16), + GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), + GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), + GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), + GGMLQuantizationType.I8: (1, 1), + GGMLQuantizationType.I16: (1, 2), + GGMLQuantizationType.I32: (1, 4), + GGMLQuantizationType.I64: (1, 8), + GGMLQuantizationType.F64: (1, 8), + GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), + GGMLQuantizationType.BF16: (1, 2), + GGMLQuantizationType.Q4_0_4_4: (32, 2 + 16), + GGMLQuantizationType.Q4_0_4_8: (32, 2 + 16), + GGMLQuantizationType.Q4_0_8_8: (32, 2 + 16), } -KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE +KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION -KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT -KEY_GENERAL_NAME = Keys.General.NAME -KEY_GENERAL_AUTHOR = Keys.General.AUTHOR -KEY_GENERAL_URL = Keys.General.URL -KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION -KEY_GENERAL_LICENSE = Keys.General.LICENSE -KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL -KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE +KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT +KEY_GENERAL_NAME = Keys.General.NAME +KEY_GENERAL_AUTHOR = Keys.General.AUTHOR +KEY_GENERAL_URL = Keys.General.URL +KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION +KEY_GENERAL_LICENSE = Keys.General.LICENSE +KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL +KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE -KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE -KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH -KEY_EMBEDDING_LENGTH = 
Keys.LLM.EMBEDDING_LENGTH -KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT -KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH +KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE +KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH +KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH +KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT +KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL -KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT +KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT -KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT -KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV -KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS -KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV -KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS +KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT +KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV +KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS +KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV +KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS -KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT -KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE -KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE -KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR +KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT +KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE +KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE +KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN -KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED +KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED -KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL -KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE -KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE +KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL +KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE +KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK -KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL -KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE -KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST +KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL +KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE +KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE -KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES -KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES -KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID -KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID -KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID -KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID -KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID -KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID -KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID -KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON -KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV -KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID -KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID -KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID -KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID -KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID \ No newline at end of file +KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES +KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES +KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID +KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID +KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID +KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID +KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID +KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID +KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID 
+KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON +KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV +KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID +KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID +KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID +KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID +KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID diff --git a/src/gguf-py/gguf/gguf_reader.py b/src/gguf/gguf_reader.py similarity index 100% rename from src/gguf-py/gguf/gguf_reader.py rename to src/gguf/gguf_reader.py diff --git a/src/gguf-py/gguf/gguf_writer.py b/src/gguf/gguf_writer.py similarity index 100% rename from src/gguf-py/gguf/gguf_writer.py rename to src/gguf/gguf_writer.py diff --git a/src/gguf-py/gguf/lazy.py b/src/gguf/lazy.py similarity index 100% rename from src/gguf-py/gguf/lazy.py rename to src/gguf/lazy.py diff --git a/src/gguf-py/gguf/metadata.py b/src/gguf/metadata.py similarity index 99% rename from src/gguf-py/gguf/metadata.py rename to src/gguf/metadata.py index 7092683..6d39f5a 100644 --- a/src/gguf-py/gguf/metadata.py +++ b/src/gguf/metadata.py @@ -441,9 +441,9 @@ def apply_metadata_heuristic( org_component is not None and model_full_name_component is not None ): - base_model[ - "repo_url" - ] = f"https://huggingface.co/{org_component}/{model_full_name_component}" + base_model["repo_url"] = ( + f"https://huggingface.co/{org_component}/{model_full_name_component}" + ) metadata.base_models.append(base_model) if "license" in model_card and metadata.license is None: diff --git a/src/gguf-py/gguf/quants.py b/src/gguf/quants.py similarity index 100% rename from src/gguf-py/gguf/quants.py rename to src/gguf/quants.py diff --git a/src/gguf-py/gguf/tensor_mapping.py b/src/gguf/tensor_mapping.py similarity index 81% rename from src/gguf-py/gguf/tensor_mapping.py rename to src/gguf/tensor_mapping.py index 75d23b4..750917e 100644 --- a/src/gguf-py/gguf/tensor_mapping.py +++ b/src/gguf/tensor_mapping.py @@ -4,9 +4,9 @@ from .constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES + class TensorNameMap: mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { - MODEL_TENSOR.TOKEN_EMBD: ( "gpt_neox.embed_in", "transformer.wte", @@ -27,24 +27,18 @@ class TensorNameMap: "transformer.token_embeddings", "shared", ), - - MODEL_TENSOR.TOKEN_TYPES: ( - "embeddings.token_type_embeddings", - ), - + MODEL_TENSOR.TOKEN_TYPES: ("embeddings.token_type_embeddings",), MODEL_TENSOR.TOKEN_EMBD_NORM: ( "word_embeddings_layernorm", "embeddings.LayerNorm", "emb_ln", "transformer.norm", ), - MODEL_TENSOR.POS_EMBD: ( "transformer.wpe", "embeddings.position_embeddings", "wpe", ), - MODEL_TENSOR.OUTPUT: ( "embed_out", "lm_head", @@ -53,7 +47,6 @@ class TensorNameMap: "lm_head.linear", "output_layer", ), - MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", "transformer.ln_f", @@ -71,7 +64,6 @@ class TensorNameMap: "transformer.norm", "model.norm", ), - MODEL_TENSOR.ROPE_FREQS: ( "rope.freqs", "rotary_pos_emb.inv_freq", @@ -79,7 +71,6 @@ class TensorNameMap: } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { - MODEL_TENSOR.ATTN_NORM: ( "gpt_neox.layers.{bid}.input_layernorm", "transformer.h.{bid}.ln_1", @@ -102,12 +93,10 @@ class TensorNameMap: "encoder.layers.{bid}.input_layernorm", "transformer.layers.{bid}.attn_norm", ), - MODEL_TENSOR.ATTN_NORM_2: ( "transformer.h.{bid}.ln_attn", "encoder.layer.{bid}.layer_norm_1", ), - MODEL_TENSOR.ATTN_QKV: ( "gpt_neox.layers.{bid}.attention.query_key_value", "transformer.h.{bid}.attn.c_attn", @@ -124,7 +113,6 @@ class TensorNameMap: 
"encoder.layers.{bid}.self_attention.query_key_value", "transformer.layers.{bid}.attn.qkv_proj", ), - MODEL_TENSOR.ATTN_Q: ( "model.layers.{bid}.self_attn.q_proj", "layers.{bid}.attention.wq", @@ -135,7 +123,6 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.query", "transformer.h.{bid}.attn.attention.q_proj", ), - MODEL_TENSOR.ATTN_K: ( "model.layers.{bid}.self_attn.k_proj", "layers.{bid}.attention.wk", @@ -147,7 +134,6 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.key", "transformer.h.{bid}.attn.attention.k_proj", ), - MODEL_TENSOR.ATTN_V: ( "model.layers.{bid}.self_attn.v_proj", "layers.{bid}.attention.wv", @@ -159,7 +145,6 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.value", "transformer.h.{bid}.attn.attention.v_proj", ), - MODEL_TENSOR.ATTN_OUT: ( "gpt_neox.layers.{bid}.attention.dense", "transformer.h.{bid}.attn.c_proj", @@ -183,25 +168,19 @@ class TensorNameMap: "transformer.layers.{bid}.attn.out_proj", "transformer.h.{bid}.attn.attention.out_proj", ), - MODEL_TENSOR.ATTN_OUT_NORM: ( "encoder.layer.{bid}.attention.output.LayerNorm", "encoder.layers.{bid}.norm1", "transformer.decoder_layer.{bid}.rms_norm_1", "transformer.blocks.{bid}.norm_attn_norm.norm_2", ), - - MODEL_TENSOR.ATTN_POST_NORM: ( - "model.layers.{bid}.post_attention_layernorm", - ), - + MODEL_TENSOR.ATTN_POST_NORM: ("model.layers.{bid}.post_attention_layernorm",), MODEL_TENSOR.ATTN_ROT_EMBD: ( "model.layers.{bid}.self_attn.rotary_emb.inv_freq", "layers.{bid}.attention.inner_attention.rope.freqs", "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", "transformer.h.{bid}.attn.rotary_emb.inv_freq", ), - MODEL_TENSOR.FFN_NORM: ( "gpt_neox.layers.{bid}.post_attention_layernorm", "transformer.h.{bid}.ln_2", @@ -217,15 +196,8 @@ class TensorNameMap: "encoder.layers.{bid}.post_attention_layernorm", "transformer.layers.{bid}.ffn_norm", ), - - MODEL_TENSOR.FFN_PRE_NORM: ( - "model.layers.{bid}.pre_feedforward_layernorm", - ), - - MODEL_TENSOR.FFN_POST_NORM: ( - "model.layers.{bid}.post_feedforward_layernorm", - ), - + MODEL_TENSOR.FFN_PRE_NORM: ("model.layers.{bid}.pre_feedforward_layernorm",), + MODEL_TENSOR.FFN_POST_NORM: ("model.layers.{bid}.post_feedforward_layernorm",), MODEL_TENSOR.FFN_GATE_INP: ( "layers.{bid}.feed_forward.gate", "model.layers.{bid}.block_sparse_moe.gate", @@ -233,11 +205,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.router", "transformer.blocks.{bid}.ffn.router.layer", ), - - MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( - "model.layers.{bid}.mlp.shared_expert_gate", - ), - + MODEL_TENSOR.FFN_GATE_INP_SHEXP: ("model.layers.{bid}.mlp.shared_expert_gate",), MODEL_TENSOR.FFN_UP: ( "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", "transformer.h.{bid}.mlp.c_fc", @@ -265,23 +233,17 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.dense_h_to_4h", "transformer.h.{bid}.mlp.c_fc_1", ), - MODEL_TENSOR.FFN_UP_EXP: ( "layers.{bid}.feed_forward.experts.w3", "transformer.decoder_layer.{bid}.moe.linear_v", "transformer.blocks.{bid}.ffn.experts.mlp.v1", "model.layers.{bid}.mlp.experts.up_proj", ), - MODEL_TENSOR.FFN_UP_SHEXP: ( "model.layers.{bid}.mlp.shared_expert.up_proj", "model.layers.{bid}.mlp.shared_experts.up_proj", ), - - MODEL_TENSOR.FFN_ACT: ( - "transformer.blocks.{bid}.ffn.act", - ), - + MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), MODEL_TENSOR.FFN_GATE: ( "model.layers.{bid}.mlp.gate_proj", "layers.{bid}.feed_forward.w1", @@ -295,19 +257,16 @@ class TensorNameMap: 
"model.layers.{bid}.residual_mlp.w1", "transformer.h.{bid}.mlp.c_fc_0", ), - MODEL_TENSOR.FFN_GATE_EXP: ( "layers.{bid}.feed_forward.experts.w1", "transformer.decoder_layer.{bid}.moe.linear", "transformer.blocks.{bid}.ffn.experts.mlp.w1", "model.layers.{bid}.mlp.experts.gate_proj", ), - MODEL_TENSOR.FFN_GATE_SHEXP: ( "model.layers.{bid}.mlp.shared_expert.gate_proj", "model.layers.{bid}.mlp.shared_experts.gate_proj", ), - MODEL_TENSOR.FFN_DOWN: ( "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", "transformer.h.{bid}.mlp.c_proj", @@ -334,19 +293,16 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.dense_4h_to_h", "model.layers.h.{bid}.mlp.c_proj", ), - MODEL_TENSOR.FFN_DOWN_EXP: ( "layers.{bid}.feed_forward.experts.w2", "transformer.decoder_layer.{bid}.moe.linear_1", "transformer.blocks.{bid}.ffn.experts.mlp.w2", "model.layers.{bid}.mlp.experts.down_proj", ), - MODEL_TENSOR.FFN_DOWN_SHEXP: ( "model.layers.{bid}.mlp.shared_expert.down_proj", "model.layers.{bid}.mlp.shared_experts.down_proj", ), - MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", "model.layers.{bid}.self_attn.q_layernorm", @@ -355,7 +311,6 @@ class TensorNameMap: "encoder.layer.{bid}.attention.self.layer_norm_q", "transformer.layers.{bid}.attn.q_norm", ), - MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", "model.layers.{bid}.self_attn.k_layernorm", @@ -364,209 +319,108 @@ class TensorNameMap: "encoder.layer.{bid}.attention.self.layer_norm_k", "transformer.layers.{bid}.attn.k_norm", ), - MODEL_TENSOR.ROPE_FREQS: ( "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", ), - MODEL_TENSOR.LAYER_OUT_NORM: ( "encoder.layer.{bid}.output.LayerNorm", "encoder.layers.{bid}.norm2", "transformer.decoder_layer.{bid}.rms_norm_3", "encoder.layer.{bid}.mlp.layernorm", - "encoder.layer.{bid}.layer_norm_2" + "encoder.layer.{bid}.layer_norm_2", ), - MODEL_TENSOR.SSM_IN: ( "model.layers.{bid}.in_proj", "backbone.layers.{bid}.mixer.in_proj", ), - MODEL_TENSOR.SSM_CONV1D: ( "model.layers.{bid}.conv1d", "backbone.layers.{bid}.mixer.conv1d", ), - MODEL_TENSOR.SSM_X: ( "model.layers.{bid}.x_proj", "backbone.layers.{bid}.mixer.x_proj", ), - MODEL_TENSOR.SSM_DT: ( "model.layers.{bid}.dt_proj", "backbone.layers.{bid}.mixer.dt_proj", ), - MODEL_TENSOR.SSM_A: ( "model.layers.{bid}.A_log", "backbone.layers.{bid}.mixer.A_log", ), - MODEL_TENSOR.SSM_D: ( "model.layers.{bid}.D", "backbone.layers.{bid}.mixer.D", ), - MODEL_TENSOR.SSM_OUT: ( "model.layers.{bid}.out_proj", "backbone.layers.{bid}.mixer.out_proj", ), - - MODEL_TENSOR.ATTN_Q_A: ( - "model.layers.{bid}.self_attn.q_a_proj", - ), - - MODEL_TENSOR.ATTN_Q_B: ( - "model.layers.{bid}.self_attn.q_b_proj", - ), - + MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), + MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), MODEL_TENSOR.ATTN_KV_A_MQA: ( "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", ), - - MODEL_TENSOR.ATTN_KV_B: ( - "model.layers.{bid}.self_attn.kv_b_proj", - ), - - MODEL_TENSOR.ATTN_Q_A_NORM: ( - "model.layers.{bid}.self_attn.q_a_layernorm", - ), - - MODEL_TENSOR.ATTN_KV_A_NORM: ( - "model.layers.{bid}.self_attn.kv_a_layernorm", - ), - - MODEL_TENSOR.ATTN_SUB_NORM: ( - "model.layers.{bid}.self_attn.inner_attn_ln", - ), - - MODEL_TENSOR.FFN_SUB_NORM: ( - "model.layers.{bid}.mlp.ffn_layernorm", - ), - - MODEL_TENSOR.DEC_ATTN_NORM: ( - "decoder.block.{bid}.layer.0.layer_norm", - ), - - MODEL_TENSOR.DEC_ATTN_Q: ( - "decoder.block.{bid}.layer.0.SelfAttention.q", - ), 
- - MODEL_TENSOR.DEC_ATTN_K: ( - "decoder.block.{bid}.layer.0.SelfAttention.k", - ), - - MODEL_TENSOR.DEC_ATTN_V: ( - "decoder.block.{bid}.layer.0.SelfAttention.v", - ), - - MODEL_TENSOR.DEC_ATTN_OUT: ( - "decoder.block.{bid}.layer.0.SelfAttention.o", - ), - + MODEL_TENSOR.ATTN_KV_B: ("model.layers.{bid}.self_attn.kv_b_proj",), + MODEL_TENSOR.ATTN_Q_A_NORM: ("model.layers.{bid}.self_attn.q_a_layernorm",), + MODEL_TENSOR.ATTN_KV_A_NORM: ("model.layers.{bid}.self_attn.kv_a_layernorm",), + MODEL_TENSOR.ATTN_SUB_NORM: ("model.layers.{bid}.self_attn.inner_attn_ln",), + MODEL_TENSOR.FFN_SUB_NORM: ("model.layers.{bid}.mlp.ffn_layernorm",), + MODEL_TENSOR.DEC_ATTN_NORM: ("decoder.block.{bid}.layer.0.layer_norm",), + MODEL_TENSOR.DEC_ATTN_Q: ("decoder.block.{bid}.layer.0.SelfAttention.q",), + MODEL_TENSOR.DEC_ATTN_K: ("decoder.block.{bid}.layer.0.SelfAttention.k",), + MODEL_TENSOR.DEC_ATTN_V: ("decoder.block.{bid}.layer.0.SelfAttention.v",), + MODEL_TENSOR.DEC_ATTN_OUT: ("decoder.block.{bid}.layer.0.SelfAttention.o",), MODEL_TENSOR.DEC_ATTN_REL_B: ( "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", ), - - MODEL_TENSOR.DEC_CROSS_ATTN_NORM: ( - "decoder.block.{bid}.layer.1.layer_norm", - ), - + MODEL_TENSOR.DEC_CROSS_ATTN_NORM: ("decoder.block.{bid}.layer.1.layer_norm",), MODEL_TENSOR.DEC_CROSS_ATTN_Q: ( "decoder.block.{bid}.layer.1.EncDecAttention.q", ), - MODEL_TENSOR.DEC_CROSS_ATTN_K: ( "decoder.block.{bid}.layer.1.EncDecAttention.k", ), - MODEL_TENSOR.DEC_CROSS_ATTN_V: ( "decoder.block.{bid}.layer.1.EncDecAttention.v", ), - MODEL_TENSOR.DEC_CROSS_ATTN_OUT: ( "decoder.block.{bid}.layer.1.EncDecAttention.o", ), - MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: ( "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", ), - - MODEL_TENSOR.DEC_FFN_NORM: ( - "decoder.block.{bid}.layer.2.layer_norm", - ), - - MODEL_TENSOR.DEC_FFN_GATE: ( - "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", - ), - + MODEL_TENSOR.DEC_FFN_NORM: ("decoder.block.{bid}.layer.2.layer_norm",), + MODEL_TENSOR.DEC_FFN_GATE: ("decoder.block.{bid}.layer.2.DenseReluDense.wi_0",), MODEL_TENSOR.DEC_FFN_UP: ( "decoder.block.{bid}.layer.2.DenseReluDense.wi", "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", ), - - MODEL_TENSOR.DEC_FFN_DOWN: ( - "decoder.block.{bid}.layer.2.DenseReluDense.wo", - ), - - MODEL_TENSOR.DEC_OUTPUT_NORM: ( - "decoder.final_layer_norm", - ), - - MODEL_TENSOR.ENC_ATTN_NORM: ( - "encoder.block.{bid}.layer.0.layer_norm", - ), - - MODEL_TENSOR.ENC_ATTN_Q: ( - "encoder.block.{bid}.layer.0.SelfAttention.q", - ), - - MODEL_TENSOR.ENC_ATTN_K: ( - "encoder.block.{bid}.layer.0.SelfAttention.k", - ), - - MODEL_TENSOR.ENC_ATTN_V: ( - "encoder.block.{bid}.layer.0.SelfAttention.v", - ), - - MODEL_TENSOR.ENC_ATTN_OUT: ( - "encoder.block.{bid}.layer.0.SelfAttention.o", - ), - + MODEL_TENSOR.DEC_FFN_DOWN: ("decoder.block.{bid}.layer.2.DenseReluDense.wo",), + MODEL_TENSOR.DEC_OUTPUT_NORM: ("decoder.final_layer_norm",), + MODEL_TENSOR.ENC_ATTN_NORM: ("encoder.block.{bid}.layer.0.layer_norm",), + MODEL_TENSOR.ENC_ATTN_Q: ("encoder.block.{bid}.layer.0.SelfAttention.q",), + MODEL_TENSOR.ENC_ATTN_K: ("encoder.block.{bid}.layer.0.SelfAttention.k",), + MODEL_TENSOR.ENC_ATTN_V: ("encoder.block.{bid}.layer.0.SelfAttention.v",), + MODEL_TENSOR.ENC_ATTN_OUT: ("encoder.block.{bid}.layer.0.SelfAttention.o",), MODEL_TENSOR.ENC_ATTN_REL_B: ( "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", ), - - MODEL_TENSOR.ENC_FFN_NORM: ( - "encoder.block.{bid}.layer.1.layer_norm", - ), - - 
MODEL_TENSOR.ENC_FFN_GATE: ( - "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", - ), - + MODEL_TENSOR.ENC_FFN_NORM: ("encoder.block.{bid}.layer.1.layer_norm",), + MODEL_TENSOR.ENC_FFN_GATE: ("encoder.block.{bid}.layer.1.DenseReluDense.wi_0",), MODEL_TENSOR.ENC_FFN_UP: ( "encoder.block.{bid}.layer.1.DenseReluDense.wi", "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", ), - - MODEL_TENSOR.ENC_FFN_DOWN: ( - "encoder.block.{bid}.layer.1.DenseReluDense.wo", - ), - - MODEL_TENSOR.ENC_OUTPUT_NORM: ( - "encoder.final_layer_norm", - ), + MODEL_TENSOR.ENC_FFN_DOWN: ("encoder.block.{bid}.layer.1.DenseReluDense.wo",), + MODEL_TENSOR.ENC_OUTPUT_NORM: ("encoder.final_layer_norm",), } arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = { MODEL_ARCH.ARCTIC: { - MODEL_TENSOR.FFN_NORM: ( - "model.layers.{bid}.residual_layernorm", - ), - MODEL_TENSOR.FFN_NORM_EXP: ( - "model.layers.{bid}.post_attention_layernorm", - ), + MODEL_TENSOR.FFN_NORM: ("model.layers.{bid}.residual_layernorm",), + MODEL_TENSOR.FFN_NORM_EXP: ("model.layers.{bid}.post_attention_layernorm",), }, } @@ -588,31 +442,35 @@ def __init__(self, arch: MODEL_ARCH, n_blocks: int): if tensor not in MODEL_TENSORS[arch]: continue - tensor_name = TENSOR_NAMES[tensor].format(bid = bid) + tensor_name = TENSOR_NAMES[tensor].format(bid=bid) self.mapping[tensor_name] = (tensor, tensor_name) for key in keys: - key = key.format(bid = bid) + key = key.format(bid=bid) self.mapping[key] = (tensor, tensor_name) - def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None: + def get_type_and_name( + self, key: str, try_suffixes: Sequence[str] = () + ) -> tuple[MODEL_TENSOR, str] | None: result = self.mapping.get(key) if result is not None: return result for suffix in try_suffixes: if key.endswith(suffix): - result = self.mapping.get(key[:-len(suffix)]) + result = self.mapping.get(key[: -len(suffix)]) if result is not None: return result[0], result[1] + suffix return None def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None: - result = self.get_type_and_name(key, try_suffixes = try_suffixes) + result = self.get_type_and_name(key, try_suffixes=try_suffixes) if result is None: return None return result[1] - def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None: - result = self.get_type_and_name(key, try_suffixes = try_suffixes) + def get_type( + self, key: str, try_suffixes: Sequence[str] = () + ) -> MODEL_TENSOR | None: + result = self.get_type_and_name(key, try_suffixes=try_suffixes) if result is None: return None return result[0] @@ -629,5 +487,6 @@ def __contains__(self, key: str) -> bool: def __repr__(self) -> str: return repr(self.mapping) + def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap: - return TensorNameMap(arch, n_blocks) \ No newline at end of file + return TensorNameMap(arch, n_blocks) diff --git a/src/gguf-py/gguf/utility.py b/src/gguf/utility.py similarity index 100% rename from src/gguf-py/gguf/utility.py rename to src/gguf/utility.py diff --git a/src/gguf-py/gguf/vocab.py b/src/gguf/vocab.py similarity index 99% rename from src/gguf-py/gguf/vocab.py rename to src/gguf/vocab.py index 77a029d..ea92c4e 100644 --- a/src/gguf-py/gguf/vocab.py +++ b/src/gguf/vocab.py @@ -224,11 +224,9 @@ class Vocab(BaseVocab, Protocol): added_tokens_list: list[str] fname_tokenizer: Path - def __init__(self, base_path: Path): - ... + def __init__(self, base_path: Path): ... 
- def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - ... + def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ... class NoVocab(BaseVocab):