From b4817eee06213ef8bb52bc32c1cef8c9cfb42b87 Mon Sep 17 00:00:00 2001 From: BuildTools Date: Sat, 22 Mar 2025 09:41:54 -0700 Subject: [PATCH] refactor(ggml): update safetensor conversion scripts --- src/convert_hf_to_gguf.py | 1407 +++++++++++++++++++++-------------- src/convert_lora_to_gguf.py | 60 +- src/gguf/constants.py | 173 ++++- src/gguf/gguf.py | 15 + src/gguf/gguf_reader.py | 71 +- src/gguf/gguf_writer.py | 20 +- src/gguf/metadata.py | 44 +- src/gguf/tensor_mapping.py | 138 +++- src/gguf/utility.py | 2 +- src/gguf/vocab.py | 9 +- 10 files changed, 1295 insertions(+), 644 deletions(-) create mode 100644 src/gguf/gguf.py diff --git a/src/convert_hf_to_gguf.py b/src/convert_hf_to_gguf.py index b5b3ab6..765187f 100644 --- a/src/convert_hf_to_gguf.py +++ b/src/convert_hf_to_gguf.py @@ -37,9 +37,6 @@ logger = logging.getLogger("hf-to-gguf") -###### MODEL DEFINITIONS ###### - - class SentencePieceTokenTypes(IntEnum): NORMAL = 1 UNKNOWN = 2 @@ -73,7 +70,6 @@ class Model: metadata_override: Path | None dir_model_card: Path - # subclasses should define this! model_arch: gguf.MODEL_ARCH def __init__( @@ -124,11 +120,10 @@ def __init__( self.tensor_names = None self.metadata_override = metadata_override self.model_name = model_name - self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py + self.dir_model_card = dir_model - # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: - # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. + _, first_tensor = next(self.get_tensors()) if first_tensor.dtype == torch.float16: logger.info( @@ -141,7 +136,6 @@ def __init__( ) self.ftype = gguf.LlamaFileType.MOSTLY_BF16 - # Configure GGUF Writer self.gguf_writer = gguf.GGUFWriter( path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], @@ -155,8 +149,7 @@ def __init__( @classmethod def __init_subclass__(cls): - # can't use an abstract property, because overriding it without type errors - # would require using decorated functions instead of simply defining the property + if "model_arch" not in cls.__dict__: raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") @@ -227,7 +220,6 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: data = LazyTorchTensor.from_eager(data) yield name, data - # verify tensor name presence and identify potentially missing files if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: missing = sorted(self.tensor_names.difference(tensor_names_from_parts)) extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) @@ -235,7 +227,10 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: set(weight_map[n] for n in missing if n in weight_map) ) if len(extra) == 0 and len(missing_files) > 0: - raise ValueError(f"Missing or incomplete model files: {missing_files}") + raise ValueError( + f"Missing or incomplete model files: {missing_files}\n" + f"Missing tensors: {missing}" + ) else: raise ValueError( "Mismatch between weight map and model parts for tensor names:\n" @@ -346,18 +341,17 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid return [(self.map_tensor_name(name), data_torch)] def tensor_force_quant( self, name: str, new_name: str, bid: int | None, n_dims: int ) -> gguf.GGMLQuantizationType | bool: - del name, new_name, bid, n_dims # unused + del name, new_name, bid, n_dims 
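        # Note on the default below: returning False leaves the per-tensor choice to the
        # file-type logic in prepare_tensors (which may still pin 1D/norm tensors to F32),
        # True requests quantization per the selected ftype, and subclasses may instead
        # return a specific gguf.GGMLQuantizationType to force that exact encoding.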
return False - # some models need extra generated tensors (like rope_freqs) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: return () @@ -369,7 +363,7 @@ def prepare_tensors(self): for name, data_torch in chain( self.generate_extra_tensors(), self.get_tensors() ): - # we don't need these + if name.endswith( (".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq") ): @@ -377,11 +371,9 @@ def prepare_tensors(self): old_dtype = data_torch.dtype - # convert any unsupported data types to float32 if data_torch.dtype not in (torch.float16, torch.float32): data_torch = data_torch.to(torch.float32) - # use the first number-like part of the tensor name as the block id bid = None for part in name.split("."): if part.isdecimal(): @@ -389,11 +381,9 @@ def prepare_tensors(self): break for new_name, data_torch in self.modify_tensors(data_torch, name, bid): - # TODO: why do we squeeze here? - # data = data_torch.squeeze().numpy() + data = data_torch.numpy() - # if data ends up empty, it means data_torch was a scalar tensor -> restore if len(data.shape) == 0: data = data_torch.numpy() @@ -402,12 +392,9 @@ def prepare_tensors(self): name, new_name, bid, n_dims ) - # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors if n_dims <= 1 or new_name.endswith("_norm.weight"): data_qtype = gguf.GGMLQuantizationType.F32 - # Conditions should closely match those in llama_model_quantize_internal in llama.cpp - # Some tensor types are always in float32 if data_qtype is False and ( any( self.match_model_tensor_name(new_name, key, bid) @@ -421,6 +408,7 @@ def prepare_tensors(self): gguf.MODEL_TENSOR.TIME_MIX_W2, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, + gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED, gguf.MODEL_TENSOR.POSNET_NORM1, gguf.MODEL_TENSOR.POSNET_NORM2, ) @@ -440,10 +428,9 @@ def prepare_tensors(self): gguf.LlamaFileType.MOSTLY_TQ1_0, gguf.LlamaFileType.MOSTLY_TQ2_0, ): - # TODO: use Q4_K and Q6_K + data_qtype = gguf.GGMLQuantizationType.F16 - # No override (data_qtype is False), or wants to be quantized (data_qtype is True) if isinstance(data_qtype, bool): if self.ftype == gguf.LlamaFileType.ALL_F32: data_qtype = gguf.GGMLQuantizationType.F32 @@ -473,10 +460,8 @@ def prepare_tensors(self): else data.shape ) - # reverse shape to make it similar to the internal ggml dimension order shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" - # n_dims is implicit in the shape logger.info( f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}" ) @@ -496,22 +481,18 @@ def prepare_metadata(self, vocab_only: bool): self.metadata_override, self.dir_model_card, self.model_name, total_params ) - # Fallback to model directory name if metadata name is still missing if self.metadata.name is None: self.metadata.name = self.dir_model.name - # Generate parameter weight class (useful for leader boards) if not yet determined if self.metadata.size_label is None and total_params > 0: self.metadata.size_label = gguf.size_label( total_params, shared_params, expert_params, expert_count ) - # Extract the encoding scheme from the file type name. e.g. 
'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' output_type: str = self.ftype.name.partition("_")[2] - # Filename Output if self.fname_out.is_dir(): - # Generate default filename based on model specification and available metadata + if not vocab_only: fname_default: str = gguf.naming_convention( self.metadata.name, @@ -533,14 +514,9 @@ def prepare_metadata(self, vocab_only: bool): model_type="vocab", ) - # Use the default filename self.fname_out = self.fname_out / f"{fname_default}.gguf" else: - # Output path is a custom defined templated filename - # Note: `not is_dir()` is used because `.is_file()` will not detect - # file template strings as it doesn't actually exist as a file - # Process templated file name with the output ftype, useful with the "auto" ftype self.fname_out = self.fname_out.parent / gguf.fill_templated_filename( self.fname_out.name, output_type ) @@ -603,6 +579,11 @@ def func(modelcls: AnyModel) -> AnyModel: return func + @classmethod + def print_registered_models(cls): + for name in sorted(cls._model_classes.keys()): + logger.error(f"- {name}") + @classmethod def from_model_architecture(cls, arch: str) -> type[Model]: try: @@ -618,13 +599,11 @@ def does_token_look_special(self, token: str | bytes) -> bool: else: token_text = token - # Some models mark some added tokens which ought to be control tokens as not special. - # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2}) seems_special = token_text in ( - "", # deepseek-coder + "", "", "<2mass>", - "[@BOS@]", # gemma{,-2} + "[@BOS@]", ) seems_special = seems_special or ( @@ -632,16 +611,14 @@ def does_token_look_special(self, token: str | bytes) -> bool: ) seems_special = seems_special or ( token_text.startswith("<|") and token_text.endswith("|>") - ) # deepseek-coder + ) - # TODO: should these be marked as UNUSED instead? (maybe not) seems_special = seems_special or ( token_text.startswith("") - ) # gemma{,-2} + ) return seems_special - # used for GPT-2 BPE and WordPiece vocabs def get_vocab_base(self) -> tuple[list[str], list[int], str]: tokens: list[str] = [] toktypes: list[int] = [] @@ -659,6 +636,8 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: } added_vocab = tokenizer.get_added_vocab() + added_tokens_decoder = tokenizer.added_tokens_decoder + for i in range(vocab_size): if i not in reverse_vocab: tokens.append(f"[PAD{i}]") @@ -666,9 +645,8 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: else: token: str = reverse_vocab[i] if token in added_vocab: - # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. - # To avoid unexpected issues - we make sure to normalize non-normalized tokens - if not tokenizer.added_tokens_decoder[i].normalized: + + if not added_tokens_decoder[i].normalized: previous_token = token token = tokenizer.decode( tokenizer.encode(token, add_special_tokens=False) @@ -678,16 +656,13 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer" ) - if tokenizer.added_tokens_decoder[ - i - ].special or self.does_token_look_special(token): + if added_tokens_decoder[i].special or self.does_token_look_special( + token + ): toktypes.append(gguf.TokenType.CONTROL) else: - # NOTE: this was added for Gemma. - # Encoding and decoding the tokens above isn't sufficient for this case. 
- token = token.replace( - b"\xe2\x96\x81".decode("utf-8"), " " - ) # pre-normalize user-defined spaces + + token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") toktypes.append(gguf.TokenType.USER_DEFINED) else: toktypes.append(gguf.TokenType.NORMAL) @@ -695,15 +670,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: return tokens, toktypes, tokpre - # NOTE: this function is generated by convert_hf_to_gguf_update.py - # do not modify it manually! - # ref: https://github.com/ggerganov/llama.cpp/pull/6920 - # Marker: Start get_vocab_base_pre def get_vocab_base_pre(self, tokenizer) -> str: - # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that - # is specific for the BPE pre-tokenizer used by the model - # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can - # use in llama.cpp to implement the same pre-tokenizer chktxt = "\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````\"\"\"\"......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL" @@ -715,126 +682,133 @@ def get_vocab_base_pre(self, tokenizer) -> str: res = None - # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script - # or pull the latest version of the model from Huggingface - # don't edit the hashes manually! if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": - # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B + res = "llama-bpe" if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": - # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base + res = "deepseek-llm" if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": - # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base + res = "deepseek-coder" if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": - # ref: https://huggingface.co/tiiuae/falcon-7b + res = "falcon" if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": - # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base + res = "falcon3" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 + res = "bert-bge" if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": - # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5 + res = "bert-bge-large" if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - # ref: https://huggingface.co/mosaicml/mpt-7b + res = "mpt" if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34": - # ref: https://huggingface.co/bigcode/starcoder2-3b + res = "starcoder" if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": - # ref: https://huggingface.co/openai-community/gpt2 + res = "gpt-2" if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": - # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b + res = "stablelm2" if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": - # ref: https://huggingface.co/smallcloudai/Refact-1_6-base + res = "refact" if chkhsh == 
"9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": - # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 + res = "command-r" if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": - # ref: https://huggingface.co/Qwen/Qwen1.5-7B + res = "qwen2" if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf + res = "olmo" if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": - # ref: https://huggingface.co/databricks/dbrx-base + res = "dbrx" if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448": - # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + res = "jina-v1-en" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en + res = "jina-v2-en" if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es + res = "jina-v2-es" if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de + res = "jina-v2-de" if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": - # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct + res = "smaug-bpe" if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360": - # ref: https://huggingface.co/LumiOpen/Poro-34B-chat + res = "poro-chat" if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code + res = "jina-v2-code" - if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b": - # ref: https://huggingface.co/THUDM/glm-4-9b-chat + if ( + chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" + or chkhsh + == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516" + ): + res = "chatglm-bpe" if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": - # ref: https://huggingface.co/LumiOpen/Viking-7B + res = "viking" if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": - # ref: https://huggingface.co/core42/jais-13b + res = "jais" if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f": - # ref: https://huggingface.co/WisdomShell/CodeShell-7B + res = "codeshell" if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e": - # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407 + res = "tekken" if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": - # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M + res = "smollm" if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7": - # ref: https://huggingface.co/bigscience/bloom + res = "bloom" if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21": - # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small + res = "gpt3-finnish" if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae": - # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct + res = "exaone" if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085": - # ref: https://huggingface.co/microsoft/phi-2 + res = "phi-2" if chkhsh == 
"60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": - # ref: https://huggingface.co/facebook/chameleon-7b + res = "chameleon" if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": - # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 + res = "minerva-7b" if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65": - # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base + res = "roberta-bpe" if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb": - # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct + res = "gigachat" if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1": - # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct + res = "megrez" if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": - # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3 + res = "deepseek-v3" + if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5": + + res = "deepseek-r1-qwen" + if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e": + + res = "gpt-4o" if res is None: logger.warning("\n") @@ -853,7 +827,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: "** Check your model files and convert_hf_to_gguf_update.py and update them accordingly." ) logger.warning( - "** ref: https://github.com/ggerganov/llama.cpp/pull/6920" + "** ref: https://github.com/ggml-org/llama.cpp/pull/6920" ) logger.warning("**") logger.warning(f"** chkhsh: {chkhsh}") @@ -869,7 +843,6 @@ def get_vocab_base_pre(self, tokenizer) -> str: logger.debug(f"chkhsh: {chkhsh}") return res - # Marker: End get_vocab_base_pre def _set_vocab_none(self) -> None: self.gguf_writer.add_tokenizer_model("none") @@ -909,7 +882,6 @@ def _set_vocab_qwen(self): assert len(merged) == 2 merges.append(" ".join(map(QwenModel.token_bytes_to_string, merged))) - # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined added_vocab = tokenizer.special_tokens reverse_vocab = { id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items() @@ -933,7 +905,7 @@ def _set_vocab_qwen(self): special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) special_vocab.merges = merges - # only add special tokens when they were not already loaded from config.json + if len(special_vocab.special_token_ids) == 0: special_vocab._set_special_token( "bos", tokenizer.special_tokens["<|endoftext|>"] @@ -941,7 +913,7 @@ def _set_vocab_qwen(self): special_vocab._set_special_token( "eos", tokenizer.special_tokens["<|endoftext|>"] ) - # this one is usually not in config.json anyway + special_vocab._set_special_token( "unk", tokenizer.special_tokens["<|endoftext|>"] ) @@ -1021,6 +993,11 @@ def _create_vocab_sentencepiece(self): for token_id, token_data in added_tokens_decoder.items(): token_id = int(token_id) token: str = token_data["content"] + if token_id >= vocab_size: + logger.warning( + f"ignore token {token_id}: id is out of range, max={vocab_size - 1}" + ) + continue if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token.encode("utf-8"): logger.warning( @@ -1029,9 +1006,7 @@ def _create_vocab_sentencepiece(self): if token_data.get("special") or self.does_token_look_special(token): toktypes[token_id] = SentencePieceTokenTypes.CONTROL else: - token = token.replace( - b"\xe2\x96\x81".decode("utf-8"), " " - ) # pre-normalize user-defined spaces + token = 
token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED scores[token_id] = -1000.0 @@ -1071,6 +1046,44 @@ def _set_vocab_llama_hf(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_rwkv_world(self): + assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() + vocab_size = self.hparams.get("vocab_size", 65536) + + tokens: list[bytes] = ["".encode("utf-8")] + toktypes: list[int] = [gguf.TokenType.CONTROL] + + with open( + self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8" + ) as f: + lines = f.readlines() + for line in lines: + parts = line.split(" ") + assert len(parts) >= 3 + token, token_len = ast.literal_eval(" ".join(parts[1:-1])), int( + parts[-1] + ) + token = token.encode("utf-8") if isinstance(token, str) else token + assert isinstance(token, bytes) + assert len(token) == token_len + token_text: str = repr(token)[2:-1] + tokens.append(token_text.encode("utf-8")) + toktypes.append(gguf.TokenType.NORMAL) + remainder = vocab_size - len(tokens) + assert remainder >= 0 + for i in range(len(tokens), vocab_size): + tokens.append(f"[PAD{i}]".encode("utf-8")) + toktypes.append(gguf.TokenType.UNUSED) + + self.gguf_writer.add_tokenizer_model("rwkv") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.chat_template = "rwkv-world" + + special_vocab._set_special_token("eot", 261) + special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_builtin( self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int ): @@ -1083,7 +1096,7 @@ def _set_vocab_builtin( default_pre = "mpt" if model_name == "gpt-neox" else "default" field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL) - assert field # tokenizer model + assert field self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8")) field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE) @@ -1092,27 +1105,27 @@ def _set_vocab_builtin( ) field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST) - assert field # token list + assert field self.gguf_writer.add_token_list( [bytes(field.parts[i]) for i in field.data][:vocab_size] ) if model_name == "llama-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) - assert field # token scores + assert field self.gguf_writer.add_token_scores( [field.parts[i].tolist()[0] for i in field.data][:vocab_size] ) field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) - assert field # token types + assert field self.gguf_writer.add_token_types( [field.parts[i].tolist()[0] for i in field.data][:vocab_size] ) if model_name != "llama-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) - assert field # token merges + assert field self.gguf_writer.add_token_merges( [bytes(field.parts[i]) for i in field.data] ) @@ -1157,7 +1170,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) @@ -1165,9 +1178,7 @@ def modify_tensors( tensors: list[tuple[str, Tensor]] = [] if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: 
https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) data_torch = torch.cat( ( @@ -1214,7 +1225,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) @@ -1224,9 +1235,7 @@ def modify_tensors( tensors: list[tuple[str, Tensor]] = [] if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) data_torch = torch.cat( ( @@ -1251,17 +1260,6 @@ def modify_tensors( tensors.append((self.map_tensor_name(name), data_torch)) - if name == "word_embeddings.weight": - assert self.tensor_names is not None - - # TODO: tie them at runtime, don't duplicate in the model file - if all( - s not in self.tensor_names for s in ("lm_head.weight", "output.weight") - ): - tensors.append( - (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch) - ) - return tensors @@ -1273,7 +1271,7 @@ def set_vocab(self): try: self._set_vocab_gpt2() except Exception: - # Fallback for SEA-LION model + self._set_vocab_sentencepiece() self.gguf_writer.add_add_bos_token(False) self.gguf_writer.add_pad_token_id(3) @@ -1302,7 +1300,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid if "scales" in name: new_name = self.map_tensor_name( @@ -1345,8 +1343,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) - # note: config provides rms norm but it is actually layer norm - # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 + self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) @@ -1475,8 +1472,7 @@ def set_vocab(self): tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size, - # because vocab_size is the count of items, and indexes start at 0. 
+ max_vocab_index = max(tokenizer.get_vocab().values()) if max_vocab_index >= vocab_size: raise ValueError("Vocabulary size exceeds expected maximum size.") @@ -1488,12 +1484,12 @@ def set_vocab(self): for token_id in range(vocab_size): token_text = reverse_vocab[token_id].encode("utf-8") - # replace "\x00" to string with length > 0 + if token_text == b"\x00": - toktype = gguf.TokenType.BYTE # special + toktype = gguf.TokenType.BYTE token_text = f"<{token_text}>".encode("utf-8") elif re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text): - toktype = gguf.TokenType.BYTE # special + toktype = gguf.TokenType.BYTE elif reverse_vocab[token_id] in added_vocab: if tokenizer.added_tokens_decoder[token_id].special: toktype = gguf.TokenType.CONTROL @@ -1554,12 +1550,11 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) - # HF models permute some of the tensors, so we need to undo that if name.endswith("q_proj.weight"): data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) if name.endswith("k_proj.weight"): @@ -1589,18 +1584,18 @@ class FalconModel(Model): def set_gguf_parameters(self): block_count = self.hparams.get("num_hidden_layers") if block_count is None: - block_count = self.hparams["n_layer"] # old name + block_count = self.hparams["n_layer"] n_head = self.hparams.get("num_attention_heads") if n_head is None: - n_head = self.hparams["n_head"] # old name + n_head = self.hparams["n_head"] n_head_kv = self.hparams.get("num_kv_heads") if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) # old name + n_head_kv = self.hparams.get("n_head_kv", 1) - self.gguf_writer.add_context_length(2048) # not in config.json - self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform + self.gguf_writer.add_context_length(2048) + self.gguf_writer.add_tensor_data_layout("jploski") self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -1612,17 +1607,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - # QKV tensor transform - # The original query_key_value tensor contains n_head_kv "kv groups", - # each consisting of n_head/n_head_kv query weights followed by one key - # and one value weight (shared by all query heads in the kv group). - # This layout makes it a big pain to work with in GGML. - # So we rearrange them here,, so that we have n_head query weights - # followed by n_head_kv key weights followed by n_head_kv value weights, - # in contiguous fashion. - # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py + del bid if "query_key_value" in name: n_head = self.find_hparam(["num_attention_heads", "n_head"]) @@ -1666,7 +1651,6 @@ class RefactModel(Model): def set_vocab(self): super().set_vocab() - # TODO: how to determine special FIM tokens automatically? 
special_vocab = gguf.SpecialVocab( self.dir_model, load_merges=False, @@ -1675,7 +1659,7 @@ def set_vocab(self): special_vocab._set_special_token("prefix", 1) special_vocab._set_special_token("suffix", 3) special_vocab._set_special_token("middle", 2) - special_vocab.chat_template = None # do not add it twice + special_vocab.chat_template = None special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): @@ -1687,7 +1671,6 @@ def set_gguf_parameters(self): block_count = self.hparams["n_layer"] - # refact uses Alibi. So this is from config.json which might be used by training. self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -1760,7 +1743,7 @@ def set_vocab(self): if (self.dir_model / "tokenizer.json").is_file(): self._set_vocab_gpt2() else: - # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab + self._set_vocab_qwen() def set_gguf_parameters(self): @@ -1839,7 +1822,7 @@ def _stack_qk_norm( layer_name: str = "q_layernorm", ): datas: list[Tensor] = [] - # extract the norms in order + for xid in range(n_head): ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" datas.append(norms[ename]) @@ -1855,7 +1838,7 @@ def prepare_tensors(self): super().prepare_tensors() if self._q_norms is not None or self._k_norms is not None: - # flatten two `list[dict[str, Tensor]]` into a single `list[str]` + norms = ( [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None @@ -1882,10 +1865,9 @@ def set_vocab(self): try: self._set_vocab_llama_hf() except (FileNotFoundError, TypeError): - # Llama 3 + self._set_vocab_gpt2() - # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) if self.hparams.get("vocab_size", 32000) == 32016: special_vocab = gguf.SpecialVocab( self.dir_model, @@ -1907,7 +1889,6 @@ def set_vocab(self): tokenizer_config_json["add_prefix_space"] ) - # Apply to granite small models only if self.hparams.get("vocab_size", 32000) == 49152: self.gguf_writer.add_add_bos_token(False) @@ -1957,7 +1938,6 @@ def modify_tensors( if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - # process the experts separately if name.find("block_sparse_moe.experts") != -1: n_experts = self.hparams["num_local_experts"] @@ -1971,7 +1951,6 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for wid in ["w1", "w2", "w3"]: datas: list[Tensor] = [] @@ -2038,25 +2017,43 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("Mistral3ForConditionalGeneration") +class Mistral3Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA + + def __init__(self, *args, **kwargs): + hparams = Model.load_hparams(kwargs["dir_model"]) + if "text_config" in hparams: + hparams = {**hparams, **hparams["text_config"]} + kwargs["hparams"] = hparams + super().__init__(*args, **kwargs) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + name = name.replace("language_model.", "") + if "multi_modal_projector" in name or "vision_tower" in name: + return [] + return super().modify_tensors(data_torch, name, bid) + + @Model.register("DeciLMForCausalLM") class 
DeciModel(Model): model_arch = gguf.MODEL_ARCH.DECI @staticmethod def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int: - # DeciLM-specific code + intermediate_size = int(2 * ffn_mult * n_embd / 3) return DeciModel._find_multiple(intermediate_size, 256) @staticmethod def _find_multiple(n: int, k: int) -> int: - # DeciLM-specific code + if n % k == 0: return n return n + k - (n % k) @@ -2064,22 +2061,13 @@ def _find_multiple(n: int, k: int) -> int: def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + if "block_configs" in self.hparams: _block_configs: list[dict[str, Any]] = self.hparams["block_configs"] assert self.block_count == len(_block_configs) self._num_kv_heads = list() self._num_heads = list() _ffn_multipliers = list() - # ***linear attention layer*** - # if n_heads_in_group is None and replace_with_linear is True - # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads - # ***attention-free layer*** - # if n_heads_in_group is None and replace_with_linear is False - # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 - # ***normal attention-layer*** - # if n_heads_in_group is not None, then - # _num_kv_heads[il] is num_attention_head // n_heads_in_group and - # _num_heads[il] is num_attention_head + for il in range(len(_block_configs)): if _block_configs[il]["attention"]["n_heads_in_group"] is None: if _block_configs[il]["attention"]["replace_with_linear"] is True: @@ -2115,8 +2103,7 @@ def __init__(self, *args, **kwargs): ] def set_vocab(self): - # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's - # eos_token from '|eot_id|' to '|end_of_text|' + if self.hparams.get("vocab_size", 128256) == 128256: tokens, toktypes, tokpre = self.get_vocab_base() self.gguf_writer.add_tokenizer_model("gpt2") @@ -2127,11 +2114,11 @@ def set_vocab(self): special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) special_vocab.add_to_gguf(self.gguf_writer) else: - # DeciLM-7B + self._set_vocab_llama_hf() def set_gguf_parameters(self): - if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + if "block_configs" in self.hparams: assert self.block_count == len(self._num_kv_heads) assert self.block_count == len(self._num_heads) assert self.block_count == len(self._ffn_dims) @@ -2151,9 +2138,9 @@ def set_gguf_parameters(self): self.hparams["hidden_size"] // self.hparams["num_attention_heads"] ) self.gguf_writer.add_file_type(self.ftype) - else: # DeciLM-7B + else: super().set_gguf_parameters() - if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B + if "num_key_value_heads_per_layer" in self.hparams: self._num_kv_heads: list[int] = self.hparams[ "num_key_value_heads_per_layer" ] @@ -2273,9 +2260,7 @@ def weight_quant(self, weight: Tensor) -> Tensor: weight = weight.float() scale = weight.abs().mean().clamp(min=1e-5) iscale = 1 / scale - # TODO: multiply by the scale directly instead of inverting it twice - # (this is also unnecessarily doubly inverted upstream) - # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 + result = (weight * iscale).round().clamp(-1, 1) / iscale return result.type(dtype) @@ -2296,7 +2281,7 @@ def modify_tensors( gguf.MODEL_TENSOR.FFN_GATE, ] ): - # transform weight into 1/0/-1 (in fp32) + data_torch = self.weight_quant(data_torch) yield (new_name, data_torch) @@ -2320,7 +2305,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: 
Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - # process the experts separately + if name.find(".moe.") != -1: n_experts = self.hparams["num_local_experts"] @@ -2334,7 +2319,6 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for wid in ["linear", "linear_1", "linear_v"]: datas: list[Tensor] = [] @@ -2390,26 +2374,17 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid n_expert = self.hparams["ffn_config"]["moe_num_experts"] n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] n_embd = self.hparams["d_model"] - # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose - # original implementation expects (n_expert, n_ff, n_embd) for all experts weights - # But llama.cpp moe graph works differently - # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions - # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor exp_tensor_names = { - "ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} - "ffn.experts.mlp.w2": ( - 0, - 2, - 1, - ), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} + "ffn.experts.mlp.w1": None, + "ffn.experts.mlp.w2": (0, 2, 1), "ffn.experts.mlp.v1": None, - } # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + } experts = False for exp_tensor_name in exp_tensor_names.keys(): @@ -2420,12 +2395,6 @@ def modify_tensors( data_torch = data_torch.permute(*permute_tensor) break - # map tensor names - # In MoE models the ffn tensors are typically most of the model weights, - # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. 
- # Every other model has the weight names ending in .weight, - # let's assume that is the convention which is not the case for dbrx: - # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 new_name = self.map_tensor_name( name if not experts else name + ".weight", try_suffixes=(".weight",) ) @@ -2435,7 +2404,7 @@ def modify_tensors( def tensor_force_quant( self, name: str, new_name: str, bid: int | None, n_dims: int ) -> gguf.GGMLQuantizationType | bool: - del name, new_name, bid # unused + del name, new_name, bid return n_dims > 1 @@ -2500,12 +2469,11 @@ def set_vocab(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") - # HF models permute some of the tensors, so we need to undo that if name.endswith(("q_proj.weight")): data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight")): @@ -2693,7 +2661,7 @@ class WavTokenizerDecModel(Model): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid if ( name.endswith("codebook.cluster_size") @@ -2759,7 +2727,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - # process the experts separately + if name.find("experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None @@ -2772,7 +2740,6 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -2798,7 +2765,7 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") @@ -2820,11 +2787,10 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid tensors: list[tuple[str, Tensor]] = [] - # we don't need these if name.endswith((".attn.bias", ".attn.masked_bias")): return tensors @@ -2837,12 +2803,6 @@ def modify_tensors( tensors.append((new_name, data_torch)) - # note: GPT2 output is tied to (same as) wte in original model - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - tensors.append( - (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch) - ) - return tensors @@ -2879,7 +2839,7 @@ class Phi3MiniModel(Model): model_arch = gguf.MODEL_ARCH.PHI3 def set_vocab(self): - # Phi-4 model uses GPT2Tokenizer + tokenizer_config_file = self.dir_model / "tokenizer_config.json" if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: @@ -2999,7 +2959,8 @@ def set_gguf_parameters(self): rms_eps = self.find_hparam(["rms_norm_eps"]) max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rope_dims = n_embd // n_head + rot_pct = self.hparams.get("partial_rotary_factor", 1.0) + rope_dims = int(rot_pct * n_embd) // n_head self.gguf_writer.add_context_length(max_pos_embds) 
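        # Worked example for the rope_dims computation above (values assumed purely for
        # illustration): with n_embd = 3072, n_head = 32 and partial_rotary_factor = 0.4,
        # rope_dims = int(0.4 * 3072) // 32 = 38, i.e. only part of each 96-dim head is
        # rotated; with the default factor of 1.0 this reduces to the old n_embd // n_head.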
self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) @@ -3015,7 +2976,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) self.gguf_writer.add_file_type(self.ftype) sliding_window = self.hparams.get("sliding_window") - # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models + if sliding_window is None: sliding_window = 0 self.gguf_writer.add_sliding_window(sliding_window) @@ -3025,9 +2986,9 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: n_head = self.find_hparam(["num_attention_heads", "n_head"]) max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rope_dims = n_embd // n_head + rot_pct = self.hparams.get("partial_rotary_factor", 1.0) + rope_dims = int(rot_pct * n_embd) // n_head - # write rope scaling for long context (128k) model rope_scaling = self.find_hparam(["rope_scaling"], True) if rope_scaling is None: return @@ -3066,7 +3027,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: or len(long_factors) != rope_dims / 2 ): raise ValueError( - f"The length of rope long and short factors must be {rope_dims / 2}" + f"The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}." ) yield ( @@ -3079,6 +3040,66 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: ) +@Model.register("PhiMoEForCausalLM") +class PhiMoeModel(Phi3MiniModel): + model_arch = gguf.MODEL_ARCH.PHIMOE + + _experts: list[dict[str, Tensor]] | None = None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"]) + self.gguf_writer.add_expert_count(self.hparams["num_local_experts"]) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams["num_local_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + for w_name in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = ( + f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" + ) + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + @Model.register("PlamoForCausalLM") class PlamoModel(Model): model_arch = gguf.MODEL_ARCH.PLAMO @@ -3090,14 +3111,12 @@ def set_gguf_parameters(self): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_context_length(4096) # not in config.json + self.gguf_writer.add_context_length(4096) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) 
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv( - 5 - ) # hparams["num_key_value_heads"]) is wrong + self.gguf_writer.add_head_count_kv(5) self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) @@ -3118,11 +3137,10 @@ def shuffle_attn_output_weight(self, data_torch): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid new_name = self.map_tensor_name(name) - # shuffle for broadcasting of gqa in ggml_mul_mat if new_name.endswith("attn_q.weight"): data_torch = self.shuffle_attn_q_weight(data_torch) elif new_name.endswith("attn_output.weight"): @@ -3150,27 +3168,31 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(1.0) + _has_tok_embd = False + def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid + + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) new_name = self.map_tensor_name(name) - tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)] + if not self._has_tok_embd and new_name == self.format_tensor_name( + gguf.MODEL_TENSOR.OUTPUT + ): - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - assert self.tensor_names is not None - - if all( - s not in self.tensor_names for s in ("lm_head.weight", "output.weight") - ): - # copy tok_embd.weight to output.weight - tensors.append( - (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch) + if self.tensor_names and "transformer.wte.weight" in self.tensor_names: + logger.debug( + f"{tok_embd_name} not found before {output_name}, assuming they are tied" ) + self.tensor_names.remove("transformer.wte.weight") + elif new_name == tok_embd_name: + self._has_tok_embd = True - return tensors + return [(new_name, data_torch)] @Model.register("InternLM2ForCausalLM") @@ -3178,10 +3200,7 @@ class InternLM2Model(Model): model_arch = gguf.MODEL_ARCH.INTERNLM2 def set_vocab(self): - # (TODO): Is there a better way? - # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character - # \x00 specially and convert it into an emoji character to prevent it from being mistakenly - # recognized as an empty string in C++. + from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model @@ -3195,9 +3214,7 @@ def set_vocab(self): logger.error(f"Error: Missing {tokenizer_path}") sys.exit(1) - sentencepiece_model = ( - model.ModelProto() - ) # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix @@ -3211,8 +3228,7 @@ def set_vocab(self): text = piece.encode("utf-8") score = tokenizer.GetScore(token_id) if text == b"\x00": - # (TODO): fixme - # Hack here and replace the \x00 characters. 
+ logger.warning(f"InternLM2 convert token '{text}' to '🐉'!") text = "🐉".encode("utf-8") @@ -3225,7 +3241,7 @@ def set_vocab(self): toktype = SentencePieceTokenTypes.UNUSED elif tokenizer.IsByte(token_id): toktype = SentencePieceTokenTypes.BYTE - # take care of ununsed raw token + if piece.startswith("[UNUSED"): toktype = SentencePieceTokenTypes.UNUSED @@ -3302,9 +3318,7 @@ def set_vocab(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) old_eos = special_vocab.special_token_ids["eos"] if chat_eos_token_id is not None: - # For the chat model, we replace the eos with '<|im_end|>'. - # TODO: this is a hack, should be fixed - # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 + special_vocab.special_token_ids["eos"] = chat_eos_token_id logger.warning( f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" @@ -3349,7 +3363,6 @@ def modify_tensors( qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd)) q, k, v = qkv[:, :q_per_kv], qkv[:, -2], qkv[:, -1] - # The model weights of q and k equire additional reshape. q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) k = LlamaModel.permute( k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads @@ -3365,6 +3378,83 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] +@Model.register("InternLM3ForCausalLM") +class InternLM3Model(Model): + model_arch = gguf.MODEL_ARCH.LLAMA + + def set_vocab(self): + tokens, scores, toktypes = self._create_vocab_sentencepiece() + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + + tokenizer_config_file = self.dir_model / "tokenizer_config.json" + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix( + tokenizer_config_json["add_prefix_space"] + ) + + if "added_tokens_decoder" in tokenizer_config_json: + for token_id, token_data in tokenizer_config_json[ + "added_tokens_decoder" + ].items(): + if token_data.get("special"): + token_id = int(token_id) + token = token_data["content"] + special_vocab._set_special_token(token, token_id) + + if ( + token == "<|im_end|>" + and "eos" in special_vocab.special_token_ids + ): + special_vocab.special_token_ids["eos"] = token_id + + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + if ( + self.hparams.get("rope_scaling") is not None + and "factor" in self.hparams["rope_scaling"] + ): + if ( + self.hparams["rope_scaling"].get("type") == "linear" + or self.hparams["rope_scaling"].get("rope_type") == "linear" + ): + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor( + self.hparams["rope_scaling"]["factor"] + ) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + n_head = 
self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + return [(self.map_tensor_name(name), data_torch)] + + @Model.register("BertModel", "BertForMaskedLM", "CamembertModel") class BertModel(Model): model_arch = gguf.MODEL_ARCH.BERT @@ -3377,7 +3467,6 @@ def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_causal_attention(False) - # get pooling path pooling_path = None module_path = self.dir_model / "modules.json" if module_path.is_file(): @@ -3388,7 +3477,6 @@ def set_gguf_parameters(self): pooling_path = mod["path"] break - # get pooling type if pooling_path is not None: with open( self.dir_model / pooling_path / "config.json", encoding="utf-8" @@ -3406,12 +3494,8 @@ def set_vocab(self): tokens, toktypes, tokpre = self.get_vocab_base() self.vocab_size = len(tokens) - # we need this to validate the size of the token_type embeddings - # though currently we are passing all zeros to the token_type embeddings - # "Sequence A" or "Sequence B" self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) - # convert to phantom space vocab def phantom(tok): if tok.startswith("[") and tok.endswith("]"): return tok @@ -3421,20 +3505,18 @@ def phantom(tok): tokens = list(map(phantom, tokens)) - # add vocab to gguf self.gguf_writer.add_tokenizer_model("bert") self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) - # handle special tokens special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid if name.startswith("bert."): name = name[5:] @@ -3445,13 +3527,12 @@ def modify_tensors( if name.endswith(".beta"): name = name[:-5] + ".bias" - # we are only using BERT for embeddings so we don't need the pooling layer if name in ( "embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias", ): - return [] # we don't need these + return [] if name.startswith("cls.predictions"): return [] @@ -3469,7 +3550,6 @@ class RobertaModel(BertModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # we need the pad_token_id to know how to chop down position_embd matrix if (pad_token_id := self.hparams.get("pad_token_id")) is not None: self._position_offset = 1 + pad_token_id if "max_position_embeddings" in self.hparams: @@ -3485,9 +3565,6 @@ def set_vocab(self): self.gguf_writer.add_add_bos_token(True) self.gguf_writer.add_add_eos_token(True) - # we need this to validate the size of the token_type embeddings - # though currently we are passing all zeros to the token_type embeddings - # "Sequence A" or "Sequence B" self.gguf_writer.add_token_type_count( self.hparams.get("type_vocab_size", 1) ) @@ -3498,12 +3575,10 @@ def set_vocab(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - # if name starts with "roberta.", remove the prefix - # e.g. 
https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): name = name[8:] - # position embeddings start at pad_token_id + 1, so just chop down the weight tensor if name == "embeddings.position_embeddings.weight": if self._position_offset is not None: data_torch = data_torch[self._position_offset :, :] @@ -3518,20 +3593,18 @@ class NomicBertModel(BertModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # the HF config claims n_ctx=8192, but it uses RoPE scaling self.hparams["n_ctx"] = 2048 - # SwigLU activation assert self.hparams["activation_function"] == "swiglu" - # this doesn't do anything in the HF version + assert self.hparams["causal"] is False - # no bias tensors + assert self.hparams["qkv_proj_bias"] is False assert self.hparams["mlp_fc1_bias"] is False assert self.hparams["mlp_fc2_bias"] is False - # norm at end of layer + assert self.hparams["prenorm"] is False - # standard RoPE + assert self.hparams["rotary_emb_fraction"] == 1.0 assert self.hparams["rotary_emb_interleaved"] is False assert self.hparams["rotary_emb_scale_base"] is None @@ -3548,7 +3621,6 @@ class XLMRobertaModel(BertModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # we need the pad_token_id to know how to chop down position_embd matrix if (pad_token_id := self.hparams.get("pad_token_id")) is not None: self._position_offset = 1 + pad_token_id if "max_position_embeddings" in self.hparams: @@ -3557,8 +3629,7 @@ def __init__(self, *args, **kwargs): self._position_offset = None def set_vocab(self): - # to avoid TypeError: Descriptors cannot be created directly - # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model @@ -3567,11 +3638,9 @@ def set_vocab(self): if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = ( - model.ModelProto() - ) # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + assert sentencepiece_model.trainer_spec.model_type == 1 add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix remove_whitespaces = ( @@ -3617,7 +3686,6 @@ def set_vocab(self): scores.append(-1000.0) toktypes.append(SentencePieceTokenTypes.UNUSED) - # realign tokens (see HF tokenizer code) tokens = [b"", b"", b"", b""] + tokens[3:-1] scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1] toktypes = [ @@ -3647,12 +3715,10 @@ def set_vocab(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - # if name starts with "roberta.", remove the prefix - # e.g. 
https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): name = name[8:] - # position embeddings start at pad_token_id + 1, so just chop down the weight tensor if name == "embeddings.position_embeddings.weight": if self._position_offset is not None: data_torch = data_torch[self._position_offset :, :] @@ -3667,7 +3733,6 @@ class GemmaModel(Model): def set_vocab(self): self._set_vocab_sentencepiece() - # TODO: these special tokens should be exported only for the CodeGemma family special_vocab = gguf.SpecialVocab( self.dir_model, load_merges=False, @@ -3678,7 +3743,7 @@ def set_vocab(self): special_vocab._set_special_token("middle", 68) special_vocab._set_special_token("fsep", 70) special_vocab._set_special_token("eot", 107) - special_vocab.chat_template = None # do not add it twice + special_vocab.chat_template = None special_vocab.add_to_gguf(self.gguf_writer) self.gguf_writer.add_add_space_prefix(False) @@ -3705,17 +3770,14 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model - # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": logger.debug( f"Skipping get tensor {name!r} in safetensors so that convert can end normally." ) return [] - # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 if name.endswith("norm.weight"): data_torch = data_torch + 1 @@ -3760,17 +3822,96 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model - # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": logger.debug( f"Skipping get tensor {name!r} in safetensors so that convert can end normally." 
) return [] - # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") +class Gemma3Model(Model): + model_arch = gguf.MODEL_ARCH.GEMMA3 + has_vision: bool = False + + def __init__(self, *args, **kwargs): + hparams = Model.load_hparams(kwargs["dir_model"]) + if "text_config" in hparams: + hparams = {**hparams, **hparams["text_config"]} + kwargs["hparams"] = hparams + super().__init__(*args, **kwargs) + if "vision_config" in hparams: + logger.info("Has vision encoder, but it will be ignored") + self.has_vision = True + + def write(self): + super().write() + if self.has_vision: + logger.info("NOTE: this script only convert the language model to GGUF") + logger.info( + " for the vision model, please use gemma3_convert_encoder_to_gguf.py" + ) + + def set_vocab(self): + self._set_vocab_sentencepiece() + + self.gguf_writer.add_add_space_prefix(False) + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_context_length( + hparams.get("max_position_embeddings", 131072) + ) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8)) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6)) + self.gguf_writer.add_key_length(hparams.get("head_dim", 256)) + self.gguf_writer.add_value_length(hparams.get("head_dim", 256)) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) + + assert hparams.get("attn_logit_softcapping") is None + assert hparams.get("final_logit_softcapping") is None + self.gguf_writer.add_sliding_window(hparams["sliding_window"]) + self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) + if hparams.get("rope_scaling") is not None: + assert hparams["rope_scaling"]["rope_type"] == "linear" + + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + del bid + + if name.startswith("language_model."): + name = name.replace("language_model.", "") + elif ( + name.startswith("multi_modal_projector.") + or name.startswith("vision_tower.") + or name.startswith("multimodal_projector.") + or name.startswith("vision_model.") + ): + + return [] + + if "embed_tokens.weight" in name: + vocab = self._create_vocab_sentencepiece() + tokens = vocab[0] + data_torch = data_torch[: len(tokens)] + if name.endswith("norm.weight"): data_torch = data_torch + 1 @@ -3787,42 +3928,7 @@ class Rwkv6Model(Model): model_arch = gguf.MODEL_ARCH.RWKV6 def set_vocab(self): - assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() - vocab_size = self.hparams.get("vocab_size", 65536) - - tokens: list[bytes] = ["".encode("utf-8")] - toktypes: list[int] = [gguf.TokenType.CONTROL] - - with open( - self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8" - ) as f: - lines = f.readlines() - for line in lines: - parts = line.split(" ") - assert 
len(parts) >= 3 - token, token_len = ast.literal_eval(" ".join(parts[1:-1])), int( - parts[-1] - ) - token = token.encode("utf-8") if isinstance(token, str) else token - assert isinstance(token, bytes) - assert len(token) == token_len - token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" - tokens.append(token_text.encode("utf-8")) - toktypes.append(gguf.TokenType.NORMAL) - remainder = vocab_size - len(tokens) - assert remainder >= 0 - for i in range(len(tokens), vocab_size): - tokens.append(f"[PAD{i}]".encode("utf-8")) - toktypes.append(gguf.TokenType.UNUSED) - - self.gguf_writer.add_tokenizer_model("rwkv") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.chat_template = "rwkv-world" - # hack: Add '\n\n' as the EOT token to make it chat normally - special_vocab._set_special_token("eot", 261) - special_vocab.add_to_gguf(self.gguf_writer) + self._set_vocab_rwkv_world() def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] @@ -3838,7 +3944,6 @@ def set_gguf_parameters(self): time_mix_extra_dim = 64 if hidden_size == 4096 else 32 time_decay_extra_dim = 128 if hidden_size == 4096 else 64 - # RWKV isn't context limited self.gguf_writer.add_context_length(1048576) self.gguf_writer.add_embedding_length(hidden_size) self.gguf_writer.add_block_count(block_count) @@ -3850,9 +3955,10 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(intermediate_size) self.gguf_writer.add_file_type(self.ftype) - # required by llama.cpp, unused self.gguf_writer.add_head_count(0) + lerp_weights: dict[int, dict[str, Tensor]] = {} + def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: @@ -3874,26 +3980,327 @@ def modify_tensors( if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name: data_torch = data_torch.squeeze() - rescale_every_n_layers = self.hparams["rescale_every"] - if rescale_every_n_layers > 0: - if new_name.endswith("time_mix_output.weight") or new_name.endswith( - "channel_mix_value.weight" + try: + rescale_every_n_layers = self.hparams["rescale_every"] + if rescale_every_n_layers > 0: + if new_name.endswith("time_mix_output.weight") or new_name.endswith( + "channel_mix_value.weight" + ): + data_torch = data_torch.div_( + 2 ** int(bid // rescale_every_n_layers) + ) + except KeyError: + pass + + if ( + bid is not None + and "time_mix_lerp" in new_name + and "time_mix_lerp_x" not in new_name + ): + try: + self.lerp_weights[bid][new_name] = data_torch + except KeyError: + self.lerp_weights[bid] = {new_name: data_torch} + if all( + f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() + for i in ["w", "k", "v", "r", "g"] ): - data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = torch.stack( + [ + self.lerp_weights[bid][ + f"blk.{bid}.time_mix_lerp_{i}.weight" + ].unsqueeze(0) + for i in ["w", "k", "v", "r", "g"] + ], + dim=0, + ).unsqueeze(1) + yield (new_name, data) + return yield (new_name, data_torch) +@Model.register("RWKV6Qwen2ForCausalLM") +class RWKV6Qwen2Model(Rwkv6Model): + model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + num_attention_heads = 
self.hparams["num_attention_heads"] + num_key_value_heads = self.hparams["num_key_value_heads"] + hidden_size = self.hparams["hidden_size"] + head_size = hidden_size // num_attention_heads + rms_norm_eps = self.hparams["rms_norm_eps"] + intermediate_size = self.hparams["intermediate_size"] + time_mix_extra_dim = 64 if hidden_size >= 4096 else 32 + time_decay_extra_dim = 128 if hidden_size >= 4096 else 64 + + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) + self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_token_shift_count(1) + + self.gguf_writer.add_head_count_kv(num_key_value_heads) + + self.gguf_writer.add_head_count(0) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + for new_name, data in super().modify_tensors(data_torch, name, bid): + if "time_mix_w1" in new_name or "time_mix_w2" in new_name: + data = data.view(5, -1, data.shape[-1]) + + data = torch.stack( + [data[3], data[1], data[2], data[0], data[4]], dim=0 + ).view(-1, data.shape[-1]) + if "w2" in new_name: + data = data.view(5, -1, data.shape[-1]) + yield (new_name, data) + continue + yield (new_name, data) + + +@Model.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") +class Rwkv7Model(Model): + model_arch = gguf.MODEL_ARCH.RWKV7 + + def set_vocab(self): + self._set_vocab_rwkv_world() + + def calc_lora_rank(self, hidden_size, exponent, multiplier): + return max(1, round(hidden_size**exponent * multiplier / 32)) * 32 + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + try: + head_size = self.hparams["head_size"] + layer_norm_eps = self.hparams["layer_norm_epsilon"] + except KeyError: + head_size = self.hparams["head_dim"] + layer_norm_eps = self.hparams["norm_eps"] + hidden_size = self.hparams["hidden_size"] + intermediate_size = ( + self.hparams["intermediate_size"] + if self.hparams["intermediate_size"] is not None + else (hidden_size * 4) + ) + + try: + lora_rank_decay = ( + self.hparams["lora_rank_decay"] + if self.hparams["lora_rank_decay"] is not None + else self.calc_lora_rank(hidden_size, 0.5, 1.8) + ) + lora_rank_iclr = ( + self.hparams["lora_rank_iclr"] + if self.hparams["lora_rank_iclr"] is not None + else self.calc_lora_rank(hidden_size, 0.5, 1.8) + ) + lora_rank_value_residual_mix = ( + self.hparams["lora_rank_value_residual_mix"] + if self.hparams["lora_rank_value_residual_mix"] is not None + else self.calc_lora_rank(hidden_size, 0.5, 1.3) + ) + lora_rank_gate = ( + self.hparams["lora_rank_gate"] + if self.hparams["lora_rank_gate"] is not None + else self.calc_lora_rank(hidden_size, 0.8, 0.6) + ) + except KeyError: + lora_rank_decay = ( + self.hparams["decay_low_rank_dim"] + if self.hparams["decay_low_rank_dim"] is not None + else self.calc_lora_rank(hidden_size, 0.5, 1.8) + ) + lora_rank_iclr = ( + self.hparams["a_low_rank_dim"] + if self.hparams["a_low_rank_dim"] is not None + else self.calc_lora_rank(hidden_size, 0.5, 1.8) + ) + lora_rank_value_residual_mix = ( + self.hparams["v_low_rank_dim"] + if self.hparams["v_low_rank_dim"] is not None + else self.calc_lora_rank(hidden_size, 0.5, 1.3) + ) + 
lora_rank_gate = ( + self.hparams["gate_low_rank_dim"] + if self.hparams["gate_low_rank_dim"] is not None + else self.calc_lora_rank(hidden_size, 0.8, 0.6) + ) + + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_layer_norm_eps(layer_norm_eps) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_decay_lora_rank(lora_rank_decay) + self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) + self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) + self.gguf_writer.add_gate_lora_rank(lora_rank_gate) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + self.gguf_writer.add_head_count(0) + + lerp_weights: dict[int, dict[str, Tensor]] = {} + lora_needs_transpose: bool = True + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + + name = name.replace("blocks", "layers").replace("ffn", "feed_forward") + name = name.replace("self_attn", "attention").replace("attn", "attention") + name = name.replace("time_mixer.", "") + + if "_lora.lora" in name: + self.lora_needs_transpose = False + name = name.replace("_lora.lora.0.weight", "1.weight") + name = name.replace("_lora.lora.2.weight", "2.weight") + name = name.replace("_lora.lora.2.bias", "0.weight") + + name = name.replace("feed_forward_norm", "ln2") + name = name.replace("g_norm", "ln_x") + + if ( + "attention.v" in name + and "value" not in self.map_tensor_name(name) + and bid == 0 + ): + + return + + wkv_has_gate = self.hparams.get("wkv_has_gate", True) + lerp_list = ( + ["r", "w", "k", "v", "a", "g"] + if wkv_has_gate + else ["r", "w", "k", "v", "a"] + ) + + if bid is not None and "attention.x_" in name: + if "attention.x_x" in name: + + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = data_torch.reshape(len(lerp_list), 1, 1, -1) + yield (new_name, data) + else: + try: + self.lerp_weights[bid][name] = data_torch + except KeyError: + self.lerp_weights[bid] = {name: data_torch} + if all( + f"model.layers.{bid}.attention.x_{i}" + in self.lerp_weights[bid].keys() + for i in lerp_list + ): + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = torch.stack( + [ + self.lerp_weights[bid][ + f"model.layers.{bid}.attention.x_{i}" + ] + for i in lerp_list + ], + dim=0, + ) + yield (new_name, data) + return + else: + data_torch = data_torch.squeeze() + new_name = self.map_tensor_name(name) + + if not (new_name.endswith(".weight") or new_name.endswith(".bias")): + new_name += ".weight" + + if self.lora_needs_transpose and any( + new_name.endswith(t) + for t in [ + "time_mix_w1.weight", + "time_mix_w2.weight", + "time_mix_a1.weight", + "time_mix_a2.weight", + "time_mix_v1.weight", + "time_mix_v2.weight", + "time_mix_g1.weight", + "time_mix_g2.weight", + ] + ): + data_torch = data_torch.transpose(0, 1) + + if "r_k" in new_name: + data_torch = data_torch.flatten() + + if bid == 0 and "time_mix_a" in new_name: + + yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch) + + yield (new_name, data_torch) + + +@Model.register("RwkvHybridForCausalLM") +class ARwkv7Model(Rwkv7Model): + model_arch = gguf.MODEL_ARCH.ARWKV7 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + hidden_size = self.hparams["hidden_size"] + 
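A shape sketch of the fused time-mix lerp tensor assembled in Rwkv7Model.modify_tensors above, using an illustrative embedding width; the real data comes from attention.x_x or the per-channel attention.x_* weights:

import torch

n_embd = 8                                  # illustrative size only
lerp_list = ["r", "w", "k", "v", "a", "g"]  # the wkv_has_gate=True layout

x_x = torch.randn(len(lerp_list) * n_embd)  # stand-in for model.layers.{bid}.attention.x_x
fused = x_x.reshape(len(lerp_list), 1, 1, -1)
assert fused.shape == (6, 1, 1, n_embd)     # written out as blk.{bid}.time_mix_lerp_fused.weight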
head_size = self.hparams["head_size"] + rms_norm_eps = self.hparams["rms_norm_eps"] + intermediate_size = self.hparams["intermediate_size"] + wkv_has_gate = self.hparams["wkv_has_gate"] + assert self.hparams["wkv_version"] == 7 + + lora_rank_decay = 64 + lora_rank_iclr = 64 + lora_rank_value_residual_mix = 32 + lora_rank_gate = 128 if wkv_has_gate else 0 + + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_decay_lora_rank(lora_rank_decay) + self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) + self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) + self.gguf_writer.add_gate_lora_rank(lora_rank_gate) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_token_shift_count(1) + + self.gguf_writer.add_head_count(0) + + @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") class MambaModel(Model): model_arch = gguf.MODEL_ARCH.MAMBA def set_vocab(self): vocab_size = self.hparams["vocab_size"] - # Round vocab size to next multiple of 8 + pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) - # pad using ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 + vocab_size = -(vocab_size // -pad_vocab) * pad_vocab self.hparams["vocab_size"] = vocab_size @@ -3902,7 +4309,7 @@ def set_vocab(self): elif (self.dir_model / "tokenizer.model").is_file(): self._set_vocab_sentencepiece() else: - # Use the GPT-NeoX tokenizer when no tokenizer files are present + self._set_vocab_builtin("gpt-neox", vocab_size) def set_gguf_parameters(self): @@ -3913,9 +4320,7 @@ def set_gguf_parameters(self): or 2 * d_model ) d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 - # ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 + dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -( d_model // -16 ) @@ -3924,31 +4329,23 @@ def set_gguf_parameters(self): or 1e-5 ) use_dt_b_c_norm = False - # For falconmamba we do apply RMS norm on B / DT and C layers + if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",): use_dt_b_c_norm = True - # Fail early for models which don't have a block expansion factor of 2 + assert d_inner == 2 * d_model - self.gguf_writer.add_context_length( - 2**20 - ) # arbitrary value; for those who use the default + self.gguf_writer.add_context_length(2**20) self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length( - 0 - ) # unused, but seemingly required when loading - self.gguf_writer.add_head_count( - 0 - ) # unused, but seemingly required when loading + self.gguf_writer.add_feed_forward_length(0) + self.gguf_writer.add_head_count(0) self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_ssm_conv_kernel(d_conv) self.gguf_writer.add_ssm_inner_size(d_inner) self.gguf_writer.add_ssm_state_size(d_state) self.gguf_writer.add_ssm_time_step_rank(dt_rank) self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_ssm_dt_b_c_rms( - use_dt_b_c_norm - ) # For classic Mamba we don't apply rms norm on B / DT layers + self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) 
self.gguf_writer.add_file_type(self.ftype) _tok_embd = None @@ -3956,7 +4353,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) @@ -3967,7 +4364,6 @@ def modify_tensors( logger.debug("A_log --> A ==> " + new_name) data_torch = -torch.exp(data_torch) - # assuming token_embd.weight is seen before output.weight if self._tok_embd is not None and new_name == output_name: if torch.equal(self._tok_embd, data_torch): logger.debug( @@ -3987,9 +4383,6 @@ class CommandR2Model(Model): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # max_position_embeddings = 8192 in config.json but model was actually - # trained on 128k context length - # aya-23 models don't have model_max_length specified self.hparams["max_position_embeddings"] = self.find_hparam( ["model_max_length", "max_position_embeddings"] ) @@ -4032,12 +4425,10 @@ def set_gguf_parameters(self): if clip_qkv is not None: self.gguf_writer.add_clamp_kqv(clip_qkv) - # Same as super class, but permuting q_proj, k_proj - # Copied from: LlamaModel def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") @@ -4067,11 +4458,10 @@ def set_gguf_parameters(self): _experts: list[dict[str, Tensor]] | None = None - # Copied from: Qwen2MoeModel def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - # process the experts separately + if name.find("experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None @@ -4084,7 +4474,6 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -4106,12 +4495,11 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] - # Copied from: Qwen2MoeModel def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") @@ -4160,8 +4548,7 @@ def set_vocab(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - # if name starts with "bert.", remove the prefix - # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + if name.startswith("bert."): name = name[5:] @@ -4174,9 +4561,9 @@ class OpenELMModel(Model): @staticmethod def _make_divisible(v: float | int, divisor: int) -> int: - # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38 + new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. 
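The helper rounds v to the nearest multiple of divisor and then, as the next line shows, bumps the result up one step if rounding lost more than 10% of v. A self-contained sketch with made-up values:

def make_divisible(v, divisor):
    new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v

assert make_divisible(100, 16) == 96   # straight rounding to the nearest multiple of 16
assert make_divisible(80, 64) == 128   # 64 would undershoot 0.9 * 80, so bump by one divisor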
+ if new_v < 0.9 * v: new_v += divisor return new_v @@ -4200,7 +4587,6 @@ def __init__(self, *args, **kwargs): self._num_query_heads[0], int ) - # Uses the tokenizer from meta-llama/Llama-2-7b-hf def set_vocab(self): try: self._set_vocab_sentencepiece() @@ -4222,7 +4608,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count(self._num_query_heads) self.gguf_writer.add_head_count_kv(self._num_kv_heads) self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"]) - # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30 + self.gguf_writer.add_layer_norm_rms_eps(1e-6) self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim)) self.gguf_writer.add_key_length(head_dim) @@ -4239,7 +4625,6 @@ def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - # split ff if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight": ff_dim = self._ffn_dims[bid] yield ( @@ -4260,9 +4645,7 @@ class ArcticModel(Model): model_arch = gguf.MODEL_ARCH.ARCTIC def set_vocab(self): - # The reason for using a custom implementation here is that the - # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from - # tokenizer.model and used them as BOS and EOS instead of adding new tokens. + from sentencepiece import SentencePieceProcessor tokenizer_path = self.dir_model / "tokenizer.model" @@ -4271,7 +4654,6 @@ def set_vocab(self): logger.error(f"Error: Missing {tokenizer_path}") sys.exit(1) - # Read the whole vocabulary from the tokenizer.model file tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) @@ -4301,8 +4683,6 @@ def set_vocab(self): scores[token_id] = score toktypes[token_id] = toktype - # Use the added_tokens_decoder field from tokeniser_config.json as the source - # of information about added/redefined tokens and modify them accordingly. 
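The block that follows reads added_tokens_decoder from tokenizer_config.json. Its layout is the standard Hugging Face one, sketched here with placeholder contents; 31998 and 31999 are the IDs the Arctic tokenizer redefines, but the token strings below are illustrative only:

# illustrative structure; real entries come from the model's tokenizer_config.json
tokenizer_config_json = {
    "unk_token": "<unk>",
    "added_tokens_decoder": {
        "31998": {"content": "<placeholder_bos>", "special": True},
        "31999": {"content": "<placeholder_eos>", "special": True},
    },
}
for token_id, token_json in tokenizer_config_json["added_tokens_decoder"].items():
    print(int(token_id), token_json["content"], token_json["special"])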
tokenizer_config_file = self.dir_model / "tokenizer_config.json" if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: @@ -4322,8 +4702,6 @@ def set_vocab(self): token_type = SentencePieceTokenTypes.USER_DEFINED token_score = -10000.0 - # Map unk_token to UNKNOWN, other special tokens to CONTROL - # Set the score to 0.0 as in the original tokenizer.model if ("special" in token_json) and token_json["special"]: if token_content == tokenizer_config_json["unk_token"]: token_type = SentencePieceTokenTypes.UNKNOWN @@ -4368,7 +4746,6 @@ def modify_tensors( if name.endswith("k_proj.weight"): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - # process the experts separately if name.find("block_sparse_moe.experts") != -1: n_experts = self.hparams["num_local_experts"] @@ -4382,7 +4759,6 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for wid in ["w1", "w2", "w3"]: datas: list[Tensor] = [] @@ -4408,7 +4784,7 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") @@ -4468,7 +4844,6 @@ def modify_tensors( if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head) - # process the experts separately if name.find("mlp.experts") != -1: n_experts = self.hparams["n_routed_experts"] assert bid is not None @@ -4481,7 +4856,6 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -4507,7 +4881,7 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") @@ -4574,17 +4948,15 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - # rename e_score_correction_bias tensors + if name.endswith("e_score_correction_bias"): name = name.replace("e_score_correction_bias", "e_score_correction.bias") - # skip Multi-Token Prediction (MTP) layers block_count = self.hparams["num_hidden_layers"] match = re.match(r"model.layers.(\d+)", name) if match and int(match.group(1)) >= block_count: return [] - # process the experts separately if name.find("mlp.experts") != -1: n_experts = self.hparams["n_routed_experts"] assert bid is not None @@ -4597,7 +4969,6 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -4623,7 +4994,7 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") @@ -4641,33 +5012,28 @@ def __init__(self, *args, **kwargs): self.shared_token_embeddings_found = False def set_vocab(self): - # to avoid TypeError: Descriptors cannot 
be created directly - # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model tokenizer_path = self.dir_model / "tokenizer.model" - # many older models use spiece.model tokenizer model filename if not tokenizer_path.is_file(): tokenizer_path = self.dir_model / "spiece.model" if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = ( - model.ModelProto() - ) # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - # some models like Pile-T5 family use BPE tokenizer instead of Unigram - if sentencepiece_model.trainer_spec.model_type == 2: # BPE - # assure the tokenizer model file name is correct + if sentencepiece_model.trainer_spec.model_type == 2: + assert tokenizer_path.name == "tokenizer.model" return self._set_vocab_sentencepiece() else: - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + assert sentencepiece_model.trainer_spec.model_type == 1 add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix remove_whitespaces = ( @@ -4771,12 +5137,8 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", - # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored - # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder - # and decoder and ignore the remaining ones. 
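The branch below keeps only the first of the possibly duplicated shared embedding tensors and drops the rest. In isolation the first-wins logic looks roughly like this; the first two names are the real T5 aliases, the third is just a stand-in for any other tensor, and the mapped name assumes the usual token_embd target:

shared_names = {
    "shared.weight",
    "encoder.embed_tokens.weight",
    "decoder.embed_tokens.weight",
}
seen_shared = False
kept = []
for name in ["shared.weight", "decoder.embed_tokens.weight", "lm_head.weight"]:
    if name in shared_names:
        if seen_shared:
            continue                    # later duplicates are dropped
        seen_shared = True
        name = "token_embd.weight"      # every alias maps to the single token embedding tensor
    kept.append(name)
assert kept == ["token_embd.weight", "lm_head.weight"]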
if name in [ "decoder.embed_tokens.weight", "encoder.embed_tokens.weight", @@ -4803,33 +5165,28 @@ def __init__(self, *args, **kwargs): self.shared_token_embeddings_found = False def set_vocab(self): - # to avoid TypeError: Descriptors cannot be created directly - # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model tokenizer_path = self.dir_model / "tokenizer.model" - # many older models use spiece.model tokenizer model filename if not tokenizer_path.is_file(): tokenizer_path = self.dir_model / "spiece.model" if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = ( - model.ModelProto() - ) # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - # some models like Pile-T5 family use BPE tokenizer instead of Unigram - if sentencepiece_model.trainer_spec.model_type == 2: # BPE - # assure the tokenizer model file name is correct + if sentencepiece_model.trainer_spec.model_type == 2: + assert tokenizer_path.name == "tokenizer.model" return self._set_vocab_sentencepiece() else: - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + assert sentencepiece_model.trainer_spec.model_type == 1 add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix remove_whitespaces = ( @@ -4930,12 +5287,8 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", - # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored - # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder - # and decoder and ignore the remaining ones. 
if name in [ "decoder.embed_tokens.weight", "encoder.embed_tokens.weight", @@ -4960,12 +5313,10 @@ class JaisModel(Model): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # SwigLU activation assert self.hparams["activation_function"] == "swiglu" - # ALiBi position embedding + assert self.hparams["position_embedding_type"] == "alibi" - # Embeddings scale self.embeddings_scale = 1.0 if "mup_embeddings_scale" in self.hparams: self.embeddings_scale = self.hparams["mup_embeddings_scale"] @@ -5002,19 +5353,15 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid tensors: list[tuple[str, Tensor]] = [] - # we don't need these if name.endswith((".attn.bias")): return tensors if name.endswith(("relative_pe.slopes")): - # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) - # Some other models has max_alibi_bias spelled out explicitly in the hyperparams, - # but Jais's PyTorch model simply precalculates the slope values and places them - # in relative_pes.slopes + n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) first_val = float(data_torch[0].item()) self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) @@ -5042,7 +5389,7 @@ def prepare_tensors(self): self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) -@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration") +@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") class ChatGLMModel(Model): model_arch = gguf.MODEL_ARCH.CHATGLM @@ -5082,8 +5429,7 @@ def set_vocab_chatglm3(self): text = piece.encode("utf-8") score = 0.0 - # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py), - # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size() + if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): score = tokenizer.tokenizer.sp_model.get_score(token_id) @@ -5115,8 +5461,7 @@ def set_vocab_chatglm3(self): toktypes.append(toktype) self.gguf_writer.add_tokenizer_model("llama") - # glm3 needs prefix and suffix formatted as: - # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>" + self.gguf_writer.add_tokenizer_pre("chatglm-spm") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) @@ -5168,55 +5513,21 @@ def set_vocab(self): from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams["padded_vocab_size"] + vocab_size = hparams.get("padded_vocab_size", hparams["vocab_size"]) assert max(tokenizer.get_vocab().values()) < vocab_size - tokpre = self.get_vocab_base_pre(tokenizer) - - merges = [] - vocab = {} - mergeable_ranks = tokenizer.mergeable_ranks - for token, rank in mergeable_ranks.items(): - vocab[ChatGLMModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank) - assert len(merged) >= 2 and len(merged) <= 7 - merges.append(" ".join(map(ChatGLMModel.token_bytes_to_string, merged))) - - # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined - added_vocab = tokenizer.get_added_vocab() - reverse_vocab = { - id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items() - } - - for i in range(vocab_size): - if i not in reverse_vocab: - 
tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - + tokens, toktypes, tokpre = self.get_vocab_base() self.gguf_writer.add_tokenizer_model("gpt2") self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) - special_vocab.merges = merges - # only add special tokens when they were not already loaded from config.json special_vocab._set_special_token( "eos", tokenizer.get_added_vocab()["<|endoftext|>"] ) special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) - # this one is usually not in config.json anyway + special_vocab._set_special_token( "unk", tokenizer.get_added_vocab()["<|endoftext|>"] ) @@ -5225,18 +5536,34 @@ def set_vocab(self): def set_gguf_parameters(self): n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_head_kv = self.hparams.get("multi_query_group_num", n_head) + n_head_kv = self.hparams.get( + "multi_query_group_num", self.hparams.get("num_key_value_heads", n_head) + ) self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) self.gguf_writer.add_embedding_length(n_embed) self.gguf_writer.add_feed_forward_length( - self.hparams.get("ffn_hidden_size", 4 * n_embed) + self.hparams.get( + "ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed) + ) + ) + self.gguf_writer.add_block_count( + self.hparams.get("num_layers", self.hparams["num_hidden_layers"]) ) - self.gguf_writer.add_block_count(self.hparams["num_layers"]) self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"]) + self.gguf_writer.add_layer_norm_rms_eps( + self.hparams.get("layernorm_epsilon", 1e-5) + ) self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_dimension_count(64) + if "attention_dim" in self.hparams: + rope_dim = self.hparams["attention_dim"] + else: + rope_dim = ( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) + self.gguf_writer.add_rope_dimension_count( + int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)) + ) self.gguf_writer.add_add_bos_token(False) rope_freq = 10000 if "rope_ratio" in self.hparams: @@ -5246,9 +5573,11 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + del bid - if name.endswith(".rotary_pos_emb.inv_freq"): + if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith( + "model.vision." 
+ ): return [] name = name.removeprefix("transformer.") @@ -5274,7 +5603,6 @@ def set_gguf_parameters(self): ) self.gguf_writer.add_layer_norm_eps(f_norm_eps) - # * Partial RoPE rot_pct = self.find_hparam( ["partial_rotary_factor", "rope_pct", "rope_percent"] ) @@ -5282,7 +5610,6 @@ def set_gguf_parameters(self): n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) - # * RopeScaling for Nemotron if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) else: @@ -5292,10 +5619,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side - # model.layers.{l}.input_layernorm.weight - # model.layers.{l}.post_attention_layernorm.weight - # model.norm.weight + if name.endswith("norm.weight"): data_torch = data_torch + 1 @@ -5322,10 +5646,7 @@ def set_gguf_parameters(self): else 4 * embed_dim ) num_layers = hparams["num_layers"] - # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0 - # attention_dropout_rate = hparams["attention_dropout"] - # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0 - # embed_dropout_rate = hparams["embed_dropout"] + self.gguf_writer.add_embedding_length(embed_dim) self.gguf_writer.add_head_count(num_heads) self.gguf_writer.add_head_count_kv(num_kv_heads) @@ -5418,8 +5739,7 @@ def set_gguf_parameters(self): if head_dim := self.hparams.pop("head_dim", None): logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim) super().set_gguf_parameters() - # NOTE: Convert _multiplier params to _scale params for naming - # consistency + if attention_scale := self.hparams.get("attention_multiplier"): self.gguf_writer.add_attention_scale(attention_scale) logger.info("gguf: (granite) attention_scale = %s", attention_scale) @@ -5464,7 +5784,7 @@ def modify_tensors( @Model.register("ChameleonForConditionalGeneration") -@Model.register("ChameleonForCausalLM") # obsolete +@Model.register("ChameleonForCausalLM") class ChameleonModel(Model): model_arch = gguf.MODEL_ARCH.CHAMELEON @@ -5478,8 +5798,7 @@ def set_vocab(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - # ignore image tokenizer for now - # TODO: remove this once image support is implemented for Chameleon + if name.startswith("model.vqmodel"): return [] @@ -5502,7 +5821,6 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] - # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 @staticmethod def _reverse_hf_permute(data_torch, n_heads, hidden_dim): head_dim = hidden_dim // n_heads @@ -5511,35 +5829,24 @@ def _reverse_hf_permute(data_torch, n_heads, hidden_dim): return data_torch -###### CONVERSION LOGIC ###### - - -# tree of lazy tensors class LazyTorchTensor(gguf.LazyBase): _tensor_type = torch.Tensor - # to keep the type-checker happy + dtype: torch.dtype shape: torch.Size - # only used when converting a torch.Tensor to a np.ndarray _dtype_map: dict[torch.dtype, type] = { torch.float16: np.float16, torch.float32: np.float32, } - # used for safetensors slices - # ref: 
https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 - # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 _dtype_str_map: dict[str, torch.dtype] = { "F64": torch.float64, "F32": torch.float32, "BF16": torch.bfloat16, "F16": torch.float16, - # "U64": torch.uint64, "I64": torch.int64, - # "U32": torch.uint32, "I32": torch.int32, - # "U16": torch.uint16, "I16": torch.int16, "U8": torch.uint8, "I8": torch.int8, @@ -5575,7 +5882,7 @@ def from_safetensors_slice(cls, st_slice: Any) -> Tensor: @classmethod def __torch_function__(cls, func, types, args=(), kwargs=None): - del types # unused + del types if kwargs is None: kwargs = {} @@ -5616,6 +5923,7 @@ def parse_args() -> argparse.Namespace: "model", type=Path, help="directory containing model file", + nargs="?", ) parser.add_argument( "--use-temp-file", @@ -5665,8 +5973,16 @@ def parse_args() -> argparse.Namespace: type=Path, help="Specify the path for an authorship metadata override file", ) + parser.add_argument( + "--print-supported-models", + action="store_true", + help="Print the supported models", + ) - return parser.parse_args() + args = parser.parse_args() + if not args.print_supported_models and args.model is None: + parser.error("the following arguments are required: model") + return args def split_str_to_n_bytes(split_str: str) -> int: @@ -5692,6 +6008,11 @@ def split_str_to_n_bytes(split_str: str) -> int: def main() -> None: args = parse_args() + if args.print_supported_models: + logger.error("Supported models:") + Model.print_registered_models() + sys.exit(0) + if args.verbose: logging.basicConfig(level=logging.DEBUG) else: diff --git a/src/convert_lora_to_gguf.py b/src/convert_lora_to_gguf.py index e132412..7abff09 100644 --- a/src/convert_lora_to_gguf.py +++ b/src/convert_lora_to_gguf.py @@ -27,7 +27,6 @@ import gguf -# reuse model definitions from convert_hf_to_gguf.py from convert_hf_to_gguf import LazyTorchTensor, Model logger = logging.getLogger("lora-to-gguf") @@ -39,10 +38,9 @@ class PartialLoraTensor: B: Tensor | None = None -# magic to support tensor shape modifications and splitting class LoraTorchTensor: - _lora_A: Tensor # (n_rank, row_size) - _lora_B: Tensor # (col_size, n_rank) + _lora_A: Tensor + _lora_B: Tensor _rank: int def __init__(self, A: Tensor, B: Tensor): @@ -60,20 +58,14 @@ def get_lora_A_B(self) -> tuple[Tensor, Tensor]: def __getitem__( self, - indices: ( - SupportsIndex - | slice - | tuple[ - SupportsIndex | slice | Tensor, ... 
- ] # TODO: add ellipsis in the type signature - ), + indices: SupportsIndex | slice | tuple[SupportsIndex | slice | Tensor, ...], ) -> LoraTorchTensor: shape = self.shape if isinstance(indices, SupportsIndex): if len(shape) > 2: return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) else: - raise NotImplementedError # can't return a vector + raise NotImplementedError elif isinstance(indices, slice): if len(shape) > 2: return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) @@ -83,7 +75,7 @@ def __getitem__( assert len(indices) > 0 if indices[-1] is Ellipsis: return self[indices[:-1]] - # expand ellipsis + indices = tuple( u for v in ( @@ -103,7 +95,6 @@ def __getitem__( *(slice(None, None) for _ in range(len(indices), len(shape))), ) - # TODO: make sure this is correct indices_A = ( *( ( @@ -119,7 +110,7 @@ def __getitem__( indices_B = indices[:-1] return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B]) else: - raise NotImplementedError # unknown indice type + raise NotImplementedError @property def dtype(self) -> torch.dtype: @@ -142,9 +133,8 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor: new_shape = cast(tuple[int, ...], shape) orig_shape = self.shape if len(new_shape) < 2: - raise NotImplementedError # can't become a vector + raise NotImplementedError - # expand -1 in the shape if any(dim == -1 for dim in new_shape): n_elems = prod(orig_shape) n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape) @@ -154,7 +144,7 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor: ) if new_shape[-1] != orig_shape[-1]: - raise NotImplementedError # can't reshape the row size trivially + raise NotImplementedError shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1]) shape_B = (*new_shape[:-1], self._rank) @@ -173,7 +163,7 @@ def permute(self, *dims: int) -> LoraTorchTensor: shape = self.shape dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims) if dims[-1] == -1: - # TODO: support higher dimensional A shapes bigger than 1 + assert all(dim == 1 for dim in self._lora_A.shape[:-2]) return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims)) if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1: @@ -181,7 +171,7 @@ def permute(self, *dims: int) -> LoraTorchTensor: self._lora_B.permute(*dims), self._lora_A.permute(*dims) ) else: - # TODO: compose the above two + raise NotImplementedError def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor: @@ -200,7 +190,7 @@ def to(self, *args, **kwargs): @classmethod def __torch_function__(cls, func: Callable, types, args=(), kwargs=None): - del types # unused + del types if kwargs is None: kwargs = {} @@ -241,7 +231,7 @@ def get_base_tensor_name(lora_tensor_name: str) -> str: base_name = lora_tensor_name.replace("base_model.model.", "") base_name = base_name.replace(".lora_A.weight", ".weight") base_name = base_name.replace(".lora_B.weight", ".weight") - # models produced by mergekit-extract-lora have token embeddings in the adapter + base_name = base_name.replace(".lora_embedding_A", ".weight") base_name = base_name.replace(".lora_embedding_B", ".weight") return base_name @@ -303,7 +293,7 @@ def parse_args() -> argparse.Namespace: def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: - # normally, adapter does not come with base model config, we need to load it from AutoConfig + config = AutoConfig.from_pretrained(hf_model_id) return config.to_dict() @@ -331,11 +321,11 @@ def load_hparams_from_hf(hf_model_id: str) -> 
dict[str, Any]: if args.outfile is not None: fname_out = args.outfile else: - # output in the same directory as the model by default + fname_out = dir_lora if os.path.exists(input_model): - # lazy import load_file only if lora is in safetensors format. + from safetensors.torch import load_file lora_model = load_file(input_model, device="cpu") @@ -343,11 +333,9 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: input_model = os.path.join(dir_lora, "adapter_model.bin") lora_model = torch.load(input_model, map_location="cpu", weights_only=True) - # load LoRA config with open(lora_config, "r") as f: lparams: dict[str, Any] = json.load(f) - # load base model if base_model_id is not None: logger.info(f"Loading base model from Hugging Face: {base_model_id}") hparams = load_hparams_from_hf(base_model_id) @@ -409,7 +397,7 @@ def set_gguf_parameters(self): ) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - # Never add extra tensors (e.g. rope_freqs) for LoRA adapters + return () def get_tensors(self) -> Iterator[tuple[str, Tensor]]: @@ -419,13 +407,13 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: if self.lazy: tensor = LazyTorchTensor.from_eager(tensor) base_name = get_base_tensor_name(name) - # note: mergekit-extract-lora also adds token embeddings to the adapter + is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name if not is_lora_a and not is_lora_b: if ".base_layer.weight" in name: continue - # mergekit-extract-lora add these layernorm to the adapter, we need to keep them + if "_layernorm" in name or ".norm" in name: yield (base_name, tensor) continue @@ -437,7 +425,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: "Embeddings is present in the adapter. 
This can be due to new tokens added during fine tuning" ) logger.error( - "Please refer to https://github.com/ggerganov/llama.cpp/pull/9948" + "Please refer to https://github.com/ggml-org/llama.cpp/pull/9948" ) sys.exit(1) @@ -464,27 +452,21 @@ def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: dest = list(super().modify_tensors(data_torch, name, bid)) - # some archs may have the same tensor for lm_head and output (tie word embeddings) - # in this case, adapters targeting lm_head will fail when using llama-export-lora - # therefore, we ignore them for now - # see: https://github.com/ggerganov/llama.cpp/issues/9065 + if name == "lm_head.weight" and len(dest) == 0: raise ValueError( "lm_head is present in adapter, but is ignored in base model" ) for dest_name, dest_data in dest: - # mergekit-extract-lora add these layernorm to the adapter + if "_norm" in dest_name: assert dest_data.dim() == 1 yield (dest_name, dest_data) continue - # otherwise, we must get the lora_A and lora_B tensors assert isinstance(dest_data, LoraTorchTensor) lora_a, lora_b = dest_data.get_lora_A_B() - # note: mergekit-extract-lora flip and transpose A and B - # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd() if "token_embd.weight" in dest_name: lora_a = lora_a.T diff --git a/src/gguf/constants.py b/src/gguf/constants.py index 6e9ee09..b4b4cca 100644 --- a/src/gguf/constants.py +++ b/src/gguf/constants.py @@ -119,6 +119,7 @@ class LLM: TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" RESIDUAL_SCALE = "{arch}.residual_scale" EMBEDDING_SCALE = "{arch}.embedding_scale" + TOKEN_SHIFT_COUNT = "{arch}.token_shift_count" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -134,6 +135,10 @@ class Attention: CAUSAL = "{arch}.attention.causal" Q_LORA_RANK = "{arch}.attention.q_lora_rank" KV_LORA_RANK = "{arch}.attention.kv_lora_rank" + DECAY_LORA_RANK = "{arch}.attention.decay_lora_rank" + ICLR_LORA_RANK = "{arch}.attention.iclr_lora_rank" + VALUE_RESIDUAL_MIX_LORA_RANK = "{arch}.attention.value_residual_mix_lora_rank" + GATE_LORA_RANK = "{arch}.attention.gate_lora_rank" REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" SLIDING_WINDOW = "{arch}.attention.sliding_window" SCALE = "{arch}.attention.scale" @@ -189,7 +194,6 @@ class Tokenizer: UNK_ID = "tokenizer.ggml.unknown_token_id" SEP_ID = "tokenizer.ggml.seperator_token_id" PAD_ID = "tokenizer.ggml.padding_token_id" - CLS_ID = "tokenizer.ggml.cls_token_id" MASK_ID = "tokenizer.ggml.mask_token_id" ADD_BOS = "tokenizer.ggml.add_bos_token" ADD_EOS = "tokenizer.ggml.add_eos_token" @@ -251,6 +255,7 @@ class MODEL_ARCH(IntEnum): QWEN2VL = auto() PHI2 = auto() PHI3 = auto() + PHIMOE = auto() PLAMO = auto() CODESHELL = auto() ORION = auto() @@ -259,8 +264,12 @@ class MODEL_ARCH(IntEnum): MINICPM3 = auto() GEMMA = auto() GEMMA2 = auto() + GEMMA3 = auto() STARCODER2 = auto() RWKV6 = auto() + RWKV6QWEN2 = auto() + RWKV7 = auto() + ARWKV7 = auto() MAMBA = auto() XVERSE = auto() COMMAND_R = auto() @@ -333,13 +342,26 @@ class MODEL_TENSOR(IntEnum): SSM_A = auto() SSM_D = auto() SSM_OUT = auto() + TIME_MIX_W0 = auto() TIME_MIX_W1 = auto() TIME_MIX_W2 = auto() + TIME_MIX_A0 = auto() + TIME_MIX_A1 = auto() + TIME_MIX_A2 = auto() + TIME_MIX_V0 = auto() + TIME_MIX_V1 = auto() + TIME_MIX_V2 = auto() + TIME_MIX_G1 = auto() + TIME_MIX_G2 = auto() + TIME_MIX_K_K = auto() + TIME_MIX_K_A = auto() + TIME_MIX_R_K = auto() TIME_MIX_LERP_X = auto() TIME_MIX_LERP_K = auto() TIME_MIX_LERP_V = auto() 
TIME_MIX_LERP_R = auto() TIME_MIX_LERP_G = auto() + TIME_MIX_LERP_FUSED = auto() TIME_MIX_LERP_W = auto() TIME_MIX_FIRST = auto() TIME_MIX_DECAY = auto() @@ -435,6 +457,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.QWEN2VL: "qwen2vl", MODEL_ARCH.PHI2: "phi2", MODEL_ARCH.PHI3: "phi3", + MODEL_ARCH.PHIMOE: "phimoe", MODEL_ARCH.PLAMO: "plamo", MODEL_ARCH.CODESHELL: "codeshell", MODEL_ARCH.ORION: "orion", @@ -443,8 +466,12 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.MINICPM3: "minicpm3", MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA2: "gemma2", + MODEL_ARCH.GEMMA3: "gemma3", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.RWKV6: "rwkv6", + MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2", + MODEL_ARCH.RWKV7: "rwkv7", + MODEL_ARCH.ARWKV7: "arwkv7", MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.COMMAND_R: "command-r", @@ -517,13 +544,26 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", + MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", + MODEL_TENSOR.TIME_MIX_A0: "blk.{bid}.time_mix_a0", + MODEL_TENSOR.TIME_MIX_A1: "blk.{bid}.time_mix_a1", + MODEL_TENSOR.TIME_MIX_A2: "blk.{bid}.time_mix_a2", + MODEL_TENSOR.TIME_MIX_V0: "blk.{bid}.time_mix_v0", + MODEL_TENSOR.TIME_MIX_V1: "blk.{bid}.time_mix_v1", + MODEL_TENSOR.TIME_MIX_V2: "blk.{bid}.time_mix_v2", + MODEL_TENSOR.TIME_MIX_G1: "blk.{bid}.time_mix_g1", + MODEL_TENSOR.TIME_MIX_G2: "blk.{bid}.time_mix_g2", + MODEL_TENSOR.TIME_MIX_K_K: "blk.{bid}.time_mix_k_k", + MODEL_TENSOR.TIME_MIX_K_A: "blk.{bid}.time_mix_k_a", + MODEL_TENSOR.TIME_MIX_R_K: "blk.{bid}.time_mix_r_k", MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x", MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k", MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v", MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r", MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g", + MODEL_TENSOR.TIME_MIX_LERP_FUSED: "blk.{bid}.time_mix_lerp_fused", MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w", MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first", MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay", @@ -947,6 +987,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.PHIMOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FACTORS_LONG, + MODEL_TENSOR.ROPE_FACTORS_SHORT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], MODEL_ARCH.CODESHELL: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.POS_EMBD, @@ -1060,6 +1118,23 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_PRE_NORM, MODEL_TENSOR.FFN_POST_NORM, ], + MODEL_ARCH.GEMMA3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_PRE_NORM, + MODEL_TENSOR.FFN_POST_NORM, + ], MODEL_ARCH.STARCODER2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1090,6 +1165,7 @@ class 
MODEL_TENSOR(IntEnum): MODEL_TENSOR.TIME_MIX_LERP_R, MODEL_TENSOR.TIME_MIX_LERP_G, MODEL_TENSOR.TIME_MIX_LERP_W, + MODEL_TENSOR.TIME_MIX_LERP_FUSED, MODEL_TENSOR.TIME_MIX_FIRST, MODEL_TENSOR.TIME_MIX_DECAY, MODEL_TENSOR.TIME_MIX_DECAY_W1, @@ -1106,6 +1182,97 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE, MODEL_TENSOR.CHANNEL_MIX_VALUE, ], + MODEL_ARCH.RWKV6QWEN2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.TIME_MIX_W1, + MODEL_TENSOR.TIME_MIX_W2, + MODEL_TENSOR.TIME_MIX_LERP_X, + MODEL_TENSOR.TIME_MIX_LERP_K, + MODEL_TENSOR.TIME_MIX_LERP_V, + MODEL_TENSOR.TIME_MIX_LERP_R, + MODEL_TENSOR.TIME_MIX_LERP_G, + MODEL_TENSOR.TIME_MIX_LERP_W, + MODEL_TENSOR.TIME_MIX_LERP_FUSED, + MODEL_TENSOR.TIME_MIX_FIRST, + MODEL_TENSOR.TIME_MIX_DECAY, + MODEL_TENSOR.TIME_MIX_DECAY_W1, + MODEL_TENSOR.TIME_MIX_DECAY_W2, + MODEL_TENSOR.TIME_MIX_KEY, + MODEL_TENSOR.TIME_MIX_VALUE, + MODEL_TENSOR.TIME_MIX_RECEPTANCE, + MODEL_TENSOR.TIME_MIX_GATE, + MODEL_TENSOR.TIME_MIX_LN, + MODEL_TENSOR.TIME_MIX_OUTPUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.RWKV7: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_NORM_2, + MODEL_TENSOR.TIME_MIX_LERP_FUSED, + MODEL_TENSOR.TIME_MIX_W0, + MODEL_TENSOR.TIME_MIX_W1, + MODEL_TENSOR.TIME_MIX_W2, + MODEL_TENSOR.TIME_MIX_A0, + MODEL_TENSOR.TIME_MIX_A1, + MODEL_TENSOR.TIME_MIX_A2, + MODEL_TENSOR.TIME_MIX_V0, + MODEL_TENSOR.TIME_MIX_V1, + MODEL_TENSOR.TIME_MIX_V2, + MODEL_TENSOR.TIME_MIX_G1, + MODEL_TENSOR.TIME_MIX_G2, + MODEL_TENSOR.TIME_MIX_K_K, + MODEL_TENSOR.TIME_MIX_K_A, + MODEL_TENSOR.TIME_MIX_R_K, + MODEL_TENSOR.TIME_MIX_KEY, + MODEL_TENSOR.TIME_MIX_VALUE, + MODEL_TENSOR.TIME_MIX_RECEPTANCE, + MODEL_TENSOR.TIME_MIX_LN, + MODEL_TENSOR.TIME_MIX_OUTPUT, + MODEL_TENSOR.CHANNEL_MIX_LERP_K, + MODEL_TENSOR.CHANNEL_MIX_KEY, + MODEL_TENSOR.CHANNEL_MIX_VALUE, + ], + MODEL_ARCH.ARWKV7: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.TIME_MIX_LERP_FUSED, + MODEL_TENSOR.TIME_MIX_W0, + MODEL_TENSOR.TIME_MIX_W1, + MODEL_TENSOR.TIME_MIX_W2, + MODEL_TENSOR.TIME_MIX_A0, + MODEL_TENSOR.TIME_MIX_A1, + MODEL_TENSOR.TIME_MIX_A2, + MODEL_TENSOR.TIME_MIX_V0, + MODEL_TENSOR.TIME_MIX_V1, + MODEL_TENSOR.TIME_MIX_V2, + MODEL_TENSOR.TIME_MIX_G1, + MODEL_TENSOR.TIME_MIX_G2, + MODEL_TENSOR.TIME_MIX_K_K, + MODEL_TENSOR.TIME_MIX_K_A, + MODEL_TENSOR.TIME_MIX_R_K, + MODEL_TENSOR.TIME_MIX_KEY, + MODEL_TENSOR.TIME_MIX_VALUE, + MODEL_TENSOR.TIME_MIX_RECEPTANCE, + MODEL_TENSOR.TIME_MIX_LN, + MODEL_TENSOR.TIME_MIX_OUTPUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.MAMBA: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1310,6 +1477,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.OUTPUT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_DOWN, @@ -1789,7 +1959,6 @@ def get_type(val: Any) -> GGUFValueType: KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID -KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID 
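The per-architecture tensor lists above are what conversion and validation key off, so the easiest way to see how ARWKV7 relates to RWKV7 is to diff the two lists directly. A small sketch, assuming the lists live in the conventional `MODEL_TENSORS` table and the per-block name templates in `TENSOR_NAMES` (both names are this package's usual ones, not shown in this hunk):

    import gguf

    rwkv7 = set(gguf.MODEL_TENSORS[gguf.MODEL_ARCH.RWKV7])
    arwkv7 = set(gguf.MODEL_TENSORS[gguf.MODEL_ARCH.ARWKV7])

    # ARWKV7 swaps the channel-mix FFN for a gated transformer FFN ...
    print(sorted(t.name for t in arwkv7 - rwkv7))
    # ['FFN_DOWN', 'FFN_GATE', 'FFN_NORM', 'FFN_UP']

    # ... and drops the second attention norm and the channel-mix tensors.
    print(sorted(t.name for t in rwkv7 - arwkv7))
    # ['ATTN_NORM_2', 'CHANNEL_MIX_KEY', 'CHANNEL_MIX_LERP_K', 'CHANNEL_MIX_VALUE']

    # Serialized per-block names come from the templates added earlier, e.g.:
    print(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TIME_MIX_A0].format(bid=0))  # blk.0.time_mix_a0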
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV diff --git a/src/gguf/gguf.py b/src/gguf/gguf.py new file mode 100644 index 0000000..651a81e --- /dev/null +++ b/src/gguf/gguf.py @@ -0,0 +1,15 @@ +# This file left for compatibility. If you want to use the GGUF API from Python +# then don't import gguf/gguf.py directly. If you're looking for examples, see the +# examples/ directory for gguf-py + +import importlib +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +# Compatibility for people trying to import gguf/gguf.py directly instead of as a package. +importlib.invalidate_caches() +import gguf # noqa: E402 + +importlib.reload(gguf) diff --git a/src/gguf/gguf_reader.py b/src/gguf/gguf_reader.py index 962c43e..d1d1931 100644 --- a/src/gguf/gguf_reader.py +++ b/src/gguf/gguf_reader.py @@ -6,6 +6,7 @@ import logging import os +import sys from collections import OrderedDict from typing import Any, Literal, NamedTuple, TypeVar, Union @@ -15,7 +16,6 @@ from .quants import quant_shape_to_byte_shape if __name__ == "__main__": - import sys from pathlib import Path # Allow running file in package as a script. @@ -28,6 +28,7 @@ GGUF_VERSION, GGMLQuantizationType, GGUFValueType, + GGUFEndian, ) logger = logging.getLogger(__name__) @@ -53,6 +54,52 @@ class ReaderField(NamedTuple): types: list[GGUFValueType] = [] + def contents(self, index_or_slice: int | slice = slice(None)) -> Any: + if self.types: + to_string = lambda x: str(x.tobytes(), encoding="utf-8") # noqa: E731 + main_type = self.types[0] + + if main_type == GGUFValueType.ARRAY: + sub_type = self.types[-1] + + if sub_type == GGUFValueType.STRING: + indices = self.data[index_or_slice] + + if isinstance(index_or_slice, int): + return to_string(self.parts[indices]) # type: ignore + else: + return [to_string(self.parts[idx]) for idx in indices] # type: ignore + else: + # FIXME: When/if _get_field_parts() support multi-dimensional arrays, this must do so too + + # Check if it's unsafe to perform slice optimization on data + # if any(True for idx in self.data if len(self.parts[idx]) != 1): + # optim_slice = slice(None) + # else: + # optim_slice = index_or_slice + # index_or_slice = slice(None) + + # if isinstance(optim_slice, int): + # return self.parts[self.data[optim_slice]].tolist()[0] + # else: + # return [pv for idx in self.data[optim_slice] for pv in self.parts[idx].tolist()][index_or_slice] + + if isinstance(index_or_slice, int): + return self.parts[self.data[index_or_slice]].tolist()[0] + else: + return [ + pv + for idx in self.data[index_or_slice] + for pv in self.parts[idx].tolist() + ] + + if main_type == GGUFValueType.STRING: + return to_string(self.parts[-1]) + else: + return self.parts[-1].tolist()[0] + + return None + class ReaderTensor(NamedTuple): name: str @@ -103,12 +150,23 @@ def __init__( # If we get 0 here that means it's (probably) a GGUF file created for # the opposite byte order of the machine this script is running on. self.byte_order = "S" - temp_version = temp_version.newbyteorder(self.byte_order) + temp_version = temp_version.view( + temp_version.dtype.newbyteorder(self.byte_order) + ) version = temp_version[0] if version not in READER_SUPPORTED_VERSIONS: raise ValueError( f"Sorry, file appears to be version {version} which we cannot handle" ) + if sys.byteorder == "little": + # Host is little endian + host_endian = GGUFEndian.LITTLE + swapped_endian = GGUFEndian.BIG + else: + # Sorry PDP or other weird systems that don't use BE or LE. 
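The new `ReaderField.contents()` helper above saves callers from decoding `.parts`/`.data` by hand; strings, scalars and arrays all go through the same call, and string arrays additionally accept an index. A short usage sketch (the file path is a placeholder; the keys shown are the standard `general.architecture` and `tokenizer.ggml.tokens` metadata keys):

    from gguf.gguf_reader import GGUFReader

    reader = GGUFReader("model.gguf")  # placeholder path

    arch = reader.fields["general.architecture"].contents()           # e.g. "rwkv7"
    tokens = reader.fields["tokenizer.ggml.tokens"].contents()        # whole vocab as list[str]
    first_token = reader.fields["tokenizer.ggml.tokens"].contents(0)  # a single entry

    # The byte order detected during construction is exposed by this change too.
    print(arch, len(tokens), first_token, reader.endianess)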
+ host_endian = GGUFEndian.BIG + swapped_endian = GGUFEndian.LITTLE + self.endianess = swapped_endian if self.byte_order == "S" else host_endian self.fields: OrderedDict[str, ReaderField] = OrderedDict() self.tensors: list[ReaderTensor] = [] offs += self._push_field( @@ -170,9 +228,11 @@ def _get( itemsize = int(np.empty([], dtype=dtype).itemsize) end_offs = offset + itemsize * count arr = self.data[offset:end_offs].view(dtype=dtype)[:count] - if override_order is None: - return arr - return arr.view(arr.dtype.newbyteorder(override_order)) + return arr.view( + arr.dtype.newbyteorder( + self.byte_order if override_order is None else override_order + ) + ) def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int: if field.name in self.fields: @@ -218,6 +278,7 @@ def _get_field_parts( offs += int(alen.nbytes) aparts: list[npt.NDArray[Any]] = [raw_itype, alen] data_idxs: list[int] = [] + # FIXME: Handle multi-dimensional arrays properly instead of flattening for idx in range(alen[0]): curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts( offs, raw_itype[0] diff --git a/src/gguf/gguf_writer.py b/src/gguf/gguf_writer.py index 267ea6c..a279b74 100644 --- a/src/gguf/gguf_writer.py +++ b/src/gguf/gguf_writer.py @@ -828,6 +828,9 @@ def add_embedding_scale(self, value: float) -> None: def add_wkv_head_size(self, size: int) -> None: self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size) + def add_token_shift_count(self, count: int) -> None: + self.add_uint32(Keys.LLM.TOKEN_SHIFT_COUNT.format(arch=self.arch), count) + def add_layer_norm_eps(self, value: float) -> None: self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) @@ -849,6 +852,20 @@ def add_q_lora_rank(self, length: int) -> None: def add_kv_lora_rank(self, length: int) -> None: self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length) + def add_decay_lora_rank(self, length: int) -> None: + self.add_uint32(Keys.Attention.DECAY_LORA_RANK.format(arch=self.arch), length) + + def add_iclr_lora_rank(self, length: int) -> None: + self.add_uint32(Keys.Attention.ICLR_LORA_RANK.format(arch=self.arch), length) + + def add_value_residual_mix_lora_rank(self, length: int) -> None: + self.add_uint32( + Keys.Attention.VALUE_RESIDUAL_MIX_LORA_RANK.format(arch=self.arch), length + ) + + def add_gate_lora_rank(self, length: int) -> None: + self.add_uint32(Keys.Attention.GATE_LORA_RANK.format(arch=self.arch), length) + def add_relative_attn_buckets_count(self, value: int) -> None: self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value) @@ -943,9 +960,6 @@ def add_sep_token_id(self, id: int) -> None: def add_pad_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.PAD_ID, id) - def add_cls_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.CLS_ID, id) - def add_mask_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.MASK_ID, id) diff --git a/src/gguf/metadata.py b/src/gguf/metadata.py index c9046eb..3af3a34 100644 --- a/src/gguf/metadata.py +++ b/src/gguf/metadata.py @@ -160,21 +160,41 @@ def load_model_card(model_path: Optional[Path] = None) -> dict[str, Any]: if not model_card_path.is_file(): return {} - # The model card metadata is assumed to always be in YAML + # The model card metadata is assumed to always be in YAML (frontmatter) # ref: https://github.com/huggingface/transformers/blob/a5c642fe7a1f25d3bdcd76991443ba6ff7ee34b2/src/transformers/modelcard.py#L468-L473 + yaml_content: str = "" with 
open(model_card_path, "r", encoding="utf-8") as f: - if f.readline() == "---\n": - raw = f.read().partition("---\n")[0] - data = yaml.safe_load(raw) - if isinstance(data, dict): - return data - else: - logger.error( - f"while reading YAML model card frontmatter, data is {type(data)} instead of dict" - ) - return {} - else: + content = f.read() + lines = content.splitlines() + lines_yaml = [] + if len(lines) == 0: + # Empty file return {} + if len(lines) > 0 and lines[0] != "---": + # No frontmatter + return {} + for line in lines[1:]: + if line == "---": + break # End of frontmatter + else: + lines_yaml.append(line) + yaml_content = "\n".join(lines_yaml) + "\n" + + # Quick hack to fix the Norway problem + # https://hitchdev.com/strictyaml/why/implicit-typing-removed/ + yaml_content = yaml_content.replace("- no\n", '- "no"\n') + + if yaml_content: + data = yaml.safe_load(yaml_content) + if isinstance(data, dict): + return data + else: + logger.error( + f"while reading YAML model card frontmatter, data is {type(data)} instead of dict" + ) + return {} + else: + return {} @staticmethod def load_hf_parameters(model_path: Optional[Path] = None) -> dict[str, Any]: diff --git a/src/gguf/tensor_mapping.py b/src/gguf/tensor_mapping.py index 3de2a6f..5058eba 100644 --- a/src/gguf/tensor_mapping.py +++ b/src/gguf/tensor_mapping.py @@ -13,7 +13,7 @@ class TensorNameMap: "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.word_embeddings", # falcon "word_embeddings", # bloom - "model.embed_tokens", # llama-hf nemotron olmoe olmo2 + "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon @@ -27,7 +27,8 @@ class TensorNameMap: "embedding.word_embeddings", # chatglm "transformer.token_embeddings", # openelm "shared", # t5 - "rwkv.embeddings", # rwkv + "rwkv.embeddings", # rwkv6 + "model.embeddings", # rwkv7 ), # Token type embeddings MODEL_TENSOR.TOKEN_TYPES: ( @@ -40,6 +41,9 @@ class TensorNameMap: "emb_ln", # nomic-bert "transformer.norm", # openelm "rwkv.blocks.0.pre_ln", # rwkv + "rwkv.blocks.0.pre_ln", # rwkv6 + "model.pre_ln", # rwkv7 + "model.layers.0.pre_norm", # rwkv7 "backbone.norm", # wavtokenizer ), # Position embeddings @@ -51,7 +55,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 @@ -63,7 +67,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox "transformer.ln_f", # gpt2 gpt-j falcon jais exaone - "model.norm", # llama-hf baichuan internlm2 olmoe olmo2 + "model.norm", # llama-hf baichuan internlm2 olmoe olmo2 phimoe "norm", # llama-pth "transformer.norm_f", # mpt dbrx "ln_f", # refact bloom qwen gpt2 @@ -76,7 +80,8 @@ class TensorNameMap: "encoder.final_layernorm", # chatglm "transformer.norm", # openelm "model.norm", # nemotron - "rwkv.ln_out", # rwkv + "rwkv.ln_out", # rwkv6 + "model.ln_out", # rwkv7 "backbone.final_layer_norm", # wavtokenizer ), # Rope frequencies @@ -98,7 +103,7 @@ class TensorNameMap: "transformer.h.{bid}.input_layernorm", # falcon7b "h.{bid}.input_layernorm", # bloom "transformer.h.{bid}.ln_mlp", # falcon40b - 
"model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe + "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe phimoe "layers.{bid}.attention_norm", # llama-pth "language_model.encoder.layers.{bid}.input_layernorm", # persimmon "model.layers.{bid}.ln1", # yi @@ -112,13 +117,15 @@ class TensorNameMap: "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx "encoder.layers.{bid}.input_layernorm", # chatglm "transformer.layers.{bid}.attn_norm", # openelm - "rwkv.blocks.{bid}.ln1", # rwkv + "rwkv.blocks.{bid}.ln1", # rwkv6 + "model.layers.{bid}.ln1", # rwkv7 ), # Attention norm 2 MODEL_TENSOR.ATTN_NORM_2: ( "transformer.h.{bid}.ln_attn", # falcon40b "encoder.layer.{bid}.layer_norm_1", # jina-v2-code - "rwkv.blocks.{bid}.ln2", # rwkv + "rwkv.blocks.{bid}.ln2", # rwkv6 + "model.layers.{bid}.ln2", # rwkv7 ), # Attention query-key-value MODEL_TENSOR.ATTN_QKV: ( @@ -139,7 +146,7 @@ class TensorNameMap: ), # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 + "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 phimoe "model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom "layers.{bid}.attention.wq", # llama-pth "encoder.layer.{bid}.attention.self.query", # bert @@ -151,7 +158,7 @@ class TensorNameMap: ), # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 + "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 phimoe "model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom "layers.{bid}.attention.wk", # llama-pth "encoder.layer.{bid}.attention.self.key", # bert @@ -164,7 +171,7 @@ class TensorNameMap: ), # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 + "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 phimoe "layers.{bid}.attention.wv", # llama-pth "encoder.layer.{bid}.attention.self.value", # bert "transformer.h.{bid}.attn.v_proj", # gpt-j @@ -181,7 +188,7 @@ class TensorNameMap: "transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.h.{bid}.self_attention.dense", # falcon "h.{bid}.self_attention.dense", # bloom - "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 + "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 phimoe "model.layers.{bid}.self_attn.linear_attn", # deci "layers.{bid}.attention.wo", # llama-pth "encoder.layer.{bid}.attention.output.dense", # bert @@ -222,7 +229,7 @@ class TensorNameMap: "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone "h.{bid}.post_attention_layernorm", # bloom "transformer.blocks.{bid}.norm_2", # mpt - "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe + "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe phimoe "layers.{bid}.ffn_norm", # llama-pth "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon "model.layers.{bid}.ln2", # yi @@ -242,7 +249,7 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_GATE_INP: ( "layers.{bid}.feed_forward.gate", # mixtral - "model.layers.{bid}.block_sparse_moe.gate", # mixtral + "model.layers.{bid}.block_sparse_moe.gate", # mixtral phimoe "model.layers.{bid}.mlp.gate", # qwen2moe olmoe "transformer.decoder_layer.{bid}.router", # Grok "transformer.blocks.{bid}.ffn.router.layer", # dbrx @@ -287,6 +294,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx 
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) + "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged) ), MODEL_TENSOR.FFN_UP_SHEXP: ( "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe @@ -313,6 +321,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) + "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged) ), MODEL_TENSOR.FFN_GATE_SHEXP: ( "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe @@ -351,6 +360,7 @@ class TensorNameMap: "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe + "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged) ), MODEL_TENSOR.FFN_DOWN_SHEXP: ( "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe @@ -410,62 +420,116 @@ class TensorNameMap: "model.layers.{bid}.out_proj", "backbone.layers.{bid}.mixer.out_proj", ), + MODEL_TENSOR.TIME_MIX_W0: ("model.layers.{bid}.attention.w0",), # rwkv7 MODEL_TENSOR.TIME_MIX_W1: ( - "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv6 + "model.layers.{bid}.self_attn.time_maa_w1", # rwkv6qwen2 + "model.layers.{bid}.attention.w1", # rwkv7 ), MODEL_TENSOR.TIME_MIX_W2: ( - "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv6 + "model.layers.{bid}.self_attn.time_maa_w2", # rwkv6qwen2 + "model.layers.{bid}.attention.w2", # rwkv7 ), + MODEL_TENSOR.TIME_MIX_A0: ("model.layers.{bid}.attention.a0",), # rwkv7 + MODEL_TENSOR.TIME_MIX_A1: ("model.layers.{bid}.attention.a1",), # rwkv7 + MODEL_TENSOR.TIME_MIX_A2: ("model.layers.{bid}.attention.a2",), # rwkv7 + MODEL_TENSOR.TIME_MIX_V0: ("model.layers.{bid}.attention.v0",), # rwkv7 + MODEL_TENSOR.TIME_MIX_V1: ("model.layers.{bid}.attention.v1",), # rwkv7 + MODEL_TENSOR.TIME_MIX_V2: ("model.layers.{bid}.attention.v2",), # rwkv7 + MODEL_TENSOR.TIME_MIX_G1: ("model.layers.{bid}.attention.g1",), # rwkv7 + MODEL_TENSOR.TIME_MIX_G2: ("model.layers.{bid}.attention.g2",), # rwkv7 + MODEL_TENSOR.TIME_MIX_K_K: ("model.layers.{bid}.attention.k_k",), # rwkv7 + MODEL_TENSOR.TIME_MIX_K_A: ("model.layers.{bid}.attention.k_a",), # rwkv7 + MODEL_TENSOR.TIME_MIX_R_K: ("model.layers.{bid}.attention.r_k",), # rwkv7 MODEL_TENSOR.TIME_MIX_LERP_X: ( - "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv6 + "model.layers.{bid}.self_attn.time_maa_x", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_K: ( - "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv6 + "model.layers.{bid}.self_attn.time_maa_k", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_V: ( - "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv6 + "model.layers.{bid}.self_attn.time_maa_v", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_R: ( - "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv6 + "model.layers.{bid}.self_attn.time_maa_r", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_G: ( - "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv6 + "model.layers.{bid}.self_attn.time_maa_g", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_W: ( - 
"rwkv.blocks.{bid}.attention.time_maa_w", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv6 + "model.layers.{bid}.self_attn.time_maa_w", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_FIRST: ( - "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv6 ), MODEL_TENSOR.TIME_MIX_DECAY: ( - "rwkv.blocks.{bid}.attention.time_decay", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_decay", # rwkv6 + "model.layers.{bid}.self_attn.time_decay", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_DECAY_W1: ( - "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv6 + "model.layers.{bid}.self_attn.time_decay_w1", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_DECAY_W2: ( - "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv6 + "model.layers.{bid}.self_attn.time_decay_w2", # rwkv6qwen2 + ), + MODEL_TENSOR.TIME_MIX_KEY: ( + "rwkv.blocks.{bid}.attention.key", # rwkv6 + "model.layers.{bid}.self_attn.k_proj", # rwkv6qwen2 + "model.layers.{bid}.attention.key", # rwkv7 + "model.layers.{bid}.attention.k_proj", # rwkv7 + ), + MODEL_TENSOR.TIME_MIX_VALUE: ( + "rwkv.blocks.{bid}.attention.value", # rwkv6 + "model.layers.{bid}.self_attn.v_proj", # rwkv6qwen2 + "model.layers.{bid}.attention.value", # rwkv7 + "model.layers.{bid}.attention.v_proj", # rwkv7 ), - MODEL_TENSOR.TIME_MIX_KEY: ("rwkv.blocks.{bid}.attention.key",), # rwkv - MODEL_TENSOR.TIME_MIX_VALUE: ("rwkv.blocks.{bid}.attention.value",), # rwkv MODEL_TENSOR.TIME_MIX_RECEPTANCE: ( - "rwkv.blocks.{bid}.attention.receptance", # rwkv + "rwkv.blocks.{bid}.attention.receptance", # rwkv6 + "model.layers.{bid}.self_attn.q_proj", # rwkv6qwen2 + "model.layers.{bid}.attention.receptance", # rwkv7 + "model.layers.{bid}.attention.r_proj", # rwkv7 + ), + MODEL_TENSOR.TIME_MIX_GATE: ( + "rwkv.blocks.{bid}.attention.gate", # rwkv6 + "model.layers.{bid}.self_attn.gate", # rwkv6qwen2 + ), + MODEL_TENSOR.TIME_MIX_LN: ( + "rwkv.blocks.{bid}.attention.ln_x", # rwkv6 + "model.layers.{bid}.attention.ln_x", # rwkv7 + ), + MODEL_TENSOR.TIME_MIX_OUTPUT: ( + "rwkv.blocks.{bid}.attention.output", # rwkv6 + "model.layers.{bid}.self_attn.o_proj", # rwkv6qwen2 + "model.layers.{bid}.attention.output", # rwkv7 + "model.layers.{bid}.attention.o_proj", # rwkv7 ), - MODEL_TENSOR.TIME_MIX_GATE: ("rwkv.blocks.{bid}.attention.gate",), # rwkv - MODEL_TENSOR.TIME_MIX_LN: ("rwkv.blocks.{bid}.attention.ln_x",), # rwkv - MODEL_TENSOR.TIME_MIX_OUTPUT: ("rwkv.blocks.{bid}.attention.output",), # rwkv MODEL_TENSOR.CHANNEL_MIX_LERP_K: ( - "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv v6 + "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv6 + "model.layers.{bid}.feed_forward.x_k", # rwkv7 ), MODEL_TENSOR.CHANNEL_MIX_LERP_R: ( - "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6 + "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv6 + ), + MODEL_TENSOR.CHANNEL_MIX_KEY: ( + "rwkv.blocks.{bid}.feed_forward.key", # rwkv6 + "model.layers.{bid}.feed_forward.key", # rwkv7 ), - MODEL_TENSOR.CHANNEL_MIX_KEY: ("rwkv.blocks.{bid}.feed_forward.key",), # rwkv MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: ( - "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv + "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv6 ), MODEL_TENSOR.CHANNEL_MIX_VALUE: ( - "rwkv.blocks.{bid}.feed_forward.value", # rwkv + "rwkv.blocks.{bid}.feed_forward.value", # rwkv6 + "model.layers.{bid}.feed_forward.value", # rwkv7 ), MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), # deepseek2 
MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), # deepseek2 diff --git a/src/gguf/utility.py b/src/gguf/utility.py index e72b904..c514251 100644 --- a/src/gguf/utility.py +++ b/src/gguf/utility.py @@ -67,7 +67,7 @@ def naming_convention( output_type: str | None, model_type: Literal["vocab", "LoRA"] | None = None, ) -> str: - # Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention + # Reference: https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#gguf-naming-convention if base_name is not None: name = base_name.strip().replace(" ", "-").replace("/", "-") diff --git a/src/gguf/vocab.py b/src/gguf/vocab.py index 3aa13ea..7db9b37 100644 --- a/src/gguf/vocab.py +++ b/src/gguf/vocab.py @@ -166,7 +166,7 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: and isinstance(merges[0][0], str) ): # New format since transformers 4.45 to support spaces in merges - # ref: https://github.com/ggerganov/llama.cpp/issues/9692 + # ref: https://github.com/ggml-org/llama.cpp/issues/9692 # TODO: internally store as the new format instead of converting to old if any(" " in s for pair in merges for s in pair): logger.warning( @@ -195,7 +195,12 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: return True with open(tokenizer_config_file, encoding="utf-8") as f: tokenizer_config = json.load(f) - chat_template = tokenizer_config.get("chat_template") + chat_template_alt = None + chat_template_file = path / "chat_template.json" + if chat_template_file.is_file(): + with open(chat_template_file, encoding="utf-8") as f: + chat_template_alt = json.load(f).get("chat_template") + chat_template = tokenizer_config.get("chat_template", chat_template_alt) if chat_template is None or isinstance(chat_template, (str, list)): self.chat_template = chat_template else: