diff --git a/src/AutoGGUF.py b/src/AutoGGUF.py index b0f71e1..a98643e 100644 --- a/src/AutoGGUF.py +++ b/src/AutoGGUF.py @@ -12,6 +12,7 @@ import platform import requests import zipfile +import re from datetime import datetime from imports_and_globals import ensure_directory, open_file_safe from DownloadThread import DownloadThread @@ -144,10 +145,6 @@ def __init__(self): self.quant_type = QComboBox() self.quant_type.addItems( [ - "Q4_0", - "Q4_1", - "Q5_0", - "Q5_1", "IQ2_XXS", "IQ2_XS", "IQ2_S", @@ -173,12 +170,16 @@ def __init__(self): "Q5_K_S", "Q5_K_M", "Q6_K", - "Q8_0", + "Q8_0", + "Q4_0", + "Q4_1", + "Q5_0", + "Q5_1", "Q4_0_4_4", "Q4_0_4_8", "Q4_0_8_8", - "F16", "BF16", + "F16", "F32", "COPY", ] @@ -796,7 +797,7 @@ def export_lora(self): thread = QuantizationThread(command, backend_path, log_file) self.quant_threads.append(thread) - task_item = TaskListItem(EXPORTING_LORA, log_file) + task_item = TaskListItem(EXPORTING_LORA, log_file, show_progress_bar=False) list_item = QListWidgetItem(self.task_list) list_item.setSizeHint(task_item.sizeHint()) self.task_list.addItem(list_item) @@ -888,7 +889,7 @@ def convert_lora(self): task_name = LORA_CONVERSION_FROM_TO.format( os.path.basename(lora_input_path), os.path.basename(lora_output_path) ) - task_item = TaskListItem(task_name, log_file) + task_item = TaskListItem(task_name, log_file, show_progress_bar=False) list_item = QListWidgetItem(self.task_list) list_item.setSizeHint(task_item.sizeHint()) self.task_list.addItem(list_item) @@ -967,13 +968,14 @@ def refresh_releases(self): response = requests.get( "https://api.github.com/repos/ggerganov/llama.cpp/releases" ) + response.raise_for_status() # Raise an exception for bad status codes releases = response.json() self.release_combo.clear() for release in releases: self.release_combo.addItem(release["tag_name"], userData=release) self.release_combo.currentIndexChanged.connect(self.update_assets) self.update_assets() - except Exception as e: + except requests.exceptions.RequestException as e: self.show_error(ERROR_FETCHING_RELEASES.format(str(e))) def update_assets(self): @@ -981,20 +983,13 @@ def update_assets(self): self.asset_combo.clear() release = self.release_combo.currentData() if release: - for asset in release["assets"]: - self.asset_combo.addItem(asset["name"], userData=asset) + if "assets" in release: + for asset in release["assets"]: + self.asset_combo.addItem(asset["name"], userData=asset) + else: + self.show_error(NO_ASSETS_FOUND_FOR_RELEASE.format(release["tag_name"])) self.update_cuda_option() - def update_cuda_option(self): - self.logger.debug(UPDATING_CUDA_OPTIONS) - asset = self.asset_combo.currentData() - is_cuda = asset and "cudart" in asset["name"].lower() - self.cuda_extract_checkbox.setVisible(is_cuda) - self.cuda_backend_label.setVisible(is_cuda) - self.backend_combo_cuda.setVisible(is_cuda) - if is_cuda: - self.update_cuda_backends() - def download_llama_cpp(self): self.logger.info(STARTING_LLAMACPP_DOWNLOAD) asset = self.asset_combo.currentData() @@ -1017,6 +1012,25 @@ def download_llama_cpp(self): self.download_button.setEnabled(False) self.download_progress.setValue(0) + def update_cuda_option(self): + self.logger.debug(UPDATING_CUDA_OPTIONS) + asset = self.asset_combo.currentData() + + # Handle the case where asset is None + if asset is None: + self.logger.warning(NO_ASSET_SELECTED_FOR_CUDA_CHECK) + self.cuda_extract_checkbox.setVisible(False) + self.cuda_backend_label.setVisible(False) + self.backend_combo_cuda.setVisible(False) + return # Exit the function 
early + + is_cuda = asset and "cudart" in asset["name"].lower() + self.cuda_extract_checkbox.setVisible(is_cuda) + self.cuda_backend_label.setVisible(is_cuda) + self.backend_combo_cuda.setVisible(is_cuda) + if is_cuda: + self.update_cuda_backends() + def update_cuda_backends(self): self.logger.debug(UPDATING_CUDA_BACKENDS) self.backend_combo_cuda.clear() @@ -1385,6 +1399,8 @@ def quantize_model(self): self.task_list.addItem(list_item) self.task_list.setItemWidget(list_item, task_item) + # Connect the output signal to the new progress parsing function + thread.output_signal.connect(lambda line: self.parse_progress(line, task_item)) thread.status_signal.connect(task_item.update_status) thread.finished_signal.connect(lambda: self.task_finished(thread)) thread.error_signal.connect(lambda err: self.handle_error(err, task_item)) @@ -1401,6 +1417,15 @@ def update_model_info(self, model_info): # TODO: Do something with this pass + def parse_progress(self, line, task_item): + # Parses the output line for progress information and updates the task item. + match = re.search(r"\[(\d+)/(\d+)\]", line) + if match: + current = int(match.group(1)) + total = int(match.group(2)) + progress = int((current / total) * 100) + task_item.update_progress(progress) + def task_finished(self, thread): self.logger.info(TASK_FINISHED.format(thread.log_file)) if thread in self.quant_threads: @@ -1508,7 +1533,7 @@ def generate_imatrix(self): task_name = GENERATING_IMATRIX_FOR.format( os.path.basename(self.imatrix_model.text()) ) - task_item = TaskListItem(task_name, log_file) + task_item = TaskListItem(task_name, log_file, show_progress_bar=False) list_item = QListWidgetItem(self.task_list) list_item.setSizeHint(task_item.sizeHint()) self.task_list.addItem(list_item) diff --git a/src/KVOverrideEntry.py b/src/KVOverrideEntry.py index 7916acf..1144236 100644 --- a/src/KVOverrideEntry.py +++ b/src/KVOverrideEntry.py @@ -2,6 +2,7 @@ from PyQt6.QtCore import pyqtSignal, QRegularExpression from PyQt6.QtGui import QDoubleValidator, QIntValidator, QRegularExpressionValidator + class KVOverrideEntry(QWidget): deleted = pyqtSignal(QWidget) @@ -13,7 +14,7 @@ def __init__(self, parent=None): self.key_input = QLineEdit() self.key_input.setPlaceholderText("Key") # Set validator for key input (letters and dots only) - key_validator = QRegularExpressionValidator(QRegularExpression(r'[A-Za-z.]+')) + key_validator = QRegularExpressionValidator(QRegularExpression(r"[A-Za-z.]+")) self.key_input.setValidator(key_validator) layout.addWidget(self.key_input) @@ -32,7 +33,7 @@ def __init__(self, parent=None): # Connect type change to validator update self.type_combo.currentTextChanged.connect(self.update_validator) - + # Initialize validator self.update_validator(self.type_combo.currentText()) diff --git a/src/QuantizationThread.py b/src/QuantizationThread.py index 5c2d5df..712cac3 100644 --- a/src/QuantizationThread.py +++ b/src/QuantizationThread.py @@ -16,6 +16,7 @@ from imports_and_globals import open_file_safe class QuantizationThread(QThread): + # Define custom signals for communication with the main thread output_signal = pyqtSignal(str) status_signal = pyqtSignal(str) finished_signal = pyqtSignal() @@ -32,48 +33,62 @@ def __init__(self, command, cwd, log_file): def run(self): try: - self.process = subprocess.Popen(self.command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - text=True, cwd=self.cwd) - with open_file_safe(self.log_file, 'w') as log: + # Start the subprocess + self.process = subprocess.Popen( + self.command, + 
stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + cwd=self.cwd, + ) + # Open log file and process output + with open_file_safe(self.log_file, "w") as log: for line in self.process.stdout: line = line.strip() self.output_signal.emit(line) - log.write(line + '\n') + log.write(line + "\n") log.flush() self.status_signal.emit("In Progress") self.parse_model_info(line) + + # Wait for process to complete self.process.wait() if self.process.returncode == 0: self.status_signal.emit("Completed") self.model_info_signal.emit(self.model_info) else: - self.error_signal.emit(f"Process exited with code {self.process.returncode}") + self.error_signal.emit( + f"Process exited with code {self.process.returncode}" + ) self.finished_signal.emit() except Exception as e: self.error_signal.emit(str(e)) def parse_model_info(self, line): + # Parse output for model information if "llama_model_loader: loaded meta data with" in line: parts = line.split() - self.model_info['kv_pairs'] = parts[6] - self.model_info['tensors'] = parts[9] + self.model_info["kv_pairs"] = parts[6] + self.model_info["tensors"] = parts[9] elif "general.architecture" in line: - self.model_info['architecture'] = line.split('=')[-1].strip() + self.model_info["architecture"] = line.split("=")[-1].strip() elif line.startswith("llama_model_loader: - kv"): - key = line.split(':')[2].strip() - value = line.split('=')[-1].strip() - self.model_info.setdefault('kv_data', {})[key] = value + key = line.split(":")[2].strip() + value = line.split("=")[-1].strip() + self.model_info.setdefault("kv_data", {})[key] = value elif line.startswith("llama_model_loader: - type"): - parts = line.split(':') + parts = line.split(":") if len(parts) > 1: quant_type = parts[1].strip() tensors = parts[2].strip().split()[0] - self.model_info.setdefault('quantization_type', []).append(f"{quant_type}: {tensors} tensors") + self.model_info.setdefault("quantization_type", []).append( + f"{quant_type}: {tensors} tensors" + ) def terminate(self): + # Terminate the subprocess if it's still running if self.process: os.kill(self.process.pid, signal.SIGTERM) self.process.wait(timeout=5) if self.process.poll() is None: - os.kill(self.process.pid, signal.SIGKILL) - + os.kill(self.process.pid, signal.SIGKILL) \ No newline at end of file diff --git a/src/TaskListItem.py b/src/TaskListItem.py index b521716..354d735 100644 --- a/src/TaskListItem.py +++ b/src/TaskListItem.py @@ -14,7 +14,7 @@ from datetime import datetime class TaskListItem(QWidget): - def __init__(self, task_name, log_file, parent=None): + def __init__(self, task_name, log_file, show_progress_bar=True, parent=None): super().__init__(parent) self.task_name = task_name self.log_file = log_file @@ -27,6 +27,14 @@ def __init__(self, task_name, log_file, parent=None): layout.addWidget(self.task_label) layout.addWidget(self.progress_bar) layout.addWidget(self.status_label) + + # Hide progress bar if show_progress_bar is False + self.progress_bar.setVisible(show_progress_bar) + + # Use indeterminate progress bar if not showing percentage + if not show_progress_bar: + self.progress_bar.setRange(0, 0) + self.progress_timer = QTimer(self) self.progress_timer.timeout.connect(self.update_progress) self.progress_value = 0 @@ -35,15 +43,17 @@ def update_status(self, status): self.status = status self.status_label.setText(status) if status == "In Progress": - self.progress_bar.setRange(0, 100) - self.progress_timer.start(100) + # Only start timer if showing percentage progress + if self.progress_bar.isVisible(): + 
self.progress_bar.setRange(0, 100) + self.progress_timer.start(100) elif status == "Completed": self.progress_timer.stop() self.progress_bar.setValue(100) elif status == "Canceled": self.progress_timer.stop() self.progress_bar.setValue(0) - + def set_error(self): self.status = "Error" self.status_label.setText("Error") @@ -51,7 +61,12 @@ def set_error(self): self.progress_bar.setRange(0, 100) self.progress_timer.stop() - def update_progress(self): - self.progress_value = (self.progress_value + 1) % 101 - self.progress_bar.setValue(self.progress_value) - + def update_progress(self, value=None): + if value is not None: + # Update progress bar with specific value + self.progress_value = value + self.progress_bar.setValue(self.progress_value) + else: + # Increment progress bar for indeterminate progress + self.progress_value = (self.progress_value + 1) % 101 + self.progress_bar.setValue(self.progress_value) \ No newline at end of file diff --git a/src/convert_hf_to_gguf.py b/src/convert_hf_to_gguf.py index 8b33c30..10416da 100644 --- a/src/convert_hf_to_gguf.py +++ b/src/convert_hf_to_gguf.py @@ -13,7 +13,18 @@ from enum import IntEnum from pathlib import Path from hashlib import sha256 -from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ContextManager, + Iterable, + Iterator, + Literal, + Sequence, + TypeVar, + cast, +) import math import numpy as np @@ -22,8 +33,8 @@ if TYPE_CHECKING: from torch import Tensor -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +if "NO_LOCAL_GGUF" not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / "gguf-py")) import gguf logger = logging.getLogger("hf-to-gguf") @@ -31,6 +42,7 @@ ###### MODEL DEFINITIONS ###### + class SentencePieceTokenTypes(IntEnum): NORMAL = 1 UNKNOWN = 2 @@ -67,26 +79,47 @@ class Model: # subclasses should define this! 
model_arch: gguf.MODEL_ARCH - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False, - use_temp_file: bool = False, eager: bool = False, - metadata_override: Path | None = None, model_name: str | None = None, - split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): + def __init__( + self, + dir_model: Path, + ftype: gguf.LlamaFileType, + fname_out: Path, + is_big_endian: bool = False, + use_temp_file: bool = False, + eager: bool = False, + metadata_override: Path | None = None, + model_name: str | None = None, + split_max_tensors: int = 0, + split_max_size: int = 0, + dry_run: bool = False, + small_first_shard: bool = False, + ): if type(self) is Model: - raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") + raise TypeError( + f"{type(self).__name__!r} should not be directly instantiated" + ) self.dir_model = dir_model self.ftype = ftype self.fname_out = fname_out self.is_big_endian = is_big_endian - self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + self.endianess = ( + gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + ) self.use_temp_file = use_temp_file self.lazy = not eager - self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors") + self.part_names = Model.get_model_part_names( + self.dir_model, "model", ".safetensors" + ) self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: - self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin") + self.part_names = Model.get_model_part_names( + self.dir_model, "pytorch_model", ".bin" + ) self.hparams = Model.load_hparams(self.dir_model) - self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) + self.block_count = self.find_hparam( + ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] + ) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None self.metadata_override = metadata_override @@ -98,15 +131,27 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. 
_, first_tensor = next(self.get_tensors()) if first_tensor.dtype == torch.float16: - logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})") + logger.info( + f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})" + ) self.ftype = gguf.LlamaFileType.MOSTLY_F16 else: - logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") + logger.info( + f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})" + ) self.ftype = gguf.LlamaFileType.MOSTLY_BF16 # Configure GGUF Writer - self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, - split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) + self.gguf_writer = gguf.GGUFWriter( + path=None, + arch=gguf.MODEL_ARCH_NAMES[self.model_arch], + endianess=self.endianess, + use_temp_file=self.use_temp_file, + split_max_tensors=split_max_tensors, + split_max_size=split_max_size, + dry_run=dry_run, + small_first_shard=small_first_shard, + ) @classmethod def __init_subclass__(cls): @@ -131,7 +176,9 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: if len(self.part_names) > 1: self.tensor_names = set() - index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" + index_name = ( + "model.safetensors" if self.is_safetensors else "pytorch_model.bin" + ) index_name += ".index.json" logger.info(f"gguf: loading model weight map from '{index_name}'") with open(self.dir_model / index_name, "r", encoding="utf-8") as f: @@ -148,9 +195,20 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: ctx: ContextManager[Any] if self.is_safetensors: from safetensors import safe_open - ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) + + ctx = cast( + ContextManager[Any], + safe_open(self.dir_model / part_name, framework="pt", device="cpu"), + ) else: - ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) + ctx = contextlib.nullcontext( + torch.load( + str(self.dir_model / part_name), + map_location="cpu", + mmap=True, + weights_only=True, + ) + ) with ctx as model_part: tensor_names_from_parts.update(model_part.keys()) @@ -169,19 +227,38 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: yield name, data # only verify tensor name presence; it doesn't matter if they are not in the right files - if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: - raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}") + if ( + len( + sym_diff := tensor_names_from_parts.symmetric_difference( + self.tensor_names + ) + ) + > 0 + ): + raise ValueError( + f"Mismatch between weight map and model parts for tensor names: {sym_diff}" + ) - def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: + def format_tensor_name( + self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight" + ) -> str: if key not in gguf.MODEL_TENSORS[self.model_arch]: - raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}") + raise ValueError( + f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}" + ) name: str = gguf.TENSOR_NAMES[key] if "{bid}" in name: assert bid is not None name = name.format(bid=bid) return name + suffix - def 
match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool: + def match_model_tensor_name( + self, + name: str, + key: gguf.MODEL_TENSOR, + bid: int | None, + suffix: str = ".weight", + ) -> bool: if key not in gguf.MODEL_TENSORS[self.model_arch]: return False key_name: str = gguf.TENSOR_NAMES[key] @@ -194,7 +271,9 @@ def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | return False return name == (key_name + suffix) - def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: + def map_tensor_name( + self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias") + ) -> str: new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) if new_name is None: raise ValueError(f"Can not map tensor {name!r}") @@ -203,7 +282,11 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", " def set_gguf_parameters(self): self.gguf_writer.add_block_count(self.block_count) - if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: + if ( + n_ctx := self.find_hparam( + ["max_position_embeddings", "n_ctx"], optional=True + ) + ) is not None: self.gguf_writer.add_context_length(n_ctx) logger.info(f"gguf: context length = {n_ctx}") @@ -211,7 +294,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(n_embd) logger.info(f"gguf: embedding length = {n_embd}") - if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: + if ( + n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True) + ) is not None: self.gguf_writer.add_feed_forward_length(n_ff) logger.info(f"gguf: feed forward length = {n_ff}") @@ -229,7 +314,11 @@ def set_gguf_parameters(self): if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") - if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: + if ( + f_norm_eps := self.find_hparam( + ["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True + ) + ) is not None: self.gguf_writer.add_layer_norm_eps(f_norm_eps) logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") if (n_experts := self.hparams.get("num_local_experts")) is not None: @@ -246,27 +335,37 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) logger.info(f"gguf: file type = {self.ftype}") - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused return [(self.map_tensor_name(name), data_torch)] - def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + def extra_f32_tensors( + self, name: str, new_name: str, bid: int | None, n_dims: int + ) -> bool: del name, new_name, bid, n_dims # unused return False - def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + def extra_f16_tensors( + self, name: str, new_name: str, bid: int | None, n_dims: int + ) -> bool: del name, new_name, bid, n_dims # unused return False def prepare_tensors(self): - max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") + max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + 
len( + ".weight," + ) for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): + if name.endswith( + (".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq") + ): continue old_dtype = data_torch.dtype @@ -282,7 +381,10 @@ def prepare_tensors(self): bid = int(part) break - for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): + for new_name, data in ( + (n, d.squeeze().numpy()) + for n, d in self.modify_tensors(data_torch, name, bid) + ): data: np.ndarray # type hint n_dims = len(data.shape) data_dtype = data.dtype @@ -294,32 +396,48 @@ def prepare_tensors(self): # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors # Conditions should closely match those in llama_model_quantize_internal in llama.cpp - extra_f32 = any(cond for cond in ( - extra_f32, - n_dims == 1, - new_name.endswith("_norm.weight"), - )) + extra_f32 = any( + cond + for cond in ( + extra_f32, + n_dims == 1, + new_name.endswith("_norm.weight"), + ) + ) # Some tensor types are always in float32 - extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in ( - gguf.MODEL_TENSOR.FFN_GATE_INP, - gguf.MODEL_TENSOR.POS_EMBD, - gguf.MODEL_TENSOR.TOKEN_TYPES, - )) + extra_f32 = extra_f32 or any( + self.match_model_tensor_name(new_name, key, bid) + for key in ( + gguf.MODEL_TENSOR.FFN_GATE_INP, + gguf.MODEL_TENSOR.POS_EMBD, + gguf.MODEL_TENSOR.TOKEN_TYPES, + ) + ) # if f16 desired, convert any float32 2-dim weight tensors to float16 - extra_f16 = any(cond for cond in ( - extra_f16, - (name.endswith(".weight") and n_dims >= 2), - )) + extra_f16 = any( + cond + for cond in ( + extra_f16, + (name.endswith(".weight") and n_dims >= 2), + ) + ) - if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32: + if ( + self.ftype != gguf.LlamaFileType.ALL_F32 + and extra_f16 + and not extra_f32 + ): if self.ftype == gguf.LlamaFileType.MOSTLY_BF16: data = gguf.quantize_bf16(data) assert data.dtype == np.uint16 data_qtype = gguf.GGMLQuantizationType.BF16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data): + elif ( + self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 + and gguf.can_quantize_to_q8_0(data) + ): data = gguf.quantize_q8_0(data) assert data.dtype == np.uint8 data_qtype = gguf.GGMLQuantizationType.Q8_0 @@ -334,13 +452,19 @@ def prepare_tensors(self): data = data.astype(np.float32) data_qtype = gguf.GGMLQuantizationType.F32 - shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape + shape = ( + gguf.quant_shape_from_byte_shape(data.shape, data_qtype) + if data.dtype == np.uint8 + else data.shape + ) # reverse shape to make it similar to the internal ggml dimension order shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" # n_dims is implicit in the shape - logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + logger.info( + f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}" + ) self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) @@ -349,9 +473,16 @@ def set_type(self): def prepare_metadata(self, vocab_only: bool): - total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count() + ( + total_params, + shared_params, + expert_params, + expert_count, + ) = 
self.gguf_writer.get_total_parameter_count() - self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params) + self.metadata = gguf.Metadata.load( + self.metadata_override, self.dir_model_card, self.model_name, total_params + ) # Fallback to model directory name if metadata name is still missing if self.metadata.name is None: @@ -359,7 +490,9 @@ def prepare_metadata(self, vocab_only: bool): # Generate parameter weight class (useful for leader boards) if not yet determined if self.metadata.size_label is None and total_params > 0: - self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) + self.metadata.size_label = gguf.size_label( + total_params, shared_params, expert_params, expert_count + ) # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' output_type: str = self.ftype.name.partition("_")[2] @@ -368,9 +501,25 @@ def prepare_metadata(self, vocab_only: bool): if self.fname_out.is_dir(): # Generate default filename based on model specification and available metadata if not vocab_only: - fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) + fname_default: str = gguf.naming_convention( + self.metadata.name, + self.metadata.basename, + self.metadata.finetune, + self.metadata.version, + self.metadata.size_label, + output_type, + model_type="LoRA" if total_params < 0 else None, + ) else: - fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") + fname_default: str = gguf.naming_convention( + self.metadata.name, + self.metadata.basename, + self.metadata.finetune, + self.metadata.version, + size_label=None, + output_type=None, + model_type="vocab", + ) # Use the default filename self.fname_out = self.fname_out / f"{fname_default}.gguf" @@ -380,7 +529,9 @@ def prepare_metadata(self, vocab_only: bool): # file template strings as it doesn't actually exist as a file # Process templated file name with the output ftype, useful with the "auto" ftype - self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) + self.fname_out = self.fname_out.parent / gguf.fill_templated_filename( + self.fname_out.name, output_type + ) self.set_type() @@ -406,7 +557,7 @@ def write(self): def write_vocab(self): if len(self.gguf_writer.tensors) != 1: - raise ValueError('Splitting the vocabulary is not supported') + raise ValueError("Splitting the vocabulary is not supported") self.prepare_metadata(vocab_only=True) self.gguf_writer.write_header_to_file(path=self.fname_out) @@ -437,6 +588,7 @@ def func(modelcls: AnyModel) -> AnyModel: for name in names: cls._model_classes[name] = modelcls return modelcls + return func @classmethod @@ -444,7 +596,7 @@ def from_model_architecture(cls, arch: str) -> type[Model]: try: return cls._model_classes[arch] except KeyError: - raise NotImplementedError(f'Architecture {arch!r} not supported!') from None + raise NotImplementedError(f"Architecture {arch!r} not supported!") from None def does_token_look_special(self, token: str | bytes) -> bool: if isinstance(token, (bytes, bytearray)): @@ -458,14 +610,22 @@ def does_token_look_special(self, token: str | bytes) -> bool: # (e.g. 
command-r, command-r-plus, deepseek-coder, gemma{,-2})
         seems_special = token_text in (
             "<pad>",  # deepseek-coder
-            "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
+            "<mask>",
+            "<2mass>",
+            "[@BOS@]",  # gemma{,-2}
         )
 
-        seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
-        seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>"))  # deepseek-coder
+        seems_special = seems_special or (
+            token_text.startswith("<|") and token_text.endswith("|>")
+        )
+        seems_special = seems_special or (
+            token_text.startswith("<｜") and token_text.endswith("｜>")
+        )  # deepseek-coder
 
         # TODO: should these be marked as UNUSED instead? (maybe not)
-        seems_special = seems_special or (token_text.startswith("<unused") or token_text.startswith("<mask_"))  # gemma{,-2}
+        seems_special = seems_special or (
+            token_text.startswith("<unused") or token_text.startswith("<mask_")
+        )  # gemma{,-2}
 
         return seems_special
 
@@ -475,13 +635,16 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
+
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
         vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size
 
         tokpre = self.get_vocab_base_pre(tokenizer)
 
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
+        reverse_vocab = {
+            id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()
+        }
         added_vocab = tokenizer.get_added_vocab()
 
         for i in range(vocab_size):
@@ -491,10 +654,14 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
             else:
                 token: str = reverse_vocab[i]
                 if token in added_vocab:
-                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                    if tokenizer.added_tokens_decoder[
+                        i
+                    ].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:
-                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        token = token.replace(
+                            b"\xe2\x96\x81".decode("utf-8"), " "
+                        )  # pre-normalize user-defined spaces
                         toktypes.append(gguf.TokenType.USER_DEFINED)
                 else:
                     toktypes.append(gguf.TokenType.NORMAL)
@@ -512,7 +679,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
        # use in llama.cpp to implement the same pre-tokenizer

-        chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+        chktxt = "\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````\"\"\"\"......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? 
We'Ve a'lL" chktok = tokenizer.encode(chktxt) chkhsh = sha256(str(chktok).encode()).hexdigest() @@ -606,18 +773,32 @@ def get_vocab_base_pre(self, tokenizer) -> str: if res is None: logger.warning("\n") - logger.warning("**************************************************************************************") + logger.warning( + "**************************************************************************************" + ) logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") logger.warning("** There are 2 possible reasons for this:") - logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") - logger.warning("** - the pre-tokenization config has changed upstream") - logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") - logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning( + "** - the model has not been added to convert_hf_to_gguf_update.py yet" + ) + logger.warning( + "** - the pre-tokenization config has changed upstream" + ) + logger.warning( + "** Check your model files and convert_hf_to_gguf_update.py and update them accordingly." + ) + logger.warning( + "** ref: https://github.com/ggerganov/llama.cpp/pull/6920" + ) logger.warning("**") logger.warning(f"** chkhsh: {chkhsh}") - logger.warning("**************************************************************************************") + logger.warning( + "**************************************************************************************" + ) logger.warning("\n") - raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") + raise NotImplementedError( + "BPE pre-tokenizer was not recognized - update get_vocab_base_pre()" + ) logger.debug(f"tokenizer.ggml.pre: {repr(res)}") logger.debug(f"chkhsh: {chkhsh}") @@ -642,6 +823,7 @@ def _set_vocab_qwen(self): toktypes: list[int] = [] from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) vocab_size = hparams["vocab_size"] assert max(tokenizer.get_vocab().values()) < vocab_size @@ -657,11 +839,13 @@ def _set_vocab_qwen(self): continue merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) assert len(merged) == 2 - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + merges.append(" ".join(map(QwenModel.token_bytes_to_string, merged))) # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined added_vocab = tokenizer.special_tokens - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} + reverse_vocab = { + id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items() + } for i in range(vocab_size): if i not in reverse_vocab: @@ -683,10 +867,16 @@ def _set_vocab_qwen(self): special_vocab.merges = merges # only add special tokens when they were not already loaded from config.json if len(special_vocab.special_token_ids) == 0: - special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab._set_special_token( + "bos", tokenizer.special_tokens["<|endoftext|>"] + ) + special_vocab._set_special_token( + "eos", tokenizer.special_tokens["<|endoftext|>"] + ) # this one is usually not in config.json anyway - special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab._set_special_token( + "unk", 
tokenizer.special_tokens["<|endoftext|>"] + ) special_vocab.add_to_gguf(self.gguf_writer) def _set_vocab_sentencepiece(self, add_to_gguf=True): @@ -704,7 +894,7 @@ def _set_vocab_sentencepiece(self, add_to_gguf=True): def _create_vocab_sentencepiece(self): from sentencepiece import SentencePieceProcessor - tokenizer_path = self.dir_model / 'tokenizer.model' + tokenizer_path = self.dir_model / "tokenizer.model" if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") @@ -712,7 +902,7 @@ def _create_vocab_sentencepiece(self): tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size @@ -737,35 +927,43 @@ def _create_vocab_sentencepiece(self): scores[token_id] = score toktypes[token_id] = toktype - added_tokens_file = self.dir_model / 'added_tokens.json' + added_tokens_file = self.dir_model / "added_tokens.json" if added_tokens_file.is_file(): with open(added_tokens_file, "r", encoding="utf-8") as f: added_tokens_json = json.load(f) for key in added_tokens_json: token_id = added_tokens_json[key] if token_id >= vocab_size: - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + logger.warning( + f"ignore token {token_id}: id is out of range, max={vocab_size - 1}" + ) continue tokens[token_id] = key.encode("utf-8") scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + tokenizer_config_file = self.dir_model / "tokenizer_config.json" if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) + added_tokens_decoder = tokenizer_config_json.get( + "added_tokens_decoder", {} + ) for token_id, token_data in added_tokens_decoder.items(): token_id = int(token_id) token: str = token_data["content"] if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token.encode("utf-8"): - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}') + logger.warning( + f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}' + ) if token_data.get("special") or self.does_token_look_special(token): toktypes[token_id] = SentencePieceTokenTypes.CONTROL else: - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces + token = token.replace( + b"\xe2\x96\x81".decode("utf-8"), " " + ) # pre-normalize user-defined spaces toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED scores[token_id] = -1000.0 @@ -773,7 +971,9 @@ def _create_vocab_sentencepiece(self): if vocab_size > len(tokens): pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + logger.debug( + f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]" + ) for i in range(1, pad_count + 1): tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) scores.append(-1000.0) @@ -803,9 +1003,13 @@ def _set_vocab_llama_hf(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - def 
_set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): + def _set_vocab_builtin( + self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int + ): tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf" - logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + logger.warning( + f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'" + ) vocab_reader = gguf.GGUFReader(tokenizer_path, "r") default_pre = "mpt" if model_name == "gpt-neox" else "default" @@ -815,25 +1019,35 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8")) field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE) - self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre) + self.gguf_writer.add_tokenizer_pre( + bytes(field.parts[-1]).decode("utf-8") if field else default_pre + ) field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST) assert field # token list - self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) + self.gguf_writer.add_token_list( + [bytes(field.parts[i]) for i in field.data][:vocab_size] + ) if model_name == "llama-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) assert field # token scores - self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + self.gguf_writer.add_token_scores( + [field.parts[i].tolist()[0] for i in field.data][:vocab_size] + ) field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) assert field # token types - self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + self.gguf_writer.add_token_types( + [field.parts[i].tolist()[0] for i in field.data][:vocab_size] + ) if model_name != "llama-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) assert field # token merges - self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) + self.gguf_writer.add_token_merges( + [bytes(field.parts[i]) for i in field.data] + ) if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None: self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) @@ -861,13 +1075,20 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_rope_dimension_count( - int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), + int( + self.hparams["rotary_pct"] + * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + ), ) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + self.gguf_writer.add_parallel_residual( + self.hparams.get("use_parallel_residual", True) + ) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) @@ -922,13 +1143,15 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) 
self.gguf_writer.add_file_type(self.ftype) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - name = re.sub(r'transformer\.', '', name) + name = re.sub(r"transformer\.", "", name) tensors: list[tuple[str, Tensor]] = [] @@ -964,8 +1187,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter assert self.tensor_names is not None # TODO: tie them at runtime, don't duplicate in the model file - if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) + if all( + s not in self.tensor_names for s in ("lm_head.weight", "output.weight") + ): + tensors.append( + (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch) + ) return tensors @@ -998,15 +1225,21 @@ def set_gguf_parameters(self): if self.hparams["attn_config"]["clip_qkv"] is not None: self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) if self.hparams["attn_config"]["alibi"]: - self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + self.gguf_writer.add_max_alibi_bias( + self.hparams["attn_config"]["alibi_bias_max"] + ) else: self.gguf_writer.add_max_alibi_bias(0.0) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused if "scales" in name: - new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales")) + new_name = self.map_tensor_name( + name, try_suffixes=(".weight", ".bias", ".scales") + ) new_name = new_name.replace("scales", "act.scales") else: new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias")) @@ -1076,18 +1309,27 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_rope_dimension_count( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if ( + self.hparams.get("rope_scaling") is not None + and "factor" in self.hparams["rope_scaling"] + ): if self.hparams["rope_scaling"].get("type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_factor( + self.hparams["rope_scaling"]["factor"] + ) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: head_count = 
self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) @@ -1096,37 +1338,57 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight": logger.info(f"Unpacking and permuting layer {bid}") tensors = [ - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), - self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), - self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), - self._reverse_hf_part(data_torch, 2)), + ( + self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), + self._reverse_hf_permute_part( + data_torch, 0, head_count, head_count + ), + ), + ( + self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), + self._reverse_hf_permute_part( + data_torch, 1, head_count, head_count_kv + ), + ), + ( + self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), + self._reverse_hf_part(data_torch, 2), + ), ] else: tensors = [(self.map_tensor_name(name), data_torch)] return tensors - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + def _reverse_hf_permute( + self, weights: Tensor, n_head: int, n_kv_head: int | None = None + ) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) .swapaxes(1, 2) .reshape(weights.shape) ) def _reverse_hf_permute_part( - self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, + self, + weights: Tensor, + n_part: int, + n_head: int, + n_head_kv: int | None = None, ) -> Tensor: r = weights.shape[0] // 3 - return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) + return self._reverse_hf_permute( + weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv + ) def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: r = weights.shape[0] // 3 - return weights[r * n_part:r * n_part + r, ...] + return weights[r * n_part : r * n_part + r, ...] 
@Model.register("XverseForCausalLM") @@ -1142,6 +1404,7 @@ def set_vocab(self): toktypes: list[int] = [] from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size, @@ -1150,16 +1413,18 @@ def set_vocab(self): if max_vocab_index >= vocab_size: raise ValueError("Vocabulary size exceeds expected maximum size.") - reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} + reverse_vocab: dict[int, str] = { + id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items() + } added_vocab = tokenizer.get_added_vocab() for token_id in range(vocab_size): - token_text = reverse_vocab[token_id].encode('utf-8') + token_text = reverse_vocab[token_id].encode("utf-8") # replace "\x00" to string with length > 0 if token_text == b"\x00": toktype = gguf.TokenType.BYTE # special - token_text = f"<{token_text}>".encode('utf-8') - elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): + token_text = f"<{token_text}>".encode("utf-8") + elif re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text): toktype = gguf.TokenType.BYTE # special elif reverse_vocab[token_id] in added_vocab: if tokenizer.added_tokens_decoder[token_id].special: @@ -1200,18 +1465,27 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_rope_dimension_count( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if ( + self.hparams.get("rope_scaling") is not None + and "factor" in self.hparams["rope_scaling"] + ): if self.hparams["rope_scaling"].get("type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_factor( + self.hparams["rope_scaling"]["factor"] + ) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused head_count = self.hparams["num_attention_heads"] @@ -1225,12 +1499,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + def _reverse_hf_permute( + self, weights: Tensor, n_head: int, n_kv_head: int | None = None + ) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) .swapaxes(1, 2) .reshape(weights.shape) ) @@ -1263,7 +1541,9 @@ def set_gguf_parameters(self): 
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused # QKV tensor transform @@ -1278,10 +1558,14 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if "query_key_value" in name: n_head = self.find_hparam(["num_attention_heads", "n_head"]) - n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1 + n_head_kv = ( + self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1 + ) head_dim = self.hparams["hidden_size"] // n_head - qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) + qkv = data_torch.view( + n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head + ) q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) @@ -1315,8 +1599,11 @@ def set_vocab(self): super().set_vocab() # TODO: how to determine special FIM tokens automatically? - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'eot']) + special_vocab = gguf.SpecialVocab( + self.dir_model, + load_merges=False, + special_token_types=["prefix", "suffix", "middle", "eot"], + ) special_vocab._set_special_token("prefix", 1) special_vocab._set_special_token("suffix", 3) special_vocab._set_special_token("middle", 2) @@ -1343,7 +1630,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: hidden_dim = self.hparams["n_embd"] inner_dim = 4 * hidden_dim hidden_dim = int(2 * inner_dim / 3) @@ -1357,13 +1646,35 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if bid is not None: if name == f"transformer.h.{bid}.attn.kv.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch[:n_head_kv * head_dim])) - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), data_torch[n_head_kv * head_dim:])) + tensors.append( + ( + self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), + data_torch[: n_head_kv * head_dim], + ) + ) + tensors.append( + ( + self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), + data_torch[n_head_kv * head_dim :], + ) + ) elif name == f"transformer.h.{bid}.attn.q.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch)) + tensors.append( + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch) + ) elif name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])) - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])) + tensors.append( + ( + self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), + data_torch[:ff_dim], + ) + ) + tensors.append( + ( + self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), + data_torch[ff_dim:], + ) 
+ ) if len(tensors) == 0: tensors.append((self.map_tensor_name(name), data_torch)) @@ -1371,7 +1682,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors -@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") +@Model.register( + "StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM" +) class StableLMModel(Model): model_arch = gguf.MODEL_ARCH.STABLELM @@ -1391,17 +1704,30 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) - self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + self.gguf_writer.add_rope_dimension_count( + int( + rotary_factor + * (hparams["hidden_size"] // hparams["num_attention_heads"]) + ) + ) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) - self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) - self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) + self.gguf_writer.add_parallel_residual( + hparams["use_parallel_residual"] + if "use_parallel_residual" in hparams + else True + ) + self.gguf_writer.add_layer_norm_eps( + self.find_hparam(["layer_norm_eps", "norm_eps"]) + ) self.gguf_writer.add_file_type(self.ftype) _q_norms: list[dict[str, Tensor]] | None = None _k_norms: list[dict[str, Tensor]] | None = None - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams["num_key_value_heads"] @@ -1414,7 +1740,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._q_norms[bid][name] = data_torch if len(self._q_norms[bid]) >= n_head: - return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") + return self._stack_qk_norm( + bid, n_head, self._q_norms[bid], "q_layernorm" + ) else: return [] @@ -1427,13 +1755,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._k_norms[bid][name] = data_torch if len(self._k_norms[bid]) >= n_kv_head: - return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") + return self._stack_qk_norm( + bid, n_kv_head, self._k_norms[bid], "k_layernorm" + ) else: return [] return [(self.map_tensor_name(name), data_torch)] - def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): + def _stack_qk_norm( + self, + bid: int, + n_head: int, + norms: dict[str, Tensor], + layer_name: str = "q_layernorm", + ): datas: list[Tensor] = [] # extract the norms in order for xid in range(n_head): @@ -1453,9 +1789,13 @@ def prepare_tensors(self): if self._q_norms is not None or self._k_norms is not None: # flatten two `list[dict[str, Tensor]]` into a single `list[str]` norms = ( - [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else [] + [k for d in self._q_norms for k in d.keys()] + if self._q_norms is not None + else [] ) + ( - [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else [] + [k for d in 
self._k_norms for k in d.keys()] + if self._k_norms is not None + else [] ) if len(norms) > 0: raise ValueError(f"Unprocessed norms: {norms}") @@ -1478,13 +1818,14 @@ def set_vocab(self): # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) if self.hparams.get("vocab_size", 32000) == 32016: special_vocab = gguf.SpecialVocab( - self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'eot'] + self.dir_model, + load_merges=False, + special_token_types=["prefix", "suffix", "middle", "eot"], ) special_vocab._set_special_token("prefix", 32007) special_vocab._set_special_token("suffix", 32008) special_vocab._set_special_token("middle", 32009) - special_vocab._set_special_token("eot", 32010) + special_vocab._set_special_token("eot", 32010) special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): @@ -1498,17 +1839,24 @@ def set_gguf_parameters(self): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if ( + self.hparams.get("rope_scaling") is not None + and "factor" in self.hparams["rope_scaling"] + ): if self.hparams["rope_scaling"].get("type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_factor( + self.hparams["rope_scaling"]["factor"] + ) - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + tokenizer_config_file = self.dir_model / "tokenizer_config.json" if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: tokenizer_config_json = json.load(f) if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + self.gguf_writer.add_add_space_prefix( + tokenizer_config_json["add_prefix_space"] + ) # Apply to granite small models only if self.hparams.get("vocab_size", 32000) == 49152: @@ -1518,13 +1866,19 @@ def set_gguf_parameters(self): def permute(weights: Tensor, n_head: int, n_head_kv: int | None): if n_head_kv is not None and n_head != n_head_kv: n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) + return ( + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) + .swapaxes(1, 2) + .reshape(weights.shape) + ) _experts: list[dict[str, Tensor]] | None = None - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") @@ -1571,15 +1925,19 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter def prepare_tensors(self): if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": + if rope_scaling.get("rope_type", "").lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + freqs = 1.0 / ( + base ** (torch.arange(0, dim, 2, 
dtype=torch.float32) / dim) + ) factor = rope_scaling.get("factor", 8.0) low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + old_context_len = self.hparams.get( + "original_max_position_embeddings", 8192 + ) low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor @@ -1593,10 +1951,15 @@ def prepare_tensors(self): elif wavelen > low_freq_wavelen: rope_factors.append(factor) else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + smooth = (old_context_len / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor + ) rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32)) + self.gguf_writer.add_tensor( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), + np.array(rope_factors, dtype=np.float32), + ) super().prepare_tensors() @@ -1629,18 +1992,23 @@ def weight_quant(self, weight): weight = torch.sign(weight).type(dtype) return weight.type(dtype), scale.type(torch.float32) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: new_name = self.map_tensor_name(name) - if any(self.match_model_tensor_name(new_name, key, bid) for key in [ - gguf.MODEL_TENSOR.ATTN_Q, - gguf.MODEL_TENSOR.ATTN_K, - gguf.MODEL_TENSOR.ATTN_V, - gguf.MODEL_TENSOR.ATTN_OUT, - gguf.MODEL_TENSOR.FFN_UP, - gguf.MODEL_TENSOR.FFN_DOWN, - gguf.MODEL_TENSOR.FFN_GATE, - ]): + if any( + self.match_model_tensor_name(new_name, key, bid) + for key in [ + gguf.MODEL_TENSOR.ATTN_Q, + gguf.MODEL_TENSOR.ATTN_K, + gguf.MODEL_TENSOR.ATTN_V, + gguf.MODEL_TENSOR.ATTN_OUT, + gguf.MODEL_TENSOR.FFN_UP, + gguf.MODEL_TENSOR.FFN_DOWN, + gguf.MODEL_TENSOR.FFN_GATE, + ] + ): # transform weight into 1/0/-1 (in fp32) weight_torch, scale_torch = self.weight_quant(data_torch) yield (new_name, weight_torch) @@ -1664,7 +2032,9 @@ def set_gguf_parameters(self): _experts: list[dict[str, Tensor]] | None = None - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: # process the experts separately if name.find(".moe.") != -1: n_experts = self.hparams["num_local_experts"] @@ -1684,7 +2054,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter datas: list[Tensor] = [] for xid in range(n_experts): - ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight" + ename = ( + f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight" + ) datas.append(self._experts[bid][ename]) del self._experts[bid][ename] @@ -1730,7 +2102,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) logger.info(f"gguf: file type = {self.ftype}") - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused n_expert = self.hparams["ffn_config"]["moe_num_experts"] @@ -1742,9 +2116,15 @@ def modify_tensors(self, data_torch: Tensor, name: str, 
bid: int | None) -> Iter # But llama.cpp moe graph works differently # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor - exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} - "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} - "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + exp_tensor_names = { + "ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + "ffn.experts.mlp.w2": ( + 0, + 2, + 1, + ), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} + "ffn.experts.mlp.v1": None, + } # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} experts = False for exp_tensor_name in exp_tensor_names.keys(): @@ -1761,11 +2141,15 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # Every other model has the weight names ending in .weight, # let's assume that is the convention which is not the case for dbrx: # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 - new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) + new_name = self.map_tensor_name( + name if not experts else name + ".weight", try_suffixes=(".weight",) + ) return [(new_name, data_torch)] - def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + def extra_f16_tensors( + self, name: str, new_name: str, bid: int | None, n_dims: int + ) -> bool: del name, new_name, bid # unused return n_dims > 1 @@ -1781,7 +2165,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_rope_dimension_count( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) @@ -1790,17 +2176,23 @@ def set_gguf_parameters(self): def set_vocab(self): self._set_vocab_llama_hf() - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + def _reverse_hf_permute( + self, weights: Tensor, n_head: int, n_kv_head: int | None = None + ) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) .swapaxes(1, 2) .reshape(weights.shape) ) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused n_head = self.hparams["num_attention_heads"] @@ -1822,11 +2214,14 @@ class QwenModel(Model): @staticmethod def token_bytes_to_string(b): from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode + byte_encoder = bytes_to_unicode() - 
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) + return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")]) @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: + def bpe( + mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None + ) -> list[bytes]: parts = [bytes([b]) for b in token] while True: min_idx = None @@ -1839,7 +2234,11 @@ def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = if min_rank is None or (max_rank is not None and min_rank >= max_rank): break assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] + parts = ( + parts[:min_idx] + + [parts[min_idx] + parts[min_idx + 1]] + + parts[min_idx + 2 :] + ) return parts def set_vocab(self): @@ -1851,7 +2250,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_rope_dimension_count( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) @@ -1876,16 +2277,28 @@ def set_gguf_parameters(self): super().set_gguf_parameters() if (n_experts := self.hparams.get("num_experts")) is not None: self.gguf_writer.add_expert_count(n_experts) - if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + if ( + moe_intermediate_size := self.hparams.get("moe_intermediate_size") + ) is not None: self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") - if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None: - self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size) - logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}") + if ( + shared_expert_intermediate_size := self.hparams.get( + "shared_expert_intermediate_size" + ) + ) is not None: + self.gguf_writer.add_expert_shared_feed_forward_length( + shared_expert_intermediate_size + ) + logger.info( + f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}" + ) _experts: list[dict[str, Tensor]] | None = None - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: # process the experts separately if name.find("experts") != -1: n_experts = self.hparams["num_experts"] @@ -1944,7 +2357,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused tensors: list[tuple[str, Tensor]] = [] @@ -1953,7 +2368,9 @@ def modify_tensors(self, 
data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith((".attn.bias", ".attn.masked_bias")): return tensors - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): + if name.endswith( + (".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight") + ): data_torch = data_torch.transpose(1, 0) new_name = self.map_tensor_name(name) @@ -1962,7 +2379,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # note: GPT2 output is tied to (same as) wte in original model if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) + tensors.append( + (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch) + ) return tensors @@ -1978,14 +2397,18 @@ def set_gguf_parameters(self): n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) + self.gguf_writer.add_context_length( + self.find_hparam(["n_positions", "max_position_embeddings"]) + ) self.gguf_writer.add_embedding_length(n_embd) self.gguf_writer.add_feed_forward_length(4 * n_embd) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) + self.gguf_writer.add_layer_norm_eps( + self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]) + ) self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_add_bos_token(False) @@ -1998,15 +2421,15 @@ class Phi3MiniModel(Model): def set_vocab(self): from sentencepiece import SentencePieceProcessor - tokenizer_path = self.dir_model / 'tokenizer.model' + tokenizer_path = self.dir_model / "tokenizer.model" if not tokenizer_path.is_file(): - raise ValueError(f'Error: Missing {tokenizer_path}') + raise ValueError(f"Error: Missing {tokenizer_path}") tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size @@ -2032,7 +2455,7 @@ def set_vocab(self): scores[token_id] = score toktypes[token_id] = toktype - added_tokens_file = self.dir_model / 'added_tokens.json' + added_tokens_file = self.dir_model / "added_tokens.json" if added_tokens_file.is_file(): with open(added_tokens_file, "r", encoding="utf-8") as f: added_tokens_json = json.load(f) @@ -2040,31 +2463,37 @@ def set_vocab(self): for key in added_tokens_json: token_id = added_tokens_json[key] if token_id >= vocab_size: - logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + logger.debug( + f"ignore token {token_id}: id is out of range, max={vocab_size - 1}" + ) continue tokens[token_id] = key.encode("utf-8") scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + tokenizer_config_file = self.dir_model / "tokenizer_config.json" if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: tokenizer_config_json = json.load(f) - 
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) + added_tokens_decoder = tokenizer_config_json.get( + "added_tokens_decoder", {} + ) for token_id, foken_data in added_tokens_decoder.items(): token_id = int(token_id) token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') + logger.warning( + f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' + ) tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED if foken_data.get("special"): toktypes[token_id] = SentencePieceTokenTypes.CONTROL - tokenizer_file = self.dir_model / 'tokenizer.json' + tokenizer_file = self.dir_model / "tokenizer.json" if tokenizer_file.is_file(): with open(tokenizer_file, "r", encoding="utf-8") as f: tokenizer_json = json.load(f) @@ -2074,7 +2503,9 @@ def set_vocab(self): token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') + logger.warning( + f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' + ) tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2104,7 +2535,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_context_length(max_pos_embds) self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_feed_forward_length( + self.find_hparam(["intermediate_size"]) + ) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count_kv(n_head_kv) @@ -2115,36 +2548,55 @@ def set_gguf_parameters(self): self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"])) # write rope scaling for long context (128k) model - rope_scaling = self.find_hparam(['rope_scaling'], True) + rope_scaling = self.find_hparam(["rope_scaling"], True) if rope_scaling is None: return scale = max_pos_embds / orig_max_pos_embds - rope_scaling_type = rope_scaling.get('type', '').lower() + rope_scaling_type = rope_scaling.get("type", "").lower() if len(rope_scaling_type) == 0: - raise KeyError('Missing the required key rope_scaling.type') + raise KeyError("Missing the required key rope_scaling.type") - if rope_scaling_type == 'su' or rope_scaling_type == 'longrope': - attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 - elif rope_scaling_type == 'yarn': + if rope_scaling_type == "su" or rope_scaling_type == "longrope": + attn_factor = ( + math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) + if scale > 1.0 + else 1.0 + ) + elif rope_scaling_type == "yarn": attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 else: - raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet') + raise NotImplementedError( + f"The rope scaling type {rope_scaling_type} is not supported yet" + ) self.gguf_writer.add_rope_scaling_attn_factors(attn_factor) - long_factors = rope_scaling.get('long_factor', None) - short_factors = 
rope_scaling.get('short_factor', None) + long_factors = rope_scaling.get("long_factor", None) + short_factors = rope_scaling.get("short_factor", None) if long_factors is None or short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + raise KeyError( + "Missing the required key rope_scaling.long_factor or rope_scaling.short_factor" + ) - if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + if ( + len(long_factors) != len(short_factors) + or len(long_factors) != rope_dims / 2 + ): + raise ValueError( + f"The length of rope long and short factors must be {rope_dims / 2}" + ) - self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) - self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) + self.gguf_writer.add_tensor( + gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", + np.array(long_factors, dtype=np.float32), + ) + self.gguf_writer.add_tensor( + gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", + np.array(short_factors, dtype=np.float32), + ) @@ -2163,7 +2615,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong + self.gguf_writer.add_head_count_kv( + 5 + ) # hparams["num_key_value_heads"]) is wrong self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) @@ -2181,7 +2635,9 @@ def shuffle_attn_output_weight(self, data_torch): data_torch = torch.reshape(data_torch, (5120, 5120)) return data_torch - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused new_name = self.map_tensor_name(name) @@ -2214,7 +2670,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(1.0) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused new_name = self.map_tensor_name(name) @@ -2224,9 +2682,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): assert self.tensor_names is not None - if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): + if all( + s not in self.tensor_names for s in ("lm_head.weight", "output.weight") + ): # copy tok_embd.weight to output.weight - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) + tensors.append( + (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch) + ) return tensors @@ -2243,24 +2705,26 @@ def set_vocab(self): from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model - tokenizer_path =
self.dir_model / 'tokenizer.model' + tokenizer_path = self.dir_model / "tokenizer.model" tokens: list[bytes] = [] scores: list[float] = [] toktypes: list[int] = [] if not tokenizer_path.is_file(): - logger.error(f'Error: Missing {tokenizer_path}') + logger.error(f"Error: Missing {tokenizer_path}") sys.exit(1) - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = ( + model.ModelProto() + ) # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) for token_id in range(vocab_size): piece = tokenizer.IdToPiece(token_id) @@ -2282,14 +2746,14 @@ def set_vocab(self): elif tokenizer.IsByte(token_id): toktype = SentencePieceTokenTypes.BYTE # take care of ununsed raw token - if piece.startswith('[UNUSED'): + if piece.startswith("[UNUSED"): toktype = SentencePieceTokenTypes.UNUSED tokens.append(text) scores.append(score) toktypes.append(toktype) - added_tokens_file = self.dir_model / 'added_tokens.json' + added_tokens_file = self.dir_model / "added_tokens.json" if added_tokens_file.is_file(): with open(added_tokens_file, "r", encoding="utf-8") as f: added_tokens_json = json.load(f) @@ -2299,14 +2763,16 @@ def set_vocab(self): scores.append(-1000.0) toktypes.append(SentencePieceTokenTypes.USER_DEFINED) - chat_eos_token = '<|im_end|>' + chat_eos_token = "<|im_end|>" chat_eos_token_id = None - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + tokenizer_config_file = self.dir_model / "tokenizer_config.json" if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) + added_tokens_decoder = tokenizer_config_json.get( + "added_tokens_decoder", {} + ) for token_id, foken_data in added_tokens_decoder.items(): token_id = int(token_id) token = foken_data["content"] @@ -2315,14 +2781,16 @@ def set_vocab(self): token = token.encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') + logger.warning( + f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' + ) tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED if foken_data.get("special"): toktypes[token_id] = SentencePieceTokenTypes.CONTROL - tokenizer_file = self.dir_model / 'tokenizer.json' + tokenizer_file = self.dir_model / "tokenizer.json" if tokenizer_file.is_file(): with open(tokenizer_file, "r", encoding="utf-8") as f: tokenizer_json = json.load(f) @@ -2335,7 +2803,9 @@ def set_vocab(self): token = token.encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') + logger.warning( + f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' + ) tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = 
SentencePieceTokenTypes.USER_DEFINED @@ -2356,8 +2826,10 @@ def set_vocab(self): # TODO: this is a hack, should be fixed # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 special_vocab.special_token_ids["eos"] = chat_eos_token_id - logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" - " in chat mode so that the conversation can end normally.") + logger.warning( + f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" + " in chat mode so that the conversation can end normally." + ) special_vocab.add_to_gguf(self.gguf_writer) @@ -2371,12 +2843,19 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) self.gguf_writer.add_file_type(self.ftype) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if ( + self.hparams.get("rope_scaling") is not None + and "factor" in self.hparams["rope_scaling"] + ): if self.hparams["rope_scaling"].get("type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_factor( + self.hparams["rope_scaling"]["factor"] + ) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: num_heads = self.hparams["num_attention_heads"] num_kv_heads = self.hparams["num_key_value_heads"] n_embd = self.hparams["hidden_size"] @@ -2388,11 +2867,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter qkv = data_torch qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd)) - q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1] + q, k, v = qkv[:, :q_per_kv], qkv[:, -2], qkv[:, -1] # The model weights of q and k equire additional reshape. 
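# --- Editor's note: illustrative sketch, not part of the patch. A minimal example
# --- (hypothetical toy sizes) of the grouped-QKV split and the permute applied to
# --- q/k in the surrounding hunk: heads are stored per KV group as
# --- [q0..q(q_per_kv-1), k, v], and the permute converts HF's interleaved rotary
# --- pairs into the half-split layout that GGUF expects.
import torch

def permute(weights: torch.Tensor, n_head: int, n_head_kv: int | None) -> torch.Tensor:
    # same transform as LlamaModel.permute used below
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (
        weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
        .swapaxes(1, 2)
        .reshape(weights.shape)
    )

num_heads, num_kv_heads, head_dim = 8, 2, 4  # toy values
n_embd = num_heads * head_dim
q_per_kv = num_heads // num_kv_heads
qkv = torch.randn((num_heads + 2 * num_kv_heads) * head_dim, n_embd)
qkv = qkv.reshape((num_kv_heads, q_per_kv + 2, head_dim, n_embd))
q, k, v = qkv[:, :q_per_kv], qkv[:, -2], qkv[:, -1]
q = permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
k = permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
assert q.shape == (num_heads * head_dim, n_embd)
assert k.shape == (num_kv_heads * head_dim, n_embd)
# --- end of editor's note ---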
q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) - k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) + k = LlamaModel.permute( + k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads + ) v = v.reshape((-1, v.shape[-1])) return [ @@ -2429,7 +2910,9 @@ def set_gguf_parameters(self): # get pooling type if pooling_path is not None: - with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: + with open( + self.dir_model / pooling_path / "config.json", encoding="utf-8" + ) as f: pooling = json.load(f) if pooling["pooling_mode_mean_tokens"]: pooling_type = gguf.PoolingType.MEAN @@ -2454,6 +2937,7 @@ def phantom(tok): if tok.startswith("##"): return tok[2:] return "\u2581" + tok + tokens = list(map(phantom, tokens)) # add vocab to gguf @@ -2466,12 +2950,18 @@ def phantom(tok): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused # we are only using BERT for embeddings so we don't need the pooling layer - if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): - return [] # we don't need these + if name in ( + "embeddings.position_ids", + "pooler.dense.weight", + "pooler.dense.bias", + ): + return [] # we don't need these return [(self.map_tensor_name(name), data_torch)] @@ -2514,13 +3004,16 @@ def set_vocab(self): self._set_vocab_sentencepiece() # TODO: these special tokens should be exported only for the CodeGemma family - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) + special_vocab = gguf.SpecialVocab( + self.dir_model, + load_merges=False, + special_token_types=["prefix", "suffix", "middle", "fsep", "eot"], + ) special_vocab._set_special_token("prefix", 67) special_vocab._set_special_token("suffix", 69) special_vocab._set_special_token("middle", 68) - special_vocab._set_special_token("fsep", 70) - special_vocab._set_special_token("eot", 107) + special_vocab._set_special_token("fsep", 70) + special_vocab._set_special_token("eot", 107) special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) @@ -2535,19 +3028,27 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv( + self.hparams["num_key_value_heads"] + if "num_key_value_heads" in hparams + else hparams["num_attention_heads"] + ) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_key_length(hparams["head_dim"]) self.gguf_writer.add_value_length(hparams["head_dim"]) self.gguf_writer.add_file_type(self.ftype) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused # lm_head is not used in llama.cpp, while autoawq will include this tensor in model # To 
prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": - logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + logger.debug( + f"Skipping get tensor {name!r} in safetensors so that convert can end normally." + ) return [] # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 @@ -2575,7 +3076,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv( + self.hparams["num_key_value_heads"] + if "num_key_value_heads" in hparams + else hparams["num_attention_heads"] + ) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_key_length(hparams["head_dim"]) self.gguf_writer.add_value_length(hparams["head_dim"]) @@ -2588,13 +3093,17 @@ def set_gguf_parameters(self): ) self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused # lm_head is not used in llama.cpp, while autoawq will include this tensor in model # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": - logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + logger.debug( + f"Skipping get tensor {name!r} in safetensors so that convert can end normally." 
+ ) return [] # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 @@ -2631,23 +3140,37 @@ def set_vocab(self): self._set_vocab_builtin("gpt-neox", vocab_size) def set_gguf_parameters(self): - d_model = self.find_hparam(["hidden_size", "d_model"]) - d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 - d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model - d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 + d_model = self.find_hparam(["hidden_size", "d_model"]) + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_inner = ( + self.find_hparam(["intermediate_size", "d_inner"], optional=True) + or 2 * d_model + ) + d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 # ceiling division # ref: https://stackoverflow.com/a/17511341/22827863 # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 - dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) - rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 + dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -( + d_model // -16 + ) + rms_norm_eps = ( + self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) + or 1e-5 + ) # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model - self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default + self.gguf_writer.add_context_length( + 2**20 + ) # arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading - self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading + self.gguf_writer.add_feed_forward_length( + 0 + ) # unused, but seemingly required when loading + self.gguf_writer.add_head_count( + 0 + ) # unused, but seemingly required when loading self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_ssm_conv_kernel(d_conv) self.gguf_writer.add_ssm_inner_size(d_inner) @@ -2658,7 +3181,9 @@ def set_gguf_parameters(self): _tok_embd = None - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) @@ -2673,18 +3198,25 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # assuming token_embd.weight is seen before output.weight if self._tok_embd is not None and new_name == output_name: if torch.equal(self._tok_embd, data_torch): - logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") + logger.debug( + f"{output_name} is equivalent to {tok_embd_name}, omitting" + ) return [] elif new_name == tok_embd_name: self._tok_embd = data_torch return [(new_name, data_torch)] - def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + def extra_f32_tensors( + self, name: str, new_name: str, bid: int | None, n_dims: int + ) -> bool: del n_dims # unused return bid is not None and new_name in ( - self.format_tensor_name(n, bid, 
".weight" if name.endswith(".weight") else "") for n in [ + self.format_tensor_name( + n, bid, ".weight" if name.endswith(".weight") else "" + ) + for n in [ gguf.MODEL_TENSOR.SSM_CONV1D, gguf.MODEL_TENSOR.SSM_X, gguf.MODEL_TENSOR.SSM_DT, @@ -2704,7 +3236,9 @@ def __init__(self, *args, **kwargs): # max_position_embeddings = 8192 in config.json but model was actually # trained on 128k context length # aya-23 models don't have model_max_length specified - self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"]) + self.hparams["max_position_embeddings"] = self.find_hparam( + ["model_max_length", "max_position_embeddings"] + ) def set_gguf_parameters(self): super().set_gguf_parameters() @@ -2726,7 +3260,9 @@ def set_gguf_parameters(self): # Same as super class, but permuting q_proj, k_proj # Copied from: LlamaModel - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused n_head = self.hparams["num_attention_heads"] @@ -2750,13 +3286,13 @@ def __init__(self, *args, **kwargs): def get_tensors(self): for name, data in super().get_tensors(): - if 'gated_layer' in name: - d1 = data[:self.intermediate_size, :] - name1 = name.replace('gated_layers', 'gated_layers_w') - name1 = name1.replace('up_gated_layer', 'gated_layers_v') - d2 = data[self.intermediate_size:, :] - name2 = name.replace('gated_layers', 'gated_layers_v') - name2 = name2.replace('up_gated_layer', 'gated_layers_w') + if "gated_layer" in name: + d1 = data[: self.intermediate_size, :] + name1 = name.replace("gated_layers", "gated_layers_w") + name1 = name1.replace("up_gated_layer", "gated_layers_v") + d2 = data[self.intermediate_size :, :] + name2 = name.replace("gated_layers", "gated_layers_v") + name2 = name2.replace("up_gated_layer", "gated_layers_w") yield name1, d1 yield name2, d2 continue @@ -2764,17 +3300,19 @@ def get_tensors(self): yield name, data def set_vocab(self): - tokenizer_class = 'BertTokenizer' + tokenizer_class = "BertTokenizer" with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: - tokenizer_class = json.load(f)['tokenizer_class'] + tokenizer_class = json.load(f)["tokenizer_class"] - if tokenizer_class == 'BertTokenizer': + if tokenizer_class == "BertTokenizer": super().set_vocab() - elif tokenizer_class == 'RobertaTokenizer': + elif tokenizer_class == "RobertaTokenizer": self._set_vocab_gpt2() self.gguf_writer.add_token_type_count(2) else: - raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') + raise NotImplementedError( + f"Tokenizer {tokenizer_class} is not supported for JinaBertModel" + ) self.gguf_writer.add_add_bos_token(True) self.gguf_writer.add_add_eos_token(True) @@ -2804,8 +3342,12 @@ def __init__(self, *args, **kwargs): OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor) for multiplier in ffn_multipliers ] - assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) - assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int) + assert isinstance(self._num_kv_heads, list) and isinstance( + self._num_kv_heads[0], int + ) + assert isinstance(self._num_query_heads, list) and isinstance( + self._num_query_heads[0], int + ) # Uses the tokenizer from meta-llama/Llama-2-7b-hf def set_vocab(self): @@ -2842,13 +3384,21 @@ def 
find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: return super().find_hparam(keys, optional) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: # split ff if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight": ff_dim = self._ffn_dims[bid] - yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim]) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:]) + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), + data_torch[:ff_dim], + ) + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), + data_torch[ff_dim:], + ) return yield (self.map_tensor_name(name), data_torch) @@ -2864,17 +3414,17 @@ def set_vocab(self): # tokenizer.model and used them as BOS and EOS instead of adding new tokens. from sentencepiece import SentencePieceProcessor - tokenizer_path = self.dir_model / 'tokenizer.model' + tokenizer_path = self.dir_model / "tokenizer.model" if not tokenizer_path.is_file(): - logger.error(f'Error: Missing {tokenizer_path}') + logger.error(f"Error: Missing {tokenizer_path}") sys.exit(1) # Read the whole vocabulary from the tokenizer.model file tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size @@ -2902,7 +3452,7 @@ def set_vocab(self): # Use the added_tokens_decoder field from tokeniser_config.json as the source # of information about added/redefined tokens and modify them accordingly. 
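# --- Editor's note: illustrative sketch, not part of the patch. The shape of the
# --- added_tokens_decoder mapping consumed below: keys are token ids serialized as
# --- strings, and each entry carries the token text plus flags. The sample entry
# --- here is hypothetical.
import json

sample = json.loads(
    '{"added_tokens_decoder": {"32002": {"content": "<|im_end|>", "special": true}}}'
)
for token_id, entry in sample["added_tokens_decoder"].items():
    token_id = int(token_id)                 # ids arrive as strings
    content = entry["content"].encode("utf-8")
    is_control = bool(entry.get("special"))  # special tokens become CONTROL type
    print(token_id, content, is_control)
# --- end of editor's note ---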
- tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + tokenizer_config_file = self.dir_model / "tokenizer_config.json" if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: tokenizer_config_json = json.load(f) @@ -2912,7 +3462,9 @@ def set_vocab(self): for token_id, token_json in added_tokens_decoder.items(): token_id = int(token_id) if token_id >= vocab_size: - logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + logger.debug( + f"ignore token {token_id}: id is out of range, max={vocab_size - 1}" + ) continue token_content = token_json["content"] @@ -2928,7 +3480,9 @@ def set_vocab(self): token_type = SentencePieceTokenTypes.CONTROL token_score = 0.0 - logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})") + logger.info( + f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})" + ) tokens[token_id] = token_content.encode("utf-8") toktypes[token_id] = token_type scores[token_id] = token_score @@ -2946,11 +3500,15 @@ def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + self.gguf_writer.add_rope_dimension_count( + hparams["hidden_size"] // hparams["num_attention_heads"] + ) _experts: list[dict[str, Tensor]] | None = None - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") @@ -3021,24 +3579,39 @@ def set_gguf_parameters(self): if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_key_length( + hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"] + ) self.gguf_writer.add_value_length(hparams["v_head_dim"]) - self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_feed_forward_length( + hparams["moe_intermediate_size"] + ) self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if ( + self.hparams.get("rope_scaling") is not None + and "factor" in self.hparams["rope_scaling"] + ): if self.hparams["rope_scaling"].get("type") == "yarn": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) - self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"]) + self.gguf_writer.add_rope_scaling_factor( + self.hparams["rope_scaling"]["factor"] + ) + self.gguf_writer.add_rope_scaling_orig_ctx_len( + 
self.hparams["rope_scaling"]["original_max_position_embeddings"] + ) + self.gguf_writer.add_rope_scaling_yarn_log_mul( + 0.1 * hparams["rope_scaling"]["mscale_all_dim"] + ) _experts: list[dict[str, Tensor]] | None = None - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: # process the experts separately if name.find("mlp.experts") != -1: n_experts = self.hparams["n_routed_experts"] @@ -3102,34 +3675,38 @@ def set_vocab(self): from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model - tokenizer_path = self.dir_model / 'tokenizer.model' + tokenizer_path = self.dir_model / "tokenizer.model" # many older models use spiece.model tokenizer model filename if not tokenizer_path.is_file(): - tokenizer_path = self.dir_model / 'spiece.model' + tokenizer_path = self.dir_model / "spiece.model" if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = ( + model.ModelProto() + ) # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) # some models like Pile-T5 family use BPE tokenizer instead of Unigram if sentencepiece_model.trainer_spec.model_type == 2: # BPE # assure the tokenizer model file name is correct - assert tokenizer_path.name == 'tokenizer.model' + assert tokenizer_path.name == "tokenizer.model" return self._set_vocab_sentencepiece() else: assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + remove_whitespaces = ( + sentencepiece_model.normalizer_spec.remove_extra_whitespaces + ) precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size @@ -3154,14 +3731,16 @@ def set_vocab(self): scores[token_id] = score toktypes[token_id] = toktype - added_tokens_file = self.dir_model / 'added_tokens.json' + added_tokens_file = self.dir_model / "added_tokens.json" if added_tokens_file.is_file(): with open(added_tokens_file, "r", encoding="utf-8") as f: added_tokens_json = json.load(f) for key in added_tokens_json: token_id = added_tokens_json[key] if token_id >= vocab_size: - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + logger.warning( + f"ignore token {token_id}: id is out of range, max={vocab_size - 1}" + ) continue tokens[token_id] = key.encode("utf-8") @@ -3170,7 +3749,9 @@ def set_vocab(self): if vocab_size > len(tokens): pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + logger.debug( + f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]" + ) for i in range(1, pad_count + 1): tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) scores.append(-1000.0) @@ -3194,7 +3775,9 @@ def set_vocab(self): def 
set_gguf_parameters(self): if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: - logger.warning("Couldn't find context length in config.json, assuming default value of 512") + logger.warning( + "Couldn't find context length in config.json, assuming default value of 512" + ) n_ctx = 512 self.gguf_writer.add_context_length(n_ctx) self.gguf_writer.add_embedding_length(self.hparams["d_model"]) @@ -3204,24 +3787,36 @@ def set_gguf_parameters(self): self.gguf_writer.add_key_length(self.hparams["d_kv"]) self.gguf_writer.add_value_length(self.hparams["d_kv"]) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) + self.gguf_writer.add_relative_attn_buckets_count( + self.hparams["relative_attention_num_buckets"] + ) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"]) + self.gguf_writer.add_decoder_start_token_id( + self.hparams["decoder_start_token_id"] + ) self.gguf_writer.add_file_type(self.ftype) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder # and decoder and ignore the remaining ones. - if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: + if name in [ + "decoder.embed_tokens.weight", + "encoder.embed_tokens.weight", + "shared.weight", + ]: if not self.shared_token_embeddings_found: name = "shared.weight" self.shared_token_embeddings_found = True else: - logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") + logger.debug( + f"Skipping shared tensor {name!r} in safetensors so that convert can end normally." + ) return [] return [(self.map_tensor_name(name), data_torch)] @@ -3243,20 +3838,22 @@ def __init__(self, *args, **kwargs): self.embeddings_scale = 1.0 # note: For some JAIS flavors, output is tied to (same as) wte in original model self.output_is_wte = False - if 'mup_embeddings_scale' in self.hparams: - self.output_is_wte = True # Hack (?) - self.embeddings_scale = self.hparams['mup_embeddings_scale'] - elif 'embeddings_scale' in self.hparams: - self.embeddings_scale = self.hparams['embeddings_scale'] + if "mup_embeddings_scale" in self.hparams: + self.output_is_wte = True # Hack (?) 
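# --- Editor's note: illustrative sketch, not part of the patch. How the muP-style
# --- scales collected on the surrounding lines are applied later in modify_tensors:
# --- mup_embeddings_scale multiplies the token-embedding weights, and
# --- mup_output_alpha * mup_width_scale multiplies the (tied) output head.
# --- The hparam values below are hypothetical.
import torch

hparams = {"mup_embeddings_scale": 14.6, "mup_output_alpha": 2.22, "mup_width_scale": 0.0357}
embeddings_scale = hparams["mup_embeddings_scale"]
width_scale = hparams["mup_output_alpha"] * hparams["mup_width_scale"]

wte = torch.randn(8, 4)              # toy token-embedding weight
token_embd = wte * embeddings_scale  # emitted as token_embd.weight
output = wte * width_scale           # output_is_wte: tied head, emitted as output.weight
# --- end of editor's note ---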
+ self.embeddings_scale = self.hparams["mup_embeddings_scale"] + elif "embeddings_scale" in self.hparams: + self.embeddings_scale = self.hparams["embeddings_scale"] else: assert False self.width_scale = 1.0 - if 'mup_output_alpha' in self.hparams: - assert 'mup_width_scale' in self.hparams - self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] - elif 'width_scale' in self.hparams: - self.width_scale = self.hparams['width_scale'] + if "mup_output_alpha" in self.hparams: + assert "mup_width_scale" in self.hparams + self.width_scale = ( + self.hparams["mup_output_alpha"] * self.hparams["mup_width_scale"] + ) + elif "width_scale" in self.hparams: + self.width_scale = self.hparams["width_scale"] else: assert False @@ -3274,7 +3871,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused tensors: list[tuple[str, Tensor]] = [] @@ -3294,7 +3893,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): + if name.endswith( + (".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight") + ): data_torch = data_torch.transpose(1, 0) new_name = self.map_tensor_name(name) @@ -3302,7 +3903,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): tensors.append((new_name, data_torch * self.embeddings_scale)) if self.output_is_wte: - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale)) + tensors.append( + ( + self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), + data_torch * self.width_scale, + ) + ) elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): assert not self.output_is_wte tensors.append((new_name, data_torch * self.width_scale)) @@ -3328,11 +3934,23 @@ def set_vocab_chatglm3(self): scores: list[float] = [] from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab())) assert max(tokenizer.get_vocab().values()) < vocab_size - role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] - special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + role_special_tokens = [ + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + ] + special_tokens = [ + "[MASK]", + "[gMASK]", + "[sMASK]", + "sop", + "eop", + ] + role_special_tokens for token_id in range(vocab_size): piece = tokenizer._convert_id_to_token(token_id) if token_id == 0: @@ -3390,11 +4008,14 @@ def set_vocab_chatglm3(self): @staticmethod def token_bytes_to_string(b): from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode + byte_encoder = bytes_to_unicode() - return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) + return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")]) @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: + def bpe( + mergeable_ranks: dict[bytes, int], token: 
bytes, max_rank: int | None = None + ) -> list[bytes]: parts = [bytes([b]) for b in token] while True: min_idx = None @@ -3407,7 +4028,11 @@ def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = if min_rank is None or (max_rank is not None and min_rank >= max_rank): break assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] + parts = ( + parts[:min_idx] + + [parts[min_idx] + parts[min_idx + 1]] + + parts[min_idx + 2 :] + ) return parts def set_vocab(self): @@ -3421,6 +4046,7 @@ def set_vocab(self): toktypes: list[int] = [] from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) vocab_size = hparams["padded_vocab_size"] assert max(tokenizer.get_vocab().values()) < vocab_size @@ -3436,11 +4062,13 @@ def set_vocab(self): continue merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank) assert len(merged) >= 2 and len(merged) <= 7 - merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged))) + merges.append(" ".join(map(ChatGLMModel.token_bytes_to_string, merged))) # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined added_vocab = tokenizer.get_added_vocab() - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} + reverse_vocab = { + id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items() + } for i in range(vocab_size): if i not in reverse_vocab: @@ -3464,10 +4092,14 @@ def set_vocab(self): special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) special_vocab.merges = merges # only add special tokens when they were not already loaded from config.json - special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) + special_vocab._set_special_token( + "eos", tokenizer.get_added_vocab()["<|endoftext|>"] + ) special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # this one is usually not in config.json anyway - special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) + special_vocab._set_special_token( + "unk", tokenizer.get_added_vocab()["<|endoftext|>"] + ) special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): @@ -3476,7 +4108,9 @@ def set_gguf_parameters(self): n_head_kv = self.hparams.get("multi_query_group_num", n_head) self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) self.gguf_writer.add_embedding_length(n_embed) - self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed)) + self.gguf_writer.add_feed_forward_length( + self.hparams.get("ffn_hidden_size", 4 * n_embed) + ) self.gguf_writer.add_block_count(self.hparams["num_layers"]) self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count_kv(n_head_kv) @@ -3489,7 +4123,9 @@ def set_gguf_parameters(self): rope_freq = rope_freq * self.hparams["rope_ratio"] self.gguf_writer.add_rope_freq_base(rope_freq) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: del bid # unused if name.endswith(".rotary_pos_emb.inv_freq"): @@ -3498,6 +4134,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name = name.removeprefix("transformer.") return [(self.map_tensor_name(name), data_torch)] + ###### CONVERSION LOGIC 
###### @@ -3540,18 +4177,24 @@ def numpy(self) -> gguf.LazyNumpyTensor: return gguf.LazyNumpyTensor( meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), args=(self,), - func=(lambda s: s.numpy()) + func=(lambda s: s.numpy()), ) @classmethod - def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor: + def meta_with_dtype_and_shape( + cls, dtype: torch.dtype, shape: tuple[int, ...] + ) -> Tensor: return torch.empty(size=shape, dtype=dtype, device="meta") @classmethod def from_safetensors_slice(cls, st_slice: Any) -> Tensor: dtype = cls._dtype_str_map[st_slice.get_dtype()] shape: tuple[int, ...] = tuple(st_slice.get_shape()) - lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:]) + lazy = cls( + meta=cls.meta_with_dtype_and_shape(dtype, shape), + args=(st_slice,), + func=lambda s: s[:], + ) return cast(torch.Tensor, lazy) @classmethod @@ -3569,62 +4212,82 @@ def __torch_function__(cls, func, types, args=(), kwargs=None): def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="Convert a huggingface model to a GGML compatible file") + description="Convert a huggingface model to a GGML compatible file" + ) parser.add_argument( - "--vocab-only", action="store_true", + "--vocab-only", + action="store_true", help="extract only the vocab", ) parser.add_argument( - "--outfile", type=Path, + "--outfile", + type=Path, help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", ) parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", + "--outtype", + type=str, + choices=["f32", "f16", "bf16", "q8_0", "auto"], + default="f16", help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", ) parser.add_argument( - "--bigendian", action="store_true", + "--bigendian", + action="store_true", help="model is executed on big endian machine", ) parser.add_argument( - "model", type=Path, + "model", + type=Path, help="directory containing model file", ) parser.add_argument( - "--use-temp-file", action="store_true", + "--use-temp-file", + action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)", ) parser.add_argument( - "--no-lazy", action="store_true", + "--no-lazy", + action="store_true", help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", ) parser.add_argument( - "--model-name", type=str, default=None, + "--model-name", + type=str, + default=None, help="name of the model", ) parser.add_argument( - "--verbose", action="store_true", + "--verbose", + action="store_true", help="increase output verbosity", ) parser.add_argument( - "--split-max-tensors", type=int, default=0, + "--split-max-tensors", + type=int, + default=0, help="max tensors in each split", ) parser.add_argument( - "--split-max-size", type=str, default="0", + "--split-max-size", + type=str, + default="0", help="max size per split N(M|G)", ) parser.add_argument( - "--dry-run", action="store_true", + "--dry-run", + action="store_true", help="only print out a split plan and exit, without writing any new files", ) parser.add_argument( - "--no-tensor-first-split", action="store_true", - help="do not add tensors to the first split (disabled by default)" + "--no-tensor-first-split", + action="store_true", + 
help="do not add tensors to the first split (disabled by default)", ) parser.add_argument( - "--metadata", type=Path, - help="Specify the path for an authorship metadata override file" + "--metadata", + type=Path, + help="Specify the path for an authorship metadata override file", ) return parser.parse_args() @@ -3640,7 +4303,9 @@ def split_str_to_n_bytes(split_str: str) -> int: elif split_str.isnumeric(): n = int(split_str) else: - raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") + raise ValueError( + f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G" + ) if n < 0: raise ValueError(f"Invalid split size: {split_str}, must be positive") @@ -3659,7 +4324,7 @@ def main() -> None: dir_model = args.model if not dir_model.is_dir(): - logger.error(f'Error: {args.model} is not a directory') + logger.error(f"Error: {args.model} is not a directory") sys.exit(1) ftype_map: dict[str, gguf.LlamaFileType] = { @@ -3694,24 +4359,37 @@ def main() -> None: logger.error(f"Model {model_architecture} is not supported") sys.exit(1) - model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out, - is_big_endian=args.bigendian, use_temp_file=args.use_temp_file, - eager=args.no_lazy, - metadata_override=args.metadata, model_name=args.model_name, - split_max_tensors=args.split_max_tensors, - split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, - small_first_shard=args.no_tensor_first_split) + model_instance = model_class( + dir_model=dir_model, + ftype=output_type, + fname_out=fname_out, + is_big_endian=args.bigendian, + use_temp_file=args.use_temp_file, + eager=args.no_lazy, + metadata_override=args.metadata, + model_name=args.model_name, + split_max_tensors=args.split_max_tensors, + split_max_size=split_str_to_n_bytes(args.split_max_size), + dry_run=args.dry_run, + small_first_shard=args.no_tensor_first_split, + ) if args.vocab_only: logger.info("Exporting model vocab...") model_instance.write_vocab() - logger.info(f"Model vocab successfully exported to {model_instance.fname_out}") + logger.info( + f"Model vocab successfully exported to {model_instance.fname_out}" + ) else: logger.info("Exporting model...") model_instance.write() - out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out + out_path = ( + f"{model_instance.fname_out.parent}{os.sep}" + if is_split + else model_instance.fname_out + ) logger.info(f"Model successfully exported to {out_path}") -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/convert_lora_to_gguf.py b/src/convert_lora_to_gguf.py index be8496f..7f10c9d 100644 --- a/src/convert_lora_to_gguf.py +++ b/src/convert_lora_to_gguf.py @@ -11,15 +11,24 @@ import json from math import prod from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Iterable, + Iterator, + Sequence, + SupportsIndex, + cast, +) import torch if TYPE_CHECKING: from torch import Tensor -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +if "NO_LOCAL_GGUF" not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / "gguf-py")) import gguf # reuse model definitions from convert_hf_to_gguf.py @@ -33,6 +42,7 @@ class PartialLoraTensor: A: Tensor | None = None B: Tensor | None = None + # 
diff --git a/src/convert_lora_to_gguf.py b/src/convert_lora_to_gguf.py
index be8496f..7f10c9d 100644
--- a/src/convert_lora_to_gguf.py
+++ b/src/convert_lora_to_gguf.py
@@ -11,15 +11,24 @@
 import json
 from math import prod
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Iterable,
+    Iterator,
+    Sequence,
+    SupportsIndex,
+    cast,
+)

 import torch

 if TYPE_CHECKING:
     from torch import Tensor

-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+if "NO_LOCAL_GGUF" not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / "gguf-py"))
 import gguf

 # reuse model definitions from convert_hf_to_gguf.py
@@ -33,6 +42,7 @@ class PartialLoraTensor:
     A: Tensor | None = None
     B: Tensor | None = None

+
 # magic to support tensor shape modifications and splitting
 class LoraTorchTensor:
     _lora_A: Tensor  # (n_rank, row_size)
@@ -57,7 +67,9 @@ def __getitem__(
         indices: (
             SupportsIndex
             | slice
-            | tuple[SupportsIndex | slice | Tensor, ...]  # TODO: add ellipsis in the type signature
+            | tuple[
+                SupportsIndex | slice | Tensor, ...
+            ]  # TODO: add ellipsis in the type signature
         ),
     ) -> LoraTorchTensor:
         shape = self.shape
@@ -90,7 +102,10 @@ def __getitem__(
             )

             if len(indices) < len(shape):
-                indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))
+                indices = (
+                    *indices,
+                    *(slice(None, None) for _ in range(len(indices), len(shape))),
+                )

             # TODO: make sure this is correct
             indices_A = (
@@ -138,7 +153,9 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
         n_elems = prod(orig_shape)
         n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
         assert n_elems % n_new_elems == 0
-        new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)
+        new_shape = (
+            *(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),
+        )

         if new_shape[-1] != orig_shape[-1]:
             raise NotImplementedError  # can't reshape the row size trivially
@@ -164,7 +181,9 @@ def permute(self, *dims: int) -> LoraTorchTensor:
             assert all(dim == 1 for dim in self._lora_A.shape[:-2])
             return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
         if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
-            return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
+            return LoraTorchTensor(
+                self._lora_B.permute(*dims), self._lora_A.permute(*dims)
+            )
         else:
             # TODO: compose the above two
             raise NotImplementedError
@@ -179,7 +198,9 @@ def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
         return self.transpose(axis0, axis1)

     def to(self, *args, **kwargs):
-        return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
+        return LoraTorchTensor(
+            self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs)
+        )

     @classmethod
     def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
@@ -226,50 +247,64 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
     base_name = base_name.replace(".lora_B.weight", ".weight")
     return base_name

+
 def pyinstaller_include():
     # PyInstaller import
     pass

+
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
-        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
+        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file"
+    )
     parser.add_argument(
-        "--outfile", type=Path,
+        "--outfile",
+        type=Path,
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+        "--outtype",
+        type=str,
+        choices=["f32", "f16", "bf16", "q8_0", "auto"],
+        default="f16",
         help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
-        "--bigendian", action="store_true",
+        "--bigendian",
+        action="store_true",
         help="model is executed on big endian machine",
     )
     parser.add_argument(
-        "--no-lazy", action="store_true",
+        "--no-lazy",
+        action="store_true",
         help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
     )
     parser.add_argument(
-        "--verbose", action="store_true",
+        "--verbose",
+        action="store_true",
         help="increase output verbosity",
     )
     parser.add_argument(
-        "--dry-run", action="store_true",
+        "--dry-run",
+        action="store_true",
         help="only print out what will be done, without writing any new files",
    )
     parser.add_argument(
-        "--base", type=Path, required=True,
+        "--base",
+        type=Path,
+        required=True,
         help="directory containing base model file",
     )
     parser.add_argument(
-        "lora_path", type=Path,
+        "lora_path",
+        type=Path,
         help="directory containing LoRA adapter file",
     )

     return parser.parse_args()


-if __name__ == '__main__':
+if __name__ == "__main__":
     args = parse_args()
     logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
@@ -318,7 +353,9 @@ class LoraModel(model_class):
             lora_alpha: float

-            def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
+            def __init__(
+                self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs
+            ):

                 super().__init__(*args, **kwargs)
@@ -330,7 +367,9 @@ def set_type(self):
                 self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")

             def set_gguf_parameters(self):
-                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
+                self.gguf_writer.add_float32(
+                    gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha
+                )
                 super().set_gguf_parameters()

             def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
@@ -345,7 +384,9 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                     if not is_lora_a and not is_lora_b:
                         if ".base_layer.weight" in name:
                             continue
-                        logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
+                        logger.error(
+                            f"Unexpected name '{name}': Not a lora_A or lora_B tensor"
+                        )
                         sys.exit(1)

                     if base_name in tensor_map:
@@ -362,9 +403,14 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                 for name, tensor in tensor_map.items():
                     assert tensor.A is not None
                     assert tensor.B is not None
-                    yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
+                    yield (
+                        name,
+                        cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)),
+                    )

-            def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+            def modify_tensors(
+                self, data_torch: Tensor, name: str, bid: int | None
+            ) -> Iterable[tuple[str, Tensor]]:
                 dest = super().modify_tensors(data_torch, name, bid)
                 for dest_name, dest_data in dest:
                     assert isinstance(dest_data, LoraTorchTensor)
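On the LoRA converter: `get_base_tensor_name()` collapses the `.lora_A.weight` / `.lora_B.weight` suffixes to one base name, and `get_tensors()` only yields once both halves of a pair have been collected, so the rest of the pipeline can treat them as a single logical tensor (conceptually, the weight delta is `lora_B @ lora_A`). A minimal sketch of that pairing, with hypothetical tensor names and toy shapes (illustrative only):

    from dataclasses import dataclass

    import torch
    from torch import Tensor

    @dataclass
    class PartialLoraTensor:
        A: Tensor | None = None
        B: Tensor | None = None

    def get_base_tensor_name(lora_tensor_name: str) -> str:
        base_name = lora_tensor_name.replace("base_model.model.", "")
        base_name = base_name.replace(".lora_A.weight", ".weight")
        base_name = base_name.replace(".lora_B.weight", ".weight")
        return base_name

    # Hypothetical adapter entries: A is (rank, n_in), B is (n_out, rank).
    state_dict = {
        "base_model.model.layers.0.attn.q_proj.lora_A.weight": torch.zeros(8, 64),
        "base_model.model.layers.0.attn.q_proj.lora_B.weight": torch.zeros(128, 8),
    }

    tensor_map: dict[str, PartialLoraTensor] = {}
    for name, tensor in state_dict.items():
        entry = tensor_map.setdefault(get_base_tensor_name(name), PartialLoraTensor())
        if ".lora_A." in name:
            entry.A = tensor
        else:
            entry.B = tensor

    for base, pair in tensor_map.items():
        assert pair.A is not None and pair.B is not None
        delta = pair.B @ pair.A  # full-rank view of the low-rank update
        print(base, tuple(delta.shape))  # layers.0.attn.q_proj.weight (128, 64)

Keeping the two factors separate in `LoraTorchTensor` (rather than materializing `delta`) is what lets the converter reuse the base model's `modify_tensors()` reshapes without ever paying for the full-rank matrix.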
diff --git a/src/localizations.py b/src/localizations.py
index 016ae98..e9cb951 100644
--- a/src/localizations.py
+++ b/src/localizations.py
@@ -224,6 +224,7 @@ def __init__(self):
         self.LORA_CONVERSION_FROM_TO = ""
         self.GENERATING_IMATRIX_FOR = ""
         self.MODEL_PATH_REQUIRED_FOR_IMATRIX = ""
+        self.NO_ASSET_SELECTED_FOR_CUDA_CHECK = ""

 class _English(_Localization):
     def __init__(self):
@@ -450,6 +451,7 @@ def __init__(self):
         self.LORA_CONVERSION_FROM_TO = "LoRA Conversion from {} to {}"
         self.GENERATING_IMATRIX_FOR = "Generating IMatrix for {}"
         self.MODEL_PATH_REQUIRED_FOR_IMATRIX = "Model path is required for IMatrix generation."
+        self.NO_ASSET_SELECTED_FOR_CUDA_CHECK = "No asset selected for CUDA check"

 class _French:
     # French localization
@@ -5231,7 +5233,7 @@ def set_language(lang_code):
     global ADDING_LORA_ADAPTER, DELETING_LORA_ADAPTER, LORA_FILES, SELECT_LORA_ADAPTER_FILE, STARTING_LORA_EXPORT
     global OUTPUT_TYPE, SELECT_OUTPUT_TYPE, GGUF_AND_BIN_FILES, BASE_MODEL, SELECT_BASE_MODEL_FILE
     global BASE_MODEL_PATH_REQUIRED, BROWSING_FOR_BASE_MODEL_FILE, SELECT_BASE_MODEL_FOLDER, BROWSING_FOR_BASE_MODEL_FOLDER
-    global LORA_CONVERSION_FROM_TO, GENERATING_IMATRIX_FOR, MODEL_PATH_REQUIRED_FOR_IMATRIX
+    global LORA_CONVERSION_FROM_TO, GENERATING_IMATRIX_FOR, MODEL_PATH_REQUIRED_FOR_IMATRIX, NO_ASSET_SELECTED_FOR_CUDA_CHECK

     loc = _languages.get(lang_code, _English)()
     english_loc = _English()  # Create an instance of English localization for fallback
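The localizations.py hunks follow the project's three-step convention for any new UI string: declare an empty placeholder on the `_Localization` base class, give `_English` the real text, and list the name in `set_language()` so it is exported as a module-level global. A condensed sketch of why all three are needed, trimmed to the one key this diff adds (the fallback loop here is hypothetical; the real `set_language()` enumerates every key explicitly):

    class _Localization:
        def __init__(self):
            self.NO_ASSET_SELECTED_FOR_CUDA_CHECK = ""  # placeholder on the base class

    class _English(_Localization):
        def __init__(self):
            super().__init__()
            self.NO_ASSET_SELECTED_FOR_CUDA_CHECK = "No asset selected for CUDA check"

    class _French(_Localization):
        def __init__(self):
            super().__init__()  # translation deliberately left empty in this sketch

    _languages = {"en": _English, "fr": _French}

    def set_language(lang_code: str) -> None:
        global NO_ASSET_SELECTED_FOR_CUDA_CHECK
        loc = _languages.get(lang_code, _English)()
        english_loc = _English()  # English fallback for untranslated strings
        NO_ASSET_SELECTED_FOR_CUDA_CHECK = (
            loc.NO_ASSET_SELECTED_FOR_CUDA_CHECK
            or english_loc.NO_ASSET_SELECTED_FOR_CUDA_CHECK
        )

    set_language("fr")  # empty French entry falls back to the English text
    print(NO_ASSET_SELECTED_FOR_CUDA_CHECK)  # -> "No asset selected for CUDA check"

Forgetting the `global` update is the easy mistake here: the attribute would exist on every localization object, but the module-level name would raise a `NameError` at the point of use.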